diff --git a/patches/mimalloc-v2.2.4/5_skip_defrag_targets.patch b/patches/mimalloc-v2.2.4/5_skip_defrag_targets.patch new file mode 100644 index 000000000000..c4878b5bdacd --- /dev/null +++ b/patches/mimalloc-v2.2.4/5_skip_defrag_targets.patch @@ -0,0 +1,108 @@ +commit 0000000000000000000000000000000000000000 +Author: Dragonfly Defrag Hackathon +Date: Tue May 5 17:30:00 2026 +0000 + + feat: skip defrag-targeted pages in mi_malloc + + Adds a defrag_skip byte to mi_page_t and a public API + mi_page_set_defrag_skip(page_addr, skip). When set, mi_malloc skips + the page in the small-size fast path, the queue-head fast path, and + mi_page_queue_find_free_ex. Prevents new allocations from landing + on a page that phased defrag is trying to drain. + +--- a/include/mimalloc/internal.h ++++ b/include/mimalloc/internal.h +@@ -516,7 +516,14 @@ + mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); + const size_t idx = _mi_wsize_from_size(size); + mi_assert_internal(idx < MI_PAGES_DIRECT); +- return heap->pages_free_direct[idx]; ++ mi_page_t* page = heap->pages_free_direct[idx]; ++ // dragonfly: when the cached small-page is a defrag target, force the ++ // generic slow path so the allocation goes through `mi_find_free_page` -> ++ // `mi_page_queue_find_free_ex` which skips defrag-targeted pages. ++ if (mi_unlikely(page->defrag_skip)) { ++ return (mi_page_t*) &_mi_page_empty; ++ } ++ return page; + } + + // Segment that contains the pointer +--- a/include/mimalloc/types.h ++++ b/include/mimalloc/types.h +@@ -337,6 +337,7 @@ + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type ++ uint8_t defrag_skip; // dragonfly: when nonzero, alloc paths skip this page (it is being drained by defrag) + // padding + size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the page area containing the blocks +--- a/src/init.c ++++ b/src/init.c +@@ -26,6 +26,7 @@ + 0, // used + 0, // block size shift + 0, // heap tag ++ 0, // defrag_skip (dragonfly) + 0, // block_size + NULL, // page_start + #if (MI_PADDING || MI_ENCODE_FREELIST) +--- a/src/page.c ++++ b/src/page.c +@@ -697,6 +697,7 @@ + page->keys[1] = _mi_heap_random_next(heap); + #endif + page->free_is_zero = page->is_zero_init; ++ page->defrag_skip = 0; // dragonfly: fresh page is not a defrag target + #if MI_DEBUG>2 + if (page->is_zero_init) { + mi_track_mem_defined(page->page_start, page_size); +@@ -763,6 +764,14 @@ + while (page != NULL) + { + mi_page_t* next = page->next; // remember next ++ ++ // dragonfly: pages tagged by defrag are being drained; skip them so new ++ // allocations don't refill targets while EVACUATE moves entries off. ++ if (page->defrag_skip) { ++ page = next; ++ continue; ++ } ++ + #if MI_STAT + count++; + #endif +@@ -860,6 +869,12 @@ + + // check the first page: we even do this with candidate search or otherwise we re-search every time + mi_page_t* page = pq->first; ++ // dragonfly: skip the queue-head fast path when it points at a defrag ++ // target so the search falls through to mi_page_queue_find_free_ex which ++ // walks past target pages. 
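++  // A head cleared here behaves like an empty queue: the `page != NULL`
++  // guard below is skipped, and the fall-through candidate search walks
++  // past defrag-targeted pages (see the hunk above).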
++ if (page != NULL && page->defrag_skip) { ++ page = NULL; ++ } + if (page != NULL) { + #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness + if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { +--- a/src/alloc.c ++++ b/src/alloc.c +@@ -711,6 +711,17 @@ + return result; + } + ++// dragonfly: mark a page so that mi_malloc skips it when picking a page to ++// allocate from. Used by phased defrag to prevent EVACUATE moves from ++// refilling target pages that we are trying to drain. `page_addr` must be ++// a value previously returned in mi_page_usage_stats_t::page_address (i.e. ++// a `(uintptr_t)mi_page_t*`). ++void mi_page_set_defrag_skip(uintptr_t page_addr, bool skip) mi_attr_noexcept { ++ if (page_addr == 0) return; ++ mi_page_t* page = (mi_page_t*) page_addr; ++ page->defrag_skip = (skip ? 1 : 0); ++} ++ + // ------------------------------------------------------ + // ensure explicit external inline definitions are emitted! + // ------------------------------------------------------ diff --git a/patches/mimalloc-v2.2.4/6_dfly_underutil_callback.patch b/patches/mimalloc-v2.2.4/6_dfly_underutil_callback.patch new file mode 100644 index 000000000000..837353a81018 --- /dev/null +++ b/patches/mimalloc-v2.2.4/6_dfly_underutil_callback.patch @@ -0,0 +1,84 @@ +commit 0000000000000000000000000000000000000000 +Author: Dragonfly Defrag Hackathon +Date: Tue May 5 18:00:00 2026 +0000 + + feat: underutilized-page callback for reactive defrag + + Adds a public API mi_dfly_set_underutil_callback(cb) that fires on + local-thread free when a page's used count drops below a configured + threshold for the first time. Lets phased defrag enqueue pages + reactively instead of doing a full prime-table CENSUS scan to + discover them. + +--- a/include/mimalloc.h ++++ b/include/mimalloc.h +@@ -271,6 +271,13 @@ + + mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); + ++// dragonfly: callback fired on local-thread free when a page's used count ++// crosses the configured threshold downward. Lets phased defrag enqueue ++// pages reactively instead of doing a full prime-table CENSUS scan. ++typedef void (*mi_dfly_underutil_callback_t)(uintptr_t page_addr); ++mi_decl_export void mi_dfly_set_underutil_callback(mi_dfly_underutil_callback_t cb); ++mi_decl_export void mi_dfly_set_underutil_threshold_pct(uint8_t pct); ++ + // Experimental + mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; + mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept; +--- a/src/free.c ++++ b/src/free.c +@@ -18,6 +18,24 @@ + static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); + static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); + ++// ------------------------------------------------------ ++// Dragonfly: underutilized-page callback ++// Fired on local-thread free when the page's used count crosses the ++// configured threshold downward. Lets defrag enqueue pages reactively ++// instead of doing a full prime-table CENSUS scan. 
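++ // Worked example at the default 80% threshold: capacity 50 gives
++ // cap_thr = 50 * 80 = 4000, so the free taking `used` from 41 to 40 fires
++ // (4100 > 4000 and 4000 <= 4000) while the next free (40 -> 39) does not;
++ // each downward crossing fires exactly once. Both globals are written
++ // without atomics, so register the callback once at startup (see the
++ // std::call_once in server/defrag.cc) before shards start freeing.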
++// ------------------------------------------------------ ++static mi_dfly_underutil_callback_t _mi_dfly_underutil_cb = NULL; ++static uint8_t _mi_dfly_underutil_pct = 80; ++ ++void mi_dfly_set_underutil_callback(mi_dfly_underutil_callback_t cb) { ++ _mi_dfly_underutil_cb = cb; ++} ++ ++void mi_dfly_set_underutil_threshold_pct(uint8_t pct) { ++ if (pct > 100) pct = 100; ++ _mi_dfly_underutil_pct = pct; ++} ++ + + // ------------------------------------------------------ + // Free +@@ -44,12 +62,28 @@ + // actual free: push on the local free list + mi_block_set_next(page, block, page->local_free); + page->local_free = block; ++ // dragonfly: decide whether to fire the underutilized-page callback BEFORE ++ // _mi_page_retire below. _mi_page_retire may call _mi_page_free which ++ // returns the page metadata to the segment, after which reading ++ // page->used / page->capacity is a UAF. We gate on page->used > 1 so that ++ // after --, used > 0 (page not retired and still alive), which makes ++ // (uintptr_t)page a valid address to hand to the callback. ++ bool fire_underutil_cb = false; ++ if (mi_unlikely(_mi_dfly_underutil_cb != NULL && page->used > 1)) { ++ const uint32_t cap_thr = (uint32_t)page->capacity * _mi_dfly_underutil_pct; ++ const uint32_t prev_x100 = (uint32_t)page->used * 100; ++ const uint32_t cur_x100 = (uint32_t)(page->used - 1) * 100; ++ fire_underutil_cb = (prev_x100 > cap_thr && cur_x100 <= cap_thr); ++ } + if mi_unlikely(--page->used == 0) { + _mi_page_retire(page); + } + else if mi_unlikely(check_full && mi_page_is_in_full(page)) { + _mi_page_unfull(page); + } ++ if (fire_underutil_cb) { ++ _mi_dfly_underutil_cb((uintptr_t)page); ++ } + } + + // Adjust a block that was allocated aligned, to the actual start of the block in the page. diff --git a/src/core/dash.h b/src/core/dash.h index a2710122d2b3..5b57355e0bbd 100644 --- a/src/core/dash.h +++ b/src/core/dash.h @@ -412,14 +412,18 @@ class DashTable : public detail::DashTableBase { return stash_unloaded_; } + // Advances cursor by exactly one logical bucket in bucket-major order, without + // visiting bucket contents. Used by sampled walkers (e.g. defrag CENSUS) to + // skip buckets between Traverse calls. Returns Cursor::end() once the table + // is exhausted. 
+  Cursor AdvanceCursorBucketOrder(Cursor cursor);
+
  private:
   enum class InsertMode {
     kInsertIfNotFound,
     kForceInsert,
   };

-  Cursor AdvanceCursorBucketOrder(Cursor cursor);
-
   template <typename U, typename V, typename EvictionPolicy>
   std::pair<iterator, bool> InsertInternal(U&& key, V&& value, EvictionPolicy& policy,
                                            InsertMode mode);

diff --git a/src/core/page_usage/CMakeLists.txt b/src/core/page_usage/CMakeLists.txt
index 207668767244..17654130b81e 100644
--- a/src/core/page_usage/CMakeLists.txt
+++ b/src/core/page_usage/CMakeLists.txt
@@ -1,2 +1,2 @@
-add_library(dfly_page_usage page_usage_stats.cc)
+add_library(dfly_page_usage page_usage_stats.cc page_usage_visitors.cc)
 target_link_libraries(dfly_page_usage base TRDP::hdr_histogram redis_lib absl::strings)
diff --git a/src/core/page_usage/page_usage_stats.cc b/src/core/page_usage/page_usage_stats.cc
index a62c533cfa61..7d4b1d80da59 100644
--- a/src/core/page_usage/page_usage_stats.cc
+++ b/src/core/page_usage/page_usage_stats.cc
@@ -207,9 +207,9 @@ uint64_t PageUsage::UsedQuotaCycles() const {
 }
 
 bool PageUsage::IsPageForObjectUnderUtilized(void* object) {
-  mi_page_usage_stats_t stat;
-  zmalloc_page_is_underutilized(object, threshold_, collect_stats_ == CollectPageStats::YES, &stat);
-  return ConsumePageStats(stat);
+  return ConsumePageStats(mi_heap_page_is_underutilized(static_cast<mi_heap_t*>(zmalloc_heap),
+                                                        object, threshold_,
+                                                        collect_stats_ == CollectPageStats::YES));
 }
 
 bool PageUsage::IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) {
diff --git a/src/core/page_usage/page_usage_stats.h b/src/core/page_usage/page_usage_stats.h
index c6f5c127bb50..db99875fc244 100644
--- a/src/core/page_usage/page_usage_stats.h
+++ b/src/core/page_usage/page_usage_stats.h
@@ -20,7 +20,10 @@ namespace dfly {
 class CycleQuota {
  public:
   static constexpr uint64_t kMaxQuota = std::numeric_limits<uint64_t>::max();
-  static constexpr uint64_t kDefaultDefragQuota = 150;
+  // 40000 here is ~10ms of real time because helio's CycleClock mixes raw rdtsc
+  // with abseil's shifted frequency, making FromUsec/ToUsec ~4x off on x86.
+  // Once the helio bug is fixed, drop this to 10000.
+  static constexpr uint64_t kDefaultDefragQuota = 40'000;
 
   explicit CycleQuota(uint64_t quota_usec);
 
@@ -83,9 +86,11 @@ class PageUsage {
 
   uint64_t UsedQuotaCycles() const;
 
+  // Returns true when the object on the page should be reallocated. Subclasses
+  // (Evacuator, CensusTaker) override to short-circuit or extend the decision.
+  // Out-of-line in page_usage_stats.cc.
   virtual bool IsPageForObjectUnderUtilized(void* object);
-
-  bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object);
+  virtual bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object);
 
   CollectedPageStats CollectedStats() const {
     return unique_pages_.CollectedStats();
@@ -107,6 +112,31 @@ class PageUsage {
 
   bool QuotaDepleted() const;
 
+  virtual bool ShouldStop() const {
+    return false;
+  }
+
+  // Read-only walkers (e.g. CENSUS) never reallocate, so callers can skip
+  // pre/post sizing work that only matters when an object may move.
+  virtual bool IsReadOnly() const {
+    return false;
+  }
+
+  // When true, the traversal should also defrag keys (it->first) in addition
+  // to values. Only the phased algorithm (CENSUS + EVACUATE) enables this.
+  virtual bool ShouldDefragKeys() const {
+    return false;
+  }
+
+  // Walkers may stash the bucket cursor about to be visited so that downstream
+  // Observe() calls can attribute candidates back to a bucket. Default no-op.
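+  // CensusTaker overrides this to stamp each Observe() with the bucket being
+  // walked; see page_usage_visitors.h.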
+  virtual void SetCurrentBucketCursor(uint64_t /*cursor*/) {
+  }
+
+  float threshold() const {
+    return threshold_;
+  }
+
   void ExtendQuota(uint64_t quota_usec);
 
  private:
@@ -136,6 +166,7 @@ class PageUsage {
 
   CycleQuota quota_;
 
+ protected:
   // For use in testing, forces reallocate check to always return true
   bool force_reallocate_{false};
 };
diff --git a/src/core/page_usage/page_usage_visitors.cc b/src/core/page_usage/page_usage_visitors.cc
new file mode 100644
index 000000000000..56dacad9d4ce
--- /dev/null
+++ b/src/core/page_usage/page_usage_visitors.cc
@@ -0,0 +1,612 @@
+// Copyright 2026, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#include "core/page_usage/page_usage_visitors.h"
+
+#include <mimalloc.h>
+
+#define MI_BUILD_RELEASE 1
+#include <mimalloc/internal.h>
+
+#include <algorithm>
+#include <ranges>
+#include <vector>
+
+#include "base/flags.h"
+#include "base/logging.h"
+
+extern "C" {
+#include "redis/zmalloc.h"
+}
+
+ABSL_FLAG(bool, defrag_use_skip_bit, false,
+          "If true, mark target pages with mimalloc's defrag_skip bit so EVAC moves don't "
+          "refill them. Disable to A/B compare against an unmarked baseline.");
+
+ABSL_FLAG(bool, defrag_keys, false,
+          "If true, the phased defragmenter also defragments key allocations "
+          "(it->first) in addition to values. Set to false to measure the "
+          "incremental benefit of key defrag.");
+
+ABSL_FLAG(double, defrag_skip_percentile, 0.5,
+          "Fraction of the target plan (sorted by retention_score, most-fragmented first) "
+          "to apply the mimalloc defrag_skip bit to. 0.5 (default) marks the top half "
+          "and lets the bottom half stay refillable, which empirically gives the best "
+          "floor-vs-bulge tradeoff. 1.0 marks every target (max reclaim, biggest bulge). "
+          "Lower values shrink the lockout footprint: only the most-fragmented top-K "
+          "pages are protected from refill while higher-utilization targets stay "
+          "refillable.");
+
+extern "C" {
+// Dragonfly mimalloc patch: tell mi_malloc to skip a page during defrag so
+// EVACUATE moves don't refill pages we're trying to drain.
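+// Every call site in this file goes through the SetDefragSkipIfEnabled
+// wrapper below, which adds the --defrag_use_skip_bit runtime gate.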
+void mi_page_set_defrag_skip(uintptr_t page_addr, bool skip);
+}
+
+namespace dfly {
+
+namespace {
+
+constexpr uint64_t kPerTargetSlotCostBytes = 16 * 1024;
+
+uint64_t ReclaimableBytes(uint16_t capacity_blocks, uint16_t used_blocks, uint32_t block_size) {
+  if (used_blocks >= capacity_blocks)
+    return 0;
+  return uint64_t(capacity_blocks - used_blocks) * block_size;
+}
+
+uint64_t MoveBytes(uint16_t used_blocks, uint32_t block_size) {
+  return uint64_t(used_blocks) * block_size;
+}
+
+uint64_t ReclaimableBytes(const TargetPage& target) {
+  return ReclaimableBytes(target.capacity_blocks, target.blocks_at_census, target.block_size);
+}
+
+uint64_t MoveBytes(const TargetPage& target) {
+  return MoveBytes(target.blocks_at_census, target.block_size);
+}
+
+float ComputeRetentionScore(uint16_t capacity_blocks, uint16_t used_blocks, uint32_t block_size,
+                            uint64_t per_block_move_cost_bytes) {
+  const uint64_t reclaim = ReclaimableBytes(capacity_blocks, used_blocks, block_size);
+  const uint64_t move = MoveBytes(used_blocks, block_size);
+  const uint64_t cost =
+      move + uint64_t(used_blocks) * per_block_move_cost_bytes + kPerTargetSlotCostBytes;
+  return static_cast<float>(static_cast<double>(reclaim) / std::max<uint64_t>(1, cost));
+}
+
+void PopulateAgg(PageAgg& agg, const mi_page_usage_stats_t& stat, float score) {
+  agg.page_address = stat.page_address;
+  agg.block_size = static_cast<uint32_t>(stat.block_size);
+  agg.capacity_blocks = stat.capacity;
+  agg.used_blocks = stat.used;
+  agg.flags = stat.flags;
+  ++agg.observed_movable_blocks;
+  ++agg.generation;
+  agg.retention_score = score;
+}
+
+TargetFilterReason ClassifyForTarget(const PageAgg& agg) {
+  if (agg.observed_movable_blocks == 0)
+    return TargetFilterReason::kNoObservedBlocks;
+  if (agg.used_blocks == 0)
+    return TargetFilterReason::kAlreadyEmpty;
+  if (agg.observed_movable_blocks > agg.used_blocks)
+    return TargetFilterReason::kStaleObservation;
+  if (agg.observed_movable_blocks < agg.used_blocks)
+    return TargetFilterReason::kHasImmovableData;
+  return TargetFilterReason::kKeep;
+}
+
+TargetPage MakeTargetPage(const PageAgg& agg) {
+  TargetPage tp;
+  tp.page_address = agg.page_address;
+  tp.block_size = agg.block_size;
+  tp.capacity_blocks = agg.capacity_blocks;
+  tp.blocks_at_census = agg.used_blocks;
+  tp.retention_score_at_census = agg.retention_score;
+  return tp;
+}
+
+// Wrapper around mimalloc's defrag_skip setter. Gated by a runtime flag so
+// experiments can disable the skip-bit logic and observe whether refills on
+// drained pages re-emerge.
+//
+// SHARP EDGE: mi_page_set_defrag_skip writes through the page address as
+// mi_page_t*. If that memory has been unmapped (page retired -> segment
+// freed -> OS reclaim, particularly under Dragonfly's aggressive purge
+// settings) or reused for something else, the write segfaults or silently
+// corrupts unrelated state. The window opens between SELECT_TARGETS adding
+// the page to the plan and ~TargetPlan clearing the bit at end-of-cycle;
+// any external drain-to-empty plus retire-and-unmap during that window
+// makes the page address stale.
+//
+// The EvacDecide success / revalidation paths clear the bit *before* the
+// triggering move runs (so the page is still mapped). The destructor sweep
+// is the riskier site — it touches every plan target unconditionally. To
+// harden, options are: (a) move the skip flag to a per-shard side table
+// keyed by page address, (b) add a mimalloc refcount keeping target pages
+// mapped for the cycle, (c) validate the page is still in a known segment
+// before writing. (a) is the cleanest if it ever bites in production.
+void SetDefragSkipIfEnabled(uintptr_t page_addr, bool skip) {
+  if (absl::GetFlag(FLAGS_defrag_use_skip_bit)) {
+    mi_page_set_defrag_skip(page_addr, skip);
+  }
+}
+
+void AttributeBlockSkip(EvacStats& stats, RevalidationFailureReason reason, uint32_t block_size) {
+  switch (reason) {
+    case RevalidationFailureReason::kHeapMismatch:
+      ++stats.blocks_revalidation_heap_mismatch;
+      stats.bytes_revalidation_heap_mismatch += block_size;
+      break;
+    case RevalidationFailureReason::kActiveMallocPage:
+      ++stats.blocks_revalidation_active_malloc_page;
+      stats.bytes_revalidation_active_malloc_page += block_size;
+      break;
+    case RevalidationFailureReason::kFullPage:
+      ++stats.blocks_revalidation_full_page;
+      stats.bytes_revalidation_full_page += block_size;
+      break;
+    case RevalidationFailureReason::kAboveThreshold:
+      ++stats.blocks_revalidation_above_threshold;
+      stats.bytes_revalidation_above_threshold += block_size;
+      break;
+    case RevalidationFailureReason::kNone:
+      break;
+  }
+}
+
+void RecordFirstFailure(TargetPage* target, EvacStats& stats, RevalidationFailureReason reason,
+                        uint32_t block_size) {
+  target->revalidation_failed = true;
+  target->failure_reason = reason;
+  ++stats.blocks_skipped_revalidation_failed;
+  stats.bytes_skipped_revalidation_failed += block_size;
+  AttributeBlockSkip(stats, reason, block_size);
+  switch (reason) {
+    case RevalidationFailureReason::kHeapMismatch:
+      ++stats.targets_revalidation_heap_mismatch;
+      break;
+    case RevalidationFailureReason::kActiveMallocPage:
+      ++stats.targets_revalidation_active_malloc_page;
+      break;
+    case RevalidationFailureReason::kFullPage:
+      ++stats.targets_revalidation_full_page;
+      break;
+    case RevalidationFailureReason::kAboveThreshold:
+      ++stats.targets_revalidation_above_threshold;
+      break;
+    case RevalidationFailureReason::kNone:
+      break;
+  }
+  ++stats.targets_abandoned_revalidation;
+}
+
+} // namespace
+
+void CensusStats::Merge(const CensusStats& other) {
+  allocations_seen += other.allocations_seen;
+  allocations_recorded += other.allocations_recorded;
+  skipped_above_threshold += other.skipped_above_threshold;
+  skipped_full_page += other.skipped_full_page;
+  skipped_wrong_heap += other.skipped_wrong_heap;
+  skipped_active_malloc_page += other.skipped_active_malloc_page;
+  skipped_low_score += other.skipped_low_score;
+  pages_evicted_from_retained += other.pages_evicted_from_retained;
+  heap_rebuilds += other.heap_rebuilds;
+}
+
+void PlanStats::Merge(const PlanStats& other) {
+  targets_kept += other.targets_kept;
+  filtered_no_observed_blocks += other.filtered_no_observed_blocks;
+  filtered_stale += other.filtered_stale;
+  filtered_has_immovable_data += other.filtered_has_immovable_data;
+  filtered_already_empty += other.filtered_already_empty;
+  truncated_by_cap += other.truncated_by_cap;
+  selected_capacity_bytes_at_census += other.selected_capacity_bytes_at_census;
+  selected_used_bytes_at_census += other.selected_used_bytes_at_census;
+  selected_reclaimable_bytes_at_census += other.selected_reclaimable_bytes_at_census;
+  truncated_reclaimable_bytes += other.truncated_reclaimable_bytes;
+  filtered_immovable_reclaimable_bytes += other.filtered_immovable_reclaimable_bytes;
+}
+
+void EvacStats::Merge(const EvacStats& other) {
+  blocks_skipped_not_target += other.blocks_skipped_not_target;
+  blocks_skipped_target_done += other.blocks_skipped_target_done;
+  blocks_skipped_revalidation_failed += other.blocks_skipped_revalidation_failed;
+  blocks_move_committed += other.blocks_move_committed;
+  bytes_skipped_target_done += other.bytes_skipped_target_done;
+  bytes_skipped_revalidation_failed += other.bytes_skipped_revalidation_failed;
+  bytes_move_committed += other.bytes_move_committed;
+  targets_revalidation_heap_mismatch += other.targets_revalidation_heap_mismatch;
+  targets_revalidation_active_malloc_page += other.targets_revalidation_active_malloc_page;
+  targets_revalidation_full_page += other.targets_revalidation_full_page;
+  targets_revalidation_above_threshold += other.targets_revalidation_above_threshold;
+  blocks_revalidation_heap_mismatch += other.blocks_revalidation_heap_mismatch;
+  blocks_revalidation_active_malloc_page += other.blocks_revalidation_active_malloc_page;
+  blocks_revalidation_full_page += other.blocks_revalidation_full_page;
+  blocks_revalidation_above_threshold += other.blocks_revalidation_above_threshold;
+  bytes_revalidation_heap_mismatch += other.bytes_revalidation_heap_mismatch;
+  bytes_revalidation_active_malloc_page += other.bytes_revalidation_active_malloc_page;
+  bytes_revalidation_full_page += other.bytes_revalidation_full_page;
+  bytes_revalidation_above_threshold += other.bytes_revalidation_above_threshold;
+  targets_abandoned_revalidation += other.targets_abandoned_revalidation;
+  targets_completed_during_evac += other.targets_completed_during_evac;
+}
+
+PageCensus::PageCensus(CensusStats* stats, size_t max_retained_pages,
+                       uint64_t per_block_move_cost_bytes)
+    : stats_(stats),
+      max_retained_pages_(max_retained_pages),
+      per_block_move_cost_bytes_(per_block_move_cost_bytes) {
+}
+
+void PageCensus::Observe(const mi_page_usage_stats_t& stat, uint64_t bucket_cursor) {
+  ++stats_->allocations_seen;
+
+  if (stat.flags & MI_DFLY_HEAP_MISMATCH) {
+    ++stats_->skipped_wrong_heap;
+    return;
+  }
+  if (stat.flags & MI_DFLY_PAGE_USED_FOR_MALLOC) {
+    ++stats_->skipped_active_malloc_page;
+    return;
+  }
+  if (stat.flags & MI_DFLY_PAGE_FULL) {
+    ++stats_->skipped_full_page;
+    return;
+  }
+  if ((stat.flags & MI_DFLY_PAGE_BELOW_THRESHOLD) == 0) {
+    ++stats_->skipped_above_threshold;
+    return;
+  }
+
+  // Object lives on a candidate page; remember its bucket so EVACUATE can
+  // skip buckets that contain no candidates at all.
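+  // (Cursor-less callers pass the default bucket_cursor of 0, so the hint
+  // set may also carry a single stray 0 entry.)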
+  cursor_hints_.insert(bucket_cursor);
+
+  const float new_score = ComputeRetentionScore(
+      stat.capacity, stat.used, static_cast<uint32_t>(stat.block_size), per_block_move_cost_bytes_);
+
+  if constexpr (kEnableTopK) {
+    auto add_entry = [&] {
+      PageAgg& agg = pages_[stat.page_address];
+      PopulateAgg(agg, stat, new_score);
+      worst_retained_.push({stat.page_address, new_score, agg.generation});
+      ++stats_->allocations_recorded;
+    };
+
+    if (auto it = pages_.find(stat.page_address); it != pages_.end()) {
+      PopulateAgg(it->second, stat, new_score);
+      worst_retained_.push({stat.page_address, new_score, it->second.generation});
+      ++stats_->allocations_recorded;
+    } else if (pages_.size() < max_retained_pages_) {
+      add_entry();
+    } else {
+      while (!worst_retained_.empty()) {
+        const HeapEntry top = worst_retained_.top();
+        auto evict_it = pages_.find(top.page_address);
+        if (evict_it == pages_.end() || evict_it->second.generation != top.generation) {
+          worst_retained_.pop();
+          continue;
+        }
+        if (new_score > top.score) {
+          worst_retained_.pop();
+          pages_.erase(evict_it);
+          ++stats_->pages_evicted_from_retained;
+          add_entry();
+        } else {
+          ++stats_->skipped_low_score;
+        }
+        break;
+      }
+    }
+
+    if (worst_retained_.size() > 2 * max_retained_pages_) {
+      RebuildHeap();
+      ++stats_->heap_rebuilds;
+    }
+  } else {
+    if (auto it = pages_.find(stat.page_address); it != pages_.end()) {
+      PopulateAgg(it->second, stat, new_score);
+    } else {
+      CHECK_LT(pages_.size(), max_retained_pages_)
+          << "PageCensus exceeded max_retained_pages_=" << max_retained_pages_
+          << " with kEnableTopK=false";
+      PageAgg& agg = pages_[stat.page_address];
+      PopulateAgg(agg, stat, new_score);
+    }
+    ++stats_->allocations_recorded;
+  }
+}
+
+void PageCensus::ObservePage(const mi_page_usage_stats_t& stat) {
+  ++stats_->allocations_seen;
+
+  if ((stat.flags & MI_DFLY_PAGE_BELOW_THRESHOLD) == 0) {
+    ++stats_->skipped_above_threshold;
+    return;
+  }
+
+  const float new_score = ComputeRetentionScore(
+      stat.capacity, stat.used, static_cast<uint32_t>(stat.block_size), per_block_move_cost_bytes_);
+
+  if constexpr (!kEnableTopK) {
+    if (!pages_.contains(stat.page_address)) {
+      CHECK_LT(pages_.size(), max_retained_pages_)
+          << "PageCensus exceeded max_retained_pages_=" << max_retained_pages_
+          << " with kEnableTopK=false";
+    }
+  }
+
+  PageAgg& agg = pages_[stat.page_address];
+  agg.page_address = stat.page_address;
+  agg.block_size = static_cast<uint32_t>(stat.block_size);
+  agg.capacity_blocks = stat.capacity;
+  agg.used_blocks = stat.used;
+  agg.flags = stat.flags;
+  // No per-object visibility on this path; assume every used block is movable.
+  // EVAC's per-page revalidation drops pages whose blocks turn out immovable.
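+  // With observed == used, ClassifyForTarget can only yield kKeep here, or
+  // kNoObservedBlocks for a page that was already empty when observed.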
+  agg.observed_movable_blocks = stat.used;
+  agg.generation = 1;
+  agg.retention_score = new_score;
+
+  ++stats_->allocations_recorded;
+
+  if constexpr (kEnableTopK) {
+    worst_retained_.push({stat.page_address, new_score, agg.generation});
+    if (worst_retained_.size() > 2 * max_retained_pages_) {
+      RebuildHeap();
+      ++stats_->heap_rebuilds;
+    }
+  }
+}
+
+void PageCensus::RebuildHeap() {
+  std::vector<HeapEntry> entries;
+  entries.reserve(pages_.size());
+  for (const auto& [addr, agg] : pages_) {
+    entries.push_back({addr, agg.retention_score, agg.generation});
+  }
+  worst_retained_ = std::priority_queue(WorseFirst{}, std::move(entries));
+}
+
+std::vector<uint64_t> PageCensus::TakeCursorHints() {
+  std::vector<uint64_t> out(cursor_hints_.begin(), cursor_hints_.end());
+  cursor_hints_.clear();
+  std::ranges::sort(out);
+  return out;
+}
+
+TargetPlan::TargetPlan(PlanStats* stats) : stats_(stats) {
+}
+
+TargetPlan::~TargetPlan() {
+  // Clear the mimalloc defrag_skip bit on every active target so the page
+  // becomes eligible for new allocations again. Tail entries are not marked
+  // (only active plan entries are), so they need no clear.
+  for (const TargetPage& tp : targets_) {
+    SetDefragSkipIfEnabled(tp.page_address, false);
+  }
+}
+
+void TargetPlan::BuildFrom(const PageCensus& census, size_t max_targets) {
+  targets_.clear();
+  address_to_index_.clear();
+  *stats_ = PlanStats{};
+
+  std::vector<TargetPage> candidates;
+  candidates.reserve(census.pages().size());
+
+  for (const auto& agg : census.pages() | std::views::values) {
+    switch (ClassifyForTarget(agg)) {
+      case TargetFilterReason::kKeep:
+        candidates.push_back(MakeTargetPage(agg));
+        break;
+      case TargetFilterReason::kNoObservedBlocks:
+        ++stats_->filtered_no_observed_blocks;
+        break;
+      case TargetFilterReason::kAlreadyEmpty:
+        ++stats_->filtered_already_empty;
+        break;
+      case TargetFilterReason::kStaleObservation:
+        ++stats_->filtered_stale;
+        break;
+      case TargetFilterReason::kHasImmovableData:
+        ++stats_->filtered_has_immovable_data;
+        stats_->filtered_immovable_reclaimable_bytes +=
+            uint64_t(agg.capacity_blocks - agg.used_blocks) * agg.block_size;
+        break;
+    }
+  }
+
+  std::ranges::sort(candidates, [](const TargetPage& a, const TargetPage& b) {
+    if (a.retention_score_at_census != b.retention_score_at_census)
+      return a.retention_score_at_census > b.retention_score_at_census;
+    const uint64_t a_reclaim = ReclaimableBytes(a);
+    const uint64_t b_reclaim = ReclaimableBytes(b);
+    if (a_reclaim != b_reclaim)
+      return a_reclaim > b_reclaim;
+    const uint64_t a_move = MoveBytes(a);
+    const uint64_t b_move = MoveBytes(b);
+    if (a_move != b_move)
+      return a_move < b_move;
+    return a.page_address < b.page_address;
+  });
+
+  max_targets = std::min(max_targets, candidates.size());
+  if (candidates.size() > max_targets) {
+    stats_->truncated_by_cap = candidates.size() - max_targets;
+    for (size_t i = max_targets; i < candidates.size(); ++i) {
+      const TargetPage& tp = candidates[i];
+      stats_->truncated_reclaimable_bytes +=
+          uint64_t(tp.capacity_blocks - tp.blocks_at_census) * tp.block_size;
+    }
+    candidates.resize(max_targets);
+  }
+
+  targets_ = std::move(candidates);
+  address_to_index_.reserve(targets_.size());
+  // Selective skip: targets_ is sorted descending by retention_score, which
+  // correlates inversely with used/capacity. Head of the vector = most
+  // fragmented. Apply skip_bit only to the top fraction so high-utilization
+  // targets stay refillable, shrinking lockout pressure on the workload.
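+  // Worked example: 200 targets at skip_percentile 0.5 gives skip_count 100,
+  // so indices [0, 100) get the skip bit and [100, 200) stay refillable.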
+  const double skip_pct = std::clamp(absl::GetFlag(FLAGS_defrag_skip_percentile), 0.0, 1.0);
+  const size_t skip_count = static_cast<size_t>(static_cast<double>(targets_.size()) * skip_pct);
+  for (size_t i = 0; i < targets_.size(); ++i) {
+    address_to_index_[targets_[i].page_address] = i;
+    if (i < skip_count) {
+      // Tell mimalloc to skip this page in alloc paths; EVACUATE moves should
+      // not refill pages we are about to drain.
+      SetDefragSkipIfEnabled(targets_[i].page_address, true);
+    }
+  }
+  stats_->targets_kept = targets_.size();
+  pending_targets_ = targets_.size();
+
+  for (const TargetPage& tp : targets_) {
+    stats_->selected_capacity_bytes_at_census += uint64_t(tp.capacity_blocks) * tp.block_size;
+    stats_->selected_used_bytes_at_census += uint64_t(tp.blocks_at_census) * tp.block_size;
+    stats_->selected_reclaimable_bytes_at_census +=
+        uint64_t(tp.capacity_blocks - tp.blocks_at_census) * tp.block_size;
+  }
+}
+
+bool TargetPlan::Contains(uintptr_t addr) const {
+  return address_to_index_.contains(addr);
+}
+
+const TargetPage* TargetPlan::Find(uintptr_t addr) const {
+  const auto it = address_to_index_.find(addr);
+  return it == address_to_index_.end() ? nullptr : &targets_[it->second];
+}
+
+TargetPage* TargetPlan::FindMut(uintptr_t addr) {
+  const auto it = address_to_index_.find(addr);
+  return it == address_to_index_.end() ? nullptr : &targets_[it->second];
+}
+
+EvacOutcome EvacDecide(TargetPlan& plan, TargetPage* target, const mi_page_usage_stats_t& stat,
+                       EvacStats& stats) {
+  if (target->revalidation_failed) {
+    ++stats.blocks_skipped_revalidation_failed;
+    stats.bytes_skipped_revalidation_failed += stat.block_size;
+    AttributeBlockSkip(stats, target->failure_reason, stat.block_size);
+    return EvacOutcome::kRevalidationFailed;
+  }
+  // Order matches the precedence in CENSUS skip-checks. The first matching
+  // flag is attributed; targets_revalidation_* sums to targets_abandoned_revalidation.
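+  // A failure recorded below is sticky for the rest of the cycle: later
+  // blocks on the same page take the revalidation_failed fast path above.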
+  if (stat.flags & MI_DFLY_HEAP_MISMATCH) {
+    RecordFirstFailure(target, stats, RevalidationFailureReason::kHeapMismatch, stat.block_size);
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    return EvacOutcome::kRevalidationFailed;
+  }
+  if (stat.flags & MI_DFLY_PAGE_USED_FOR_MALLOC) {
+    RecordFirstFailure(target, stats, RevalidationFailureReason::kActiveMallocPage,
+                       stat.block_size);
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    return EvacOutcome::kRevalidationFailed;
+  }
+  if (stat.flags & MI_DFLY_PAGE_FULL) {
+    RecordFirstFailure(target, stats, RevalidationFailureReason::kFullPage, stat.block_size);
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    return EvacOutcome::kRevalidationFailed;
+  }
+  if ((stat.flags & MI_DFLY_PAGE_BELOW_THRESHOLD) == 0) {
+    RecordFirstFailure(target, stats, RevalidationFailureReason::kAboveThreshold, stat.block_size);
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    return EvacOutcome::kRevalidationFailed;
+  }
+  if (target->blocks_evacuated >= target->blocks_at_census) {
+    ++stats.blocks_skipped_target_done;
+    stats.bytes_skipped_target_done += stat.block_size;
+    return EvacOutcome::kTargetAlreadyDone;
+  }
+  ++target->blocks_evacuated;
+  ++stats.blocks_move_committed;
+  stats.bytes_move_committed += stat.block_size;
+  if (target->blocks_evacuated == target->blocks_at_census) {
+    // First-time completion: drop the skip bit so the now-drained page can be
+    // reused by mi_malloc immediately if needed (no need to wait for plan
+    // teardown).
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    ++stats.targets_completed_during_evac;
+  }
+  return EvacOutcome::kCommitMove;
+}
+
+EvacOutcome EvacDecide(TargetPlan& plan, const mi_page_usage_stats_t& stat, EvacStats& stats) {
+  TargetPage* target = plan.FindMut(stat.page_address);
+  if (target == nullptr) {
+    ++stats.blocks_skipped_not_target;
+    return EvacOutcome::kNotATarget;
+  }
+  return EvacDecide(plan, target, stat, stats);
+}
+
+CensusTaker::CensusTaker(PageCensus* census, float threshold, CycleQuota quota)
+    : PageUsage(CollectPageStats::NO, threshold, quota), census_(census), threshold_(threshold) {
+}
+
+bool CensusTaker::IsPageForObjectUnderUtilized(void* object) {
+  mi_page_usage_stats_t stat = mi_heap_page_is_underutilized(
+      static_cast<mi_heap_t*>(zmalloc_heap), object, threshold_, /*collect_stats=*/true);
+  census_->Observe(stat, current_cursor_);
+  return false;
+}
+
+bool CensusTaker::IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) {
+  mi_page_usage_stats_t stat =
+      mi_heap_page_is_underutilized(heap, object, threshold_, /*collect_stats=*/true);
+  census_->Observe(stat, current_cursor_);
+  return false;
+}
+
+bool CensusTaker::ShouldDefragKeys() const {
+  return ::absl::GetFlag(FLAGS_defrag_keys);
+}
+
+Evacuator::Evacuator(TargetPlan* plan, float threshold, EvacStats* evac_stats, CycleQuota quota)
+    : PageUsage(CollectPageStats::NO, threshold, quota),
+      plan_(plan),
+      threshold_(threshold),
+      evac_stats_(evac_stats) {
+}
+
+bool Evacuator::ShouldDefragKeys() const {
+  return ::absl::GetFlag(FLAGS_defrag_keys);
+}
+
+bool Evacuator::IsPageForObjectUnderUtilized(void* object) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(object));
+  TargetPage* target = plan_->FindMut(addr);
+  if (target == nullptr) {
+    ++evac_stats_->blocks_skipped_not_target;
+    return false;
+  }
+  const mi_page_usage_stats_t stat = mi_heap_page_is_underutilized(
+      static_cast<mi_heap_t*>(zmalloc_heap), object, threshold_, /*collect_stats=*/true);
+  return EvacDecide(*plan_, target, stat, *evac_stats_) == EvacOutcome::kCommitMove;
+}
+
+bool Evacuator::IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(object));
+  TargetPage* target = plan_->FindMut(addr);
+  if (target == nullptr) {
+    ++evac_stats_->blocks_skipped_not_target;
+    return false;
+  }
+  const mi_page_usage_stats_t stat =
+      mi_heap_page_is_underutilized(heap, object, threshold_, /*collect_stats=*/true);
+  return EvacDecide(*plan_, target, stat, *evac_stats_) == EvacOutcome::kCommitMove;
+}
+
+} // namespace dfly
diff --git a/src/core/page_usage/page_usage_visitors.h b/src/core/page_usage/page_usage_visitors.h
new file mode 100644
index 000000000000..01da85fe6e56
--- /dev/null
+++ b/src/core/page_usage/page_usage_visitors.h
@@ -0,0 +1,338 @@
+// Copyright 2026, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+#include <absl/container/flat_hash_set.h>
+
+#include <cstdint>
+#include <limits>
+#include <queue>
+#include <vector>
+
+#include "core/page_usage/page_usage_stats.h"
+
+extern "C" {
+#include "redis/zmalloc.h"
+mi_page_usage_stats_t mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio,
+                                                    bool collect_stats);
+}
+
+namespace dfly {
+
+struct PageAgg {
+  uintptr_t page_address = 0;
+
+  uint32_t block_size = 0;
+  float retention_score = 0.0f;
+
+  uint16_t capacity_blocks = 0;
+  uint16_t used_blocks = 0;
+  uint16_t observed_movable_blocks = 0;
+  uint16_t generation = 0;
+
+  uint8_t flags = 0;
+};
+
+struct CensusStats {
+  uint64_t allocations_seen = 0;
+  uint64_t allocations_recorded = 0;
+  uint64_t skipped_above_threshold = 0;
+  uint64_t skipped_full_page = 0;
+  uint64_t skipped_wrong_heap = 0;
+  uint64_t skipped_active_malloc_page = 0;
+  uint64_t skipped_low_score = 0;
+
+  // Top-K bookkeeping
+  uint64_t pages_evicted_from_retained = 0;
+  uint64_t heap_rebuilds = 0;
+
+  void Merge(const CensusStats& other);
+};
+
+class PageCensus {
+ public:
+  static constexpr size_t kDefaultMaxRetainedPages = 300'000;
+
+  // When false, PageCensus skips the top-k heap entirely and just inserts every
+  // observed page into pages_. Cheap (no priority_queue work per Observe), but
+  // loses the cap-and-evict-worst guard: the map hard-crashes on a new entry
+  // once it reaches max_retained_pages_. Flip to true to restore heap-based
+  // eviction so workloads exceeding the cap stay bounded.
+  static constexpr bool kEnableTopK = false;
+
+  explicit PageCensus(CensusStats* stats, size_t max_retained_pages = kDefaultMaxRetainedPages,
+                      uint64_t per_block_move_cost_bytes = 256);
+
+  // bucket_cursor is the DashTable cursor of the bucket the observed object
+  // currently lives in. Recorded so EVACUATE can restrict its walk to buckets
+  // known to contain at least one candidate object. Pass 0 if unknown
+  // (callers outside the hot defrag path may not have a cursor).
+  void Observe(const mi_page_usage_stats_t& stat, uint64_t bucket_cursor = 0);
+
+  // Page-level observe used by the underutil-set fast path: caller has already
+  // verified the page is in-heap, non-full, and below threshold. We don't have
+  // per-object visibility, so we assume every used block is movable and let
+  // EVACUATE's per-page revalidation correct any wrongly-classified entries.
+  // No bucket cursor is recorded (cursor_hints stays empty in this path).
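+  // (The expected feeder is the per-thread underutil page set maintained in
+  // server/defrag.cc, though any caller holding page-level stats can use it.)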
+  void ObservePage(const mi_page_usage_stats_t& stat);
+
+  const CensusStats& stats() const {
+    return *stats_;
+  }
+
+  const absl::flat_hash_map<uintptr_t, PageAgg>& pages() const {
+    return pages_;
+  }
+
+  const absl::flat_hash_set<uint64_t>& cursor_hints() const {
+    return cursor_hints_;
+  }
+
+  // Move-out accessor: lets SELECT_TARGETS hand the hint set off to
+  // DefragTaskState before the census itself is released. Returns a sorted
+  // vector so the EVACUATE walker can iterate deterministically and resume
+  // from a saved cursor index across DoDefrag invocations.
+  std::vector<uint64_t> TakeCursorHints();
+
+ private:
+  void RebuildHeap();
+
+  struct HeapEntry {
+    uintptr_t page_address;
+    float score;
+    uint16_t generation;
+  };
+
+  struct WorseFirst {
+    bool operator()(const HeapEntry& a, const HeapEntry& b) const {
+      return a.score > b.score;
+    }
+  };
+
+  absl::flat_hash_map<uintptr_t, PageAgg> pages_;
+  std::priority_queue<HeapEntry, std::vector<HeapEntry>, WorseFirst> worst_retained_;
+  // Buckets that observed at least one object on a candidate (under-threshold)
+  // page. Consumed by EVACUATE to skip buckets with no targets.
+  absl::flat_hash_set<uint64_t> cursor_hints_;
+  CensusStats* stats_;
+  size_t max_retained_pages_;
+  uint64_t per_block_move_cost_bytes_;
+};
+
+enum class TargetStatus : uint8_t {
+  kPending,
+  kSuccess,
+  kPartial,
+  kFailed,
+};
+
+enum class TargetFilterReason : uint8_t {
+  kKeep,
+  kNoObservedBlocks,   // observed_movable_blocks == 0 (defensive)
+  kStaleObservation,   // observed_movable_blocks > used_blocks
+  kHasImmovableData,   // observed_movable_blocks < used_blocks (non-movables pin the page)
+  kAlreadyEmpty,       // used_blocks == 0
+};
+
+enum class RevalidationFailureReason : uint8_t {
+  kNone = 0,
+  kHeapMismatch,
+  kActiveMallocPage,
+  kFullPage,
+  kAboveThreshold,
+};
+
+struct TargetPage {
+  // Snapshot from census (immutable after BuildFrom).
+  uintptr_t page_address = 0;
+  uint32_t block_size = 0;
+  uint16_t capacity_blocks = 0;
+  uint16_t blocks_at_census = 0;  // used_blocks at census time
+  float retention_score_at_census = 0.0f;
+
+  // Mutated during EVACUATE.
+  uint16_t blocks_evacuated = 0;
+  uint16_t evacuation_failures = 0;
+  TargetStatus status = TargetStatus::kPending;
+  bool revalidation_failed = false;
+  // Set on the first revalidation failure; consulted on sticky-skip branches
+  // to attribute subsequent block/byte skips to the originating reason.
+  RevalidationFailureReason failure_reason = RevalidationFailureReason::kNone;
+};
+
+struct PlanStats {
+  uint64_t targets_kept = 0;
+  uint64_t filtered_no_observed_blocks = 0;
+  uint64_t filtered_stale = 0;
+  uint64_t filtered_has_immovable_data = 0;
+  uint64_t filtered_already_empty = 0;
+  uint64_t truncated_by_cap = 0;  // pages dropped because over max_targets
+
+  uint64_t selected_capacity_bytes_at_census = 0;
+  uint64_t selected_used_bytes_at_census = 0;
+  uint64_t selected_reclaimable_bytes_at_census = 0;
+
+  uint64_t truncated_reclaimable_bytes = 0;
+  uint64_t filtered_immovable_reclaimable_bytes = 0;
+
+  void Merge(const PlanStats& other);
+};
+
+enum class EvacOutcome : uint8_t {
+  kNotATarget,          // page is not in the plan
+  kTargetAlreadyDone,   // target's blocks_evacuated already at blocks_at_census
+  kRevalidationFailed,  // page state shifted (full / above threshold / heap mismatch / etc.)
+  kCommitMove,          // caller should perform the move; counter pre-bumped
+};
+
+struct EvacStats {
+  uint64_t blocks_skipped_not_target = 0;
+  uint64_t blocks_skipped_target_done = 0;
+  uint64_t blocks_skipped_revalidation_failed = 0;
+
+  uint64_t blocks_move_committed = 0;
+
+  uint64_t bytes_skipped_target_done = 0;
+  uint64_t bytes_skipped_revalidation_failed = 0;
+  uint64_t bytes_move_committed = 0;
+
+  uint64_t targets_revalidation_heap_mismatch = 0;
+  uint64_t targets_revalidation_active_malloc_page = 0;
+  uint64_t targets_revalidation_full_page = 0;
+  uint64_t targets_revalidation_above_threshold = 0;
+
+  // Block/byte breakdown of revalidation skips by originating reason. Sums to
+  // blocks_skipped_revalidation_failed / bytes_skipped_revalidation_failed.
+  uint64_t blocks_revalidation_heap_mismatch = 0;
+  uint64_t blocks_revalidation_active_malloc_page = 0;
+  uint64_t blocks_revalidation_full_page = 0;
+  uint64_t blocks_revalidation_above_threshold = 0;
+  uint64_t bytes_revalidation_heap_mismatch = 0;
+  uint64_t bytes_revalidation_active_malloc_page = 0;
+  uint64_t bytes_revalidation_full_page = 0;
+  uint64_t bytes_revalidation_above_threshold = 0;
+
+  uint64_t targets_abandoned_revalidation = 0;
+  uint64_t targets_completed_during_evac = 0;
+
+  void Merge(const EvacStats& other);
+};
+
+class TargetPlan {
+ public:
+  explicit TargetPlan(PlanStats* stats);
+  ~TargetPlan();
+
+  // Non-copyable, non-movable: destructor clears mimalloc defrag_skip bits on
+  // active targets, so move-from would double-clear (harmless) but copies
+  // would set bits this object doesn't own.
+  TargetPlan(const TargetPlan&) = delete;
+  TargetPlan& operator=(const TargetPlan&) = delete;
+  TargetPlan(TargetPlan&&) = delete;
+  TargetPlan& operator=(TargetPlan&&) = delete;
+
+  // Default `max_targets` is effectively unlimited; selective skip-bit (top
+  // skip_pct fraction) bounds lockout pressure. Tests pass explicit small
+  // values to exercise the cap path.
+  void BuildFrom(const PageCensus& census,
+                 size_t max_targets = std::numeric_limits<size_t>::max());
+
+  const std::vector<TargetPage>& targets() const {
+    return targets_;
+  }
+
+  const PlanStats& stats() const {
+    return *stats_;
+  }
+
+  bool Contains(uintptr_t addr) const;
+
+  const TargetPage* Find(uintptr_t addr) const;
+  TargetPage* FindMut(uintptr_t addr);
+
+  size_t size() const {
+    return targets_.size();
+  }
+
+  bool empty() const {
+    return targets_.empty();
+  }
+
+  bool AllTargetsDone() const {
+    return pending_targets_ == 0;
+  }
+
+  void NotifyTargetDone() {
+    --pending_targets_;
+  }
+
+ private:
+  std::vector<TargetPage> targets_;  // sorted by retention_score desc
+  absl::flat_hash_map<uintptr_t, size_t> address_to_index_;
+  PlanStats* stats_;
+  size_t pending_targets_ = 0;
+};
+
+// Hot-path variant: caller has already resolved the target (e.g. via
+// plan.FindMut). Must be non-null. Skips the redundant lookup that the
+// 3-arg variant otherwise performs.
+EvacOutcome EvacDecide(TargetPlan& plan, TargetPage* target, const mi_page_usage_stats_t& stat,
+                       EvacStats& stats);
+
+// Convenience variant: looks up the target from stat.page_address. Returns
+// kNotATarget on miss. Kept for test ergonomics; production callers should
+// prefer the 4-arg form so they can fold the lookup with their own
+// fast-path checks.
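+// Fold sketch, mirroring Evacuator::IsPageForObjectUnderUtilized:
+//   TargetPage* t = plan.FindMut(addr);
+//   if (t == nullptr) { ++stats.blocks_skipped_not_target; return false; }
+//   const mi_page_usage_stats_t stat = ...;  // per-page revalidation stats
+//   return EvacDecide(plan, t, stat, stats) == EvacOutcome::kCommitMove;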
+EvacOutcome EvacDecide(TargetPlan& plan, const mi_page_usage_stats_t& stat, EvacStats& stats);
+
+class CensusTaker final : public PageUsage {
+ public:
+  CensusTaker(PageCensus* census, float threshold, CycleQuota quota = CycleQuota::Unlimited());
+
+  // Override to call into mimalloc and feed the resulting stats into the
+  // census; CensusTaker never reallocates, so the return is always false.
+  bool IsPageForObjectUnderUtilized(void* object) override;
+  bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) override;
+
+  bool IsReadOnly() const final {
+    return true;
+  }
+
+  bool ShouldDefragKeys() const final;
+
+  void SetCurrentBucketCursor(uint64_t cursor) final {
+    current_cursor_ = cursor;
+  }
+
+ private:
+  PageCensus* census_;
+  float threshold_;
+  uint64_t current_cursor_ = 0;
+};
+
+class Evacuator final : public PageUsage {
+ public:
+  Evacuator(TargetPlan* plan, float threshold, EvacStats* evac_stats,
+            CycleQuota quota = CycleQuota::Unlimited());
+
+  // Override to filter through the plan: a per-object hashmap lookup short-
+  // circuits the comparatively expensive mi_heap_page_is_underutilized call
+  // when the object isn't on a target page. On a hit, performs the call and
+  // then EvacDecide.
+  bool IsPageForObjectUnderUtilized(void* object) override;
+  bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) override;
+
+  bool ShouldDefragKeys() const final;
+
+  bool ShouldStop() const final {
+    return plan_->AllTargetsDone();
+  }
+
+ private:
+  TargetPlan* plan_;
+  float threshold_;
+  EvacStats* evac_stats_;
+};
+
+} // namespace dfly
diff --git a/src/core/page_usage_stats_test.cc b/src/core/page_usage_stats_test.cc
index fec2a7287b08..10c60cd0ab43 100644
--- a/src/core/page_usage_stats_test.cc
+++ b/src/core/page_usage_stats_test.cc
@@ -49,7 +49,7 @@ std::string GenerateTestJSON(size_t num_objects) {
 }
 
 // Helper to defragment only if a randomly generated value is less than preset probability. For
-// benchmarking realistic situations, where some nodes are fragmented and others are not
+// benchmarking realistic situations, where some nodes are fragmented and others are not.
 class SelectiveDefragment : public PageUsage {
  public:
  explicit SelectiveDefragment(const double fragmentation_probability)
diff --git a/src/external_libs.cmake b/src/external_libs.cmake
index e3e270dc5020..d879ac9f0efe 100644
--- a/src/external_libs.cmake
+++ b/src/external_libs.cmake
@@ -77,6 +77,8 @@ ExternalProject_Add(mimalloc2_project
   COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/2_return_stat.patch
   COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/3_track_full_size.patch
   COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/4_fix_heap_collect.patch
+  COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/5_skip_defrag_targets.patch
+  COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/6_dfly_underutil_callback.patch
 
   BUILD_COMMAND make mimalloc-static
   INSTALL_COMMAND make install
diff --git a/src/redis/zmalloc.h b/src/redis/zmalloc.h
index eb53e169a9c9..ef7fee9bffad 100644
--- a/src/redis/zmalloc.h
+++ b/src/redis/zmalloc.h
@@ -148,6 +148,11 @@ char* zstrdup(const char* s);
 void init_zmalloc_threadlocal(void* heap);
 
 extern __thread ssize_t zmalloc_used_memory_tl;
+// The heap zmalloc operates on. Exposed so C++ hot-path callers can avoid
+// the zmalloc_page_is_underutilized wrapper indirection (~3.7ns/call) and
+// invoke mi_heap_page_is_underutilized directly.
Treat as opaque (cast to +// mi_heap_t*). +extern __thread void* zmalloc_heap; #undef __zm_str #undef __xstr diff --git a/src/redis/zmalloc_mi.c b/src/redis/zmalloc_mi.c index 8209b9d7d35e..5b22cdcde2a1 100644 --- a/src/redis/zmalloc_mi.c +++ b/src/redis/zmalloc_mi.c @@ -13,7 +13,10 @@ #include "zmalloc.h" __thread ssize_t zmalloc_used_memory_tl = 0; -__thread mi_heap_t* zmalloc_heap = NULL; +// Linkage matches the extern declaration in zmalloc.h, which uses void* to +// avoid pulling mimalloc headers into every C++ TU. The actual stored +// pointer is always a mi_heap_t*. +__thread void* zmalloc_heap = NULL; mi_page_usage_stats_t mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio, bool collect_stats); @@ -175,12 +178,13 @@ int zmalloc_get_allocator_wasted_blocks(float ratio, size_t* allocated, size_t* // Implemented based on this mimalloc code: // https://github.com/microsoft/mimalloc/blob/main/src/heap.c#L27 int zmalloc_get_allocator_fragmentation_step(float ratio, struct fragmentation_info* info) { - if (zmalloc_heap->page_count == 0 || info->bin >= MI_BIN_FULL) { + mi_heap_t* heap = (mi_heap_t*)zmalloc_heap; + if (heap->page_count == 0 || info->bin >= MI_BIN_FULL) { // We avoid iterating over full pages since they are fully utilized. return 0; } - mi_page_queue_t* pq = &zmalloc_heap->pages[info->bin]; + mi_page_queue_t* pq = &heap->pages[info->bin]; const mi_page_t* page = pq->first; while (page != NULL) { const mi_page_t* next = page->next; @@ -205,11 +209,11 @@ int zmalloc_get_allocator_fragmentation_step(float ratio, struct fragmentation_i info->committed_golden = info->committed; // Add total comitted size of MI_BIN_FULL that we do not traverse // as its tracked by zmalloc_heap->full_page_size variable. - info->committed += zmalloc_heap->full_page_size; + info->committed += heap->full_page_size; // TODO: it's a test code that makes sure `full_page_size` is correct. // Remove it once we are confident with the implementation. - mi_page_queue_t* pq = &zmalloc_heap->pages[MI_BIN_FULL]; + mi_page_queue_t* pq = &heap->pages[MI_BIN_FULL]; const mi_page_t* page = pq->first; while (page != NULL) { info->committed_golden += page->capacity * page->block_size; diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt index f7bbd89d80b5..a05899110f02 100644 --- a/src/server/CMakeLists.txt +++ b/src/server/CMakeLists.txt @@ -82,6 +82,7 @@ endif() # Optionally include tiered_storage which interfaces with tiering_module add_library(dragonfly_lib + defrag.cc engine_shard.cc engine_shard_set.cc config_registry.cc conn_context.cc debugcmd.cc dflycmd.cc error.cc family_utils.cc string_stats.cc ${DF_SEARCH_SRCS} @@ -166,6 +167,7 @@ helio_cxx_test(cluster/cluster_family_test dfly_test_lib LABELS DFLY) helio_cxx_test(acl/acl_family_test dfly_test_lib LABELS DFLY) helio_cxx_test(engine_shard_set_test dfly_test_lib LABELS DFLY) helio_cxx_test(serializer_base_test dfly_test_lib LABELS DFLY) +helio_cxx_test(defrag_test dfly_test_lib LABELS DFLY) add_dependencies(check_dfly dragonfly_test json_family_test list_family_test generic_family_test memcache_parser_test rdb_test journal_test diff --git a/src/server/defrag.cc b/src/server/defrag.cc new file mode 100644 index 000000000000..f17b8f5e41af --- /dev/null +++ b/src/server/defrag.cc @@ -0,0 +1,671 @@ +// Copyright 2026, DragonflyDB authors. All rights reserved. +// See LICENSE for licensing terms. 
+//
+
+#include "server/defrag.h"
+
+#include <absl/container/flat_hash_set.h>
+#include <absl/strings/str_cat.h>
+#include <absl/strings/str_format.h>
+#include <absl/time/clock.h>
+
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "base/flags.h"
+#include "base/logging.h"
+#include "core/page_usage/page_usage_visitors.h"
+
+ABSL_FLAG(uint64_t, defrag_min_plan_reclaimable_bytes, 64u << 20,
+          "Minimum bytes-reclaimable threshold the SELECT_TARGETS plan must hit to "
+          "justify running EVACUATE. Below this, the cycle is skipped (PLAN_SKIPPED) "
+          "and we return to IDLE without walking the dashtable. The underutil set is "
+          "left intact so the next cycle picks it up if churn refills the pages above "
+          "threshold (or new fragmentation appears). Default 64 MiB.");
+
+extern "C" {
+#include "redis/zmalloc.h"
+// Dragonfly mimalloc patch: per-process callback fired by mi_free_block_local
+// when a page's used count crosses below the configured underutil threshold.
+typedef void (*mi_dfly_underutil_callback_t)(uintptr_t page_addr);
+void mi_dfly_set_underutil_callback(mi_dfly_underutil_callback_t cb);
+void mi_dfly_set_underutil_threshold_pct(uint8_t pct);
+}
+
+namespace dfly {
+
+namespace {
+
+#define DEFRAG_STEP_LOG LOG(INFO)
+
+uint64_t NowNs() {
+  return absl::GetCurrentTimeNanos();
+}
+
+double NsToMs(uint64_t ns) {
+  return static_cast<double>(ns) / 1e6;
+}
+
+std::string FormatMiB(uint64_t bytes) {
+  return absl::StrFormat("%.2fMiB", static_cast<double>(bytes) / (1024.0 * 1024.0));
+}
+
+} // namespace
+
+namespace defrag_underutil {
+
+namespace {
+
+thread_local absl::flat_hash_set<uintptr_t> tl_underutil_pages;
+
+void OnPageUnderutil(uintptr_t page_addr) {
+  tl_underutil_pages.insert(page_addr);
+}
+
+} // namespace
+
+void InitOnce() {
+  static std::once_flag once;
+  std::call_once(once, []() {
+    mi_dfly_set_underutil_callback(&OnPageUnderutil);
+    LOG(INFO) << "defrag[underutil_cb] registered with mimalloc";
+  });
+}
+
+void SetThresholdPct(uint8_t pct) {
+  mi_dfly_set_underutil_threshold_pct(pct);
+}
+
+size_t Size() {
+  return tl_underutil_pages.size();
+}
+
+std::vector<uintptr_t> Snapshot() {
+  return {tl_underutil_pages.begin(), tl_underutil_pages.end()};
+}
+
+void Remove(uintptr_t page_addr) {
+  tl_underutil_pages.erase(page_addr);
+}
+
+void Clear() {
+  tl_underutil_pages.clear();
+}
+
+bool IsPageMaybeUnderutil(uintptr_t page_addr) {
+  if (tl_underutil_pages.empty()) {
+    return true;  // bootstrap: no info, fall through to original mimalloc check
+  }
+  return tl_underutil_pages.contains(page_addr);
+}
+
+} // namespace defrag_underutil
+
+void DefragTaskState::UpdateScanState(uint64_t cursor_val) {
+  cursor = cursor_val;
+  if (cursor == 0u) {
+    ++dbid;
+  }
+}
+
+void DefragTaskState::ResetScanState() {
+  dbid = 0;
+  cursor = 0;
+}
+
+void DefragTaskState::FinishCycle() {
+  phase = DefragPhase::IDLE;
+  census.reset();
+  plan.reset();
+  cursor_hints.clear();
+  hint_cursor_idx = 0;
+  ResetScanState();
+}
+
+void CycleProgress::Merge(const CycleProgress& other) {
+  targets_complete += other.targets_complete;
+  targets_partial += other.targets_partial;
+  targets_no_progress += other.targets_no_progress;
+  targets_abandoned += other.targets_abandoned;
+  blocks_total_at_census += other.blocks_total_at_census;
+  blocks_evacuated += other.blocks_evacuated;
+  blocks_remaining += other.blocks_remaining;
+  bytes_total_at_census += other.bytes_total_at_census;
+  bytes_evacuated += other.bytes_evacuated;
+  bytes_remaining += other.bytes_remaining;
+  bytes_freed += other.bytes_freed;
+}
+
+void DefragCycleStats::Merge(const DefragCycleStats& other) {
+  census.Merge(other.census);
+  plan.Merge(other.plan);
+  evac.Merge(other.evac);
+  verify.Merge(other.verify);
+  census_db_objects_scanned += other.census_db_objects_scanned;
+  evac_db_objects_scanned += other.evac_db_objects_scanned;
+  evac_reallocations += other.evac_reallocations;
+  evac_key_reallocations += other.evac_key_reallocations;
+  evac_val_reallocations += other.evac_val_reallocations;
+  evac_bytes_moved += other.evac_bytes_moved;
+  census_retained_pages += other.census_retained_pages;
+  plan_target_pages += other.plan_target_pages;
+  census_potential_reclaim_bytes += other.census_potential_reclaim_bytes;
+  census_movable_bytes_observed += other.census_movable_bytes_observed;
+  // cycle_finished is per-shard semantics; not meaningfully mergeable.
+  // On the merged report, callers should use phase_per_shard to answer
+  // "is every shard done?" — cycle_finished stays at its default (false).
+}
+
+DefragMergedReport DefragMergedReport::Merge(std::vector<DefragShardReport>&& shards) {
+  DefragMergedReport result;
+  result.shard_summaries.reserve(shards.size());
+  std::vector<CollectedPageStats> page_usage_list;
+  page_usage_list.reserve(shards.size());
+
+  for (DefragShardReport& shard : shards) {
+    result.shard_summaries.push_back(shard.summary);
+    result.cycle_stats.Merge(shard.cycle_stats);
+    page_usage_list.push_back(std::move(shard.page_usage_stats));
+  }
+  // CollectedPageStats::Merge takes a threshold; carry the first-shard value
+  // forward (all shards are configured with the same threshold).
+  const float threshold =
+      page_usage_list.empty() ? 0.0f : static_cast<float>(page_usage_list.front().threshold);
+  result.page_usage_stats = CollectedPageStats::Merge(std::move(page_usage_list), threshold);
+  return result;
+}
+
+const char* PhaseName(DefragPhase phase) {
+  switch (phase) {
+    case DefragPhase::IDLE:
+      return "IDLE";
+    case DefragPhase::CENSUS:
+      return "CENSUS";
+    case DefragPhase::SELECT_TARGETS:
+      return "SELECT_TARGETS";
+    case DefragPhase::EVACUATE:
+      return "EVACUATE";
+    case DefragPhase::VERIFY:
+      return "VERIFY";
+  }
+  return "UNKNOWN";
+}
+
+std::string DefragMergedReport::ToString() const {
+  std::string out;
+  const auto& cs = cycle_stats;
+
+  absl::StrAppend(&out, "Per-shard summary:\n");
+  absl::StrAppend(&out, "  shard | phase_start -> phase_end | duration_us | exit_reason\n");
+  for (size_t i = 0; i < shard_summaries.size(); ++i) {
+    const DefragShardSummary& s = shard_summaries[i];
+    const char* exit_reason = s.finished_all_dbs ? "finished" : (s.quota_depleted ?
"quota" : "-"); + absl::StrAppend( + &out, " ", absl::Dec(i, absl::PadSpec::kSpacePad5), " | ", + absl::StrFormat("%-14s -> %-14s", PhaseName(s.phase_start), PhaseName(s.phase_end)), " | ", + absl::Dec(s.duration_us, absl::PadSpec::kSpacePad11), " | ", exit_reason, "\n"); + } + + absl::StrAppend(&out, "\n[CENSUS]\n"); + absl::StrAppend(&out, "Allocations seen: ", cs.census.allocations_seen, "\n"); + absl::StrAppend(&out, "Allocations recorded: ", cs.census.allocations_recorded, "\n"); + absl::StrAppend(&out, "Skipped (above threshold): ", cs.census.skipped_above_threshold, "\n"); + absl::StrAppend(&out, "Skipped (full page): ", cs.census.skipped_full_page, "\n"); + absl::StrAppend(&out, "Skipped (wrong heap): ", cs.census.skipped_wrong_heap, "\n"); + absl::StrAppend(&out, "Skipped (active malloc page): ", cs.census.skipped_active_malloc_page, + "\n"); + absl::StrAppend(&out, "Skipped (low score): ", cs.census.skipped_low_score, "\n"); + absl::StrAppend(&out, "Pages evicted from retained: ", cs.census.pages_evicted_from_retained, + "\n"); + absl::StrAppend(&out, "Heap rebuilds: ", cs.census.heap_rebuilds, "\n"); + absl::StrAppend(&out, "DB objects scanned: ", cs.census_db_objects_scanned, "\n"); + absl::StrAppend(&out, "Retained pages (total): ", cs.census_retained_pages, "\n"); + absl::StrAppend( + &out, "Potential reclaimable bytes (observed): ", cs.census_potential_reclaim_bytes, "\n"); + absl::StrAppend(&out, "Movable bytes (observed): ", cs.census_movable_bytes_observed, "\n"); + + absl::StrAppend(&out, "\n[SELECT]\n"); + absl::StrAppend(&out, "Targets kept: ", cs.plan.targets_kept, "\n"); + absl::StrAppend(&out, "Filtered (no observed blocks): ", cs.plan.filtered_no_observed_blocks, + "\n"); + absl::StrAppend(&out, "Filtered (stale): ", cs.plan.filtered_stale, "\n"); + absl::StrAppend(&out, "Filtered (has immovable data): ", cs.plan.filtered_has_immovable_data, + "\n"); + absl::StrAppend(&out, "Filtered (already empty): ", cs.plan.filtered_already_empty, "\n"); + absl::StrAppend(&out, "Truncated by cap: ", cs.plan.truncated_by_cap, "\n"); + absl::StrAppend(&out, "Target pages (total): ", cs.plan_target_pages, "\n"); + absl::StrAppend(&out, "Selected capacity bytes (at census): ", + cs.plan.selected_capacity_bytes_at_census, "\n"); + absl::StrAppend(&out, "Selected used bytes (at census): ", cs.plan.selected_used_bytes_at_census, + "\n"); + absl::StrAppend(&out, "Selected reclaimable bytes (at census): ", + cs.plan.selected_reclaimable_bytes_at_census, "\n"); + absl::StrAppend(&out, "Truncated reclaimable bytes: ", cs.plan.truncated_reclaimable_bytes, "\n"); + absl::StrAppend(&out, "Filtered immovable reclaimable bytes: ", + cs.plan.filtered_immovable_reclaimable_bytes, "\n"); + + absl::StrAppend(&out, "\n[EVACUATE]\n"); + absl::StrAppend(&out, "DB objects scanned: ", cs.evac_db_objects_scanned, "\n"); + absl::StrAppend(&out, "Reallocations: ", cs.evac_reallocations, "\n"); + absl::StrAppend(&out, "Keys reallocated: ", cs.evac_key_reallocations, "\n"); + absl::StrAppend(&out, "Values reallocated: ", cs.evac_val_reallocations, "\n"); + absl::StrAppend(&out, "Bytes moved: ", FormatMiB(cs.evac_bytes_moved), "\n"); + absl::StrAppend(&out, "Blocks moved (committed): ", cs.evac.blocks_move_committed, "\n"); + absl::StrAppend(&out, "Bytes moved (committed): ", cs.evac.bytes_move_committed, "\n"); + absl::StrAppend(&out, "Blocks skipped (not target): ", cs.evac.blocks_skipped_not_target, "\n"); + absl::StrAppend(&out, "Blocks skipped (target done): ", cs.evac.blocks_skipped_target_done, "\n"); 
+ absl::StrAppend(&out, "Blocks skipped (revalidation failed): ", + cs.evac.blocks_skipped_revalidation_failed, "\n"); + absl::StrAppend(&out, "Bytes skipped (target done): ", cs.evac.bytes_skipped_target_done, "\n"); + absl::StrAppend(&out, "Bytes skipped (revalidation failed): ", + cs.evac.bytes_skipped_revalidation_failed, "\n"); + absl::StrAppend(&out, "Targets revalidation (heap mismatch): ", + cs.evac.targets_revalidation_heap_mismatch, "\n"); + absl::StrAppend(&out, "Targets revalidation (active malloc page): ", + cs.evac.targets_revalidation_active_malloc_page, "\n"); + absl::StrAppend( + &out, "Targets revalidation (full page): ", cs.evac.targets_revalidation_full_page, "\n"); + absl::StrAppend(&out, "Targets revalidation (above threshold): ", + cs.evac.targets_revalidation_above_threshold, "\n"); + absl::StrAppend(&out, "Blocks revalidation (heap mismatch): ", + cs.evac.blocks_revalidation_heap_mismatch, "\n"); + absl::StrAppend(&out, "Blocks revalidation (active malloc page): ", + cs.evac.blocks_revalidation_active_malloc_page, "\n"); + absl::StrAppend(&out, "Blocks revalidation (full page): ", cs.evac.blocks_revalidation_full_page, + "\n"); + absl::StrAppend(&out, "Blocks revalidation (above threshold): ", + cs.evac.blocks_revalidation_above_threshold, "\n"); + absl::StrAppend( + &out, "Bytes revalidation (heap mismatch): ", cs.evac.bytes_revalidation_heap_mismatch, "\n"); + absl::StrAppend(&out, "Bytes revalidation (active malloc page): ", + cs.evac.bytes_revalidation_active_malloc_page, "\n"); + absl::StrAppend(&out, "Bytes revalidation (full page): ", cs.evac.bytes_revalidation_full_page, + "\n"); + absl::StrAppend(&out, "Bytes revalidation (above threshold): ", + cs.evac.bytes_revalidation_above_threshold, "\n"); + absl::StrAppend( + &out, "Targets abandoned (revalidation): ", cs.evac.targets_abandoned_revalidation, "\n"); + absl::StrAppend(&out, "Targets completed during evac: ", cs.evac.targets_completed_during_evac, + "\n"); + + absl::StrAppend(&out, "\n[VERIFY]\n"); + absl::StrAppend(&out, "Targets complete: ", cs.verify.targets_complete, "\n"); + absl::StrAppend(&out, "Targets partial: ", cs.verify.targets_partial, "\n"); + absl::StrAppend(&out, "Targets no progress: ", cs.verify.targets_no_progress, "\n"); + absl::StrAppend(&out, "Targets abandoned: ", cs.verify.targets_abandoned, "\n"); + absl::StrAppend(&out, "Blocks total (at census): ", cs.verify.blocks_total_at_census, "\n"); + absl::StrAppend(&out, "Blocks evacuated: ", cs.verify.blocks_evacuated, "\n"); + absl::StrAppend(&out, "Blocks remaining: ", cs.verify.blocks_remaining, "\n"); + absl::StrAppend(&out, "Bytes total (at census): ", cs.verify.bytes_total_at_census, "\n"); + absl::StrAppend(&out, "Bytes evacuated: ", cs.verify.bytes_evacuated, "\n"); + absl::StrAppend(&out, "Bytes remaining: ", cs.verify.bytes_remaining, "\n"); + + absl::StrAppend(&out, "\n[PAGE USAGE]\n", page_usage_stats.ToString()); + + absl::StripTrailingAsciiWhitespace(&out); + return out; +} + +// Build a usage-stat struct directly from a page address, validating that the +// page still belongs to `heap`, isn't empty, isn't full, and is still below +// threshold. Returns true on success; on false the caller should drop the +// address from the underutil set (page recovered, was reclaimed, or never was +// ours). Threshold is fractional in [0, 1] matching the dragonfly setting. 
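+// For example (illustrative numbers, not from this patch): with threshold
+// 0.8, a page with capacity=128 and used=24 has used_ratio 24/128 = 0.1875
+// and is kept as a candidate, while used=112 gives 0.875 > 0.8 and the entry
+// is dropped as recovered.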
+static bool BuildPageStatFromAddress(uintptr_t page_addr, mi_heap_t* heap, float threshold,
+                                     mi_page_usage_stats_t* out) {
+  if (page_addr == 0)
+    return false;
+  mi_page_t* page = reinterpret_cast<mi_page_t*>(page_addr);
+
+  if (mi_page_heap(page) != heap)
+    return false;
+
+  const uint16_t used = page->used;
+  const uint16_t cap = page->capacity;
+  if (used == 0 || cap == 0)
+    return false;
+  if (used >= cap)
+    return false;  // full; mimalloc may have re-filled it
+
+  const float used_ratio = static_cast<float>(used) / static_cast<float>(cap);
+  if (used_ratio > threshold)
+    return false;  // recovered above threshold
+
+  out->page_address = page_addr;
+  out->block_size = mi_page_block_size(page);
+  out->capacity = cap;
+  out->reserved = page->reserved;
+  out->used = used;
+  out->flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+  return true;
+}
+
+CycleProgress RunVerify(const TargetPlan& plan) {
+  CycleProgress p;
+  for (const TargetPage& target : plan.targets()) {
+    const bool is_complete = target.blocks_evacuated >= target.blocks_at_census;
+    if (target.blocks_evacuated == 0) {
+      ++p.targets_no_progress;
+    } else if (is_complete) {
+      ++p.targets_complete;
+    } else {
+      ++p.targets_partial;
+    }
+    if (target.revalidation_failed) {
+      ++p.targets_abandoned;
+    }
+
+    const uint16_t evac_clamped = std::min(target.blocks_evacuated, target.blocks_at_census);
+    p.blocks_total_at_census += target.blocks_at_census;
+    p.blocks_evacuated += evac_clamped;
+    p.bytes_total_at_census += uint64_t(target.blocks_at_census) * target.block_size;
+    p.bytes_evacuated += uint64_t(evac_clamped) * target.block_size;
+
+    if (is_complete) {
+      p.bytes_freed +=
+          uint64_t(target.capacity_blocks - target.blocks_at_census) * target.block_size;
+    }
+  }
+  p.blocks_remaining = p.blocks_total_at_census - p.blocks_evacuated;
+  p.bytes_remaining = p.bytes_total_at_census - p.bytes_evacuated;
+  return p;
+}
+
+void DefragIdleStep(DefragTaskState* state, float threshold) {
+  state->cycle_stats = {};
+  state->ResetScanState();
+  ++state->cycle_id;
+  const uint64_t now = NowNs();
+  state->cycle_start_ns = now;
+  state->phase_start_ns = now;
+  state->phase_active_ns = 0;
+
+  state->census.emplace(&state->cycle_stats.census, PageCensus::kDefaultMaxRetainedPages,
+                        state->per_block_move_cost_bytes);
+  LOG(INFO) << absl::StrFormat("defrag[CYCLE_START] shard=%u cycle=%llu threshold=%.2f",
+                               state->shard_id, state->cycle_id, threshold);
+  state->phase = DefragPhase::CENSUS;
+}
+
+void DefragCensusStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                      const DbSliceWalker& walk) {
+  const uint64_t step_start_ns = NowNs();
+
+  // Reactive fast path: if mimalloc has flagged any pages via the underutil
+  // callback, hydrate the census from that set and skip the dashtable walk.
+  // Falls back to the legacy walk if the set is empty (bootstrap, or workload
+  // with no recent threshold-crossing frees).
+  const size_t underutil_set_size = defrag_underutil::Size();
+  LOG_FIRST_N(INFO, 8) << absl::StrFormat("defrag[CENSUS_ENTRY] shard=%u cycle=%llu set_size=%zu",
+                                          state->shard_id, state->cycle_id, underutil_set_size);
+  if (underutil_set_size > 0) {
+    auto* heap = static_cast<mi_heap_t*>(zmalloc_heap);
+    const std::vector<uintptr_t> snapshot = defrag_underutil::Snapshot();
+    size_t recovered = 0;
+    for (uintptr_t addr : snapshot) {
+      mi_page_usage_stats_t stat;
+      if (!BuildPageStatFromAddress(addr, heap, threshold, &stat)) {
+        defrag_underutil::Remove(addr);
+        ++recovered;
+        continue;
+      }
+      state->census->ObservePage(stat);
+    }
+
+    state->cycle_stats.census_retained_pages = state->census->pages().size();
+    for (const auto& agg : state->census->pages() | std::views::values) {
+      state->cycle_stats.census_potential_reclaim_bytes +=
+          uint64_t(agg.capacity_blocks - agg.used_blocks) * agg.block_size;
+      state->cycle_stats.census_movable_bytes_observed +=
+          uint64_t(agg.observed_movable_blocks) * agg.block_size;
+    }
+
+    const uint64_t now = NowNs();
+    state->phase_active_ns += now - step_start_ns;
+    DEFRAG_STEP_LOG << absl::StrFormat(
+        "defrag[CENSUS_REACTIVE] shard=%u cycle=%llu set_in=%zu retained=%zu recovered=%zu "
+        "potential_reclaim=%s movable_observed=%s took=%.1fms cpu=%.1fms",
+        state->shard_id, state->cycle_id, underutil_set_size,
+        state->cycle_stats.census_retained_pages, recovered,
+        FormatMiB(state->cycle_stats.census_potential_reclaim_bytes),
+        FormatMiB(state->cycle_stats.census_movable_bytes_observed),
+        NsToMs(now - state->phase_start_ns), NsToMs(state->phase_active_ns));
+
+    state->phase_start_ns = now;
+    state->phase_active_ns = 0;
+    state->phase = DefragPhase::SELECT_TARGETS;
+    return;
+  }
+
+  // Fallback: full dashtable walk.
+  CensusTaker visitor(&*state->census, threshold, quota);
+  const DbSliceResult result = walk(&visitor, /*hints=*/nullptr, /*hint_cursor=*/nullptr);
+  state->cycle_stats.census_db_objects_scanned += result.attempts;
+  if (!result.finished_all_dbs) {
+    state->phase_active_ns += NowNs() - step_start_ns;
+    return;
+  }
+
+  // Aggregate page-level totals here so the [CENSUS] log can report them
+  // before SELECT_TARGETS runs.
+ state->cycle_stats.census_retained_pages = state->census->pages().size(); + for (const auto& agg : state->census->pages() | std::views::values) { + state->cycle_stats.census_potential_reclaim_bytes += + uint64_t(agg.capacity_blocks - agg.used_blocks) * agg.block_size; + state->cycle_stats.census_movable_bytes_observed += + uint64_t(agg.observed_movable_blocks) * agg.block_size; + } + + const CensusStats& c = state->cycle_stats.census; + const uint64_t now = NowNs(); + state->phase_active_ns += now - step_start_ns; + DEFRAG_STEP_LOG << absl::StrFormat( + "defrag[CENSUS] shard=%u cycle=%llu db_objects=%llu retained=%zu/%zu " + "recorded/seen=%llu/%llu cursor_hints=%zu potential_reclaim=%s movable_observed=%s " + "skipped{above_thr=%llu full=%llu wrong_heap=%llu active=%llu low_score=%llu} " + "topk{evicted=%llu rebuilds=%llu} took=%.1fms cpu=%.1fms", + state->shard_id, state->cycle_id, state->cycle_stats.census_db_objects_scanned, + state->cycle_stats.census_retained_pages, PageCensus::kDefaultMaxRetainedPages, + c.allocations_recorded, c.allocations_seen, state->census->cursor_hints().size(), + FormatMiB(state->cycle_stats.census_potential_reclaim_bytes), + FormatMiB(state->cycle_stats.census_movable_bytes_observed), c.skipped_above_threshold, + c.skipped_full_page, c.skipped_wrong_heap, c.skipped_active_malloc_page, c.skipped_low_score, + c.pages_evicted_from_retained, c.heap_rebuilds, NsToMs(now - state->phase_start_ns), + NsToMs(state->phase_active_ns)); + + state->phase_start_ns = now; + state->phase_active_ns = 0; + state->phase = DefragPhase::SELECT_TARGETS; +} + +void DefragSelectTargetsStep(DefragTaskState* state) { + const uint64_t step_start_ns = NowNs(); + state->plan.emplace(&state->cycle_stats.plan); + state->plan->BuildFrom(*state->census); + state->cycle_stats.plan_target_pages = state->plan->size(); + // Hand the bucket-cursor hints off to the task state so EVACUATE can use + // them after we release the census itself (the page map is large). + state->cursor_hints = state->census->TakeCursorHints(); + state->hint_cursor_idx = 0; + state->census.reset(); + // EVACUATE walks the prime table again from the start. + state->ResetScanState(); + + const PlanStats& p = state->cycle_stats.plan; + const uint64_t now = NowNs(); + state->phase_active_ns += now - step_start_ns; + DEFRAG_STEP_LOG << absl::StrFormat( + "defrag[PLAN] shard=%u cycle=%llu targets=%zu/%zu kept=%llu reclaimable=%s " + "filtered{no_obs=%llu stale=%llu immovable=%llu empty=%llu} truncated_by_cap=%llu " + "filtered_immovable=%s truncated=%s took=%.1fms cpu=%.1fms", + state->shard_id, state->cycle_id, state->cycle_stats.plan_target_pages, + state->cycle_stats.census_retained_pages, p.targets_kept, + FormatMiB(p.selected_reclaimable_bytes_at_census), p.filtered_no_observed_blocks, + p.filtered_stale, p.filtered_has_immovable_data, p.filtered_already_empty, p.truncated_by_cap, + FormatMiB(p.filtered_immovable_reclaimable_bytes), FormatMiB(p.truncated_reclaimable_bytes), + NsToMs(now - state->phase_start_ns), NsToMs(state->phase_active_ns)); + + // Skip EVAC when the prize is too small to justify the dashtable walk. The + // underutil set is left intact: future cycles re-enter via reactive CENSUS + // and re-plan; if churn pushes more pages below threshold, the plan grows + // back above the bar naturally. 
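+  // For example: with the default bar of 64 MiB, a plan whose
+  // selected_reclaimable_bytes_at_census totals 40 MiB is skipped outright,
+  // while a 70 MiB plan proceeds to EVACUATE.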
+  const uint64_t min_reclaimable = absl::GetFlag(FLAGS_defrag_min_plan_reclaimable_bytes);
+  if (p.selected_reclaimable_bytes_at_census < min_reclaimable) {
+    LOG(INFO) << absl::StrFormat(
+        "defrag[PLAN_SKIPPED] shard=%u cycle=%llu reclaimable=%s threshold=%s targets=%zu",
+        state->shard_id, state->cycle_id, FormatMiB(p.selected_reclaimable_bytes_at_census),
+        FormatMiB(min_reclaimable), state->cycle_stats.plan_target_pages);
+    state->cycle_stats.cycle_finished = true;
+    state->FinishCycle();
+    return;
+  }
+
+  state->phase_start_ns = now;
+  state->phase_active_ns = 0;
+  state->phase = DefragPhase::EVACUATE;
+}
+
+void DefragEvacuateStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                        const DbSliceWalker& walk) {
+  const uint64_t step_start_ns = NowNs();
+  Evacuator visitor(&*state->plan, threshold, &state->cycle_stats.evac, quota);
+  const bool use_hints = !state->cursor_hints.empty();
+  const DbSliceResult result = walk(&visitor, use_hints ? &state->cursor_hints : nullptr,
+                                    use_hints ? &state->hint_cursor_idx : nullptr);
+  state->cycle_stats.evac_db_objects_scanned += result.attempts;
+  state->cycle_stats.evac_reallocations += result.reallocations;
+  state->cycle_stats.evac_key_reallocations += result.key_reallocations;
+  state->cycle_stats.evac_val_reallocations += result.val_reallocations;
+  state->cycle_stats.evac_bytes_moved += result.bytes_moved;
+  if (!result.finished_all_dbs && !state->plan->AllTargetsDone()) {
+    state->phase_active_ns += NowNs() - step_start_ns;
+    return;
+  }
+
+  const EvacStats& e = state->cycle_stats.evac;
+  const uint64_t attempted = e.blocks_move_committed + e.blocks_skipped_revalidation_failed;
+  const double commit_pct =
+      attempted == 0 ? 0.0 : 100.0 * static_cast<double>(e.blocks_move_committed) / attempted;
+  const uint64_t now = NowNs();
+  state->phase_active_ns += now - step_start_ns;
+  DEFRAG_STEP_LOG << absl::StrFormat(
+      "defrag[EVACUATE] shard=%u cycle=%llu db_objects=%llu "
+      "reallocs=%llu(keys=%llu vals=%llu) bytes_moved=%s "
+      "commit=%llu/%llu (%.1f%%) bytes_committed=%s "
+      "skipped_blocks{not_target=%llu target_done=%llu revalid=%llu} "
+      "reval_fail{heap=%llu active=%llu full=%llu above_thr=%llu} "
+      "abandoned=%llu completed_during_evac=%llu took=%.1fms cpu=%.1fms",
+      state->shard_id, state->cycle_id, state->cycle_stats.evac_db_objects_scanned,
+      state->cycle_stats.evac_reallocations, state->cycle_stats.evac_key_reallocations,
+      state->cycle_stats.evac_val_reallocations, FormatMiB(state->cycle_stats.evac_bytes_moved),
+      e.blocks_move_committed, attempted, commit_pct, FormatMiB(e.bytes_move_committed),
+      e.blocks_skipped_not_target, e.blocks_skipped_target_done,
+      e.blocks_skipped_revalidation_failed, e.targets_revalidation_heap_mismatch,
+      e.targets_revalidation_active_malloc_page, e.targets_revalidation_full_page,
+      e.targets_revalidation_above_threshold, e.targets_abandoned_revalidation,
+      e.targets_completed_during_evac, NsToMs(now - state->phase_start_ns),
+      NsToMs(state->phase_active_ns));
+
+  state->phase_start_ns = now;
+  state->phase_active_ns = 0;
+  state->phase = DefragPhase::VERIFY;
+}
+
+void DefragVerifyStep(DefragTaskState* state) {
+  const uint64_t step_start_ns = NowNs();
+  state->cycle_stats.verify = RunVerify(*state->plan);
+  state->cycle_stats.cycle_finished = true;
+
+  const CycleProgress& v = state->cycle_stats.verify;
+  // complete/partial/no_progress are mutually exclusive and cover every target;
+  // abandoned is a parallel dimension (revalidation_failed) that overlaps with
+  // those three, so it is not part of the denominator.
+  const uint64_t total_targets = v.targets_complete + v.targets_partial + v.targets_no_progress;
+  const double done_pct =
+      total_targets == 0 ? 0.0 : 100.0 * static_cast<double>(v.targets_complete) / total_targets;
+  const double bytes_pct =
+      v.bytes_total_at_census == 0
+          ? 0.0
+          : 100.0 * static_cast<double>(v.bytes_evacuated) / v.bytes_total_at_census;
+  const uint64_t planned_reclaim = state->cycle_stats.plan.selected_reclaimable_bytes_at_census;
+  const double freed_pct =
+      planned_reclaim == 0 ? 0.0 : 100.0 * static_cast<double>(v.bytes_freed) / planned_reclaim;
+  const uint64_t now = NowNs();
+  state->phase_active_ns += now - step_start_ns;
+  DEFRAG_STEP_LOG << absl::StrFormat(
+      "defrag[VERIFY] shard=%u cycle=%llu targets{done=%llu/%llu (%.1f%%) "
+      "partial=%llu none=%llu abandoned=%llu} "
+      "bytes{moved=%s/%s (%.1f%%) freed=%s/%s (%.1f%%) remaining=%s} took=%.1fms cpu=%.1fms",
+      state->shard_id, state->cycle_id, v.targets_complete, total_targets, done_pct,
+      v.targets_partial, v.targets_no_progress, v.targets_abandoned, FormatMiB(v.bytes_evacuated),
+      FormatMiB(v.bytes_total_at_census), bytes_pct, FormatMiB(v.bytes_freed),
+      FormatMiB(planned_reclaim), freed_pct, FormatMiB(v.bytes_remaining),
+      NsToMs(now - state->phase_start_ns), NsToMs(state->phase_active_ns));
+
+  const double cycle_ms = NsToMs(now - state->cycle_start_ns);
+  const double freed_mib_per_s =
+      cycle_ms <= 0.0
+          ? 0.0
+          : (static_cast<double>(v.bytes_freed) / (1024.0 * 1024.0)) / (cycle_ms / 1000.0);
+  LOG(INFO) << absl::StrFormat(
+      "defrag[CYCLE_DONE] shard=%u cycle=%llu targets_done=%llu/%llu (%.1f%%) "
+      "bytes_freed=%s/%s (%.1f%%) bytes_moved=%s cycle_took=%.1fms freed_rate=%.1fMiB/s",
+      state->shard_id, state->cycle_id, v.targets_complete, total_targets, done_pct,
+      FormatMiB(v.bytes_freed), FormatMiB(planned_reclaim), freed_pct, FormatMiB(v.bytes_evacuated),
+      cycle_ms, freed_mib_per_s);
+
+  state->FinishCycle();
+}
+
+namespace {
+
+struct StepTransition {
+  DefragPhase before;
+  DefragPhase after;
+};
+
+StepTransition RunPhaseStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                            const DbSliceWalker& walk) {
+  const DefragPhase before = state->phase;
+  switch (state->phase) {
+    case DefragPhase::IDLE:
+      DefragIdleStep(state, threshold);
+      break;
+    case DefragPhase::CENSUS:
+      DefragCensusStep(state, threshold, quota, walk);
+      break;
+    case DefragPhase::SELECT_TARGETS:
+      DefragSelectTargetsStep(state);
+      break;
+    case DefragPhase::EVACUATE:
+      DefragEvacuateStep(state, threshold, quota, walk);
+      break;
+    case DefragPhase::VERIFY:
+      DefragVerifyStep(state);
+      break;
+  }
+  return {before, state->phase};
+}
+
+bool CycleEnded(StepTransition t) {
+  if (t.after != DefragPhase::IDLE)
+    return false;
+  // Normal end (VERIFY -> IDLE) and PLAN_SKIPPED bail-out (SELECT_TARGETS ->
+  // IDLE) both terminate the cycle.
+  return t.before == DefragPhase::VERIFY || t.before == DefragPhase::SELECT_TARGETS;
+}
+
+}  // namespace
+
+void RunPhaseDefrag(DefragTaskState* state, float threshold, CycleQuota quota,
+                    const DbSliceWalker& walk) {
+  StepTransition t;
+  do {
+    t = RunPhaseStep(state, threshold, quota, walk);
+  } while (!quota.Depleted() && !CycleEnded(t));
+}
+
+}  // namespace dfly
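Taken together, the steps above form a small state machine: RunPhaseDefrag keeps
stepping IDLE -> CENSUS -> SELECT_TARGETS -> EVACUATE -> VERIFY -> IDLE until the
quota runs out or CycleEnded fires. A minimal sketch of driving it with a stub
walker — illustrative only; the lambda, the default-constructed CycleQuota, and
the 0.8 threshold are assumptions, not code from this patch:

    // Sketch: a walker that visits nothing and reports completion, so every
    // phase step finishes in one call and the empty plan bails out via
    // PLAN_SKIPPED back to IDLE.
    dfly::DbSliceWalker stub_walk = [](dfly::PageUsage* /*visitor*/,
                                       const std::vector<uint64_t>* /*hints*/,
                                       size_t* /*hint_cursor*/) {
      dfly::DbSliceResult r;
      r.finished_all_dbs = true;
      return r;
    };
    dfly::DefragTaskState state;
    dfly::CycleQuota quota;  // assumed default-constructible for this sketch
    dfly::RunPhaseDefrag(&state, /*threshold=*/0.8f, quota, stub_walk);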
diff --git a/src/server/defrag.h b/src/server/defrag.h
new file mode 100644
index 000000000000..f6b1c800a1d2
--- /dev/null
+++ b/src/server/defrag.h
@@ -0,0 +1,223 @@
+// Copyright 2026, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#pragma once
+
+#include <cstdint>
+#include <ctime>
+#include <functional>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "core/page_usage/page_usage_stats.h"
+#include "core/page_usage/page_usage_visitors.h"
+
+namespace dfly {
+
+// Tracks pages that mimalloc has signalled (via the dragonfly underutil
+// callback) as having dropped below the defrag utilization threshold during a
+// free. Storage is per-thread (via thread_local) so each shard observes only
+// its own heap. Replaces the dashtable-walking CENSUS as the source of
+// candidate target pages.
+namespace defrag_underutil {
+
+// Register the mimalloc callback once per process. Safe to call from any
+// thread; only the first call performs the registration. Subsequent calls are
+// no-ops.
+void InitOnce();
+
+// Set the threshold (in percent, 0-100) used by mimalloc to decide when a page
+// has crossed below the underutil watermark during a free. Must match the
+// dragonfly-side `mem_defrag_page_utilization_threshold` so census and EVAC
+// use the same definition of "underutilized".
+void SetThresholdPct(uint8_t pct);
+
+// Number of pages currently tracked in this thread's set.
+size_t Size();
+
+// Returns a copy of the page-address set as a vector. Does not modify the
+// set; callers drop entries explicitly via Remove. Returned order is
+// unspecified.
+std::vector<uintptr_t> Snapshot();
+
+// Drop a page from this thread's set. Used by VERIFY to retire targets that
+// were successfully drained, and by the new CENSUS to drop entries that have
+// since recovered above threshold.
+void Remove(uintptr_t page_addr);
+
+// Clear the entire set on this thread.
+void Clear();
+
+// Filter predicate for the legacy DefragIfNeeded fast path. Returns true when
+// the page is "interesting" (a candidate for the expensive
+// mi_heap_page_is_underutilized check):
+//  - empty set ⇒ bootstrap, no info available, return true so callers
+//    fall through to the original behavior;
+//  - non-empty set ⇒ return true iff the page appears in the set.
+// Conservative-positive: pages that crossed threshold without a recent free
+// will be missed until the next free on them lands them in the set.
+bool IsPageMaybeUnderutil(uintptr_t page_addr);
+
+}  // namespace defrag_underutil
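+
+// Illustrative usage from a shard thread (a sketch, not part of the patch's
+// API surface beyond the declarations above):
+//
+//   defrag_underutil::InitOnce();           // once per process, any thread
+//   defrag_underutil::SetThresholdPct(80);  // keep in sync with the dfly flag
+//   ...
+//   for (uintptr_t addr : defrag_underutil::Snapshot()) {
+//     // validate `addr`, feed it into the census, then retire it:
+//     defrag_underutil::Remove(addr);
+//   }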
+
+enum class DefragPhase : uint8_t {
+  IDLE,
+  CENSUS,
+  SELECT_TARGETS,
+  EVACUATE,
+  VERIFY,
+};
+
+struct CycleProgress {
+  uint64_t targets_complete = 0;     // blocks_evacuated >= blocks_at_census
+  uint64_t targets_partial = 0;      // 0 < blocks_evacuated < blocks_at_census
+  uint64_t targets_no_progress = 0;  // blocks_evacuated == 0
+  uint64_t targets_abandoned = 0;    // revalidation_failed; orthogonal to the trio above
+
+  uint64_t blocks_total_at_census = 0;
+  uint64_t blocks_evacuated = 0;
+  uint64_t blocks_remaining = 0;
+
+  uint64_t bytes_total_at_census = 0;
+  uint64_t bytes_evacuated = 0;
+  uint64_t bytes_remaining = 0;
+
+  // Bytes mimalloc can return once fully-drained source pages are reclaimed:
+  // sum over completed targets of (capacity - used)_at_census * block_size.
+  // Compares directly to PlanStats::selected_reclaimable_bytes_at_census.
+  uint64_t bytes_freed = 0;
+
+  void Merge(const CycleProgress& other);
+};
+
+CycleProgress RunVerify(const TargetPlan& plan);
+
+struct DefragCycleStats {
+  CensusStats census;
+  PlanStats plan;
+  EvacStats evac;
+  CycleProgress verify;
+
+  uint64_t census_db_objects_scanned = 0;
+  uint64_t evac_db_objects_scanned = 0;
+  uint64_t evac_reallocations = 0;
+  uint64_t evac_key_reallocations = 0;  // keys whose allocation was moved
+  uint64_t evac_val_reallocations = 0;  // values whose allocation was moved
+  uint64_t evac_bytes_moved = 0;        // total bytes read+written during reallocations
+
+  size_t census_retained_pages = 0;
+  size_t plan_target_pages = 0;
+
+  uint64_t census_potential_reclaim_bytes = 0;
+  uint64_t census_movable_bytes_observed = 0;
+
+  bool cycle_finished = false;
+
+  void Merge(const DefragCycleStats& other);
+};
+
+struct DefragShardSummary {
+  DefragPhase phase_start = DefragPhase::IDLE;  // phase on entry to DoDefrag
+  DefragPhase phase_end = DefragPhase::IDLE;    // phase on exit from DoDefrag
+  uint64_t duration_us = 0;                     // wall-clock time spent in DoDefrag
+  bool quota_depleted = false;                  // visitor exhausted its CycleQuota
+  bool finished_all_dbs = false;                // legacy: walked all dbs; phased: cycle complete
+};
+
+struct DefragShardReport {
+  DefragShardSummary summary;           // per-shard exit info
+  DefragCycleStats cycle_stats;         // empty on the legacy path
+  CollectedPageStats page_usage_stats;  // search-index defrag stats from PageUsage
+  bool work_pending = false;            // bg-task scheduler hint: true = high priority
+};
+
+struct DefragMergedReport {
+  std::vector<DefragShardSummary> shard_summaries;  // index = shard_id
+  DefragCycleStats cycle_stats;                     // summed across shards
+  CollectedPageStats page_usage_stats;              // merged via CollectedPageStats::Merge
+
+  static DefragMergedReport Merge(std::vector<DefragShardReport>&& shards);
+
+  std::string ToString() const;
+};
+
+const char* PhaseName(DefragPhase phase);
+
+struct DefragTaskState {
+  // Cycle position, used by both legacy and phased paths.
+  size_t dbid = 0;
+  uint64_t cursor = 0;
+
+  // Threshold-gate state, consulted before starting a new cycle.
+  time_t last_check_time = 0;
+  float page_utilization_threshold = 0.8f;
+
+  // Per-block move-cost weight in the page retention score:
+  //   score = reclaim / (move_bytes + used_blocks * per_block_move_cost + slot_overhead)
+  // Higher values penalize many-entry pages more strongly, pushing pages with
+  // small block sizes (more entries per page) toward the back of the candidate
+  // ordering. Useful for wide/mixed workloads where evacuating small-block
+  // pages is expensive per byte reclaimed.
+  uint64_t per_block_move_cost_bytes = 256;
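+  // Worked example (illustrative numbers, reading move_bytes as
+  // used_blocks * block_size and ignoring slot_overhead): a page of 512 x 64B
+  // blocks with 64 used scores
+  //   reclaim = (512-64)*64 = 28672, cost = 64*64 + 64*256 = 20480  ->  ~1.4,
+  // while a page of 32 x 1024B blocks with 4 used scores
+  //   reclaim = (32-4)*1024 = 28672, cost = 4*1024 + 4*256 = 5120   ->  ~5.6,
+  // so the many-entry small-block page ranks behind the large-block one
+  // despite equal reclaimable bytes.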
+
+  // Phased-only state, untouched in legacy mode.
+  DefragPhase phase = DefragPhase::IDLE;
+  std::optional<PageCensus> census;
+  std::optional<TargetPlan> plan;
+  // Bucket cursors observed during CENSUS that contained at least one object
+  // on a candidate page. Moved here from PageCensus before SELECT_TARGETS
+  // releases the census, then consumed by EVACUATE. Sorted vector + cursor
+  // index lets the hinted walker resume mid-iteration across DoDefrag calls
+  // when one EVAC quota slice can't drain the full hint set.
+  std::vector<uint64_t> cursor_hints;
+  size_t hint_cursor_idx = 0;
+  DefragCycleStats cycle_stats;
+
+  uint16_t shard_id = 0;
+  uint64_t cycle_id = 0;
+  uint64_t cycle_start_ns = 0;
+  uint64_t phase_start_ns = 0;
+  // CPU time spent doing actual work in the current phase, summed across
+  // DoDefrag invocations. Resets at each phase transition. Distinguishes
+  // CPU-only effort from wall-clock (phase_start_ns -> now), which includes
+  // idle gaps between invocations.
+  uint64_t phase_active_ns = 0;
+
+  void UpdateScanState(uint64_t cursor_val);
+
+  void ResetScanState();
+
+  void FinishCycle();
+};
+
+struct DbSliceResult {
+  uint64_t attempts = 0;           // # of (key, value) pairs visited
+  uint64_t reallocations = 0;      // # of entries where key or value was reallocated
+  uint64_t key_reallocations = 0;  // # of keys reallocated
+  uint64_t val_reallocations = 0;  // # of values reallocated
+  uint64_t bytes_moved = 0;        // bytes read+written across all reallocations
+  bool finished_all_dbs = false;
+};
+
+// Walker callable. If `hints` is non-null and non-empty, the walker should
+// visit only the buckets listed in the hint vector starting at *hint_cursor
+// (used by EVACUATE to skip buckets without candidate objects); the walker
+// updates *hint_cursor to where it stopped so the next call can resume. If
+// `hints` is null, the walker performs a full slice walk (used by CENSUS).
+using DbSliceWalker = std::function<DbSliceResult(PageUsage* visitor,
+                                                  const std::vector<uint64_t>* hints,
+                                                  size_t* hint_cursor)>;
+
+void DefragIdleStep(DefragTaskState* state, float threshold);
+void DefragCensusStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                      const DbSliceWalker& walk);
+void DefragSelectTargetsStep(DefragTaskState* state);
+void DefragEvacuateStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                        const DbSliceWalker& walk);
+void DefragVerifyStep(DefragTaskState* state);
+
+void RunPhaseDefrag(DefragTaskState* state, float threshold, CycleQuota quota,
+                    const DbSliceWalker& walk);
+
+}  // namespace dfly
diff --git a/src/server/defrag_test.cc b/src/server/defrag_test.cc
new file mode 100644
index 000000000000..5bc13b331ef0
--- /dev/null
+++ b/src/server/defrag_test.cc
@@ -0,0 +1,1216 @@
+// Copyright 2026, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#include "server/defrag.h"
+
+#include <benchmark/benchmark.h>
+#include <mimalloc.h>
+#include <vector>
+
+#include <absl/container/flat_hash_map.h>
+
+#include "base/flags.h"
+#include "base/gtest.h"
+#include "base/logging.h"
+#include "core/page_usage/page_usage_visitors.h"
+
+ABSL_DECLARE_FLAG(bool, defrag_use_skip_bit);
+
+extern "C" {
+#include "redis/zmalloc.h"
+mi_page_usage_stats_t mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio,
+                                                    bool collect_stats);
+}
+
+namespace dfly {
+
+namespace {
+
+mi_page_usage_stats_t MakeStat(uintptr_t addr, uint16_t capacity, uint16_t used,
+                               uint8_t flags = MI_DFLY_PAGE_BELOW_THRESHOLD,
+                               size_t block_size = 64) {
+  mi_page_usage_stats_t s{};
+  s.page_address = addr;
+  s.block_size = block_size;
+  s.capacity = capacity;
+  s.used = used;
+  s.flags = flags;
+  return s;
+}
+
+}  // namespace
+
+TEST(PageCensusEvictionTest, EvictsLowestScorePageWhenOverCap) {
+  if (!PageCensus::kEnableTopK) {
+    GTEST_SKIP() << "PageCensus::kEnableTopK is false; eviction path inactive";
+  }
+  CensusStats cstats;
+  PageCensus census(&cstats, /*max_retained_pages=*/4);
+
+  // Scores (cap/used): 5.0, 3.33, 2.5, 2.0 — page 4 is the lowest.
+  census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2));
+  census.Observe(MakeStat(/*addr=*/2, /*capacity=*/10, /*used=*/3));
+  census.Observe(MakeStat(/*addr=*/3, /*capacity=*/10, /*used=*/4));
+  census.Observe(MakeStat(/*addr=*/4, /*capacity=*/10, /*used=*/5));
+
+  ASSERT_EQ(census.pages().size(), 4u);
+  EXPECT_EQ(census.stats().pages_evicted_from_retained, 0u);
+
+  // New page with score 2.5 — pushes us over cap, page 4 (score 2.0) should be evicted.
+ census.Observe(MakeStat(/*addr=*/5, /*capacity=*/10, /*used=*/4)); + + EXPECT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 1u); + EXPECT_TRUE(census.pages().contains(1)); + EXPECT_TRUE(census.pages().contains(2)); + EXPECT_TRUE(census.pages().contains(3)); + EXPECT_FALSE(census.pages().contains(4)); + EXPECT_TRUE(census.pages().contains(5)); +} + +TEST(PageCensusEvictionTest, StaleHeapEntryDoesNotEvictWrongPage) { + if (!PageCensus::kEnableTopK) { + GTEST_SKIP() << "PageCensus::kEnableTopK is false; eviction path inactive"; + } + CensusStats cstats; + PageCensus census(&cstats, /*max_retained_pages=*/4); + + // Scores: 5.0, 3.33, 2.5, 2.0. Page 4 starts as the lowest. + census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2)); + census.Observe(MakeStat(/*addr=*/2, /*capacity=*/10, /*used=*/3)); + census.Observe(MakeStat(/*addr=*/3, /*capacity=*/10, /*used=*/4)); + census.Observe(MakeStat(/*addr=*/4, /*capacity=*/10, /*used=*/5)); + + // Re-observe page 4 with a much higher score (5.0). The old heap entry + // (score 2.0, gen=1) is now stale — page 3 (score 2.5) is genuinely lowest. + census.Observe(MakeStat(/*addr=*/4, /*capacity=*/20, /*used=*/4)); + + ASSERT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 0u); + + // New page with score 5.0 — pushes us over cap. Lazy-pop should skip the + // stale page-4 entry and evict page 3 (the genuinely-lowest live page, + // score 2.5). Use a score strictly above page 3's so page 3 is the + // unambiguous next-lowest after the stale skip. + census.Observe(MakeStat(/*addr=*/6, /*capacity=*/10, /*used=*/2)); + + EXPECT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 1u); + EXPECT_TRUE(census.pages().contains(1)); + EXPECT_TRUE(census.pages().contains(2)); + EXPECT_FALSE(census.pages().contains(3)); + EXPECT_TRUE(census.pages().contains(4)); + EXPECT_TRUE(census.pages().contains(6)); +} + +TEST(PageCensusEvictionTest, RebuildFiresWhenHeapDoublesCap) { + if (!PageCensus::kEnableTopK) { + GTEST_SKIP() << "PageCensus::kEnableTopK is false; heap rebuild inactive"; + } + CensusStats cstats; + PageCensus census(&cstats, /*max_retained_pages=*/4); + + // Fill to cap. Heap size = 4. + census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2)); + census.Observe(MakeStat(/*addr=*/2, /*capacity=*/10, /*used=*/3)); + census.Observe(MakeStat(/*addr=*/3, /*capacity=*/10, /*used=*/4)); + census.Observe(MakeStat(/*addr=*/4, /*capacity=*/10, /*used=*/5)); + + EXPECT_EQ(census.stats().heap_rebuilds, 0u); + + // Re-observe page 1 repeatedly. Each call pushes a new heap entry without + // changing pages_.size(). Heap grows: 5, 6, 7, 8, 9 — rebuild fires once + // we cross 2 * max_retained_pages_ (i.e., > 8). + for (int i = 0; i < 5; ++i) { + census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2)); + } + + EXPECT_GE(census.stats().heap_rebuilds, 1u); + EXPECT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 0u); +} + +TEST(PageCensusEvictionTest, RejectsNewPageWithScoreBelowWorstRetained) { + if (!PageCensus::kEnableTopK) { + GTEST_SKIP() << "PageCensus::kEnableTopK is false; reject-on-cap inactive"; + } + CensusStats cstats; + PageCensus census(&cstats, /*max_retained_pages=*/4); + + // Fill to cap with scores 5.0, 3.33, 2.5, 2.0. Worst retained is page 4 at 2.0. 
+ census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2)); + census.Observe(MakeStat(/*addr=*/2, /*capacity=*/10, /*used=*/3)); + census.Observe(MakeStat(/*addr=*/3, /*capacity=*/10, /*used=*/4)); + census.Observe(MakeStat(/*addr=*/4, /*capacity=*/10, /*used=*/5)); + + ASSERT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().skipped_low_score, 0u); + + // New page with score 1.42 (cap=10, used=7) — strictly below the worst retained. + // Should be rejected upfront: not inserted, no eviction, no allocations_recorded bump. + const uint64_t recorded_before = census.stats().allocations_recorded; + census.Observe(MakeStat(/*addr=*/5, /*capacity=*/10, /*used=*/7)); + + EXPECT_EQ(census.pages().size(), 4u); + EXPECT_FALSE(census.pages().contains(5)); + EXPECT_EQ(census.stats().skipped_low_score, 1u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 0u); + EXPECT_EQ(census.stats().allocations_recorded, recorded_before); +} + +namespace { + +// Drives N observations on a single page so that observed_movable_blocks lands at N +// and used_blocks/capacity reflect the last call. +void ObserveTimes(PageCensus& census, int times, uintptr_t addr, uint16_t capacity, uint16_t used) { + for (int i = 0; i < times; ++i) { + census.Observe(MakeStat(addr, capacity, used)); + } +} + +} // namespace + +TEST(TargetPlanTest, AppliesFilterClassification) { + CensusStats cstats; + PageCensus census(&cstats); + + // KEEP: movable == used. + ObserveTimes(census, /*times=*/4, /*addr=*/1, /*capacity=*/10, /*used=*/4); + // kAlreadyEmpty: used == 0. + ObserveTimes(census, /*times=*/1, /*addr=*/2, /*capacity=*/10, /*used=*/0); + // kStaleObservation: movable (5) > used (3). + ObserveTimes(census, /*times=*/5, /*addr=*/3, /*capacity=*/10, /*used=*/3); + // kHasImmovableData: movable (2) < used (5). + ObserveTimes(census, /*times=*/2, /*addr=*/4, /*capacity=*/10, /*used=*/5); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EXPECT_EQ(plan.size(), 1u); + EXPECT_EQ(plan.stats().targets_kept, 1u); + EXPECT_EQ(plan.stats().filtered_already_empty, 1u); + EXPECT_EQ(plan.stats().filtered_stale, 1u); + EXPECT_EQ(plan.stats().filtered_has_immovable_data, 1u); + EXPECT_EQ(plan.stats().filtered_no_observed_blocks, 0u); + EXPECT_EQ(plan.stats().truncated_by_cap, 0u); + EXPECT_TRUE(plan.Contains(1)); + EXPECT_FALSE(plan.Contains(2)); + EXPECT_FALSE(plan.Contains(3)); + EXPECT_FALSE(plan.Contains(4)); +} + +TEST(TargetPlanTest, SortsByScoreDescending) { + CensusStats cstats; + PageCensus census(&cstats); + + // Three KEEP-eligible pages with distinct scores: 5.0, 2.5, 2.0. + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + ObserveTimes(census, /*times=*/4, /*addr=*/200, /*capacity=*/10, /*used=*/4); + ObserveTimes(census, /*times=*/5, /*addr=*/300, /*capacity=*/10, /*used=*/5); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + ASSERT_EQ(plan.size(), 3u); + EXPECT_EQ(plan.targets()[0].page_address, 100u); + EXPECT_EQ(plan.targets()[1].page_address, 200u); + EXPECT_EQ(plan.targets()[2].page_address, 300u); + EXPECT_GT(plan.targets()[0].retention_score_at_census, + plan.targets()[1].retention_score_at_census); + EXPECT_GT(plan.targets()[1].retention_score_at_census, + plan.targets()[2].retention_score_at_census); +} + +TEST(TargetPlanTest, TruncatesToMaxTargets) { + CensusStats cstats; + PageCensus census(&cstats); + + // Four KEEP-eligible pages with descending scores. 
+ ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); // 5.0 + ObserveTimes(census, /*times=*/3, /*addr=*/200, /*capacity=*/10, /*used=*/3); // 3.33 + ObserveTimes(census, /*times=*/4, /*addr=*/300, /*capacity=*/10, /*used=*/4); // 2.5 + ObserveTimes(census, /*times=*/5, /*addr=*/400, /*capacity=*/10, /*used=*/5); // 2.0 + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census, /*max_targets=*/2); + + EXPECT_EQ(plan.size(), 2u); + EXPECT_EQ(plan.stats().targets_kept, 2u); + EXPECT_EQ(plan.stats().truncated_by_cap, 2u); + EXPECT_TRUE(plan.Contains(100)); + EXPECT_TRUE(plan.Contains(200)); + EXPECT_FALSE(plan.Contains(300)); + EXPECT_FALSE(plan.Contains(400)); +} + +TEST(TargetPlanTest, AddressIndexLookup) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/2, /*addr=*/0x1000, /*capacity=*/10, /*used=*/2); + ObserveTimes(census, /*times=*/4, /*addr=*/0x2000, /*capacity=*/10, /*used=*/4); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + ASSERT_EQ(plan.size(), 2u); + + const TargetPage* found = plan.Find(0x1000); + ASSERT_NE(found, nullptr); + EXPECT_EQ(found->page_address, 0x1000u); + EXPECT_EQ(found->blocks_at_census, 2u); + EXPECT_EQ(found->capacity_blocks, 10u); + + EXPECT_EQ(plan.Find(0xDEAD), nullptr); + EXPECT_FALSE(plan.Contains(0xDEAD)); +} + +TEST(TargetPlanTest, BuildFromIsIdempotent) { + CensusStats cstats1; + PageCensus census1(&cstats1); + ObserveTimes(census1, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + ObserveTimes(census1, /*times=*/4, /*addr=*/200, /*capacity=*/10, /*used=*/4); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census1); + ASSERT_EQ(plan.size(), 2u); + + // Second census with different pages — plan should fully replace its state. + CensusStats cstats2; + PageCensus census2(&cstats2); + ObserveTimes(census2, /*times=*/3, /*addr=*/500, /*capacity=*/10, /*used=*/3); + + plan.BuildFrom(census2); + EXPECT_EQ(plan.size(), 1u); + EXPECT_TRUE(plan.Contains(500)); + EXPECT_FALSE(plan.Contains(100)); + EXPECT_FALSE(plan.Contains(200)); + EXPECT_EQ(plan.stats().targets_kept, 1u); +} + +namespace { + +// Convenience: stat for an eligible page (BELOW_THRESHOLD set, no other dfly flags). 
+mi_page_usage_stats_t EligibleStat(uintptr_t addr) { + return MakeStat(addr, /*capacity=*/10, /*used=*/4); +} + +} // namespace + +TEST(EvacDecideTest, NotATargetWhenPageMissingFromPlan) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + EvacOutcome outcome = EvacDecide(plan, EligibleStat(/*addr=*/0xDEAD), stats); + + EXPECT_EQ(outcome, EvacOutcome::kNotATarget); + EXPECT_EQ(stats.blocks_skipped_not_target, 1u); + EXPECT_EQ(stats.blocks_skipped_target_done, 0u); + EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 0u); + EXPECT_EQ(stats.blocks_move_committed, 0u); +} + +TEST(EvacDecideTest, CommitsMoveAndBumpsCountersOnFirstCall) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.size(), 1u); + + EvacStats stats; + EvacOutcome outcome = EvacDecide(plan, EligibleStat(/*addr=*/100), stats); + + EXPECT_EQ(outcome, EvacOutcome::kCommitMove); + EXPECT_EQ(stats.blocks_move_committed, 1u); + EXPECT_EQ(stats.blocks_skipped_not_target, 0u); + EXPECT_EQ(stats.blocks_skipped_target_done, 0u); + EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 0u); + const TargetPage* target = plan.Find(100); + ASSERT_NE(target, nullptr); + EXPECT_EQ(target->blocks_evacuated, 1u); + EXPECT_FALSE(target->revalidation_failed); +} + +TEST(EvacDecideTest, ReturnsTargetAlreadyDoneOnceCounterReachesCensus) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.Find(100)->blocks_at_census, 2u); + + EvacStats stats; + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kTargetAlreadyDone); + + EXPECT_EQ(stats.blocks_move_committed, 2u); + EXPECT_EQ(stats.blocks_skipped_target_done, 1u); + EXPECT_EQ(plan.Find(100)->blocks_evacuated, 2u); +} + +TEST(EvacDecideTest, MultiTargetCountersAreIndependent) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/3, /*addr=*/100, /*capacity=*/10, /*used=*/3); + ObserveTimes(census, /*times=*/2, /*addr=*/200, /*capacity=*/10, /*used=*/2); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(200), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(200), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(200), stats), EvacOutcome::kTargetAlreadyDone); + + EXPECT_EQ(plan.Find(100)->blocks_evacuated, 2u); + EXPECT_EQ(plan.Find(200)->blocks_evacuated, 2u); + EXPECT_EQ(stats.blocks_move_committed, 4u); + EXPECT_EQ(stats.blocks_skipped_target_done, 1u); +} + +TEST(EvacDecideTest, RevalidationFailsForIneligibleFlags) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + 
TargetPlan plan(&pstats); + plan.BuildFrom(census); + + // Page is now FULL — no longer a defrag candidate. + EvacStats stats; + mi_page_usage_stats_t bad = + MakeStat(/*addr=*/100, /*capacity=*/10, /*used=*/4, /*flags=*/MI_DFLY_PAGE_FULL); + EXPECT_EQ(EvacDecide(plan, bad, stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 1u); + EXPECT_TRUE(plan.Find(100)->revalidation_failed); + EXPECT_EQ(plan.Find(100)->blocks_evacuated, 0u); +} + +TEST(EvacDecideTest, RevalidationFailureIsSticky) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + // First call: page is no longer below threshold (flags=0). Sticky flag set. + mi_page_usage_stats_t bad = MakeStat(/*addr=*/100, /*capacity=*/10, /*used=*/9, /*flags=*/0); + EXPECT_EQ(EvacDecide(plan, bad, stats), EvacOutcome::kRevalidationFailed); + + // Subsequent calls — even with eligible flags — still fail via the sticky path. + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + + EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 3u); + EXPECT_EQ(stats.blocks_move_committed, 0u); + EXPECT_EQ(plan.Find(100)->blocks_evacuated, 0u); +} + +TEST(EvacDecideTest, RevalidationBreakdownAttributesBlocksToOriginatingReason) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + ObserveTimes(census, /*times=*/4, /*addr=*/200, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + constexpr uint32_t kBlockSize = 64; + + // Target 100 fails with HEAP_MISMATCH; revisit twice on sticky path. + mi_page_usage_stats_t mismatch = + MakeStat(/*addr=*/100, /*capacity=*/10, /*used=*/4, /*flags=*/MI_DFLY_HEAP_MISMATCH); + EXPECT_EQ(EvacDecide(plan, mismatch, stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + + // Target 200 fails with PAGE_FULL; revisit once. + mi_page_usage_stats_t full = + MakeStat(/*addr=*/200, /*capacity=*/10, /*used=*/4, /*flags=*/MI_DFLY_PAGE_FULL); + EXPECT_EQ(EvacDecide(plan, full, stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(EvacDecide(plan, EligibleStat(200), stats), EvacOutcome::kRevalidationFailed); + + // Target-grain counters: one each. + EXPECT_EQ(stats.targets_revalidation_heap_mismatch, 1u); + EXPECT_EQ(stats.targets_revalidation_full_page, 1u); + EXPECT_EQ(stats.targets_revalidation_active_malloc_page, 0u); + EXPECT_EQ(stats.targets_revalidation_above_threshold, 0u); + + // Block-grain breakdown: 3 blocks attributed to heap_mismatch, 2 to full_page. + EXPECT_EQ(stats.blocks_revalidation_heap_mismatch, 3u); + EXPECT_EQ(stats.blocks_revalidation_full_page, 2u); + EXPECT_EQ(stats.blocks_revalidation_active_malloc_page, 0u); + EXPECT_EQ(stats.blocks_revalidation_above_threshold, 0u); + + // Bytes mirror blocks * block_size. + EXPECT_EQ(stats.bytes_revalidation_heap_mismatch, 3u * kBlockSize); + EXPECT_EQ(stats.bytes_revalidation_full_page, 2u * kBlockSize); + + // Aggregates equal the sum of the breakdown. 
+ EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 5u); + EXPECT_EQ(stats.bytes_skipped_revalidation_failed, 5u * kBlockSize); + EXPECT_EQ(stats.blocks_revalidation_heap_mismatch + stats.blocks_revalidation_full_page, + stats.blocks_skipped_revalidation_failed); +} + +TEST(EvacDecideTest, AllTargetsDoneTrueForEmptyPlan) { + PlanStats pstats; + TargetPlan plan(&pstats); + EXPECT_TRUE(plan.AllTargetsDone()); +} + +TEST(EvacDecideTest, AllTargetsDoneFalseAfterBuild) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.size(), 1u); + EXPECT_FALSE(plan.AllTargetsDone()); +} + +TEST(EvacDecideTest, CompletionFlipsAllTargetsDone) { + CensusStats cstats; + PageCensus census(&cstats); + // Single target with blocks_at_census = 2. + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + EXPECT_FALSE(plan.AllTargetsDone()); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_FALSE(plan.AllTargetsDone()); // 1 of 2 done, target still pending + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_TRUE(plan.AllTargetsDone()); // 2 of 2 done — target completed + // Subsequent calls don't double-decrement. + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kTargetAlreadyDone); + EXPECT_TRUE(plan.AllTargetsDone()); +} + +TEST(EvacDecideTest, RevalidationFailureFlipsAllTargetsDone) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + EXPECT_FALSE(plan.AllTargetsDone()); + mi_page_usage_stats_t bad = + MakeStat(/*addr=*/100, /*capacity=*/10, /*used=*/4, /*flags=*/MI_DFLY_PAGE_FULL); + EXPECT_EQ(EvacDecide(plan, bad, stats), EvacOutcome::kRevalidationFailed); + EXPECT_TRUE(plan.AllTargetsDone()); // single target now abandoned + // Subsequent sticky calls don't double-decrement. + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + EXPECT_TRUE(plan.AllTargetsDone()); +} + +TEST(EvacDecideTest, AllTargetsDoneOnlyWhenEveryTargetSettled) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + ObserveTimes(census, /*times=*/2, /*addr=*/200, /*capacity=*/10, /*used=*/2); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.size(), 2u); + + EvacStats stats; + EXPECT_FALSE(plan.AllTargetsDone()); + + // Complete target 100 fully — plan still has work pending on 200. + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_FALSE(plan.AllTargetsDone()); + + // Mix: revalidation-fail target 200 — plan now fully settled. 
+ mi_page_usage_stats_t bad = + MakeStat(/*addr=*/200, /*capacity=*/10, /*used=*/2, /*flags=*/MI_DFLY_PAGE_FULL); + EXPECT_EQ(EvacDecide(plan, bad, stats), EvacOutcome::kRevalidationFailed); + EXPECT_TRUE(plan.AllTargetsDone()); +} + +TEST(VerifyTest, EmptyPlanGivesZeros) { + PlanStats pstats; + TargetPlan plan(&pstats); + CycleProgress p = RunVerify(plan); + EXPECT_EQ(p.targets_complete, 0u); + EXPECT_EQ(p.targets_partial, 0u); + EXPECT_EQ(p.targets_no_progress, 0u); +} + +TEST(VerifyTest, ClassifiesByBlocksEvacuated) { + CensusStats cstats; + PageCensus census(&cstats); + // Three targets, each with blocks_at_census = N (since movable == used). + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); // complete + ObserveTimes(census, /*times=*/4, /*addr=*/200, /*capacity=*/10, /*used=*/4); // partial + ObserveTimes(census, /*times=*/4, /*addr=*/300, /*capacity=*/10, /*used=*/4); // no progress + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.size(), 3u); + + // Simulate EVACUATE outcomes via direct mutation through FindMut. + plan.FindMut(100)->blocks_evacuated = 4; // == blocks_at_census → complete + plan.FindMut(200)->blocks_evacuated = 2; // 0 < x < blocks_at_census → partial + plan.FindMut(300)->blocks_evacuated = 0; // → no progress + + CycleProgress p = RunVerify(plan); + EXPECT_EQ(p.targets_complete, 1u); + EXPECT_EQ(p.targets_partial, 1u); + EXPECT_EQ(p.targets_no_progress, 1u); +} + +TEST(VerifyTest, OvershootCountsAsComplete) { + // Defensive: blocks_evacuated > blocks_at_census shouldn't happen given the + // EvacDecide guard, but verify the boundary check uses >= not ==. + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + plan.FindMut(100)->blocks_evacuated = 5; // > blocks_at_census (2) + + CycleProgress p = RunVerify(plan); + EXPECT_EQ(p.targets_complete, 1u); + EXPECT_EQ(p.targets_partial, 0u); + EXPECT_EQ(p.targets_no_progress, 0u); +} + +// ===================================================================== +// Microbenchmarks for the CENSUS / EVACUATE per-object hot path. +// Run with: ./defrag_test --benchmark_filter='BM_.*' +// ===================================================================== +namespace { + +constexpr size_t kBenchObjectCount = 10000; +constexpr size_t kBenchBlockSize = 64; + +void InitBenchEnv() { + static bool initialized = false; + if (!initialized) { + init_zmalloc_threadlocal(mi_heap_get_backing()); + initialized = true; + } +} + +// Holds a batch of allocations; frees them on destruction. 
+struct AllocationBatch {
+  std::vector<void*> pointers;
+  ~AllocationBatch() {
+    for (void* p : pointers) {
+      mi_free(p);
+    }
+  }
+};
+
+AllocationBatch AllocBatch(size_t count, size_t block_size) {
+  AllocationBatch ab;
+  ab.pointers.reserve(count);
+  mi_heap_t* heap = mi_heap_get_default();
+  for (size_t i = 0; i < count; ++i) {
+    ab.pointers.push_back(mi_heap_malloc(heap, block_size));
+  }
+  return ab;
+}
+
+}  // namespace
+
+void BM_PtrPage(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(_mi_ptr_page(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_PtrPage)->Arg(kBenchObjectCount);
+
+void BM_ProbeHeap(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  mi_heap_t* heap = mi_heap_get_default();
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      auto stat = mi_heap_page_is_underutilized(heap, p, 0.8f, /*collect_stats=*/true);
+      benchmark::DoNotOptimize(stat);
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_ProbeHeap)->Arg(kBenchObjectCount);
+
+void BM_ProbeNoHeap(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      mi_page_usage_stats_t stat;
+      zmalloc_page_is_underutilized(p, 0.8f, /*collect_stats=*/true, &stat);
+      benchmark::DoNotOptimize(stat);
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_ProbeNoHeap)->Arg(kBenchObjectCount);
+
+void BM_HashFindMiss(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  absl::flat_hash_map<uintptr_t, int> empty;
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      benchmark::DoNotOptimize(empty.find(addr));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_HashFindMiss)->Arg(kBenchObjectCount);
+
+void BM_HashFindHit(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  absl::flat_hash_map<uintptr_t, int> populated;
+  for (void* p : ab.pointers) {
+    populated[reinterpret_cast<uintptr_t>(_mi_ptr_page(p))] = 1;
+  }
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      benchmark::DoNotOptimize(populated.find(addr));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_HashFindHit)->Arg(kBenchObjectCount);
+
+void BM_HashEmplaceFresh(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  for (auto _ : state) {
+    state.PauseTiming();
+    absl::flat_hash_map<uintptr_t, int> m;
+    state.ResumeTiming();
+    for (void* p : ab.pointers) {
+      m.emplace(reinterpret_cast<uintptr_t>(_mi_ptr_page(p)), 1);
+    }
+    benchmark::DoNotOptimize(m);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_HashEmplaceFresh)->Arg(kBenchObjectCount);
+      auto stat = mi_heap_page_is_underutilized(heap, p, 0.8f, /*collect_stats=*/true);
+      census.Observe(stat, /*bucket_cursor=*/0);
+    }
+    benchmark::DoNotOptimize(census);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_ProbeAndObserve)->Arg(kBenchObjectCount);
+
+void BM_EvacDecideMiss(benchmark::State& state) {
+  // Empty plan -> every pointer hits the kNotATarget early-bail. Models
+  // the 95% non-target case in EVACUATE.
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  for (auto _ : state) {
+    EvacStats es{};
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      TargetPage* target = plan.FindMut(addr);
+      if (target == nullptr) {
+        ++es.blocks_skipped_not_target;
+        continue;
+      }
+      // Unreachable in this microbench.
+      auto stat = mi_heap_page_is_underutilized(mi_heap_get_default(), p, 0.8f, true);
+      EvacDecide(plan, target, stat, es);
+    }
+    benchmark::DoNotOptimize(es);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideMiss)->Arg(kBenchObjectCount);
+
+void BM_EvacDecideHit(benchmark::State& state) {
+  // Plan contains every page our allocations live on -> every pointer hits
+  // the EvacDecide commit path. Models the 5% on-target case.
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  mi_heap_t* heap = mi_heap_get_default();
+
+  // Build a synthetic census whose retention scoring doesn't filter our pages.
+  CensusStats cs;
+  PageCensus census(&cs);
+  for (void* p : ab.pointers) {
+    mi_page_usage_stats_t s{};
+    s.page_address = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;  // very low used -> high retention score
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census.Observe(s, 0);
+  }
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+
+  for (auto _ : state) {
+    EvacStats es{};
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      TargetPage* target = plan.FindMut(addr);
+      if (target == nullptr) {
+        ++es.blocks_skipped_not_target;
+        continue;
+      }
+      auto stat = mi_heap_page_is_underutilized(heap, p, 0.8f, /*collect_stats=*/true);
+      EvacDecide(plan, target, stat, es);
+    }
+    benchmark::DoNotOptimize(es);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideHit)->Arg(kBenchObjectCount);
+
+// Same shape as BM_EvacDecideMiss but goes through the full Evacuator:
+// bloom precheck rejects every page (empty plan -> bloom is empty -> no
+// hashes computed beyond bloom). Compared against BM_EvacDecideMiss this
+// shows the bloom's contribution on the no-target hot path.
+void BM_EvacDecideMiss_Evacuator(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  EvacStats es;
+  Evacuator visitor(&plan, 0.8f, &es);
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(visitor.IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideMiss_Evacuator)->Arg(kBenchObjectCount);
+
+// Same shape as BM_EvacDecideHit but goes through the full Evacuator:
+// bloom hits, FindMut hits, per-page slice cache fills on first object per
+// page and short-circuits mi_heap_page_is_underutilized for siblings.
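+// ("Siblings" are objects whose _mi_ptr_page resolves to the same page; with
+// 64-byte blocks a 64 KiB mimalloc page holds on the order of a thousand
+// blocks, so nearly every probe after the first per page is a cache hit.)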
+// Compared against BM_EvacDecideHit this shows the cache's contribution.
+void BM_EvacDecideHit_Evacuator(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  for (void* p : ab.pointers) {
+    mi_page_usage_stats_t s{};
+    s.page_address = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census.Observe(s, 0);
+  }
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+  EvacStats es;
+  Evacuator visitor(&plan, 0.8f, &es);
+
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(visitor.IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideHit_Evacuator)->Arg(kBenchObjectCount);
+
+namespace {
+
+// Non-virtual mirror of Evacuator's hot path. Identical body, but no
+// inheritance, no vtable, and the method is defined inline so the compiler
+// can fold it into the caller. Comparing against BM_EvacDecideHit_Evacuator
+// isolates how much of the per-call overhead is virtual dispatch + worse
+// inlining vs structural cost (member access through `this`).
+class EvacuatorNonVirt {
+ public:
+  EvacuatorNonVirt(TargetPlan* plan, float threshold, EvacStats* evac_stats)
+      : plan_(plan), threshold_(threshold), evac_stats_(evac_stats) {
+  }
+
+  bool IsPageForObjectUnderUtilized(void* object) {
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(object));
+    TargetPage* target = plan_->FindMut(addr);
+    if (target == nullptr) {
+      ++evac_stats_->blocks_skipped_not_target;
+      return false;
+    }
+    const mi_page_usage_stats_t stat = mi_heap_page_is_underutilized(
+        static_cast<mi_heap_t*>(zmalloc_heap), object, threshold_, /*collect_stats=*/true);
+    return EvacDecide(*plan_, target, stat, *evac_stats_) == EvacOutcome::kCommitMove;
+  }
+
+ private:
+  TargetPlan* plan_;
+  float threshold_;
+  EvacStats* evac_stats_;
+};
+
+}  // namespace
+
+void BM_EvacDecideHit_NonVirt(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  for (void* p : ab.pointers) {
+    mi_page_usage_stats_t s{};
+    s.page_address = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census.Observe(s, 0);
+  }
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+  EvacStats es;
+  EvacuatorNonVirt visitor(&plan, 0.8f, &es);
+
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(visitor.IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideHit_NonVirt)->Arg(kBenchObjectCount);
+
+namespace {
+
+// Tagged-dispatch variant: simulates the design alternative where PageUsage
+// has a non-virtual IsPageForObjectUnderUtilized that switches on a kind_
+// enum and forwards to the concrete subclass's non-virtual impl. All bodies
+// are inline so the compiler can see end-to-end. The bench calls through a
+// base-class pointer to mimic how the production walker holds PageUsage*.
+//
+// If this matches NonVirt, tagged dispatch is a viable refactor — no virtual
+// dispatch, no template cascade, just an enum + switch in the base class.
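+//
+// (A CRTP visitor would be the other non-virtual alternative, but it forces a
+// template parameter onto every walker that holds the visitor; the enum +
+// switch keeps the base class a plain, type-erased pointer target.)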
+
+enum class TestVisitorKind : uint8_t { kEvacuator };
+
+class TestEvacuatorTagged;
+
+class TestPageUsageBase {
+ public:
+  // Non-virtual, defined inline (after subclass impls) so the switch can
+  // inline the called method directly.
+  inline bool IsPageForObjectUnderUtilized(void* object);
+
+ protected:
+  TestVisitorKind kind_;
+};
+
+class TestEvacuatorTagged : public TestPageUsageBase {
+ public:
+  TestEvacuatorTagged(TargetPlan* plan, float threshold, EvacStats* evac_stats)
+      : plan_(plan), threshold_(threshold), evac_stats_(evac_stats) {
+    kind_ = TestVisitorKind::kEvacuator;
+  }
+
+  // Non-virtual, inline. Body is identical to Evacuator's production impl.
+  bool IsPageForObjectUnderUtilizedImpl(void* object) {
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(object));
+    TargetPage* target = plan_->FindMut(addr);
+    if (target == nullptr) {
+      ++evac_stats_->blocks_skipped_not_target;
+      return false;
+    }
+    const mi_page_usage_stats_t stat = mi_heap_page_is_underutilized(
+        static_cast<mi_heap_t*>(zmalloc_heap), object, threshold_, /*collect_stats=*/true);
+    return EvacDecide(*plan_, target, stat, *evac_stats_) == EvacOutcome::kCommitMove;
+  }
+
+ private:
+  TargetPlan* plan_;
+  float threshold_;
+  EvacStats* evac_stats_;
+};
+
+inline bool TestPageUsageBase::IsPageForObjectUnderUtilized(void* object) {
+  switch (kind_) {
+    case TestVisitorKind::kEvacuator:
+      return static_cast<TestEvacuatorTagged*>(this)->IsPageForObjectUnderUtilizedImpl(object);
+  }
+  __builtin_unreachable();
+}
+
+}  // namespace
+
+void BM_EvacDecideHit_TaggedDispatch(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  for (void* p : ab.pointers) {
+    mi_page_usage_stats_t s{};
+    s.page_address = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census.Observe(s, 0);
+  }
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+  EvacStats es;
+  TestEvacuatorTagged visitor(&plan, 0.8f, &es);
+  // Call through base pointer to mimic how the production walker dispatches.
+  TestPageUsageBase* base = &visitor;
+
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(base->IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideHit_TaggedDispatch)->Arg(kBenchObjectCount);
+
+// Populated plan + queries that all miss. Models the dominant production
+// case: EVAC walks the prime table over millions of objects while the plan
+// holds a few thousand target pages — most objects are on non-target pages
+// and need fast rejection. Plan is built from synthetic addresses unrelated
+// to the real allocations so every query miss hits the populated-map find()
+// path (raw variant) or the bloom-rejection path (Evacuator variant).
+namespace {
+
+constexpr size_t kBenchSyntheticPlanSize = 4000;
+
+void PopulatePlanWithSyntheticAddrs(PageCensus* census, size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    mi_page_usage_stats_t s{};
+    // Addresses well above any mimalloc-managed range; 64 KiB stride matches
+    // mimalloc page alignment so the addresses look plausible to the bloom
+    // hasher.
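+    // Base 0x100000000 is 4 GiB and the stride 0x10000 is 64 KiB; the
+    // assumption is that no mimalloc segment in the test process is mapped
+    // that high, so no synthetic address aliases a live mi_page_t.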
+    s.page_address = 0x100000000ULL + i * 0x10000ULL;
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census->Observe(s, 0);
+  }
+}
+
+// RAII: turn off the skip-bit setter for the duration of this object. The
+// populated-plan benchmarks use synthetic addresses that aren't valid
+// mi_page_t*; SetDefragSkipIfEnabled would dereference them and segfault on
+// BuildFrom and on ~TargetPlan. Restoring on dtor (declare this BEFORE the
+// TargetPlan so the plan destructs first while the flag is still off).
+struct DefragSkipBitOff {
+  bool prev;
+  DefragSkipBitOff() : prev(absl::GetFlag(FLAGS_defrag_use_skip_bit)) {
+    absl::SetFlag(&FLAGS_defrag_use_skip_bit, false);
+  }
+  ~DefragSkipBitOff() {
+    absl::SetFlag(&FLAGS_defrag_use_skip_bit, prev);
+  }
+};
+
+}  // namespace
+
+void BM_EvacDecideMiss_Populated(benchmark::State& state) {
+  DefragSkipBitOff skip_bit_off;  // declared first → destroyed last
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  PopulatePlanWithSyntheticAddrs(&census, kBenchSyntheticPlanSize);
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+
+  for (auto _ : state) {
+    EvacStats es{};
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      TargetPage* target = plan.FindMut(addr);
+      if (target == nullptr) {
+        ++es.blocks_skipped_not_target;
+        continue;
+      }
+      auto stat = mi_heap_page_is_underutilized(mi_heap_get_default(), p, 0.8f, true);
+      EvacDecide(plan, target, stat, es);
+    }
+    benchmark::DoNotOptimize(es);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideMiss_Populated)->Arg(kBenchObjectCount);
+
+void BM_EvacDecideMiss_Populated_Evacuator(benchmark::State& state) {
+  DefragSkipBitOff skip_bit_off;  // declared first → destroyed last
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  PopulatePlanWithSyntheticAddrs(&census, kBenchSyntheticPlanSize);
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+  EvacStats es;
+  Evacuator visitor(&plan, 0.8f, &es);
+
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(visitor.IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideMiss_Populated_Evacuator)->Arg(kBenchObjectCount);
+
+// =====================================================================
+// Microbenchmarks for the underutil-callback per-free overhead.
+//
+// Each variant allocates a fresh batch of objects (untimed) and frees them
+// (timed); the delta between variants isolates the cost the dragonfly
+// mimalloc patch adds to mi_free_block_local:
+//
+//   BM_Free_CallbackOff     callback unregistered. Only the unconditional
+//                           prev_used load runs; the arith block and
+//                           indirect call are short-circuited by the NULL
+//                           check on _mi_dfly_underutil_cb.
+//   BM_Free_CallbackNoOp    callback registered, body just bumps a counter.
+//                           Adds the per-free arith block (cap_thr,
+//                           prev_x100, cur_x100, two compares) on EVERY
+//                           free, plus an indirect call on edge crossings.
+//   BM_Free_CallbackInsert  callback registered, body inserts into a
+//                           thread_local flat_hash_set. Adds hash work on
+//                           top of NoOp, only on edge crossings.
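+//
+// (An "edge crossing" is the free that takes a page's used/capacity from at
+// or above the threshold to below it; per the callback patch it fires on the
+// downward crossing, not on every free that happens below the threshold.)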
+//
+// Differences:
+//   (NoOp - Off)    = per-free arith cost (paid by every delete)
+//   (Insert - NoOp) = per-edge insert cost amortized over total frees
+//
+// The "edges/iter" counter reports the number of callback invocations per
+// iteration so per-edge cost can be derived.
+// Run with: ./defrag_test --benchmark_filter='BM_Free_.*'
+// =====================================================================
+
+namespace {
+
+thread_local absl::flat_hash_set<uintptr_t> g_bench_underutil_set;
+thread_local size_t g_bench_underutil_count = 0;
+
+void BenchCallbackNoOp(uintptr_t /*addr*/) {
+  ++g_bench_underutil_count;
+}
+
+void BenchCallbackInsert(uintptr_t addr) {
+  ++g_bench_underutil_count;
+  g_bench_underutil_set.insert(addr);
+}
+
+void RunFreeBench(benchmark::State& state, size_t count) {
+  InitBenchEnv();
+  size_t total_edges = 0;
+  for (auto _ : state) {
+    state.PauseTiming();
+    g_bench_underutil_set.clear();
+    g_bench_underutil_count = 0;
+    std::vector<void*> ptrs;
+    ptrs.reserve(count);
+    mi_heap_t* heap = mi_heap_get_default();
+    for (size_t i = 0; i < count; ++i) {
+      ptrs.push_back(mi_heap_malloc(heap, kBenchBlockSize));
+    }
+    state.ResumeTiming();
+    for (void* p : ptrs) {
+      mi_free(p);
+    }
+    state.PauseTiming();
+    total_edges += g_bench_underutil_count;
+    state.ResumeTiming();
+  }
+  state.counters["edges/iter"] =
+      benchmark::Counter(static_cast<double>(total_edges) / state.iterations());
+  state.SetItemsProcessed(state.iterations() * count);
+}
+
+}  // namespace
+
+void BM_Free_CallbackOff(benchmark::State& state) {
+  mi_dfly_set_underutil_callback(nullptr);
+  RunFreeBench(state, state.range(0));
+}
+BENCHMARK(BM_Free_CallbackOff)->Arg(kBenchObjectCount);
+
+void BM_Free_CallbackNoOp(benchmark::State& state) {
+  mi_dfly_set_underutil_threshold_pct(80);
+  mi_dfly_set_underutil_callback(&BenchCallbackNoOp);
+  RunFreeBench(state, state.range(0));
+  mi_dfly_set_underutil_callback(nullptr);
+}
+BENCHMARK(BM_Free_CallbackNoOp)->Arg(kBenchObjectCount);
+
+void BM_Free_CallbackInsert(benchmark::State& state) {
+  mi_dfly_set_underutil_threshold_pct(80);
+  mi_dfly_set_underutil_callback(&BenchCallbackInsert);
+  RunFreeBench(state, state.range(0));
+  mi_dfly_set_underutil_callback(nullptr);
+}
+BENCHMARK(BM_Free_CallbackInsert)->Arg(kBenchObjectCount);
+
+}  // namespace dfly
diff --git a/src/server/dragonfly_test.cc b/src/server/dragonfly_test.cc
index c2e2204f941e..d3cfebf49636 100644
--- a/src/server/dragonfly_test.cc
+++ b/src/server/dragonfly_test.cc
@@ -29,6 +29,9 @@
 ABSL_DECLARE_FLAG(bool, lua_resp2_legacy_float);
 ABSL_DECLARE_FLAG(double, eviction_memory_budget_threshold);
 ABSL_DECLARE_FLAG(std::vector<std::string>, command_alias);
 ABSL_DECLARE_FLAG(bool, latency_tracking);
+ABSL_DECLARE_FLAG(bool, experimental_defrag);
+ABSL_DECLARE_FLAG(uint64_t, defrag_min_plan_reclaimable_bytes);
+ABSL_DECLARE_FLAG(bool, defrag_keys);
 
 namespace dfly {
 
@@ -883,6 +886,103 @@ TEST_F(DefragDflyEngineTest, DefragEventuallyFinishes) {
   });
 }
 
+// Validates that key defragmentation in the phased algorithm doesn't corrupt data.
+// Uses large keys (>18 bytes) to force external heap allocations, fragments them,
+// runs defrag, then verifies all keys are still accessible and that both key and
+// value reallocations occurred.
+TEST_F(DefragDflyEngineTest, KeyDefragIntegrity) {
+  absl::FlagSaver fs;
+  absl::SetFlag(&FLAGS_experimental_defrag, true);
+  absl::SetFlag(&FLAGS_defrag_keys, true);
+  // Disable the minimum-reclaimable guard so small-scale fragmentation triggers EVACUATE.
+ absl::SetFlag(&FLAGS_defrag_min_plan_reclaimable_bytes, 0); + + // Use keys > 18 bytes to force external allocation (not inline in CompactObj). + // Use values > 18 bytes to also force external allocation for values. + constexpr int kNumKeys = 5000; + constexpr int kValueSize = 64; + // Threshold: pages with used/capacity < 0.8 are candidates. + constexpr float kThreshold = 0.8f; + + // Populate with large keys and values. + for (int i = 0; i < kNumKeys; ++i) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + string val(kValueSize, 'A' + (i % 26)); + Run({"SET", key, val}); + } + + // Delete every other key to fragment key and value pages. + for (int i = 0; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + Run({"DEL", key}); + } + + // Run phased defrag. Track total key and value reallocations across all cycles. + uint64_t total_key_reallocs = 0; + uint64_t total_val_reallocs = 0; + uint64_t total_bytes_moved = 0; + + shard_set->pool()->AwaitFiberOnAll([&](unsigned, ProactorBase*) { + auto* shard = EngineShard::tlocal(); + if (!shard) + return; + + uint64_t shard_key_reallocs = 0; + uint64_t shard_val_reallocs = 0; + uint64_t shard_bytes_moved = 0; + + for (int i = 0; i < 500; ++i) { + PageUsage page_usage{CollectPageStats::NO, kThreshold, + CycleQuota{CycleQuota::kDefaultDefragQuota}}; + page_usage.SetForceReallocate(true); + auto report = shard->DoDefrag(&page_usage); + shard_key_reallocs += report.cycle_stats.evac_key_reallocations; + shard_val_reallocs += report.cycle_stats.evac_val_reallocations; + shard_bytes_moved += report.cycle_stats.evac_bytes_moved; + } + + // Atomic accumulation into shared counters (single shard in this fixture, but safe). + __atomic_fetch_add(&total_key_reallocs, shard_key_reallocs, __ATOMIC_RELAXED); + __atomic_fetch_add(&total_val_reallocs, shard_val_reallocs, __ATOMIC_RELAXED); + __atomic_fetch_add(&total_bytes_moved, shard_bytes_moved, __ATOMIC_RELAXED); + }); + + LOG(INFO) << "Defrag stats: key_reallocs=" << total_key_reallocs + << " val_reallocs=" << total_val_reallocs << " bytes_moved=" << total_bytes_moved; + + // Verify that both key AND value reallocations happened. + EXPECT_GT(total_key_reallocs, 0u) << "Expected key reallocations during defrag"; + EXPECT_GT(total_val_reallocs, 0u) << "Expected value reallocations during defrag"; + EXPECT_GT(total_bytes_moved, 0u) << "Expected non-zero bytes moved during defrag"; + + // Validate all remaining keys are intact after defrag — no corruption. + for (int i = 1; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + string expected_val(kValueSize, 'A' + (i % 26)); + auto resp = Run({"GET", key}); + ASSERT_EQ(resp, expected_val) << "Value corrupted after defrag: " << key; + } + + // Verify deleted keys are still gone (no resurrection from dangling pointers). + for (int i = 0; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + auto resp = Run({"EXISTS", key}); + ASSERT_THAT(resp, IntArg(0)) << "Deleted key reappeared after defrag: " << key; + } + + // Write new values to the defragged keys — verifies no dangling pointers. + for (int i = 1; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + ASSERT_EQ(Run({"SET", key, "new-value"}), "OK"); + } + + // Final read-back. 
+ for (int i = 1; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + ASSERT_EQ(Run({"GET", key}), "new-value"); + } +} + TEST_F(DflyEngineTest, Issue752) { // https://github.com/dragonflydb/dragonfly/issues/752 // local_result_ member was not reset between commands diff --git a/src/server/engine_shard.cc b/src/server/engine_shard.cc index 05b50ba35a85..f20d32c1bb40 100644 --- a/src/server/engine_shard.cc +++ b/src/server/engine_shard.cc @@ -21,6 +21,7 @@ extern "C" { } #include "server/blocking_controller.h" #include "server/db_slice.h" +#include "server/defrag.h" #include "server/engine_shard_set.h" #include "server/journal/journal.h" #include "server/namespaces.h" @@ -47,6 +48,29 @@ ABSL_FLAG(float, mem_defrag_page_utilization_threshold, 0.8, "memory page under utilization threshold. Ratio between used and committed size, below " "this, memory in this page will defragmented"); +ABSL_FLAG(bool, enable_bg_defrag, false, + "If true, run periodic defragmentation as an idle background task. " + "Defaults to false on this branch so tests only trigger defrag via " + "explicit MEMORY DEFRAGMENT calls."); + +ABSL_FLAG(bool, experimental_defrag, true, + "When true, run the phased defragmentation strategy (CENSUS / SELECT_TARGETS / " + "EVACUATE / VERIFY) instead of the legacy single-pass defragmenter. Experimental."); + +ABSL_FLAG(bool, disable_huffman_check, true, + "If true, skip the periodic huffman frequency-table check task that fires once " + "key memory crosses 50 MiB on shard 0. The task is currently informational only " + "(it logs the resulting table) and its multi-second prime-table walk overruns " + "the proactor's 500us idle-task budget on large datasets, generating chatter " + "and stealing single-core cycles from the workload. Default true while the " + "task remains a build-and-log experiment; set false to re-enable."); + +ABSL_FLAG(uint64_t, defrag_per_block_move_cost_bytes, 256, + "Per-block move-cost weight in the page retention score. Higher values push " + "pages with many small entries toward the back of the candidate ranking, " + "favoring large-block pages that reclaim more bytes per bucket walked during " + "EVACUATE. Useful for wide/mixed-size workloads."); + ABSL_FLAG(int32_t, hz, 100, "Base frequency at which the server performs other background tasks. " "Warning: not advised to decrease in production."); @@ -78,6 +102,191 @@ namespace { constexpr uint64_t kCursorDoneState = 0u; +// Runs one slice of the prime-table traversal: advances `*dbid` past invalid +// dbs, walks the current db's prime table dispatching DefragIfNeeded through +// `visitor`, and persists the new cursor position via `*cursor`. Stops when +// the visitor's quota depletes, the cursor wraps, or the global namespace +// pointer goes away. Mutates per-db memory accounting on the slice. +DbSliceResult RunPrimeTableSlice(DbSlice* slice, size_t* dbid, uint64_t* cursor, + PageUsage* visitor) { + // Skip past invalid dbs (e.g., dropped, not yet allocated). 
+  while (!slice->IsDbValid(*dbid) && *dbid + 1 < slice->db_array_size())
+    ++*dbid;
+
+  if (!slice->IsDbValid(*dbid)) {
+    return DbSliceResult{.finished_all_dbs = true};
+  }
+
+  auto* prime_table = slice->GetTables(*dbid);
+  PrimeTable::Cursor cur{*cursor};
+  DbSliceResult result;
+  const DbTable* db_table = slice->GetDBTable(*dbid);
+
+  const size_t dbid_before = *dbid;
+  const uint64_t cursor_before = cur.token();
+  const uint64_t start_ns = absl::GetCurrentTimeNanos();
+  uint64_t traverse_calls = 0;
+
+  bool quota_depleted = false;
+  bool should_stop = false;
+  bool cursor_done = false;
+  bool namespaces_null = false;
+
+  const bool read_only = visitor->IsReadOnly();
+  const bool defrag_keys = visitor->ShouldDefragKeys();
+  do {
+    visitor->SetCurrentBucketCursor(cur.token());
+    cur = prime_table->Traverse(cur, [&](PrimeIterator it) {
+      if (read_only) {
+        if (defrag_keys && it->first.HasAllocated()) {
+          it->first.DefragIfNeeded(visitor);
+        }
+        it->second.DefragIfNeeded(visitor);
+        ++result.attempts;
+        return;
+      }
+      const ssize_t orig_val_size = it->second.MallocUsed();
+      const bool did_val = it->second.DefragIfNeeded(visitor);
+      bool did_key = false;
+      ssize_t orig_key_size = 0;
+      if (defrag_keys && it->first.HasAllocated()) {
+        orig_key_size = it->first.MallocUsed();
+        did_key = it->first.DefragIfNeeded(visitor);
+        if (did_key) {
+          if (const ssize_t delta = it->first.MallocUsed() - orig_key_size; delta != 0) {
+            db_table->stats.AddTypeMemoryUsage(OBJ_KEY, delta);
+          }
+        }
+      }
+      ++result.attempts;
+      if (did_val || did_key) {
+        ++result.reallocations;
+        if (did_val) {
+          result.bytes_moved += static_cast<uint64_t>(orig_val_size);
+          ++result.val_reallocations;
+          if (const ssize_t delta = it->second.MallocUsed() - orig_val_size; delta != 0) {
+            db_table->stats.AddTypeMemoryUsage(it->second.ObjType(), delta);
+          }
+        }
+        if (did_key) {
+          result.bytes_moved += static_cast<uint64_t>(orig_key_size);
+          ++result.key_reallocations;
+        }
+      }
+    });
+    ++traverse_calls;
+
+    quota_depleted = visitor->QuotaDepleted();
+    should_stop = visitor->ShouldStop();
+    cursor_done = !cur;
+    namespaces_null = !namespaces;
+  } while (!quota_depleted && !should_stop && !cursor_done && !namespaces_null);
+
+  const double elapsed_ms = static_cast<double>(absl::GetCurrentTimeNanos() - start_ns) / 1e6;
+  LOG(INFO) << absl::StrFormat(
+      "defrag[Slice] dbid=%zu cursor=%llu->%llu traverses=%llu attempts=%llu "
+      "reallocs=%llu(keys=%llu vals=%llu) bytes_moved=%.2fMiB "
+      "took=%.1fms exit{quota=%d stop=%d cursor_done=%d ns_null=%d}",
+      dbid_before, cursor_before, cur.token(), traverse_calls, result.attempts,
+      result.reallocations, result.key_reallocations, result.val_reallocations,
+      static_cast<double>(result.bytes_moved) / (1024.0 * 1024.0), elapsed_ms, quota_depleted,
+      should_stop, cursor_done, namespaces_null);
+
+  *cursor = cur.token();
+  if (*cursor == kCursorDoneState) {
+    ++*dbid;
+  }
+  return result;
+}
+
+// Hinted variant: visits only the buckets in `hints` for the current dbid,
+// resuming from `*cursor_idx` so quota-bounded calls can drain a large hint
+// set across multiple invocations. Each hint is replayed via a single
+// Traverse(Cursor{hint}, cb) call which the underlying DashTable resolves
+// to that one logical bucket (see core/dash.h). On quota exhaustion we
+// persist the next index back into *cursor_idx; once we reach hints.size()
+// the slice is "finished".
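+//
+// Hypothetical caller sketch (locals are illustrative, not part of this
+// patch): drain one db's hint set across repeated quota-bounded calls:
+//
+//   size_t idx = 0;  // persisted between idle-task invocations
+//   DbSliceResult r;
+//   do {
+//     PageUsage visitor{CollectPageStats::NO, threshold,
+//                       CycleQuota{CycleQuota::kDefaultDefragQuota}};
+//     r = RunPrimeTableHinted(&slice, dbid, hints, &idx, &visitor);
+//   } while (!r.finished_all_dbs);  // yield to the proactor between calls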
+DbSliceResult RunPrimeTableHinted(DbSlice* slice, size_t dbid, const std::vector<uint64_t>& hints,
+                                  size_t* cursor_idx, PageUsage* visitor) {
+  if (!slice->IsDbValid(dbid)) {
+    return DbSliceResult{.finished_all_dbs = true};
+  }
+
+  auto* prime_table = slice->GetTables(dbid);
+  const DbTable* db_table = slice->GetDBTable(dbid);
+
+  DbSliceResult result;
+  const uint64_t start_ns = absl::GetCurrentTimeNanos();
+  bool quota_depleted = false;
+  bool should_stop = false;
+  bool namespaces_null = false;
+  const bool defrag_keys = visitor->ShouldDefragKeys();
+
+  const size_t start_idx = cursor_idx ? *cursor_idx : 0;
+  size_t i = start_idx;
+
+  for (; i < hints.size(); ++i) {
+    const uint64_t h = hints[i];
+    PrimeTable::Cursor cur{h};
+    prime_table->Traverse(cur, [&](PrimeIterator it) {
+      const ssize_t orig_val_size = it->second.MallocUsed();
+      const bool did_val = it->second.DefragIfNeeded(visitor);
+      bool did_key = false;
+      ssize_t orig_key_size = 0;
+      if (defrag_keys && it->first.HasAllocated()) {
+        orig_key_size = it->first.MallocUsed();
+        did_key = it->first.DefragIfNeeded(visitor);
+        if (did_key) {
+          if (const ssize_t delta = it->first.MallocUsed() - orig_key_size; delta != 0) {
+            db_table->stats.AddTypeMemoryUsage(OBJ_KEY, delta);
+          }
+        }
+      }
+      ++result.attempts;
+      if (did_val || did_key) {
+        ++result.reallocations;
+        if (did_val) {
+          result.bytes_moved += static_cast<uint64_t>(orig_val_size);
+          ++result.val_reallocations;
+          if (const ssize_t delta = it->second.MallocUsed() - orig_val_size; delta != 0) {
+            db_table->stats.AddTypeMemoryUsage(it->second.ObjType(), delta);
+          }
+        }
+        if (did_key) {
+          result.bytes_moved += static_cast<uint64_t>(orig_key_size);
+          ++result.key_reallocations;
+        }
+      }
+    });
+
+    quota_depleted = visitor->QuotaDepleted();
+    should_stop = visitor->ShouldStop();
+    namespaces_null = !namespaces;
+    if (quota_depleted || should_stop || namespaces_null) {
+      ++i;  // we finished bucket i; resume from i+1 next call
+      break;
+    }
+  }
+
+  if (cursor_idx) {
+    *cursor_idx = i;
+  }
+
+  const size_t hints_visited = i - start_idx;
+  const double elapsed_ms = static_cast<double>(absl::GetCurrentTimeNanos() - start_ns) / 1e6;
+  LOG(INFO) << absl::StrFormat(
+      "defrag[Hinted] dbid=%zu hints_this_call=%zu pos=%zu/%zu attempts=%llu "
+      "reallocs=%llu(keys=%llu vals=%llu) bytes_moved=%.2fMiB "
+      "took=%.1fms exit{quota=%d stop=%d ns_null=%d}",
+      dbid, hints_visited, i, hints.size(), result.attempts, result.reallocations,
+      result.key_reallocations, result.val_reallocations,
+      static_cast<double>(result.bytes_moved) / (1024.0 * 1024.0), elapsed_ms, quota_depleted,
+      should_stop, namespaces_null);
+
+  result.finished_all_dbs = !quota_depleted && !should_stop && i == hints.size();
+  return result;
+}
+
 bool HasContendedLocks(ShardId shard_id, Transaction* trx, const DbTable* table) {
   auto is_contended = [table](LockFp fp) { return table->trans_locks.Find(fp)->IsContended(); };
@@ -259,17 +468,14 @@ EngineShard::Stats& EngineShard::Stats::operator+=(const Stats& o) {
   return *this;
 }
 
-void EngineShard::DefragTaskState::UpdateScanState(uint64_t cursor_val) {
-  cursor = cursor_val;
-  // Once we're done with a db, jump to the next
-  if (cursor == kCursorDoneState) {
-    dbid++;
-  }
-}
-
-void EngineShard::DefragTaskState::ResetScanState() {
-  dbid = cursor = 0u;
-}
+enum class DefragSkipReason : uint8_t {
+  MemoryTooLow,
+  MemoryBelowThreshold,
+  CheckWithinInterval,
+  NotEnoughFragmentation,
+  CheckInProgress,
+  NotSkipped,
+};
 
 // This function checks 3 things:
 // 1. Don't try memory fragmentation if we don't use "enough" memory (control by
@@ -277,10 +483,12 @@ void EngineShard::DefragTaskState::ResetScanState() {
 // 2. We have memory blocks that can be better utilized (there is a "wasted memory" in them).
 // 3. in case the above is OK, make sure that we have a "gap" between usage and commited memory
 // (control by mem_defrag_waste_threshold flag)
-EngineShard::DefragTaskState::SkipReason EngineShard::DefragTaskState::CheckRequired() {
-  using enum SkipReason;
-  if (cursor > kCursorDoneState) {
-    VLOG(2) << "cursor: " << cursor;
+DefragSkipReason ShouldStartDefrag(DefragTaskState* state) {
+  using enum DefragSkipReason;
+  // Mid-cycle: keep going regardless of memory / interval gates. Legacy uses
+  // cursor > 0; phased uses phase != IDLE. Either signals "in progress".
+  if (state->cursor > kCursorDoneState || state->phase != DefragPhase::IDLE) {
+    VLOG(2) << "cursor: " << state->cursor << " phase: " << static_cast<int>(state->phase);
     return NotSkipped;
   }
 
@@ -302,7 +510,7 @@ EngineShard::DefragTaskState::SkipReason EngineShard::DefragTaskState::CheckRequ
   if (finfo.bin == 0) {
     // did not start the iterative checking yet
     const auto now = time(nullptr);
-    const auto seconds_from_prev_check = now - last_check_time;
+    const auto seconds_from_prev_check = now - state->last_check_time;
    const auto mem_defrag_interval = GetFlag(FLAGS_mem_defrag_check_sec_interval);
 
     if (seconds_from_prev_check < mem_defrag_interval) {
@@ -312,17 +520,17 @@ EngineShard::DefragTaskState::SkipReason EngineShard::DefragTaskState::CheckRequ
     // start checking.
     finfo.committed = finfo.committed_golden = 0;
     finfo.wasted = 0;
-    page_utilization_threshold = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
+    state->page_utilization_threshold = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
   }
 
   uint64_t start = absl::GetCurrentTimeNanos();
-  int res = zmalloc_get_allocator_fragmentation_step(page_utilization_threshold, &finfo);
+  int res = zmalloc_get_allocator_fragmentation_step(state->page_utilization_threshold, &finfo);
   uint64_t duration = absl::GetCurrentTimeNanos() - start;
   VLOG(1) << "Reading memory usage took " << duration << " ns on bin " << finfo.bin - 1;
 
   if (res == 0) {
     // finished checking.
-    last_check_time = time(nullptr);
+    state->last_check_time = time(nullptr);
 
     if (finfo.committed != finfo.committed_golden) {
       LOG_FIRST_N(ERROR, 100) << "committed memory computed incorrectly: " << finfo.committed
@@ -339,7 +547,7 @@ EngineShard::DefragTaskState::SkipReason EngineShard::DefragTaskState::CheckRequ
   return CheckInProgress;
 }
 
-std::optional<CollectedPageStats> EngineShard::DoDefrag(PageUsage* page_usage) {
+DefragShardReport EngineShard::DoDefrag(PageUsage* page_usage) {
   // --------------------------------------------------------------------------
   // NOTE: This task is running with exclusive access to the shard.
   // i.e. - Since we are using shared nothing access here, and all access
@@ -350,43 +558,88 @@ std::optional<CollectedPageStats> EngineShard::DoDefrag(PageUsage* page_usage) {
 
   // TODO: enable tiered storage on non-default db slice
   DbSlice& slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_->shard_id());
 
-  // If we moved to an invalid db, skip as long as it's not the last one
-  while (!slice.IsDbValid(defrag_state_.dbid) && defrag_state_.dbid + 1 < slice.db_array_size())
-    defrag_state_.dbid++;
+  const uint64_t start_ns = absl::GetCurrentTimeNanos();
+  const DefragPhase phase_start = defrag_state_.phase;
 
-  // If we found no valid db, we finished traversing and start from scratch next time
-  if (!slice.IsDbValid(defrag_state_.dbid)) {
-    defrag_state_.ResetScanState();
-    return std::nullopt;
-  }
+  // Re-arm page_usage's quota so it counts only DoDefrag's actual work. Without
+  // this, expensive HLL/hdr_histogram allocation in PageUsage's constructor
+  // (in the caller) is charged against the defrag budget.
+  page_usage->ArmQuotaTimer();
 
-  DCHECK(slice.IsDbValid(defrag_state_.dbid));
-  auto* prime_table = slice.GetTables(defrag_state_.dbid);
-  PrimeTable::Cursor cur{defrag_state_.cursor};
-  uint64_t reallocations = 0;
-  uint64_t attempts = 0;
+  if (GetFlag(FLAGS_experimental_defrag)) {
+    LOG(INFO) << absl::StrFormat("defrag[DoDefrag] shard=%u enter phase=%s threshold=%.2f",
+                                 shard_id_, PhaseName(phase_start), page_usage->threshold());
 
-  DbTable* db_table = slice.GetDBTable(defrag_state_.dbid);
-  do {
-    cur = prime_table->Traverse(cur, [&](PrimeIterator it) {
-      // for each value check whether we should move it because it
-      // seats on underutilized page of memory, and if so, do it.
-      const ssize_t original_size = it->second.MallocUsed();
-      const bool did = it->second.DefragIfNeeded(page_usage);
-      attempts++;
-      if (did) {
-        reallocations++;
-        if (const ssize_t delta = it->second.MallocUsed() - original_size; delta != 0) {
-          db_table->stats.AddTypeMemoryUsage(it->second.ObjType(), delta);
-        }
+    // Picked up at IDLE so a new cycle reflects flag changes; mid-cycle changes
+    // are deferred until the next cycle so CENSUS/EVACUATE see consistent state.
+    if (defrag_state_.phase == DefragPhase::IDLE) {
+      defrag_state_.per_block_move_cost_bytes = GetFlag(FLAGS_defrag_per_block_move_cost_bytes);
+    }
+
+    auto walker = [&](PageUsage* visitor, const std::vector<uint64_t>* hints, size_t* hint_cursor) {
+      if (hints != nullptr) {
+        return RunPrimeTableHinted(&slice, defrag_state_.dbid, *hints, hint_cursor, visitor);
       }
-    });
-  } while (!page_usage->QuotaDepleted() && cur && namespaces);
+      return RunPrimeTableSlice(&slice, &defrag_state_.dbid, &defrag_state_.cursor, visitor);
+    };
+    RunPhaseDefrag(&defrag_state_, page_usage->threshold(),
+                   CycleQuota{CycleQuota::kDefaultDefragQuota}, walker);
+
+    page_usage->ExtendQuota(50);
+    shard_search_indices_->Defragment(page_usage);
+
+    stats_.defrag_task_invocation_total++;
+
+    DefragShardReport report;
+    report.summary.phase_start = phase_start;
+    report.summary.phase_end = defrag_state_.phase;
+    report.summary.duration_us = (absl::GetCurrentTimeNanos() - start_ns) / 1000;
+    // TODO(defrag): quota_depleted here reflects page_usage's quota, which arms
+    // when page_usage is constructed (in the caller, before we entered DoDefrag)
+    // and only gates the trailing search-index defrag. RunPhaseDefrag uses its
+    // own internal CycleQuota that's not surfaced. Empirically on an empty DB
+    // we observe quota_depleted=true while cycle_finished=false, which suggests
+    // the phased CycleQuota is exhausted inside individual step walks (150us is
+    // tight) and the cycle never reaches VERIFY in a single invocation. Add
+    // per-phase logging and consider plumbing the phased quota's depletion bit
+    // into the summary so we can tell which quota actually stopped us.
+    report.summary.quota_depleted = page_usage->QuotaDepleted();
+    report.summary.finished_all_dbs = defrag_state_.cycle_stats.cycle_finished;
+    report.cycle_stats = defrag_state_.cycle_stats;
+    report.page_usage_stats = page_usage->CollectedStats();
+    report.work_pending = defrag_state_.phase != DefragPhase::IDLE;
+
+    LOG(INFO) << absl::StrFormat(
+        "defrag[DoDefrag] shard=%u exit phase=%s->%s duration=%lluus quota_depleted=%d "
+        "work_pending=%d cycle_finished=%d",
+        shard_id_, PhaseName(report.summary.phase_start), PhaseName(report.summary.phase_end),
+        report.summary.duration_us, report.summary.quota_depleted, report.work_pending,
+        report.summary.finished_all_dbs);
+    return report;
+  }
+
+  const DbSliceResult slice_result =
+      RunPrimeTableSlice(&slice, &defrag_state_.dbid, &defrag_state_.cursor, page_usage);
+  const uint64_t attempts = slice_result.attempts;
+  uint64_t reallocations = slice_result.reallocations;
+  const bool finished_all_dbs = slice_result.finished_all_dbs;
+
+  DefragShardReport report;
+  report.summary.phase_start = phase_start;  // legacy path: always IDLE
+  report.summary.phase_end = DefragPhase::IDLE;
+
+  if (finished_all_dbs) {
+    defrag_state_.ResetScanState();
+    report.summary.duration_us = (absl::GetCurrentTimeNanos() - start_ns) / 1000;
+    report.summary.quota_depleted = page_usage->QuotaDepleted();
+    report.summary.finished_all_dbs = true;
+    report.work_pending = false;
+    return report;
+  }
+
   const uint64_t used_cycles = page_usage->UsedQuotaCycles();
   const uint64_t usec = base::CycleClock::ToUsec(used_cycles);
 
-  defrag_state_.UpdateScanState(cur.token());
-
   page_usage->ExtendQuota(50);
   const auto [quota_depleted, objects_moved] = shard_search_indices_->Defragment(page_usage);
   reallocations += objects_moved;
@@ -409,7 +662,7 @@
                 slice.shard_id(), used_cycles, usec, cursor_state);
   }
 
-  return page_usage->CollectedStats();
+  report.page_usage_stats = page_usage->CollectedStats();
+  report.summary.duration_us = (absl::GetCurrentTimeNanos() - start_ns) / 1000;
+  report.summary.quota_depleted = page_usage->QuotaDepleted();
+  report.summary.finished_all_dbs = false;
+  report.work_pending = true;  // !finished_all_dbs path
+  return report;
 }
 
 // the memory defragmentation task is as follow:
@@ -421,21 +679,20 @@
 // priority.
 // otherwise lower the task priority so that it would not use the CPU when not required
 uint32_t EngineShard::DefragTask() {
-  using enum DefragTaskState::SkipReason;
+  using enum DefragSkipReason;
   constexpr uint32_t kRunAtLowPriority = 0u;
 
   if (!namespaces) {
     return kRunAtLowPriority;
   }
 
-  if (auto check_result = defrag_state_.CheckRequired(); check_result == NotSkipped) {
+  if (auto check_result = ShouldStartDefrag(&defrag_state_); check_result == NotSkipped) {
    VLOG(2) << shard_id_ << ": need to run defrag memory cursor state: " << defrag_state_.cursor;
     static const float threshold = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
     // TODO (abhijat): implement move ctor for PageUsage so this object can be moved into the task.
    PageUsage page_usage{CollectPageStats::NO, threshold,
                         CycleQuota{CycleQuota::kDefaultDefragQuota}};
-    if (DoDefrag(&page_usage)) {
-      // we didn't finish the scan
+    if (DoDefrag(&page_usage).work_pending) {
       return ProactorBase::kOnIdleMaxLevel;
     }
   } else {
@@ -474,6 +731,7 @@ EngineShard::EngineShard(util::ProactorBase* pb, mi_heap_t* heap)
       queue2_(kQueueLen / 2, 2, 2),
       shard_id_(pb->GetPoolIndex()),
       mi_resource_(heap) {
+  defrag_state_.shard_id = shard_id_;
   queue_.Start(absl::StrCat("shard_queue_", shard_id()));
   queue2_.Start(absl::StrCat("l2_queue_", shard_id()));
 }
@@ -537,7 +795,9 @@ void EngineShard::StartPeriodicHeartbeatFiber(util::ProactorBase* pb) {
   fiber_heartbeat_periodic_ = fb2::Fiber(fb_opts, [this, period_ms, heartbeat]() mutable {
     RunFPeriodically(heartbeat, period_ms, "heartbeat", &fiber_heartbeat_periodic_done_);
   });
-  defrag_task_id_ = pb->AddOnIdleTask([this]() { return DefragTask(); });
+  if (absl::GetFlag(FLAGS_enable_bg_defrag)) {
+    defrag_task_id_ = pb->AddOnIdleTask([this]() { return DefragTask(); });
+  }
 }
 
@@ -568,6 +828,14 @@ void EngineShard::InitThreadLocal(ProactorBase* pb) {
   SmallString::InitThreadLocal(data_heap);
   InitTLStatelessAllocMR(shard_->memory_resource());
 
+  // Register the mimalloc underutil callback once (process-wide); each shard
+  // thread's tracker storage is thread_local, so the single callback dispatches
+  // naturally to per-shard sets. Threshold is the same value defrag uses to
+  // classify under-utilized pages.
+  defrag_underutil::InitOnce();
+  const float thr = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
+  defrag_underutil::SetThresholdPct(static_cast<uint8_t>(std::clamp(thr * 100.0f, 0.0f, 100.0f)));
+
   shard_->shard_search_indices_ = std::make_unique<ShardDocIndices>();
 }
 
@@ -775,7 +1043,7 @@ void EngineShard::Heartbeat() {
     stalled_start_ns_ = 0;
 
   thread_local bool check_huffman = (shard_id_ == 0);  // run it only on shard 0.
-  if (check_huffman) {
+  if (check_huffman && !absl::GetFlag(FLAGS_disable_huffman_check)) {
     auto* ptr = db_slice.GetDBTable(0);
     if (ptr) {
       size_t key_usage = ptr->stats.memory_usage_by_type[OBJ_KEY];
diff --git a/src/server/engine_shard.h b/src/server/engine_shard.h
index 513fd8b388e8..08322608ba04 100644
--- a/src/server/engine_shard.h
+++ b/src/server/engine_shard.h
@@ -10,6 +10,7 @@
 #include "core/task_queue.h"
 #include "core/tx_queue.h"
 #include "server/common_types.h"
+#include "server/defrag.h"
 #include "util/sliding_counter.h"
 
 typedef char* sds;
@@ -209,8 +210,7 @@
   void FinalizeMulti(Transaction* tx);
 
   // Scan the shard with the cursor and apply defragmentation for database entries.
-  // Returns collected page stats if defragmentation was performed.
-  std::optional<CollectedPageStats> DoDefrag(PageUsage* page_usage);
+  DefragShardReport DoDefrag(PageUsage* page_usage);
 
   uint64_t GetDefragCursor() const {
     return defrag_state_.cursor;
@@ -220,29 +220,6 @@
   size_t CompactTable(double threshold, DbIndex db_idx);
 
  private:
-  struct DefragTaskState {
-    size_t dbid = 0u;
-    uint64_t cursor = 0u;
-    time_t last_check_time = 0;
-    float page_utilization_threshold = 0.8;
-
-    enum class SkipReason : uint8_t {
-      MemoryTooLow,
-      MemoryBelowThreshold,
-      CheckWithinInterval,
-      NotEnoughFragmentation,
-      CheckInProgress,
-      NotSkipped,
-    };
-
-    // check the current threshold and return a reason if we skip the defragmentation
-    SkipReason CheckRequired();
-
-    void UpdateScanState(uint64_t cursor_val);
-
-    void ResetScanState();
-  };
-
   struct EvictionTaskState {
     void Reset(bool rss_eviction_enabled_flag) {
       rss_eviction_enabled = rss_eviction_enabled_flag;
diff --git a/src/server/memory_cmd.cc b/src/server/memory_cmd.cc
index 8ec9c0c4a6ae..56d0e97744fc 100644
--- a/src/server/memory_cmd.cc
+++ b/src/server/memory_cmd.cc
@@ -282,19 +282,20 @@ void MemoryCmd::Run(CmdArgList args) {
     static const float default_threshold =
         absl::GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
     const float threshold = parser.NextOrDefault(default_threshold);
+    if (parser.HasError()) {
+      return cmd_cntx_->SendError(parser.TakeError().MakeReply());
+    }
 
-    std::vector<CollectedPageStats> results(shard_set->size());
+    std::vector<DefragShardReport> results(shard_set->size());
     shard_set->pool()->AwaitFiberOnAll([threshold, &results](util::ProactorBase*) {
       if (auto* shard = EngineShard::tlocal(); shard) {
         PageUsage page_usage{CollectPageStats::YES, threshold,
                              CycleQuota{CycleQuota::kDefaultDefragQuota}};
-        if (auto shard_res = shard->DoDefrag(&page_usage); shard_res.has_value()) {
-          results[shard->shard_id()] = std::move(shard_res.value());
-        }
+        results[shard->shard_id()] = shard->DoDefrag(&page_usage);
       }
     });
 
-    const CollectedPageStats merged = CollectedPageStats::Merge(std::move(results), threshold);
+    const DefragMergedReport merged = DefragMergedReport::Merge(std::move(results));
     auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx_->rb());
     return rb->SendVerbatimString(merged.ToString());
   }
diff --git a/tools/defrag_baseline.py b/tools/defrag_baseline.py
new file mode 100755
index 000000000000..d75713bf442b
--- /dev/null
+++ b/tools/defrag_baseline.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python3
+"""Create an uneven memory-fragmentation baseline for defrag experiments.
+
+This script only creates and deletes keys. It does not run MEMORY DEFRAGMENT and
+does not wait for background defrag counters.
+"""
+
+import argparse
+import asyncio
+import contextlib
+import os
+import random
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+
+import aioredis
+
+
+@dataclass(frozen=True)
+class WideBand:
+    name: str
+    value_size: int
+    byte_share: float
+    profile: str
+
+
+WIDE_BANDS = [
+    WideBand("tiny", 64, 0.02, "0.99:6,0.90:3,0.60:1"),
+    WideBand("small", 128, 0.04, "0.98:4,0.80:3,0.35:2,0.08:1"),
+    WideBand("med1", 256, 0.10, "0.98:3,0.65:2,0.25:3,0.04:2"),
+    WideBand("med2", 512, 0.22, "0.95:3,0.55:3,0.15:3,0.03:1"),
+    WideBand("large1", 1024, 0.28, "0.95:3,0.60:3,0.20:3,0.05:1"),
+    WideBand("large2", 2048, 0.24, "0.90:4,0.45:3,0.12:2,0.03:1"),
+    WideBand("huge", 4096, 0.10, "0.85:5,0.35:3,0.08:2"),
+]
+
+
+def parse_profile(profile: str) -> list[tuple[float, int]]:
+    """Parse "live_ratio:weight,..."
into weighted live-ratio entries.""" + entries = [] + for raw_part in profile.split(","): + part = raw_part.strip() + if not part: + continue + ratio_text, weight_text = part.split(":", 1) + ratio = float(ratio_text) + weight = int(weight_text) + if not 0.0 <= ratio <= 1.0: + raise ValueError(f"live ratio must be in [0, 1]: {ratio}") + if weight <= 0: + raise ValueError(f"profile weight must be positive: {weight}") + entries.append((ratio, weight)) + + if not entries: + raise ValueError("profile must contain at least one live_ratio:weight entry") + return entries + + +def make_ratio_deck(profile: list[tuple[float, int]], rng: random.Random) -> list[float]: + deck = [] + for live_ratio, weight in profile: + deck.extend([live_ratio] * weight) + rng.shuffle(deck) + return deck + + +def key_name(prefix: str, key_id: int) -> str: + return f"{prefix}:{key_id}" + + +def parse_arena_summary(report: str) -> tuple[str | None, list[tuple[int, int, int, int, float]]]: + """Return the machine-wide Total line and top-level bin rows from MEMORY ARENA SUMMARY.""" + lines = report.splitlines() + machine_start = 0 + for index, line in enumerate(lines): + if "Arena statistics for machine" in line: + machine_start = index + + machine_lines = lines[machine_start:] + + total_line = None + rows = [] + for line in machine_lines: + parts = line.replace("%", "").split() + if not parts: + continue + if parts[0] == "Total:" and len(parts) >= 6: + total_line = line + continue + if len(parts) != 6 or not parts[0].isdigit(): + continue + + block_size, _reserved, committed, used, wasted, waste_pct = parts + rows.append((int(wasted), int(block_size), int(committed), int(used), float(waste_pct))) + + rows.sort(reverse=True) + return total_line, rows + + +def print_arena_summary(report: str, top_bins: int) -> None: + total_line, rows = parse_arena_summary(report) + + if total_line: + parts = total_line.replace("%", "").split() + print( + "arena_total " + f"reserved={int(parts[1]):,} " + f"committed={int(parts[2]):,} " + f"used={int(parts[3]):,} " + f"wasted={int(parts[4]):,} " + f"waste_pct={float(parts[5]):.2f}%" + ) + else: + print("arena_total unavailable") + + if top_bins <= 0: + return + + print("top_waste_bins:") + for wasted, block_size, committed, used, waste_pct in rows[:top_bins]: + print( + f" block={block_size} committed={committed:,} used={used:,} " + f"wasted={wasted:,} waste_pct={waste_pct:.2f}%" + ) + + +def format_bytes(size: int) -> str: + value = float(size) + for unit in ("B", "KiB", "MiB", "GiB", "TiB"): + if value < 1024.0 or unit == "TiB": + if unit == "B": + return f"{int(value)}B" + return f"{value:.1f}{unit}" + value /= 1024.0 + + raise AssertionError("unreachable") + + +def format_bytes_pair(size: int) -> str: + return f"{size:,} ({format_bytes(size)})" + + +async def snapshot( + connection: aioredis.Redis, label: str, include_arena: bool, top_bins: int +) -> None: + print(f"\n=== {label} ===") + + info = await connection.execute_command("INFO", "memory") + print(f"used_memory={int(info.get('used_memory', 0)):,}") + print(f"used_memory_rss={int(info.get('used_memory_rss', 0)):,}") + print(f"object_used_memory={int(info.get('object_used_memory', 0)):,}") + print(f"table_used_memory={int(info.get('table_used_memory', 0)):,}") + + if include_arena: + report = await connection.execute_command("MEMORY", "ARENA", "SUMMARY") + print_arena_summary(report, top_bins) + + +async def populate( + connection: aioredis.Redis, keys_count: int, prefix: str, value_size: int +) -> None: + print(f"creating 
{keys_count:,} keys prefix={prefix!r} value_size={value_size:,}") + await connection.execute_command("DEBUG", "POPULATE", keys_count, prefix, value_size) + + +async def flushdb(connection: aioredis.Redis) -> None: + print("flushing current database") + await connection.execute_command("FLUSHDB") + + +async def delete_batches(connection: aioredis.Redis, keys: list[str], batch_size: int) -> int: + deleted = 0 + for start in range(0, len(keys), batch_size): + deleted += await connection.delete(*keys[start : start + batch_size]) + return deleted + + +async def delete_fragmented_chunks( + connection: aioredis.Redis, + *, + rng: random.Random, + prefix: str, + keys_count: int, + value_size: int, + chunk_keys: int, + profile_text: str, + delete_batch: int, + snapshot_every_chunks: int, + snapshot_prefix: str, + include_arena: bool, + top_arena_bins: int, +) -> dict: + profile = parse_profile(profile_text) + deck = make_ratio_deck(profile, rng) + + chunks_by_ratio = Counter() + deleted_by_ratio = defaultdict(int) + live_by_ratio = defaultdict(int) + total_deleted = 0 + chunks = 0 + + for chunk_start in range(0, keys_count, chunk_keys): + chunk_end = min(keys_count, chunk_start + chunk_keys) + chunk_size = chunk_end - chunk_start + + if chunks and chunks % len(deck) == 0: + deck = make_ratio_deck(profile, rng) + + live_ratio = deck[chunks % len(deck)] + live_count = round(chunk_size * live_ratio) + delete_count = chunk_size - live_count + + ids = list(range(chunk_start, chunk_end)) + delete_ids = rng.sample(ids, delete_count) + delete_keys = [key_name(prefix, key_id) for key_id in delete_ids] + + deleted = await delete_batches(connection, delete_keys, delete_batch) + live = chunk_size - deleted + + chunks_by_ratio[live_ratio] += 1 + deleted_by_ratio[live_ratio] += deleted + live_by_ratio[live_ratio] += live + total_deleted += deleted + chunks += 1 + + if snapshot_every_chunks and chunks % snapshot_every_chunks == 0: + await snapshot( + connection, + f"{snapshot_prefix}_after_delete_chunk_{chunks}", + include_arena, + top_arena_bins, + ) + + return { + "chunks": chunks, + "chunk_keys": chunk_keys, + "profile": profile_text, + "created_keys": keys_count, + "deleted_keys": total_deleted, + "live_keys": keys_count - total_deleted, + "value_size": value_size, + "chunks_by_ratio": chunks_by_ratio, + "deleted_by_ratio": deleted_by_ratio, + "live_by_ratio": live_by_ratio, + } + + +def print_fragmentation_summary(summary: dict, *, label: str, seed: int) -> None: + print("\n=== planned_fragmentation ===") + print(f"label={label}") + print(f"chunks={summary['chunks']:,}") + print(f"chunk_keys={summary['chunk_keys']:,}") + print(f"seed={seed}") + print(f"profile={summary['profile']}") + print(f"value_size={summary['value_size']:,}") + print(f"created_keys={summary['created_keys']:,}") + print(f"deleted_keys={summary['deleted_keys']:,}") + print(f"live_keys={summary['live_keys']:,}") + print( + "estimated_created_value_bytes=" + f"{format_bytes_pair(summary['created_keys'] * summary['value_size'])}" + ) + print( + "estimated_deleted_value_bytes=" + f"{format_bytes_pair(summary['deleted_keys'] * summary['value_size'])}" + ) + print( + "estimated_live_value_bytes=" + f"{format_bytes_pair(summary['live_keys'] * summary['value_size'])}" + ) + + print("\nby_live_ratio:") + for ratio in sorted(summary["chunks_by_ratio"].keys(), reverse=True): + print( + f" live_ratio={ratio:.2f} chunks={summary['chunks_by_ratio'][ratio]:,} " + f"live_keys={summary['live_by_ratio'][ratio]:,} " + 
+            f"deleted_keys={summary['deleted_by_ratio'][ratio]:,}"
+        )
+
+
+def print_wide_distribution(
+    summaries: list[tuple[WideBand, dict]], target_value_bytes: int
+) -> None:
+    total_created_keys = sum(summary["created_keys"] for _, summary in summaries)
+    total_live_keys = sum(summary["live_keys"] for _, summary in summaries)
+    total_deleted_keys = sum(summary["deleted_keys"] for _, summary in summaries)
+    total_created_bytes = sum(
+        summary["created_keys"] * summary["value_size"] for _, summary in summaries
+    )
+    total_live_bytes = sum(summary["live_keys"] * summary["value_size"] for _, summary in summaries)
+    total_deleted_bytes = sum(
+        summary["deleted_keys"] * summary["value_size"] for _, summary in summaries
+    )
+
+    print("\n=== wide_object_distribution ===")
+    print(f"target_value_bytes={format_bytes_pair(target_value_bytes)}")
+    print(f"created_value_bytes={format_bytes_pair(total_created_bytes)}")
+    print(f"live_value_bytes={format_bytes_pair(total_live_bytes)}")
+    print(f"deleted_value_bytes={format_bytes_pair(total_deleted_bytes)}")
+    print()
+
+    print(
+        f"{'band':<8} {'size':>6} {'share':>7} {'keys_created':>13} "
+        f"{'keys_live':>12} {'keys_deleted':>13} {'bytes_created':>13} "
+        f"{'bytes_live':>10} {'bytes_deleted':>13} {'live%':>7} {'chunks':>7}"
+    )
+
+    for band, summary in summaries:
+        created_keys = summary["created_keys"]
+        live_keys = summary["live_keys"]
+        deleted_keys = summary["deleted_keys"]
+        value_size = summary["value_size"]
+        created_bytes = created_keys * value_size
+        live_bytes = live_keys * value_size
+        deleted_bytes = deleted_keys * value_size
+        live_ratio = live_keys / created_keys if created_keys else 0.0
+
+        print(
+            f"{band.name:<8} {value_size:>6,} {band.byte_share:>7.1%} "
+            f"{created_keys:>13,} {live_keys:>12,} {deleted_keys:>13,} "
+            f"{format_bytes(created_bytes):>13} {format_bytes(live_bytes):>10} "
+            f"{format_bytes(deleted_bytes):>13} {live_ratio:>7.1%} "
+            f"{summary['chunks']:>7,}"
+        )
+
+    total_live_ratio = total_live_keys / total_created_keys if total_created_keys else 0.0
+    print(
+        f"{'total':<8} {'-':>6} {'100.0%':>7} {total_created_keys:>13,} "
+        f"{total_live_keys:>12,} {total_deleted_keys:>13,} "
+        f"{format_bytes(total_created_bytes):>13} {format_bytes(total_live_bytes):>10} "
+        f"{format_bytes(total_deleted_bytes):>13} {total_live_ratio:>7.1%} "
+        f"{sum(summary['chunks'] for _, summary in summaries):>7,}"
+    )
+
+
+def print_wide_live_ratio_distribution(summaries: list[tuple[WideBand, dict]]) -> None:
+    print("\n=== wide_live_ratio_distribution ===")
+    print(
+        f"{'band':<8} {'ratio':>7} {'chunks':>8} {'keys_total':>12} "
+        f"{'keys_live':>12} {'keys_deleted':>13} {'bytes_deleted':>14}"
+    )
+
+    for band, summary in summaries:
+        value_size = summary["value_size"]
+        for ratio in sorted(summary["chunks_by_ratio"].keys(), reverse=True):
+            live_keys = summary["live_by_ratio"][ratio]
+            deleted_keys = summary["deleted_by_ratio"][ratio]
+            keys = live_keys + deleted_keys
+            deleted_bytes = deleted_keys * value_size
+
+            print(
+                f"{band.name:<8} {ratio:>7.1%} {summary['chunks_by_ratio'][ratio]:>8,} "
+                f"{keys:>12,} {live_keys:>12,} {deleted_keys:>13,} "
+                f"{format_bytes(deleted_bytes):>14}"
+            )
+
+
+def profile_histogram_counts(summary: dict) -> list[int]:
+    # Buckets are live-ratio ranges: [0.80, 1.00], [0.40, 0.80), [0.10, 0.40), [0.00, 0.10).
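+    # Index order matches the histogram columns print_wide_profile_histogram
+    # renders below: bucket 0 holds the healthiest chunks (>= 80% live),
+    # bucket 3 the most hollowed-out (< 10% live).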
+ buckets = [0, 0, 0, 0] + for ratio, chunks in summary["chunks_by_ratio"].items(): + if ratio >= 0.80: + buckets[0] += chunks + elif ratio >= 0.40: + buckets[1] += chunks + elif ratio >= 0.10: + buckets[2] += chunks + else: + buckets[3] += chunks + return buckets + + +def render_histogram_cell(count: int, total: int, width: int = 12) -> str: + if count == 0 or total == 0: + return "-" + + pct = 100.0 * count / total + bars = max(1, round(width * count / total)) + return f"{'#' * bars} {pct:.0f}%" + + +def print_wide_profile_histogram(summaries: list[tuple[WideBand, dict]]) -> None: + print("\n=== wide_profile_histogram ===") + print( + f"{'band':<8} {'size':>6} {'chunks':>8} {'80-100% live':<18} " + f"{'40-80% live':<18} {'10-40% live':<18} {'0-10% live':<18}" + ) + + totals = [0, 0, 0, 0] + total_chunks = 0 + for band, summary in summaries: + counts = profile_histogram_counts(summary) + chunks = summary["chunks"] + totals = [cur + add for cur, add in zip(totals, counts)] + total_chunks += chunks + + print( + f"{band.name:<8} {band.value_size:>6,} {chunks:>8,} " + f"{render_histogram_cell(counts[0], chunks):<18} " + f"{render_histogram_cell(counts[1], chunks):<18} " + f"{render_histogram_cell(counts[2], chunks):<18} " + f"{render_histogram_cell(counts[3], chunks):<18}" + ) + + print( + f"{'total':<8} {'-':>6} {total_chunks:>8,} " + f"{render_histogram_cell(totals[0], total_chunks):<18} " + f"{render_histogram_cell(totals[1], total_chunks):<18} " + f"{render_histogram_cell(totals[2], total_chunks):<18} " + f"{render_histogram_cell(totals[3], total_chunks):<18}" + ) + + +async def create_uniform_fragmentation( + connection: aioredis.Redis, args: argparse.Namespace +) -> None: + rng = random.Random(args.seed) + + await flushdb(connection) + await snapshot(connection, "before_populate", args.arena, args.top_arena_bins) + await populate(connection, args.keys, args.key_name, args.value_size) + await snapshot(connection, "after_populate", args.arena, args.top_arena_bins) + + summary = await delete_fragmented_chunks( + connection, + rng=rng, + prefix=args.key_name, + keys_count=args.keys, + value_size=args.value_size, + chunk_keys=args.chunk_keys, + profile_text=args.profile, + delete_batch=args.delete_batch, + snapshot_every_chunks=args.snapshot_every_chunks, + snapshot_prefix="uniform", + include_arena=args.arena, + top_arena_bins=args.top_arena_bins, + ) + + print_fragmentation_summary(summary, label="uniform", seed=args.seed) + await snapshot(connection, "after_delete", args.arena, args.top_arena_bins) + + +def wide_chunk_keys(value_size: int) -> int: + return max(32, round((256 * 1024) / value_size)) + + +async def create_wide_fragmentation(connection: aioredis.Redis, args: argparse.Namespace) -> None: + rng = random.Random(args.seed) + target_value_bytes = args.keys * args.value_size + + await flushdb(connection) + await snapshot(connection, "before_populate", args.arena, args.top_arena_bins) + + band_specs = [] + for band in WIDE_BANDS: + keys_count = max(1, round((target_value_bytes * band.byte_share) / band.value_size)) + prefix = f"{args.key_name}:{band.name}" + chunk_keys = wide_chunk_keys(band.value_size) + band_specs.append((band, prefix, keys_count, chunk_keys)) + await populate(connection, keys_count, prefix, band.value_size) + + await snapshot(connection, "after_populate", args.arena, args.top_arena_bins) + + summaries = [] + for band, prefix, keys_count, chunk_keys in band_specs: + summary = await delete_fragmented_chunks( + connection, + rng=rng, + prefix=prefix, + 
keys_count=keys_count, + value_size=band.value_size, + chunk_keys=chunk_keys, + profile_text=band.profile, + delete_batch=args.delete_batch, + snapshot_every_chunks=args.snapshot_every_chunks, + snapshot_prefix=band.name, + include_arena=args.arena, + top_arena_bins=args.top_arena_bins, + ) + summaries.append((band, summary)) + + print("\n=== wide_workload ===") + print(f"seed={args.seed}") + print(f"target_value_bytes={target_value_bytes:,}") + print(f"bands={len(WIDE_BANDS)}") + + created_keys = sum(summary["created_keys"] for _, summary in summaries) + deleted_keys = sum(summary["deleted_keys"] for _, summary in summaries) + live_keys = sum(summary["live_keys"] for _, summary in summaries) + deleted_value_bytes = sum( + summary["deleted_keys"] * summary["value_size"] for _, summary in summaries + ) + live_value_bytes = sum(summary["live_keys"] * summary["value_size"] for _, summary in summaries) + + print(f"created_keys={created_keys:,}") + print(f"deleted_keys={deleted_keys:,}") + print(f"live_keys={live_keys:,}") + print(f"estimated_deleted_value_bytes={format_bytes_pair(deleted_value_bytes)}") + print(f"estimated_live_value_bytes={format_bytes_pair(live_value_bytes)}") + + print_wide_distribution(summaries, target_value_bytes) + print_wide_profile_histogram(summaries) + print_wide_live_ratio_distribution(summaries) + + print("\nby_band:") + for band, summary in summaries: + print( + f" band={band.name} value_size={band.value_size:,} " + f"byte_share={band.byte_share:.2f} chunk_keys={summary['chunk_keys']:,} " + f"profile={band.profile} created={summary['created_keys']:,} " + f"deleted={summary['deleted_keys']:,} live={summary['live_keys']:,}" + ) + + for band, summary in summaries: + print_fragmentation_summary(summary, label=f"wide:{band.name}", seed=args.seed) + + await snapshot(connection, "after_delete", args.arena, args.top_arena_bins) + + +async def create_fragmentation(connection: aioredis.Redis, args: argparse.Namespace) -> None: + if args.workload == "wide": + await create_wide_fragmentation(connection, args) + else: + await create_uniform_fragmentation(connection, args) + + +async def main(args: argparse.Namespace) -> None: + pool = aioredis.ConnectionPool( + host=args.server, + port=args.port, + db=0, + decode_responses=True, + max_connections=args.max_connections, + ) + connection = aioredis.Redis(connection_pool=pool) + if args.quiet: + with open(os.devnull, "w") as sink, contextlib.redirect_stdout(sink): + await create_fragmentation(connection, args) + else: + await create_fragmentation(connection, args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Create an uneven fragmentation baseline.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("-k", "--keys", type=int, default=800_000, help="number of keys to create") + parser.add_argument( + "--mul", + type=float, + default=1.0, + help="scale --keys by this multiplier (e.g. 
--mul 5 -> 5x keys)", + ) + parser.add_argument( + "-v", "--value-size", type=int, default=645, help="value size for DEBUG POPULATE" + ) + parser.add_argument("-n", "--key-name", default="key-for-testing", help="base key name") + parser.add_argument("-s", "--server", default="localhost", help="server host") + parser.add_argument("-p", "--port", type=int, default=6379, help="server port") + parser.add_argument( + "--workload", + choices=["uniform", "wide"], + default="uniform", + help="fragmentation workload preset", + ) + + parser.add_argument("--seed", type=int, default=12345, help="deterministic deletion seed") + parser.add_argument( + "--chunk-keys", + type=int, + default=512, + help="contiguous key-id region size assigned one live ratio", + ) + parser.add_argument("--delete-batch", type=int, default=1000, help="DEL batch size") + parser.add_argument( + "--max-connections", type=int, default=16, help="redis connection pool size" + ) + parser.add_argument( + "--profile", + default="0.95:2,0.80:2,0.60:3,0.30:2,0.10:1", + help="comma-separated live_ratio:weight entries", + ) + parser.add_argument( + "--arena", action="store_true", help="include MEMORY ARENA SUMMARY snapshots" + ) + parser.add_argument( + "--top-arena-bins", + type=int, + default=5, + help="number of machine-wide arena waste bins to print", + ) + parser.add_argument( + "--snapshot-every-chunks", + type=int, + default=0, + help="print snapshots every N deleted chunks; 0 disables intermediate snapshots", + ) + parser.add_argument("--quiet", action="store_true", help="suppress normal progress output") + + args = parser.parse_args() + if args.mul != 1.0: + args.keys = int(args.keys * args.mul) + asyncio.run(main(args)) diff --git a/tools/defrag_compare.py b/tools/defrag_compare.py new file mode 100755 index 000000000000..48b60ae0b820 --- /dev/null +++ b/tools/defrag_compare.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +import argparse +import os + + +def field(row, side, name): + v = row.get(side) + if isinstance(v, dict): + return v.get(name) + return None + + +def default_label(path: str, df) -> str: + labels = df.get("label") + if labels is not None: + non_null = labels.dropna() + if not non_null.empty: + return str(non_null.iloc[0]) + return os.path.splitext(os.path.basename(path))[0] + + +def load_run(path: str, pd): + df = pd.read_json(path, lines=True) + label = default_label(path, df) + df["waste_pct"] = df.apply(lambda r: field(r, "after", "waste_pct"), axis=1) + df = df[["cycle", "waste_pct"]].dropna() + return label, df + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("runs", nargs="+", help="defrag driver JSONL files") + parser.add_argument("--out", default="defrag_compare.png") + parser.add_argument("--title", default="Defrag Fragmentation Progress") + parser.add_argument( + "--guide", + type=float, + action="append", + default=[20.0, 10.0, 5.0], + help="horizontal waste percentage guide line; may be repeated", + ) + args = parser.parse_args() + + import matplotlib.pyplot as plt + import pandas as pd + from matplotlib.ticker import MaxNLocator + + fig, ax = plt.subplots(figsize=(16, 7), constrained_layout=True) + + for path in args.runs: + label, df = load_run(path, pd) + if df.empty: + print(f"skipping empty run: {path}") + continue + markevery = max(1, len(df) // 90) + ax.plot( + df["cycle"], + df["waste_pct"], + linewidth=2.5, + marker="o", + markersize=3, + markevery=markevery, + label=label, + ) + + for guide in args.guide: + ax.axhline(guide, color="gray", linewidth=1, 
alpha=0.25) + ax.text( + 0.995, + guide, + f"{guide:g}%", + transform=ax.get_yaxis_transform(), + ha="right", + va="bottom", + color="gray", + fontsize=9, + ) + + ax.set_title(args.title) + ax.set_xlabel("Cycle") + ax.set_ylabel("Waste %") + ax.grid(True, alpha=0.25) + ax.legend() + ax.margins(x=0.01) + ax.xaxis.set_major_locator(MaxNLocator(nbins=18, integer=True)) + + fig.savefig(args.out, dpi=160) + print(f"wrote {args.out}") + + +if __name__ == "__main__": + main() diff --git a/tools/defrag_drive.py b/tools/defrag_drive.py new file mode 100755 index 000000000000..1b70115d55e4 --- /dev/null +++ b/tools/defrag_drive.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python3 +"""Drive MEMORY DEFRAGMENT in a loop, recording fragmentation and per-shard +phase timings to JSONL for plotting. + +Connects to an already-running dragonfly. Each cycle: + 1. Capture stderr-log byte offset + 2. MEMORY ARENA SUMMARY -> before + 3. MEMORY DEFRAGMENT -> blocks until the slice finishes; capture reply + 4. MEMORY ARENA SUMMARY -> after + 5. Read log delta, parse defrag[*] lines, attribute per shard + 6. Emit one JSONL record + 7. Sleep + +After the final summary, the script runs FLUSHALL to clean up the generated +dataset. + +Run dragonfly with: + ./build-dbg/dragonfly --alsologtostderr ... +This writes glog files into /tmp/ with /tmp/dragonfly.INFO as a stable symlink +to the current INFO log; the script reads from that symlink. + +Plot in pandas with: + pd.read_json("run.jsonl", lines=True) + +Use the required label to keep old/new runs separate: + ./tools/defrag_drive.py old + ./tools/defrag_drive.py new +""" + +import argparse +import asyncio +import json +import os +import re +import time + +import redis.asyncio as aioredis + + +ARENA_TOTAL_RE = re.compile(r"Total:\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([\d.]+)%") + + +def parse_arena_total(report: str) -> dict | None: + """Pull the machine-wide Total: row from MEMORY ARENA SUMMARY.""" + last_total = None + for line in report.splitlines(): + match = ARENA_TOTAL_RE.search(line) + if match: + last_total = match + if not last_total: + return None + reserved, committed, used, wasted, waste_pct = last_total.groups() + return { + "reserved": int(reserved), + "committed": int(committed), + "used": int(used), + "wasted": int(wasted), + "waste_pct": float(waste_pct), + } + + +# defrag[CYCLE_DONE] shard=0 cycle=7 targets_done=12/15 (80.0%) bytes_freed=... +CYCLE_DONE_RE = re.compile( + r"defrag\[CYCLE_DONE\]\s+shard=(\d+)\s+cycle=(\d+)\s+" + r"targets_done=(\d+)/(\d+)\s+\(([\d.]+)%\)\s+" + r"bytes_freed=([\d.]+)(KiB|MiB|GiB|B)/[^\s]+\s+\(([\d.]+)%\)\s+" + r"bytes_moved=([\d.]+)(KiB|MiB|GiB|B)\s+" + r"cycle_took=([\d.]+)ms\s+freed_rate=([\d.]+)MiB/s" +) + +PHASE_RE = re.compile( + r"defrag\[(CENSUS|PLAN|EVACUATE|VERIFY)\]\s+shard=(\d+)\s+cycle=(\d+)\s+" + r".*?\stook=([\d.]+)ms(?:\s+cpu=([\d.]+)ms)?" +) + +DO_DEFRAG_EXIT_RE = re.compile( + r"defrag\[DoDefrag\]\s+shard=(\d+)\s+exit\s+" + r"phase=([A-Z_]+)->([A-Z_]+)\s+" + r"duration=(\d+)us.*?" 
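+ # Illustrative line this pattern is meant to match (constructed from the
+ # regex itself, not copied from a real log; shown wrapped, it is one line):
+ # defrag[DoDefrag] shard=0 exit phase=EVACUATE->VERIFY duration=450us
+ # quota_depleted=0 work_pending=1 cycle_finished=0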
+ r"quota_depleted=(\d+)\s+work_pending=(\d+)\s+cycle_finished=(\d+)" +) + +UNIT = {"B": 1, "KiB": 1024, "MiB": 1024 * 1024, "GiB": 1024 * 1024 * 1024} + + +def record_committed_drop(record: dict) -> int: + before = record.get("before") or {} + after = record.get("after") or {} + if not before or not after: + return 0 + return before.get("committed", 0) - after.get("committed", 0) + + +def record_waste_drop(record: dict) -> float: + before = record.get("before") or {} + after = record.get("after") or {} + before_pct = before.get("waste_pct") + after_pct = after.get("waste_pct") + if before_pct is None or after_pct is None: + return 0.0 + return before_pct - after_pct + + +def record_work_pending(record: dict) -> bool | None: + pending = [s["work_pending"] for s in record.get("shards", []) if "work_pending" in s] + if not pending: + return None + return any(pending) + + +def stall_reason(records: list[dict], args: argparse.Namespace, stall_armed: bool) -> str | None: + if not stall_armed or args.stall_window <= 0 or len(records) < args.stall_window: + return None + if record_work_pending(records[-1]) is not False: + return None + + recent = records[-args.stall_window :] + first_before = recent[0].get("before") or {} + last_after = recent[-1].get("after") or {} + first_waste = first_before.get("waste_pct") + last_waste = last_after.get("waste_pct") + waste_drop = 0.0 + if first_waste is not None and last_waste is not None: + waste_drop = first_waste - last_waste + + committed_drop = sum(record_committed_drop(r) for r in recent) + min_committed_drop = int(args.stall_min_committed_drop_mb * 1024 * 1024) + + if waste_drop < args.stall_min_waste_drop and committed_drop < min_committed_drop: + return ( + f"stalled for {args.stall_window} driver iterations: " + f"waste_drop={waste_drop:.3f}pp " + f"committed_drop={committed_drop:,}B" + ) + + return None + + +def parse_log_delta(text: str) -> dict[int, dict]: + """Extract per-shard summary from a slice of dragonfly stderr.""" + by_shard: dict[int, dict] = {} + + for match in PHASE_RE.finditer(text): + phase, shard_str, cycle_str, took_ms, cpu_ms = match.groups() + shard = int(shard_str) + rec = by_shard.setdefault( + shard, + { + "shard": shard, + "cycle_id": int(cycle_str), + "phase_ms": {}, + "phase_cpu_ms": {}, + }, + ) + rec.setdefault("phase_cpu_ms", {}) + rec["phase_ms"][phase.lower()] = float(took_ms) + if cpu_ms is not None: + rec["phase_cpu_ms"][phase.lower()] = float(cpu_ms) + + for match in CYCLE_DONE_RE.finditer(text): + ( + shard_str, + cycle_str, + done, + total, + _done_pct, + freed_v, + freed_u, + _freed_pct, + moved_v, + moved_u, + cycle_took, + freed_rate, + ) = match.groups() + shard = int(shard_str) + rec = by_shard.setdefault( + shard, {"shard": shard, "cycle_id": int(cycle_str), "phase_ms": {}} + ) + rec.update( + { + "cycle_id": int(cycle_str), + "targets_complete": int(done), + "targets_total": int(total), + "bytes_freed": int(float(freed_v) * UNIT[freed_u]), + "bytes_moved": int(float(moved_v) * UNIT[moved_u]), + "cycle_took_ms": float(cycle_took), + "freed_rate_mibs": float(freed_rate), + } + ) + + for match in DO_DEFRAG_EXIT_RE.finditer(text): + shard_str, phase_start, phase_end, duration_us, quota, pending, finished = match.groups() + shard = int(shard_str) + rec = by_shard.setdefault(shard, {"shard": shard, "phase_ms": {}}) + rec.update( + { + "phase_start": phase_start, + "phase_end": phase_end, + "duration_us": int(duration_us), + "quota_depleted": bool(int(quota)), + "work_pending": bool(int(pending)), + 
"cycle_finished": bool(int(finished)), + } + ) + + return by_shard + + +async def get_arena(client: aioredis.Redis) -> dict | None: + report = await client.execute_command("MEMORY", "ARENA", "SUMMARY") + if isinstance(report, bytes): + report = report.decode() + return parse_arena_total(report) + + +async def run_defragment(client: aioredis.Redis) -> str: + reply = await client.execute_command("MEMORY", "DEFRAGMENT") + if isinstance(reply, bytes): + reply = reply.decode() + return reply + + +async def flushall(client: aioredis.Redis) -> None: + await client.execute_command("FLUSHALL") + + +def read_log_delta(path: str, start_offset: int) -> tuple[str, int]: + """Return text written to `path` since `start_offset`, plus new EOF.""" + with open(path, "rb") as fh: + fh.seek(0, os.SEEK_END) + end = fh.tell() + if end < start_offset: + # log was truncated/rotated under us; restart from 0 + start_offset = 0 + fh.seek(start_offset) + data = fh.read(end - start_offset) + return data.decode(errors="replace"), end + + +def log_size(path: str) -> int: + try: + return os.path.getsize(path) + except FileNotFoundError: + return 0 + + +def print_summary(records: list[dict]) -> None: + if not records: + return + print("\n=== summary ===") + for rec in records: + before = rec.get("before") or {} + after = rec.get("after") or {} + waste_before = before.get("waste_pct") + waste_after = after.get("waste_pct") + waste_str = ( + f"{waste_before:.2f}% -> {waste_after:.2f}%" + if waste_before is not None and waste_after is not None + else "waste n/a" + ) + call_ms = rec.get("call_ms", 0.0) + + shards = rec.get("shards") or [] + committed_drop = ( + (before.get("committed", 0) - after.get("committed", 0)) if before and after else 0 + ) + + # Aggregate per-phase CPU=ms across shards (max), preferring the new + # cpu= field over the wall-clock took= field. "-" for unparsed. 
+ phase_keys = ("census", "plan", "evacuate", "verify") + phase_max: dict[str, float | None] = {k: None for k in phase_keys} + any_cpu = False + any_wall = False + for s in shards: + cpu_map = s.get("phase_cpu_ms") or {} + wall_map = s.get("phase_ms") or {} + for k in phase_keys: + v = cpu_map.get(k, wall_map.get(k)) + if k in cpu_map: + any_cpu = True + elif k in wall_map: + any_wall = True + if v is None: + continue + phase_max[k] = v if phase_max.get(k) is None else max(phase_max[k], v) + + if any_cpu or any_wall: + phase_str = " ".join( + f"{k[:4]}={'-' if phase_max[k] is None else f'{phase_max[k]:.1f}ms'}" + for k in phase_keys + ) + label = "cpu" if any_cpu else "wall" + phases_part = f"phases.{label}[{phase_str}]" + else: + phases_part = "phases[no transitions]" + + print( + f"cycle {rec['cycle']:>3}: waste {waste_str} call={call_ms:.1f}ms " + f"{phases_part} " + f"committed_drop={committed_drop:>+13,}B" + ) + + total_cpu_ms = sum(r.get("call_ms", 0.0) for r in records) + first_ts = records[0].get("ts_ns") + last_ts = records[-1].get("ts_ns") + wall_ms = (last_ts - first_ts) / 1_000_000.0 if first_ts and last_ts else 0.0 + first_waste = (records[0].get("before") or {}).get("waste_pct") + last_waste = (records[-1].get("after") or {}).get("waste_pct") + waste_summary = ( + f"{first_waste:.2f}% -> {last_waste:.2f}%" + if first_waste is not None and last_waste is not None + else "n/a" + ) + print( + f"\ntotals: {len(records)} cycles " + f"defrag_cpu={total_cpu_ms:.1f}ms wall={wall_ms / 1000.0:.2f}s " + f"waste {waste_summary}" + ) + print("(phases.cpu = server CPU per phase; phases.wall = older log without cpu= field)") + + +async def main(args: argparse.Namespace) -> None: + client = aioredis.Redis(host=args.host, port=args.port, db=0) + output_path = args.output or os.path.join("runs", f"{args.label}.jsonl") + + out_dir = os.path.dirname(output_path) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + records: list[dict] = [] + last_waste: float | None = None + stall_armed = False + stop_reason = f"reached --cycles={args.cycles}" + print(f"writing run: {output_path}") + with open(output_path, "w") as out_fh: + for cycle in range(args.cycles): + log_offset = log_size(args.log_path) + ts_ns = time.time_ns() + + before = await get_arena(client) + call_start_ns = time.monotonic_ns() + reply = await run_defragment(client) + call_ms = (time.monotonic_ns() - call_start_ns) / 1_000_000.0 + after = await get_arena(client) + + # Tiny pause so any tail-end log line lands before we read. 
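+ # 50ms is a heuristic, not a guarantee: if the log flushes later than
+ # this, parse_log_delta simply matches fewer lines and the per-shard
+ # fields are absent from this record rather than wrong.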
+ await asyncio.sleep(0.05) + log_text, _new_offset = read_log_delta(args.log_path, log_offset) + shards = parse_log_delta(log_text) + + record = { + "label": args.label, + "cycle": cycle, + "ts_ns": ts_ns, + "call_ms": call_ms, + "before": before, + "after": after, + "defrag_reply": reply, + "shards": [shards[k] for k in sorted(shards)], + } + out_fh.write(json.dumps(record) + "\n") + out_fh.flush() + records.append(record) + + committed_drop = record_committed_drop(record) + waste_before = (before or {}).get("waste_pct") + waste_after = (after or {}).get("waste_pct") + waste_str = ( + f"{waste_before:.2f}% -> {waste_after:.2f}%" + if waste_before is not None and waste_after is not None + else "n/a" + ) + print(f"cycle={cycle} waste={waste_str} committed_drop={committed_drop:,}B") + + current_waste = (after or {}).get("waste_pct") + last_waste = current_waste if current_waste is not None else last_waste + if ( + args.target_waste is not None + and current_waste is not None + and current_waste <= args.target_waste + ): + stop_reason = ( + f"reached target waste {args.target_waste:.2f}% at " + f"{current_waste:.2f}% (cycle {cycle})" + ) + break + + if committed_drop > 0 or record_waste_drop(record) > 0: + stall_armed = True + + reason = stall_reason(records, args, stall_armed) + if reason is not None: + stop_reason = f"{reason} (cycle {cycle})" + break + + if cycle + 1 < args.cycles: + await asyncio.sleep(args.sleep_ms / 1000.0) + + try: + if ( + stop_reason.startswith("reached --cycles=") + and args.target_waste is not None + and last_waste is not None + ): + stop_reason = ( + f"reached --cycles={args.cycles} (target {args.target_waste:.2f}% " + f"not reached, final {last_waste:.2f}%)" + ) + print(f"\nstopped: {stop_reason}") + print_summary(records) + await flushall(client) + print("\ncleanup: FLUSHALL") + finally: + await client.aclose() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "label", choices=("old", "new"), help="run label used for runs/