Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
f326f23
server: defrag 2
abhijat May 3, 2026
94059de
server: use cursor hints
abhijat May 4, 2026
9f88642
server: baseline script
abhijat May 4, 2026
526f132
server: benchmarks
abhijat May 4, 2026
4e8d45b
server: sampling census
abhijat May 4, 2026
60c4ff7
server: tooling for timing runs
abhijat May 5, 2026
9064c68
server: tail caching
abhijat May 5, 2026
6173e1e
patches: mimalloc mark skipped page
abhijat May 5, 2026
1346ad7
server: drop census entirely
abhijat May 5, 2026
4dedde4
server: remove top-k temporarily
abhijat May 6, 2026
bb3ad43
server: cleanups
abhijat May 6, 2026
382e330
remove cache and bloom filter from evac phase
abhijat May 6, 2026
79240fe
server: benchmark virtual calls
abhijat May 6, 2026
855f644
server: use tag dispatch
abhijat May 6, 2026
76db21e
server: clean up script
abhijat May 7, 2026
22b68f4
server: remove underused set from old algo
abhijat May 7, 2026
ce76322
server: silence warnings on page usage array access
abhijat May 7, 2026
9e55bc9
server: add back virtual calls
abhijat May 7, 2026
005a20b
tools: add defrag_run.sh and defrag_sweep.sh
glevkovich May 7, 2026
bf5e268
patches: fix field access from invalid page
abhijat May 7, 2026
4c1e5f9
fix(defrag): filter pages with immovable data in ClassifyForTarget
glevkovich May 7, 2026
bf05fb1
feat(defrag): add key defrag to phased evacuator
glevkovich May 7, 2026
d890500
server: increase max retained page cap
abhijat May 8, 2026
a2eef1a
tools: print more data on bands
abhijat May 8, 2026
732935e
tools: flushall at the end of the test run
abhijat May 8, 2026
93c9a3e
tools: scripts for plotting
abhijat May 8, 2026
e274769
Merge branch 'main' into abhijat/hack/defrag-2
abhijat May 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions patches/mimalloc-v2.2.4/5_skip_defrag_targets.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
commit 0000000000000000000000000000000000000000
Author: Dragonfly Defrag Hackathon <dev@dragonflydb.io>
Date: Tue May 5 17:30:00 2026 +0000

feat: skip defrag-targeted pages in mi_malloc

Adds a defrag_skip byte to mi_page_t and a public API
mi_page_set_defrag_skip(page_addr, skip). When set, mi_malloc skips
the page in the small-size fast path, the queue-head fast path, and
mi_page_queue_find_free_ex. Prevents new allocations from landing
on a page that phased defrag is trying to drain.

--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -516,7 +516,14 @@
mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE));
const size_t idx = _mi_wsize_from_size(size);
mi_assert_internal(idx < MI_PAGES_DIRECT);
- return heap->pages_free_direct[idx];
+ mi_page_t* page = heap->pages_free_direct[idx];
+ // dragonfly: when the cached small-page is a defrag target, force the
+ // generic slow path so the allocation goes through `mi_find_free_page` ->
+ // `mi_page_queue_find_free_ex` which skips defrag-targeted pages.
+ if (mi_unlikely(page->defrag_skip)) {
+ return (mi_page_t*) &_mi_page_empty;
+ }
+ return page;
}

// Segment that contains the pointer
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -337,6 +337,7 @@
uint16_t used; // number of blocks in use (including blocks in `thread_free`)
uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type
+ uint8_t defrag_skip; // dragonfly: when nonzero, alloc paths skip this page (it is being drained by defrag)
// padding
size_t block_size; // size available in each block (always `>0`)
uint8_t* page_start; // start of the page area containing the blocks
--- a/src/init.c
+++ b/src/init.c
@@ -26,6 +26,7 @@
0, // used
0, // block size shift
0, // heap tag
+ 0, // defrag_skip (dragonfly)
0, // block_size
NULL, // page_start
#if (MI_PADDING || MI_ENCODE_FREELIST)
--- a/src/page.c
+++ b/src/page.c
@@ -697,6 +697,7 @@
page->keys[1] = _mi_heap_random_next(heap);
#endif
page->free_is_zero = page->is_zero_init;
+ page->defrag_skip = 0; // dragonfly: fresh page is not a defrag target
#if MI_DEBUG>2
if (page->is_zero_init) {
mi_track_mem_defined(page->page_start, page_size);
@@ -763,6 +764,14 @@
while (page != NULL)
{
mi_page_t* next = page->next; // remember next
+
+ // dragonfly: pages tagged by defrag are being drained; skip them so new
+ // allocations don't refill targets while EVACUATE moves entries off.
+ if (page->defrag_skip) {
+ page = next;
+ continue;
+ }
+
#if MI_STAT
count++;
#endif
@@ -860,6 +869,12 @@

// check the first page: we even do this with candidate search or otherwise we re-search every time
mi_page_t* page = pq->first;
+ // dragonfly: skip the queue-head fast path when it points at a defrag
+ // target so the search falls through to mi_page_queue_find_free_ex which
+ // walks past target pages.
+ if (page != NULL && page->defrag_skip) {
+ page = NULL;
+ }
if (page != NULL) {
#if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -711,6 +711,17 @@
return result;
}

+// dragonfly: mark a page so that mi_malloc skips it when picking a page to
+// allocate from. Used by phased defrag to prevent EVACUATE moves from
+// refilling target pages that we are trying to drain. `page_addr` must be
+// a value previously returned in mi_page_usage_stats_t::page_address (i.e.
+// a `(uintptr_t)mi_page_t*`).
+void mi_page_set_defrag_skip(uintptr_t page_addr, bool skip) mi_attr_noexcept {
+ if (page_addr == 0) return;
+ mi_page_t* page = (mi_page_t*) page_addr;
+ page->defrag_skip = (skip ? 1 : 0);
+}
+
// ------------------------------------------------------
// ensure explicit external inline definitions are emitted!
// ------------------------------------------------------
84 changes: 84 additions & 0 deletions patches/mimalloc-v2.2.4/6_dfly_underutil_callback.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
commit 0000000000000000000000000000000000000000
Author: Dragonfly Defrag Hackathon <dev@dragonflydb.io>
Date: Tue May 5 18:00:00 2026 +0000

feat: underutilized-page callback for reactive defrag

Adds a public API mi_dfly_set_underutil_callback(cb) that fires on
local-thread free when a page's used count drops below a configured
threshold for the first time. Lets phased defrag enqueue pages
reactively instead of doing a full prime-table CENSUS scan to
discover them.

--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -271,6 +271,13 @@

mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);

+// dragonfly: callback fired on local-thread free when a page's used count
+// crosses the configured threshold downward. Lets phased defrag enqueue
+// pages reactively instead of doing a full prime-table CENSUS scan.
+typedef void (*mi_dfly_underutil_callback_t)(uintptr_t page_addr);
+mi_decl_export void mi_dfly_set_underutil_callback(mi_dfly_underutil_callback_t cb);
+mi_decl_export void mi_dfly_set_underutil_threshold_pct(uint8_t pct);
+
// Experimental
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept;
--- a/src/free.c
+++ b/src/free.c
@@ -18,6 +18,24 @@
static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block);
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block);

+// ------------------------------------------------------
+// Dragonfly: underutilized-page callback
+// Fired on local-thread free when the page's used count crosses the
+// configured threshold downward. Lets defrag enqueue pages reactively
+// instead of doing a full prime-table CENSUS scan.
+// ------------------------------------------------------
+static mi_dfly_underutil_callback_t _mi_dfly_underutil_cb = NULL;
+static uint8_t _mi_dfly_underutil_pct = 80;
+
+void mi_dfly_set_underutil_callback(mi_dfly_underutil_callback_t cb) {
+ _mi_dfly_underutil_cb = cb;
+}
+
+void mi_dfly_set_underutil_threshold_pct(uint8_t pct) {
+ if (pct > 100) pct = 100;
+ _mi_dfly_underutil_pct = pct;
+}
+

// ------------------------------------------------------
// Free
@@ -44,12 +62,28 @@
// actual free: push on the local free list
mi_block_set_next(page, block, page->local_free);
page->local_free = block;
+ // dragonfly: decide whether to fire the underutilized-page callback BEFORE
+ // _mi_page_retire below. _mi_page_retire may call _mi_page_free which
+ // returns the page metadata to the segment, after which reading
+ // page->used / page->capacity is a UAF. We gate on page->used > 1 so that
+ // after --, used > 0 (page not retired and still alive), which makes
+ // (uintptr_t)page a valid address to hand to the callback.
+ bool fire_underutil_cb = false;
+ if (mi_unlikely(_mi_dfly_underutil_cb != NULL && page->used > 1)) {
+ const uint32_t cap_thr = (uint32_t)page->capacity * _mi_dfly_underutil_pct;
+ const uint32_t prev_x100 = (uint32_t)page->used * 100;
+ const uint32_t cur_x100 = (uint32_t)(page->used - 1) * 100;
+ fire_underutil_cb = (prev_x100 > cap_thr && cur_x100 <= cap_thr);
+ }
if mi_unlikely(--page->used == 0) {
_mi_page_retire(page);
}
else if mi_unlikely(check_full && mi_page_is_in_full(page)) {
_mi_page_unfull(page);
}
+ if (fire_underutil_cb) {
+ _mi_dfly_underutil_cb((uintptr_t)page);
+ }
}

// Adjust a block that was allocated aligned, to the actual start of the block in the page.
8 changes: 6 additions & 2 deletions src/core/dash.h
Original file line number Diff line number Diff line change
Expand Up @@ -412,14 +412,18 @@ class DashTable : public detail::DashTableBase {
return stash_unloaded_;
}

// Advances the cursor by exactly one logical bucket in bucket-major order,
// without visiting bucket contents. Used by sampled walkers (e.g. defrag
// CENSUS) to skip buckets between Traverse calls. Returns Cursor::end() once
// the table is exhausted.
Cursor AdvanceCursorBucketOrder(Cursor cursor);

private:
enum class InsertMode {
kInsertIfNotFound,
kForceInsert,
};

Cursor AdvanceCursorBucketOrder(Cursor cursor);

template <typename U, typename V, typename EvictionPolicy>
std::pair<iterator, bool> InsertInternal(U&& key, V&& value, EvictionPolicy& policy,
InsertMode mode);
Expand Down
2 changes: 1 addition & 1 deletion src/core/page_usage/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
add_library(dfly_page_usage page_usage_stats.cc)
add_library(dfly_page_usage page_usage_stats.cc page_usage_visitors.cc)
target_link_libraries(dfly_page_usage base TRDP::hdr_histogram redis_lib absl::strings)
6 changes: 3 additions & 3 deletions src/core/page_usage/page_usage_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,9 @@ uint64_t PageUsage::UsedQuotaCycles() const {
}

bool PageUsage::IsPageForObjectUnderUtilized(void* object) {
mi_page_usage_stats_t stat;
zmalloc_page_is_underutilized(object, threshold_, collect_stats_ == CollectPageStats::YES, &stat);
return ConsumePageStats(stat);
return ConsumePageStats(mi_heap_page_is_underutilized(static_cast<mi_heap_t*>(zmalloc_heap),
object, threshold_,
collect_stats_ == CollectPageStats::YES));
}

bool PageUsage::IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) {
Expand Down
37 changes: 34 additions & 3 deletions src/core/page_usage/page_usage_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ namespace dfly {
class CycleQuota {
public:
static constexpr uint64_t kMaxQuota = std::numeric_limits<uint64_t>::max();
static constexpr uint64_t kDefaultDefragQuota = 150;
// 40000 here is ~10ms of real time because helio's CycleClock mixes raw rdtsc
// with abseil's shifted frequency, making FromUsec/ToUsec ~4x off on x86.
// Once the helio bug is fixed, drop this to 10000.
static constexpr uint64_t kDefaultDefragQuota = 40'000;

explicit CycleQuota(uint64_t quota_usec);

Expand Down Expand Up @@ -83,9 +86,11 @@ class PageUsage {

uint64_t UsedQuotaCycles() const;

// Returns true when the object on the page should be reallocated. Subclasses
// (Evacuator, CensusTaker) override to short-circuit or extend the decision.
// Out-of-line in page_usage_stats.cc.
virtual bool IsPageForObjectUnderUtilized(void* object);

bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object);
virtual bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object);

CollectedPageStats CollectedStats() const {
return unique_pages_.CollectedStats();
Expand All @@ -107,6 +112,31 @@ class PageUsage {

bool QuotaDepleted() const;

virtual bool ShouldStop() const {
return false;
}

// Read-only walkers (e.g. CENSUS) never reallocate, so callers can skip
// pre/post sizing work that only matters when an object may move.
virtual bool IsReadOnly() const {
return false;
}

// When true, the traversal should also defrag keys (it->first) in addition
// to values. Only the phased algorithm (CENSUS + EVACUATE) enables this.
virtual bool ShouldDefragKeys() const {
return false;
}

// Walkers may stash the bucket cursor about to be visited so that downstream
// Observe() calls can attribute candidates back to a bucket. Default no-op.
virtual void SetCurrentBucketCursor(uint64_t /*cursor*/) {
}

float threshold() const {
return threshold_;
}

void ExtendQuota(uint64_t quota_usec);

private:
Expand Down Expand Up @@ -136,6 +166,7 @@ class PageUsage {

CycleQuota quota_;

protected:
// For use in testing, forces reallocate check to always return true
bool force_reallocate_{false};
};
Expand Down
Loading
Loading