diff --git a/.github/workflows/bullmq-tests.yml b/.github/workflows/bullmq-tests.yml index 047af4e6ee72..351e792a68f7 100644 --- a/.github/workflows/bullmq-tests.yml +++ b/.github/workflows/bullmq-tests.yml @@ -54,12 +54,15 @@ jobs: - name: Start Dragonfly run: | + mkdir -p /tmp/df-logs ${GITHUB_WORKSPACE}/build/dragonfly \ --alsologtostderr \ + --log_dir=/tmp/df-logs \ --cluster_mode=emulated \ --lock_on_hashtags \ --dbfilename= \ - --port 6379 & + --port 6379 \ + >/tmp/df-logs/stdout.log 2>/tmp/df-logs/stderr.log & timeout 15s bash -c 'until redis-cli -p 6379 PING 2>/dev/null | grep -q PONG; do sleep 0.1; done' - name: Build BullMQ @@ -84,8 +87,8 @@ jobs: if: failure() uses: actions/upload-artifact@v7 with: - name: unit_logs - path: /tmp/dragonfly.* + name: dragonfly-logs + path: /tmp/df-logs/ - name: Send notification on failure if: failure() && github.ref == 'refs/heads/main' diff --git a/contrib/charts/dragonfly/go.mod b/contrib/charts/dragonfly/go.mod index 22806292eef6..c59e2410fa9c 100644 --- a/contrib/charts/dragonfly/go.mod +++ b/contrib/charts/dragonfly/go.mod @@ -1,13 +1,11 @@ module dragonfly -go 1.24.0 - -toolchain go1.24.7 +go 1.25.0 require github.com/gruntwork-io/terratest v0.51.0 require ( - filippo.io/edwards25519 v1.1.0 // indirect + filippo.io/edwards25519 v1.1.1 // indirect github.com/BurntSushi/toml v1.5.0 // indirect github.com/aws/aws-sdk-go-v2 v1.41.5 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.8 // indirect @@ -83,7 +81,7 @@ require ( github.com/homeport/dyff v1.10.2 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect - github.com/jackc/pgx/v5 v5.7.6 // indirect + github.com/jackc/pgx/v5 v5.9.2 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/lucasb-eyer/go-colorful v1.3.0 // indirect @@ -93,7 +91,7 @@ require ( github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/mitchellh/go-ps v1.0.0 // indirect github.com/mitchellh/hashstructure v1.1.0 // indirect - github.com/moby/spdystream v0.5.0 // indirect + github.com/moby/spdystream v0.5.1 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect diff --git a/contrib/charts/dragonfly/go.sum b/contrib/charts/dragonfly/go.sum index f113dc8a9f0c..e14ff49cef56 100644 --- a/contrib/charts/dragonfly/go.sum +++ b/contrib/charts/dragonfly/go.sum @@ -1,5 +1,5 @@ -filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= -filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= +filippo.io/edwards25519 v1.1.1 h1:YpjwWWlNmGIDyXOn8zLzqiD+9TyIlPhGFG96P39uBpw= +filippo.io/edwards25519 v1.1.1/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg= github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= @@ -169,8 +169,8 @@ github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsI github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 
h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= -github.com/jackc/pgx/v5 v5.7.6 h1:rWQc5FwZSPX58r1OQmkuaNicxdmExaEz5A2DO2hUuTk= -github.com/jackc/pgx/v5 v5.7.6/go.mod h1:aruU7o91Tc2q2cFp5h4uP3f6ztExVpyVv88Xl/8Vl8M= +github.com/jackc/pgx/v5 v5.9.2 h1:3ZhOzMWnR4yJ+RW1XImIPsD1aNSz4T4fyP7zlQb56hw= +github.com/jackc/pgx/v5 v5.9.2/go.mod h1:mal1tBGAFfLHvZzaYh77YS/eC6IX9OWbRV1QIIM0Jn4= github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -198,8 +198,8 @@ github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg= github.com/mitchellh/hashstructure v1.1.0 h1:P6P1hdjqAAknpY/M1CGipelZgp+4y9ja9kmUZPXP+H0= github.com/mitchellh/hashstructure v1.1.0/go.mod h1:xUDAozZz0Wmdiufv0uyhnHkUTN6/6d8ulp4AwfLKrmA= -github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= -github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= +github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= +github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= diff --git a/docs/pub-sub.md b/docs/pub-sub.md index ab300ce8989f..a78ca14f2b24 100644 --- a/docs/pub-sub.md +++ b/docs/pub-sub.md @@ -2,9 +2,9 @@ This document describes how Dragonfly implements the Publish-Subscribe (Pub/Sub) messaging paradigm within its shared-nothing, multi-threaded architecture. It covers the global -subscription registry, the Read-Copy-Update (RCU) mechanism used to prevent lock contention -on the publish path, the asynchronous message delivery pipeline, and the backpressure system -that protects the server from slow-subscriber OOM. +subscription registry backed by a `ShardedHashMap`, the per-shard two-lock RCU mechanism +used to minimize lock contention on the publish path, the asynchronous message delivery +pipeline, and the backpressure system that protects the server from slow-subscriber OOM. ## Overview @@ -13,19 +13,25 @@ unique challenge: subscriptions must be globally addressable across all threads, global lock on every `PUBLISH` would create a severe bottleneck. A single popular channel with thousands of subscribers could serialize all publish operations onto one shard thread. -Dragonfly solves this by using a **centralized `ChannelStore` updated via RCU -(Read-Copy-Update)**: +Dragonfly solves this with a **single global `ChannelStore`** backed by a +`ShardedHashMap` — a custom hash map split into 16 independent +shards, each protected by two fiber-aware locks: -- **Reads (`PUBLISH` / `SPUBLISH`)** are lock-free and use a thread-local pointer to the - most recent `ChannelStore` snapshot. 
-- **Writes (`SUBSCRIBE` / `UNSUBSCRIBE` / `PSUBSCRIBE` / `PUNSUBSCRIBE`)** are serialized
-  by a single mutex, performed by copying the necessary routing maps, applying the mutation,
-  and atomically swapping the global pointer.
+- **`write_mu_`** (exclusive) — serializes writers within a shard. Readers never acquire it.
+- **`read_mu_`** (shared/exclusive) — taken shared by readers; taken exclusively only for
+  structural map changes (inserting/erasing channel entries) and for safe deletion of old
+  `SubscribeMap` pointers (draining in-flight readers).
+
+Within each shard, subscriber updates use an **RCU-style pointer swap** via
+`UpdatablePointer`: the writer copies the old `SubscribeMap`, modifies the copy, and
+atomically stores the new pointer — all while holding only `write_mu_`, so readers on the
+same shard proceed concurrently. Structural changes (new channel, channel deletion) briefly
+acquire `read_mu_` exclusively to block readers.

 This design avoids contention on a single shard thread for heavy throughput on a single
-channel and seamlessly scales across multiple threads even with a small number of channels.
-Publish latency is lower than a shard-routed design because no inter-thread hop is required
-to look up subscribers — the caller reads its local copy directly.
+channel and scales across threads even with a small number of channels. Publish latency is
+low because no inter-thread hop is required to look up subscribers — the caller takes its
+shard's `read_mu_` in shared mode and reads the map directly.

 Dragonfly supports three flavors of Pub/Sub:

@@ -39,11 +45,11 @@ Dragonfly supports three flavors of Pub/Sub:

 | Type | Location | Role |
 |------|----------|------|
-| `ChannelStore` | `src/server/channel_store.h` | Centralized registry mapping channels/patterns to subscribers. Updated via RCU. |
-| `ChannelStoreUpdater` | `src/server/channel_store.h` | Orchestrates RCU mutations (add/remove) to the `ChannelStore`. |
+| `ChannelStore` | `src/server/channel_store.h` | Centralized registry mapping channels/patterns to subscribers. Single global instance (`extern ChannelStore* channel_store`). |
+| `ChannelStoreUpdater` | `src/server/channel_store.h` | Batches subscribe/unsubscribe operations by shard and applies them in one `Mutate` call per shard. |
 | `ChannelStore::Subscriber` | `src/server/channel_store.h` | Represents a subscribed client. Wraps `facade::ConnectionRef` plus a pattern string. |
-| `ChannelStore::ControlBlock` | `src/server/channel_store.h` | Holds the `most_recent` atomic pointer and `update_mu` mutex. Prevents overlapping structural updates. |
-| `ChannelStore::ChannelMap` | `src/server/channel_store.h` | `flat_hash_map` — maps channel/pattern names to subscriber lists. |
+| `ChannelStore::ChannelMap` | `src/server/channel_store.h` | `ShardedHashMap` — sharded map of channel/pattern names to subscriber lists. |
+| `ShardedHashMap` | `src/core/sharded_hash_map.h` | Generic thread-safe sharded hash map. 16 shards, each with `write_mu_` and `read_mu_` fiber-aware locks over an `absl::flat_hash_map`. |
 | `ChannelStore::SubscribeMap` | `src/server/channel_store.h` | `flat_hash_map` — maps subscriber contexts to their owning thread. |
 | `ChannelStore::UpdatablePointer` | `src/server/channel_store.h` | Atomic wrapper around `SubscribeMap*`. Supports lock-free reads (`acquire`) and RCU-style swaps (`release`). |
 | `ConnectionState::SubscribeInfo` | `src/server/conn_context.h` | Per-connection set of subscribed channels and patterns. Created lazily on first subscription.
| @@ -58,89 +64,86 @@ Dragonfly supports three flavors of Pub/Sub: Pub/Sub Data Flow -## Subscription Management (RCU) +## Subscription Management (Sharded RCU) ### Data Structure Layout -Each `ChannelStore` instance holds two `ChannelMap` pointers: +The single global `ChannelStore` holds two `ChannelMap` instances (each a +`ShardedHashMap`):
Data Structure Layout
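+To make the layout concrete, here is a minimal structural sketch (illustrative only:
+type and member names follow the table above, and the helio fiber primitives are an
+assumption, not the actual `sharded_hash_map.h` source):
+
+```cpp
+#include <cstddef>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/hash/hash.h"
+#include "util/fibers/synchronization.h"  // fiber-aware Mutex / SharedMutex (helio)
+
+// Simplified stand-ins for the real types.
+using SubscribeMap = absl::flat_hash_map<void* /*conn context*/, unsigned /*thread id*/>;
+
+struct UpdatablePointer {
+  SubscribeMap* ptr = nullptr;  // the real code swaps this with release/acquire semantics
+};
+
+struct Shard {
+  util::fb2::Mutex write_mu_;       // serializes writers within this shard
+  util::fb2::SharedMutex read_mu_;  // shared by readers; exclusive for map changes
+  absl::flat_hash_map<std::string, UpdatablePointer> map_;  // channel -> subscribers
+  std::vector<SubscribeMap*> freelist_;  // retired maps, deleted once readers drain
+};
+
+struct ShardedChannelMap {
+  static constexpr size_t kNumShards = 16;
+  Shard shards[kNumShards];
+
+  Shard& ShardOf(std::string_view channel) {
+    return shards[absl::HashOf(channel) % kNumShards];
+  }
+};
+```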
+Each of the 16 shards contains an `absl::flat_hash_map` guarded
+by two fiber-aware locks: `write_mu_` (serializes writers) and `read_mu_` (shared for
+readers, exclusive for structural changes).
+
 `UpdatablePointer` wraps a `std::atomic<SubscribeMap*>` with `memory_order_acquire` on read and
 `memory_order_release` on write. This ensures that when a thread reads the pointer, it also
 sees the fully constructed `SubscribeMap` that the writer published.

-### Two Levels of RCU
+### Per-Shard Two-Lock RCU

-The `ChannelStoreUpdater` implements two granularities of copy-on-write:
+The `ChannelStoreUpdater` groups pending subscribe/unsubscribe operations by shard index
+(via `Record()` → `ShardOf(channel)`) and processes each shard in a single `Mutate()` call.

-1. **ChannelMap-level copy** — triggered when a channel slot must be added (first subscriber)
-   or removed (last subscriber leaves). The entire `ChannelMap` is shallow-copied, the slot is
-   added/removed on the copy, a new `ChannelStore` is allocated pointing to the new map, and
-   the global `control_block.most_recent` is swapped.
+Within each shard's `Mutate()` callback, the updater handles two cases:

-2. **SubscribeMap-level RCU** — triggered when adding/removing a subscriber to an existing
-   channel (the map slot already exists). Only the `SubscribeMap` for that channel is copied,
-   the mutation is applied, and the `UpdatablePointer` is atomically swapped. No new
-   `ChannelStore` or `ChannelMap` is needed.
+**Case 1: Existing channel (add/remove subscriber, channel slot stays)**
+1. Acquire `write_mu_` exclusively (done by `Mutate`) — serializes writers on this shard.
+2. Copy the `SubscribeMap`, apply the mutation, atomically swap via `UpdatablePointer::Set`.
+   Readers are NOT blocked — they may still read the old pointer.
+3. Push the old `SubscribeMap*` onto a per-shard `freelist_`.
+4. Release `write_mu_` (Mutate returns).
+5. Acquire `read_mu_` exclusively via `WithReadExclusiveLock` — this drains any reader that
+   loaded the old `SubscribeMap` pointer, then deletes all entries in the freelist.

-This two-level scheme is implemented in `ChannelStoreUpdater::GetTargetMap()`:
+**Case 2: New channel (first subscriber) or channel deletion (last subscriber leaves)**
+1. Inside the `Mutate` callback, call `LockReaders()` to acquire `read_mu_` exclusively.
+   This blocks all readers in the shard while inserting or erasing the key.
+2. For add: emplace a new `UpdatablePointer{new SubscribeMap{{cntx_, thread_id_}}}`.
+3. For remove: delete the `SubscribeMap`, erase the map entry.
+4. Writers on other shards are unaffected.

-```cpp
-pair<ChannelStore::ChannelMap*, bool> ChannelStoreUpdater::GetTargetMap(ChannelStore* store) {
-  auto* target = pattern_ ? store->patterns_ : store->channels_;
-
-  for (auto key : ops_) {
-    auto it = target->find(key);
-    DCHECK(it != target->end() || to_add_);
-    // We need to make a copy, if we are going to add or delete a new map slot.
-    if ((to_add_ && it == target->end()) || (!to_add_ && it->second->size() == 1))
-      return {new ChannelStore::ChannelMap{*target}, true};
-  }
-
-  return {target, false};
-}
-```
-
-### Apply() Flow
-
- Apply Flow -
- -Step 8 uses `AwaitBrief` (non-preempting dispatch) to update each thread's local pointer. -The `seq_cst` load in the callback ensures the thread reads the latest pointer value _and_ -the memory published behind it. - -### Modify() — Per-Key Mutation - -For each key in the pending operations: +### Apply() — Batch Per-Shard Mutation ``` -Modify(target, key) - it = target->find(key) - - Case 1: Adding, key not in map (new channel) - → target->emplace(key, new SubscribeMap{{cntx_, thread_id_}}) - - Case 2: Removing, last subscriber (channel disappears) - → freelist_.push_back(it->second.Get()) // defer deletion - → target->erase(it) - - Case 3: Existing channel, add/remove subscriber (RCU on SubscribeMap) - → replacement = new SubscribeMap{*it->second} - → if to_add_: replacement->emplace(cntx_, thread_id_) - else: replacement->erase(cntx_) - → freelist_.push_back(it->second.Get()) // old map, defer deletion - → it->second.Set(replacement) // atomic release-store +ChannelStoreUpdater::Apply() + for each shard sid in 0..15: + if ops_[sid] empty: continue + + map.Mutate(ShardId{sid}, [&](const auto& m, auto LockReaders) { + // Phase 1: RCU updates for existing channels (only write_mu_ held) + for each key in ops_[sid]: + it = m.find(key) + if to_add_ and it exists: + → copy SubscribeMap, add {cntx_, thread_id_}, swap pointer + → push old pointer to freelist_[sid] + if !to_add_ and it exists and size > 1: + → copy SubscribeMap, erase cntx_, swap pointer + → push old pointer to freelist_[sid] + if needs structural change: + → mark needs_map_change[i] = true + + // Phase 2: structural changes (acquire read_mu_ exclusively) + if has_map_change: + auto locked = LockReaders() + for each key needing map change: + if to_add_: locked.map.emplace(key, new SubscribeMap{...}) + if !to_add_: delete ptr, locked.map.erase(it) + }) + + // Phase 3: drain readers, delete old SubscribeMaps + if freelist_[sid] not empty: + map.WithReadExclusiveLock(ShardId{sid}, [&] { + for each sm in freelist_[sid]: delete sm + }) ``` -Old `SubscribeMap` pointers are not immediately deleted because concurrent `PUBLISH` -operations on other threads may still be reading them. They are placed in a `freelist_` and -deleted only after `AwaitBrief` completes — at which point every thread has acknowledged the -new state and no reader can hold a reference to the old maps. +This batching minimizes lock acquisitions: all keys mapping to the same shard are processed +under a single `write_mu_` acquisition, and old `SubscribeMap` pointers are cleaned up in +one `read_mu_` exclusive pass. ### Connection-Level Subscription State @@ -176,8 +179,8 @@ When a client issues `PUBLISH channel message` (or `SPUBLISH`): ``` SendMessages(channel, messages, sharded) 1. subscribers = FetchSubscribers(channel) - → exact match: channels_->find(channel) - → pattern match: for each (pat, subs) in *patterns_: + → exact match: channels_.FindIf(channel, ...) + → pattern match: patterns_.ForEachShared(...) if GlobMatcher{pat}.Matches(channel): Fill(subs, pat, &result) → sort result by thread_id (enables efficient per-thread dispatch) @@ -221,10 +224,12 @@ string allocations. ``` FetchSubscribers(channel) - 1. Exact match: channels_->find(channel) + 1. Exact match: channels_.FindIf(channel, callback) + → acquires read_mu_ shared on the channel's shard → if found, Fill() creates Subscriber entries from the SubscribeMap - 2. Pattern match: iterate ALL patterns + 2. 
Pattern match: patterns_.ForEachShared(callback)
+     → iterates ALL patterns across all 16 shards (each shard locked independently)
      → for each (pat, subs): GlobMatcher{pat, case_sensitive=true}.Matches(channel)
      → matching subscribers are added with their pattern string

  3. Sort by thread_id
      → enables O(log n) per-thread lookup during dispatch
```

+**Note**: `FetchSubscribers` is not atomic — each shard is locked independently via shared
+`read_mu_`, so the result may not reflect a fully consistent state. This trade-off is
+acceptable for pub/sub use cases.
+
 The `Fill` helper reads the `SubscribeMap` (via `UpdatablePointer::Get()` — acquire load)
 and creates `Subscriber` structs that hold a `ConnectionRef` (weak reference) obtained via
 `conn->Borrow()`.

@@ -411,20 +420,31 @@ is called:

 ```
 UnsubscribeAfterClusterSlotMigration(deleted_slots)
-  for each (channel, _) in *channels_:
+  // Phase 1: collect matching channels and their subscribers
+  channels_.ForEachShared([&](channel, up) {
     if deleted_slots.Contains(KeySlot(channel)):
-      csu.Record(channel)
-  csu.ApplyAndUnsubscribe()
+      Fill(*up, "", &subs)
+      owned_subs[channel] = sorted subs
+  })
+
+  if owned_subs empty: return
+
+  // Phase 2: remove all subscribers from matched channels
+  for each (channel, _) in owned_subs:
+    RemoveAllSubscribers(false, channel)
+
+  // Phase 3: notify connections on their owning threads
+  pool->AwaitFiberOnAll([&](idx, _) {
+    UnsubscribeConnectionsFromDeletedSlots(owned_subs, idx)
+  })
 ```

-`ApplyAndUnsubscribe()` differs from `Apply()`:
-1. It deep-copies the `ChannelMap` and removes the migrated channels.
-2. It calls `FetchSubscribers` for each removed channel _before_ updating the store
-   (since `FetchSubscribers` reads from the current active store).
-3. It uses `AwaitFiberOnAll` (fiber-based, may preempt) instead of `AwaitBrief` to dispatch
-   both the store update and unsubscription messages.
-4. On each thread, `UnsubscribeConnectionsFromDeletedSlots` sends `PubMessage`s with
-   `force_unsubscribe=true`, which triggers `sunsubscribe` push messages to affected clients.
+`RemoveAllSubscribers` uses `Mutate` to acquire `write_mu_`, then `LockReaders()` to block
+readers while deleting the `SubscribeMap` and erasing the channel entry.
+
+`AwaitFiberOnAll` (fiber-based, may preempt) dispatches to each thread, where
+`UnsubscribeConnectionsFromDeletedSlots` sends `PubMessage`s with `force_unsubscribe=true`
+via `BuildSender`, triggering `sunsubscribe` push messages to affected clients.

 ## Keyspace Event Notifications

@@ -439,8 +459,7 @@ When enabled:
 3. At the end of `DeleteExpiredStep`, batched events are published:

```cpp
-ChannelStore* store = ServerState::tlocal()->channel_store();
-store->SendMessages(
+channel_store->SendMessages(
     absl::StrCat("__keyevent@", cntx.db_index, "__:expired"),
     events, false);
 events.clear();

@@ -476,11 +495,11 @@ Notable flags:

 | Purpose | File Path |
 |---------|-----------|
 | ChannelStore & ChannelStoreUpdater | `src/server/channel_store.h`, `src/server/channel_store.cc` |
+| ShardedHashMap (underlying data structure) | `src/core/sharded_hash_map.h` |
 | Pub/Sub command handlers | `src/server/main_service.cc` (`Publish`, `Subscribe`, `Unsubscribe`, `PSubscribe`, `PUnsubscribe`, `Pubsub`) |
 | Connection-level subscription state | `src/server/conn_context.h`, `src/server/conn_context.cc` (`ChangeSubscriptions`, `UnsubscribeAll`, `PUnsubscribeAll`) |
 | PubMessage, AsyncFiber, backpressure | `src/facade/dragonfly_connection.h`, `src/facade/dragonfly_connection.cc` |
 | ConnectionRef (weak subscriber refs) | `src/facade/connection_ref.h` |
-| ServerState channel_store_ pointer | `src/server/server_state.h`, `src/server/server_state.cc` |
 | Keyspace event integration | `src/server/db_slice.cc` (`DeleteExpiredStep`) |
-| Cluster slot migration unsub | `src/server/channel_store.cc` (`UnsubscribeAfterClusterSlotMigration`, `ApplyAndUnsubscribe`) |
+| Cluster slot migration unsub | `src/server/channel_store.cc` (`UnsubscribeAfterClusterSlotMigration`, `RemoveAllSubscribers`) |
 | GlobMatcher for pattern matching | `src/core/glob_matcher.h` |
diff --git a/docs/transaction.md b/docs/transaction.md
index 8eff4366c619..2317648a3247 100644
--- a/docs/transaction.md
+++ b/docs/transaction.md
@@ -8,7 +8,7 @@ This document describes how Dragonfly transactions provide atomicity and seriali

 Serializability is an isolation level for database transactions. Serializability describes multiple transactions, where a transaction is usually composed of multiple operations on multiple objects.

-Database can executed transactions in parallel (and the operations in parallel). Serializability guarantees the result is the same with, as if the transactions were executed one by one. i.e. to behave like executed in a serial order.
+Databases can execute transactions in parallel (and their operations in parallel). Serializability guarantees that the result is the same as if the transactions were executed one by one, i.e. that they behave as if executed in some serial order.

 Serializability doesn’t guarantee the resulting serial order respects recency. I.e. the serial order can be different from the order in which transactions were actually executed. E.g. Tx1 begins earlier than Tx2, but the result behaves as if Tx2 executed before Tx1. That is also to say, to satisfy the same Serializability, there can be more than one possible execution schedulings.

@@ -124,7 +124,7 @@ There are three modes called "multi modes" in which a multi transaction can be e

 __1. Global mode__

-The transaction is equivalent to a global transaction with multiple hops. It is scheduled globally and the commands are executed as a series of consequitive hops. This mode is required for global commands (like MOVE) and for accessing undeclared keys in Lua scripts. Otherwise, it should be avoided, because it prevents Dragonfly from running concurrently and thus greatly decreases throughput.
+The transaction is equivalent to a global transaction with multiple hops. It is scheduled globally and the commands are executed as a series of consecutive hops. This mode is required for global commands (like MOVE) and for accessing undeclared keys in Lua scripts. Otherwise, it should be avoided, because it prevents Dragonfly from running concurrently and thus greatly decreases throughput.

 __2. Lock ahead mode__

@@ -144,11 +144,11 @@ Luckily we can make one important observation about command sequences. Given a s
 * each command needs to preserve its order only relative to other commands accessing the same shard
 * commands accessing different shards can run in parallel

-The basic idea behind command squashing is identifying consecutive series of single-shard commands and separating them by shards, while maintaing their relative order withing each shard. Once the commands are separated, we can execute a single hop on all relevant shards. Within each shard the hop callback will execute one by one only those commands, that assigned to its respective shard. Because all commands are already placed on their relevant threads, no further hops are required and all command callbacks are executed inline.
+The basic idea behind command squashing is identifying consecutive series of single-shard commands and separating them by shards, while maintaining their relative order within each shard. Once the commands are separated, we can execute a single hop on all relevant shards. Within each shard the hop callback will execute, one by one, only those commands that are assigned to its respective shard. Because all commands are already placed on their relevant threads, no further hops are required and all command callbacks are executed inline.

 Reviewing our initial problems, command squashing:
 * Allows executing many commands with only one hop
-* Allows executing commands in pararllel
+* Allows executing commands in parallel

 ## Optimizations
 Out of order transactions - TBD

@@ -192,7 +192,7 @@ For the single-threaded Redis the order is determined by following the natural e

 However with blocking scenario for BLPOP, we do not have a built-in mechanism to determine which key was filled earlier - since, as stated, the concept of total order does not exist for multiple shards.
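+To make the problem concrete, here is a small illustration (hypothetical helper code,
+not the engine implementation) of why shard-local orders cannot be compared:
+
+```cpp
+#include <iostream>
+#include <string>
+#include <vector>
+
+// Illustrative only: each shard orders just its own effects; there is no shared clock.
+struct ShardLog {
+  std::vector<std::string> effects;  // index in this vector = shard-local order
+
+  size_t Record(const std::string& effect) {
+    effects.push_back(effect);
+    return effects.size() - 1;  // meaningful only within this shard
+  }
+};
+
+int main() {
+  ShardLog shard_a, shard_b;  // no shared counter between them
+  size_t pos_a = shard_a.Record("LPUSH keyA v");
+  size_t pos_b = shard_b.Record("LPUSH keyB v");
+  // Comparing pos_a with pos_b says nothing about which push happened first in real
+  // time, which is exactly the information a blocking BLPOP would need.
+  std::cout << pos_a << " vs " << pos_b << "\n";
+  return 0;
+}
+```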
-### Interesing examples to consider: +### Interesting examples to consider: **Ex1:** ``` diff --git a/go.work b/go.work index 7d37c78ac8d1..1f70aa0629a4 100644 --- a/go.work +++ b/go.work @@ -1,6 +1,4 @@ -go 1.24.0 - -toolchain go1.24.7 +go 1.25.0 use ( ./contrib/charts/dragonfly diff --git a/go.work.sum b/go.work.sum index 6e1391921a15..670dd5687d8f 100644 --- a/go.work.sum +++ b/go.work.sum @@ -41,10 +41,10 @@ github.com/bradleyfalzon/ghinstallation v1.1.1/go.mod h1:vyCmHTciHx/uuyN82Zc3rXN github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw= -github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/containerd/stargz-snapshotter/estargz v0.14.3/go.mod h1:KY//uOCIkSuNAHhJogcZtrNHdKrA99/FCCRjE3HD36o= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/denisenkom/go-mssqldb v0.12.3/go.mod h1:k0mtMFOnU+AihqFxPMiF05rtiDrorD1Vrm1KEz5hxDo= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dimchansky/utfbom v1.1.1/go.mod h1:SxdoEBH5qIqFocHMyGOXVAybYJdr71b1Q/j0mACtrfE= @@ -87,6 +87,7 @@ github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHW github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/jstemmer/go-junit-report v1.0.0/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/klauspost/compress v1.16.5/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= diff --git a/helio b/helio index e8a8a0a67d90..2ed01846f979 160000 --- a/helio +++ b/helio @@ -1 +1 @@ -Subproject commit e8a8a0a67d90814dfaee57bc8371dcee122376e8 +Subproject commit 2ed01846f97968b9e684c22e9e1bef840893f72b diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 111d5a023d77..2e00bb377ee4 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -61,6 +61,7 @@ helio_cxx_test(zstd_test dfly_core TRDP::zstd LABELS DFLY) helio_cxx_test(dict_builder_test dfly_core LABELS DFLY) helio_cxx_test(top_keys_test dfly_core LABELS DFLY) helio_cxx_test(topk_test dfly_core LABELS DFLY) +helio_cxx_test(sharded_hash_map_test dfly_core LABELS DFLY) helio_cxx_test(page_usage_stats_test dfly_core LABELS DFLY) helio_cxx_test(cms_test dfly_core LABELS DFLY) helio_cxx_test(memory_test TRDP::mimalloc2 LABELS DFLY) diff --git a/src/core/bloom.cc b/src/core/bloom.cc index 0942439949cd..bcb2b248e29b 100644 --- a/src/core/bloom.cc +++ b/src/core/bloom.cc @@ -196,11 +196,11 @@ SBF& SBF::operator=(SBF&& src) noexcept { return *this; } -void SBF::AddFilter(const std::string& blob, unsigned hash_cnt) { +uint8_t* SBF::AllocateFilter(size_t alloc_size, unsigned hash_cnt) { PMR_NS::memory_resource* mr = 
filters_.get_allocator().resource();
-  uint8_t* ptr = (uint8_t*)mr->allocate(blob.size());
-  memcpy(ptr, blob.data(), blob.size());
-  filters_.emplace_back().Init(ptr, blob.size(), hash_cnt);
+  const auto ptr = static_cast<uint8_t*>(mr->allocate(alloc_size));
+  filters_.emplace_back().Init(ptr, alloc_size, hash_cnt);
+  return ptr;
 }

 bool SBF::Add(std::string_view str) {
@@ -387,51 +387,79 @@ void SBFDumpIterator::ResolveCursorToPos() {

 nonstd::expected<SBF*, SBFLoadResult> LoadSBFHeader(std::string_view header_data, PMR_NS::memory_resource* mr) {
+  using enum SBFLoadResult;
+  using nonstd::make_unexpected;
+
   if (header_data.size() < kDumpHeaderSize)
-    return nonstd::make_unexpected(SBFLoadResult::kTruncatedInput);
+    return make_unexpected(kTruncatedInput);
+
+  if (header_data.size() > kDumpHeaderSize)
+    return make_unexpected(kBadInput);

   const char* ptr = header_data.data();
   if (const uint32_t version = absl::little_endian::Load32(ptr); version != kSbfDumpVersion)
-    return nonstd::make_unexpected(SBFLoadResult::kBadVersion);
+    return make_unexpected(kBadVersion);

   const double grow_factor = std::bit_cast<double>(absl::little_endian::Load64(ptr + 4));
+  if (!std::isfinite(grow_factor) || grow_factor < 1.0)
+    return make_unexpected(kBadInput);
+
   // Initialize everything to 0, later filters will overwrite these values
   return CompactObj::AllocateMR<SBF>(grow_factor, 0.0, 0UL, 0UL, 0UL, mr);
 }

 SBFLoadResult AddNewFilterToSBF(std::string_view data, SBF* sbf) {
+  using enum SBFLoadResult;
+
   if (data.size() < kDumpFilterMetaSize)
-    return SBFLoadResult::kTruncatedInput;
+    return kTruncatedInput;

   auto [hash_cnt, data_length, state] = SBFFilterMeta::Parse(data.data());
+  if (hash_cnt == 0)
+    return kBadInput;
+
+  if (hash_cnt > std::numeric_limits::max())
+    return kBadInput;
+
+  if (data_length == 0 || !absl::has_single_bit(data_length))
+    return kBadInput;
+
+  // probability should be 0 to 1 (probably less than 1)
+  if (!std::isfinite(state.fp_prob) || state.fp_prob <= 0.0 || state.fp_prob >= 1.0)
+    return kBadInput;
+
+  if (state.max_capacity == 0 || state.current_size >= state.max_capacity)
+    return kBadInput;
+
   const size_t payload = data.size() - kDumpFilterMetaSize;
   if (payload > data_length)
-    return SBFLoadResult::kOutOfRange;
+    return kOutOfRange;

   sbf->ApplyStateUpdate(state);
   const uint32_t new_index = sbf->num_filters();
-  // TODO validate variables against bloom invariants (power of two etc)
-  sbf->AddFilter(std::string(data_length, '\0'), hash_cnt);
+  auto* ptr = sbf->AllocateFilter(data_length, hash_cnt);
+  memset(ptr, 0, data_length);

   if (payload > 0)
     memcpy(sbf->filter_data(new_index), data.data() + kDumpFilterMetaSize, payload);

-  return SBFLoadResult::kOk;
+  return kOk;
 }

 SBFLoadResult LoadSBFChunk(int64_t cursor, std::string_view data, SBF* sbf) {
   DCHECK_NE(sbf, nullptr) << "Input ptr must be valid SBF";

-  // TODO on implementing LOADCHUNK there should be closer validation of the data fed into the SBF.
-  // This current implementation is mostly a test helper and proof that the SCANDUMP algorithm is
-  // actually loadable.
-
-  size_t global_offset = cursor - static_cast<int64_t>(data.size()) - 1;
+  const int64_t write_pos = cursor - static_cast<int64_t>(data.size());
+  if (write_pos < 1)
+    return SBFLoadResult::kOutOfRange;
+  size_t global_offset = write_pos - 1;

   for (uint32_t i = 0; i < sbf->num_filters(); ++i) {
     const size_t filter_span = kDumpFilterMetaSize + sbf->data(i).size();
     if (global_offset < filter_span) {
+      // we should never have a write position inside the header. The header is always fully
+      // written.
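+      // Worked example (hypothetical sizes): say filter 0 declared data_length = 128 but
+      // its first chunk carried only 64 payload bytes, occupying stream positions 1..108
+      // (44B meta + 64B data); its span here is 44 + 128 = 172. A 64-byte continuation
+      // then arrives with cursor = 173: write_pos = 173 - 64 = 109, global_offset = 108,
+      // which is < 172 and >= 44, so the bytes land at data offset 108 - 44 = 64.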
if (global_offset < kDumpFilterMetaSize) return SBFLoadResult::kOutOfRange; @@ -448,7 +476,24 @@ SBFLoadResult LoadSBFChunk(int64_t cursor, std::string_view data, SBF* sbf) { if (global_offset != 0) return SBFLoadResult::kOutOfRange; + // global offset is 0, ie ended exactly at the end of the filter. data goes into a new filter. return AddNewFilterToSBF(data, sbf); } +const char* ToString(SBFLoadResult res) { + switch (res) { + case SBFLoadResult::kOk: + return "ok"; + case SBFLoadResult::kBadInput: + return "bad_input"; + case SBFLoadResult::kOutOfRange: + return "out_of_range"; + case SBFLoadResult::kTruncatedInput: + return "truncated_input"; + case SBFLoadResult::kBadVersion: + return "bad_version"; + } + return "unknown"; +} + } // namespace dfly diff --git a/src/core/bloom.h b/src/core/bloom.h index e3bb9598edaa..dcb2f1102245 100644 --- a/src/core/bloom.h +++ b/src/core/bloom.h @@ -22,6 +22,8 @@ enum class SBFLoadResult : uint8_t { kOutOfRange, }; +const char* ToString(SBFLoadResult res); + /// Bloom filter based on the design of https://github.com/jvirkki/libbloom class Bloom { public: @@ -102,14 +104,14 @@ class SBF { SBF(const SBF&) = delete; // C'tor used for loading persisted filters into SBF. - // Should be followed by AddFilter. + // Should be followed by AllocateFilter. SBF(double grow_factor, double fp_prob, size_t max_capacity, size_t prev_size, size_t current_size, PMR_NS::memory_resource* mr); ~SBF(); SBF& operator=(SBF&& src) noexcept; - void AddFilter(const std::string& blob, unsigned hash_cnt); + uint8_t* AllocateFilter(size_t alloc_size, unsigned hash_cnt); bool Add(std::string_view str); bool Exists(std::string_view str) const; @@ -190,6 +192,34 @@ struct SBFChunk { // maximum of 16MiB in size. The first chunk sent back contains only the SBF metadata. Following // chunks contain filter data and a state of the SBF. The loader uses per filter data to update the // SBF as it encounters new filter items. + +/* +SCANDUMP wire output format (all fields little-endian) + + cursor=1 returns the SBF header (12 bytes): + +-------------------+--------------------+ + | version (4B) | grow_factor (8B) | + +-------------------+--------------------+ + + cursor>1 chunks carry filter data. Each filter begins with + 44 bytes of metadata, followed by the raw filter bytes. + A single filter may span multiple chunks. + + First chunk of a filter: + +-----------------+----------------+------------+---------------------+ + | hash_cnt 4B | data_length 8B | fp_prob 8B | max_capacity 8B | + +-----------------+----------------+------------+---------------------+ + | current_size 8B | prev_size 8B | filter bytes (up to 16MiB - 44B) | + +-----------------+----------------+------------ ... -----------------+ + + Continuation chunks (same filter, if >16MiB): + +------------------------ ... -------------------------+ + | filter bytes (up to 16MiB) | + +------------------------ ... -------------------------+ + + cursor=0 signals end of iteration (empty data). 
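+
+  Illustrative sequence for an SBF with one 128-byte filter (hypothetical sizes):
+    (cursor=1,   12B header)
+    (cursor=173, 172B chunk: 44B filter meta + 128B filter bytes)
+    (cursor=0,   empty chunk, iteration done)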
+*/
+
 class SBFDumpIterator {
  public:
  static constexpr uint64_t kMaxChunkSize = 16 * 1024 * 1024;
@@ -203,10 +233,6 @@ class SBFDumpIterator {
   SBFChunk Next();

  private:
-  // Sends the SBF wide header (little endian):
-  // +-------------------+-------------------+
-  // | version (4 bytes) | grow_factor (8B)  |
-  // +-------------------+-------------------+
   std::string SerializeHeader() const;

   // Converts a cursor to the specific filter and the offset inside it
diff --git a/src/core/dash.h b/src/core/dash.h
index 7545544f17ee..a2710122d2b3 100644
--- a/src/core/dash.h
+++ b/src/core/dash.h
@@ -348,7 +348,7 @@ class DashTable : public detail::DashTableBase {
   // Unlike Traverse, TraverseBuckets calls cb once on bucket iterator and not on each entry in
   // bucket. TraverseBuckets is stable during table mutations. It guarantees traversing all buckets
   // that existed at the beginning of traversal.
-  template <typename Cb> Cursor TraverseBuckets(Cursor curs, Cb&& cb);
+  template <typename Cb> Cursor TraverseBuckets(Cursor curs, Cb&& cb, bool visit_empty = false);

   // Traverses over a single bucket in table and calls cb(iterator). The traverse order will be
   // segment by segment over physical backets.
@@ -460,15 +460,16 @@ class DashTable<_Key, _Value, Policy>::Iterator {
   uint32_t seg_id_;
   detail::PhysicalBid bucket_id_;
   uint8_t slot_id_;
+  bool done_;

   friend class DashTable;

   Iterator(Owner* me, uint32_t seg_id, detail::PhysicalBid bid, uint8_t sid)
-      : owner_(me), seg_id_(seg_id), bucket_id_(bid), slot_id_(sid) {
+      : owner_(me), seg_id_(seg_id), bucket_id_(bid), slot_id_(sid), done_(false) {
   }

   Iterator(Owner* me, uint32_t seg_id, detail::PhysicalBid bid)
-      : owner_(me), seg_id_(seg_id), bucket_id_(bid), slot_id_(0) {
+      : owner_(me), seg_id_(seg_id), bucket_id_(bid), slot_id_(0), done_(false) {
     Seek2Occupied();
   }

@@ -486,7 +487,8 @@ class DashTable<_Key, _Value, Policy>::Iterator {
       : owner_(other.owner_),
         seg_id_(other.seg_id_),
         bucket_id_(other.bucket_id_),
-        slot_id_(other.slot_id_) {
+        slot_id_(other.slot_id_),
+        done_(other.done_) {
   }

   // Copy constructor from iterator to bucket_iterator and vice versa.
@@ -495,14 +497,15 @@ class DashTable<_Key, _Value, Policy>::Iterator {
       : owner_(other.owner_),
         seg_id_(other.seg_id_),
         bucket_id_(other.bucket_id_),
-        slot_id_(IsSingleBucket ? 0 : other.slot_id_) {
+        slot_id_(IsSingleBucket ? 0 : other.slot_id_),
+        done_(other.done_) {
     // if this - is a bucket_iterator - we reset slot_id to the first occupied space.
     if constexpr (IsSingleBucket) {
       Seek2Occupied();
     }
   }

-  Iterator() : owner_(nullptr), seg_id_(0), bucket_id_(0), slot_id_(0) {
+  Iterator() : owner_(nullptr), seg_id_(0), bucket_id_(0), slot_id_(0), done_(true) {
   }

   Iterator(const Iterator& other) = default;

@@ -539,7 +542,7 @@ class DashTable<_Key, _Value, Policy>::Iterator {
   // Make it self-contained. Does not need container::end().
bool is_done() const {
-    return owner_ == nullptr;
+    return done_;
   }

   bool IsOccupied() const {
@@ -564,10 +567,11 @@ class DashTable<_Key, _Value, Policy>::Iterator {
   }

   friend bool operator==(const Iterator& lhs, const Iterator& rhs) {
-    if (lhs.owner_ == nullptr && rhs.owner_ == nullptr)
+    if (lhs.done_ && rhs.done_)
       return true;
     return lhs.owner_ == rhs.owner_ && lhs.seg_id_ == rhs.seg_id_ &&
-           lhs.bucket_id_ == rhs.bucket_id_ && lhs.slot_id_ == rhs.slot_id_;
+           lhs.bucket_id_ == rhs.bucket_id_ && lhs.slot_id_ == rhs.slot_id_ &&
+           lhs.done_ == rhs.done_;
   }

   friend bool operator!=(const Iterator& lhs, const Iterator& rhs) {

@@ -649,7 +653,7 @@ struct DashTable<_Key, _Value, Policy>::BucketSet {

 template <typename _Key, typename _Value, typename Policy>
 template <bool IsConst, bool IsSingleBucket>
 void DashTable<_Key, _Value, Policy>::Iterator<IsConst, IsSingleBucket>::Seek2Occupied() {
-  if (owner_ == nullptr)
+  if (done_)
     return;
   assert(seg_id_ < owner_->segment_.size());

@@ -673,7 +677,7 @@ void DashTable<_Key, _Value, Policy>::Iterator::Seek2Oc
       bucket_id_ = slot_id_ = 0;
     }
   }
-  owner_ = nullptr;
+  done_ = true;
 }

 template <typename _Key, typename _Value, typename Policy>

@@ -1164,7 +1168,8 @@ auto DashTable<_Key, _Value, Policy>::AdvanceCursorBucketOrder(Cursor cursor) ->

 template <typename _Key, typename _Value, typename Policy>
 template <typename Cb>
-auto DashTable<_Key, _Value, Policy>::TraverseBuckets(Cursor cursor, Cb&& cb) -> Cursor {
+auto DashTable<_Key, _Value, Policy>::TraverseBuckets(Cursor cursor, Cb&& cb, bool visit_empty)
+    -> Cursor {
   if (SegmentType::OutOfRange(cursor.bucket_id()))  // sanity.
     return Cursor::end();

@@ -1178,7 +1183,7 @@ auto DashTable<_Key, _Value, Policy>::TraverseBuckets(Cursor cursor, Cb&& cb) ->
       assert(s);
       if (bid < s->num_buckets()) {
         const auto& bucket = s->GetBucket(bid);
-        if (bucket.GetBusy()) {  // Invoke callback only if bucket has elements.
+        if (visit_empty || bucket.GetBusy()) {
           cb(BucketIt(sid, bid));
           invoked = true;
         }
diff --git a/src/core/dragonfly_core.cc b/src/core/dragonfly_core.cc
index 1b046ddc3e86..43b3937227f0 100644
--- a/src/core/dragonfly_core.cc
+++ b/src/core/dragonfly_core.cc
@@ -4,11 +4,17 @@

 #include
+#include <ostream>
+
 #include "base/logging.h"
 #include "core/intent_lock.h"

 namespace dfly {

+std::ostream& operator<<(std::ostream& o, const IntentLock& lock) {
+  return o << "{SHARED: " << lock.cnt_[0] << ", EXCLUSIVE: " << lock.cnt_[1] << "}";
+}
+
 const char* IntentLock::ModeName(Mode m) {
   switch (m) {
     case IntentLock::SHARED:
diff --git a/src/core/intent_lock.h b/src/core/intent_lock.h
index 6a565e6bde1d..77e5a3643230 100644
--- a/src/core/intent_lock.h
+++ b/src/core/intent_lock.h
@@ -1,11 +1,11 @@
 // Copyright 2022, DragonflyDB authors. All rights reserved.
 // See LICENSE for licensing terms.
//
+#pragma once
+
 #include
-#include
-
-#pragma once
+#include

 namespace dfly {

@@ -60,9 +60,7 @@ class IntentLock {

   void VerifyDebug();

-  friend std::ostream& operator<<(std::ostream& o, const IntentLock& lock) {
-    return o << "{SHARED: " << lock.cnt_[0] << ", EXCLUSIVE: " << lock.cnt_[1] << "}";
-  }
+  friend std::ostream& operator<<(std::ostream& o, const IntentLock& lock);

  private:
   unsigned cnt_[2] = {0, 0};
diff --git a/src/core/search/ast_expr.h b/src/core/search/ast_expr.h
index da239ba839bb..6bf214dfef8b 100644
--- a/src/core/search/ast_expr.h
+++ b/src/core/search/ast_expr.h
@@ -5,9 +5,8 @@
 #pragma once

 #include
-#include
+#include
 #include
-#include
 #include
 #include
diff --git a/src/core/search/hnsw_alg.h b/src/core/search/hnsw_alg.h
index 4327ebb07417..a9ae39377617 100644
--- a/src/core/search/hnsw_alg.h
+++ b/src/core/search/hnsw_alg.h
@@ -78,6 +78,10 @@ template class HierarchicalNSW : public hnswlib::AlgorithmInte

   bool copy_vector_ = true;

+  // Cached in-memory footprint (bytes) — maintained by the constructor and resizeIndex;
+  // read lock-free by metrics. See memorySize() for what is / isn't counted.
+  std::atomic<size_t> memory_size_{0};
+
   bool allow_replace_deleted_ = false;  // flag to replace deleted elements (marked as deleted) during insertions

@@ -155,6 +159,7 @@ template class HierarchicalNSW : public hnswlib::AlgorithmInte
     size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
     mult_ = 1 / log(1.0 * M_);
     revSize_ = 1.0 / mult_;
+    updateMemorySize();
   }

   ~HierarchicalNSW() {

@@ -689,6 +694,37 @@ template class HierarchicalNSW : public hnswlib::AlgorithmInte
     linkLists_ = linkLists_new;
     max_elements_ = new_max_elements;
+    updateMemorySize();
+  }
+
+  // Approximate in-memory footprint in bytes. Lock-free: reads the cached
+  // capacity-based total plus rough estimates of the two dynamic containers
+  // (label_lookup_, deleted_elements) from their already-atomic counters.
+  // Per-element upper-layer link lists remain uncounted (< 5% of the total).
+  size_t memorySize() const {
+    // Rough std::unordered_map<labeltype, tableint> entry: node (key+value+hash+
+    // next-ptr) plus amortized bucket-slot overhead.
+    constexpr size_t kLabelLookupEntryBytes = sizeof(labeltype) + sizeof(tableint) + 32;
+    // std::unordered_set<tableint> entry, populated only in allow_replace_deleted mode.
+    constexpr size_t kDeletedEntryBytes = sizeof(tableint) + 24;
+    size_t total = memory_size_.load(std::memory_order_relaxed);
+    total += cur_element_count.load(std::memory_order_relaxed) * kLabelLookupEntryBytes;
+    total += num_deleted_.load(std::memory_order_relaxed) * kDeletedEntryBytes;
+    return total;
+  }
+
+  // Recomputes memory_size_ from the current allocation-defining fields.
+  // Must be called whenever max_elements_ changes.
+  void updateMemorySize() {
+    // Per-element costs: level-0 block, linkLists_ pointer slot, element_levels_ entry,
+    // link_list_locks_ mutex; plus the copied-vector block when enabled.
+    size_t per_element = size_data_per_element_ + sizeof(char*) + sizeof(int) + sizeof(std::mutex);
+    if (copy_vector_) {
+      per_element += data_size_;
+    }
+    // label_op_locks_ is a fixed-size shard of mutexes independent of max_elements_.
+ size_t fixed = MAX_LABEL_OPERATION_LOCKS * sizeof(std::mutex); + memory_size_.store(fixed + max_elements_ * per_element, std::memory_order_relaxed); } size_t indexFileSize() const { diff --git a/src/core/search/hnsw_index.cc b/src/core/search/hnsw_index.cc index 6cd660359aed..0e444e7812a3 100644 --- a/src/core/search/hnsw_index.cc +++ b/src/core/search/hnsw_index.cc @@ -132,35 +132,13 @@ struct HnswlibAdapter { HnswIndexMetadata GetMetadata() const { MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock); HnswIndexMetadata metadata; - metadata.max_elements = world_.max_elements_; - metadata.cur_element_count = world_.cur_element_count.load(); - metadata.maxlevel = world_.maxlevel_; metadata.enterpoint_node = world_.enterpoint_node_; return metadata; } - void SetMetadata(const HnswIndexMetadata& metadata) { - MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kWriteLock); - absl::WriterMutexLock resize_lock(&resize_mutex_); - - // SetMetadata is only called during deserialization before the index is used. - // Assert the index is empty to ensure no concurrent operations are possible. - DCHECK_EQ(world_.cur_element_count.load(), 0u) - << "SetMetadata should only be called on an empty index during deserialization"; - - // Runtime check for release builds to prevent silent corruption - if (world_.cur_element_count.load() != 0) { - LOG(ERROR) << "SetMetadata called on non-empty HNSW index with " - << world_.cur_element_count.load() << " elements, ignoring"; - return; - } - - // Pre-allocate capacity based on expected element count, but don't set cur_element_count. - // cur_element_count will be set by RestoreFromNodes when the actual nodes are restored. - if (world_.max_elements_ < metadata.cur_element_count) { - world_.resizeIndex(metadata.cur_element_count); - } - // Note: Don't set cur_element_count here - RestoreFromNodes will set it after restoring nodes. + int GetMaxLevel() const { + MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock); + return world_.maxlevel_; } size_t GetNodeCount() const { @@ -280,13 +258,15 @@ struct HnswlibAdapter { } public: - // Restore HNSW graph structure from serialized nodes with metadata - void RestoreFromNodes(const std::vector& nodes, const HnswIndexMetadata& metadata) { + // Restore HNSW graph structure from serialized nodes with metadata. + // Returns false if the input is inconsistent (e.g. entry point not in node set) — + // caller should fall back to rebuilding the index from the keyspace. + bool RestoreFromNodes(const std::vector& nodes, const HnswIndexMetadata& metadata) { MRMWMutexLock lock(&mrmw_mutex_, MRMWMutex::LockMode::kWriteLock); absl::WriterMutexLock resize_lock(&resize_mutex_); if (nodes.empty()) { - return; + return true; } // RestoreFromNodes is only called during deserialization on a freshly created index. @@ -294,17 +274,25 @@ struct HnswlibAdapter { DCHECK_EQ(world_.cur_element_count.load(), 0u) << "RestoreFromNodes should only be called on an empty index during deserialization"; - // Ensure we have enough capacity. - // Metadata may have been captured before the snapshot read-lock, so - // cur_element_count can be smaller than actual node internal_ids when - // concurrent writes happen. Compute the real requirement from nodes. + // hnswlib pairs enterpoint_node_ with maxlevel_; node levels are immutable after + // creation, so the entry point's level in the serialized set equals the live + // maxlevel at metadata capture. 
max(node.level) would risk OOB reads when a + // concurrent Add raised maxlevel between capture and node serialization. size_t max_internal_id = 0; + int entrypoint_level = -1; for (const auto& node : nodes) { max_internal_id = std::max(max_internal_id, node.internal_id); + if (node.internal_id == metadata.enterpoint_node) + entrypoint_level = node.level; } - size_t required_capacity = std::max(metadata.cur_element_count, max_internal_id + 1); - if (world_.max_elements_ < required_capacity) { - world_.resizeIndex(required_capacity); + if (entrypoint_level < 0) { + LOG(ERROR) << "HNSW restore: entry point internal_id=" << metadata.enterpoint_node + << " not present in serialized node set (" << nodes.size() + << " nodes); skipping restore — index will be rebuilt from the keyspace"; + return false; + } + if (world_.max_elements_ < max_internal_id + 1) { + world_.resizeIndex(max_internal_id + 1); } // Restore each node - directly set up memory and fields @@ -378,12 +366,13 @@ struct HnswlibAdapter { } // Set the metadata for the graph - world_.maxlevel_ = metadata.maxlevel; + world_.maxlevel_ = entrypoint_level; world_.enterpoint_node_ = metadata.enterpoint_node; VLOG(1) << "Restored HNSW index with " << restored_count - << " nodes, maxlevel=" << metadata.maxlevel + << " nodes, maxlevel=" << entrypoint_level << ", enterpoint=" << metadata.enterpoint_node; + return true; } // Update vector data for an existing node (used after RestoreFromNodes). @@ -424,6 +413,10 @@ struct HnswlibAdapter { return MRMWMutexLock(&mrmw_mutex_, MRMWMutex::LockMode::kReadLock); } + size_t GetMemoryUsage() const { + return world_.memorySize(); + } + private: HnswSpace space_; HierarchicalNSW world_; @@ -498,8 +491,8 @@ HnswIndexMetadata HnswVectorIndex::GetMetadata() const { return adapter_->GetMetadata(); } -void HnswVectorIndex::SetMetadata(const HnswIndexMetadata& metadata) { - adapter_->SetMetadata(metadata); +int HnswVectorIndex::GetMaxLevel() const { + return adapter_->GetMaxLevel(); } size_t HnswVectorIndex::GetNodeCount() const { @@ -510,9 +503,9 @@ std::vector HnswVectorIndex::GetNodesRange(size_t start, size_t en return adapter_->GetNodesRange(start, end); } -void HnswVectorIndex::RestoreFromNodes(const std::vector& nodes, +bool HnswVectorIndex::RestoreFromNodes(const std::vector& nodes, const HnswIndexMetadata& metadata) { - adapter_->RestoreFromNodes(nodes, metadata); + return adapter_->RestoreFromNodes(nodes, metadata); } bool HnswVectorIndex::UpdateVectorData(GlobalDocId id, const DocumentAccessor& doc, @@ -542,4 +535,8 @@ MRMWMutexLock HnswVectorIndex::GetReadLock() const { return adapter_->GetReadLock(); } +size_t HnswVectorIndex::GetMemoryUsage() const { + return adapter_->GetMemoryUsage(); +} + } // namespace dfly::search diff --git a/src/core/search/hnsw_index.h b/src/core/search/hnsw_index.h index 55de0d1b5236..55f817351be9 100644 --- a/src/core/search/hnsw_index.h +++ b/src/core/search/hnsw_index.h @@ -11,16 +11,12 @@ namespace dfly::search { -// Metadata structure for HNSW index serialization -// Contains the key parameters needed to restore the index state +// Wire format for HNSW index AUX. Only the entry point is persisted: capacity is +// derived from max(internal_id)+1 in the node set and maxlevel from the entry-point +// node's level (hnswlib pairs enterpoint_node_ with maxlevel_, and node levels are +// immutable after creation). 
struct HnswIndexMetadata { - size_t max_elements = 0; // Maximum number of elements the index can hold - // Note: cur_element_count may be smaller than actual node count during concurrent writes, - // so we compute the real requirement from nodes during restoration. - // TODO: consider removing it from metadata and rely entirely on node data for restoration. - size_t cur_element_count = 0; // Current number of elements in the index - int maxlevel = -1; // Maximum level of the graph - size_t enterpoint_node = 0; // Entry point node for the graph + size_t enterpoint_node = 0; }; // Node data structure for HNSW serialization @@ -75,8 +71,9 @@ class HnswVectorIndex { // Get metadata for serialization HnswIndexMetadata GetMetadata() const; - // Set metadata (used during restoration) - void SetMetadata(const HnswIndexMetadata& metadata); + // Current graph maxlevel_. Exposed for introspection and tests that need to + // verify invariants preserved by RestoreFromNodes (entry point must sit at maxlevel). + int GetMaxLevel() const; // Get total number of nodes in the index size_t GetNodeCount() const; @@ -85,10 +82,12 @@ class HnswVectorIndex { // Returns vector of node data for serialization std::vector GetNodesRange(size_t start, size_t end) const; - // Restore graph structure from serialized nodes with metadata - // This restores the HNSW graph links but NOT the vector data - // Vector data must be populated separately via UpdateVectorData - void RestoreFromNodes(const std::vector& nodes, const HnswIndexMetadata& metadata); + // Restore graph structure from serialized nodes with metadata. + // Restores links only; vector data must be populated separately via UpdateVectorData. + // Returns false if the metadata is inconsistent with the node set (e.g. the entry + // point is missing from the serialized nodes) — caller should then leave the index + // empty and let the higher-level rebuild path repopulate it from the keyspace. + bool RestoreFromNodes(const std::vector& nodes, const HnswIndexMetadata& metadata); // Update vector data for an existing node (used after RestoreFromNodes) // This populates the vector data for a node that already has graph links @@ -98,6 +97,9 @@ class HnswVectorIndex { // Use this during serialization to block concurrent Add/Remove (write) operations. MRMWMutexLock GetReadLock() const; + // Approximate in-memory footprint of this HNSW graph, in bytes. 
+ size_t GetMemoryUsage() const; + private: bool copy_vector_; size_t dim_; diff --git a/src/core/search/scoring.cc b/src/core/search/scoring.cc index 4f14294d30b7..f69f3e405fb7 100644 --- a/src/core/search/scoring.cc +++ b/src/core/search/scoring.cc @@ -6,15 +6,11 @@ namespace dfly::search { -double ScoreDocument(ScorerType scorer, const ScoringContext& ctx, +double ScoreDocument(ScorerFn scorer, const ScoringContext& ctx, const std::vector& terms) { double score = 0.0; - switch (scorer) { - case ScorerType::BM25STD: - for (const auto& term : terms) - score += BM25Std(ctx, term); - break; - } + for (const auto& term : terms) + score += scorer(ctx, term); return score; } diff --git a/src/core/search/scoring.h b/src/core/search/scoring.h index e9e634a5dd07..ec9a2e0ada51 100644 --- a/src/core/search/scoring.h +++ b/src/core/search/scoring.h @@ -17,11 +17,6 @@ namespace dfly::search { class FieldIndices; struct TextIndex; -// Supported scorer types -enum class ScorerType : int { - BM25STD, // Standard Okapi BM25 (default) -}; - // Per-term information needed for scoring a single document struct ScoringTermInfo { uint32_t term_freq = 0; // How many times this term appears in the document @@ -35,6 +30,11 @@ struct ScoringContext { size_t num_docs = 0; // Total documents in index }; +// Scorer function signature: computes the score for a single (term, document) pair. +// Register new scorers by adding a function with this signature and exposing it via +// ParseScorer in the command layer. +using ScorerFn = double (*)(const ScoringContext&, const ScoringTermInfo&); + // Compute BM25STD score for a single term in a document. // // Formula: IDF * f * (k1 + 1) / (f + k1 * (1 - b + b * docLen / avgDocLen)) @@ -63,9 +63,33 @@ inline double BM25Std(const ScoringContext& ctx, const ScoringTermInfo& term) { return idf * tf; } -// Compute BM25STD score for a document matched against multiple terms. -// Returns sum of per-term BM25 scores. -double ScoreDocument(ScorerType scorer, const ScoringContext& ctx, +// Compute TFIDF score for a single term in a document. +// +// Formula: f * IDF +// where IDF = ln(N / n), clamped to be non-negative. +// +// Note: returns 0 when a term appears in every document (N == n, no discriminating power). +// This differs from BM25STD, which adds a "+1" inside the log to keep the score positive. +inline double TfIdf(const ScoringContext& ctx, const ScoringTermInfo& term) { + if (term.term_docs == 0) + return 0.0; + + // Clamp N >= n to avoid negative IDF during transient states + double N = std::max(ctx.num_docs, term.term_docs); + return std::log(N / term.term_docs) * term.term_freq; +} + +// Compute TFIDF with document length normalization for a single term. +// +// Formula: (f * IDF) / fieldDocLen +inline double TfIdfDocNorm(const ScoringContext& ctx, const ScoringTermInfo& term) { + auto d_len = term.field_doc_len == 0 ? 1 : term.field_doc_len; + return TfIdf(ctx, term) / d_len; +} + +// Compute score for a document matched against multiple terms. +// Returns sum of per-term scores produced by the given scorer function. 
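+//
+// Worked example (illustrative numbers): with N = 100 docs and a term present in
+// n = 10 of them, TfIdf gives IDF = ln(100 / 10) ≈ 2.303; a document containing the
+// term f = 3 times scores ≈ 6.908, and with field_doc_len = 20 TfIdfDocNorm yields
+// ≈ 0.345. A multi-term match simply sums these per-term values.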
+double ScoreDocument(ScorerFn scorer, const ScoringContext& ctx, const std::vector& terms); } // namespace dfly::search diff --git a/src/core/search/search.cc b/src/core/search/search.cc index aff562f3bf0b..aaf7d8a20694 100644 --- a/src/core/search/search.cc +++ b/src/core/search/search.cc @@ -115,8 +115,8 @@ struct ProfileBuilder { struct BasicSearch { using LogicOp = AstLogicalNode::LogicOp; - BasicSearch(const FieldIndices* indices, std::optional scorer = std::nullopt) - : indices_{indices}, scorer_type_{scorer} { + BasicSearch(const FieldIndices* indices, ScorerFn scorer = nullptr) + : indices_{indices}, scorer_{scorer} { } void EnableProfiling() { @@ -234,7 +234,7 @@ struct BasicSearch { // Track matched terms for scoring (prefix/suffix/infix expand to multiple terms). // Synonym shadow entries (freq=0) are resolved to their group_id for correct scoring. - if (scorer_type_) { + if (scorer_) { for (auto* index : indices) { auto term_cb = [this, index](string_view term, const auto*) { std::string resolved{term}; @@ -280,7 +280,7 @@ struct BasicSearch { if (!active_field.empty()) { if (auto* index = GetIndex(active_field); index) { - if (scorer_type_) + if (scorer_) AddMatchedTerm(index, term); return IndexResult{index->Matching(term, strip_whitespace)}; } @@ -290,7 +290,7 @@ struct BasicSearch { vector selected_indices = indices_->GetAllTextIndices(); // Track terms for scoring - if (scorer_type_) { + if (scorer_) { for (auto* index : selected_indices) AddMatchedTerm(index, term); } @@ -505,7 +505,7 @@ struct BasicSearch { optional profile = profile_builder_ ? make_optional(profile_builder_->Take()) : nullopt; - if (scorer_type_ && !matched_text_terms_.empty()) { + if (scorer_ && !matched_text_terms_.empty()) { // Score ALL matched docs and return top-K by score (not arbitrary cutoff). auto [out, total_size, text_scores] = TakeScoredTopK(std::move(result), cuttoff_limit); return SearchResult{ @@ -573,7 +573,7 @@ struct BasicSearch { term_infos[t].field_avg_doc_len = cursors[t].index->GetFieldAvgDocLen(); } } - scored.emplace_back(static_cast(ScoreDocument(*scorer_type_, ctx, term_infos)), doc); + scored.emplace_back(static_cast(ScoreDocument(scorer_, ctx, term_infos)), doc); } // Top-K by score (skip sort when no actual cutoff, e.g. 
@@ -602,7 +602,7 @@ struct BasicSearch {
   }

   const FieldIndices* indices_;
-  std::optional<ScorerType> scorer_type_;
+  ScorerFn scorer_ = nullptr;

   string error_;
   optional<ProfileBuilder> profile_builder_ = ProfileBuilder{};
@@ -866,7 +866,7 @@ bool SearchAlgorithm::Init(string_view query, const QueryParams* params,
 SearchResult SearchAlgorithm::Search(const FieldIndices* index, size_t cuttoff_limit) const {
   DCHECK(query_);
-  auto bs = BasicSearch{index, scorer_type_};
+  auto bs = BasicSearch{index, scorer_};
   if (profiling_enabled_)
     bs.EnableProfiling();
   return bs.Search(*query_, cuttoff_limit);
@@ -915,8 +915,8 @@ void SearchAlgorithm::EnableProfiling() {
   profiling_enabled_ = true;
 }

-void SearchAlgorithm::SetScorer(ScorerType type) {
-  scorer_type_ = type;
+void SearchAlgorithm::SetScorer(ScorerFn scorer) {
+  scorer_ = scorer;
 }

 const AstVectorRangeNode* SearchAlgorithm::GetVectorRangeNode() const {
diff --git a/src/core/search/search.h b/src/core/search/search.h
index 1d4f23038edd..0578ead865ef 100644
--- a/src/core/search/search.h
+++ b/src/core/search/search.h
@@ -15,6 +15,7 @@
 #include "base/pmr/memory_resource.h"
 #include "core/search/base.h"
 #include "core/search/range_tree.h"
+#include "core/search/scoring.h"
 #include "core/search/synonyms.h"

 namespace dfly::search {
@@ -209,8 +210,6 @@
 struct KnnScoreSortOption {
   size_t limit = std::numeric_limits<size_t>::max();
 };

-enum class ScorerType : int;
-
 // SearchAlgorithm allows searching field indices with a query
 class SearchAlgorithm {
  public:
@@ -237,11 +236,11 @@
   void EnableProfiling();

-  void SetScorer(ScorerType type);
+  void SetScorer(ScorerFn scorer);

  private:
   bool profiling_enabled_ = false;
-  std::optional<ScorerType> scorer_type_;
+  ScorerFn scorer_ = nullptr;
   std::unique_ptr query_;
   std::optional<KnnScoreSortOption> knn_hnsw_score_sort_option_;
 };
diff --git a/src/core/search/search_test.cc b/src/core/search/search_test.cc
index aeacd41970ca..9c7338d2a44b 100644
--- a/src/core/search/search_test.cc
+++ b/src/core/search/search_test.cc
@@ -1072,6 +1072,30 @@ TEST_F(KnnTest, AutoResize) {
   EXPECT_EQ(indices.GetAllDocs().size(), 100);
 }

+// Seeds the given HNSW index with `n` deterministic random vectors of dim `dim` using
+// the given RNG seed. Returns the owning MockedDocuments so the caller can pass them
+// back to UpdateVectorData after a restore. Used by the serialization/restore tests.
+inline vector<MockedDocument> SeedHnswIndex(HnswVectorIndex& index, size_t n, size_t dim,
+                                            uint32_t rng_seed) {
+  vector<MockedDocument> docs(n);
+  std::mt19937 rng(rng_seed);
+  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+  for (size_t i = 0; i < n; i++) {
+    vector<float> coords(dim);
+    for (size_t d = 0; d < dim; d++)
+      coords[d] = dist(rng);
+    docs[i] = MockedDocument::Map{{"vec", ToBytes(absl::MakeConstSpan(coords))}};
+    index.Add(i, docs[i], "vec");
+  }
+  return docs;
+}
+
+// Snapshots all nodes from the index under its read lock.
+inline auto SnapshotHnswNodes(const HnswVectorIndex& index) {
+  auto lock = index.GetReadLock();
+  return index.GetNodesRange(0, index.GetNodeCount());
+}
+
+// Parameterized HNSW serialization round-trip test.
// Parameters: {num_elements, dim, similarity} struct HnswSerParam { @@ -1108,27 +1132,12 @@ TEST_P(HnswSerializationTest, RoundTrip) { params.hnsw_ef_construction = 200; HnswVectorIndex original(params, /*copy_vector=*/true); + vector docs = SeedHnswIndex(original, num_elements, dim, /*rng_seed=*/42); - std::mt19937 rng(42); - std::uniform_real_distribution dist(0.0f, 1.0f); - vector docs(num_elements); - for (size_t i = 0; i < num_elements; i++) { - vector coords(dim); - for (size_t d = 0; d < dim; d++) - coords[d] = dist(rng); - docs[i] = MockedDocument::Map{{"vec", ToBytes(absl::MakeConstSpan(coords))}}; - original.Add(i, docs[i], "vec"); - } - - // Serialize auto metadata = original.GetMetadata(); - ASSERT_EQ(metadata.cur_element_count, num_elements); + ASSERT_EQ(original.GetNodeCount(), num_elements); - std::vector nodes; - { - auto lock = original.GetReadLock(); - nodes = original.GetNodesRange(0, metadata.cur_element_count); - } + std::vector nodes = SnapshotHnswNodes(original); ASSERT_EQ(nodes.size(), num_elements); // Verify node data integrity @@ -1139,8 +1148,7 @@ TEST_P(HnswSerializationTest, RoundTrip) { // Deserialize into a fresh index HnswVectorIndex restored(params, /*copy_vector=*/true); - restored.SetMetadata(metadata); - restored.RestoreFromNodes(nodes, metadata); + ASSERT_TRUE(restored.RestoreFromNodes(nodes, metadata)); // Before UpdateVectorData, all nodes must be marked deleted. // KNN should safely return empty results (no crash from nullptr dereference). @@ -1153,17 +1161,16 @@ TEST_P(HnswSerializationTest, RoundTrip) { for (size_t i = 0; i < num_elements; i++) restored.UpdateVectorData(i, docs[i], "vec"); - // Metadata must match auto rm = restored.GetMetadata(); - EXPECT_EQ(rm.cur_element_count, metadata.cur_element_count); - EXPECT_EQ(rm.maxlevel, metadata.maxlevel); + EXPECT_EQ(restored.GetNodeCount(), num_elements); EXPECT_EQ(rm.enterpoint_node, metadata.enterpoint_node); + EXPECT_EQ(restored.GetMaxLevel(), original.GetMaxLevel()); // Graph links must be identical std::vector restored_nodes; { auto lock = restored.GetReadLock(); - restored_nodes = restored.GetNodesRange(0, rm.cur_element_count); + restored_nodes = restored.GetNodesRange(0, restored.GetNodeCount()); } ASSERT_EQ(restored_nodes.size(), nodes.size()); for (size_t i = 0; i < nodes.size(); i++) { @@ -1209,6 +1216,76 @@ TEST_P(HnswSerializationTest, RoundTrip) { } } +// Regression for the save-side race where an Add raises maxlevel between metadata +// capture and node serialization (see RestoreFromNodes for the rationale). Simulated +// by forging metadata with a low-level entry point against a multi-level node set; +// expects maxlevel_ to clamp to the entry point's level rather than max(node.level). 
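RestoreFromNodes itself is not part of this excerpt, so the invariant the comment above describes can only be sketched. A hypothetical shape of the validation, with stand-in types (NodeData, Metadata and the optional return are assumptions; only maxlevel_'s clamp-to-entry-point behavior and the fail-gracefully contract come from this diff):

    #include <cstdint>
    #include <optional>
    #include <vector>

    struct NodeData {            // stand-in for a serialized HNSW node record
      uint32_t internal_id;
      int level;
    };
    struct Metadata {            // stand-in for HnswIndexMetadata
      uint32_t enterpoint_node;
    };

    // Returns the restored max level, or nullopt when the metadata does not match
    // the node set (mirroring RestoreFromNodes returning false).
    std::optional<int> RestoreMaxLevel(const std::vector<NodeData>& nodes, Metadata meta) {
      for (const NodeData& n : nodes) {
        if (n.internal_id == meta.enterpoint_node)
          return n.level;  // clamp to the entry point's level, NOT max(node.level)
      }
      return std::nullopt;  // entry point absent: fail gracefully, caller rebuilds
    }

The two tests that follow pin down both branches of this contract.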
+TEST(HnswRestoreInvariant, MaxLevelClampedToEntryPointLevel) { + constexpr size_t kDim = 8; + constexpr size_t kN = 100; + + InitTLSearchMR(PMR_NS::get_default_resource()); + absl::Cleanup cleanup = [] { InitTLSearchMR(nullptr); }; + + SchemaField::VectorParams params; + params.use_hnsw = true; + params.dim = kDim; + params.sim = VectorSimilarity::L2; + params.capacity = kN; + params.hnsw_m = 16; + params.hnsw_ef_construction = 200; + + HnswVectorIndex original(params, /*copy_vector=*/true); + SeedHnswIndex(original, kN, kDim, /*rng_seed=*/42); + std::vector nodes = SnapshotHnswNodes(original); + + int global_max_level = -1; + std::optional low_level_internal_id; + for (const auto& n : nodes) { + global_max_level = std::max(global_max_level, n.level); + if (!low_level_internal_id && n.level == 0) + low_level_internal_id = n.internal_id; + } + ASSERT_GT(global_max_level, 0) << "test setup: need a multi-level graph"; + ASSERT_TRUE(low_level_internal_id.has_value()) << "test setup: need a level-0 node"; + + HnswIndexMetadata forged_metadata{.enterpoint_node = *low_level_internal_id}; + + HnswVectorIndex restored(params, /*copy_vector=*/true); + ASSERT_TRUE(restored.RestoreFromNodes(nodes, forged_metadata)); + + EXPECT_EQ(restored.GetMaxLevel(), 0) + << "maxlevel_ must equal entry-point level; got " << restored.GetMaxLevel() + << " while node set max level=" << global_max_level; +} + +// Malformed/mismatched metadata (entry point not in serialized node set) must +// fail restoration gracefully — returning false — instead of SIGABRT'ing via +// CHECK. Callers then rebuild the index from the keyspace. +TEST(HnswRestoreInvariant, MissingEntrypointFailsGracefully) { + constexpr size_t kDim = 4; + constexpr size_t kN = 10; + + InitTLSearchMR(PMR_NS::get_default_resource()); + absl::Cleanup cleanup = [] { InitTLSearchMR(nullptr); }; + + SchemaField::VectorParams params; + params.use_hnsw = true; + params.dim = kDim; + params.sim = VectorSimilarity::L2; + params.capacity = kN; + params.hnsw_m = 16; + params.hnsw_ef_construction = 200; + + HnswVectorIndex original(params, /*copy_vector=*/true); + SeedHnswIndex(original, kN, kDim, /*rng_seed=*/7); + std::vector nodes = SnapshotHnswNodes(original); + + HnswIndexMetadata bad_metadata{.enterpoint_node = 999999}; // well past any real id + HnswVectorIndex restored(params, /*copy_vector=*/true); + EXPECT_FALSE(restored.RestoreFromNodes(nodes, bad_metadata)); +} + // Regression: in borrowed mode (copy_vector=false), Remove marks the node deleted // but hnswlib still traverses it and dereferences its data pointer. If the external // data is freed (as happens after DEL), the pointer dangles. 
The fix in DoRemove @@ -2838,12 +2915,65 @@ TEST_F(ScoringTest, BM25StdMultiTerm) { ScoringTermInfo t2{ .term_freq = 1, .term_docs = 20, .field_doc_len = 10, .field_avg_doc_len = 10.0}; - double multi = ScoreDocument(ScorerType::BM25STD, ctx, {t1, t2}); + double multi = ScoreDocument(&BM25Std, ctx, {t1, t2}); double sum = BM25Std(ctx, t1) + BM25Std(ctx, t2); EXPECT_DOUBLE_EQ(multi, sum); } +TEST_F(ScoringTest, TfIdfFormula) { + // f=2, N=10, n=3 + // IDF = ln(10/3) ~ 1.2039 + // score = 2 * 1.2039 ~ 2.4079 + ScoringContext ctx{.num_docs = 10}; + ScoringTermInfo term{.term_freq = 2, .term_docs = 3}; + + EXPECT_NEAR(TfIdf(ctx, term), 2.4079, 0.01); +} + +TEST_F(ScoringTest, TfIdfZeroFreq) { + ScoringContext ctx{.num_docs = 10}; + ScoringTermInfo term{.term_freq = 0, .term_docs = 3}; + + EXPECT_EQ(TfIdf(ctx, term), 0.0); +} + +TEST_F(ScoringTest, TfIdfRareTermHigherScore) { + // Same TF, but rare term (small n) should score higher than common term (large n) + ScoringContext ctx{.num_docs = 100}; + ScoringTermInfo rare{.term_freq = 1, .term_docs = 2}; + ScoringTermInfo common{.term_freq = 1, .term_docs = 50}; + + EXPECT_GT(TfIdf(ctx, rare), TfIdf(ctx, common)); +} + +TEST_F(ScoringTest, TfIdfDocNormShorterDocScoresHigher) { + // Same TF/IDF, but shorter doc should score higher after length normalization + ScoringContext ctx{.num_docs = 10}; + ScoringTermInfo short_doc{.term_freq = 1, .term_docs = 3, .field_doc_len = 5}; + ScoringTermInfo long_doc{.term_freq = 1, .term_docs = 3, .field_doc_len = 50}; + + EXPECT_GT(TfIdfDocNorm(ctx, short_doc), TfIdfDocNorm(ctx, long_doc)); +} + +TEST_F(ScoringTest, TfIdfDocNormZeroDocLen) { + // field_doc_len = 0 should not cause division by zero — falls back to unnormalized score + ScoringContext ctx{.num_docs = 10}; + ScoringTermInfo term{.term_freq = 1, .term_docs = 3, .field_doc_len = 0}; + + EXPECT_EQ(TfIdfDocNorm(ctx, term), TfIdf(ctx, term)); +} + +TEST_F(ScoringTest, ScoreDocumentDispatchesByScorerType) { + ScoringContext ctx{.num_docs = 10}; + ScoringTermInfo term{ + .term_freq = 2, .term_docs = 3, .field_doc_len = 5, .field_avg_doc_len = 5.0}; + + EXPECT_DOUBLE_EQ(ScoreDocument(&BM25Std, ctx, {term}), BM25Std(ctx, term)); + EXPECT_DOUBLE_EQ(ScoreDocument(&TfIdf, ctx, {term}), TfIdf(ctx, term)); + EXPECT_DOUBLE_EQ(ScoreDocument(&TfIdfDocNorm, ctx, {term}), TfIdfDocNorm(ctx, term)); +} + TEST_F(ScoringTest, SearchWithScorer) { // Integration test: build index, search with scorer, verify scores are non-zero Schema schema = MakeSimpleSchema({{"field", SchemaField::TEXT}}); @@ -2861,7 +2991,7 @@ TEST_F(ScoringTest, SearchWithScorer) { QueryParams params; SearchAlgorithm algo; ASSERT_TRUE(algo.Init("hello", ¶ms)); - algo.SetScorer(ScorerType::BM25STD); + algo.SetScorer(&BM25Std); auto result = algo.Search(&index); @@ -2901,7 +3031,7 @@ TEST_F(ScoringTest, SearchPrefixWithScorer) { QueryParams params; SearchAlgorithm algo; ASSERT_TRUE(algo.Init("hel*", ¶ms)); - algo.SetScorer(ScorerType::BM25STD); + algo.SetScorer(&BM25Std); auto result = algo.Search(&index); @@ -3008,7 +3138,7 @@ TEST_F(ScoringTest, BM25StdAfterDocRemoval) { QueryParams params; SearchAlgorithm algo; ASSERT_TRUE(algo.Init("hello", ¶ms)); - algo.SetScorer(ScorerType::BM25STD); + algo.SetScorer(&BM25Std); auto result_before = algo.Search(&index); ASSERT_EQ(result_before.ids.size(), 3u); @@ -3020,7 +3150,7 @@ TEST_F(ScoringTest, BM25StdAfterDocRemoval) { // Re-search SearchAlgorithm algo2; ASSERT_TRUE(algo2.Init("hello", ¶ms)); - algo2.SetScorer(ScorerType::BM25STD); + 
algo2.SetScorer(&BM25Std);
   auto result_after = algo2.Search(&index);

   ASSERT_EQ(result_after.ids.size(), 2u);
@@ -3055,7 +3185,7 @@ TEST_F(ScoringTest, ScorerTopKCutoff) {
   QueryParams params;
   SearchAlgorithm algo;
   ASSERT_TRUE(algo.Init("hello", &params));
-  algo.SetScorer(ScorerType::BM25STD);
+  algo.SetScorer(&BM25Std);

   // Request only top 3 - should return docs 9, 8, 7 (highest TF)
   auto result = algo.Search(&index, 3);
diff --git a/src/core/sharded_hash_map.h b/src/core/sharded_hash_map.h
new file mode 100644
index 000000000000..bebe13e93096
--- /dev/null
+++ b/src/core/sharded_hash_map.h
@@ -0,0 +1,218 @@
+// Copyright 2026, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+#include <absl/hash/hash.h>
+
+#include <array>
+#include <mutex>
+#include <shared_mutex>
+#include <utility>
+
+#include "base/logging.h"
+#include "util/fibers/synchronization.h"
+
+namespace dfly {
+
+// Thread-safe hash map sharded into NUM_SHARDS independent shards.
+//
+// Each shard contains an absl::flat_hash_map protected by two fiber-aware locks:
+//  - write_mu_ (Mutex): serializes writers. Only one writer can modify the shard at a time.
+//  - read_mu_ (SharedMutex): guards readers. Acquired in shared mode for reads (FindIf,
+//    ForEachShared, SizeApproximate) and in exclusive mode when a writer needs to commit
+//    changes that must be visible atomically to readers.
+//
+// The two-lock design allows multiple concurrent readers on a shard while a single writer
+// prepares its mutation (holding only write_mu_). The writer then briefly acquires read_mu_
+// exclusively to publish the change, minimizing the window during which readers are blocked.
+//
+// Shard selection is determined by hashing the key with Hash (default: absl::Hash<K>) and
+// taking modulo NUM_SHARDS. Both Hash and Eq are forwarded to the underlying
+// absl::flat_hash_map, so a custom Hash can be supplied as the fourth template argument and
+// a custom equality as the fifth. To enable heterogeneous lookup (e.g. finding a std::string
+// key via std::string_view), both Hash and Eq must be transparent. absl::Hash is NOT
+// transparent — its operator() only accepts const K&. Supply a custom hash that declares
+// is_transparent and accepts all query types (e.g. std::string_view for string keys), paired
+// with std::equal_to<> as Eq. Without both being transparent, heterogeneous lookups will
+// not compile or will silently fall back to non-heterogeneous comparison.
+//
+// Thread safety guarantees:
+//  - Concurrent reads on the same shard are safe (shared read_mu_).
+//  - Concurrent writes to different shards are safe (independent locks).
+//  - A write and a read on the same shard are safe (write_mu_ + exclusive read_mu_).
+//  - Concurrent writes to the same shard are serialized by write_mu_.
+//
+// Re-entrancy: callbacks passed to FindIf, ForEachShared, ForEachExclusive, and
+// WithReadExclusiveLock are invoked while one or more shard locks are held. Calling any
+// ShardedHashMap method that would re-acquire the same lock on the same shard from within
+// a callback will deadlock.
+//
+template <typename K, typename V, size_t NUM_SHARDS = 16, typename Hash = absl::Hash<K>,
+          typename Eq = std::equal_to<>>
+class ShardedHashMap {
+  static_assert(NUM_SHARDS > 0, "NUM_SHARDS must be greater than 0");
+  using InternalMap = absl::flat_hash_map<K, V, Hash, Eq>;
+
+ public:
+  static constexpr size_t kNumShards = NUM_SHARDS;
+
+  // Tag type to disambiguate shard-index Mutate(ShardId{idx}, ...) from key-based
+  // Mutate(key, ...).
+  struct ShardId {
+    size_t value;
+    explicit ShardId(size_t v) : value(v) {
+    }
+  };
+
+  // Returned by the AcquireReaderLock callable passed to Mutate(). Holds an exclusive lock on
+  // read_mu_ for the duration of its lifetime and exposes a mutable reference to the shard
+  // map. Mutations must be performed through LockedMap::map to guarantee that no reader
+  // observes a partial update.
+  struct LockedMap {
+    std::unique_lock<util::fb2::SharedMutex> lock;
+    InternalMap& map;
+  };
+
+  // Looks up `key` under a shared read lock on its shard. If found, invokes f(const V&)
+  // with the mapped value while still holding the lock, then returns true.
+  // Returns false if the key is not present. The callback must not modify the value.
+  //
+  // The template parameter Q allows heterogeneous lookup — any type hashable via
+  // Hash and comparable against K can be used.
+  template <typename Q, typename F> bool FindIf(const Q& key, F&& f) const {
+    const Shard& shard = shards_[ShardOf(key)];
+    std::shared_lock read_lock(shard.read_mu_);
+    auto it = shard.map_.find(key);
+    if (it == shard.map_.end()) {
+      return false;
+    }
+    std::forward<F>(f)(it->second);
+    return true;
+  }
+
+  // Iterates over all entries across every shard, invoking f(const K&, const V&) for each.
+  // Each shard's read_mu_ is acquired in shared mode independently — the iteration is NOT
+  // a global snapshot, so entries may be added or removed in other shards concurrently.
+  // Suitable for building approximate views or collecting statistics.
+  template <typename F> void ForEachShared(F&& f) const {
+    for (const Shard& shard : shards_) {
+      std::shared_lock read_lock(shard.read_mu_);
+      for (const auto& [k, v] : shard.map_) {
+        f(k, v);
+      }
+    }
+  }
+
+  // Iterates over all entries with full exclusive access, invoking f(const K&, V&) for each.
+  // Both write_mu_ and read_mu_ are held exclusively per shard, so no concurrent readers
+  // or writers can access the shard during iteration. This is the heaviest locking mode —
+  // use it only when entries must be mutated in-place or when a consistent per-shard view
+  // is required. Note: like ForEachShared, this is still not a global snapshot across shards.
+  template <typename F> void ForEachExclusive(F&& f) {
+    for (Shard& shard : shards_) {
+      std::unique_lock write_lock{shard.write_mu_};
+      std::unique_lock reader_lock{shard.read_mu_};
+      for (auto& [k, v] : shard.map_) {
+        f(k, v);
+      }
+    }
+  }
+
+  // Primary mutation interface. Acquires write_mu_ exclusively on the shard that owns `key`,
+  // then invokes f(const InternalMap& map, auto AcquireReaderLock).
+  //
+  // The callback receives:
+  //  - map: a const reference to the shard's underlying absl::flat_hash_map. The caller
+  //    may inspect data while only write_mu_ is held (readers still proceed).
+  //  - AcquireReaderLock: a callable that returns LockedMap, which holds an exclusive lock
+  //    on read_mu_ and a mutable InternalMap& reference. Mutations must go through
+  //    LockedMap::map only — this ensures no reader observes a partial update.
+  //
+  // Do not hold multiple LockedMap instances simultaneously within the callback — read_mu_ is
+  // non-recursive, so acquiring it twice will deadlock. Calling lock_readers() more than once
+  // is safe only if the previous LockedMap has gone out of scope first.
+  //
+  // Typical usage pattern:
+  //   map.Mutate(key, [&](const auto& m, auto lock_readers) {
+  //     /* optionally inspect m (const) without blocking readers */
+  //     auto lm = lock_readers();
+  //     lm.map[key] = new_value;  // now no reader sees a partial update
+  //   });
+  //
+  // The template parameter Q allows heterogeneous lookup — any type hashable via
+  // Hash and comparable against K can be used.
+  template <typename Q, typename F> void Mutate(const Q& key, F&& f) {
+    Shard& shard = shards_[ShardOf(key)];
+    std::unique_lock write_lock{shard.write_mu_};
+    std::forward<F>(f)(static_cast<const InternalMap&>(shard.map_), [&shard]() -> LockedMap {
+      return {std::unique_lock{shard.read_mu_}, shard.map_};
+    });
+  }
+
+  // Shard-index overload of Mutate. Same semantics as Mutate(key, f) but addresses the
+  // shard directly by its index `sid` (0 <= sid < NUM_SHARDS). Useful when the caller has
+  // already computed the shard via ShardOf() or needs to batch multiple keys that map to
+  // the same shard under a single lock acquisition. The same lock_readers() re-entrancy
+  // restriction applies: do not hold two LockedMap instances at the same time.
+  template <typename F> void Mutate(ShardId sid, F&& f) {
+    DCHECK_LT(sid.value, NUM_SHARDS);
+    Shard& shard = shards_[sid.value];
+    std::unique_lock write_lock{shard.write_mu_};
+    std::forward<F>(f)(static_cast<const InternalMap&>(shard.map_), [&shard]() -> LockedMap {
+      return {std::unique_lock{shard.read_mu_}, shard.map_};
+    });
+  }
+
+  // Returns the shard index (0 .. NUM_SHARDS-1) that `key` maps to. Can be used to
+  // pre-compute the shard for later use with the shard-index overloads of Mutate() or
+  // WithReadExclusiveLock(), or to group operations on keys that share a shard.
+  template <typename Q> size_t ShardOf(const Q& key) const {
+    return Hash{}(key) % NUM_SHARDS;
+  }
+
+  // Acquires read_mu_ exclusively on the shard that owns `key`, blocking all concurrent
+  // readers (FindIf, ForEachShared, SizeApproximate) on that shard, then invokes f(). The
+  // write_mu_ is NOT acquired, so this does not serialize against other writers. Use this
+  // when you need to perform an external side-effect that must not race with readers of
+  // this shard but the map itself is not being modified.
+  //
+  // The template parameter Q allows heterogeneous lookup — any type hashable via
+  // Hash and comparable against K can be used.
+  template <typename Q, typename F> void WithReadExclusiveLock(const Q& key, F&& f) {
+    Shard& shard = shards_[ShardOf(key)];
+    std::unique_lock l{shard.read_mu_};
+    std::forward<F>(f)();
+  }
+
+  // Shard-index overload of WithReadExclusiveLock. Same semantics but addresses the shard
+  // directly by its index `sid` (0 <= sid < NUM_SHARDS).
+  template <typename F> void WithReadExclusiveLock(ShardId sid, F&& f) {
+    DCHECK_LT(sid.value, NUM_SHARDS);
+    std::unique_lock l{shards_[sid.value].read_mu_};
+    std::forward<F>(f)();
+  }
+
+  // Returns the approximate total number of entries across all shards. Each shard's
+  // read_mu_ is acquired in shared mode independently and its size accumulated.
+  size_t SizeApproximate() const {
+    size_t total = 0;
+    for (const Shard& shard : shards_) {
+      std::shared_lock read_lock{shard.read_mu_};
+      total += shard.map_.size();
+    }
+    return total;
+  }
+
+ private:
+  // Aligned to cache line.
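+  // Each Shard deliberately occupies its own (typically 64-byte) cache line. Without the
+  // alignas, two shards could share a line, and lock traffic on one shard would keep
+  // invalidating that line for readers of its neighbor (false sharing), partially
+  // defeating the independent-shard design.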
+ struct alignas(64) Shard { + util::fb2::Mutex write_mu_; + mutable util::fb2::SharedMutex read_mu_; + InternalMap map_; + }; + + std::array shards_; +}; + +} // namespace dfly diff --git a/src/core/sharded_hash_map_test.cc b/src/core/sharded_hash_map_test.cc new file mode 100644 index 000000000000..8b0de8d46cd5 --- /dev/null +++ b/src/core/sharded_hash_map_test.cc @@ -0,0 +1,292 @@ +// Copyright 2026, DragonflyDB authors. All rights reserved. +// See LICENSE for licensing terms. +// + +#include "core/sharded_hash_map.h" + +#include + +#include +#include +#include + +#include "base/gtest.h" +#include "base/logging.h" +#include "util/fibers/fibers.h" +#include "util/fibers/synchronization.h" + +namespace dfly { + +using namespace std; + +// Transparent hash for string-like types. absl::Hash is not transparent (its +// operator() only accepts const string&), so heterogeneous lookup requires a custom hash +// that declares is_transparent and accepts string_view (which string and const char* both +// convert to). absl guarantees that hashing equal string contents produces the same value +// regardless of the concrete string type, so this is consistent with the stored keys. +struct TransparentStringHash { + using is_transparent = void; + size_t operator()(std::string_view sv) const { + return absl::Hash{}(sv); + } +}; + +class ShardedHashMapTest : public testing::Test { + protected: + ShardedHashMap map_; +}; + +TEST_F(ShardedHashMapTest, EmptyMap) { + EXPECT_EQ(map_.SizeApproximate(), 0u); + + bool found = map_.FindIf(string("missing"), [](const int&) {}); + EXPECT_FALSE(found); +} + +TEST_F(ShardedHashMapTest, MutateInsertAndFind) { + map_.Mutate(string("key1"), [](const auto& m, auto lock_readers) { + auto lm = lock_readers(); + lm.map["key1"] = 42; + }); + + EXPECT_EQ(map_.SizeApproximate(), 1u); + + bool found = map_.FindIf(string("key1"), [](const int& v) { EXPECT_EQ(v, 42); }); + EXPECT_TRUE(found); +} + +TEST_F(ShardedHashMapTest, MutateOverwrite) { + map_.Mutate(string("key1"), [](const auto& m, auto lock_readers) { + auto lm = lock_readers(); + lm.map["key1"] = 10; + }); + + map_.Mutate(string("key1"), [](const auto& m, auto lock_readers) { + auto lm = lock_readers(); + lm.map["key1"] = 20; + }); + + EXPECT_TRUE(map_.FindIf(string("key1"), [](const int& v) { EXPECT_EQ(v, 20); })); + EXPECT_EQ(map_.SizeApproximate(), 1u); +} + +TEST_F(ShardedHashMapTest, MutateErase) { + map_.Mutate(string("key1"), [](const auto& m, auto lock_readers) { + auto lm = lock_readers(); + lm.map["key1"] = 1; + }); + EXPECT_EQ(map_.SizeApproximate(), 1u); + + map_.Mutate(string("key1"), [](const auto& m, auto lock_readers) { + auto lm = lock_readers(); + lm.map.erase("key1"); + }); + EXPECT_EQ(map_.SizeApproximate(), 0u); + + EXPECT_FALSE(map_.FindIf(string("key1"), [](const int&) {})); +} + +TEST_F(ShardedHashMapTest, FindIfReturnsFalseForMissing) { + map_.Mutate(string("a"), [](const auto& m, auto lock_readers) { + auto lm = lock_readers(); + lm.map["a"] = 1; + }); + + EXPECT_FALSE(map_.FindIf(string("b"), [](const int&) {})); +} + +TEST_F(ShardedHashMapTest, MultipleKeys) { + for (int i = 0; i < 100; ++i) { + string key = "key" + to_string(i); + map_.Mutate(key, [&key, i](const auto& m, auto lock_readers) { + auto lm = lock_readers(); + lm.map[key] = i; + }); + } + + EXPECT_EQ(map_.SizeApproximate(), 100u); + + for (int i = 0; i < 100; ++i) { + string key = "key" + to_string(i); + bool found = map_.FindIf(key, [i](const int& v) { EXPECT_EQ(v, i); }); + EXPECT_TRUE(found); + } +} + 
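The tests above exercise the API key by key; the pattern the two-lock design is really built for is a write whose expensive part runs while readers are still being served. A sketch (ExpensiveTransform is a hypothetical stand-in for any slow computation such as serialization or compression):

    #include <string>
    #include <utility>

    #include "core/sharded_hash_map.h"

    std::string ExpensiveTransform(const std::string& raw);  // hypothetical helper

    void InsertIfAbsent(dfly::ShardedHashMap<std::string, std::string>& cache,
                        const std::string& key, const std::string& raw) {
      cache.Mutate(key, [&](const auto& map, auto lock_readers) {
        // Phase 1: only write_mu_ is held. Concurrent FindIf on this shard still
        // proceeds; other writers to this shard queue behind write_mu_.
        if (map.contains(key))
          return;
        std::string value = ExpensiveTransform(raw);  // slow work, readers not blocked

        // Phase 2: brief exclusive window. Readers block only for the insertion itself.
        auto lm = lock_readers();
        lm.map.emplace(key, std::move(value));
      });
    }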
+TEST_F(ShardedHashMapTest, HeterogeneousLookup) { + // Use transparent Eq so that string_view / C-string queries compile and match correctly. + ShardedHashMap> hmap; + + hmap.Mutate(string("hello"), [](const auto& m, auto lr) { + auto lm = lr(); + lm.map["hello"] = 7; + }); + + string_view sv = "hello"; + bool found = hmap.FindIf(sv, [](const int& v) { EXPECT_EQ(v, 7); }); + EXPECT_TRUE(found); + + const char* cstr = "hello"; + found = hmap.FindIf(cstr, [](const int& v) { EXPECT_EQ(v, 7); }); + EXPECT_TRUE(found); + + EXPECT_FALSE(hmap.FindIf(string_view{"missing"}, [](const int&) {})); +} + +TEST_F(ShardedHashMapTest, ShardOf) { + // ShardOf should be deterministic and within range. + string key = "test_key"; + size_t shard = map_.ShardOf(key); + EXPECT_LT(shard, map_.kNumShards); + // Same key always maps to same shard. + EXPECT_EQ(shard, map_.ShardOf(key)); +} + +TEST_F(ShardedHashMapTest, MutateByShard) { + string key = "key1"; + size_t sid = map_.ShardOf(key); + + map_.Mutate(ShardedHashMap::ShardId{sid}, [&key](const auto& m, auto lock_readers) { + auto lm = lock_readers(); + lm.map[key] = 99; + }); + + bool found = map_.FindIf(key, [](const int& v) { EXPECT_EQ(v, 99); }); + EXPECT_TRUE(found); +} + +TEST_F(ShardedHashMapTest, ForEachShared) { + map_.Mutate(string("a"), [](const auto& m, auto lr) { + auto lm = lr(); + lm.map["a"] = 1; + }); + map_.Mutate(string("b"), [](const auto& m, auto lr) { + auto lm = lr(); + lm.map["b"] = 2; + }); + + int sum = 0; + map_.ForEachShared([&sum](const string&, const int& v) { sum += v; }); + EXPECT_EQ(sum, 3); +} + +TEST_F(ShardedHashMapTest, ForEachExclusive) { + map_.Mutate(string("x"), [](const auto& m, auto lr) { + auto lm = lr(); + lm.map["x"] = 10; + }); + map_.Mutate(string("y"), [](const auto& m, auto lr) { + auto lm = lr(); + lm.map["y"] = 20; + }); + + // Double all values via exclusive iteration. + map_.ForEachExclusive([](const string&, int& v) { v *= 2; }); + + EXPECT_TRUE(map_.FindIf(string("x"), [](const int& v) { EXPECT_EQ(v, 20); })); + EXPECT_TRUE(map_.FindIf(string("y"), [](const int& v) { EXPECT_EQ(v, 40); })); +} + +TEST_F(ShardedHashMapTest, WithReadExclusiveLockByKey) { + map_.Mutate(string("k"), [](const auto& m, auto lr) { + auto lm = lr(); + lm.map["k"] = 5; + }); + + bool executed = false; + map_.WithReadExclusiveLock(string("k"), [&executed]() { executed = true; }); + EXPECT_TRUE(executed); +} + +TEST_F(ShardedHashMapTest, WithReadExclusiveLockByShard) { + bool executed = false; + map_.WithReadExclusiveLock(ShardedHashMap::ShardId{0}, + [&executed]() { executed = true; }); + EXPECT_TRUE(executed); +} + +TEST_F(ShardedHashMapTest, ConcurrentReadersAndWriter) { + // Insert initial data. + for (int i = 0; i < 50; ++i) { + string key = "key" + to_string(i); + map_.Mutate(key, [&key, i](const auto& m, auto lr) { + auto lm = lr(); + lm.map[key] = i; + }); + } + + constexpr int kReaders = 4; + constexpr int kReadsPerFiber = 200; + + util::fb2::Barrier barrier(kReaders + 1); // +1 for writer fiber + vector fibers; + + // Launch reader fibers. + for (int r = 0; r < kReaders; ++r) { + fibers.emplace_back("reader", [&] { + barrier.Wait(); + for (int j = 0; j < kReadsPerFiber; ++j) { + string key = "key" + to_string(j % 50); + map_.FindIf(key, [](const int&) {}); + } + }); + } + + // Launch writer fiber. 
+ fibers.emplace_back("writer", [&] { + barrier.Wait(); + for (int i = 50; i < 100; ++i) { + string key = "key" + to_string(i); + map_.Mutate(key, [&key, i](const auto& m, auto lr) { + auto lm = lr(); + lm.map[key] = i; + }); + } + }); + + for (auto& fb : fibers) { + fb.Join(); + } + + EXPECT_EQ(map_.SizeApproximate(), 100u); +} + +TEST_F(ShardedHashMapTest, ConcurrentWriters) { + constexpr int kWriters = 4; + constexpr int kKeysPerWriter = 50; + + vector fibers; + util::fb2::Barrier barrier(kWriters); + + for (int w = 0; w < kWriters; ++w) { + fibers.emplace_back("writer", [&, w] { + barrier.Wait(); + for (int i = 0; i < kKeysPerWriter; ++i) { + // Each writer writes to its own key space to avoid contention on values. + string key = "w" + to_string(w) + "_k" + to_string(i); + map_.Mutate(key, [&key, val = w * 1000 + i](const auto& m, auto lr) { + auto lm = lr(); + lm.map[key] = val; + }); + } + }); + } + + for (auto& fb : fibers) { + fb.Join(); + } + + EXPECT_EQ(map_.SizeApproximate(), kWriters * kKeysPerWriter); + + // Verify all values. + for (int w = 0; w < kWriters; ++w) { + for (int i = 0; i < kKeysPerWriter; ++i) { + string key = "w" + to_string(w) + "_k" + to_string(i); + int expected = w * 1000 + i; + bool found = map_.FindIf(key, [expected](const int& v) { EXPECT_EQ(v, expected); }); + EXPECT_TRUE(found) << "missing key: " << key; + } + } +} + +} // namespace dfly diff --git a/src/facade/cmd_arg_parser.h b/src/facade/cmd_arg_parser.h index e1ac88450a78..6073adb0a7db 100644 --- a/src/facade/cmd_arg_parser.h +++ b/src/facade/cmd_arg_parser.h @@ -7,15 +7,65 @@ #include #include +#include #include #include +#include +#include #include #include "facade/facade_types.h" namespace facade { -// Helper class for numerical range restriction during parsing +// CmdArgParser — utility for parsing command option lists. +// +// Reading individual args: +// CmdArgParser parser(args); +// auto key = parser.Next(); // read one arg by type +// auto [src, dst] = parser.Next(); // read several at once (tuple) +// auto db = parser.Next>(); // range-restricted int +// // (INVALID_INT if out of range) +// auto count = parser.NextOrDefault(10); // read optional with default +// +// Tag matching: +// parser.ExpectTag("LOAD"); // required literal keyword +// if (parser.Check("NX")) { ... } // consume tag only if matched +// auto mode = parser.MapNext("EX", Mode::EX, "PX", Mode::PX); // tag -> enum mapping +// auto maybe_mode = parser.TryMapNext("ASC", Dir::ASC, // like MapNext but returns +// "DESC", Dir::DESC); // nullopt (no error) on miss +// +// Bulk named options with Apply(): +// parser.Apply( +// Exist("WITHSCORES", &with_scores), // tag present -> sets bool true +// Tag("LIMIT", &offset, &limit), // tag -> reads following args +// Tag("COUNT", &optional_count), // std::optional* supported directly +// Tag("GET", [&](CmdArgParser* p) { // lambda: custom parsing on tag match +// patterns.push_back(p->Next()); +// }), +// Map(&dir, "ASC", Dir::ASC, "DESC", Dir::DESC), // tag -> fixed value mapping +// Tag("ATTR", Map(&mask, "v", Mask::Volatile, // nested: outer tag + inner Map +// "p", Mask::Permanent)), // (inner keyword required on match) +// OneOf(Exist("NX", &nx), Exist("XX", &xx)), // mutex — at most one may match +// If(!read_only, Tag("STORE", &store_key))); // runtime-gated option +// +// Strict vs lenient dispatch: +// parser.Apply(...) — stops at first unmatched arg; pair with Finalize() to error +// parser.ApplyOrSkip(...) 
— silently skips unknown tags one-by-one +// +// Navigating manually: +// if (parser.HasNext()) { ... } // is there another arg? +// if (parser.HasAtLeast(3)) { ... } // at least N args remain? +// auto peek = parser.Peek(); // look at next without consuming +// parser.Skip(n); // advance n args +// CmdArgList rest = parser.Tail(); // remaining args (e.g. k/v pairs) +// +// Error surfacing (at the end of parse): +// if (!parser.Finalize()) // also reports UNPROCESSED on +// return cmd_cntx->SendError(parser.TakeError().MakeReply()); // trailing args +// // or: if (parser.HasError()) ... + +// Numerical range restriction used with Next>(). template struct FInt { decltype(min) value = {}; operator decltype(min)() { @@ -31,7 +81,10 @@ template constexpr bool is_fint = false; template constexpr bool is_fint> = true; -// Utility class for easily parsing command options from argument lists. +template constexpr bool is_optional = false; + +template constexpr bool is_optional> = true; + struct CmdArgParser { enum ErrorType { NO_ERROR, @@ -59,15 +112,13 @@ struct CmdArgParser { CmdArgParser(ArgSlice args) : args_{args} { } - // Debug asserts sure error was consumed + // DCHECKs that any error was consumed. ~CmdArgParser(); - // Get next value without consuming it std::string_view Peek() { return SafeSV(cur_i_); } - // Consume next value template auto Next() { if (cur_i_ + sizeof...(Ts) >= args_.size()) { Report(OUT_OF_BOUNDS, cur_i_); @@ -85,15 +136,13 @@ struct CmdArgParser { } } - // returns next value if exists or default value template auto NextOrDefault(T default_value = {}) { return HasNext() ? Next() : default_value; } - // check next value ignoring case and consume it + // Consumes the next arg; reports INVALID_NEXT if it doesn't match (case-insensitive). void ExpectTag(std::string_view tag); - // Consume next value template auto MapNext(Cases&&... cases) { if (cur_i_ >= args_.size()) { Report(OUT_OF_BOUNDS, cur_i_); @@ -110,7 +159,7 @@ struct CmdArgParser { return *res; } - // Consume next value if can map it and return mapped result or return nullopt + // Same as MapNext, but returns nullopt (no error) if no case matches. template auto TryMapNext(Cases&&... cases) -> std::optional>> { @@ -123,7 +172,7 @@ struct CmdArgParser { return res; } - // Check if the next value is equal to a specific tag. If equal, its consumed. + // If the next arg matches `tag`, consume it and the following args-into-pointers; else no-op. template bool Check(std::string_view tag, Args*... args) { if (cur_i_ + sizeof...(Args) >= args_.size()) return false; @@ -139,7 +188,22 @@ struct CmdArgParser { return true; } - // Skip specified number of arguments + // Greedily matches remaining args against the options. See the file header for usage. + template void Apply(Opts... opts) { + while (HasNext() && (opts.TryApply(this) || ...)) { + } + } + + // Like Apply, but silently skips unmatched args (one at a time) instead of stopping. Use when + // unknown tags should be ignored rather than reported. Prefer Apply + Finalize when strictness + // is desired. + template void ApplyOrSkip(Opts... opts) { + while (HasNext()) { + if (!(opts.TryApply(this) || ...)) + Skip(1); + } + } + CmdArgParser& Skip(size_t n) { if (cur_i_ + n > args_.size()) { Report(OUT_OF_BOUNDS, cur_i_); @@ -149,7 +213,7 @@ struct CmdArgParser { return *this; } - // Expect no more arguments and return if no error has occured + // Requires no leftover args and no prior errors. Reports UNPROCESSED if args remain. 
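Putting the pieces above together, a handler-side parse typically ends with Finalize so trailing junk is reported rather than ignored. An illustrative sketch (not a real Dragonfly command; the reply wiring follows the usage comment at the top of this header):

    #include <cstdint>
    #include <string_view>

    #include "facade/cmd_arg_parser.h"

    void ParseExampleCommand(facade::CmdArgParser* parser) {
      auto key = parser->Next<std::string_view>();  // positional argument
      uint32_t count = 10;                          // default when COUNT is absent
      if (parser->Check("COUNT"))
        count = parser->Next<uint32_t>();
      if (!parser->Finalize()) {
        // parser->TakeError() carries the error; a real handler converts it into
        // a client reply here.
      }
      (void)key;
      (void)count;
    }

Finalize, whose implementation follows, is the strict backstop that flags anything left over.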
bool Finalize() {
     if (HasNext()) {
       Report(UNPROCESSED, cur_i_);
     }
     return !HasError();
   }

-  // Return remaining arguments
   ArgSlice Tail() const {
     return args_.subspan(cur_i_);
   }

-  // Return true if arguments are left and no errors occured
   bool HasNext() {
     return cur_i_ < args_.size() && !error_;
   }
@@ -182,11 +244,10 @@
     return cur_i_;
   }

-  // Custom error_type should start from CUSTOM_ERROR
+  // Reports a custom error (error_type >= CUSTOM_ERROR) at the previously-consumed index
+  // (or 0 if called before any arg was consumed).
   void Report(int error_type) {
-    // we use previous index, because the check was done outside and it's done after element is
-    // processed
-    Report(error_type, cur_i_ - 1);
+    Report(error_type, cur_i_ > 0 ? cur_i_ - 1 : 0);
   }

  private:
@@ -216,10 +277,12 @@
   }

   template <typename T> T Convert(size_t idx) {
-    static_assert(
-        std::is_arithmetic_v<T> || std::is_constructible_v<T, std::string_view> || is_fint<T>,
-        "incorrect type");
-    if constexpr (std::is_arithmetic_v<T>) {
+    static_assert(std::is_arithmetic_v<T> || std::is_constructible_v<T, std::string_view> ||
+                      is_fint<T> || is_optional<T>,
+                  "incorrect type");
+    if constexpr (is_optional<T>) {
+      return T{Convert<typename T::value_type>(idx)};
+    } else if constexpr (std::is_arithmetic_v<T>) {
       return Num<T>(idx);
     } else if constexpr (std::is_constructible_v<T, std::string_view>) {
       return static_cast<T>(SafeSV(idx));
@@ -280,4 +343,153 @@
   ErrorInfo error_;
 };

+namespace detail {
+
+struct ExistOpt {
+  std::string_view tag;
+  bool* field;
+
+  bool TryApply(CmdArgParser* parser) const {
+    if (parser->Check(tag)) {
+      *field = true;
+      return true;
+    }
+    return false;
+  }
+};
+
+template <typename... Args> struct TagOpt {
+  std::string_view tag;
+  std::tuple<Args*...> args;
+
+  bool TryApply(CmdArgParser* parser) const {
+    // Match the tag first, then read fields via Next<>() — so a missing value surfaces
+    // OUT_OF_BOUNDS instead of being swallowed by ApplyOrSkip as "no match".
+    if (!parser->Check(tag))
+      return false;
+    std::apply(
+        [&](auto*... ptrs) {
+          (((*ptrs) = parser->template Next<std::decay_t<decltype(*ptrs)>>()), ...);
+        },
+        args);
+    return true;
+  }
+};
+
+template <typename Func> struct LambdaOpt {
+  std::string_view tag;
+  Func func;
+
+  bool TryApply(CmdArgParser* parser) const {
+    if (parser->Check(tag)) {
+      func(parser);
+      return true;
+    }
+    return false;
+  }
+};
+
+template <typename T, typename... Cases> struct MapOpt {
+  static_assert(sizeof...(Cases) % 2 == 0, "Map expects alternating tag/value pairs");
+
+  T* field;
+  std::tuple<Cases...> cases;
+
+  bool TryApply(CmdArgParser* parser) const {
+    return TryMatch<0>(parser);
+  }
+
+ private:
+  template <size_t I> bool TryMatch(CmdArgParser* parser) const {
+    if constexpr (I >= sizeof...(Cases)) {
+      return false;
+    } else if (parser->Check(std::get<I>(cases))) {
+      *field = std::get<I + 1>(cases);
+      return true;
+    } else {
+      return TryMatch<I + 2>(parser);
+    }
+  }
+};
+
+template <typename Inner> struct IfOpt {
+  bool cond;
+  Inner inner;
+
+  bool TryApply(CmdArgParser* parser) const {
+    return cond && inner.TryApply(parser);
+  }
+};
+
+template <typename... Opts> struct OneOfOpt {
+  std::tuple<Opts...> opts;
+  mutable bool matched = false;
+
+  bool TryApply(CmdArgParser* parser) const {
+    bool any = std::apply([&](auto&... os) { return (os.TryApply(parser) || ...); }, opts);
+    if (!any)
+      return false;
+    if (matched)
+      parser->Report(CmdArgParser::INVALID_CASES);
+    matched = true;
+    return true;
+  }
+};
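The option structs above compose freely; a full Apply call for a SORT-like option surface looks like this (mirroring the usage comment in the header, with a parser assumed in scope and names chosen for illustration):

    bool with_scores = false, nx = false, xx = false;
    uint32_t offset = 0, limit = 0;
    std::optional<uint32_t> count;
    bool reversed = false;

    parser.Apply(Exist("WITHSCORES", &with_scores),           // bare flag
                 Tag("LIMIT", &offset, &limit),               // keyword + two values
                 Tag("COUNT", &count),                        // optional<> engages on match
                 Map(&reversed, "DESC", true, "ASC", false),  // keyword selects a value
                 OneOf(Exist("NX", &nx), Exist("XX", &xx)));  // at most one may match

    if (!parser.Finalize()) {
      // unknown keyword, missing value, or NX+XX together: surface TakeError() here.
    }

One shape remains: a keyword whose value must itself match one of a fixed set. TagNestedOpt, defined next, covers it.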
+// Nested: outer tag consumes one arg, then inner option runs against the next arg. If the inner
+// doesn't match, reports INVALID_CASES (the inner keyword is required once the outer matched).
+template <typename Inner> struct TagNestedOpt {
+  std::string_view tag;
+  Inner inner;
+
+  bool TryApply(CmdArgParser* parser) const {
+    if (!parser->Check(tag))
+      return false;
+    if (!inner.TryApply(parser))
+      parser->Report(CmdArgParser::INVALID_CASES);
+    return true;
+  }
+};
+
+// Concept matching any of the Apply options (has a TryApply(CmdArgParser*) method).
+template <typename T>
+concept ParseOption = requires(const T& t, CmdArgParser* p) {
+  { t.TryApply(p) } -> std::same_as<bool>;
+};
+
+}  // namespace detail
+
+inline detail::ExistOpt Exist(std::string_view tag, bool* field) {
+  return {tag, field};
+}
+
+template <typename... Args> detail::TagOpt<Args...> Tag(std::string_view tag, Args*... args) {
+  return detail::TagOpt<Args...>{tag, std::make_tuple(args...)};
+}
+
+template <typename Func>
+requires std::is_invocable_v<Func, CmdArgParser*> detail::LambdaOpt<Func> Tag(
+    std::string_view tag, Func func) {
+  return {tag, std::move(func)};
+}
+
+// Nested option: outer tag + inner sub-option (e.g. Map). After outer matches, inner must match
+// the following arg or INVALID_CASES is reported.
+template <detail::ParseOption Inner>
+detail::TagNestedOpt<Inner> Tag(std::string_view tag, Inner inner) {
+  return {tag, std::move(inner)};
+}
+
+template <typename T, typename... Cases>
+detail::MapOpt<T, Cases...> Map(T* field, Cases... cases) {
+  return {field, std::make_tuple(std::move(cases)...)};
+}
+
+template <typename Inner> detail::IfOpt<Inner> If(bool cond, Inner inner) {
+  return {cond, std::move(inner)};
+}
+
+template <typename... Opts> detail::OneOfOpt<Opts...> OneOf(Opts... opts) {
+  return {{std::move(opts)...}, false};
+}
+
 }  // namespace facade
diff --git a/src/facade/cmd_arg_parser_test.cc b/src/facade/cmd_arg_parser_test.cc
index 7906a36286ac..5bc3c15748c3 100644
--- a/src/facade/cmd_arg_parser_test.cc
+++ b/src/facade/cmd_arg_parser_test.cc
@@ -143,6 +143,358 @@ TEST_F(CmdArgParserTest, IgnoreCase) {
   EXPECT_EQ(absl::implicit_cast<string_view>(parser.Next()), "world"sv);
 }

+TEST_F(CmdArgParserTest, Apply) {
+  // All option shapes: Exist sets a bool, Tag-with-one-field, Tag-with-two-fields.
+  {
+    auto parser = Make({"FLAG", "COUNT", "5", "LIMIT", "10", "20"});
+
+    bool flag = false;
+    uint32_t count = 0;
+    uint32_t offset = 0;
+    uint32_t limit = 0;
+
+    parser.Apply(Exist("FLAG", &flag), Tag("COUNT", &count), Tag("LIMIT", &offset, &limit));
+
+    EXPECT_TRUE(flag);
+    EXPECT_EQ(count, 5u);
+    EXPECT_EQ(offset, 10u);
+    EXPECT_EQ(limit, 20u);
+    EXPECT_FALSE(parser.HasError());
+  }
+
+  // Unknown option is left unconsumed (no error). The caller decides what to do next.
+  {
+    auto parser = Make({"COUNT", "5", "BOGUS"});
+
+    uint32_t count = 0;
+    parser.Apply(Tag("COUNT", &count));
+
+    EXPECT_EQ(count, 5u);
+    EXPECT_FALSE(parser.HasError());
+    EXPECT_TRUE(parser.HasNext());
+    EXPECT_EQ(parser.Peek(), "BOGUS");
+  }
+
+  // Case-insensitive matching (consistent with Check).
+  {
+    auto parser = Make({"count", "7"});
+
+    uint32_t count = 0;
+    parser.Apply(Tag("COUNT", &count));
+
+    EXPECT_EQ(count, 7u);
+    EXPECT_FALSE(parser.HasError());
+  }
+
+  // Invalid integer in a Tag arg propagates the error.
+  {
+    auto parser = Make({"COUNT", "NAN"});
+
+    uint32_t count = 0;
+    parser.Apply(Tag("COUNT", &count));
+
+    auto err = parser.TakeError();
+    EXPECT_TRUE(err);
+    EXPECT_EQ(err.type, CmdArgParser::INVALID_INT);
+  }
+}
+
+TEST_F(CmdArgParserTest, ApplyOrSkip) {
+  // ApplyOrSkip silently skips any unknown arg (1 at a time) and keeps going.
+  {
+    auto parser = Make({"BOGUS", "COUNT", "5", "MORE_BOGUS", "STUFF"});
+
+    uint32_t count = 0;
+    parser.ApplyOrSkip(Tag("COUNT", &count));
+
+    EXPECT_EQ(count, 5u);
+    EXPECT_FALSE(parser.HasError());
+    EXPECT_FALSE(parser.HasNext());  // everything consumed
+  }
+  // Empty input — no error, no work.
+ { + auto parser = Make({}); + uint32_t count = 0; + parser.ApplyOrSkip(Tag("COUNT", &count)); + EXPECT_FALSE(parser.HasError()); + EXPECT_FALSE(parser.HasNext()); + } + // Trailing unknown at end-of-args: the skip must not trip OUT_OF_BOUNDS. + { + auto parser = Make({"BOGUS"}); + uint32_t count = 0; + parser.ApplyOrSkip(Tag("COUNT", &count)); + EXPECT_FALSE(parser.HasError()); + EXPECT_FALSE(parser.HasNext()); + } +} + +TEST_F(CmdArgParserTest, ApplyTagMissingValue) { + // A matched tag with missing trailing value(s) must surface an error, not be silently skipped. + // This guards against a subtle interaction with ApplyOrSkip: if TagOpt treated "tag matches, + // values missing" as "no match", the skip path would swallow the malformed option. + { + auto parser = Make({"COUNT"}); // tag matches, value missing + uint32_t count = 0; + parser.Apply(Tag("COUNT", &count)); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.type, CmdArgParser::OUT_OF_BOUNDS); + } + { + auto parser = Make({"COUNT"}); + uint32_t count = 0; + parser.ApplyOrSkip(Tag("COUNT", &count)); + // Tag must have been consumed (not left for Skip to swallow silently). + EXPECT_FALSE(parser.HasNext()); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.type, CmdArgParser::OUT_OF_BOUNDS); + } + // Also guard the two-field case: LIMIT with only one trailing value. + { + auto parser = Make({"LIMIT", "10"}); // needs offset + limit + uint32_t offset = 0, limit = 0; + parser.Apply(Tag("LIMIT", &offset, &limit)); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.type, CmdArgParser::OUT_OF_BOUNDS); + } +} + +TEST_F(CmdArgParserTest, ReportBeforeAnyNext) { + // Report(code) at cur_i_ == 0 must clamp the error index to 0 rather than underflow to SIZE_MAX. + auto parser = Make({"x"}); + parser.Report(CmdArgParser::CUSTOM_ERROR); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.index, 0u); +} + +TEST_F(CmdArgParserTest, ApplyLambda) { + // Tag() with a lambda lets callers run custom parsing on match. Useful for side-effectful cases + // like push_back or toggling a bool to false. + auto parser = Make({"GET", "p1", "ASC", "GET", "p2"}); + + std::vector patterns; + bool reversed = true; + + parser.Apply( + Tag("ASC", [&](CmdArgParser*) { reversed = false; }), + Tag("GET", [&](CmdArgParser* p) { patterns.push_back(p->Next()); })); + + EXPECT_FALSE(reversed); + ASSERT_EQ(patterns.size(), 2u); + EXPECT_EQ(patterns[0], "p1"); + EXPECT_EQ(patterns[1], "p2"); + EXPECT_FALSE(parser.HasError()); +} + +TEST_F(CmdArgParserTest, ApplyMap) { + // Map(&field, tag, value, ...) — matches any tag and writes the corresponding value. + // Standalone Map allows repeated matches (last wins); wrap in OneOf to require at most one. + { + auto parser = Make({"DESC"}); + bool reversed = false; + parser.Apply(Map(&reversed, "DESC", true, "ASC", false)); + EXPECT_TRUE(reversed); + EXPECT_FALSE(parser.HasError()); + } + { + auto parser = Make({"ASC"}); + bool reversed = true; + parser.Apply(Map(&reversed, "DESC", true, "ASC", false)); + EXPECT_FALSE(reversed); + EXPECT_FALSE(parser.HasError()); + } + // Unrelated tag leaves field untouched and stops Apply. + { + auto parser = Make({"OTHER"}); + bool reversed = false; + parser.Apply(Map(&reversed, "DESC", true, "ASC", false)); + EXPECT_FALSE(reversed); + EXPECT_TRUE(parser.HasNext()); + } + // Standalone Map allows repeated matches — last wins, no error. This matches Redis SORT + // semantics where "ASC DESC" is equivalent to "DESC". 
+ { + auto parser = Make({"DESC", "ASC"}); + bool reversed = true; + parser.Apply(Map(&reversed, "DESC", true, "ASC", false)); + EXPECT_FALSE(reversed); // ASC came last + EXPECT_FALSE(parser.HasError()); + } + { + auto parser = Make({"ASC", "DESC"}); + bool reversed = false; + parser.Apply(Map(&reversed, "DESC", true, "ASC", false)); + EXPECT_TRUE(reversed); // DESC came last + EXPECT_FALSE(parser.HasError()); + } + // OneOf + Map — DESC followed by ASC is a mutex violation. + { + auto parser = Make({"DESC", "ASC"}); + bool reversed = false; + parser.Apply(OneOf(Map(&reversed, "DESC", true, "ASC", false))); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.type, CmdArgParser::INVALID_CASES); + } +} + +TEST_F(CmdArgParserTest, ApplyTagNested) { + // Tag(tag, inner_opt) — outer tag matches, then inner option runs against the next arg. + // If the inner doesn't match, INVALID_CASES is reported (the inner keyword is required). + enum class Mode { A, B, C }; + { + auto parser = Make({"MODE", "B"}); + Mode mode = Mode::A; + parser.Apply(Tag("MODE", Map(&mode, "A", Mode::A, "B", Mode::B, "C", Mode::C))); + EXPECT_EQ(mode, Mode::B); + EXPECT_FALSE(parser.HasError()); + } + // Unknown inner tag -> INVALID_CASES. + { + auto parser = Make({"MODE", "BOGUS"}); + Mode mode = Mode::A; + parser.Apply(Tag("MODE", Map(&mode, "A", Mode::A, "B", Mode::B))); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.type, CmdArgParser::INVALID_CASES); + } + // Outer tag absent -> no effect, no error. + { + auto parser = Make({}); + Mode mode = Mode::A; + parser.Apply(Tag("MODE", Map(&mode, "A", Mode::A, "B", Mode::B))); + EXPECT_EQ(mode, Mode::A); + EXPECT_FALSE(parser.HasError()); + } +} + +TEST_F(CmdArgParserTest, ApplyTagIf) { + // If(cond, opt) behaves like `opt` when cond is true, and never matches when false. + // Use to gate an option on a runtime flag (e.g. is_read_only). + + // cond=true -> delegate to inner (matches and sets field). + { + auto parser = Make({"STORE", "dest"}); + std::string_view store; + parser.Apply(If(true, Tag("STORE", &store))); + EXPECT_EQ(store, "dest"); + EXPECT_FALSE(parser.HasError()); + } + + // cond=false -> inner is skipped. Apply stops at the (now unmatched) arg; Finalize reports + // UNPROCESSED so the caller can surface a syntax error. + { + auto parser = Make({"STORE", "dest"}); + std::string_view store; + parser.Apply(If(false, Tag("STORE", &store))); + EXPECT_EQ(store, ""); + EXPECT_FALSE(parser.HasError()); + EXPECT_TRUE(parser.HasNext()); + EXPECT_FALSE(parser.Finalize()); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.type, CmdArgParser::UNPROCESSED); + } + + // Composes: cond=false + Exist - does not toggle the bool even when the tag is present. + { + auto parser = Make({"FLAG"}); + bool flag = false; + parser.Apply(If(false, Exist("FLAG", &flag))); + EXPECT_FALSE(flag); + } +} + +TEST_F(CmdArgParserTest, ApplyOneOf) { + // OneOf groups mutually-exclusive options. Zero or one may match across the Apply loop. + // A second match reports an error instead of being quietly accepted. + + // Zero matches — fine. + { + auto parser = Make({}); + bool nx = false, xx = false; + parser.Apply(OneOf(Exist("NX", &nx), Exist("XX", &xx))); + EXPECT_FALSE(nx); + EXPECT_FALSE(xx); + EXPECT_FALSE(parser.HasError()); + } + + // Single match — fine. 
+ { + auto parser = Make({"NX"}); + bool nx = false, xx = false; + parser.Apply(OneOf(Exist("NX", &nx), Exist("XX", &xx))); + EXPECT_TRUE(nx); + EXPECT_FALSE(xx); + EXPECT_FALSE(parser.HasError()); + } + + // Two different members of the group match -> error. + { + auto parser = Make({"NX", "XX"}); + bool nx = false, xx = false; + parser.Apply(OneOf(Exist("NX", &nx), Exist("XX", &xx))); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.type, CmdArgParser::INVALID_CASES); + } + + // Same member twice also counts as a second match -> error. + { + auto parser = Make({"NX", "NX"}); + bool nx = false, xx = false; + parser.Apply(OneOf(Exist("NX", &nx), Exist("XX", &xx))); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.type, CmdArgParser::INVALID_CASES); + } + + // OneOf composes with other Apply options. Unrelated tags are not affected. + { + auto parser = Make({"NX", "COUNT", "5"}); + bool nx = false, xx = false; + uint32_t count = 0; + parser.Apply(OneOf(Exist("NX", &nx), Exist("XX", &xx)), Tag("COUNT", &count)); + EXPECT_TRUE(nx); + EXPECT_EQ(count, 5u); + EXPECT_FALSE(parser.HasError()); + } +} + +TEST_F(CmdArgParserTest, ApplyOptional) { + // Tag present -> optional engaged. + { + auto parser = Make({"COUNT", "5"}); + std::optional count; + parser.Apply(Tag("COUNT", &count)); + ASSERT_TRUE(count.has_value()); + EXPECT_EQ(*count, 5u); + EXPECT_FALSE(parser.HasError()); + } + // Tag absent -> optional stays empty. + { + auto parser = Make({}); + std::optional count; + parser.Apply(Tag("COUNT", &count)); + EXPECT_FALSE(count.has_value()); + EXPECT_FALSE(parser.HasError()); + } + // Invalid value -> INVALID_INT reported. The optional's state on error is undefined; callers + // must check for the parse error first. + { + auto parser = Make({"COUNT", "NAN"}); + std::optional count; + parser.Apply(Tag("COUNT", &count)); + auto err = parser.TakeError(); + EXPECT_TRUE(err); + EXPECT_EQ(err.type, CmdArgParser::INVALID_INT); + } +} + TEST_F(CmdArgParserTest, FixedRangeInt) { { auto parser = Make({"10", "-10", "12"}); diff --git a/src/facade/dragonfly_connection.cc b/src/facade/dragonfly_connection.cc index e44fe2a5b30c..6aa6e67efe7a 100644 --- a/src/facade/dragonfly_connection.cc +++ b/src/facade/dragonfly_connection.cc @@ -157,16 +157,23 @@ bool MatchHttp11Line(string_view line) { absl::EndsWith(line, "HTTP/1.1"); } -void UpdateIoBufCapacity(const io::IoBuf& io_buf, ConnectionStats* stats, - absl::FunctionRef f) { - const size_t prev_capacity = io_buf.Capacity(); - f(); - const size_t capacity = io_buf.Capacity(); - if (prev_capacity != capacity) { - VLOG(2) << "Grown io_buf to " << capacity; - stats->read_buf_capacity += capacity - prev_capacity; +struct ReadBufTracker { + explicit ReadBufTracker(const io::IoBuf& io_buf) + : io_buf_(io_buf), last_capacity_(io_buf.Capacity()) { + } + + ~ReadBufTracker() { + size_t capacity = io_buf_.Capacity(); + if (last_capacity_ != capacity) { + VLOG(2) << "Grown io_buf to " << capacity; + tl_facade_stats->conn_stats.read_buf_capacity += capacity - last_capacity_; + } } -} + + private: + const io::IoBuf& io_buf_; + size_t last_capacity_; +}; size_t UsedMemoryInternal(const ParsedCommand& msg) { return msg.GetSize() + msg.HeapMemory(); @@ -177,6 +184,10 @@ struct TrafficLogger { // Also, makes sure that LogTraffic are executed atomically. fb2::Mutex mutex; unique_ptr log_file; + // Listener type that this thread's file is recording. 
Only connections with a + // matching `listener_type_` produce records; others are skipped on the hot path. + // Set once when the file is opened, cleared in ResetLocked(). + Connection::ListenerType listener_type = Connection::ListenerType::MAIN_RESP; void ResetLocked(); // Returns true if Write succeeded, false if it failed and the recording should be aborted. @@ -189,6 +200,7 @@ void TrafficLogger::ResetLocked() { std::ignore = log_file->Close(); log_file.reset(); } + listener_type = Connection::ListenerType::MAIN_RESP; } // Returns true if Write succeeded, false if it failed and the recording should be aborted. @@ -218,10 +230,16 @@ thread_local base::Histogram* io_req_size_hist = nullptr; thread_local const size_t reply_size_limit = absl::GetFlag(FLAGS_squashed_reply_size_limit); thread_local uint32 pipeline_wait_batch_usec = absl::GetFlag(FLAGS_pipeline_wait_batch_usec); -void OpenTrafficLogger(string_view base_path) { +// Opens the per-thread traffic log file. Distinguishes three outcomes so the caller +// can report an accurate error to the user (was the logger already running, or did +// we fail to open a file). `listener_type` is only committed after the file is +// successfully opened so the logger's state stays consistent on failure. +Connection::StartTrafficResult OpenTrafficLogger(string_view base_path, + Connection::ListenerType listener_type) { + using Res = Connection::StartTrafficResult; unique_lock lk{tl_traffic_logger.mutex}; if (tl_traffic_logger.log_file) - return; + return Res::kAlreadyLogging; #ifdef __linux__ // Open file with append mode, without it concurrent fiber writes seem to conflict @@ -230,21 +248,30 @@ void OpenTrafficLogger(string_view base_path) { auto file = util::fb2::OpenWrite(path, io::WriteFile::Options{/*.append = */ false}); if (!file) { LOG(ERROR) << "Error opening a file " << path << " for traffic logging: " << file.error(); - return; + return Res::kOpenFailed; } tl_traffic_logger.log_file = unique_ptr{file.value()}; + tl_traffic_logger.listener_type = listener_type; #else LOG(WARNING) << "Traffic logger is only supported on Linux"; + return Res::kOpenFailed; #endif - // Write version, incremental numbering :) - uint8_t version[1] = {2}; - std::ignore = tl_traffic_logger.log_file->Write(version); + // File header: version byte (v3), followed by a single byte carrying the listener + // type for the whole file. Every record in the file belongs to this listener. + uint8_t header[2] = {3, static_cast(listener_type)}; + std::ignore = tl_traffic_logger.log_file->Write(header); + return Res::kStarted; } -void LogTraffic(uint32_t id, bool has_more, const cmn::BackedArguments& args, - ServiceInterface::ContextInfo ci) { - string_view cmd = args.Front(); +// Writes a single record. `parts[0]` is the command name, following entries are its arguments. +// Callers must guarantee a non-empty span (both LogTraffic and LogMemcacheTraffic push +// the command name as the first element before invoking this function). +void LogTrafficParts(uint32_t id, bool has_more, uint32_t db_index, + absl::Span parts) { + DCHECK(!parts.empty()); + + string_view cmd = parts.front(); if (absl::EqualsIgnoreCase(cmd, "debug"sv)) return; @@ -253,26 +280,22 @@ void LogTraffic(uint32_t id, bool has_more, const cmn::BackedArguments& args, char stack_buf[1024]; char* next = stack_buf; - // We write id, timestamp, db_index, has_more, num_parts, part_len, part_len, part_len, ... - // And then all the part blobs concatenated together. 
+ // Record header: id, timestamp, db_index, has_more, num_parts, followed by + // part_len, part_len, ... and finally the concatenated part blobs. + // The listener type is stored once in the file header; it is not repeated per record. auto write_u32 = [&next](uint32_t i) { absl::little_endian::Store32(next, i); next += 4; }; - // id write_u32(id); - // timestamp absl::little_endian::Store64(next, absl::GetCurrentTimeNanos()); next += 8; - // db_index - write_u32(ci.db_index); - - // has_more, num_parts - write_u32(has_more ? 1 : 0); - write_u32(uint32_t(args.size())); + write_u32(db_index); + write_u32(has_more ? 1u : 0u); + write_u32(uint32_t(parts.size())); // Grab the lock and check if the file is still open. lock_guard lk{tl_traffic_logger.mutex}; @@ -280,7 +303,7 @@ void LogTraffic(uint32_t id, bool has_more, const cmn::BackedArguments& args, return; // part_len, ... - for (auto part : args.view()) { + for (string_view part : parts) { if (size_t(next - stack_buf + 4) > sizeof(stack_buf)) { if (!tl_traffic_logger.Write(string_view{stack_buf, size_t(next - stack_buf)})) { return; @@ -297,7 +320,7 @@ void LogTraffic(uint32_t id, bool has_more, const cmn::BackedArguments& args, blobs[index++] = iovec{.iov_base = stack_buf, .iov_len = size_t(next - stack_buf)}; } - for (auto part : args.view()) { + for (string_view part : parts) { if (auto blob_len = part.size(); blob_len > 0) { blobs[index++] = iovec{.iov_base = const_cast(part.data()), .iov_len = blob_len}; @@ -315,6 +338,90 @@ void LogTraffic(uint32_t id, bool has_more, const cmn::BackedArguments& args, } } +void LogTraffic(uint32_t id, bool has_more, const cmn::BackedArguments& args, + ServiceInterface::ContextInfo ci) { + absl::InlinedVector parts; + parts.reserve(args.size()); + for (auto v : args.view()) + parts.push_back(v); + LogTrafficParts(id, has_more, ci.db_index, absl::MakeSpan(parts)); +} + +// Variant used by the Memcache protocol path. +// +// The memcache parser keeps fields that are NOT arguments in scalar Command members +// (flags, expire_ts, delta, cas_unique) rather than in `backed_args`. We serialize +// them into the record so that tools/replay has enough context to reproduce the +// command faithfully. Record layout per command type: +// +// SET/ADD/REPLACE/APPEND/PREPEND : [cmd, key, value, flags, expire_ts] +// CAS : [cas, key, value, flags, expire_ts, cas_unique] +// INCR/DECR : [cmd, key, delta] +// GAT/GATS : [cmd, expire_ts, key+] (expire BEFORE keys, matches wire) +// all others (GET/GETS/DELETE/ +// FLUSHALL/STATS/ +// QUIT/VERSION) : [cmd, *backed_args] +void LogMemcacheTraffic(uint32_t id, bool has_more, const MemcacheParser::Command& mc, + ServiceInterface::ContextInfo ci) { + using MP = MemcacheParser; + string_view cmd_name = MP::CmdName(mc.type); + if (cmd_name.empty()) + return; + + // owned backs stringified numeric fields. We use a fixed-size std::array + // rather than a resizable vector so that string_views inserted into `parts` + // remain stable even if more fields are appended in the future: std::array + // never reallocates. kMaxOwned must be >= the largest per-type push count + // (currently 3, for CAS: flags + expire_ts + cas_unique). 
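For tooling, the v3 layout spelled out above decodes mechanically. A minimal reader sketch; ReadExact is a hypothetical helper that fills a buffer from the log stream, and the include path for absl's endian helpers is approximate:

    #include <cstdint>
    #include <string>
    #include <vector>

    #include "absl/base/internal/endian.h"  // little_endian::Load32/Load64

    bool ReadExact(void* buf, size_t len);  // hypothetical: false on EOF

    // The file starts with a version byte (3) and a listener-type byte, read once
    // up front (not shown). Each record then follows the layout described above.
    bool ReadRecord() {
      uint8_t hdr[24];  // id(4) + timestamp(8) + db_index(4) + has_more(4) + num_parts(4)
      if (!ReadExact(hdr, sizeof(hdr)))
        return false;
      uint32_t id = absl::little_endian::Load32(hdr);
      int64_t ts = absl::little_endian::Load64(hdr + 4);
      uint32_t db_index = absl::little_endian::Load32(hdr + 12);
      uint32_t has_more = absl::little_endian::Load32(hdr + 16);
      uint32_t num_parts = absl::little_endian::Load32(hdr + 20);

      std::vector<std::string> parts(num_parts);
      for (auto& p : parts) {  // part_len per part, little-endian u32
        uint8_t len_buf[4];
        ReadExact(len_buf, sizeof(len_buf));
        p.resize(absl::little_endian::Load32(len_buf));
      }
      for (auto& p : parts)  // then all blobs, concatenated
        ReadExact(p.data(), p.size());
      (void)id; (void)ts; (void)db_index; (void)has_more;
      return true;
    }

Returning to the writer: the fixed-size array below backs the stringified numeric fields.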
+  constexpr size_t kMaxOwned = 4;
+  std::array<std::string, kMaxOwned> owned;
+  size_t owned_n = 0;
+
+  absl::InlinedVector<string_view, 16> parts;
+  parts.reserve(mc.backed_args->size() + kMaxOwned + 1);
+  parts.push_back(cmd_name);
+
+  auto push_num = [&](uint64_t n) {
+    DCHECK_LT(owned_n, kMaxOwned);
+    owned[owned_n] = absl::StrCat(n);
+    parts.push_back(owned[owned_n]);
+    ++owned_n;
+  };
+
+  // For GAT/GATS we want expire_ts to precede the key list because the parser can
+  // push multiple keys into backed_args; placing expire at the end would make the
+  // expire index depend on the number of keys.
+  if (mc.type == MP::GAT || mc.type == MP::GATS)
+    push_num(mc.raw_expire_ts);
+
+  for (string_view a : mc.backed_args->view())
+    parts.push_back(a);
+
+  switch (mc.type) {
+    case MP::SET:
+    case MP::ADD:
+    case MP::REPLACE:
+    case MP::APPEND:
+    case MP::PREPEND:
+      push_num(mc.flags);
+      push_num(mc.raw_expire_ts);
+      break;
+    case MP::CAS:
+      push_num(mc.flags);
+      push_num(mc.raw_expire_ts);
+      push_num(mc.cas_unique);
+      break;
+    case MP::INCR:
+    case MP::DECR:
+      push_num(mc.delta);
+      break;
+    default:
+      break;
+  }
+
+  LogTrafficParts(id, has_more, ci.db_index, absl::MakeSpan(parts));
+}
+
 constexpr size_t kMinReadSize = 256;
 
 const char* kPhaseName[Connection::NUM_PHASES] = {"SETUP", "READ", "PROCESS", "SHUTTING_DOWN",
@@ -722,6 +829,14 @@ void Connection::OnConnectionStart() {
   // is null in unit-tests.
   if (const Listener* lsnr = static_cast<const Listener*>(listener()); lsnr) {
     is_main_ = lsnr->IsMainInterface();
+    if (lsnr->IsPrivilegedInterface()) {
+      listener_type_ = ListenerType::ADMIN_RESP;
+    } else if (protocol_ == Protocol::MEMCACHE) {
+      listener_type_ = ListenerType::MEMCACHE;
+    } else {
+      // MAIN_RESP covers the TCP main listener as well as unix-socket RESP listeners.
+      listener_type_ = ListenerType::MAIN_RESP;
+    }
   }
 
   if (GetFlag(FLAGS_tcp_nodelay) && !socket_->IsUDS()) {
@@ -1049,7 +1164,6 @@ io::Result<bool> Connection::CheckForHttpProto() {
   size_t last_len = 0;
   auto* peer = socket_.get();
-  auto& conn_stats = tl_facade_stats->conn_stats;
 
   do {
     auto buf = io_buf_.AppendBuffer();
     DCHECK(!buf.empty());
@@ -1082,7 +1196,10 @@ io::Result<bool> Connection::CheckForHttpProto() {
       return MatchHttp11Line(ib);
     }
     last_len = io_buf_.InputLen();
-    UpdateIoBufCapacity(io_buf_, &conn_stats, [&]() { io_buf_.EnsureCapacity(128); });
+    {
+      ReadBufTracker tracker(io_buf_);
+      io_buf_.EnsureCapacity(128);
+    }
   } while (last_len < 1024);
 
   return false;
@@ -1108,7 +1225,7 @@ void Connection::ConnectionFlow() {
   if (io_buf_.InputLen() > 0) {
     phase_ = PROCESS;
     if (redis_parser_ && !ioloop_v2_) {
-      parse_status = ParseRedis(10000);
+      parse_status = ParseRedis(io_buf_, 10000);
    } else {
      parse_status = ParseLoop();
    }
@@ -1118,7 +1235,10 @@
   // Main loop.
if (parse_status != ERROR && !ec) { - UpdateIoBufCapacity(io_buf_, &conn_stats, [&]() { io_buf_.EnsureCapacity(64); }); + { + ReadBufTracker tracker(io_buf_); + io_buf_.EnsureCapacity(64); + } variant res; if (ioloop_v2_) { res = IoLoopV2(); @@ -1266,7 +1386,8 @@ void Connection::DispatchSingle(bool has_more, absl::FunctionRef invoke_ } } -Connection::ParserStatus Connection::ParseRedis(unsigned max_busy_cycles, bool enqueue_only) { +Connection::ParserStatus Connection::ParseRedis(base::IoBuf& io_buf, unsigned max_busy_cycles, + bool enqueue_only) { uint32_t consumed = 0; RespSrvParser::Result result = RespSrvParser::OK; @@ -1281,7 +1402,7 @@ Connection::ParserStatus Connection::ParseRedis(unsigned max_busy_cycles, bool e auto* cmd = std::exchange(parsed_cmd_, ptr.release()); EnqueueParsedCommand(cmd); }; - io::Bytes read_buffer = io_buf_.InputBuffer(); + io::Bytes read_buffer = io_buf.InputBuffer(); // Keep track of total bytes consumed/parsed. The do/while{} loop below preempts, // and InputBuffer() size might change between preemption points. There is a corner case, // that ConsumeInput() will strip a portion of the request which makes the test_publish_stuck @@ -1302,7 +1423,7 @@ Connection::ParserStatus Connection::ParseRedis(unsigned max_busy_cycles, bool e request_consumed_bytes_ = 0; bool has_more = consumed < read_buffer.size(); - if (tl_traffic_logger.log_file && IsMain() /* log only on the main interface */) { + if (tl_traffic_logger.log_file && tl_traffic_logger.listener_type == listener_type_) { LogTraffic(id_, has_more, *parsed_cmd_, service_->GetContextInfo(cc_.get())); } @@ -1336,7 +1457,7 @@ Connection::ParserStatus Connection::ParseRedis(unsigned max_busy_cycles, bool e } } while (RespSrvParser::OK == result && read_buffer.size() > 0 && !reply_builder_->GetError()); - io_buf_.ConsumeInput(total_consumed); + io_buf.ConsumeInput(total_consumed); parser_error_ = result; if (result == RespSrvParser::OK) @@ -1359,7 +1480,7 @@ auto Connection::ParseLoop() -> ParserStatus { bool commands_parsed = false; do { - commands_parsed = (this->*parse_func)(); + commands_parsed = (this->*parse_func)(io_buf_); if (!ExecuteBatch()) return ERROR; @@ -1476,10 +1597,10 @@ variant Connection::IoLoop() { } phase_ = PROCESS; - bool is_iobuf_full = io_buf_.AppendLen() == 0; + bool reached_capacity = io_buf_.AppendLen() == 0; if (redis_parser_) { - parse_status = ParseRedis(max_busy_read_cycles_cached); + parse_status = ParseRedis(io_buf_, max_busy_read_cycles_cached); } else { DCHECK(memcache_parser_); parse_status = ParseLoop(); @@ -1504,19 +1625,16 @@ variant Connection::IoLoop() { // (Note: The buffer object is only working in power-of-2 sizes, // so there's no danger of accidental O(n^2) behavior.) if (parser_hint > capacity) { - auto& conn_stats = GetLocalConnStats(); - UpdateIoBufCapacity(io_buf_, &conn_stats, - [&]() { io_buf_.Reserve(std::min(max_iobfuf_len, parser_hint)); }); + ReadBufTracker tracker(io_buf_); + io_buf_.Reserve(std::min(max_iobfuf_len, parser_hint)); } // If we got a partial request because iobuf was full, grow it up to // a reasonable limit to save on Recv() calls. - if (is_iobuf_full && capacity < max_iobfuf_len / 2) { - auto& conn_stats = GetLocalConnStats(); + if (reached_capacity && capacity < max_iobfuf_len / 2) { // Last io used most of the io_buf to the end. - UpdateIoBufCapacity(io_buf_, &conn_stats, [&]() { - io_buf_.Reserve(capacity * 2); // Valid growth range. - }); + ReadBufTracker tracker(io_buf_); + io_buf_.Reserve(capacity * 2); // Valid growth range. 
} if (io_buf_.AppendLen() == 0U) { @@ -2201,8 +2319,9 @@ void Connection::RequestAsyncMigration(util::fb2::ProactorBase* dest, bool force } } -void Connection::StartTrafficLogging(string_view path) { - OpenTrafficLogger(path); +Connection::StartTrafficResult Connection::StartTrafficLogging(string_view path, + ListenerType listener_type) { + return OpenTrafficLogger(path, listener_type); } void Connection::StopTrafficLogging() { @@ -2210,6 +2329,26 @@ void Connection::StopTrafficLogging() { tl_traffic_logger.ResetLocked(); } +void Connection::LogReplicaCommand(const cmn::BackedArguments& args, uint32_t db_index) { + // Contract: LSN/PING opcodes are filtered before ExecuteTx, and + // COMMAND/EXPIRED journal entries always carry at least a command name. + DCHECK(!args.empty()); + // Fast-path gate: cheap thread-local reads without the mutex. If the logger + // was swapped out concurrently, LogTrafficParts re-checks `log_file` inside + // the lock so at worst we do a bit of wasted work (building `parts`). + // id=0 is a synthetic client id — replication has no connection/client of its + // own, and callers on the same fiber serialise naturally. + if (!tl_traffic_logger.log_file || + tl_traffic_logger.listener_type != ListenerType::REPLICA_RESP) { + return; + } + absl::InlinedVector parts; + parts.reserve(args.size()); + for (auto v : args.view()) + parts.push_back(v); + LogTrafficParts(/*id=*/0, /*has_more=*/false, db_index, absl::MakeSpan(parts)); +} + bool Connection::IsHttp() const { return is_http_; } @@ -2285,7 +2424,7 @@ void Connection::BreakOnce(uint32_t ev_mask) { if (breaker_cb_) { DVLOG(1) << "[" << id_ << "] Connection::breaker_cb_ " << ev_mask; auto fun = std::move(breaker_cb_); - DCHECK(!breaker_cb_); + breaker_cb_ = nullptr; fun(ev_mask); } } @@ -2306,7 +2445,7 @@ bool Connection::IsReplySizeOverLimit() const { return over_limit; } -bool Connection::ParseRedisBatch() { +bool Connection::ParseRedisBatch(base::IoBuf& buf) { QueueBackpressure& qbp = GetQueueBackpressure(); // Only throttle parsing if this connection is actively contributing to the queue. 
@@ -2319,11 +2458,11 @@
     GetLocalConnStats().pipeline_throttle_count++;
     return false;
   }
-  return ParseRedis(max_busy_read_cycles_cached, true) == ParserStatus::OK;
+  return ParseRedis(buf, max_busy_read_cycles_cached, true) == ParserStatus::OK;
 }
 
-bool Connection::ParseMCBatch() {
-  CHECK(io_buf_.InputLen() > 0);
+bool Connection::ParseMCBatch(base::IoBuf& io_buf) {
+  CHECK(io_buf.InputLen() > 0);
 
   do {
     if (parsed_cmd_ == nullptr) {
@@ -2333,15 +2472,22 @@
     }
     uint32_t consumed = 0;
     memcache_parser_->set_last_unix_time(time(nullptr));
-    MemcacheParser::Result result = memcache_parser_->Parse(io::View(io_buf_.InputBuffer()),
+    MemcacheParser::Result result = memcache_parser_->Parse(io::View(io_buf.InputBuffer()),
                                                             &consumed, parsed_cmd_->mc_command());
-    io_buf_.ConsumeInput(consumed);
+    io_buf.ConsumeInput(consumed);
 
     DVLOG(2) << "mc_result " << unsigned(result) << " consumed: " << consumed << " type "
              << unsigned(parsed_cmd_->mc_command()->type);
 
     if (result == MemcacheParser::INPUT_PENDING)
       return false;
 
+    if (result == MemcacheParser::OK && tl_traffic_logger.log_file &&
+        tl_traffic_logger.listener_type == listener_type_) {
+      bool has_more = io_buf.InputLen() > 0;
+      LogMemcacheTraffic(id_, has_more, *parsed_cmd_->mc_command(),
+                         service_->GetContextInfo(cc_.get()));
+    }
+
     // We push the command to the parsed queue even in case of parse errors,
     // so that we can reply in order.
     EnqueueParsedCommand(parsed_cmd_);
@@ -2370,7 +2516,7 @@
         break;
       }
     }
-  } while (parsed_cmd_q_len_ < 128 && io_buf_.InputLen() > 0);
+  } while (parsed_cmd_q_len_ < 128 && io_buf.InputLen() > 0);
 
   return true;
 }
 
@@ -2453,12 +2599,12 @@ bool Connection::ExecuteBatch() {
 
 bool Connection::ReplyBatch() {
   reply_builder_->SetBatchMode(true);
-  while (HasDispatchedCommands() && parsed_head_->CanReply()) {
+  while (HasInFlightCommands() && parsed_head_->CanReply()) {
     current_wait_.reset();  // Clear the subscription before moving to the next command
     auto* cmd = parsed_head_;
     parsed_head_ = cmd->next;
     cmd->SendReply();
-    ReleaseParsedCommand(cmd, HasDispatchedCommands() /* is_pipelined */);
+    ReleaseParsedCommand(cmd, HasInFlightCommands() /* is_pipelined */);
     if (reply_builder_->GetError())
       return false;
   }
@@ -2473,7 +2619,18 @@
   }
 
   reply_builder_->SetBatchMode(false);
-  reply_builder_->Flush();
+
+  // V1 handles its pipeline batching inside AsyncFiber, so it flushes unconditionally here.
+  //
+  // V2 operates as a single-fiber event loop where reading, parsing, and executing happen
+  // sequentially. Because ParseLoop processes pipelines in chunks, flushing here would trigger a
+  // sendmsg syscall for every single chunk. Instead, V2 delegates flushing to IoLoopV2, which
+  // safely flushes the coalesced buffer right before the fiber yields (await) or when memory limits
+  // are reached.
+  if (!ioloop_v2_) {
+    reply_builder_->Flush();
+  }
+
   return !reply_builder_->GetError();
 }
 
@@ -2663,8 +2820,10 @@ void Connection::NotifyOnRecv(const util::FiberSocketBase::RecvNotification& n)
     pending_input_ = true;
   } else if (std::holds_alternative<io::MutableBytes>(n.read_result)) {  // provided buffer.
    io::MutableBytes buf = std::get<io::MutableBytes>(n.read_result);
-    UpdateIoBufCapacity(io_buf_, &tl_facade_stats->conn_stats,
-                        [&]() { io_buf_.WriteAndCommit(buf.data(), buf.size()); });
+    {
+      ReadBufTracker tracker(io_buf_);
+      io_buf_.WriteAndCommit(buf.data(), buf.size());
+    }
     last_interaction_ = time(nullptr);
   } else {
     LOG(FATAL) << "Should not reach here";
   }
@@ -2700,11 +2859,10 @@ void Connection::ReadPendingInput() {
   }
 }
 
-void Connection::CheckIoBufCapacity(bool is_iobuf_full) {
-  auto& conn_stats = tl_facade_stats->conn_stats;
+void Connection::CheckIoBufCapacity(bool reached_capacity, base::IoBuf* io_buf) {
   size_t max_io_buf_len = GetFlag(FLAGS_max_client_iobuf_len);
 
-  size_t capacity = io_buf_.Capacity();
+  size_t capacity = io_buf->Capacity();
   if (capacity < max_io_buf_len) {
     size_t parser_hint = 0;
     if (redis_parser_)
@@ -2716,23 +2874,22 @@ void Connection::CheckIoBufCapacity(bool is_iobuf_full) {
     // (Note: The buffer object is only working in power-of-2 sizes,
     // so there's no danger of accidental O(n^2) behavior.)
     if (parser_hint > capacity) {
-      UpdateIoBufCapacity(io_buf_, &conn_stats,
-                          [&]() { io_buf_.Reserve(std::min(max_io_buf_len, parser_hint)); });
+      ReadBufTracker tracker(*io_buf);
+      io_buf->Reserve(std::min(max_io_buf_len, parser_hint));
     }
 
     // If we got a partial request because iobuf was full, grow it up to
     // a reasonable limit to save on Recv() calls.
-    if (is_iobuf_full && capacity < max_io_buf_len / 2) {
+    if (reached_capacity && capacity < max_io_buf_len / 2) {
       // Last io used most of the io_buf to the end.
-      UpdateIoBufCapacity(io_buf_, &conn_stats, [&]() {
-        io_buf_.Reserve(capacity * 2);  // Valid growth range.
-      });
+      ReadBufTracker tracker(*io_buf);
+      io_buf->Reserve(capacity * 2);  // Valid growth range.
     }
 
-    if (io_buf_.AppendLen() == 0U) {
+    if (io_buf->AppendLen() == 0U) {
      // it can happen with memcached but not for RedisParser, because RedisParser fully
      // consumes the passed buffer
-      LOG_EVERY_T(WARNING, 10) << "Maximum io_buf length reached " << io_buf_.Capacity()
+      LOG_EVERY_T(WARNING, 10) << "Maximum io_buf length reached " << io_buf->Capacity()
                                << ", consider to increase max_client_iobuf_len flag";
     }
   }
@@ -2795,7 +2952,7 @@ variant<error_code, ParserStatus> Connection::IoLoopV2() {
   HandleMigrateRequest();
 
   // Register completion for current head if it's pending and we don't wait on current_wait_.
-  if (HasDispatchedCommands() && !current_wait_.has_value()) {
+  if (HasInFlightCommands() && !current_wait_.has_value()) {
     current_wait_.emplace(parsed_head_, &cmd_completion_waiter);
   }
 
@@ -2807,26 +2964,33 @@
   if (io_buf_.InputLen() == 0) {
     phase_ = READ_SOCKET;
 
+    // Flush replies deferred by ReplyBatch before sleeping - ensures the client
+    // gets its response even when no more data arrives (single commands, end of pipeline).
+    reply_builder_->Flush();
+    if (auto err = reply_builder_->GetError(); err) {
+      return err;
+    }
+
     io_event_.await([this, &is_ready_to_migrate]() {
       // TODO: optimize CanReply with looking up waiter key
       // io_buf_.InputLen() > 0 is still needed for multishot flow.
      // We wake up if:
      // 1. New data arrived or is pending (io_buf_.InputLen() > 0 || pending_input_).
-      // 2. A parsed command is ready to execute (HeadReadyToDispatch()).
+      // 2. A parsed command is ready to execute (HasCommandToExecute()).
      // 3. An executed command is ready to send its reply (parsed_head_ &&
      //    parsed_head_->CanReply()).
      // 4. Control-plane messages arrived (!dispatch_q_.empty()).
      // 5. The socket encountered an error/closed (io_ec_).
      // 6.
A migration to another thread was requested AND is actionable now (no subscriptions). - return io_buf_.InputLen() > 0 || pending_input_ || HeadReadyToDispatch() || + return io_buf_.InputLen() > 0 || pending_input_ || HasCommandToExecute() || (parsed_head_ && parsed_head_->CanReply()) || !dispatch_q_.empty() || io_ec_ || is_ready_to_migrate(); }); } phase_ = PROCESS; - bool is_iobuf_full = io_buf_.AppendLen() == 0; + bool reached_capacity = io_buf_.AppendLen() == 0; // Temporary: Handle dispatch queue items (Control Path) one by one blocking command execution if (!dispatch_q_.empty()) { @@ -2844,10 +3008,9 @@ variant Connection::IoLoopV2() { std::visit(AsyncOperations{reply_builder_.get(), this}, msg.handle); } - // TODO: Possibly don't flush unconditionally - optimize it - reply_builder_->Flush(); - if (auto ec = reply_builder_->GetError(); ec) - return ec; + // Note: No flush needed here: the `continue` below re-enters the loop, which either + // hits the data path (ParseLoop flushes via ReplyBatch) or the idle-await block + // (Flush 1), which always flushes before sleeping. // TODO: Properly handle backpressure GetQueueBackpressure().pubsub_ec.notifyAll(); @@ -2880,7 +3043,7 @@ variant Connection::IoLoopV2() { size_t mem_before = conn_stats.pipeline_queue_bytes; if (parsed_head_) { - if (HeadReadyToDispatch()) + if (HasCommandToExecute()) ExecuteBatch(); ReplyBatch(); } @@ -2916,6 +3079,12 @@ variant Connection::IoLoopV2() { // us "deaf" to future memory relief. auto sub_key = qbp.v2_pipeline_backpressure_ec.subscribe_persistent(&backpressure_waiter); + // Client needs replies to free its send buffer and relieve backpressure. + reply_builder_->Flush(); + if (auto err = reply_builder_->GetError(); err) { + return err; + } + io_event_.await([this, &is_ready_to_migrate]() { bool cmd_ready = parsed_head_ && parsed_head_->CanReply(); bool under_limit = !GetQueueBackpressure().IsPipelineBufferOverLimit( @@ -2939,6 +3108,10 @@ variant Connection::IoLoopV2() { // Check io_ec_ after parsing and flushing replies, so that half-closed // connections get their responses before we close. if (io_ec_) { + reply_builder_->Flush(); + if (auto err = reply_builder_->GetError(); err) { + return err; + } LOG_IF(WARNING, cntx()->replica_conn) << "async io error: " << io_ec_; return std::exchange(io_ec_, {}); } @@ -2949,12 +3122,18 @@ variant Connection::IoLoopV2() { // Migration requested and actionable: skip buffer bookkeeping, jump to HandleMigrateRequest(). if (is_ready_to_migrate()) { + // Flush before migrating: handing off unflushed thread-local buffers to a + // new thread will cause data corruption or a hard crash. + reply_builder_->Flush(); + if (auto err = reply_builder_->GetError(); err) { + return err; // Connection is dead, no point migrating it cross-thread. + } continue; } if (parse_status == NEED_MORE) { parse_status = OK; - CheckIoBufCapacity(is_iobuf_full); + CheckIoBufCapacity(reached_capacity, &io_buf_); } } while (peer->IsOpen()); diff --git a/src/facade/dragonfly_connection.h b/src/facade/dragonfly_connection.h index 3b6d1792f10a..7279030f1833 100644 --- a/src/facade/dragonfly_connection.h +++ b/src/facade/dragonfly_connection.h @@ -191,6 +191,24 @@ class Connection : public util::Connection { // This method returns true for customer facing listeners. bool IsMainOrMemcache() const; + // Classification of the traffic source for the DEBUG TRAFFIC recorder. 
+ // Persisted as the second byte of the file header in the on-disk format; + // the numeric values are part of that format — do not change them. + // MAIN_RESP / ADMIN_RESP / REPLICA_RESP all carry RESP-format commands; + // MAIN vs ADMIN differ by the port they were accepted on, while REPLICA + // covers commands that arrived on a replica via the replication stream + // (not from a client-facing listener). + enum class ListenerType : uint8_t { + MAIN_RESP = 1, // main RESP listener (TCP and unix-socket) + MEMCACHE = 2, // memcached protocol listener + ADMIN_RESP = 3, // privileged / admin listener (RESP protocol on admin port) + REPLICA_RESP = 4, // commands arriving on a replica from its master + }; + + ListenerType GetListenerType() const { + return listener_type_; + } + void SetName(std::string name); void SetLibName(std::string name); @@ -218,14 +236,32 @@ class Connection : public util::Connection { // and only when the flag --migrate_connections is true. void RequestAsyncMigration(util::fb2::ProactorBase* dest, bool force); + // Outcome of a StartTrafficLogging call on a single thread. + enum class StartTrafficResult : uint8_t { + kStarted, // new recording started successfully on this thread + kAlreadyLogging, // this thread already had an active recording (noop) + kOpenFailed, // failed to open the log file (or unsupported platform) + }; + // Starts traffic logging in the calling thread. Must be a proactor thread. - // Each thread creates its own log file combining requests from all the connections in - // that thread. A noop if the thread is already logging. - static void StartTrafficLogging(std::string_view base_path); + // Each thread creates its own log file containing requests from connections on + // that thread whose listener type equals `listener_type`. Exactly one listener + // kind per recording — mixing protocols in a single file is not supported. + static StartTrafficResult StartTrafficLogging(std::string_view base_path, + ListenerType listener_type); // Stops traffic logging in this thread. A noop if the thread is not logging. static void StopTrafficLogging(); + // Writes a single command to the per-thread traffic log if (and only if) the + // logger on this thread is currently recording the REPLICA_RESP source. + // Used by the replication read path on replicas to capture commands that + // arrived from the master — they do not travel through a Connection, so the + // regular per-connection hot path does not see them. + // `db_index` is the database the command should be applied to; it is stored + // in the record so replay tools can issue SELECT before dispatch. + static void LogReplicaCommand(const cmn::BackedArguments& args, uint32_t db_index); + // Get quick debug info for logs std::string DebugInfo() const; @@ -287,7 +323,7 @@ class Connection : public util::Connection { // Drains currently available bytes from socket into io_buf_ using non-blocking reads. void ReadPendingInput(); - void CheckIoBufCapacity(bool is_iobuf_full); + void CheckIoBufCapacity(bool reached_capacity, base::IoBuf* buf); // Main loop reading client messages and passing requests to dispatch queue. std::variant IoLoopV2(); @@ -319,7 +355,7 @@ class Connection : public util::Connection { // If add is true, stats are incremented, otherwise decremented. 
  void UpdateDispatchStats(const MessageHandle& msg, bool add);
 
-  ParserStatus ParseRedis(unsigned max_busy_cycles, bool enqueue_only = false);
+  ParserStatus ParseRedis(base::IoBuf& buf, unsigned max_busy_cycles, bool enqueue_only = false);
 
   void OnBreakCb(int32_t mask);
 
@@ -366,9 +402,13 @@ class Connection : public util::Connection {
   // Returns true if one or more commands were parsed from the read buffer,
   // and false if no complete commands could be parsed (for example, when
   // parsing is pending more input).
-  bool ParseMCBatch();
+  bool ParseMCBatch(base::IoBuf& buf);
 
-  bool ParseRedisBatch();
+  bool ParseRedisBatch(base::IoBuf& buf);
+
+  // Call the appropriate ParseMCBatch or ParseRedisBatch based on the protocol.
+  // Only CPU-bound work; must not perform I/O or fiber suspension.
+  void ParseFromBuffer(base::IoBuf& buf);
 
   // Call the appropriate ParseBatch function, then proceed with Execute and Reply while input remains
   ParserStatus ParseLoop();
 
@@ -453,13 +493,13 @@ class Connection : public util::Connection {
   size_t parsed_cmd_q_bytes_ = 0;
 
   // Returns true if there are dispatched commands that haven't been replied yet.
-  bool HasDispatchedCommands() const {
+  bool HasInFlightCommands() const {
     return parsed_head_ != parsed_to_execute_;
   }
 
-  // Returns true if the head command is ready to dispatch (nothing in-flight ahead of it).
-  bool HeadReadyToDispatch() const {
-    return parsed_head_ && !HasDispatchedCommands();
+  // Returns true if the head command is ready to execute (nothing in-flight ahead of it).
+  bool HasCommandToExecute() const {
+    return parsed_head_ && !HasInFlightCommands();
   }
 
   // Returns true if there are any commands pending in the parsed command queue or dispatch queue.
@@ -532,6 +572,8 @@ class Connection : public util::Connection {
     };
   };
 
+  ListenerType listener_type_ = ListenerType::MAIN_RESP;
+
   bool request_shutdown_ = false;
 };
 
diff --git a/src/facade/error.h b/src/facade/error.h
index aae0a72fc99e..88262358db30 100644
--- a/src/facade/error.h
+++ b/src/facade/error.h
@@ -52,4 +52,6 @@ inline constexpr char kRestrictDenied[] = "restrict_denied";
 inline constexpr char kNoGroupErrType[] = "no_group_error";
 inline constexpr char kNoAuthErrType[] = "no_auth";
 
+inline constexpr char kBloomFilterLoadInProgress[] = "bloom filter load in progress";
+
 }  // namespace facade
 
diff --git a/src/facade/facade_test.h b/src/facade/facade_test.h
index 76b6e090eea2..3394698b1f59 100644
--- a/src/facade/facade_test.h
+++ b/src/facade/facade_test.h
@@ -6,7 +6,7 @@
 #include
 
-#include
+#include
 
 #include
 #include
 
diff --git a/src/facade/memcache_parser.cc b/src/facade/memcache_parser.cc
index 6750c8a0fc11..841e3ae2bf22 100644
--- a/src/facade/memcache_parser.cc
+++ b/src/facade/memcache_parser.cc
@@ -94,6 +94,7 @@ MP::Result ParseStore(ArgSlice tokens, int64_t now, MP::Command* res, uint32_t m
     return MP::PARSE_ERROR;
   }
 
+  res->raw_expire_ts = expire_ts;
   res->expire_ts = ToAbsolute(expire_ts, now);
 
   if (res->type == MP::CAS && !absl::SimpleAtoi(tokens[4], &res->cas_unique)) {
@@ -126,6 +127,7 @@ MP::Result ParseValueless(ArgSlice tokens, int64_t now, MP::Command* res) {
     if (!absl::SimpleAtoi(tokens[0], &expire_ts)) {
       return MP::BAD_INT;
     }
+    res->raw_expire_ts = expire_ts;
     res->expire_ts = ToAbsolute(expire_ts, now);
     ++key_pos;
   }
@@ -277,6 +279,7 @@ MP::Result ParseMeta(ArgSlice tokens, int64_t now, MP::Command* res, uint32_t ma
       case 'T':
         if (!absl::SimpleAtoi(token.substr(1), &expire_ts))
           return MP::BAD_INT;
+        res->raw_expire_ts = expire_ts;
        res->expire_ts = ToAbsolute(expire_ts, now);
        if
(res->type == MP::GET) res->type = MP::GAT; @@ -473,4 +476,60 @@ auto MP::ConsumeValue(std::string_view str, uint32_t* consumed, Command* dest) - return val_len_to_read_ > 0 ? MP::INPUT_PENDING : MP::OK; } +// Inverse of the token map in From(): enum -> wire token. Only used by the +// traffic logger, which is off most of the time, so a switch is plenty. +string_view MP::CmdName(CmdType type) { + switch (type) { + case MP::SET: + return "set"sv; + case MP::ADD: + return "add"sv; + case MP::REPLACE: + return "replace"sv; + case MP::APPEND: + return "append"sv; + case MP::PREPEND: + return "prepend"sv; + case MP::CAS: + return "cas"sv; + case MP::GET: + return "get"sv; + case MP::GETS: + return "gets"sv; + case MP::GAT: + return "gat"sv; + case MP::GATS: + return "gats"sv; + case MP::STATS: + return "stats"sv; + case MP::INCR: + return "incr"sv; + case MP::DECR: + return "decr"sv; + case MP::DELETE: + return "delete"sv; + case MP::FLUSHALL: + return "flush_all"sv; + case MP::QUIT: + return "quit"sv; + case MP::VERSION: + return "version"sv; + case MP::META_NOOP: + return "mn"sv; + case MP::META_SET: + return "ms"sv; + case MP::META_DEL: + return "md"sv; + case MP::META_ARITHM: + return "ma"sv; + case MP::META_GET: + return "mg"sv; + case MP::META_DEBUG: + return "me"sv; + case MP::INVALID: + return ""sv; + } + return ""sv; +} + } // namespace facade diff --git a/src/facade/memcache_parser.h b/src/facade/memcache_parser.h index 68127ef0828b..ad008ec12c5f 100644 --- a/src/facade/memcache_parser.h +++ b/src/facade/memcache_parser.h @@ -90,6 +90,12 @@ class MemcacheParser { int64_t expire_ts = 0; // unix time (expire_ts > month) in seconds + // Original, pre-ToAbsolute exptime token as sent by the client. Kept so that + // tools/replay can reproduce the exact wire command: relative exptimes stay + // relative on replay (re-resolved against the replayer's "now"), absolute + // exptimes stay absolute. `expire_ts` above is always the absolutised form. + uint32_t raw_expire_ts = 0; + // flags for STORE commands uint32_t flags = 0; @@ -99,7 +105,7 @@ class MemcacheParser { cmn::BackedArguments* backed_args = nullptr; }; - static_assert(sizeof(Command) == 40); + static_assert(sizeof(Command) == 48); enum Result : uint8_t { OK, @@ -114,6 +120,11 @@ class MemcacheParser { return type >= SET && type <= CAS; } + // Returns the wire-protocol token for `type` (e.g. "set", "mg"), or an empty + // string_view for INVALID / unrecognized values. Used by the traffic logger + // so that the memcache command name does not need to be duplicated in callers. 
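+  // For example, CmdName(CmdType::GAT) returns "gat" and CmdName(CmdType::META_GET)
+  // returns "mg"; callers are expected to treat the empty result for INVALID as
+  // "nothing to log".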
+ static std::string_view CmdName(CmdType type); + size_t UsedMemory() const { return tmp_buf_.capacity(); } diff --git a/src/facade/op_status.cc b/src/facade/op_status.cc index e429d8b42ce4..049796776137 100644 --- a/src/facade/op_status.cc +++ b/src/facade/op_status.cc @@ -1,9 +1,19 @@ #include "facade/op_status.h" +#include + #include "base/logging.h" #include "facade/error.h" #include "facade/resp_expr.h" +namespace std { + +std::ostream& operator<<(std::ostream& os, facade::OpStatus op) { + return os << static_cast(op); +} + +} // namespace std + namespace facade { std::string_view StatusToMsg(OpStatus status) { @@ -44,6 +54,8 @@ std::string_view StatusToMsg(OpStatus status) { return kNanOrInfDuringIncr; case OpStatus::IO_ERROR: return kTieredIoError; + case OpStatus::BLOOM_FILTER_LOAD_IN_PROGRESS: + return kBloomFilterLoadInProgress; default: LOG(ERROR) << "Unsupported status " << status; return "Internal error"; diff --git a/src/facade/op_status.h b/src/facade/op_status.h index 1fedca1cb4df..939f06a187ea 100644 --- a/src/facade/op_status.h +++ b/src/facade/op_status.h @@ -5,7 +5,9 @@ #pragma once #include -#include +#include +#include +#include namespace facade { @@ -35,6 +37,7 @@ enum class OpStatus : uint16_t { INVALID_JSON, IO_ERROR, NAN_OR_INF_DURING_INCR, + BLOOM_FILTER_LOAD_IN_PROGRESS, }; class OpResultBase { @@ -127,14 +130,10 @@ std::string_view StatusToMsg(OpStatus status); namespace std { -template std::ostream& operator<<(std::ostream& os, const facade::OpResult& res) { - os << res.status(); - return os; -} +std::ostream& operator<<(std::ostream& os, facade::OpStatus op); -inline std::ostream& operator<<(std::ostream& os, const facade::OpStatus op) { - os << int(op); - return os; +template std::ostream& operator<<(std::ostream& os, const facade::OpResult& res) { + return os << res.status(); } } // namespace std diff --git a/src/facade/reply_builder.cc b/src/facade/reply_builder.cc index 90b04d900757..8e217b1d6441 100644 --- a/src/facade/reply_builder.cc +++ b/src/facade/reply_builder.cc @@ -134,6 +134,10 @@ void SinkReplyBuilder::WriteRef(std::string_view str) { } void SinkReplyBuilder::Flush(size_t expected_buffer_cap) { + // Fast path: nothing buffered and no buffer resize requested. 
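+  // With IoLoopV2 flushing eagerly (before awaits, before migration), many calls
+  // arrive here with nothing to send; returning early keeps those calls O(1).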
+ if (vecs_.empty() && (expected_buffer_cap == 0)) + return; + if (!vecs_.empty()) Send(); diff --git a/src/facade/reply_payload.h b/src/facade/reply_payload.h index b40741eaa8ed..a122178bdf49 100644 --- a/src/facade/reply_payload.h +++ b/src/facade/reply_payload.h @@ -27,7 +27,7 @@ struct BulkString : public std::string {}; // SendBulkString using Payload = std::variant>; -#ifdef __linux__ +#if defined(__linux__) && !defined(_LIBCPP_VERSION) static_assert(sizeof(Payload) == 40); #endif diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt index 8e3b3cc5fd92..f7bbd89d80b5 100644 --- a/src/server/CMakeLists.txt +++ b/src/server/CMakeLists.txt @@ -165,6 +165,7 @@ helio_cxx_test(cluster/cluster_config_test dfly_test_lib LABELS DFLY) helio_cxx_test(cluster/cluster_family_test dfly_test_lib LABELS DFLY) helio_cxx_test(acl/acl_family_test dfly_test_lib LABELS DFLY) helio_cxx_test(engine_shard_set_test dfly_test_lib LABELS DFLY) +helio_cxx_test(serializer_base_test dfly_test_lib LABELS DFLY) add_dependencies(check_dfly dragonfly_test json_family_test list_family_test generic_family_test memcache_parser_test rdb_test journal_test diff --git a/src/server/acl/acl_family.cc b/src/server/acl/acl_family.cc index 9d1a72d08d47..98f18320bf9f 100644 --- a/src/server/acl/acl_family.cc +++ b/src/server/acl/acl_family.cc @@ -147,6 +147,7 @@ void AclFamily::SetUser(CmdArgList args, CommandContext* cmd_cntx) { auto update_case = [username, ®, cmd_cntx, this, exists](User::UpdateRequest&& req) { auto& user = reg.registry[username]; + const User::MemoryUsage before = exists ? user.GetMemoryUsage() : User::MemoryUsage{}; if (!exists) { User::UpdateRequest default_req; default_req.updates = {User::UpdateRequest::CategoryValueType{User::Sign::MINUS, acl::ALL}}; @@ -155,6 +156,7 @@ void AclFamily::SetUser(CmdArgList args, CommandContext* cmd_cntx) { } const bool reset_channels = req.reset_channels; user.Update(std::move(req), CategoryToIdx(), reverse_cat_table_, CategoryToCommandsIndex()); + registry_->TrackUser(before, user.GetMemoryUsage(), !exists); // Send ok first because the connection might get evicted cmd_cntx->SendOk(); if (exists) { @@ -333,6 +335,7 @@ GenericError AclFamily::LoadToRegistryFromFile(std::string_view full_path, // Evict open connections for old users EvictOpenConnectionsOnAllProactorsWithRegistry(registry); registry.clear(); + registry_->ResetStats(); } for (size_t i = 0; i < usernames.size(); ++i) { @@ -343,12 +346,14 @@ GenericError AclFamily::LoadToRegistryFromFile(std::string_view full_path, CategoryToCommandsIndex()); user.Update(std::move(requests[i]), CategoryToIdx(), reverse_cat_table_, CategoryToCommandsIndex()); + registry_->TrackUser({}, user.GetMemoryUsage(), /*is_new=*/true); } if (!registry.contains("default")) { auto& user = registry["default"]; user.Update(registry_->DefaultUserUpdateRequest(), CategoryToIdx(), reverse_cat_table_, CategoryToCommandsIndex()); + registry_->TrackUser({}, user.GetMemoryUsage(), /*is_new=*/true); } return {}; diff --git a/src/server/acl/acl_family_test.cc b/src/server/acl/acl_family_test.cc index 870b1d750408..e58b3b3dc901 100644 --- a/src/server/acl/acl_family_test.cc +++ b/src/server/acl/acl_family_test.cc @@ -575,4 +575,49 @@ TEST_F(AclFamilyTest, TestAclLogUB) { EXPECT_THAT(resp, ErrArg("ERR index out of range")); } +TEST_F(AclFamilyTest, AclInfoMetrics) { + TestInitAclFam(); + + // After init only the default user exists. 
+  auto stats = GetMetrics().acl_stats;
+  EXPECT_EQ(stats.num_users, 1);
+  EXPECT_EQ(stats.num_passwords, 0);  // default user uses nopass
+  EXPECT_EQ(stats.num_key_globs, 0);
+  EXPECT_EQ(stats.num_pubsub_globs, 0);
+
+  // Add a new user with two passwords.
+  Run("ACL SETUSER alice >pass1 >pass2");
+  stats = GetMetrics().acl_stats;
+  EXPECT_EQ(stats.num_users, 2);
+  EXPECT_EQ(stats.num_passwords, 2);
+
+  // Adding a key glob is tracked in num_key_globs and key_globs_bytes.
+  Run("ACL SETUSER alice ~mykey*");
+  stats = GetMetrics().acl_stats;
+  EXPECT_EQ(stats.num_key_globs, 1);
+  EXPECT_EQ(stats.key_globs_bytes, std::string("mykey*").size());
+
+  // Adding a pubsub glob is tracked in num_pubsub_globs and pubsub_globs_bytes.
+  Run("ACL SETUSER alice &news.*");
+  stats = GetMetrics().acl_stats;
+  EXPECT_EQ(stats.num_pubsub_globs, 1);
+  EXPECT_EQ(stats.pubsub_globs_bytes, std::string("news.*").size());
+
+  // Removing a password is reflected immediately.
+  Run("ACL SETUSER alice <pass1");
+  stats = GetMetrics().acl_stats;
+  EXPECT_EQ(stats.num_passwords, 1);
+}
+
diff --git a/src/server/acl/user.h b/src/server/acl/user.h
index fb8daa418fc5..c107bf5f0717 100644
--- a/src/server/acl/user.h
+++ b/src/server/acl/user.h
@@ -134,6 +134,22 @@ class User final {
 
   const CommandChanges& CmdChanges() const;
 
+  // Per-user heap-allocated collection sizes, used by UserRegistry for aggregate stats.
+  struct MemoryUsage {
+    size_t num_passwords = 0;
+    size_t num_cat_changes = 0;
+    size_t num_cmd_changes = 0;
+    size_t num_key_globs = 0;
+    size_t key_globs_bytes = 0;  // total byte length of key glob strings
+    size_t num_pubsub_globs = 0;
+    size_t pubsub_globs_bytes = 0;  // total byte length of pubsub glob strings
+
+    MemoryUsage& operator+=(const MemoryUsage& u);
+    MemoryUsage& operator-=(const MemoryUsage& u);
+  };
+
+  MemoryUsage GetMemoryUsage() const;
+
  private:
   void SetAclCategoriesAndIncrSeq(uint32_t cat, const CategoryToIdxStore& cat_to_id,
                                   const ReverseCategoryIndexTable& reverse_cat,
 
diff --git a/src/server/acl/user_registry.cc b/src/server/acl/user_registry.cc
index efdc876ce1e8..915d1e533803 100644
--- a/src/server/acl/user_registry.cc
+++ b/src/server/acl/user_registry.cc
@@ -17,15 +17,75 @@ using namespace util;
 
 namespace dfly::acl {
 
+// SHA256 produces 32-byte binary hashes. Each is stored as a std::string in the
+// flat_hash_set, which exceeds SSO capacity and thus heap-allocates its content.
+static constexpr size_t kSHA256Bytes = 32;
+
+size_t UserRegistry::AclStats::TotalBytes() const {
+  // Fixed per-user cost: the User object itself plus the always-allocated commands_ vector.
+  const size_t per_user_base = sizeof(User) + NumberOfFamilies() * sizeof(uint64_t);
+
+  // Each password hash is a 32-byte binary string stored in an absl flat_hash_set.
+  // The std::string object lives inline in the set slot; the content (>SSO) is heap-allocated.
+  const size_t per_password = sizeof(std::string) + kSHA256Bytes;
+
+  // Category-change map entry: uint32_t key + ChangeMetadata value.
+  const size_t per_cat_change = sizeof(User::CategoryChange) + sizeof(User::ChangeMetadata);
+
+  // Command-change map entry: pair key + ChangeMetadata value.
+  const size_t per_cmd_change = sizeof(User::CommandChange) + sizeof(User::ChangeMetadata);
+
+  // Key-glob vector entry: pair object + any string content exceeding SSO.
+  const size_t per_key_glob = sizeof(std::pair<std::string, KeyOp>);
+
+  // PubSub-glob vector entry: pair object + any string content exceeding SSO.
+ const size_t per_pubsub_glob = sizeof(std::pair); + + return num_users * per_user_base + // + num_passwords * per_password + // + num_cat_changes * per_cat_change + // + num_cmd_changes * per_cmd_change + // + num_key_globs * per_key_glob + key_globs_bytes + // + num_pubsub_globs * per_pubsub_glob + pubsub_globs_bytes; // +} + +UserRegistry::AclStats UserRegistry::GetAclStats() const { + std::shared_lock lock(mu_); + return stats_; +} + +void UserRegistry::TrackUser(const User::MemoryUsage& before, const User::MemoryUsage& after, + bool is_new) { + if (is_new) + ++stats_.num_users; + stats_ -= before; + stats_ += after; +} + +void UserRegistry::ResetStats() { + stats_ = AclStats{}; +} + void UserRegistry::MaybeAddAndUpdate(std::string_view username, User::UpdateRequest req) { std::unique_lock lock(mu_); + const bool is_new = !registry_.contains(username); auto& user = registry_[username]; + + const User::MemoryUsage before = is_new ? User::MemoryUsage{} : user.GetMemoryUsage(); user.Update(std::move(req), *cat_to_id_table_, *reverse_cat_table_, *cat_to_commands_table_); + TrackUser(before, user.GetMemoryUsage(), is_new); } bool UserRegistry::RemoveUser(std::string_view username) { std::unique_lock lock(mu_); - return registry_.erase(username); + auto it = registry_.find(username); + if (it == registry_.end()) { + return false; + } + TrackUser(it->second.GetMemoryUsage(), {}, false); + --stats_.num_users; + registry_.erase(it); + return true; } UserCredentials UserRegistry::GetCredentials(std::string_view username) const { diff --git a/src/server/acl/user_registry.h b/src/server/acl/user_registry.h index 529ef8654ef8..6e38b18410a4 100644 --- a/src/server/acl/user_registry.h +++ b/src/server/acl/user_registry.h @@ -75,9 +75,30 @@ class UserRegistry { User::UpdateRequest DefaultUserUpdateRequest() const; + // Aggregate memory-usage stats across all users. Updated incrementally on every + // user creation, deletion, or mutation so that INFO ACL can read them lock-free. + struct AclStats : User::MemoryUsage { + size_t num_users = 0; + + // Estimated total bytes consumed by all ACL users (base structs + heap collections). + size_t TotalBytes() const; + }; + + // Returns a snapshot of the aggregate stats under the registry read lock. + AclStats GetAclStats() const; + + // Updates aggregate stats after a user mutation performed via GetRegistryWithWriteLock. + // Must be called while the write lock is held. `before` is zeroed for new users. + void TrackUser(const User::MemoryUsage& before, const User::MemoryUsage& after, bool is_new); + + // Resets aggregate stats to zero. Must be called while the write lock is held, + // immediately after registry.clear(). + void ResetStats(); + private: RegistryType registry_; mutable util::fb2::SharedMutex mu_; + AclStats stats_; // maintained under mu_ // Helper class for accessing the registry with a ReadLock outside the scope of UserRegistry template