Merge pull request #13495 from mmaslankaprv/fix-internal-752

mmaslankaprv · web-flow · commit 54c0d86213ba · 2023-09-20T09:16:36.000+02:00
fixed tracking expected last offset of a follower
diff --git a/src/v/cluster/cluster_utils.cc b/src/v/cluster/cluster_utils.cc
@@ -332,7 +332,7 @@ partition_raft_state get_partition_raft_state(consensus_ptr ptr) {
             state.last_flushed_log_index = md.last_flushed_log_index;
             state.match_index = md.match_index;
             state.next_index = md.next_index;
-            state.last_sent_offset = md.last_sent_offset;
+            state.expected_log_end_offset = md.expected_log_end_offset;
             state.heartbeats_failed = md.heartbeats_failed;
             state.is_learner = md.is_learner;
             state.is_recovering = md.is_recovering;
diff --git a/src/v/cluster/types.h b/src/v/cluster/types.h
@@ -3592,7 +3592,7 @@ struct partition_raft_state
         model::offset last_dirty_log_index;
         model::offset match_index;
         model::offset next_index;
-        model::offset last_sent_offset;
+        model::offset expected_log_end_offset;
         size_t heartbeats_failed;
         bool is_learner;
         uint64_t ms_since_last_heartbeat;
@@ -3609,7 +3609,7 @@ struct partition_raft_state
               last_dirty_log_index,
               match_index,
               next_index,
-              last_sent_offset,
+              expected_log_end_offset,
               heartbeats_failed,
               is_learner,
               ms_since_last_heartbeat,
diff --git a/src/v/raft/consensus.cc b/src/v/raft/consensus.cc
@@ -449,7 +449,7 @@ consensus::success_reply consensus::update_follower_index(
         successfull_append_entries_reply(idx, std::move(reply));
         return success_reply::yes;
     } else {
-        idx.last_sent_offset = idx.last_dirty_log_index;
+        idx.expected_log_end_offset = model::offset{};
     }
 
     if (idx.is_recovering) {
@@ -461,7 +461,6 @@ consensus::success_reply consensus::update_follower_index(
             idx.last_dirty_log_index = reply.last_dirty_log_index;
             idx.last_flushed_log_index = reply.last_flushed_log_index;
             idx.next_index = model::next_offset(idx.last_dirty_log_index);
-            idx.last_sent_offset = model::offset{};
         }
         return success_reply::no;
     }
@@ -554,6 +553,13 @@ void consensus::successfull_append_entries_reply(
     idx.match_index = idx.last_dirty_log_index;
     idx.next_index = model::next_offset(idx.last_dirty_log_index);
     idx.last_successful_received_seq = idx.last_received_seq;
+    /**
+     * Advance expected log end offset only if it is smaller than the
+     * follower's last dirty log index. It must not move backward here, as
+     * there may be pending append entries requests not yet replied to.
+     */
+    idx.expected_log_end_offset = std::max(
+      idx.last_dirty_log_index, idx.expected_log_end_offset);
     vlog(
       _ctxlog.trace,
       "Updated node {} match {} and next {} indices",
@@ -588,7 +594,7 @@ void consensus::dispatch_recovery(follower_index_metadata& idx) {
           idx.next_index,
           log_max_offset);
         idx.next_index = log_max_offset;
-        idx.last_sent_offset = model::offset{};
+        idx.expected_log_end_offset = model::offset{};
     }
     idx.is_recovering = true;
     // background
@@ -1996,6 +2002,10 @@ consensus::do_append_entries(append_entries_request&& r) {
     if (request_metadata.prev_log_index < last_log_offset) {
         if (unlikely(request_metadata.prev_log_index < _commit_index)) {
             reply.result = reply_result::success;
+            // clamp dirty and flushed offsets to the current commit index so
+            // the leader cannot reason about follower log beyond that point
+            reply.last_dirty_log_index = _commit_index;
+            reply.last_flushed_log_index = _commit_index;
             vlog(
               _ctxlog.info,
               "Stale append entries request processed, entry is already "
diff --git a/src/v/raft/recovery_stm.cc b/src/v/raft/recovery_stm.cc
@@ -132,8 +132,13 @@ ss::future<> recovery_stm::do_recover(ss::io_priority_class iopc) {
         co_return;
     }
 
-    // wait for another round
-    if (meta.value()->last_sent_offset >= lstats.dirty_offset) {
+    /**
+     * If expected_log_end_offset indicates that all the requests were already
+     * dispatched to the follower, wait for append entries responses. The
+     * responses will trigger the follower state condition variable and
+     * recovery_stm will recheck whether the follower still needs recovery.
+     */
+    if (meta.value()->expected_log_end_offset >= lstats.dirty_offset) {
         co_await meta.value()
           ->follower_state_change.wait()
           .handle_exception_type([this](const ss::broken_condition_variable&) {
@@ -160,16 +165,6 @@ ss::future<> recovery_stm::do_recover(ss::io_priority_class iopc) {
     meta = get_follower_meta();
 }
 
-bool recovery_stm::state_changed() {
-    auto meta = get_follower_meta();
-    if (!meta) {
-        return true;
-    }
-    auto lstats = _ptr->_log->offsets();
-    return lstats.dirty_offset > meta.value()->last_dirty_log_index
-           || meta.value()->last_sent_offset == lstats.dirty_offset;
-}
-
 flush_after_append
 recovery_stm::should_flush(model::offset follower_committed_match_index) const {
     constexpr size_t checkpoint_flush_size = 1_MiB;
@@ -319,7 +314,15 @@ ss::future<> recovery_stm::send_install_snapshot_request() {
             .dirty_offset = _ptr->dirty_offset()};
 
           _sent_snapshot_bytes += chunk_size;
-
+          if (req.done) {
+              auto meta = get_follower_meta();
+              if (!meta) {
+                  // stop recovery when node was removed
+                  _stop_requested = true;
+                  return ss::make_ready_future<>();
+              }
+              (*meta)->expected_log_end_offset = _ptr->_last_snapshot_index;
+          }
           vlog(_ctxlog.trace, "sending install_snapshot request: {}", req);
           auto hb_guard = _ptr->suppress_heartbeats(_node_id);
           return _ptr->_client_protocol
@@ -375,7 +378,6 @@ ss::future<> recovery_stm::handle_install_snapshot_reply(
     // snapshot received by the follower, continue with recovery
     (*meta)->match_index = _ptr->_last_snapshot_index;
     (*meta)->next_index = model::next_offset(_ptr->_last_snapshot_index);
-    (*meta)->last_sent_offset = _ptr->_last_snapshot_index;
     return close_snapshot_reader();
 }
 
@@ -444,7 +446,11 @@ ss::future<> recovery_stm::replicate(
         _stop_requested = true;
         return ss::now();
     }
-    meta.value()->last_sent_offset = _last_batch_offset;
+    /**
+     * Update follower expected log end. It is equal to the last batch in a set
+     * of batches read for this recovery round.
+     */
+    meta.value()->expected_log_end_offset = _last_batch_offset;
     meta.value()->last_sent_protocol_meta = r.metadata();
     _ptr->update_node_append_timestamp(_node_id);
 
@@ -493,7 +499,7 @@ ss::future<> recovery_stm::replicate(
               }
               meta.value()->next_index = std::max(
                 model::offset(0), model::prev_offset(_base_batch_offset));
-              meta.value()->last_sent_offset = model::offset{};
+
               vlog(
                 _ctxlog.trace,
                 "Move next index {} backward",
diff --git a/src/v/raft/recovery_stm.h b/src/v/raft/recovery_stm.h
@@ -45,7 +45,7 @@ class recovery_stm {
     ss::future<> handle_install_snapshot_reply(result<install_snapshot_reply>);
     ss::future<> open_snapshot_reader();
     ss::future<> close_snapshot_reader();
-    bool state_changed();
+
     bool is_recovery_finished();
     flush_after_append should_flush(model::offset) const;
     consensus* _ptr;
diff --git a/src/v/raft/replicate_entries_stm.cc b/src/v/raft/replicate_entries_stm.cc
@@ -260,13 +260,13 @@ inline bool replicate_entries_stm::should_skip_follower_request(vnode id) {
               id);
             return true;
         }
-        if (f_meta.last_sent_offset != _meta.prev_log_index) {
+        if (f_meta.expected_log_end_offset != _meta.prev_log_index) {
             vlog(
               _ctxlog.trace,
-              "Skipping sending append request to {} - last sent offset: {}, "
-              "expected follower last offset: {}",
+              "Skipping sending append request to {} - expected follower log "
+              "end offset: {}, request expected last offset: {}",
               id,
-              f_meta.last_sent_offset,
+              f_meta.expected_log_end_offset,
               _meta.prev_log_index);
             return true;
         }
@@ -305,7 +305,7 @@ ss::future<result<replicate_result>> replicate_entries_stm::apply(units_t u) {
         if (rni != _ptr->self()) {
             auto it = _ptr->_fstats.find(rni);
             if (it != _ptr->_fstats.end()) {
-                it->second.last_sent_offset = _dirty_offset;
+                it->second.expected_log_end_offset = _dirty_offset;
                 it->second.last_sent_protocol_meta = _meta;
             }
         }
diff --git a/src/v/raft/types.cc b/src/v/raft/types.cc
@@ -139,7 +139,7 @@ replicate_stages::replicate_stages(raft::errc ec)
 void follower_index_metadata::reset() {
     last_dirty_log_index = model::offset{};
     last_flushed_log_index = model::offset{};
-    last_sent_offset = model::offset{};
+    expected_log_end_offset = model::offset{};
     match_index = model::offset{};
     next_index = model::offset{};
     heartbeats_failed = 0;
diff --git a/src/v/raft/types.h b/src/v/raft/types.h
@@ -109,7 +109,10 @@ struct follower_index_metadata {
     }
     // next index to send to this follower
     model::offset next_index;
-    model::offset last_sent_offset;
+    // field indicating end offset of follower log after current pending
+    // append_entries_requests are successfully delivered and processed by the
+    // follower.
+    model::offset expected_log_end_offset;
     // timestamp of last append_entries_rpc call
     clock_type::time_point last_sent_append_entries_req_timestamp;
     clock_type::time_point last_received_reply_timestamp;
diff --git a/src/v/redpanda/admin/api-doc/debug.json b/src/v/redpanda/admin/api-doc/debug.json
@@ -857,9 +857,9 @@
                     "type": "long",
                     "description": "Next index"
                 },
-                "last_sent_offset": {
+                "expected_log_end_offset": {
                     "type": "long",
-                    "description": "Last sent offset"
+                    "description": "Follower log end offset expected by the leader"
                 },
                 "heartbeats_failed": {
                     "type": "long",
diff --git a/src/v/redpanda/admin_server.cc b/src/v/redpanda/admin_server.cc
@@ -4086,7 +4086,8 @@ void fill_raft_state(
             follower_state.last_dirty_log_index = f.last_dirty_log_index();
             follower_state.match_index = f.match_index();
             follower_state.next_index = f.next_index();
-            follower_state.last_sent_offset = f.last_sent_offset();
+            follower_state.expected_log_end_offset
+              = f.expected_log_end_offset();
             follower_state.heartbeats_failed = f.heartbeats_failed;
             follower_state.is_learner = f.is_learner;
             follower_state.ms_since_last_heartbeat = f.ms_since_last_heartbeat;