Skip to content

Commit 54c0d86

Browse files
authored
Merge pull request #13495 from mmaslankaprv/fix-internal-752
fixed tracking expected last offset of a follower
2 parents bc581cd + d0e45c0 commit 54c0d86

File tree

10 files changed

+53
-33
lines changed

10 files changed

+53
-33
lines changed

src/v/cluster/cluster_utils.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ partition_raft_state get_partition_raft_state(consensus_ptr ptr) {
332332
state.last_flushed_log_index = md.last_flushed_log_index;
333333
state.match_index = md.match_index;
334334
state.next_index = md.next_index;
335-
state.last_sent_offset = md.last_sent_offset;
335+
state.expected_log_end_offset = md.expected_log_end_offset;
336336
state.heartbeats_failed = md.heartbeats_failed;
337337
state.is_learner = md.is_learner;
338338
state.is_recovering = md.is_recovering;

src/v/cluster/types.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -3592,7 +3592,7 @@ struct partition_raft_state
35923592
model::offset last_dirty_log_index;
35933593
model::offset match_index;
35943594
model::offset next_index;
3595-
model::offset last_sent_offset;
3595+
model::offset expected_log_end_offset;
35963596
size_t heartbeats_failed;
35973597
bool is_learner;
35983598
uint64_t ms_since_last_heartbeat;
@@ -3609,7 +3609,7 @@ struct partition_raft_state
36093609
last_dirty_log_index,
36103610
match_index,
36113611
next_index,
3612-
last_sent_offset,
3612+
expected_log_end_offset,
36133613
heartbeats_failed,
36143614
is_learner,
36153615
ms_since_last_heartbeat,

src/v/raft/consensus.cc

+13-3
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,7 @@ consensus::success_reply consensus::update_follower_index(
449449
successfull_append_entries_reply(idx, std::move(reply));
450450
return success_reply::yes;
451451
} else {
452-
idx.last_sent_offset = idx.last_dirty_log_index;
452+
idx.expected_log_end_offset = model::offset{};
453453
}
454454

455455
if (idx.is_recovering) {
@@ -461,7 +461,6 @@ consensus::success_reply consensus::update_follower_index(
461461
idx.last_dirty_log_index = reply.last_dirty_log_index;
462462
idx.last_flushed_log_index = reply.last_flushed_log_index;
463463
idx.next_index = model::next_offset(idx.last_dirty_log_index);
464-
idx.last_sent_offset = model::offset{};
465464
}
466465
return success_reply::no;
467466
}
@@ -554,6 +553,13 @@ void consensus::successfull_append_entries_reply(
554553
idx.match_index = idx.last_dirty_log_index;
555554
idx.next_index = model::next_offset(idx.last_dirty_log_index);
556555
idx.last_successful_received_seq = idx.last_received_seq;
556+
/**
557+
* Update expected log end offset only if it is smaller than current value,
558+
* the check is needed here as there might be pending append entries
559+
* requests that were not yet replied by the follower.
560+
*/
561+
idx.expected_log_end_offset = std::max(
562+
idx.last_dirty_log_index, idx.expected_log_end_offset);
557563
vlog(
558564
_ctxlog.trace,
559565
"Updated node {} match {} and next {} indices",
@@ -588,7 +594,7 @@ void consensus::dispatch_recovery(follower_index_metadata& idx) {
588594
idx.next_index,
589595
log_max_offset);
590596
idx.next_index = log_max_offset;
591-
idx.last_sent_offset = model::offset{};
597+
idx.expected_log_end_offset = model::offset{};
592598
}
593599
idx.is_recovering = true;
594600
// background
@@ -1996,6 +2002,10 @@ consensus::do_append_entries(append_entries_request&& r) {
19962002
if (request_metadata.prev_log_index < last_log_offset) {
19972003
if (unlikely(request_metadata.prev_log_index < _commit_index)) {
19982004
reply.result = reply_result::success;
2005+
// clamp dirty offset to the current commit index not to allow
2006+
// leader reasoning about follower log beyond that point
2007+
reply.last_dirty_log_index = _commit_index;
2008+
reply.last_flushed_log_index = _commit_index;
19992009
vlog(
20002010
_ctxlog.info,
20012011
"Stale append entries request processed, entry is already "

src/v/raft/recovery_stm.cc

+22-16
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,13 @@ ss::future<> recovery_stm::do_recover(ss::io_priority_class iopc) {
132132
co_return;
133133
}
134134

135-
// wait for another round
136-
if (meta.value()->last_sent_offset >= lstats.dirty_offset) {
135+
/**
136+
* If expected_log_end_offset is indicating that all the requests were
137+
* already dispatched to the follower wait for append entries responses. The
138+
* responses will trigger the follower state condition variable and
139+
* recovery_stm will redo the check if follower still needs to be recovered.
140+
*/
141+
if (meta.value()->expected_log_end_offset >= lstats.dirty_offset) {
137142
co_await meta.value()
138143
->follower_state_change.wait()
139144
.handle_exception_type([this](const ss::broken_condition_variable&) {
@@ -160,16 +165,6 @@ ss::future<> recovery_stm::do_recover(ss::io_priority_class iopc) {
160165
meta = get_follower_meta();
161166
}
162167

163-
bool recovery_stm::state_changed() {
164-
auto meta = get_follower_meta();
165-
if (!meta) {
166-
return true;
167-
}
168-
auto lstats = _ptr->_log->offsets();
169-
return lstats.dirty_offset > meta.value()->last_dirty_log_index
170-
|| meta.value()->last_sent_offset == lstats.dirty_offset;
171-
}
172-
173168
flush_after_append
174169
recovery_stm::should_flush(model::offset follower_committed_match_index) const {
175170
constexpr size_t checkpoint_flush_size = 1_MiB;
@@ -319,7 +314,15 @@ ss::future<> recovery_stm::send_install_snapshot_request() {
319314
.dirty_offset = _ptr->dirty_offset()};
320315

321316
_sent_snapshot_bytes += chunk_size;
322-
317+
if (req.done) {
318+
auto meta = get_follower_meta();
319+
if (!meta) {
320+
// stop recovery when node was removed
321+
_stop_requested = true;
322+
return ss::make_ready_future<>();
323+
}
324+
(*meta)->expected_log_end_offset = _ptr->_last_snapshot_index;
325+
}
323326
vlog(_ctxlog.trace, "sending install_snapshot request: {}", req);
324327
auto hb_guard = _ptr->suppress_heartbeats(_node_id);
325328
return _ptr->_client_protocol
@@ -375,7 +378,6 @@ ss::future<> recovery_stm::handle_install_snapshot_reply(
375378
// snapshot received by the follower, continue with recovery
376379
(*meta)->match_index = _ptr->_last_snapshot_index;
377380
(*meta)->next_index = model::next_offset(_ptr->_last_snapshot_index);
378-
(*meta)->last_sent_offset = _ptr->_last_snapshot_index;
379381
return close_snapshot_reader();
380382
}
381383

@@ -444,7 +446,11 @@ ss::future<> recovery_stm::replicate(
444446
_stop_requested = true;
445447
return ss::now();
446448
}
447-
meta.value()->last_sent_offset = _last_batch_offset;
449+
/**
450+
* Update follower expected log end. It is equal to the last batch in a set
451+
* of batches read for this recovery round.
452+
*/
453+
meta.value()->expected_log_end_offset = _last_batch_offset;
448454
meta.value()->last_sent_protocol_meta = r.metadata();
449455
_ptr->update_node_append_timestamp(_node_id);
450456

@@ -493,7 +499,7 @@ ss::future<> recovery_stm::replicate(
493499
}
494500
meta.value()->next_index = std::max(
495501
model::offset(0), model::prev_offset(_base_batch_offset));
496-
meta.value()->last_sent_offset = model::offset{};
502+
497503
vlog(
498504
_ctxlog.trace,
499505
"Move next index {} backward",

src/v/raft/recovery_stm.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class recovery_stm {
4545
ss::future<> handle_install_snapshot_reply(result<install_snapshot_reply>);
4646
ss::future<> open_snapshot_reader();
4747
ss::future<> close_snapshot_reader();
48-
bool state_changed();
48+
4949
bool is_recovery_finished();
5050
flush_after_append should_flush(model::offset) const;
5151
consensus* _ptr;

src/v/raft/replicate_entries_stm.cc

+5-5
Original file line numberDiff line numberDiff line change
@@ -260,13 +260,13 @@ inline bool replicate_entries_stm::should_skip_follower_request(vnode id) {
260260
id);
261261
return true;
262262
}
263-
if (f_meta.last_sent_offset != _meta.prev_log_index) {
263+
if (f_meta.expected_log_end_offset != _meta.prev_log_index) {
264264
vlog(
265265
_ctxlog.trace,
266-
"Skipping sending append request to {} - last sent offset: {}, "
267-
"expected follower last offset: {}",
266+
"Skipping sending append request to {} - expected follower log "
267+
"end offset: {}, request expected last offset: {}",
268268
id,
269-
f_meta.last_sent_offset,
269+
f_meta.expected_log_end_offset,
270270
_meta.prev_log_index);
271271
return true;
272272
}
@@ -305,7 +305,7 @@ ss::future<result<replicate_result>> replicate_entries_stm::apply(units_t u) {
305305
if (rni != _ptr->self()) {
306306
auto it = _ptr->_fstats.find(rni);
307307
if (it != _ptr->_fstats.end()) {
308-
it->second.last_sent_offset = _dirty_offset;
308+
it->second.expected_log_end_offset = _dirty_offset;
309309
it->second.last_sent_protocol_meta = _meta;
310310
}
311311
}

src/v/raft/types.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ replicate_stages::replicate_stages(raft::errc ec)
139139
void follower_index_metadata::reset() {
140140
last_dirty_log_index = model::offset{};
141141
last_flushed_log_index = model::offset{};
142-
last_sent_offset = model::offset{};
142+
expected_log_end_offset = model::offset{};
143143
match_index = model::offset{};
144144
next_index = model::offset{};
145145
heartbeats_failed = 0;

src/v/raft/types.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,10 @@ struct follower_index_metadata {
109109
}
110110
// next index to send to this follower
111111
model::offset next_index;
112-
model::offset last_sent_offset;
112+
// field indicating end offset of follower log after current pending
113+
// append_entries_requests are successfully delivered and processed by the
114+
// follower.
115+
model::offset expected_log_end_offset;
113116
// timestamp of last append_entries_rpc call
114117
clock_type::time_point last_sent_append_entries_req_timestamp;
115118
clock_type::time_point last_received_reply_timestamp;

src/v/redpanda/admin/api-doc/debug.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -857,9 +857,9 @@
857857
"type": "long",
858858
"description": "Next index"
859859
},
860-
"last_sent_offset": {
860+
"expected_log_end_offset": {
861861
"type": "long",
862-
"description": "Last sent offset"
862+
"description": "Follower log end offset expected by the leader"
863863
},
864864
"heartbeats_failed": {
865865
"type": "long",

src/v/redpanda/admin_server.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -4086,7 +4086,8 @@ void fill_raft_state(
40864086
follower_state.last_dirty_log_index = f.last_dirty_log_index();
40874087
follower_state.match_index = f.match_index();
40884088
follower_state.next_index = f.next_index();
4089-
follower_state.last_sent_offset = f.last_sent_offset();
4089+
follower_state.expected_log_end_offset
4090+
= f.expected_log_end_offset();
40904091
follower_state.heartbeats_failed = f.heartbeats_failed;
40914092
follower_state.is_learner = f.is_learner;
40924093
follower_state.ms_since_last_heartbeat = f.ms_since_last_heartbeat;

0 commit comments

Comments
 (0)