Skip to content

Commit d7a656e

Browse files
authored
Merge pull request #22993 from bharathv/241x_sync_race
[backport] [v24.1.x] rm_stm: fix race during leadership transfer
2 parents 78d2169 + 12df2f9 commit d7a656e

File tree

2 files changed

+29
-24
lines changed

2 files changed

+29
-24
lines changed

src/v/cluster/rm_stm.cc

+22-19
Original file line numberDiff line numberDiff line change
@@ -974,19 +974,12 @@ rm_stm::do_mark_expired(model::producer_identity pid) {
974974
co_return std::error_code(co_await do_try_abort_old_tx(pid));
975975
}
976976

977-
ss::future<result<kafka_result>> rm_stm::do_sync_and_transactional_replicate(
977+
ss::future<result<kafka_result>> rm_stm::transactional_replicate(
978+
model::term_id synced_term,
978979
producer_ptr producer,
979980
model::batch_identity bid,
980981
model::record_batch_reader rdr,
981982
ssx::semaphore_units units) {
982-
if (!co_await sync(_sync_timeout)) {
983-
vlog(
984-
_ctx_log.trace,
985-
"processing name:replicate_tx pid:{} => stale leader",
986-
bid.pid);
987-
co_return errc::not_leader;
988-
}
989-
auto synced_term = _insync_term;
990983
auto result = co_await do_transactional_replicate(
991984
synced_term, producer, bid, std::move(rdr));
992985
if (!result) {
@@ -1105,31 +1098,34 @@ ss::future<result<kafka_result>> rm_stm::transactional_replicate(
11051098
if (!check_tx_permitted()) {
11061099
co_return errc::generic_tx_error;
11071100
}
1101+
if (!co_await sync(_sync_timeout)) {
1102+
vlog(
1103+
_ctx_log.trace,
1104+
"processing name:replicate_tx pid:{} => stale leader",
1105+
bid.pid);
1106+
co_return errc::not_leader;
1107+
}
1108+
auto synced_term = _insync_term;
11081109
auto [producer, _] = maybe_create_producer(bid.pid);
11091110
co_return co_await producer
11101111
->run_with_lock([&](ssx::semaphore_units units) {
1111-
return do_sync_and_transactional_replicate(
1112-
producer, bid, std::move(rdr), std::move(units));
1112+
return transactional_replicate(
1113+
synced_term, producer, bid, std::move(rdr), std::move(units));
11131114
})
11141115
.finally([this, producer] {
11151116
_producer_state_manager.local().touch(*producer, _vcluster_id);
11161117
});
11171118
}
11181119

1119-
ss::future<result<kafka_result>> rm_stm::do_sync_and_idempotent_replicate(
1120+
ss::future<result<kafka_result>> rm_stm::idempotent_replicate(
1121+
model::term_id synced_term,
11201122
producer_ptr producer,
11211123
model::batch_identity bid,
11221124
model::record_batch_reader br,
11231125
raft::replicate_options opts,
11241126
ss::lw_shared_ptr<available_promise<>> enqueued,
11251127
ssx::semaphore_units units,
11261128
producer_previously_known known_producer) {
1127-
if (!co_await sync(_sync_timeout)) {
1128-
// it's ok not to set enqueued on early return because
1129-
// the safety check in replicate_in_stages sets it automatically
1130-
co_return errc::not_leader;
1131-
}
1132-
auto synced_term = _insync_term;
11331129
auto result = co_await do_idempotent_replicate(
11341130
synced_term,
11351131
producer,
@@ -1256,10 +1252,17 @@ ss::future<result<kafka_result>> rm_stm::idempotent_replicate(
12561252
raft::replicate_options opts,
12571253
ss::lw_shared_ptr<available_promise<>> enqueued) {
12581254
try {
1255+
if (!co_await sync(_sync_timeout)) {
1256+
// it's ok not to set enqueued on early return because
1257+
// the safety check in replicate_in_stages sets it automatically
1258+
co_return errc::not_leader;
1259+
}
1260+
auto synced_term = _insync_term;
12591261
auto [producer, known_producer] = maybe_create_producer(bid.pid);
12601262
co_return co_await producer
12611263
->run_with_lock([&, known_producer](ssx::semaphore_units units) {
1262-
return do_sync_and_idempotent_replicate(
1264+
return idempotent_replicate(
1265+
synced_term,
12631266
producer,
12641267
bid,
12651268
std::move(br),

src/v/cluster/rm_stm.h

+7-5
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,8 @@ class rm_stm final : public raft::persisted_stm<> {
313313
ss::future<result<kafka_result>> transactional_replicate(
314314
model::batch_identity, model::record_batch_reader);
315315

316-
ss::future<result<kafka_result>> do_sync_and_transactional_replicate(
316+
ss::future<result<kafka_result>> transactional_replicate(
317+
model::term_id,
317318
producer_ptr,
318319
model::batch_identity,
319320
model::record_batch_reader,
@@ -331,23 +332,24 @@ class rm_stm final : public raft::persisted_stm<> {
331332
raft::replicate_options,
332333
ss::lw_shared_ptr<available_promise<>>);
333334

334-
ss::future<result<kafka_result>> do_idempotent_replicate(
335+
ss::future<result<kafka_result>> idempotent_replicate(
335336
model::term_id,
336337
producer_ptr,
337338
model::batch_identity,
338339
model::record_batch_reader,
339340
raft::replicate_options,
340341
ss::lw_shared_ptr<available_promise<>>,
341-
ssx::semaphore_units&,
342+
ssx::semaphore_units,
342343
producer_previously_known);
343344

344-
ss::future<result<kafka_result>> do_sync_and_idempotent_replicate(
345+
ss::future<result<kafka_result>> do_idempotent_replicate(
346+
model::term_id,
345347
producer_ptr,
346348
model::batch_identity,
347349
model::record_batch_reader,
348350
raft::replicate_options,
349351
ss::lw_shared_ptr<available_promise<>>,
350-
ssx::semaphore_units,
352+
ssx::semaphore_units&,
351353
producer_previously_known);
352354

353355
ss::future<result<kafka_result>> replicate_msg(

0 commit comments

Comments
 (0)