Skip to content

Commit

Permalink
Merge pull request #23933 from ztlpn/v24.2.x-bp
Browse files Browse the repository at this point in the history
[v24.2.x] Fix ghost node handling in raft heartbeats
  • Loading branch information
mmaslankaprv authored Oct 29, 2024
2 parents 97a466b + 88a682e commit a18efa7
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 13 deletions.
4 changes: 2 additions & 2 deletions src/v/raft/consensus.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3955,7 +3955,7 @@ reply_result consensus::lightweight_heartbeat(
target_node,
_self,
source_node);
- return reply_result::failure;
+ return reply_result::group_unavailable;
}

/**
Expand Down Expand Up @@ -4010,7 +4010,7 @@ ss::future<full_heartbeat_reply> consensus::full_heartbeat(
target_vnode,
_self,
source_vnode);
- reply.result = reply_result::failure;
+ reply.result = reply_result::group_unavailable;
co_return reply;
}
/**
Expand Down
11 changes: 11 additions & 0 deletions src/v/raft/heartbeat_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,17 @@ void heartbeat_manager::process_reply(
return;
}
auto& reply = r.value();

+ if (reply.source() != n) {
+ vlog(
+ raftlog.warn,
+ "got heartbeat reply from a different node id {} (expected {}), "
+ "ignoring",
+ reply.source(),
+ n);
+ return;
+ }

reply.for_each_lw_reply([this, n, target = reply.target(), &groups](
group_id group, reply_result result) {
auto it = _consensus_groups.find(group);
Expand Down
17 changes: 6 additions & 11 deletions tests/rptest/tests/admin_uuid_operations_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,14 +276,12 @@ def test_force_uuid_override(self, mode):
backoff_sec=2,
err_msg=f"{to_stop.name} did not take the UUID override")

- self.logger.debug(f"Wait for the cluster to become healthy...")
+ self.logger.debug(f"Decommission ghost node [{ghost_node_id}]...")
+ self._decommission(ghost_node_id)

+ self.logger.debug(f"...and wait for the cluster to become healthy.")
self.wait_until_cluster_healthy(timeout_sec=30)

- self.logger.debug(
- f".. and decommission ghost node [{ghost_node_id}]...")
- self._decommission(ghost_node_id)

self.logger.debug(
"Check that all this state sticks across a rolling restart")

Expand Down Expand Up @@ -373,14 +371,11 @@ def test_force_uuid_override_multinode(self, mode):
auto_assign_node_id=True,
)

- self.logger.debug("Wait for the cluster to become healthy...")
+ self.logger.debug(f"Decommission ghost node [{ghost_node_id}]...")
+ self._decommission(ghost_node_id)

+ self.logger.debug("...and wait for the cluster to become healthy.")
controller_leader = self.wait_until_cluster_healthy(timeout_sec=30)

assert controller_leader is not None, "Didn't elect a controller leader"
assert controller_leader not in to_stop, f"Unexpected controller leader {controller_leader.account.hostname}"

- self.logger.debug(
- f"...and decommission ghost node [{ghost_node_id}]...")
-
- self._decommission(ghost_node_id)

0 comments on commit a18efa7

Please sign in to comment.