Skip to content

Commit

Permalink
fix(run_unique_sequence): use same target node for each step
Browse files Browse the repository at this point in the history
When this nemesis runs with a multi-DC configuration, the
disrupt_mgmt_repair and disrupt_terminate_and_replace
methods could choose a target node from a different DC,
and the _shrink_cluster method periodically failed with this error:
```
File "/home/ubuntu/scylla-cluster-tests/sdcm/nemesis.py", line 4277, in _shrink_cluster
    raise Exception(error)
Exception: Not enough nodes for decommission
```

because target_node was chosen from a DC where no new node had been added
by the _grow_cluster method.

Additionally, set the nemesis to run only on data nodes.
  • Loading branch information
aleksbykov authored and fruch committed Dec 1, 2024
1 parent 5fb15f0 commit a8fb34a
Showing 1 changed file with 10 additions and 3 deletions.
13 changes: 10 additions & 3 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1645,7 +1645,9 @@ def _kubernetes_wait_till_node_up_after_been_recreated(self, node, old_uid=None)

@target_all_nodes
def disrupt_terminate_and_replace_node(self): # pylint: disable=invalid-name
self._terminate_and_replace_node()

def _terminate_and_replace_node(self):
def get_node_state(node_ip: str) -> List["str"] | None:
"""Gets node state by IP address from nodetool status response"""
status = self.cluster.get_nodetool_status()
Expand Down Expand Up @@ -4292,6 +4294,8 @@ def _shrink_cluster(self, rack=None, new_nodes: list[BaseNode] | None = None):
add_nodes_number = self.tester.params.get('nemesis_add_node_cnt')
InfoEvent(message=f'Start shrink cluster by {add_nodes_number} nodes').publish()
# Check that number of nodes is enough for decommission:
self.log.debug("Current target_node %s, is zero_node: %s, dc_idx: %s", self.target_node.name,
self.target_node._is_zero_token_node, self.target_node.dc_idx)
cur_num_nodes_in_dc = len([n for n in self.cluster.data_nodes if n.dc_idx == self.target_node.dc_idx])
initial_db_size = self.tester.params.get("n_db_nodes")
if self._is_it_on_kubernetes():
Expand Down Expand Up @@ -4535,15 +4539,18 @@ def disrupt_run_unique_sequence(self):
self.steady_state_latency()
self.has_steady_run = True
InfoEvent(message='StartEvent - start a repair by ScyllaManager').publish()
self.disrupt_mgmt_repair_cli()
InfoEvent(message='FinishEvent - Manager repair has finished').publish()
if self.cluster.params.get('use_mgmt') or self.cluster.params.get('use_cloud_manager'):
self._mgmt_repair_cli()
InfoEvent(message='FinishEvent - Manager repair has finished').publish()
else:
InfoEvent(message='FinishEvent - Manager repair was Skipped').publish()
time.sleep(sleep_time_between_ops)
InfoEvent(message='Starting grow disruption').publish()
self._grow_cluster(rack=None)
InfoEvent(message='Finished grow disruption').publish()
time.sleep(sleep_time_between_ops)
InfoEvent(message='Starting terminate_and_replace disruption').publish()
self.disrupt_terminate_and_replace_node()
self._terminate_and_replace_node()
InfoEvent(message='Finished terminate_and_replace disruption').publish()
time.sleep(sleep_time_between_ops)
InfoEvent(message='Starting shrink disruption').publish()
Expand Down

0 comments on commit a8fb34a

Please sign in to comment.