Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2024.1] fix(nemesis.py): log nemesis start/end on db nodes logs #9797

Draft
wants to merge 1 commit into
base: branch-2024.1
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions sdcm/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import itertools
import json
import ipaddress
import shlex
from importlib import import_module
from typing import List, Optional, Dict, Union, Set, Iterable, ContextManager, Any, IO, AnyStr
from datetime import datetime, timezone
Expand Down Expand Up @@ -3062,6 +3063,26 @@ def wait_node_fully_start(self, verbose=True, timeout=3600):
self.log.info('Waiting for native_transport to be ready')
self.wait_native_transport()

def log_message(self, message: str, level: str = 'info', verbose: bool = False) -> None:
    """Write ``message`` into this node's log by running ``scylla-api-client`` on it.

    :param message: text to log; shell-quoted before being placed on the command line.
    :param level: value passed to the ``--level`` option.
    :param verbose: forwarded unchanged to ``self.remoter.run``.
    """
    # Quote both caller-supplied values so spaces or shell metacharacters
    # cannot split or alter the remote command.
    command = (
        'scylla-api-client system log POST '
        f'--level {shlex.quote(level)} --message {shlex.quote(message)}'
    )
    self.remoter.run(command, verbose=verbose)


class FlakyRetryPolicy(RetryPolicy):

Expand Down Expand Up @@ -4892,6 +4913,30 @@ def get_db_nodes_cpu_mode(self):
self.log.info("DB nodes CPU modes: %s", results)
return results

def log_message(self, message: str, level: str = 'info', verbose: bool = False) -> None:
    """Write ``message`` into the log of every node in ``self.nodes``.

    :param message: text to log on each node.
    :param level: log level forwarded to each node's ``log_message``.
    :param verbose: forwarded to each node's ``log_message``.
    """
    # Nodes are visited sequentially, in the order ``self.nodes`` yields them.
    for node in self.nodes:
        node.log_message(message, level, verbose)


class BaseLoaderSet():

Expand Down
36 changes: 32 additions & 4 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1844,11 +1844,9 @@ def call_random_disrupt_method(self, disrupt_methods=None, predefined_sequence=F

def execute_disrupt_method(self, disrupt_method):
    """Run one disruption callable, bracketed by log lines and metrics events.

    :param disrupt_method: zero-argument callable; its ``__name__`` with every
        ``disrupt_`` occurrence removed is used for logging and metrics.
    """
    disrupt_method_name = disrupt_method.__name__.replace('disrupt_', '')
    # Pass the name as a logger argument so formatting is deferred to the logger.
    self.log.info(">>>>>>>>>>>>>Started random_disrupt_method %s", disrupt_method_name)
    self.metrics_srv.event_start(disrupt_method_name)
    try:
        disrupt_method()
        # Only reached when the disruption returned without raising.
        self.log.info("<<<<<<<<<<<<<Finished random_disrupt_method %s", disrupt_method_name)
    finally:
        # Always pair event_stop with event_start, even if the disruption failed.
        self.metrics_srv.event_stop(disrupt_method_name)

Expand Down Expand Up @@ -5096,6 +5094,13 @@ def argus_finalize_nemesis_info(nemesis: Nemesis, method_name: str, start_time:
except Exception: # pylint: disable=broad-except
nemesis.log.error("Error finalizing nemesis information in Argus", exc_info=True)

def get_nemesis_status(nemesis_event: DisruptionEvent) -> str:
    """Map a disruption event to a ``NemesisStatus`` value.

    An event with ``Severity.ERROR`` is reported as failed; otherwise a skipped
    event is reported as skipped; anything else is reported as succeeded.
    """
    if nemesis_event.severity == Severity.ERROR:
        status = NemesisStatus.FAILED
    elif nemesis_event.is_skipped:
        status = NemesisStatus.SKIPPED
    else:
        status = NemesisStatus.SUCCEEDED
    return status

def data_validation_prints(args):
try:
if hasattr(args[0].tester, 'data_validator') and args[0].tester.data_validator:
Expand All @@ -5113,7 +5118,7 @@ def data_validation_prints(args):
args[0].log.debug(f'Data validator error: {err}')

@wraps(method)
def wrapper(*args, **kwargs): # pylint: disable=too-many-statements # noqa: PLR0914
def wrapper(*args, **kwargs): # pylint: disable=too-many-statements # noqa: PLR0914, PLR0915
# pylint: disable=too-many-locals
# pylint: disable=too-many-branches
method_name = method.__name__
Expand All @@ -5128,13 +5133,30 @@ def wrapper(*args, **kwargs): # pylint: disable=too-many-statements # noqa: PL
# NOTE: exclusive nemesis will wait before the end of all other ones
time.sleep(10)

<<<<<<< HEAD
current_disruption = "".join(p.capitalize() for p in method_name.replace("disrupt_", "").split("_"))
args[0].set_target_node(current_disruption=current_disruption)

||||||| parent of da19d860a (fix(nemesis.py): log nemesis start/end on db nodes logs)
current_disruption = "".join(p.capitalize() for p in method_name.replace("disrupt_", "").split("_"))
args[0].set_target_node_pool_type(target_pool_type)
args[0].set_target_node(current_disruption=current_disruption)

=======
>>>>>>> da19d860a (fix(nemesis.py): log nemesis start/end on db nodes logs)
args[0].cluster.check_cluster_health()
num_nodes_before = len(args[0].cluster.nodes)
start_time = time.time()
args[0].log.debug('Start disruption at `%s`', datetime.datetime.fromtimestamp(start_time))

current_disruption = "".join(p.capitalize() for p in method_name.replace("disrupt_", "").split("_"))
args[0].set_target_node_pool_type(target_pool_type)
args[0].set_target_node(current_disruption=current_disruption)
start_msg = (f"Started disruption {method_name} ({current_disruption} nemesis) on the target node "
f"'{str(args[0].target_node)}'")
args[0].log.debug("{start_symbol} {msg} {start_symbol}".format(start_symbol='>' * 12, msg=start_msg))
args[0].cluster.log_message(
"{start_symbol} {msg} {start_symbol}".format(start_symbol='=' * 12, msg=start_msg))

class_name = args[0].get_class_name()
if class_name.find('Chaos') < 0:
args[0].metrics_srv.event_start(class_name)
Expand Down Expand Up @@ -5217,6 +5239,12 @@ def wrapper(*args, **kwargs): # pylint: disable=too-many-statements # noqa: PL
argus_finalize_nemesis_info(nemesis=args[0], method_name=method_name, start_time=int(
start_time), nemesis_event=nemesis_event)

end_msg = (f"Finished disruption {method_name} ({current_disruption} nemesis) with status "
f"'{get_nemesis_status(nemesis_event)}'")
args[0].log.debug("{end_symbol} {msg} {end_symbol}".format(end_symbol='<' * 12, msg=end_msg))
args[0].cluster.log_message(
"{end_symbol} {msg} {end_symbol}".format(end_symbol='=' * 12, msg=end_msg))

args[0].cluster.check_cluster_health()
num_nodes_after = len(args[0].cluster.nodes)
if num_nodes_before != num_nodes_after:
Expand Down
26 changes: 26 additions & 0 deletions unit_tests/test_nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ class Node:
def scylla_shards(self):
return 8

def log_message(self, *args, **kwargs):
    """Test stub: accept any arguments and do nothing."""


@dataclass
class Cluster:
Expand All @@ -39,6 +42,29 @@ class Cluster:
def check_cluster_health(self):
    """Test stub: perform no health check and return ``None``."""

def log_message(self, *args, **kwargs):
    """Test stub: accept any log-message arguments and discard them."""


@dataclass
class FakeTester:
Expand Down
Loading