diff --git a/configurations/enable_force_gossip_changes_on_upgrade.yaml b/configurations/enable_force_gossip_changes_on_upgrade.yaml new file mode 100644 index 00000000000..f86c7f334ce --- /dev/null +++ b/configurations/enable_force_gossip_changes_on_upgrade.yaml @@ -0,0 +1 @@ +enable_force_gossip_topology_changes_on_upgrade: true diff --git a/configurations/enable_tablets_on_upgrade.yaml b/configurations/enable_tablets_on_upgrade.yaml new file mode 100644 index 00000000000..c675458b5af --- /dev/null +++ b/configurations/enable_tablets_on_upgrade.yaml @@ -0,0 +1 @@ +enable_tablets_on_upgrade: true diff --git a/configurations/force_gossip_topology_changes.yaml b/configurations/force_gossip_topology_changes.yaml new file mode 100644 index 00000000000..d82b40b4985 --- /dev/null +++ b/configurations/force_gossip_topology_changes.yaml @@ -0,0 +1,2 @@ +append_scylla_yaml: + force_gossip_topology_changes: true diff --git a/defaults/test_default.yaml b/defaults/test_default.yaml index b5bb4ec521f..c19390d9dbb 100644 --- a/defaults/test_default.yaml +++ b/defaults/test_default.yaml @@ -255,3 +255,5 @@ kafka_backend: null kafka_connectors: [] run_scylla_doctor: false + +enable_force_gossip_topology_changes_on_upgrade: false diff --git a/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-enable-tablets.jenkinsfile b/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-enable-tablets.jenkinsfile new file mode 100644 index 00000000000..f5582ebd05c --- /dev/null +++ b/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-enable-tablets.jenkinsfile @@ -0,0 +1,14 @@ +#!groovy + +// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43 +def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm) + +rollingUpgradePipeline( + backend: 'gce', + base_versions: '', // auto mode + linux_distro: 'ubuntu-focal', + use_preinstalled_scylla: true, + test_name: 'upgrade_test.UpgradeTest.test_rolling_upgrade', + test_config: 
'''["test-cases/upgrades/rolling-upgrade.yaml", "configurations/gce/n2-highmem-32.yaml", "configurations/tablets_disabled.yaml", "configurations/enable_tablets_on_upgrade.yaml"]''', + internode_compression: 'all', +) diff --git a/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-gossip-topology-changes.jenkinsfile b/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-gossip-topology-changes.jenkinsfile new file mode 100644 index 00000000000..4e9210e97fc --- /dev/null +++ b/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-gossip-topology-changes.jenkinsfile @@ -0,0 +1,14 @@ +#!groovy + +// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43 +def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm) + +rollingUpgradePipeline( + backend: 'gce', + base_versions: '', // auto mode + linux_distro: 'ubuntu-focal', + use_preinstalled_scylla: true, + test_name: 'upgrade_test.UpgradeTest.test_rolling_upgrade', + test_config: '''["test-cases/upgrades/rolling-upgrade.yaml", "configurations/gce/n2-highmem-32.yaml", "configurations/enable_force_gossip_changes_on_upgrade.yaml", "configurations/force_gossip_topology_changes.yaml"]''', + internode_compression: 'all', +) diff --git a/sdcm/provision/scylla_yaml/scylla_yaml.py b/sdcm/provision/scylla_yaml/scylla_yaml.py index 9729c4e8af3..9666c284b2f 100644 --- a/sdcm/provision/scylla_yaml/scylla_yaml.py +++ b/sdcm/provision/scylla_yaml/scylla_yaml.py @@ -339,6 +339,8 @@ def set_authorizer(cls, authorizer: str): audit_categories: str = None # None audit_tables: str = None # None audit_keyspaces: str = None # None + force_gossip_topology_changes: bool = None # False + enable_tablets: bool = None # Default value False, but explicitly set true for new clusters in scylla.yaml compaction_collection_items_count_warning_threshold: int = None # None diff --git a/sdcm/rest/raft_upgrade_procedure.py b/sdcm/rest/raft_upgrade_procedure.py new file mode 100644 index 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See LICENSE for more details.
#
# Copyright (c) 2024 ScyllaDB
import json

from sdcm.cluster import BaseNode
from sdcm.rest.remote_curl_client import RemoteCurlClient
from sdcm.wait import wait_for


class RaftUpgradeProcedure(RemoteCurlClient):
    """Client for the raft topology upgrade procedure (enables consistent topology changes).

    The procedure should be run only once, after all nodes have been upgraded.

    Doc:
    https://opensource.docs.scylladb.com/stable/upgrade/upgrade-opensource/upgrade-guide-from-5.4-to-6.0/enable-consistent-topology.html
    """

    # Path (relative to the ``storage_service`` endpoint) used both to start the
    # procedure (POST) and to query its state (GET).
    UPGRADE_PATH = "raft_topology/upgrade"

    def __init__(self, node: BaseNode):
        """Bind the client to the REST API reached from ``node`` at localhost:10000."""
        super().__init__(host="localhost:10000", endpoint="storage_service", node=node)

    def start_upgrade_procedure(self) -> str:
        """POST to the upgrade path and return the stripped stdout of the curl command."""
        return self.run_remoter_curl(
            method="POST", path=self.UPGRADE_PATH, params=None, timeout=30).stdout.strip()

    def get_upgrade_procedure_status(self) -> str:
        """GET the upgrade state and return the JSON-decoded response body.

        NOTE(review): the ``str`` return type assumes the API answers with a JSON
        string literal (e.g. ``"done"``) - confirm against the REST API docs.
        """
        response = self.run_remoter_curl(
            method="GET", path=self.UPGRADE_PATH, params=None, timeout=30)
        return json.loads(response.stdout.strip())

    def wait_upgrade_procedure_done(self, timeout: int = 60, step: int = 5) -> None:
        """Poll the upgrade state until it reads "done" (case-insensitive).

        :param timeout: value passed to ``wait_for`` as ``timeout``; defaults to the
            previously hard-coded 60.
        :param step: value passed to ``wait_for`` as ``step``; defaults to the
            previously hard-coded 5.
        """
        wait_for(lambda: self.get_upgrade_procedure_status().lower() == "done",
                 step=step, text="Check raft upgrade procedure state", timeout=timeout)
SCTConfiguration(dict): dict(name="run_scylla_doctor", env="SCT_RUN_SCYLLA_DOCTOR", type=boolean, help="Run scylla-doctor in artifact tests"), + + dict(name="enable_force_gossip_topology_changes_on_upgrade", + env="SCT_ENABLE_FORCE_GOSSIP_TOPOLOGY_CHANGES_ON_UPGRADE", + type=boolean, + help="""Enable gossip topology changes (disable raft topology) on upgrade"""), + ] required_params = ['cluster_backend', 'test_duration', 'n_db_nodes', 'n_loaders', 'use_preinstalled_scylla', diff --git a/unit_tests/test_scylla_yaml.py b/unit_tests/test_scylla_yaml.py index 05f88cb011c..b5ba8cd1d71 100644 --- a/unit_tests/test_scylla_yaml.py +++ b/unit_tests/test_scylla_yaml.py @@ -398,7 +398,9 @@ def test_scylla_yaml(self): 'virtual_dirty_soft_limit': None, 'volatile_system_keyspace_for_testing': None, 'workdir': None, - 'write_request_timeout_in_ms': None + 'write_request_timeout_in_ms': None, + 'enable_tablets': None, + 'force_gossip_topology_changes': None, } ) diff --git a/unit_tests/test_scylla_yaml_builders.py b/unit_tests/test_scylla_yaml_builders.py index 3b741a33210..d91544b290d 100644 --- a/unit_tests/test_scylla_yaml_builders.py +++ b/unit_tests/test_scylla_yaml_builders.py @@ -134,7 +134,7 @@ def test_aws_multi_openldap(self): 'ldap_url_template': 'ldap://1.1.1.1:389/dc=scylla-qa,dc=com?cn?sub?' '(uniqueMember=uid={USER},ou=Person,dc=scylla-qa,dc=com)', 'role_manager': 'com.scylladb.auth.LDAPRoleManager', - 'saslauthd_socket_path': '/run/saslauthd/mux' + 'saslauthd_socket_path': '/run/saslauthd/mux', }, ) @@ -172,7 +172,7 @@ def test_gce_single_openldap(self): 'ldap_url_template': 'ldap://1.1.1.1:389/dc=scylla-qa,dc=com?cn?sub?' 
'(uniqueMember=uid={USER},ou=Person,dc=scylla-qa,dc=com)', 'role_manager': 'com.scylladb.auth.LDAPRoleManager', - 'saslauthd_socket_path': '/run/saslauthd/mux' + 'saslauthd_socket_path': '/run/saslauthd/mux', } ) diff --git a/upgrade_test.py b/upgrade_test.py index e72746cd6e8..09a7b4ea70a 100644 --- a/upgrade_test.py +++ b/upgrade_test.py @@ -21,7 +21,7 @@ import time import re from functools import wraps, cache -from typing import List +from typing import List, Any import contextlib import cassandra @@ -51,7 +51,10 @@ from sdcm.sct_events.group_common_events import ignore_upgrade_schema_errors, ignore_ycsb_connection_refused, \ ignore_abort_requested_errors, decorate_with_context from sdcm.utils import loader_utils +from sdcm.utils.features import TABLETS_FEATURE, CONSISTENT_TOPOLOGY_CHANGES_FEATURE, get_enabled_features +from sdcm.wait import wait_for from sdcm.paths import SCYLLA_YAML_PATH +from sdcm.rest.raft_upgrade_procedure import RaftUpgradeProcedure from test_lib.sla import create_sla_auth NUMBER_OF_ROWS_FOR_TRUNCATE_TEST = 10 @@ -208,7 +211,10 @@ def _upgrade_node(self, node, upgrade_sstables=True, new_scylla_repo=None, new_v scylla_yaml_updates.update({"consistent_cluster_management": True}) if self.params.get("enable_tablets_on_upgrade"): - scylla_yaml_updates.update({"experimental_features": ["tablets", "consistent-topology-changes"]}) + scylla_yaml_updates.update({"enable_tablets": True}) + + if self.params.get("enable_force_gossip_topology_changes_on_upgrade"): + scylla_yaml_updates.update({"force_gossip_topology_changes": True}) if self.params.get('test_sst3'): scylla_yaml_updates.update({"enable_sstables_mc_format": True}) @@ -373,15 +379,6 @@ def _rollback_node(self, node, upgrade_sstables=True): node.run_nodetool("snapshot") node.stop_scylla_server(verify_down=False) - if self.params.get("enable_tablets_on_upgrade"): - with node.remote_scylla_yaml() as scylla_yml: - current_experimental_features = scylla_yml.experimental_features - 
current_experimental_features.remove("tablets") - current_experimental_features.remove("consistent-topology-changes") - if len(current_experimental_features) == 0: - current_experimental_features = None - scylla_yml.experimental_features = current_experimental_features - if node.distro.is_rhel_like: node.remoter.run('sudo cp ~/scylla.repo-backup /etc/yum.repos.d/scylla.repo') node.remoter.run('sudo chown root.root /etc/yum.repos.d/scylla.repo') @@ -607,7 +604,7 @@ def _update_scylla_yaml_on_node(node_to_update: BaseNode, updates: dict): with node_to_update.remote_scylla_yaml() as scylla_yaml: scylla_yaml.update(updates) - def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-statements # noqa: PLR0915 + def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-statements,too-many-branches # noqa: PLR0915 """ Upgrade half of nodes in the cluster, and start special read workload during the stage. Checksum method is changed to xxhash from Scylla 2.2, @@ -740,10 +737,10 @@ def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-stat step = 'Step4 - Verify data during mixed cluster mode ' InfoEvent(message=step).publish() self.fill_and_verify_db_data('after rollback the second node') + InfoEvent(message='Repair the first upgraded Node').publish() - self.db_cluster.nodes[indexes[0]].run_nodetool(sub_cmd='repair') - self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade, - step=step) + self.db_cluster.nodes[indexes[0]].run_nodetool(sub_cmd='repair', timeout=7200, coredump_on_timeout=True) + self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade, step=step) with ignore_upgrade_schema_errors(): @@ -758,6 +755,8 @@ def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-stat self.fill_and_verify_db_data('after upgraded %s' % self.db_cluster.node_to_upgrade.name) self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade, 
def run_raft_topology_upgrade_procedure(self):
    """Wait for the expected features on every node, then run the raft topology upgrade.

    The set of features to wait for is derived from the test parameters:
    ``CONSISTENT_TOPOLOGY_CHANGES_FEATURE`` unless
    ``enable_force_gossip_topology_changes_on_upgrade`` is set, plus
    ``TABLETS_FEATURE`` (and consistent topology changes) when
    ``enable_tablets_on_upgrade`` is set.

    The upgrade procedure is started through the first node of the cluster and
    then every node is polled until it reports the procedure as done.
    """
    features = set()
    if not self.params.get("enable_force_gossip_topology_changes_on_upgrade"):
        features.add(CONSISTENT_TOPOLOGY_CHANGES_FEATURE)
    if self.params.get("enable_tablets_on_upgrade"):
        features.update([TABLETS_FEATURE, CONSISTENT_TOPOLOGY_CHANGES_FEATURE])
    # Events must be published explicitly; constructing one does not emit it.
    InfoEvent(message='Step5.1 - run raft topology upgrade procedure').publish()

    def check_features_enabled(feature_list: set[str], node: BaseNode) -> bool:
        """Return True when every feature in ``feature_list`` is enabled on ``node``."""
        with self.db_cluster.cql_connection_patient_exclusive(node) as session:
            enabled_features = get_enabled_features(session)
            return all(feature in enabled_features for feature in feature_list)

    # Wait until the features are enabled on all nodes after the upgrade.
    # ``step`` is the polling interval and ``text`` the description; the extra
    # keyword arguments are presumably forwarded to ``func`` (as the original
    # call already relied on).
    for node in self.db_cluster.nodes:
        wait_for(func=check_features_enabled, step=5,
                 text=f"Check feature enabled on node {node.name}", timeout=60,
                 feature_list=features, node=node)

    # The procedure is triggered once, through a single node.
    raft_upgrade = RaftUpgradeProcedure(self.db_cluster.nodes[0])
    result = raft_upgrade.start_upgrade_procedure()
    InfoEvent(message=f'result {result}').publish()
    InfoEvent(message="Wait upgrade procedure done").publish()
    for node in self.db_cluster.nodes:
        RaftUpgradeProcedure(node).wait_upgrade_procedure_done()
    InfoEvent(message="Step5.1 - raft topology upgrade procedure done").publish()
def get_old_config_value(self, config_name: str) -> Any:
    """Return attribute ``config_name`` of the first cluster node's scylla.yaml object.

    ``None`` is returned when the object has no such attribute.
    """
    first_node = self.db_cluster.nodes[0]
    with first_node.remote_scylla_yaml() as node_yaml:
        config_value = getattr(node_yaml, config_name, None)
    return config_value