From f37b8c993bfa29c94a04668a038809d0cde5d5db Mon Sep 17 00:00:00 2001 From: Aleksandr Bykov Date: Thu, 21 Mar 2024 20:40:20 +0700 Subject: [PATCH] fix(upgrade): upgrade with raft topology procedure After upgrade to latest master(6.0) raft topology feature or tablets + raft topology features will be enabled by default To switch cluster from gossiper to raft topology, raft topology procedure should be executed. It is described in scylla doc upgrade from 5.4-> 6.0 section: Upgrade from legacy topology to raft-based topology Two major new features were introduced in the 6.0 and 6.1 releases: - tablets - raft topology (consistent_topology_changes) The 'raft topology changes' feature is enabled by default for any new cluster. No parameter related to this feature could be used in the scylla.yaml file. To disable the raft topology feature for a new cluster we need to add a new parameter to scylla.yaml: 'force_gossip_topology_changes: true'. To enable the feature after upgrade from versions where 'raft topology feature' is missed or from version where it was disabled, we need manually trigger 'raft topology upgrade procedure'. It's important to note that once the upgrade procedure for enabling the raft topology has been performed, there is no way to revert back to the gossip topology. The tablets feature is disabled by default and depends on the raft topology feature. If the raft topology feature is disabled, then tablets cannot be enabled independently. For new clusters (scylla version >= 6.0), it is enabled via adding 'enable_tablets: true' to scylla.yaml If a cluster was created with the disabled tablets feature or was upgraded from version < 6.0, then tablets are disabled. We need to support the following upgrade paths because the sct master branch could be used with different versions and to safely backport to 6.0 and enterprise: 1. 5.4 -> 6.0, 2. 5.4->2024.2.dev, 3. 6.0->6.1.dev, 4. 
2024.1 -> 2024.2.dev Feature state per version: 5.4, 2024.1 - don't have tablets and raft topology. 6.0+ - could have tablets and raft topology in different states: enabled/disabled. Upgrade from versions 5.4, 2024.1 -> 6.0, 2024.2 could be done with the following options: 1. raft topology disabled after upgrade. For that, we need to add a new parameter 'force_gossip_topology_changes: true' and not run the raft topology upgrade procedure after all nodes have been upgraded. In this case, tablets should not be enabled at all. No need to add anything else to scylla.yaml. 2. raft topology enabled after upgrade. No need to add anything to scylla.yaml; run the raft topology upgrade procedure after all nodes have been upgraded. 3. tablets feature is not enabled after upgrade. No need to add anything to scylla.yaml (because tablets are disabled by default). Raft topology feature is handled as in points (1,2). 4. tablets feature is enabled after upgrade. Before node upgrade, 'enable_tablets: true' should be added to scylla.yaml and if raft topology was disabled by a parameter, that parameter should be removed. And after all nodes have been upgraded, run the raft topology upgrade procedure. Upgrade from 6.0 -> 6.1 and enterprise default scenarios: 1. raft topology feature already enabled. In this case we can't disable it upon upgrade and should run a regular upgrade 2. tablets feature enabled. In this case the features can't be disabled after upgrade and the upgrade should run as is 3. raft topology feature was disabled and it will be enabled after upgrade: the force_gossip_topology_changes parameter could be removed from scylla.yaml before upgrade or could stay, and the raft topology upgrade procedure has to be executed after all nodes have been upgraded 4. raft topology feature was disabled and after upgrade it should stay disabled: nothing should be done with scylla.yaml and the raft topology upgrade procedure shouldn't be executed after all nodes have been upgraded 5. 
tablets feature is disabled and should be enabled after upgrade: scylla.yaml should be updated with 'enable_tablets: true' before upgrade and the raft topology upgrade procedure should be run after all nodes have been upgraded 6. tablets feature is disabled and should stay disabled after upgrade: nothing should be done with scylla.yaml before or after upgrade. To support all these paths, 2 sct_config parameters will be used: - enable_force_gossip_topology_changes_on_upgrade. Default value is false. When true, 'force_gossip_topology_changes: true' is added to scylla.yaml on upgrade and the raft topology upgrade procedure is not run (unless enable_tablets_on_upgrade is also set); when false, the raft topology upgrade procedure is run after all nodes have been upgraded - enable_tablets_on_upgrade. Default value is true. When true, scylla.yaml is updated with 'enable_tablets: true' on upgrade. In addition, the appropriate jobs should have the appropriate 'append_scylla_yaml' parameter: 1. to disable tablets before upgrade: - append_scylla_yaml: enable_tablets: false 2. to disable raft topology feature before upgrade: - append_scylla_yaml: force_gossip_topology_changes: true enable_tablets: false force_gossip_topology_changes: true The combination of these four parameters should allow supporting all possible configurations. 
Upgrade from 5.4 -> 6.0 Default upgrade with raft topology feature enabled and tablets feature enabled: enable_force_gossip_topology_changes_on_upgrade: false enable_tablets_on_upgrade: true no additional 'append_scylla_yaml' parameters, which is the default for rolling upgrades. Upgrade will be run and after upgrade raft topology feature will be enabled, raft topology upgrade procedure will be executed and tablets feature will be enabled via scylla.yaml Upgrade from 5.4 -> 6.0 Upgrade with raft topology feature enabled and tablets feature disabled: enable_force_gossip_topology_changes_on_upgrade: false enable_tablets_on_upgrade: false no additional 'append_scylla_yaml' parameters, which is the default for rolling upgrades. Upgrade will be run and after upgrade raft topology feature will be enabled, raft topology upgrade procedure will be executed and tablets feature will be disabled (not set in scylla.yaml, and the default is false) Upgrade from 6.0->6.1 default path: raft topology feature enabled, tablets enabled in scylla.yaml. enable_force_gossip_topology_changes_on_upgrade: false enable_tablets_on_upgrade: false upgrade will be run with both features enabled Upgrade from 6.0->6.1 default path: raft topology feature enabled, tablets disabled in scylla.yaml. 
append_scylla_yaml: enable_tablets: false enable_force_gossip_topology_on_upgrade: false enable_tablets_on_upgrade: true upgrade will be run with enabled raft topology feature, before upgrade scylla.yaml will be updated with enable_tablets: true --- ...nable_force_gossip_changes_on_upgrade.yaml | 1 + configurations/enable_tablets_on_upgrade.yaml | 1 + .../force_gossip_topology_changes.yaml | 2 + defaults/test_default.yaml | 2 + ...ng-upgrade-with-enable-tablets.jenkinsfile | 14 +++++ ...e-with-gossip-topology-changes.jenkinsfile | 14 +++++ sdcm/provision/scylla_yaml/scylla_yaml.py | 2 + sdcm/rest/raft_upgrade_procedure.py | 42 +++++++++++++ sdcm/sct_config.py | 6 ++ unit_tests/test_scylla_yaml.py | 4 +- unit_tests/test_scylla_yaml_builders.py | 4 +- upgrade_test.py | 61 ++++++++++++++----- 12 files changed, 135 insertions(+), 18 deletions(-) create mode 100644 configurations/enable_force_gossip_changes_on_upgrade.yaml create mode 100644 configurations/enable_tablets_on_upgrade.yaml create mode 100644 configurations/force_gossip_topology_changes.yaml create mode 100644 jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-enable-tablets.jenkinsfile create mode 100644 jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-gossip-topology-changes.jenkinsfile create mode 100644 sdcm/rest/raft_upgrade_procedure.py diff --git a/configurations/enable_force_gossip_changes_on_upgrade.yaml b/configurations/enable_force_gossip_changes_on_upgrade.yaml new file mode 100644 index 00000000000..f86c7f334ce --- /dev/null +++ b/configurations/enable_force_gossip_changes_on_upgrade.yaml @@ -0,0 +1 @@ +enable_force_gossip_topology_changes_on_upgrade: true diff --git a/configurations/enable_tablets_on_upgrade.yaml b/configurations/enable_tablets_on_upgrade.yaml new file mode 100644 index 00000000000..c675458b5af --- /dev/null +++ b/configurations/enable_tablets_on_upgrade.yaml @@ -0,0 +1 @@ +enable_tablets_on_upgrade: true diff --git 
a/configurations/force_gossip_topology_changes.yaml b/configurations/force_gossip_topology_changes.yaml new file mode 100644 index 00000000000..d82b40b4985 --- /dev/null +++ b/configurations/force_gossip_topology_changes.yaml @@ -0,0 +1,2 @@ +append_scylla_yaml: + force_gossip_topology_changes: true diff --git a/defaults/test_default.yaml b/defaults/test_default.yaml index b5bb4ec521f..c19390d9dbb 100644 --- a/defaults/test_default.yaml +++ b/defaults/test_default.yaml @@ -255,3 +255,5 @@ kafka_backend: null kafka_connectors: [] run_scylla_doctor: false + +enable_force_gossip_topology_changes_on_upgrade: false diff --git a/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-enable-tablets.jenkinsfile b/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-enable-tablets.jenkinsfile new file mode 100644 index 00000000000..f5582ebd05c --- /dev/null +++ b/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-enable-tablets.jenkinsfile @@ -0,0 +1,14 @@ +#!groovy + +// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43 +def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm) + +rollingUpgradePipeline( + backend: 'gce', + base_versions: '', // auto mode + linux_distro: 'ubuntu-focal', + use_preinstalled_scylla: true, + test_name: 'upgrade_test.UpgradeTest.test_rolling_upgrade', + test_config: '''["test-cases/upgrades/rolling-upgrade.yaml", "configurations/gce/n2-highmem-32.yaml", "configurations/tablets_disabled.yaml", "configurations/enable_tablets_on_upgrade.yaml"]''', + internode_compression: 'all', +) diff --git a/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-gossip-topology-changes.jenkinsfile b/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-gossip-topology-changes.jenkinsfile new file mode 100644 index 00000000000..4e9210e97fc --- /dev/null +++ b/jenkins-pipelines/oss/rolling-upgrade/rolling-upgrade-with-gossip-topology-changes.jenkinsfile @@ -0,0 +1,14 @@ +#!groovy + +// trick 
from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43 +def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm) + +rollingUpgradePipeline( + backend: 'gce', + base_versions: '', // auto mode + linux_distro: 'ubuntu-focal', + use_preinstalled_scylla: true, + test_name: 'upgrade_test.UpgradeTest.test_rolling_upgrade', + test_config: '''["test-cases/upgrades/rolling-upgrade.yaml", "configurations/gce/n2-highmem-32.yaml", "configurations/enable_force_gossip_changes_on_upgrade.yaml", "configurations/force_gossip_topology_changes.yaml"]''', + internode_compression: 'all', +) diff --git a/sdcm/provision/scylla_yaml/scylla_yaml.py b/sdcm/provision/scylla_yaml/scylla_yaml.py index 9729c4e8af3..9666c284b2f 100644 --- a/sdcm/provision/scylla_yaml/scylla_yaml.py +++ b/sdcm/provision/scylla_yaml/scylla_yaml.py @@ -339,6 +339,8 @@ def set_authorizer(cls, authorizer: str): audit_categories: str = None # None audit_tables: str = None # None audit_keyspaces: str = None # None + force_gossip_topology_changes: bool = None # False + enable_tablets: bool = None # Default value False, but explicitly set true for new clusters in scylla.yaml compaction_collection_items_count_warning_threshold: int = None # None diff --git a/sdcm/rest/raft_upgrade_procedure.py b/sdcm/rest/raft_upgrade_procedure.py new file mode 100644 index 00000000000..c99e1483e32 --- /dev/null +++ b/sdcm/rest/raft_upgrade_procedure.py @@ -0,0 +1,42 @@ +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See LICENSE for more details. 
+# +# Copyright (c) 2024 ScyllaDB +import json +from sdcm.cluster import BaseNode +from sdcm.rest.remote_curl_client import RemoteCurlClient +from sdcm.wait import wait_for + + +class RaftUpgradeProcedure(RemoteCurlClient): + """ Raft upgrade procedure to enable consistent topology changes. + The procedure should be run only once, after all nodes have been upgraded. + Doc: + https://opensource.docs.scylladb.com/stable/upgrade/upgrade-opensource/upgrade-guide-from-5.4-to-6.0/enable-consistent-topology.html + + Uses host `localhost:10000`, endpoint `storage_service`, path `raft_topology/upgrade`: POST to start, GET to read the state. + """ + + def __init__(self, node: BaseNode): + super().__init__(host="localhost:10000", endpoint="storage_service", node=node) + + def start_upgrade_procedure(self) -> str: + path = "raft_topology/upgrade" # POST on this path; the stripped stdout of the curl call is returned + return self.run_remoter_curl(method="POST", path=path, params=None, timeout=30).stdout.strip() + + def get_upgrade_procedure_status(self) -> str: + """Return the GET response for the upgrade path, decoded from the JSON string sent by the REST API.""" + path = "raft_topology/upgrade" + return json.loads(self.run_remoter_curl(method="GET", path=path, params=None, timeout=30).stdout.strip()) + + def wait_upgrade_procedure_done(self): # polls the status until it equals "done" (case-insensitive) + wait_for(lambda: self.get_upgrade_procedure_status().lower() == "done", + step=5, text="Check raft upgrade procedure state", timeout=60) diff --git a/sdcm/sct_config.py b/sdcm/sct_config.py index 1258eaf7296..5099af8de3e 100644 --- a/sdcm/sct_config.py +++ b/sdcm/sct_config.py @@ -1562,6 +1562,12 @@ class SCTConfiguration(dict): dict(name="run_scylla_doctor", env="SCT_RUN_SCYLLA_DOCTOR", type=boolean, help="Run scylla-doctor in artifact tests"), + + dict(name="enable_force_gossip_topology_changes_on_upgrade", + env="SCT_ENABLE_FORCE_GOSSIP_TOPOLOGY_CHANGES_ON_UPGRADE", + type=boolean, + help="""Enable gossip topology changes (disable raft topology) on upgrade"""), + ] required_params = ['cluster_backend', 'test_duration', 'n_db_nodes', 'n_loaders', 'use_preinstalled_scylla', diff --git a/unit_tests/test_scylla_yaml.py b/unit_tests/test_scylla_yaml.py index 05f88cb011c..b5ba8cd1d71 100644 --- 
a/unit_tests/test_scylla_yaml.py +++ b/unit_tests/test_scylla_yaml.py @@ -398,7 +398,9 @@ def test_scylla_yaml(self): 'virtual_dirty_soft_limit': None, 'volatile_system_keyspace_for_testing': None, 'workdir': None, - 'write_request_timeout_in_ms': None + 'write_request_timeout_in_ms': None, + 'enable_tablets': None, + 'force_gossip_topology_changes': None, } ) diff --git a/unit_tests/test_scylla_yaml_builders.py b/unit_tests/test_scylla_yaml_builders.py index 3b741a33210..d91544b290d 100644 --- a/unit_tests/test_scylla_yaml_builders.py +++ b/unit_tests/test_scylla_yaml_builders.py @@ -134,7 +134,7 @@ def test_aws_multi_openldap(self): 'ldap_url_template': 'ldap://1.1.1.1:389/dc=scylla-qa,dc=com?cn?sub?' '(uniqueMember=uid={USER},ou=Person,dc=scylla-qa,dc=com)', 'role_manager': 'com.scylladb.auth.LDAPRoleManager', - 'saslauthd_socket_path': '/run/saslauthd/mux' + 'saslauthd_socket_path': '/run/saslauthd/mux', }, ) @@ -172,7 +172,7 @@ def test_gce_single_openldap(self): 'ldap_url_template': 'ldap://1.1.1.1:389/dc=scylla-qa,dc=com?cn?sub?' 
'(uniqueMember=uid={USER},ou=Person,dc=scylla-qa,dc=com)', 'role_manager': 'com.scylladb.auth.LDAPRoleManager', - 'saslauthd_socket_path': '/run/saslauthd/mux' + 'saslauthd_socket_path': '/run/saslauthd/mux', } ) diff --git a/upgrade_test.py b/upgrade_test.py index e72746cd6e8..09a7b4ea70a 100644 --- a/upgrade_test.py +++ b/upgrade_test.py @@ -21,7 +21,7 @@ import time import re from functools import wraps, cache -from typing import List +from typing import List, Any import contextlib import cassandra @@ -51,7 +51,10 @@ from sdcm.sct_events.group_common_events import ignore_upgrade_schema_errors, ignore_ycsb_connection_refused, \ ignore_abort_requested_errors, decorate_with_context from sdcm.utils import loader_utils +from sdcm.utils.features import TABLETS_FEATURE, CONSISTENT_TOPOLOGY_CHANGES_FEATURE, get_enabled_features +from sdcm.wait import wait_for from sdcm.paths import SCYLLA_YAML_PATH +from sdcm.rest.raft_upgrade_procedure import RaftUpgradeProcedure from test_lib.sla import create_sla_auth NUMBER_OF_ROWS_FOR_TRUNCATE_TEST = 10 @@ -208,7 +211,10 @@ def _upgrade_node(self, node, upgrade_sstables=True, new_scylla_repo=None, new_v scylla_yaml_updates.update({"consistent_cluster_management": True}) if self.params.get("enable_tablets_on_upgrade"): - scylla_yaml_updates.update({"experimental_features": ["tablets", "consistent-topology-changes"]}) + scylla_yaml_updates.update({"enable_tablets": True}) + + if self.params.get("enable_force_gossip_topology_changes_on_upgrade"): + scylla_yaml_updates.update({"force_gossip_topology_changes": True}) if self.params.get('test_sst3'): scylla_yaml_updates.update({"enable_sstables_mc_format": True}) @@ -373,15 +379,6 @@ def _rollback_node(self, node, upgrade_sstables=True): node.run_nodetool("snapshot") node.stop_scylla_server(verify_down=False) - if self.params.get("enable_tablets_on_upgrade"): - with node.remote_scylla_yaml() as scylla_yml: - current_experimental_features = scylla_yml.experimental_features - 
current_experimental_features.remove("tablets") - current_experimental_features.remove("consistent-topology-changes") - if len(current_experimental_features) == 0: - current_experimental_features = None - scylla_yml.experimental_features = current_experimental_features - if node.distro.is_rhel_like: node.remoter.run('sudo cp ~/scylla.repo-backup /etc/yum.repos.d/scylla.repo') node.remoter.run('sudo chown root.root /etc/yum.repos.d/scylla.repo') @@ -607,7 +604,7 @@ def _update_scylla_yaml_on_node(node_to_update: BaseNode, updates: dict): with node_to_update.remote_scylla_yaml() as scylla_yaml: scylla_yaml.update(updates) - def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-statements # noqa: PLR0915 + def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-statements,too-many-branches # noqa: PLR0915 """ Upgrade half of nodes in the cluster, and start special read workload during the stage. Checksum method is changed to xxhash from Scylla 2.2, @@ -740,10 +737,10 @@ def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-stat step = 'Step4 - Verify data during mixed cluster mode ' InfoEvent(message=step).publish() self.fill_and_verify_db_data('after rollback the second node') + InfoEvent(message='Repair the first upgraded Node').publish() - self.db_cluster.nodes[indexes[0]].run_nodetool(sub_cmd='repair') - self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade, - step=step) + self.db_cluster.nodes[indexes[0]].run_nodetool(sub_cmd='repair', timeout=7200, coredump_on_timeout=True) + self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade, step=step) with ignore_upgrade_schema_errors(): @@ -758,6 +755,8 @@ def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-stat self.fill_and_verify_db_data('after upgraded %s' % self.db_cluster.node_to_upgrade.name) self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade, 
step=step) + if self.params.get("enable_tablets_on_upgrade") or not self.params.get("enable_force_gossip_topology_changes_on_upgrade"): + self.run_raft_topology_upgrade_procedure() InfoEvent(message='Step6 - Verify stress results after upgrade ').publish() InfoEvent(message='Waiting for stress threads to complete after upgrade').publish() @@ -879,6 +878,34 @@ def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-stat InfoEvent(message='all nodes were upgraded, and last workaround is verified.').publish() + def run_raft_topology_upgrade_procedure(self): # run once, after all nodes have been upgraded + features = set() + if not self.params.get("enable_force_gossip_topology_changes_on_upgrade"): + features.update([CONSISTENT_TOPOLOGY_CHANGES_FEATURE]) + if self.params.get("enable_tablets_on_upgrade"): + features.update([TABLETS_FEATURE, CONSISTENT_TOPOLOGY_CHANGES_FEATURE]) + InfoEvent(message='Step5.1 - run raft topology upgrade procedure').publish() + + def check_features_enabled(feature_list: list[str], node: BaseNode): + enabled_features_state = [] + with self.db_cluster.cql_connection_patient_exclusive(node) as session: + enabled_features = get_enabled_features(session) + for feature in feature_list: + enabled_features_state.append(feature in enabled_features) + return all(enabled_features_state) + + # wait until every node reports the expected features as enabled; the description belongs in `text`, not `step` + for node in self.db_cluster.nodes: + wait_for(func=check_features_enabled, timeout=60, text=f"Check feature enabled on node {node.name}", + feature_list=features, node=node) + raft_upgrade = RaftUpgradeProcedure(self.db_cluster.nodes[0]) + result = raft_upgrade.start_upgrade_procedure() + InfoEvent(message=f'result {result}').publish() + InfoEvent(message="Wait upgrade procedure done").publish() + for node in self.db_cluster.nodes: + RaftUpgradeProcedure(node).wait_upgrade_procedure_done() + InfoEvent(message="Step5.1 - raft topology upgrade procedure done").publish() + def _start_and_wait_for_node_upgrade(self, node: BaseNode, step: int) -> None: InfoEvent( message=f"Step {step} - 
Upgrade {node.name} from dc {node.dc_idx}").publish() @@ -1498,3 +1525,7 @@ def _custom_profile_rolling_upgrade(self, cs_user_profiles, new_scylla_repo=None assert all(tables_upgraded), "Failed to upgrade the sstable format {}".format(tables_upgraded) InfoEvent(message='all nodes were upgraded, and last workaround is verified.').publish() + + def get_old_config_value(self, config_name: str) -> Any: # reads `config_name` from the scylla.yaml of the first DB node + with self.db_cluster.nodes[0].remote_scylla_yaml() as scylla_yaml: + return getattr(scylla_yaml, config_name, None) # None when the object has no such attribute