From 83b6782918edf4218c555e252f0a8921525468d2 Mon Sep 17 00:00:00 2001 From: Dmitriy Kruglov Date: Mon, 25 Nov 2024 11:27:32 +0100 Subject: [PATCH] improvement(upgrade-test): upgrade node system early in the test During rolling upgrade scenarios the Scylla upgrade cycle starts with the system packages upgrade. The upgrade cycle has to be performed sequentially, a node at a time, but this is not a strict requirement for the step of upgrading system. The change moves the step of upgrading system packages to the start of the test and parallelizes it across the nodes. This would save up to 65% of time needed for upgrading system on all nodes. Additionally, a flag is added to the SCT config to indicate whether system upgrade is to be performed during rolling upgrade scenarios (enabled by default). Closes: https://github.com/scylladb/qa-tasks/issues/1773 --- defaults/test_default.yaml | 1 + docs/configuration_options.md | 1 + sdcm/sct_config.py | 3 +++ upgrade_test.py | 32 +++++++++++++++++++++++--------- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/defaults/test_default.yaml b/defaults/test_default.yaml index 58d4a64da3..23fa9187f9 100644 --- a/defaults/test_default.yaml +++ b/defaults/test_default.yaml @@ -140,6 +140,7 @@ test_upgrade_from_installed_3_1_0: false target_upgrade_version: '' disable_raft: true enable_tablets_on_upgrade: false +upgrade_node_system: true stress_cdclog_reader_cmd: "cdc-stressor -stream-query-round-duration 30s" diff --git a/docs/configuration_options.md b/docs/configuration_options.md index b57080c998..2785484703 100644 --- a/docs/configuration_options.md +++ b/docs/configuration_options.md @@ -293,6 +293,7 @@ | **disable_raft** | As for now, raft will be enable by default in all [upgrade] tests, so this flag will allow usto still run [upgrade] test without raft enabled (or disabling raft), so we will have bettercoverage | True | SCT_DISABLE_RAFT | **enable_tablets_on_upgrade** | By default, the tablets feature is disabled.
With this parameter, created for the upgrade test,the tablets feature will only be enabled after the upgrade | N/A | SCT_ENABLE_TABLETS_ON_UPGRADE | **upgrade_node_packages** | | N/A | SCT_UPGRADE_NODE_PACKAGES +| **upgrade_node_system** | Upgrade system packages on nodes before upgrading Scylla. Enabled by default | N/A | SCT_UPGRADE_NODE_SYSTEM | **test_sst3** | | N/A | SCT_TEST_SST3 | **test_upgrade_from_installed_3_1_0** | Enable an option for installed 3.1.0 for work around a scylla issue if it's true | N/A | SCT_TEST_UPGRADE_FROM_INSTALLED_3_1_0 | **recover_system_tables** | | N/A | SCT_RECOVER_SYSTEM_TABLES diff --git a/sdcm/sct_config.py b/sdcm/sct_config.py index 7778887dc2..a7710ba2eb 100644 --- a/sdcm/sct_config.py +++ b/sdcm/sct_config.py @@ -1270,6 +1270,9 @@ class SCTConfiguration(dict): dict(name="upgrade_node_packages", env="SCT_UPGRADE_NODE_PACKAGES", type=str, help=""), + dict(name="upgrade_node_system", env="SCT_UPGRADE_NODE_SYSTEM", type=boolean, + help="Upgrade system packages on nodes before upgrading Scylla. 
Enabled by default"), + dict(name="test_sst3", env="SCT_TEST_SST3", type=boolean, help=""), diff --git a/upgrade_test.py b/upgrade_test.py index 688eab3eca..1cf3b79a36 100644 --- a/upgrade_test.py +++ b/upgrade_test.py @@ -36,6 +36,7 @@ from sdcm.fill_db_data import FillDatabaseData from sdcm.sct_events import Severity from sdcm.stress_thread import CassandraStressThread +from sdcm.utils.common import ParallelObject from sdcm.utils.decorators import retrying from sdcm.utils.user_profile import get_profile_content from sdcm.utils.version_utils import ( @@ -158,6 +159,8 @@ def __init__(self, *args): # would be recalculated after all the cluster finish upgrade expected_sstable_format_version = 'mc' + system_upgrade_timeout = 6 * 60 + @retrying(n=5) def _query_from_one_table(self, session, query, table_name) -> list: return self.rows_to_list(session.execute(SimpleStatement(query.format(table_name)), timeout=300)) @@ -220,15 +223,6 @@ def _upgrade_node(self, node, upgrade_sstables=True, new_scylla_repo=None, new_v scylla_yaml_updates.update({"enable_sstables_mc_format": True}) InfoEvent(message='Upgrading a Node').publish() - # because of scylladb/scylla-enterprise#2818 we are for now adding this workaround - if node.distro.is_ubuntu: - InfoEvent(message='upgrade_node - removing "shim-signed" package as a workaround').publish() - node.remoter.sudo("apt-get remove shim-signed -y --allow-remove-essential") - InfoEvent(message='upgrade_node - ended removing "shim-signed" package as a workaround').publish() - InfoEvent(message=f'upgrade_node - starting to "upgrade_system" of the node {node.name}').publish() - node.upgrade_system() - InfoEvent(message=f'upgrade_node - ended to "upgrade_system" of the node {node.name}').publish() - # We assume that if update_db_packages is not empty we install packages from there. # In this case we don't use upgrade based on new_scylla_repo(ignored sudo yum update scylla...) 
result = node.remoter.run('scylla --version') @@ -370,6 +364,17 @@ def _upgrade_node(self, node, upgrade_sstables=True, new_scylla_repo=None, new_v self.db_cluster.wait_all_nodes_un() + def upgrade_os(self, nodes): + def upgrade(node): + InfoEvent(message=f'upgrade_node_system - starting to "upgrade_system" of the node {node.name}').publish() + node.upgrade_system() + InfoEvent(message=f'upgrade_node_system - ended to "upgrade_system" of the node {node.name}').publish() + + if self.params.get('upgrade_node_system'): + InfoEvent(message='Upgrading OS on nodes').publish() + parallel_obj = ParallelObject(objects=nodes, timeout=self.system_upgrade_timeout) + parallel_obj.run(upgrade) + @truncate_entries # https://github.com/scylladb/scylla/issues/10447#issuecomment-1194155163 def rollback_node(self, node, upgrade_sstables=True): @@ -529,6 +534,8 @@ def test_upgrade_cql_queries(self): Run a set of different cql queries against various types/tables before and after upgrade of every node to check the consistency of data """ + self.upgrade_os(self.db_cluster.nodes) + InfoEvent(message='Populate DB with many types of tables and data').publish() self.fill_db_data() InfoEvent(message='Run some Queries to verify data BEFORE UPGRADE').publish() @@ -620,6 +627,8 @@ def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-stat we want to use this case to verify the read (cl=ALL) workload works well, upgrade all nodes to new version in the end. """ + self.upgrade_os(self.db_cluster.nodes) + InfoEvent(message='pre-test - prepare test keyspaces and tables').publish() # prepare test keyspaces and tables before upgrade to avoid schema change during mixed cluster. self.prepare_keyspaces_and_tables() @@ -981,6 +990,8 @@ def test_generic_cluster_upgrade(self): For multi-dc upgrades, alternates upgraded nodes between dc's. 
""" + self.upgrade_os(self.db_cluster.nodes) + # Prepare keyspace and tables for truncate test self.fill_db_data_for_truncate_test(insert_rows=NUMBER_OF_ROWS_FOR_TRUNCATE_TEST) @@ -1037,6 +1048,7 @@ def test_cluster_upgrade_latency_regression(self): - Read latte data (stress_after_cluster_upgrade) generating report file - Compare latte report files and raise SCT ERROR event if latencies are worse for more than 10% """ + self.upgrade_os(self.db_cluster.nodes) InfoEvent(message="Step1 - Populate DB data").publish() # Prepare keyspace and tables for truncate test @@ -1505,6 +1517,8 @@ def prepare_data_before_upgrade(self): return cs_user_profiles def _custom_profile_rolling_upgrade(self, cs_user_profiles, new_scylla_repo=None, new_version=None): # pylint: disable=too-many-locals,too-many-statements + self.upgrade_os(self.db_cluster.nodes) + InfoEvent(message='Starting write workload during entire test').publish() user_profiles, duration_per_cs_profile = self.parse_cs_user_profiles_param(cs_user_profiles) entire_write_thread_pool = self.run_cs_user_profiles(cs_profiles=user_profiles,