Skip to content

Commit

Permalink
improvement(upgrade-test): upgrade node system early in the test
Browse files Browse the repository at this point in the history
During rolling upgrade scenarios the Scylla upgrade cycle starts with the
system packages upgrade. The upgrade cycle has to be performed sequentially,
a node at a time, but this is not a strict requirement for the step of upgrading
system.

The change moves the step of upgrading system packages to the start of the test
and parallelizes it across the nodes. This would save up to 65% of time needed
for upgrading system on all nodes.
Additionally, a flag is added to the SCT config to indicate whether system
upgrade is to be performed during rolling upgrade scenarios (enabled by default).

Closes: scylladb/qa-tasks#1773
  • Loading branch information
dimakr authored and soyacz committed Nov 27, 2024
1 parent 5aaf574 commit 83b6782
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 9 deletions.
1 change: 1 addition & 0 deletions defaults/test_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ test_upgrade_from_installed_3_1_0: false
target_upgrade_version: ''
disable_raft: true
enable_tablets_on_upgrade: false
upgrade_node_system: true

stress_cdclog_reader_cmd: "cdc-stressor -stream-query-round-duration 30s"

Expand Down
1 change: 1 addition & 0 deletions docs/configuration_options.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@
| **<a href="#user-content-disable_raft" name="disable_raft">disable_raft</a>** | As for now, raft will be enabled by default in all [upgrade] tests, so this flag will allow us to still run [upgrade] test without raft enabled (or disabling raft), so we will have better coverage | True | SCT_DISABLE_RAFT
| **<a href="#user-content-enable_tablets_on_upgrade" name="enable_tablets_on_upgrade">enable_tablets_on_upgrade</a>** | By default, the tablets feature is disabled. With this parameter, created for the upgrade test, the tablets feature will only be enabled after the upgrade | N/A | SCT_ENABLE_TABLETS_ON_UPGRADE
| **<a href="#user-content-upgrade_node_packages" name="upgrade_node_packages">upgrade_node_packages</a>** | | N/A | SCT_UPGRADE_NODE_PACKAGES
| **<a href="#user-content-upgrade_node_system" name="upgrade_node_system">upgrade_node_system</a>** | Upgrade system packages on nodes before upgrading Scylla. Enabled by default | N/A | SCT_UPGRADE_NODE_SYSTEM
| **<a href="#user-content-test_sst3" name="test_sst3">test_sst3</a>** | | N/A | SCT_TEST_SST3
| **<a href="#user-content-test_upgrade_from_installed_3_1_0" name="test_upgrade_from_installed_3_1_0">test_upgrade_from_installed_3_1_0</a>** | Enable an option for installed 3.1.0 for work around a scylla issue if it's true | N/A | SCT_TEST_UPGRADE_FROM_INSTALLED_3_1_0
| **<a href="#user-content-recover_system_tables" name="recover_system_tables">recover_system_tables</a>** | | N/A | SCT_RECOVER_SYSTEM_TABLES
Expand Down
3 changes: 3 additions & 0 deletions sdcm/sct_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1270,6 +1270,9 @@ class SCTConfiguration(dict):
dict(name="upgrade_node_packages", env="SCT_UPGRADE_NODE_PACKAGES", type=str,
help=""),

dict(name="upgrade_node_system", env="SCT_UPGRADE_NODE_SYSTEM", type=boolean,
help="Upgrade system packages on nodes before upgrading Scylla. Enabled by default"),

dict(name="test_sst3", env="SCT_TEST_SST3", type=boolean,
help=""),

Expand Down
32 changes: 23 additions & 9 deletions upgrade_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from sdcm.fill_db_data import FillDatabaseData
from sdcm.sct_events import Severity
from sdcm.stress_thread import CassandraStressThread
from sdcm.utils.common import ParallelObject
from sdcm.utils.decorators import retrying
from sdcm.utils.user_profile import get_profile_content
from sdcm.utils.version_utils import (
Expand Down Expand Up @@ -158,6 +159,8 @@ def __init__(self, *args):
# would be recalculated after all the cluster finish upgrade
expected_sstable_format_version = 'mc'

system_upgrade_timeout = 6 * 60

@retrying(n=5)
def _query_from_one_table(self, session, query, table_name) -> list:
return self.rows_to_list(session.execute(SimpleStatement(query.format(table_name)), timeout=300))
Expand Down Expand Up @@ -220,15 +223,6 @@ def _upgrade_node(self, node, upgrade_sstables=True, new_scylla_repo=None, new_v
scylla_yaml_updates.update({"enable_sstables_mc_format": True})

InfoEvent(message='Upgrading a Node').publish()
# because of scylladb/scylla-enterprise#2818 we are for now adding this workaround
if node.distro.is_ubuntu:
InfoEvent(message='upgrade_node - removing "shim-signed" package as a workaround').publish()
node.remoter.sudo("apt-get remove shim-signed -y --allow-remove-essential")
InfoEvent(message='upgrade_node - ended removing "shim-signed" package as a workaround').publish()
InfoEvent(message=f'upgrade_node - starting to "upgrade_system" of the node {node.name}').publish()
node.upgrade_system()
InfoEvent(message=f'upgrade_node - ended to "upgrade_system" of the node {node.name}').publish()

# We assume that if update_db_packages is not empty we install packages from there.
# In this case we don't use upgrade based on new_scylla_repo(ignored sudo yum update scylla...)
result = node.remoter.run('scylla --version')
Expand Down Expand Up @@ -370,6 +364,17 @@ def _upgrade_node(self, node, upgrade_sstables=True, new_scylla_repo=None, new_v

self.db_cluster.wait_all_nodes_un()

def upgrade_os(self, nodes):
    """Upgrade the system packages of the given nodes via ``ParallelObject``.

    The step is skipped entirely unless the ``upgrade_node_system`` test
    parameter is truthy. Otherwise every node in ``nodes`` is handed to a
    helper that calls ``node.upgrade_system()`` and publishes an info event
    before and after the call.

    :param nodes: nodes whose system should be upgraded.
    """
    # Guard clause: nothing to do when the system upgrade is disabled in config.
    if not self.params.get('upgrade_node_system'):
        return

    def upgrade(node):
        # Bracket the per-node upgrade with events so its span is traceable.
        InfoEvent(
            message=f'upgrade_node_system - starting to "upgrade_system" of the node {node.name}',
        ).publish()
        node.upgrade_system()
        InfoEvent(
            message=f'upgrade_node_system - ended to "upgrade_system" of the node {node.name}',
        ).publish()

    InfoEvent(message='Upgrading OS on nodes').publish()
    # The whole batch is bounded by the class-level system_upgrade_timeout
    # (presumably seconds -- confirm against ParallelObject).
    ParallelObject(
        objects=nodes,
        timeout=self.system_upgrade_timeout,
    ).run(upgrade)

@truncate_entries
# https://github.com/scylladb/scylla/issues/10447#issuecomment-1194155163
def rollback_node(self, node, upgrade_sstables=True):
Expand Down Expand Up @@ -529,6 +534,8 @@ def test_upgrade_cql_queries(self):
Run a set of different cql queries against various types/tables before
and after upgrade of every node to check the consistency of data
"""
self.upgrade_os(self.db_cluster.nodes)

InfoEvent(message='Populate DB with many types of tables and data').publish()
self.fill_db_data()
InfoEvent(message='Run some Queries to verify data BEFORE UPGRADE').publish()
Expand Down Expand Up @@ -620,6 +627,8 @@ def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-stat
we want to use this case to verify the read (cl=ALL) workload works
well, upgrade all nodes to new version in the end.
"""
self.upgrade_os(self.db_cluster.nodes)

InfoEvent(message='pre-test - prepare test keyspaces and tables').publish()
# prepare test keyspaces and tables before upgrade to avoid schema change during mixed cluster.
self.prepare_keyspaces_and_tables()
Expand Down Expand Up @@ -981,6 +990,8 @@ def test_generic_cluster_upgrade(self):
For multi-dc upgrades, alternates upgraded nodes between dc's.
"""
self.upgrade_os(self.db_cluster.nodes)

# Prepare keyspace and tables for truncate test
self.fill_db_data_for_truncate_test(insert_rows=NUMBER_OF_ROWS_FOR_TRUNCATE_TEST)

Expand Down Expand Up @@ -1037,6 +1048,7 @@ def test_cluster_upgrade_latency_regression(self):
- Read latte data (stress_after_cluster_upgrade) generating report file
- Compare latte report files and raise SCT ERROR event if latencies are worse for more than 10%
"""
self.upgrade_os(self.db_cluster.nodes)

InfoEvent(message="Step1 - Populate DB data").publish()
# Prepare keyspace and tables for truncate test
Expand Down Expand Up @@ -1505,6 +1517,8 @@ def prepare_data_before_upgrade(self):
return cs_user_profiles

def _custom_profile_rolling_upgrade(self, cs_user_profiles, new_scylla_repo=None, new_version=None): # pylint: disable=too-many-locals,too-many-statements
self.upgrade_os(self.db_cluster.nodes)

InfoEvent(message='Starting write workload during entire test').publish()
user_profiles, duration_per_cs_profile = self.parse_cs_user_profiles_param(cs_user_profiles)
entire_write_thread_pool = self.run_cs_user_profiles(cs_profiles=user_profiles,
Expand Down

0 comments on commit 83b6782

Please sign in to comment.