From b66b186a1e19839ec3aeab9dfd46c3b25509eb41 Mon Sep 17 00:00:00 2001 From: Lili Deng Date: Tue, 26 Dec 2023 09:09:07 +0800 Subject: [PATCH] update hibernation --- lisa/features/__init__.py | 3 +- lisa/features/startstop.py | 9 +++ lisa/sut_orchestrator/azure/features.py | 37 +++++++++- lisa/tools/dmesg.py | 2 + lisa/tools/hibernation_setup.py | 15 ++-- microsoft/testsuites/power/common.py | 81 +++++++++++--------- microsoft/testsuites/power/power.py | 98 +++++++++++++++++++------ microsoft/testsuites/power/stress.py | 1 + 8 files changed, 180 insertions(+), 66 deletions(-) diff --git a/lisa/features/__init__.py b/lisa/features/__init__.py index 73afcf59eb..7afe7c9572 100644 --- a/lisa/features/__init__.py +++ b/lisa/features/__init__.py @@ -28,7 +28,7 @@ SecurityProfileType, ) from .serial_console import SerialConsole -from .startstop import StartStop, StopState +from .startstop import StartStop, StopState, VMStatus __all__ = [ "ACC", @@ -63,6 +63,7 @@ "SecurityProfileType", "Sriov", "StopState", + "VMStatus", "Synthetic", "StartStop", ] diff --git a/lisa/features/startstop.py b/lisa/features/startstop.py index bf85317d37..fd07ce1bbe 100644 --- a/lisa/features/startstop.py +++ b/lisa/features/startstop.py @@ -7,6 +7,12 @@ FEATURE_NAME_STARTSTOP = "StartStop" +class VMStatus(str, Enum): + Deallocated = "Deallocated" + Running = "Running" + ProvisionSucceeded = "ProvisionSucceeded" + + class StopState(str, Enum): Hibernate = "hibernate" Shutdown = "shutdown" @@ -48,3 +54,6 @@ def restart(self, wait: bool = True) -> None: self._log.info("restarting") self._restart(wait=wait) self._node.close() + + def get_status(self) -> VMStatus: + raise NotImplementedError() diff --git a/lisa/sut_orchestrator/azure/features.py b/lisa/sut_orchestrator/azure/features.py index c26464fc4b..09b8ef9a5d 100644 --- a/lisa/sut_orchestrator/azure/features.py +++ b/lisa/sut_orchestrator/azure/features.py @@ -47,6 +47,7 @@ FEATURE_NAME_SECURITY_PROFILE, SecurityProfileType, ) +from lisa.features.startstop import VMStatus from lisa.node import Node, RemoteNode from lisa.operating_system import BSD, CBLMariner, CentOs, Redhat, Suse, Ubuntu from lisa.search_space import RequirementMethod @@ -129,6 +130,13 @@ def _initialize_information(self, node: Node) -> None: class StartStop(AzureFeatureMixin, features.StartStop): + azure_vm_status_map = { + "VM deallocated": VMStatus.Deallocated, + "VM running": VMStatus.Running, + "Provisioning succeeded": VMStatus.ProvisionSucceeded, + # Add more Azure-specific mappings as needed + } + @classmethod def create_setting( cls, *args: Any, **kwargs: Any @@ -182,6 +190,23 @@ def _execute(self, wait: bool, operator: str, **kwargs: Any) -> None: if wait: wait_operation(operation, failure_identity="Start/Stop") + def get_status(self) -> VMStatus: + try: + platform: AzurePlatform = self._platform # type: ignore + compute_client = get_compute_client(platform) + status = ( + compute_client.virtual_machines.get( + self._resource_group_name, self._vm_name, expand="instanceView" + ) + .instance_view.statuses[1] + .display_status + ) + assert isinstance(status, str), f"actual: {type(status)}" + assert self.azure_vm_status_map.get(status) is not None, "unknown vm status" + return cast(VMStatus, self.azure_vm_status_map.get(status)) + except Exception as e: + raise LisaException(f"fail to get status of vm {self._vm_name}") from e + class FixedSerialPortsOperations(SerialPortsOperations): # type: ignore def connect( @@ -2014,8 +2039,18 @@ def create_setting( cls, *args: Any, **kwargs: Any ) -> Optional[schema.FeatureSettings]: raw_capabilities: Any = kwargs.get("raw_capabilities") + resource_sku: Any = kwargs.get("resource_sku") - if raw_capabilities.get("HibernationSupported", None) == "True": + if ( + resource_sku.family + in [ + "standardDSv5Family", + "standardDDSv5Family", + "standardDASv5Family", + "standardDADSv5Family", + ] + or raw_capabilities.get("HibernationSupported", None) == "True" + ): return schema.FeatureSettings.create(cls.name()) return None diff --git a/lisa/tools/dmesg.py b/lisa/tools/dmesg.py index 5ed2d8572d..85b2580ca9 100644 --- a/lisa/tools/dmesg.py +++ b/lisa/tools/dmesg.py @@ -18,6 +18,8 @@ class Dmesg(Tool): re.compile("rcu_sched self-detected stall on CPU"), re.compile("rcu_sched detected stalls on"), re.compile("BUG: soft lockup"), + re.compile("Hibernate inconsistent memory map detected"), + re.compile("check_flush_dependency"), ] # [ 3.191822] hv_vmbus: Hyper-V Host Build:18362-10.0-3-0.3294; Vmbus version:3.0 diff --git a/lisa/tools/hibernation_setup.py b/lisa/tools/hibernation_setup.py index 90edc5bf29..fcb5cc3e62 100644 --- a/lisa/tools/hibernation_setup.py +++ b/lisa/tools/hibernation_setup.py @@ -5,13 +5,13 @@ import re from typing import List, Pattern, Type -from lisa.base_tools import Systemctl +from lisa.base_tools import Cat, Systemctl from lisa.executable import Tool from lisa.operating_system import CBLMariner from lisa.util import find_patterns_in_lines -from .dmesg import Dmesg from .git import Git +from .ls import Ls from .make import Make @@ -74,9 +74,14 @@ def _install(self) -> bool: return self._check_exists() def _check(self, pattern: Pattern[str]) -> int: - dmesg = self.node.tools[Dmesg] - dmesg_output = dmesg.get_output(force_run=True) - matched_lines = find_patterns_in_lines(dmesg_output, [pattern]) + cat = self.node.tools[Cat] + log_output = "" + ls = self.node.tools[Ls] + if ls.path_exists("/var/log/syslog", sudo=True): + log_output = cat.read("/var/log/syslog", force_run=True, sudo=True) + if ls.path_exists("/var/log/messages", sudo=True): + log_output = cat.read("/var/log/messages", force_run=True, sudo=True) + matched_lines = find_patterns_in_lines(log_output, [pattern]) if not matched_lines: return 0 return len(matched_lines[0]) diff --git a/microsoft/testsuites/power/common.py b/microsoft/testsuites/power/common.py index 2c1ea43a98..cf2ef1f11f 100644 --- a/microsoft/testsuites/power/common.py +++ b/microsoft/testsuites/power/common.py @@ -5,16 +5,16 @@ from assertpy import assert_that -from lisa import Environment, Logger, RemoteNode, features +from lisa import Environment, Logger, Node, RemoteNode, features from lisa.features import StartStop +from lisa.features.startstop import VMStatus from lisa.operating_system import Redhat, Suse, Ubuntu -from lisa.tools import Fio, HibernationSetup, Iperf3, KernelConfig, Kill, Lscpu -from lisa.util import LisaException, SkippedException, constants +from lisa.tools import Dmesg, Fio, HibernationSetup, Iperf3, KernelConfig, Kill, Lscpu +from lisa.util import LisaException, SkippedException from lisa.util.perf_timer import create_timer -from lisa.util.shell import wait_tcp_port_ready -def is_distro_supported(node: RemoteNode) -> None: +def is_distro_supported(node: Node) -> None: if not node.tools[KernelConfig].is_enabled("CONFIG_HIBERNATION"): raise SkippedException( f"CONFIG_HIBERNATION is not enabled in current distro {node.os.name}, " @@ -32,69 +32,78 @@ def is_distro_supported(node: RemoteNode) -> None: ) -def verify_hibernation(node: RemoteNode, log: Logger) -> None: +def verify_hibernation( + node: Node, + log: Logger, + throw_error: bool = True, +) -> None: + hibernation_setup_tool = node.tools[HibernationSetup] + startstop = node.features[StartStop] + node_nic = node.nics lower_nics_before_hibernation = node_nic.get_lower_nics() upper_nics_before_hibernation = node_nic.get_nic_names() - hibernation_setup_tool = node.tools[HibernationSetup] entry_before_hibernation = hibernation_setup_tool.check_entry() exit_before_hibernation = hibernation_setup_tool.check_exit() received_before_hibernation = hibernation_setup_tool.check_received() uevent_before_hibernation = hibernation_setup_tool.check_uevent() - startstop = node.features[StartStop] + + # only set up hibernation setup tool for the first time hibernation_setup_tool.start() - startstop.stop(state=features.StopState.Hibernate) + + try: + startstop.stop(state=features.StopState.Hibernate) + except Exception as ex: + try: + node.tools[Dmesg].get_output(force_run=True) + except Exception as e: + log.debug(f"error on get dmesg output: {e}") + raise LisaException(f"fail to hibernate: {ex}") + is_ready = True timeout = 900 timer = create_timer() while timeout > timer.elapsed(False): - is_ready, _ = wait_tcp_port_ready( - node.connection_info[constants.ENVIRONMENTS_NODES_REMOTE_ADDRESS], - node.connection_info[constants.ENVIRONMENTS_NODES_REMOTE_PORT], - log=log, - timeout=10, - ) - if not is_ready: + if startstop.get_status() == VMStatus.Deallocated: + is_ready = False break if is_ready: - raise LisaException("VM still can be accessed after hibernation") + raise LisaException("VM is not in deallocated status after hibernation") + startstop.start() + dmesg = node.tools[Dmesg] + dmesg.check_kernel_errors(force_run=True, throw_error=throw_error) + entry_after_hibernation = hibernation_setup_tool.check_entry() exit_after_hibernation = hibernation_setup_tool.check_exit() received_after_hibernation = hibernation_setup_tool.check_received() uevent_after_hibernation = hibernation_setup_tool.check_uevent() - assert_that( - entry_after_hibernation - entry_before_hibernation, - "not find 'hibernation entry'.", + assert_that(entry_after_hibernation - entry_before_hibernation).described_as( + "not find 'hibernation entry'." ).is_equal_to(1) - assert_that( - exit_after_hibernation - exit_before_hibernation, - "not find 'hibernation exit'.", + assert_that(exit_after_hibernation - exit_before_hibernation).described_as( + "not find 'hibernation exit'." ).is_equal_to(1) - assert_that( - received_after_hibernation - received_before_hibernation, - "not find 'Hibernation request received'.", + assert_that(received_after_hibernation - received_before_hibernation).described_as( + "not find 'Hibernation request received'." ).is_equal_to(1) - assert_that( - uevent_after_hibernation - uevent_before_hibernation, - "not find 'Sent hibernation uevent'.", + assert_that(uevent_after_hibernation - uevent_before_hibernation).described_as( + "not find 'Sent hibernation uevent'." ).is_equal_to(1) node_nic = node.nics node_nic.initialize() lower_nics_after_hibernation = node_nic.get_lower_nics() upper_nics_after_hibernation = node_nic.get_nic_names() - assert_that( - len(lower_nics_after_hibernation), - "sriov nics count changes after hibernation.", + assert_that(len(lower_nics_after_hibernation)).described_as( + "sriov nics count changes after hibernation." ).is_equal_to(len(lower_nics_before_hibernation)) - assert_that( - len(upper_nics_after_hibernation), - "synthetic nics count changes after hibernation.", + assert_that(len(upper_nics_after_hibernation)).described_as( + "synthetic nics count changes after hibernation." ).is_equal_to(len(upper_nics_before_hibernation)) -def run_storage_workload(node: RemoteNode) -> Decimal: +def run_storage_workload(node: Node) -> Decimal: fio = node.tools[Fio] fiodata = node.get_pure_path("./fiodata") core_count = node.tools[Lscpu].get_core_count() diff --git a/microsoft/testsuites/power/power.py b/microsoft/testsuites/power/power.py index 175c7bb0e3..f951452e45 100644 --- a/microsoft/testsuites/power/power.py +++ b/microsoft/testsuites/power/power.py @@ -14,7 +14,7 @@ TestSuite, TestSuiteMetadata, ) -from lisa.features import HibernationEnabled, Sriov, Synthetic +from lisa.features import Disk, HibernationEnabled, Sriov, Synthetic from lisa.node import Node from lisa.operating_system import BSD, Windows from lisa.testsuite import simple_requirement @@ -64,10 +64,7 @@ def before_case(self, log: Logger, **kwargs: Any) -> None: supported_features=[HibernationEnabled()], ), ) - def verify_hibernation_synthetic_network( - self, environment: Environment, log: Logger - ) -> None: - node = cast(RemoteNode, environment.nodes[0]) + def verify_hibernation_synthetic_network(self, node: Node, log: Logger) -> None: is_distro_supported(node) verify_hibernation(node, log) @@ -82,10 +79,7 @@ def verify_hibernation_synthetic_network( supported_features=[HibernationEnabled()], ), ) - def verify_hibernation_sriov_network( - self, environment: Environment, log: Logger - ) -> None: - node = cast(RemoteNode, environment.nodes[0]) + def verify_hibernation_sriov_network(self, node: Node, log: Logger) -> None: is_distro_supported(node) verify_hibernation(node, log) @@ -103,10 +97,7 @@ def verify_hibernation_sriov_network( supported_features=[HibernationEnabled()], ), ) - def verify_hibernation_time_sync( - self, environment: Environment, log: Logger - ) -> None: - node = cast(RemoteNode, environment.nodes[0]) + def verify_hibernation_time_sync(self, node: Node, log: Logger) -> None: is_distro_supported(node) date = node.tools[Date] current_date = date.current() @@ -171,10 +162,7 @@ def verify_hibernation_with_network_workload( supported_features=[HibernationEnabled()], ), ) - def verify_hibernation_with_storage_workload( - self, environment: Environment, log: Logger - ) -> None: - node = cast(RemoteNode, environment.nodes[0]) + def verify_hibernation_with_storage_workload(self, node: Node, log: Logger) -> None: is_distro_supported(node) run_storage_workload(node) verify_hibernation(node, log) @@ -194,15 +182,79 @@ def verify_hibernation_with_storage_workload( supported_features=[HibernationEnabled()], ), ) - def verify_hibernation_with_memory_workload( - self, environment: Environment, log: Logger - ) -> None: - node = cast(RemoteNode, environment.nodes[0]) + def verify_hibernation_with_memory_workload(self, node: Node, log: Logger) -> None: is_distro_supported(node) stress_ng_tool = node.tools[StressNg] - stress_ng_tool.launch_vm_stressor(16, "100%", 300) + stress_ng_tool.launch_vm_stressor(16, "90%", 300) + verify_hibernation(node, log, throw_error=False) + stress_ng_tool.launch_vm_stressor(16, "90%", 300) + + @TestCaseMetadata( + description=""" + This case is to verify vm hibernation with synthetic network with max nics. + Steps, + 1. Install HibernationSetup tool to prepare prerequisite for vm + hibernation. + 2. Get nics info before hibernation. + 3. Hibernate vm. + 4. Check vm is inaccessible. + 5. Resume vm by starting vm. + 6. Check vm hibernation successfully by checking keywords in dmesg. + 6. Get nics info after hibernation. + 7. Fail the case if nics count and info changes after vm resume. + """, + priority=3, + requirement=simple_requirement( + min_nic_count=8, + network_interface=Synthetic(), + supported_features=[HibernationEnabled()], + ), + ) + def verify_hibernation_synthetic_network_max_nics( + self, node: Node, log: Logger + ) -> None: + is_distro_supported(node) + verify_hibernation(node, log) + + @TestCaseMetadata( + description=""" + This case is to verify vm hibernation with sriov network with max nics. + It has the same steps with verify_hibernation_synthetic_network_max_nics. + """, + priority=3, + requirement=simple_requirement( + min_nic_count=8, + network_interface=Sriov(), + supported_features=[HibernationEnabled()], + ), + ) + def verify_hibernation_sriov_network_max_nics( + self, node: Node, log: Logger + ) -> None: + is_distro_supported(node) + verify_hibernation(node, log) + + @TestCaseMetadata( + description=""" + This case is to verify vm hibernation with max data disks. + It has the same steps with verify_hibernation_synthetic_network_max_nics. + """, + priority=3, + requirement=simple_requirement( + min_nic_count=8, + supported_features=[HibernationEnabled()], + min_data_disk_count=32, + ), + ) + def verify_hibernation_max_data_disks(self, node: Node, log: Logger) -> None: + is_distro_supported(node) + disk = node.features[Disk] + data_disks_before_hibernation = disk.get_raw_data_disks() verify_hibernation(node, log) - stress_ng_tool.launch_vm_stressor(16, "100%", 300) + data_disks_after_hibernation = disk.get_raw_data_disks() + assert_that(data_disks_before_hibernation).described_as( + "data disks are inconsistent after hibernation" + ).is_length(data_disks_after_hibernation) def after_case(self, log: Logger, **kwargs: Any) -> None: environment: Environment = kwargs.pop("environment") diff --git a/microsoft/testsuites/power/stress.py b/microsoft/testsuites/power/stress.py index 723d0d4259..03ad354951 100644 --- a/microsoft/testsuites/power/stress.py +++ b/microsoft/testsuites/power/stress.py @@ -43,6 +43,7 @@ def before_case(self, log: Logger, **kwargs: Any) -> None: This case is to verify vm hibernation in a loop. """, priority=3, + timeout=720000, requirement=simple_requirement( network_interface=Sriov(), supported_features=[HibernationEnabled()],