Skip to content

Commit

Permalink
migrate cpu offline online case. (#1673)
Browse files Browse the repository at this point in the history
* Reset _vmbus_devices when rerun lsvmbus -vv command.

* Simplify method name get_device_channels_from_lsvmbus

* Remove case cpu_verify_vmbus_force_online; its logic is covered by the new case verify_cpu_online_offline.

* Migrate verify_cpu_hot_plug and verify_cpu_hot_plug_stress.
  • Loading branch information
LiliDeng authored Jan 6, 2022
1 parent 17f829c commit 8106c40
Show file tree
Hide file tree
Showing 7 changed files with 272 additions and 88 deletions.
2 changes: 1 addition & 1 deletion lisa/features/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def get_gpu_count_with_lsvmbus(self) -> int:
bridge_device_count = 0

lsvmbus_tool = self._node.tools[Lsvmbus]
device_list = lsvmbus_tool.get_device_channels_from_lsvmbus()
device_list = lsvmbus_tool.get_device_channels()
for device in device_list:
for name, id, bridge_count in self.gpu_devices:
if id in device.device_id:
Expand Down
5 changes: 2 additions & 3 deletions lisa/tools/lsvmbus.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,9 @@ def install(self) -> bool:

return self._check_exists()

def get_device_channels_from_lsvmbus(
self, force_run: bool = False
) -> List[VmBusDevice]:
def get_device_channels(self, force_run: bool = False) -> List[VmBusDevice]:
if (not self._vmbus_devices) or force_run:
self._vmbus_devices = []
result = self.run("-vv", force_run=force_run, shell=True)
if result.exit_code != 0:
result = self.run("-vv", force_run=force_run, shell=True, sudo=True)
Expand Down
85 changes: 2 additions & 83 deletions microsoft/testsuites/core/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
from __future__ import annotations

import time
from pathlib import PurePosixPath

from assertpy.assertpy import assert_that

from lisa import (
BadEnvironmentStateException,
LisaException,
Logger,
Node,
Expand All @@ -17,89 +15,19 @@
TestSuite,
TestSuiteMetadata,
)
from lisa.tools import Cat, Echo, InterruptInspector, Lscpu, Lsvmbus, TaskSet, Uname


class CPUState:
    """String values used for a CPU's state when written to / read from sysfs."""

    # "1" marks a cpu that is online.
    ONLINE: str = "1"
    # "0" marks a cpu that is offline.
    OFFLINE: str = "0"

from lisa.tools import Cat, InterruptInspector, Lscpu, TaskSet, Uname

hyperv_interrupt_substr = ["hyperv", "Hypervisor", "Hyper-V"]


@TestSuiteMetadata(
area="cpu",
area="core",
category="functional",
description="""
This test suite is used to run CPU related tests.
""",
)
class CPU(TestSuite):
@TestCaseMetadata(
    description="""
This test will check that CPU assigned to lsvmbus
channels cannot be put offline.
Steps :
1. Get the list of lsvmbus channel cpu mappings using
command `lsvmbus -vv`.
2. Create a set of cpu's assigned to lsvmbus channels.
3. Try to put cpu offline by running
`echo 0 > /sys/devices/system/cpu/cpu/<cpu_id>/online`.
Note : We skip cpu 0 as it handles system interrupts.
4. Ensure that cpu is still online by checking state '1' in
`/sys/devices/system/cpu/cpu/<target_cpu>/online`.
""",
    priority=2,
)
def cpu_verify_vmbus_force_online(self, node: Node, log: Logger) -> None:
    """Verify that CPUs serving vmbus channels cannot be taken offline.

    Args:
        node: node under test.
        log: logger used for progress messages.

    Raises:
        LisaException: non-zero CPUs are mapped to vmbus channels but none of
            them exposes an ``online`` file.
        BadEnvironmentStateException: a mapped CPU went offline and could not
            be brought back online.
        AssertionError: a mapped CPU went offline (and was brought back).
    """
    cpu_count = node.tools[Lscpu].get_core_count()
    log.debug(f"{cpu_count} CPU cores detected...")

    # Find CPUs (except CPU0) which are mapped to LSVMBUS channels and have
    # the `/sys/devices/system/cpu/cpu<id>/online` file present.
    channels = node.tools[Lsvmbus].get_device_channels_from_lsvmbus()
    is_non_zero_cpu_id_mapped = False
    mapped_cpu_set = set()
    for channel in channels:
        for channel_vp_map in channel.channel_vp_map:
            target_cpu = channel_vp_map.target_cpu
            if target_cpu == "0":
                # CPU0 is skipped: it handles system interrupts.
                continue
            is_non_zero_cpu_id_mapped = True
            file_path = self._get_cpu_config_file(target_cpu)
            if node.shell.exists(PurePosixPath(file_path)):
                mapped_cpu_set.add(target_cpu)

    # Fail the test if the `online` file is missing for every CPU (except
    # CPU0) mapped to LSVMBUS channels. This is to catch distros which have
    # this unexpected behaviour.
    if is_non_zero_cpu_id_mapped and not mapped_cpu_set:
        # The two literals are concatenated implicitly, so the first one must
        # end with a space to keep the words apart.
        raise LisaException(
            "/sys/devices/system/cpu/cpu<id>/online file "
            "does not exist for all CPUs mapped to LSVMBUS channels."
        )

    for target_cpu in mapped_cpu_set:
        log.debug(f"Checking CPU {target_cpu} on /sys/device/....")
        went_offline = self._set_cpu_state(target_cpu, CPUState.OFFLINE, node)
        if went_offline:
            # Try to bring the CPU back to its original state before failing.
            reset = self._set_cpu_state(target_cpu, CPUState.ONLINE, node)
            exception_message = (
                f"Expected CPU {target_cpu} state : {CPUState.ONLINE}(online), "
                f"actual state : {CPUState.OFFLINE}(offline). CPU's mapped to "
                f"LSVMBUS channels shouldn't be in state "
                f"{CPUState.OFFLINE}(offline)."
            )
            if not reset:
                raise BadEnvironmentStateException(
                    exception_message,
                    f"The test failed leaving CPU {target_cpu} in a bad state.",
                )
            raise AssertionError(exception_message)

@TestCaseMetadata(
description="""
This test case will check that L3 cache is correctly mapped
Expand Down Expand Up @@ -268,15 +196,6 @@ def verify_vmbus_interrupts(self, node: Node, log: Logger) -> None:
if not found_hyperv_interrupt:
raise LisaException("Hyper-V interrupts are not recorded.")

def _get_cpu_config_file(self, cpu_id: str) -> str:
return f"/sys/devices/system/cpu/cpu{cpu_id}/online"

def _set_cpu_state(self, cpu_id: str, state: str, node: Node) -> bool:
    """Write ``state`` to the cpu's ``online`` file and confirm it took effect.

    Returns:
        True when the value read back from the file equals ``state``.
    """
    state_file = self._get_cpu_config_file(cpu_id)
    node.tools[Echo].write_to_file(state, node.get_pure_path(state_file), sudo=True)
    # Read with force_run so the comparison uses a fresh read of the file.
    observed = node.tools[Cat].read(state_file, force_run=True, sudo=True)
    return observed == state

def _create_stimer_interrupts(self, node: Node, cpu_count: int) -> None:
# Run CPU intensive workload to create hyper-v synthetic timer
# interrupts.
Expand Down
2 changes: 1 addition & 1 deletion microsoft/testsuites/core/lsvmbus.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def lsvmbus_count_devices_channels(self, node: Node) -> None:
"1" == node.tools[VmGeneration].get_generation()
)
lsvmbus_tool = node.tools[Lsvmbus]
vmbus_devices_list = lsvmbus_tool.get_device_channels_from_lsvmbus()
vmbus_devices_list = lsvmbus_tool.get_device_channels()
actual_vmbus_device_names = [x.name for x in vmbus_devices_list]
assert_that(actual_vmbus_device_names).is_not_none()
assert_that(vmbus_devices.names).is_subset_of(actual_vmbus_device_names)
Expand Down
178 changes: 178 additions & 0 deletions microsoft/testsuites/cpu/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import annotations

from typing import Dict, List

from lisa import BadEnvironmentStateException, Logger, Node
from lisa.tools import Cat, Dmesg, Echo, Lscpu, Lsvmbus, Uname
from lisa.util import SkippedException


class CPUState:
    """String values written to / read back from a cpu's sysfs ``online`` file."""

    # "1" marks a cpu that is online.
    ONLINE: str = "1"
    # "0" marks a cpu that is offline.
    OFFLINE: str = "0"


def check_runnable(node: Node) -> None:
    """Skip the calling test unless the kernel config enables cpu hotplug.

    Greps ``/boot/config-<kernel version>`` on the node for
    ``CONFIG_HOTPLUG_CPU=y``.

    Raises:
        SkippedException: the grep command exited with a non-zero code.
    """
    linux_info = node.tools[Uname].get_linux_information()
    # NOTE(review): the path is built from ``kernel_version``; presumably its
    # string form matches the suffix of the config file name - confirm.
    # NOTE(review): a missing config file also yields a non-zero grep exit code
    # and is therefore reported as "doesn't support cpu hotplug".
    grep_result = node.execute(
        f"grep CONFIG_HOTPLUG_CPU=y /boot/config-{linux_info.kernel_version}",
        shell=True,
    )
    if grep_result.exit_code == 0:
        return
    raise SkippedException(f"the distro {node.os.name} doesn't support cpu hotplug.")


def set_interrupts_assigned_cpu(
    log: Logger, node: Node, target_cpu: str = "0"
) -> Dict[str, str]:
    """Route all vmbus channel interrupts to ``target_cpu`` and return a backup.

    The reassignment is attempted only when the kernel version is 5.8+ and the
    vmbus version is 4.1+; otherwise nothing is changed and an empty dict is
    returned.

    Args:
        log: logger for diagnostic messages.
        node: node whose vmbus channels are reassigned.
        target_cpu: cpu id (as a string) that should receive the interrupts.

    Returns:
        Mapping of each rewritten channel ``cpu`` file path to the cpu id the
        channel had before the change. Channels already on ``target_cpu`` are
        not included. Pass it to ``restore_interrupts_assignment`` to undo.

    Raises:
        Exception: whatever ``assign_interrupts`` raises; the channels are
            rolled back to the saved assignment before it propagates.
    """
    uname = node.tools[Uname]
    kernel_version = uname.get_linux_information().kernel_version
    dmesg = node.tools[Dmesg]
    lsvmbus = node.tools[Lsvmbus]
    vmbus_version = dmesg.get_vmbus_version()
    file_path_list: Dict[str, str] = {}
    # the vmbus interrupt channel reassignment feature is available in 5.8+ kernel and
    # vmbus version in 4.1+, the vmbus version is negotiated with the host.
    # NOTE(review): assumes both versions are version objects whose comparison
    # operators accept a string operand (plain str would compare
    # lexicographically) - confirm.
    if not (kernel_version >= "5.8.0" and vmbus_version >= "4.1.0"):
        # if current distro doesn't support this feature, the backup dict will be
        # empty, there is nothing we can restore later, the case will rely on actual
        # cpu usage on vm, if no idle cpu, then case will be skipped.
        log.debug(
            f"current distro {node.os.name}, os version {kernel_version}, "
            f"vmbus version {vmbus_version} doesn't support "
            "change channels target cpu feature."
        )
        return file_path_list

    # save the raw cpu number for each channel for restoring later.
    channels = lsvmbus.get_device_channels(force_run=True)
    for channel in channels:
        for channel_vp_map in channel.channel_vp_map:
            current_target_cpu = channel_vp_map.target_cpu
            if current_target_cpu == target_cpu:
                continue
            cpu_file = get_interrupts_assigned_cpu(
                channel.device_id, channel_vp_map.rel_id
            )
            file_path_list[cpu_file] = current_target_cpu

    # set all vmbus channel interrupts go into cpu target_cpu.
    try:
        assign_interrupts(file_path_list, node, target_cpu)
    except Exception:
        # A failure part-way through leaves some channels already moved, and
        # the caller never receives the backup dict, so undo the change here.
        log.debug(
            f"failed to assign vmbus channels to cpu {target_cpu}, "
            "restoring the original assignment."
        )
        try:
            restore_interrupts_assignment(file_path_list, node)
        except Exception as restore_error:
            # Best effort only: keep the original failure as the raised error.
            log.debug(f"restoring vmbus channel assignment failed: {restore_error}")
        raise
    return file_path_list


def get_idle_cpus(node: Node) -> List[str]:
    """Return the ids of cpus that no vmbus channel currently targets.

    cpu 0 is never returned: it is left out of the candidate range because it
    is usually not allowed to be hot-plugged.

    Args:
        node: node to inspect; ``lsvmbus`` is re-run to get fresh data.

    Returns:
        cpu ids as strings, in ascending order, from 1 to core count - 1.
    """
    channels = node.tools[Lsvmbus].get_device_channels(force_run=True)

    # every non-zero cpu that some vmbus channel is bound to
    busy_cpus = set()
    for channel in channels:
        busy_cpus.update(
            vp_map.target_cpu
            for vp_map in channel.channel_vp_map
            if vp_map.target_cpu != "0"
        )

    core_count = node.tools[Lscpu].get_core_count()
    # candidates are cpu 1..N-1; drop the ones that are in use
    candidates = map(str, range(1, core_count))
    return [cpu for cpu in candidates if cpu not in busy_cpus]


def set_idle_cpu_offline_online(log: Logger, node: Node, idle_cpu: List[str]) -> None:
    """Take each given cpu offline and then bring it back online.

    Args:
        log: logger for the per-step debug messages.
        node: node that owns the cpus.
        idle_cpu: cpu ids (as strings) to cycle.

    Raises:
        BadEnvironmentStateException: a cpu did not reach the requested state.
    """
    # (bring online?, name/value of the state we leave, name/value we expect)
    steps = (
        (False, "online", CPUState.ONLINE, "offline", CPUState.OFFLINE),
        (True, "offline", CPUState.OFFLINE, "online", CPUState.ONLINE),
    )
    for target_cpu in idle_cpu:
        for bring_online, old_name, old_state, new_name, new_state in steps:
            succeeded = set_cpu_state(node, target_cpu, bring_online)
            log.debug(f"set cpu{target_cpu} from {old_name} to {new_name}.")
            if succeeded:
                continue
            raise BadEnvironmentStateException(
                f"expected cpu{target_cpu} state: {new_state}({new_name}), "
                f"actual state: {old_state}({old_name}).",
                f"the test failed leaving cpu{target_cpu} in a bad state.",
            )


def verify_cpu_hot_plug(log: Logger, node: Node, run_times: int = 1) -> None:
    """Run the cpu offline/online cycle ``run_times`` times on ``node``.

    Each round moves the vmbus channel interrupts to cpu 0 (when supported),
    cycles every idle cpu offline and online, then restores the channels.

    Args:
        log: logger for progress messages.
        node: node under test.
        run_times: number of rounds to execute.

    Raises:
        SkippedException: cpu hotplug is not enabled in the kernel config, or
            no idle cpu is available.
        BadEnvironmentStateException: a cpu could not be switched as requested.
    """
    check_runnable(node)
    channel_backup: Dict[str, str] = {}
    restored = False
    try:
        for round_index in range(run_times):
            log.debug(f"start the {round_index + 1} time(s) testing.")
            restored = False
            # move all vmbus channels to cpu 0 when the kernel supports it; the
            # returned dict is empty otherwise.
            channel_backup = set_interrupts_assigned_cpu(log, node)
            # without that feature the result depends on which cpus the
            # channels happen to use right now (cpu 0 is always excluded).
            candidates = get_idle_cpus(node)
            if not candidates:
                raise SkippedException(
                    "all of the cpu are associated vmbus channels,"
                    " no idle cpu can be used to test hotplug."
                )
            # online -> offline -> online for every idle cpu
            set_idle_cpu_offline_online(log, node, candidates)
            # nothing is written when the backup dict is empty
            restore_interrupts_assignment(channel_backup, node)
            restored = True
    finally:
        # a round that ended early still has to put the channels back
        if not restored:
            restore_interrupts_assignment(channel_backup, node)


def get_cpu_state_file(cpu_id: str) -> str:
    """Return the sysfs path of the ``online`` file for cpu ``cpu_id``."""
    return "/sys/devices/system/cpu/cpu{}/online".format(cpu_id)


def get_interrupts_assigned_cpu(device_id: str, channel_id: str) -> str:
    """Return the sysfs path of the ``cpu`` file of one vmbus channel.

    Despite its name, this returns the file path, not the cpu number.
    """
    return "/sys/bus/vmbus/devices/{}/channels/{}/cpu".format(device_id, channel_id)


def assign_interrupts(
    path_cpu: Dict[str, str],
    node: Node,
    target_cpu: str = "0",
) -> None:
    """Write ``target_cpu`` into every channel ``cpu`` file listed in ``path_cpu``.

    Args:
        path_cpu: mapping keyed by channel ``cpu`` file path; only the keys are
            used here.
        node: node on which the files are written (with sudo).
        target_cpu: cpu id written to each file.
    """
    for cpu_file in path_cpu:
        node.tools[Echo].write_to_file(
            target_cpu, node.get_pure_path(cpu_file), sudo=True
        )


def restore_interrupts_assignment(
    path_cpu: Dict[str, str],
    node: Node,
) -> None:
    """Write each saved cpu id back into its channel ``cpu`` file.

    Args:
        path_cpu: mapping of channel ``cpu`` file path to the cpu id to write;
            an empty mapping makes this a no-op.
        node: node on which the files are written (with sudo).
    """
    if not path_cpu:
        return
    for cpu_file, saved_cpu in path_cpu.items():
        node.tools[Echo].write_to_file(
            saved_cpu, node.get_pure_path(cpu_file), sudo=True
        )


def set_cpu_state(node: Node, cpu: str, online: bool = False) -> bool:
    """Switch ``cpu`` online or offline and report whether the change stuck.

    Args:
        node: node that owns the cpu.
        cpu: cpu id as a string.
        online: True to bring the cpu online, False to take it offline.

    Returns:
        True when the value read back from the state file equals the value
        that was written.
    """
    wanted = CPUState.ONLINE if online else CPUState.OFFLINE
    state_file = get_cpu_state_file(cpu)
    node.tools[Echo].write_to_file(wanted, node.get_pure_path(state_file), sudo=True)
    # Read with force_run so the comparison uses a fresh read of the file.
    observed = node.tools[Cat].read(state_file, force_run=True, sudo=True)
    return observed == wanted
53 changes: 53 additions & 0 deletions microsoft/testsuites/cpu/cpusuite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import annotations

from lisa import (
Logger,
Node,
TestCaseMetadata,
TestSuite,
TestSuiteMetadata,
simple_requirement,
)
from microsoft.testsuites.cpu.common import verify_cpu_hot_plug


@TestSuiteMetadata(
    area="cpu",
    category="functional",
    description="""
This test suite is used to run cpu related tests.
""",
)
class CPUSuite(TestSuite):
    """Test suite holding the cpu hotplug test case."""

    @TestCaseMetadata(
        description="""
This test will check cpu hotplug.
Steps :
1. skip test case when kernel doesn't support cpu hotplug.
2. set all vmbus channels target to cpu 0.
when kernel version >= 5.8 and vmbus version >= 4.1, code supports changing
vmbus channels target cpu, by setting the cpu number to the file
/sys/bus/vmbus/devices/<device id>/channels/<channel id>/cpu.
then all cpus except for cpu 0 are in idle state.
2.1 save the raw cpu number of each channel for restoring after testing.
2.2 set all vmbus channel interrupts go into cpu 0.
3. collect idle cpu which can be used for hotplug.
if the kernel supports step 2, now in used cpu is 0.
exclude the in used cpu from all cpu list to get idle cpu set which can be
offline and online.
if the kernel doesn't step 2,
the idle cpu is quite rely on the cpu usage at that time.
4. skip testing when there is no idle cpu can be set offline and online.
5. set idle cpu offline then back to online.
6. restore the cpu vmbus channel target cpu back to the original state.
""",
        priority=3,
        requirement=simple_requirement(
            min_core_count=32,
        ),
    )
    def verify_cpu_hot_plug(self, log: Logger, node: Node) -> None:
        """Delegate to the module-level ``verify_cpu_hot_plug`` helper."""
        # Inside the method body the name resolves to the imported function,
        # not to this method, so this is not a recursive call.
        verify_cpu_hot_plug(log=log, node=node)
Loading

0 comments on commit 8106c40

Please sign in to comment.