Skip to content

Commit

Permalink
Fix kdump test for Mariner (#3475)
Browse files Browse the repository at this point in the history
* Add sufficient sized data-disk in kdump test

* Move print_additional_info just before panic

* Use execute_async and remove kill_on_timeout parameter

* Fix linter errors by moving the check into a new internal function

* Remove whitespace

* Use black format
  • Loading branch information
0xba1a authored Oct 22, 2024
1 parent b80e5bc commit 717e7c4
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 6 deletions.
107 changes: 106 additions & 1 deletion lisa/tools/kdump.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import math
import re
from pathlib import PurePath, PurePosixPath
from time import sleep
Expand All @@ -12,6 +13,8 @@
from lisa.executable import Tool
from lisa.operating_system import CBLMariner, Debian, Oracle, Posix, Redhat, Suse
from lisa.tools import Find, Gcc
from lisa.tools.lsblk import Lsblk
from lisa.tools.lscpu import Lscpu
from lisa.tools.make import Make
from lisa.tools.sysctl import Sysctl
from lisa.tools.tar import Tar
Expand Down Expand Up @@ -451,6 +454,10 @@ def check_crashkernel_loaded(self, crashkernel_memory: str) -> None:
# Check if memory is reserved for crash kernel
self._check_crashkernel_memory_reserved()

def capture_info(self) -> None:
    """Hook for printing additional info before the panic is triggered.

    This implementation intentionally does nothing; override it to log
    extra diagnostics.
    """
    return None


class KdumpRedhat(KdumpBase):
@property
Expand Down Expand Up @@ -597,15 +604,113 @@ def _install(self) -> bool:
self.node.os.install_packages("kexec-tools")
return self._check_exists()

def enable_kdump_service(self) -> None:
    """
    Adjust the kdump configuration files, then enable the kdump service.
    """
    conf_path = "/etc/kdump.conf"
    sed_tool = self.node.tools[Sed]

    # Each entry is (match_lines, regexp, replacement), applied in order.
    conf_edits = (
        # Comment out force_no_rebuild if it is present
        ("^force_no_rebuild", "force_no_rebuild", "#force_no_rebuild"),
        # Uncomment mariner_2_initrd_use_suffix. Otherwise the original
        # initrd file gets replaced, which causes a reboot-loop
        (
            "mariner_2_initrd_use_suffix",
            "#mariner_2_initrd_use_suffix",
            "mariner_2_initrd_use_suffix",
        ),
    )
    for line_filter, pattern, new_text in conf_edits:
        sed_tool.substitute(
            match_lines=line_filter,
            regexp=pattern,
            replacement=new_text,
            file=conf_path,
            sudo=True,
        )

    # Make sure nr_cpus is sized for the number of cores
    self.ensure_nr_cpus()

    super().enable_kdump_service()

def ensure_nr_cpus(self) -> None:
    """Rewrite nr_cpus in /etc/sysconfig/kdump based on the core count.

    The value written is the core count divided by 56, rounded up.
    NOTE(review): the rationale for the 56-cores-per-CPU ratio is not
    visible in this file - confirm before changing it.
    """
    cores_per_kdump_cpu = 56
    total_cores = self.node.tools[Lscpu].get_core_count()
    nr_cpus = math.ceil(total_cores / cores_per_kdump_cpu)

    # Replace nr_cpus=<whatever> with nr_cpus=<nr_cpus> on the
    # KDUMP_COMMANDLINE_APPEND line only
    self.node.tools[Sed].substitute(
        match_lines="^KDUMP_COMMANDLINE_APPEND",
        regexp="nr_cpus=[^[:space:]]*",
        replacement=f"nr_cpus={nr_cpus}",
        file="/etc/sysconfig/kdump",
        sudo=True,
    )

def calculate_crashkernel_size(self, total_memory: str) -> str:
    """Return an empty string; no crashkernel size is calculated here.

    For x86 and arm64 Mariner, the default setting is 256M, so
    ``total_memory`` is ignored.
    """
    return ""

def _get_crashkernel_cfg_file(self) -> str:
    """Return the config file that holds the crashkernel command line.

    When the OS major version is 3 or higher, the kexec-tools grub drop-in
    is used; otherwise /boot/mariner.cfg is used.
    """
    if self.node.os.information.version.major >= 3:
        return "/etc/default/grub.d/51_kexec_tools.cfg"
    return "/boot/mariner.cfg"

def _get_crashkernel_cfg_cmdline(self) -> str:
    """Return the name of the command-line entry in the crashkernel config."""
    cmdline_key = "mariner_cmdline"
    return cmdline_key

def _get_crashkernel_update_cmd(self, crashkernel: str) -> str:
    """Return an empty string as the update command; ``crashkernel`` is unused."""
    return ""

def config_resource_disk_dump_path(self, dump_path: str) -> None:
    """
    Redirect kdump output to ``dump_path``.

    The default /var/crash may not be big enough to store the dump file
    on systems with a lot of memory. This creates ``dump_path``, records it
    on ``self.dump_path``, comments out the existing ``path`` line in
    /etc/kdump.conf and appends a new ``path`` entry.
    """
    self.node.execute(
        f"mkdir -p {dump_path}",
        expected_exit_code=0,
        expected_exit_code_failure_message=(f"Fail to create dir {dump_path}"),
        shell=True,
        sudo=True,
    )
    self.dump_path = dump_path

    # Point kdump at the new location: disable the old entry, add ours
    conf_path = "/etc/kdump.conf"
    sed_tool = self.node.tools[Sed]
    sed_tool.substitute(
        match_lines="^path",
        regexp="path",
        replacement="#path",
        file=conf_path,
        sudo=True,
    )
    sed_tool.append(f"path {self.dump_path}", conf_path, sudo=True)

def capture_info(self) -> None:
    """Log kernel command line, kdump settings and disk layout.

    Each item is read fresh (``force_run=True``) and written to the log.
    """
    cat = self.node.tools[Cat]

    def read_file(path: str) -> str:
        # Always re-read the file with sudo instead of using a cached result
        return cat.run(path, force_run=True, sudo=True).stdout

    # /proc/cmdline
    cmdline = read_file("/proc/cmdline")
    self._log.info(f"Current kernel command line: {cmdline}")

    # crashkernel config file, e.g. /etc/default/grub.d/51_kexec_tools.cfg
    cfg_cmdline = read_file(self._get_crashkernel_cfg_file())
    self._log.info(f"Current kernel cmdline in config file: {cfg_cmdline}")

    # /etc/sysconfig/kdump
    sysconfig = read_file("/etc/sysconfig/kdump")
    self._log.info(f"Current kdump configuration: {sysconfig}")

    # /proc/sys/kernel/sysrq
    sysrq = read_file("/proc/sys/kernel/sysrq")
    self._log.info(f"Current sysrq value: {sysrq}")

    # lsblk -l output
    partitions = self.node.tools[Lsblk].run("-l", force_run=True).stdout
    self._log.info(f"Current disk partitions: {partitions}")

    # /etc/fstab
    fstab = read_file("/etc/fstab")
    self._log.info(f"Current fstab: {fstab}")

    # /etc/kdump.conf
    kdump_conf = read_file("/etc/kdump.conf")
    self._log.info(f"Current kdump configuration: {kdump_conf}")
29 changes: 24 additions & 5 deletions microsoft/testsuites/kdump/kdumpcrash.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,24 @@ def _get_resource_disk_dump_path(self, node: Node) -> str:
dump_path = mount_point + "/crash"
return dump_path

def _is_system_with_more_memory(self, node: Node) -> bool:
    """Tell whether total memory is large relative to the OS disk size.

    Returns True when the total-memory string contains "T", or when it
    contains "G", the OS disk size is known as an int, and the memory value
    is greater than the OS disk size minus 10. Returns False otherwise.
    NOTE(review): both values are presumably in GiB - confirm against
    Free.get_total_memory and the disk capability.
    """
    memory_text = node.tools[Free].get_total_memory()

    # Terabyte-scale memory always qualifies
    if "T" in memory_text:
        return True
    if "G" not in memory_text:
        return False

    # Without a usable integer OS disk size there is nothing to compare
    disk = node.capability.disk
    if not disk or not isinstance(disk.os_disk_size, int):
        return False

    return float(memory_text.strip("G")) > (disk.os_disk_size - 10)

def _kdump_test(self, node: Node, log_path: Path, log: Logger) -> None:
try:
self._check_supported(node)
Expand All @@ -281,14 +299,14 @@ def _kdump_test(self, node: Node, log_path: Path, log: Logger) -> None:
if self.is_auto:
self.crash_kernel = "auto"

if "T" in total_memory and float(total_memory.strip("T")) > 1:
# System memory is more than 1T, need to change the dump path
if self._is_system_with_more_memory(node):
# System memory is more than the OS disk size, need to change the dump path
# and increase the timeout duration
kdump.config_resource_disk_dump_path(
self._get_resource_disk_dump_path(node)
)
self.timeout_of_dump_crash = 1200
if float(total_memory.strip("T")) > 6:
if "T" in total_memory and float(total_memory.strip("T")) > 6:
self.timeout_of_dump_crash = 2000

kdump.config_crashkernel_memory(self.crash_kernel)
Expand All @@ -310,14 +328,15 @@ def _kdump_test(self, node: Node, log_path: Path, log: Logger) -> None:
echo.write_to_file("1", node.get_pure_path("/proc/sys/kernel/sysrq"), sudo=True)
node.execute("sync", shell=True, sudo=True)

kdump.capture_info()

try:
# Trigger kdump. After execute the trigger cmd, the VM will be disconnected
# We set a timeout time 10.
node.execute(
node.execute_async(
self.trigger_kdump_cmd,
shell=True,
sudo=True,
timeout=10,
)
except Exception as identifier:
log.debug(f"ignorable ssh exception: {identifier}")
Expand Down

0 comments on commit 717e7c4

Please sign in to comment.