Skip to content

Commit

Permalink
Skip installing the OFED driver when the image is an HPC image (#2844)
Browse files Browse the repository at this point in the history
* Skip installing the OFED driver when the image is an HPC image

* Fix verify_ibm_mpi on HPC images

* add is_hpc_image
  • Loading branch information
LiliDeng authored Jul 13, 2023
1 parent 7230be6 commit 9d60fe0
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 40 deletions.
13 changes: 13 additions & 0 deletions lisa/base_tools/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ def check_service_exists(self, name: str) -> bool:
def is_service_inactive(self, name: str) -> bool:
    """Report whether the service called *name* is inactive.

    The question is forwarded to the ``_is_service_inactive`` method of the
    tool held in ``self._internal_tool``; its answer is returned unchanged.
    """
    backend = self._internal_tool
    return backend._is_service_inactive(name)  # type: ignore

def is_service_running(self, name: str) -> bool:
    """Report whether the service called *name* is running.

    The question is forwarded to the ``_check_service_running`` method of
    the tool held in ``self._internal_tool``; its answer is returned
    unchanged.
    """
    # NOTE(review): this delegates to ``_check_service_running``, not to an
    # ``is_service_running`` method on the internal tool -- confirm that is
    # the intended target.
    backend = self._internal_tool
    return backend._check_service_running(name)  # type: ignore


class ServiceInternal(Tool):
@property
Expand Down Expand Up @@ -96,6 +99,10 @@ def restart_service(self, name: str, ignore_exit_code: int = 0) -> None:

_check_error_codes(cmd_result, ignore_exit_code)

def is_service_running(self, name: str) -> bool:
    """Report whether the status output of service *name* shows it active.

    Runs ``<name> status`` through this tool's ``run`` (``shell=True``,
    ``sudo=True``, ``force_run=True``) and returns True when the captured
    standard output contains the text ``Active: active``.
    """
    status_command = f"{name} status"
    status = self.run(status_command, shell=True, sudo=True, force_run=True)
    return "Active: active" in status.stdout


class Systemctl(Tool):
__STATE_PATTERN = re.compile(r"^(\s+)State:(\s+)(?P<state>.*)", re.M)
Expand Down Expand Up @@ -141,6 +148,12 @@ def state(self) -> str:
)
return group["state"]

def is_service_running(self, name: str) -> bool:
    """Report whether the status output of unit *name* shows it active.

    Passes ``--full --no-pager status <name>`` to this tool's ``run``
    (``shell=True``, ``sudo=True``, ``force_run=True``) and returns True
    when the captured standard output contains the text ``Active: active``.
    """
    status_args = f"--full --no-pager status {name}"
    status = self.run(status_args, shell=True, sudo=True, force_run=True)
    return "Active: active" in status.stdout

def _check_exists(self) -> bool:
return True

Expand Down
99 changes: 68 additions & 31 deletions lisa/features/infiniband.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from lisa.feature import Feature
from lisa.features import Disk
from lisa.operating_system import CentOs, Oracle, Redhat, Ubuntu
from lisa.tools import Firewall, Ls, Lspci, Make
from lisa.tools import Firewall, Ls, Lspci, Make, Service
from lisa.tools.tar import Tar
from lisa.util import (
LisaException,
Expand Down Expand Up @@ -55,6 +55,7 @@ class Infiniband(Feature):

def _initialize(self, *args: Any, **kwargs: Any) -> None:
super()._initialize(*args, **kwargs)
self.is_hpc_image = False
self.resource_disk_path = self._node.features[
Disk
].get_resource_disk_mount_point()
Expand Down Expand Up @@ -171,7 +172,40 @@ def get_pkey(self) -> str:
cat = self._node.tools[Cat]
return cat.read(f"/sys/class/infiniband/{ib_device_name}/ports/1/pkeys/0")

def setup_rdma(self) -> None: # noqa: C901
def setup_rdma(self) -> None:
    """Prepare the node for RDMA / InfiniBand use.

    Steps, in order:

    1. Call ``install_ofed`` unless ``self.is_hpc_image`` is set.
    2. Stop the firewall.
    3. Rewrite ``SELINUX=enforcing`` to ``SELINUX=disabled`` in
       ``/etc/selinux/config``.
    4. On Ubuntu whose version compares greater than ``"18.4.0"``, and only
       when the ``azure_persistent_rdma_naming`` service is not running,
       append ``net.ifnames=0 biosdevname=0`` to ``GRUB_CMDLINE_LINUX`` in
       ``/etc/default/grub``, run ``update-grub`` and reboot the node so the
       IB device name stays consistent across reboots.
    """
    if not self.is_hpc_image:
        self.install_ofed()

    node = self._node

    # Turn off the firewall.
    node.tools[Firewall].stop()

    # Disable SELinux.
    sed = node.tools[Sed]
    sed.substitute(
        regexp="SELINUX=enforcing",
        replacement="SELINUX=disabled",
        file="/etc/selinux/config",
        sudo=True,
    )

    # The kernel-parameter tweak only applies to Ubuntu newer than 18.04.
    # Evaluate these local checks first so the service-status command is
    # not executed on nodes where its result cannot matter.
    needs_stable_names = (
        isinstance(node.os, Ubuntu) and node.os.information.version > "18.4.0"
    )
    if not needs_stable_names:
        return

    # When the persistent RDMA naming service is running, the boot
    # parameters are left untouched.
    if node.tools[Service].is_service_running("azure_persistent_rdma_naming"):
        return

    # Add net.ifnames=0 biosdevname=0 to the boot kernel parameters to make
    # the IB device name consistent across reboots.
    sed.substitute(
        regexp='GRUB_CMDLINE_LINUX="\\(.*\\)"',
        replacement='GRUB_CMDLINE_LINUX="\\1 net.ifnames=0 biosdevname=0"',
        file="/etc/default/grub",
        sudo=True,
    )
    node.execute("update-grub", sudo=True)
    node.reboot()

def _install_dependencies(self) -> None:
node = self._node
os_version = node.os.information.release.split(".")
# Dependencies
Expand Down Expand Up @@ -297,18 +331,12 @@ def setup_rdma(self) -> None: # noqa: C901
"supported by the HCP team",
)

# Turn off firewall
firewall = node.tools[Firewall]
firewall.stop()

# Disable SELinux
sed = node.tools[Sed]
sed.substitute(
regexp="SELINUX=enforcing",
replacement="SELINUX=disabled",
file="/etc/selinux/config",
sudo=True,
)
def install_ofed(self) -> None:
node = self._node
os_version = node.os.information.release.split(".")
# Dependencies
kernel = node.tools[Uname].get_linux_information().kernel_version_raw
self._install_dependencies()

# Install OFED
mofed_version = self._get_mofed_version()
Expand Down Expand Up @@ -390,16 +418,6 @@ def setup_rdma(self) -> None: # noqa: C901
sudo=True,
)

if isinstance(node.os, Ubuntu) and node.os.information.version > "18.4.0":
node.tools[Sed].substitute(
regexp='GRUB_CMDLINE_LINUX="\\(.*\\)"',
replacement='GRUB_CMDLINE_LINUX="\\1 net.ifnames=0 biosdevname=0"',
file="/etc/default/grub",
sudo=True,
)
node.execute("update-grub", sudo=True)
node.reboot()

def install_intel_mpi(self) -> None:
node = self._node
# Install Intel MPI
Expand Down Expand Up @@ -451,6 +469,15 @@ def install_ibm_mpi(self) -> None:
node = self._node
if isinstance(node.os, Redhat):
node.os.install_packages("libstdc++.i686")
if isinstance(node.os, Ubuntu):
for package in [
"lib32gcc-9-dev",
"python3-dev",
"lib32gcc-8-dev",
"python-dev",
]:
if node.os.is_package_in_repo(package):
node.os.install_packages(package)
# Install Open MPI
wget = node.tools[Wget]
script_path = wget.get(
Expand All @@ -467,13 +494,23 @@ def install_ibm_mpi(self) -> None:
expected_exit_code=0,
expected_exit_code_failure_message="Failed to install IBM MPI.",
)
make = node.tools[Make]
make.make(
"",
cwd=node.get_pure_path("/opt/ibm/platform_mpi/help"),
update_envs={"MPI_IB_PKEY": self.get_pkey()},
sudo=True,
)
# if it is hpc image, use module tool load mpi/hpcx to compile the ping_pong.c
if self.is_hpc_image:
node.execute(
"bash -c 'source /usr/share/modules/init/bash"
" && module load mpi/hpcx && mpicc -o ping_pong ping_pong.c'",
cwd=node.get_pure_path("/opt/ibm/platform_mpi/help"),
sudo=True,
shell=True,
)
else:
make = node.tools[Make]
make.make(
"",
cwd=node.get_pure_path("/opt/ibm/platform_mpi/help"),
update_envs={"MPI_IB_PKEY": self.get_pkey()},
sudo=True,
)

def install_mvapich_mpi(self) -> None:
node = self._node
Expand Down
2 changes: 2 additions & 0 deletions lisa/sut_orchestrator/azure/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,8 @@ def is_over_nd(self) -> bool:
return "hvnd_try_bind_nic" in dmesg.get_output()

def setup_rdma(self) -> None:
if self._node.tools[Ls].path_exists("/opt/azurehpc/component_versions.txt"):
self.is_hpc_image = True
super().setup_rdma()
waagent = self._node.tools[Waagent]
devices = self._get_ib_device_names()
Expand Down
2 changes: 1 addition & 1 deletion lisa/tools/ip.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class Ip(Tool):
__ip_addr_show_regex = re.compile(
(
r"\d+: (?P<name>\w+): \<.+\> .+\n\s+link\/(?:ether|infiniband|loopback)"
r" (?P<mac>[0-9a-z:]+)( .+\\n(?:(?:.+\n\s+|.*)altname \w+))?"
r" (?P<mac>[0-9a-z:]+)( .+\n(?:(?:.+\n\s+|.*)altname \w+))?"
r"(.*(?:\s+inet (?P<ip_addr>[\d.]+)\/.*\n))?"
)
)
Expand Down
40 changes: 32 additions & 8 deletions microsoft/testsuites/hpc/infinibandsuite.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,20 +433,44 @@ def verify_ibm_mpi(self, environment: Environment, log: Logger) -> None:
server_ssh.add_known_host(client_ip)
client_ssh.add_known_host(server_ip)

# if it is hpc image, use module tool load mpi/hpcx
# then run pingpong test
if server_ib.is_hpc_image:
command_str_1 = (
"bash -c 'source /usr/share/modules/init/bash && module load mpi/hpcx "
f"&& mpirun --host {server_ip}:1,{server_ip}:1 -np 2 -x "
f"MPI_IB_PKEY={server_ib.get_pkey()} -x LD_LIBRARY_PATH "
"/opt/ibm/platform_mpi/help/ping_pong 4096'"
)
command_str_2 = (
"bash -c 'source /usr/share/modules/init/bash && module load mpi/hpcx "
f"&& mpirun --host {server_ip}:1,{client_ip}:1 -np 2 -x "
f"MPI_IB_PKEY={server_ib.get_pkey()} -x LD_LIBRARY_PATH "
"/opt/ibm/platform_mpi/help/ping_pong 4096'"
)
else:
command_str_1 = (
"/opt/ibm/platform_mpi/bin/mpirun "
f"-hostlist {server_ip}:1,{server_ip}:1 -np 2 -e "
f"MPI_IB_PKEY={server_ib.get_pkey()} -ibv /opt/ibm/platform_mpi/help/"
"ping_pong 4096"
)
command_str_2 = (
"/opt/ibm/platform_mpi/bin/mpirun "
f"-hostlist {server_ip}:1,{client_ip}:1 -np 2 -e "
f"MPI_IB_PKEY={server_ib.get_pkey()} -ibv /opt/ibm/platform_mpi/help/"
"ping_pong 4096"
)
server_node.execute(
"/opt/ibm/platform_mpi/bin/mpirun "
f"-hostlist {server_ip}:1,{server_ip}:1 -np 2 -e "
f"MPI_IB_PKEY={server_ib.get_pkey()} -ibv /opt/ibm/platform_mpi/help/"
"ping_pong 4096",
command_str_1,
shell=True,
expected_exit_code=0,
expected_exit_code_failure_message="Infiniband intra-node ping pong "
"test failed with IBM MPI",
)
server_node.execute(
"/opt/ibm/platform_mpi/bin/mpirun "
f"-hostlist {server_ip}:1,{client_ip}:1 -np 2 -e "
f"MPI_IB_PKEY={server_ib.get_pkey()} -ibv /opt/ibm/platform_mpi/help/"
"ping_pong 4096",
command_str_2,
shell=True,
expected_exit_code=0,
expected_exit_code_failure_message="Infiniband inter-node ping pong "
"test failed with IBM MPI",
Expand Down

0 comments on commit 9d60fe0

Please sign in to comment.