Skip to content

Commit

Permalink
Merge pull request #85 from cdesiniotis/move-file-discovery-to-go
Browse files Browse the repository at this point in the history
[mig-manager] move library/nvidia-smi discovery logic to Go
  • Loading branch information
cdesiniotis authored Jun 25, 2024
2 parents 3617e6a + f8b0c31 commit 138411f
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 61 deletions.
89 changes: 89 additions & 0 deletions cmd/nvidia-mig-manager/find.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/

package main

import (
	"errors"
	"fmt"
	"path/filepath"
)

// root is a filesystem path that serves as the base directory when searching
// for driver files; the search locations used by its methods are joined onto
// this path.
type root string

// getDriverLibraryPath locates `libnvidia-ml.so.1` beneath the driver root and
// returns the path reported by findFile. The well-known library folders listed
// below are passed to findFile as the locations to search.
// The folder holding this file is also expected to contain the other driver files.
//
// On failure an empty path is returned together with the error from findFile.
func (r root) getDriverLibraryPath() (string, error) {
	found, err := r.findFile(
		"libnvidia-ml.so.1",
		"/usr/lib64",
		"/usr/lib/x86_64-linux-gnu",
		"/usr/lib/aarch64-linux-gnu",
		"/lib64",
		"/lib/x86_64-linux-gnu",
		"/lib/aarch64-linux-gnu",
	)
	if err == nil {
		return found, nil
	}
	return "", err
}

// getNvidiaSMIPath locates the `nvidia-smi` executable beneath the driver root
// and returns the path reported by findFile. The usual binary folders listed
// below are passed to findFile as the locations to search.
//
// On failure an empty path is returned together with the error from findFile.
func (r root) getNvidiaSMIPath() (string, error) {
	found, err := r.findFile(
		"nvidia-smi",
		"/usr/bin",
		"/usr/sbin",
		"/bin",
		"/sbin",
	)
	if err == nil {
		return found, nil
	}
	return "", err
}

// findFile searches the root for a specified file.
// A number of folders can be specified to search in addition to the root itself;
// the root is tried first, then each folder of searchIn in the order given.
// If the file represents a symlink, this is resolved and the final path is returned.
//
// When no candidate can be resolved, the returned error wraps the failure of
// every attempted location, so the underlying cause (for example a missing
// file versus a permission problem) stays visible to the caller and can be
// inspected with errors.Is / errors.As.
func (r root) findFile(name string, searchIn ...string) (string, error) {
	dirs := append([]string{"/"}, searchIn...)
	errs := make([]error, 0, len(dirs))

	for _, d := range dirs {
		candidate, err := resolveLink(filepath.Join(string(r), d, name))
		if err != nil {
			// A miss in one folder is expected; remember why and keep looking.
			errs = append(errs, err)
			continue
		}
		return candidate, nil
	}

	return "", fmt.Errorf("error locating %q: %w", name, errors.Join(errs...))
}

// resolveLink follows any symlinks in l and returns the resulting path; for a
// regular file the (cleaned) path of the file itself is returned.
// This is analogous to running `readlink -f ${l}`.
//
// If the path cannot be evaluated, an empty string is returned along with an
// error that wraps the failure reported by filepath.EvalSymlinks.
func resolveLink(l string) (string, error) {
	target, err := filepath.EvalSymlinks(l)
	if err == nil {
		return target, nil
	}
	return "", fmt.Errorf("error resolving link '%s': %w", l, err)
}
48 changes: 45 additions & 3 deletions cmd/nvidia-mig-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"

Expand Down Expand Up @@ -280,6 +281,11 @@ func start(c *cli.Context) error {
return fmt.Errorf("error building kubernetes clientset from config: %s", err)
}

driverLibraryPath, nvidiaSMIPath, err := getPathsForCDI()
if err != nil {
return fmt.Errorf("failed to get paths required for cdi: %w", err)
}

migConfig := NewSyncableMigConfig()

stop := ContinuouslySyncMigConfigChanges(clientset, migConfig)
Expand All @@ -289,7 +295,7 @@ func start(c *cli.Context) error {
log.Infof("Waiting for change to '%s' label", MigConfigLabel)
value := migConfig.Get()
log.Infof("Updating to MIG config: %s", value)
err := runScript(value)
err := runScript(value, driverLibraryPath, nvidiaSMIPath)
if err != nil {
log.Errorf("Error: %s", err)
continue
Expand All @@ -298,6 +304,42 @@ func start(c *cli.Context) error {
}
}

// getPathsForCDI returns the paths to libnvidia-ml.so.1 and nvidia-smi, in
// that order, expressed relative to the host rootFS. Two empty strings and a
// nil error are returned when CDI is disabled or when driverRoot == devRoot,
// since no discovery is needed in those cases.
//
// Background: after a MIG configuration has been applied, and before a CDI
// spec is generated, nvidia-smi has to be run so that the nvidia-cap* device
// nodes get created. When driverRoot != devRoot, running nvidia-smi requires
// knowing where libnvidia-ml.so.1 and nvidia-smi live. The lookup is done
// once here and the results are handed to reconfigure-mig.sh as arguments.
//
// Currently, driverRoot != devRoot only when devRoot='/'. Since mig-manager
// has rw access to the host rootFS (at hostRootMountFlag), reconfigure-mig.sh
// will first chroot into the host rootFS before invoking nvidia-smi, so the
// device nodes get created at '/dev' on the host.
func getPathsForCDI() (string, string, error) {
	needsDiscovery := cdiEnabledFlag && driverRoot != devRoot
	if !needsDiscovery {
		return "", "", nil
	}

	// relativeToHost strips the leading host root mount (e.g. '/host') so that
	// the path is relative to the host rootFS.
	relativeToHost := func(p string) string {
		return filepath.Clean(strings.TrimPrefix(p, hostRootMountFlag))
	}

	// The driver root as seen from inside this container.
	mountedDriverRoot := root(filepath.Join(hostRootMountFlag, driverRoot))

	libPath, err := mountedDriverRoot.getDriverLibraryPath()
	if err != nil {
		return "", "", fmt.Errorf("failed to locate driver libraries: %w", err)
	}

	smiPath, err := mountedDriverRoot.getNvidiaSMIPath()
	if err != nil {
		return "", "", fmt.Errorf("failed to locate nvidia-smi: %w", err)
	}

	return relativeToHost(libPath), relativeToHost(smiPath), nil
}

func parseGPUCLientsFile(file string) (*GPUClients, error) {
var err error
var yamlBytes []byte
Expand All @@ -320,7 +362,7 @@ func parseGPUCLientsFile(file string) (*GPUClients, error) {
return &clients, nil
}

func runScript(migConfigValue string) error {
func runScript(migConfigValue string, driverLibraryPath string, nvidiaSMIPath string) error {
gpuClients, err := parseGPUCLientsFile(gpuClientsFileFlag)
if err != nil {
return fmt.Errorf("error parsing host's GPU clients file: %s", err)
Expand All @@ -338,7 +380,7 @@ func runScript(migConfigValue string) error {
"-p", defaultGPUClientsNamespaceFlag,
}
if cdiEnabledFlag {
args = append(args, "-e", "-t", driverRoot, "-a", driverRootCtrPath, "-b", devRoot, "-j", devRootCtrPath)
args = append(args, "-e", "-t", driverRoot, "-a", driverRootCtrPath, "-b", devRoot, "-j", devRootCtrPath, "-l", driverLibraryPath, "-q", nvidiaSMIPath)
}
if withRebootFlag {
args = append(args, "-r")
Expand Down
82 changes: 24 additions & 58 deletions deployments/container/reconfigure-mig.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@ DRIVER_ROOT=""
DRIVER_ROOT_CTR_PATH=""
DEV_ROOT=""
DEV_ROOT_CTR_PATH=""
DRIVER_LIBRARY_PATH=""
NVIDIA_SMI_PATH=""

export SYSTEMD_LOG_LEVEL="info"

function usage() {
echo "USAGE:"
echo " ${0} -h "
echo " ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [-e -t <driver-root> -a <driver-root-ctr-path> -b <dev-root> -j <dev-root-ctr-path>] [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
echo " ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [-e -t <driver-root> -a <driver-root-ctr-path> -b <dev-root> -j <dev-root-ctr-path> -l <driver-library-path> -q <nvidia-smi-path> ] [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
echo ""
echo "OPTIONS:"
echo " -h Display this help message"
Expand All @@ -56,9 +58,11 @@ function usage() {
echo " -a <driver-root-ctr-path> Root path to the NVIDIA driver installation mounted in the container"
echo " -b <dev-root> Root path to the NVIDIA device nodes"
echo " -j <dev-root-ctr-path> Root path to the NVIDIA device nodes mounted in the container"
echo " -l <driver-library-path> Path to libnvidia-ml.so.1 in the container"
echo " -q <nvidia-smi-path> Path to nvidia-smi in the container"
}

while getopts "hrden:f:c:m:i:o:g:k:p:t:a:b:j:" opt; do
while getopts "hrden:f:c:m:i:o:g:k:p:t:a:b:j:l:q:" opt; do
case ${opt} in
h ) # process option h
usage; exit 0
Expand Down Expand Up @@ -111,7 +115,13 @@ while getopts "hrden:f:c:m:i:o:g:k:p:t:a:b:j:" opt; do
j ) # process option j
DEV_ROOT_CTR_PATH=${OPTARG}
;;
\? ) echo "Usage: ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [-e -t <driver-root> -a <driver-root-ctr-path> -b <dev-root> -j <dev-root-ctr-path>] [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
l ) # process option l
DRIVER_LIBRARY_PATH=${OPTARG}
;;
q ) # process option q
NVIDIA_SMI_PATH=${OPTARG}
;;
\? ) echo "Usage: ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [-e -t <driver-root> -a <driver-root-ctr-path> -b <dev-root> -j <dev-root-ctr-path> -l <driver-library-path> -q <nvidia-smi-path> ] [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
;;
esac
done
Expand Down Expand Up @@ -147,6 +157,16 @@ if [ "${CDI_ENABLED}" = "true" ]; then
if [ "${DEV_ROOT_CTR_PATH}" == "" ]; then
DEV_ROOT_CTR_PATH="${DRIVER_ROOT_CTR_PATH}"
fi
if [ "${DRIVER_ROOT_CTR_PATH}" != "${DEV_ROOT_CTR_PATH}" ]; then
if [ "${DRIVER_LIBRARY_PATH}" = "" ]; then
echo "Error: missing -l <driver-library-path> flag"
usage; exit 1
fi
if [ "${NVIDIA_SMI_PATH}" = "" ]; then
echo "Error: missing -q <nvidia-smi-path> flag"
usage; exit 1
fi
fi
fi

HOST_GPU_CLIENT_SERVICES=(${HOST_GPU_CLIENT_SERVICES//,/ })
Expand Down Expand Up @@ -349,40 +369,6 @@ EOF
return 0
}

# get_driver_library_path prints the path of the first libnvidia-ml.so.1 found
# beneath /host/${DRIVER_ROOT}, trying each well-known library directory in
# the order listed. Nothing is printed when no directory contains the library.
function get_driver_library_path() {
    local search_paths=("/usr/lib64"
                        "/usr/lib/x86_64-linux-gnu"
                        "/usr/lib/aarch64-linux-gnu"
                        "/lib64"
                        "/lib/x86_64-linux-gnu"
                        "/lib/aarch64-linux-gnu"
    )
    # Keep the loop variables local so they do not leak into the caller's scope.
    local search_path
    local path
    for search_path in "${search_paths[@]}"
    do
        path=$(find "/host/${DRIVER_ROOT}${search_path}" -name "libnvidia-ml.so.1" 2>/dev/null | head -n 1)
        # Quote the operand: unquoted, an empty value collapses the test to
        # `[ ! -z ]` and a value containing whitespace splits into several words.
        if [ -n "${path}" ]; then
            echo "${path}"
            return
        fi
    done
}

# get_nvidia_smi_path prints the path of the first nvidia-smi found beneath
# /host/${DRIVER_ROOT}, trying each of the usual binary directories in the
# order listed. Nothing is printed when no directory contains the binary.
function get_nvidia_smi_path() {
    local search_paths=("/usr/bin"
                        "/usr/sbin"
                        "/bin"
                        "/sbin"
    )
    # Keep the loop variables local so they do not leak into the caller's scope.
    local search_path
    local path
    for search_path in "${search_paths[@]}"
    do
        path=$(find "/host/${DRIVER_ROOT}${search_path}" -name "nvidia-smi" 2>/dev/null | head -n 1)
        # Quote the operand: unquoted, an empty value collapses the test to
        # `[ ! -z ]` and a value containing whitespace splits into several words.
        if [ -n "${path}" ]; then
            echo "${path}"
            return
        fi
    done
}

function run_nvidia_smi() {
if [ "${DRIVER_ROOT_CTR_PATH}" = "${DEV_ROOT_CTR_PATH}" ]; then
chroot ${DRIVER_ROOT_CTR_PATH} nvidia-smi >/dev/null
Expand All @@ -392,27 +378,7 @@ function run_nvidia_smi() {
return 0
fi

# Currently, driverRoot != devRoot only when devRoot='/'.
# Since both devRoot and driverRoot are relative to hostRoot, the below
# code will execute nvidia-smi in the host's environment. This will create
# the device nodes at /host/dev
library_path=$(get_driver_library_path)
if [ -z $library_path ]; then
echo "failed to discover the path to libnvidia-ml.so.1"
return 1
fi
# Strip the leading '/host' so that the path is relative to the host rootFS
library_path=${library_path#"/host"}

nvidia_smi_path=$(get_nvidia_smi_path)
if [ -z $nvidia_smi_path ]; then
echo "failed to discover the path to nvidia-smi"
return 1
fi
# Strip the leading '/host' so that the path is relative to the host rootFS
nvidia_smi_path=${nvidia_smi_path#"/host"}

LD_PRELOAD=$library_path chroot /host $nvidia_smi_path >/dev/null 2>&1
LD_PRELOAD=$DRIVER_LIBRARY_PATH chroot $HOST_ROOT_MOUNT $NVIDIA_SMI_PATH >/dev/null 2>&1
if [ "${?}" != "0" ]; then
return 1
fi
Expand Down

0 comments on commit 138411f

Please sign in to comment.