From b6a7c645ee60eb16e4e8fdfdb0867a4a8b00de08 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 22 Oct 2024 22:22:36 +0200 Subject: [PATCH] Search driver root for imex configs This change also searches the configured driver-root in the container for the specified IMEX configs. This allows for better integration with the GPU Operator where these paths are set. Note that this moves the GFD-specific config option to a more general IMEX section. Signed-off-by: Evan Lezar --- api/config/v1/config.go | 1 + api/config/v1/flags.go | 6 ----- api/config/v1/flags_test.go | 6 ++--- api/config/v1/imex.go | 4 ++++ cmd/gpu-feature-discovery/main.go | 8 ++++++- internal/lm/fabric.go | 37 ++++++++++++++++++++++++++----- 6 files changed, 46 insertions(+), 16 deletions(-) diff --git a/api/config/v1/config.go b/api/config/v1/config.go index c102c4239..0af2c8125 100644 --- a/api/config/v1/config.go +++ b/api/config/v1/config.go @@ -61,6 +61,7 @@ func NewConfig(c *cli.Context, flags []cli.Flag) (*Config, error) { if c.IsSet("imex-required") { config.Imex.Required = c.Bool("imex-required") } + updateFromCLIFlag(&config.Imex.NodesConfigFile, c, "imex-nodes-config-file") // If nvidiaDevRoot (the path to the device nodes on the host) is not set, // we default to using the driver root on the host. diff --git a/api/config/v1/flags.go b/api/config/v1/flags.go index 1e0eb4b00..2720379e2 100644 --- a/api/config/v1/flags.go +++ b/api/config/v1/flags.go @@ -108,10 +108,6 @@ type GFDCommandLineFlags struct { SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"` OutputFile *string `json:"outputFile" yaml:"outputFile"` MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"` - // ImexNodesConfigFile is the path to a file containing the IP addresses of nodes - // that are part of the IMEX domain. - // Note that this is the absolute path to the file in the device plugin container. - ImexNodesConfigFile *string `json:"imexNodesConfigFile" yaml:"imexNodesConfigFile"` } // UpdateFromCLIFlags updates Flags from settings in the cli Flags if they are set. @@ -166,8 +162,6 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) { updateFromCLIFlag(&f.GFD.Oneshot, c, n) case "output-file": updateFromCLIFlag(&f.GFD.OutputFile, c, n) - case "imex-nodes-config-file": - updateFromCLIFlag(&f.GFD.ImexNodesConfigFile, c, n) case "sleep-interval": updateFromCLIFlag(&f.GFD.SleepInterval, c, n) case "no-timestamp": diff --git a/api/config/v1/flags_test.go b/api/config/v1/flags_test.go index bbbf3772e..3d8720380 100644 --- a/api/config/v1/flags_test.go +++ b/api/config/v1/flags_test.go @@ -186,8 +186,7 @@ func TestMarshalFlags(t *testing.T) { "noTimestamp": null, "outputFile": null, "sleepInterval": "0s", - "machineTypeFile": null, - "imexNodesConfigFile": null + "machineTypeFile": null } }`, }, @@ -211,8 +210,7 @@ func TestMarshalFlags(t *testing.T) { "noTimestamp": null, "outputFile": null, "sleepInterval": "5ns", - "machineTypeFile": null, - "imexNodesConfigFile": null + "machineTypeFile": null } }`, }, diff --git a/api/config/v1/imex.go b/api/config/v1/imex.go index 928e13e85..9a98b1f86 100644 --- a/api/config/v1/imex.go +++ b/api/config/v1/imex.go @@ -39,6 +39,10 @@ type Imex struct { // If it is not required its injection is skipped if the device nodes do not exist or if its // existence cannot be queried. Required bool `json:"required,omitempty" yaml:"required,omitempty"` + // NodesConfigFile defines the location to the IMEX nodes config file. + // Such a nodes config file contains the IP addresses of nodes that are part of the IMEX domain. + // Note that this is the absolute path to the file in the device plugin container. + NodesConfigFile *string `json:"nodesConfigFile,omitempty" yaml:"nodesConfigFile,omitempty"` } // AssertChannelIDsIsValid checks whether the specified list of channel IDs is valid. diff --git a/cmd/gpu-feature-discovery/main.go b/cmd/gpu-feature-discovery/main.go index 2a35e243b..ec242019a 100644 --- a/cmd/gpu-feature-discovery/main.go +++ b/cmd/gpu-feature-discovery/main.go @@ -115,6 +115,13 @@ func main() { Usage: "the strategy to use to discover devices: 'auto', 'nvml', 'tegra' or 'vfio'", EnvVars: []string{"DEVICE_DISCOVERY_STRATEGY"}, }, + &cli.StringFlag{ + Name: "driver-root-ctr-path", + Aliases: []string{"container-driver-root"}, + Value: spec.DefaultContainerDriverRoot, + Usage: "the path where the NVIDIA driver root is mounted in the container", + EnvVars: []string{"DRIVER_ROOT_CTR_PATH", "CONTAINER_DRIVER_ROOT"}, + }, } config.flags = append(config.flags, config.kubeClientConfig.Flags()...) @@ -150,7 +157,6 @@ func (cfg *Config) loadConfig(c *cli.Context) (*spec.Config, error) { if err != nil { return nil, fmt.Errorf("unable to validate flags: %v", err) } - config.Flags.Plugin = nil return config, nil } diff --git a/internal/lm/fabric.go b/internal/lm/fabric.go index f84d2a3ed..166603a54 100644 --- a/internal/lm/fabric.go +++ b/internal/lm/fabric.go @@ -18,10 +18,12 @@ package lm import ( "bufio" + "errors" "fmt" "io" "net" "os" + "path/filepath" "sort" "strings" @@ -33,15 +35,40 @@ import ( ) func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, error) { - if config.Flags.GFD.ImexNodesConfigFile == nil || *config.Flags.GFD.ImexNodesConfigFile == "" { + if config.Imex.NodesConfigFile == nil || *config.Imex.NodesConfigFile == "" { // No imex config file, return empty labels return empty{}, nil } - imexConfigFile, err := os.Open(*config.Flags.GFD.ImexNodesConfigFile) + nodesConfigFiles := []string{*config.Imex.NodesConfigFile} + if root := config.Flags.Plugin.ContainerDriverRoot; root != nil && *root != "" { + nodesConfigFiles = append(nodesConfigFiles, filepath.Join(*root, *config.Imex.NodesConfigFile)) + } + + var errs error + for _, configFilePath := range nodesConfigFiles { + imexLabeler, err := imexLabelerForConfigFile(configFilePath, devices) + if err != nil { + errs = errors.Join(errs, err) + continue + } + if imexLabeler != nil { + klog.Infof("Using labeler for IMEX config %v", configFilePath) + return imexLabeler, nil + } + } + if errs != nil { + return nil, errs + } + + return empty{}, nil +} + +func imexLabelerForConfigFile(configFilePath string, devices []resource.Device) (Labeler, error) { + imexConfigFile, err := os.Open(configFilePath) if os.IsNotExist(err) { // No imex config file, return empty labels - return empty{}, nil + return nil, nil } else if err != nil { return nil, fmt.Errorf("failed to open imex config file: %v", err) } @@ -52,7 +79,7 @@ func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, er return nil, err } if clusterUUID == "" || cliqueID == "" { - return empty{}, nil + return nil, nil } imexDomainID, err := getImexDomainID(imexConfigFile) @@ -60,7 +87,7 @@ func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, er return nil, err } if imexDomainID == "" { - return empty{}, nil + return nil, nil } labels := Labels{