Skip to content

Commit

Permalink
Merge pull request #1009 from elezar/imex-config-file
Browse files Browse the repository at this point in the history
Search driver root for IMEX node configs
  • Loading branch information
cdesiniotis authored Oct 22, 2024
2 parents 41cb251 + b6a7c64 commit 620aaed
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 16 deletions.
1 change: 1 addition & 0 deletions api/config/v1/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ func NewConfig(c *cli.Context, flags []cli.Flag) (*Config, error) {
if c.IsSet("imex-required") {
config.Imex.Required = c.Bool("imex-required")
}
updateFromCLIFlag(&config.Imex.NodesConfigFile, c, "imex-nodes-config-file")

// If nvidiaDevRoot (the path to the device nodes on the host) is not set,
// we default to using the driver root on the host.
Expand Down
6 changes: 0 additions & 6 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,6 @@ type GFDCommandLineFlags struct {
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
OutputFile *string `json:"outputFile" yaml:"outputFile"`
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
// ImexNodesConfigFile is the path to a file containing the IP addresses of nodes
// that are part of the IMEX domain.
// Note that this is the absolute path to the file in the device plugin container.
ImexNodesConfigFile *string `json:"imexNodesConfigFile" yaml:"imexNodesConfigFile"`
}

// UpdateFromCLIFlags updates Flags from settings in the cli Flags if they are set.
Expand Down Expand Up @@ -166,8 +162,6 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
case "output-file":
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
case "imex-nodes-config-file":
updateFromCLIFlag(&f.GFD.ImexNodesConfigFile, c, n)
case "sleep-interval":
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
case "no-timestamp":
Expand Down
6 changes: 2 additions & 4 deletions api/config/v1/flags_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,7 @@ func TestMarshalFlags(t *testing.T) {
"noTimestamp": null,
"outputFile": null,
"sleepInterval": "0s",
"machineTypeFile": null,
"imexNodesConfigFile": null
"machineTypeFile": null
}
}`,
},
Expand All @@ -211,8 +210,7 @@ func TestMarshalFlags(t *testing.T) {
"noTimestamp": null,
"outputFile": null,
"sleepInterval": "5ns",
"machineTypeFile": null,
"imexNodesConfigFile": null
"machineTypeFile": null
}
}`,
},
Expand Down
4 changes: 4 additions & 0 deletions api/config/v1/imex.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ type Imex struct {
// If it is not required its injection is skipped if the device nodes do not exist or if its
// existence cannot be queried.
Required bool `json:"required,omitempty" yaml:"required,omitempty"`
// NodesConfigFile defines the location of the IMEX nodes config file.
// Such a nodes config file contains the IP addresses of nodes that are part of the IMEX domain.
// Note that this is the absolute path to the file in the device plugin container.
NodesConfigFile *string `json:"nodesConfigFile,omitempty" yaml:"nodesConfigFile,omitempty"`
}

// AssertChannelIDsIsValid checks whether the specified list of channel IDs is valid.
Expand Down
8 changes: 7 additions & 1 deletion cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,13 @@ func main() {
Usage: "the strategy to use to discover devices: 'auto', 'nvml', 'tegra' or 'vfio'",
EnvVars: []string{"DEVICE_DISCOVERY_STRATEGY"},
},
&cli.StringFlag{
Name: "driver-root-ctr-path",
Aliases: []string{"container-driver-root"},
Value: spec.DefaultContainerDriverRoot,
Usage: "the path where the NVIDIA driver root is mounted in the container",
EnvVars: []string{"DRIVER_ROOT_CTR_PATH", "CONTAINER_DRIVER_ROOT"},
},
}

config.flags = append(config.flags, config.kubeClientConfig.Flags()...)
Expand Down Expand Up @@ -150,7 +157,6 @@ func (cfg *Config) loadConfig(c *cli.Context) (*spec.Config, error) {
if err != nil {
return nil, fmt.Errorf("unable to validate flags: %v", err)
}
config.Flags.Plugin = nil

return config, nil
}
Expand Down
37 changes: 32 additions & 5 deletions internal/lm/fabric.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ package lm

import (
"bufio"
"errors"
"fmt"
"io"
"net"
"os"
"path/filepath"
"sort"
"strings"

Expand All @@ -33,15 +35,40 @@ import (
)

// newImexLabeler returns a Labeler derived from the IMEX nodes config file
// named by config.Imex.NodesConfigFile.
//
// When no nodes config file is configured (nil or empty), an empty labeler is
// returned. Otherwise a list of candidate paths is built: the configured path
// as given and, when config.Flags.Plugin.ContainerDriverRoot is set and
// non-empty, the same path joined under that root. The candidates are tried in
// order and the first one for which imexLabelerForConfigFile returns a non-nil
// labeler is used.
//
// If no candidate yields a labeler, the errors collected from the failed
// candidates are returned (joined); if there were no errors either, an empty
// labeler is returned.
//
// NOTE(review): this listing appears to be a rendered diff in which removed and
// added lines are interleaved; the lines that reference
// config.Flags.GFD.ImexNodesConfigFile look like the pre-change versions —
// confirm against the repository before relying on them.
func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, error) {
if config.Flags.GFD.ImexNodesConfigFile == nil || *config.Flags.GFD.ImexNodesConfigFile == "" {
if config.Imex.NodesConfigFile == nil || *config.Imex.NodesConfigFile == "" {
// No imex config file, return empty labels
return empty{}, nil
}

imexConfigFile, err := os.Open(*config.Flags.GFD.ImexNodesConfigFile)
// Candidate locations, in priority order: the configured path first, then the
// same path under the container driver root (if one is set).
nodesConfigFiles := []string{*config.Imex.NodesConfigFile}
// NOTE(review): config.Flags.Plugin is accessed without a nil check here —
// presumably callers guarantee it is populated; confirm.
if root := config.Flags.Plugin.ContainerDriverRoot; root != nil && *root != "" {
nodesConfigFiles = append(nodesConfigFiles, filepath.Join(*root, *config.Imex.NodesConfigFile))
}

// errs accumulates failures across candidates so that one bad candidate does
// not prevent a later one from being tried.
var errs error
for _, configFilePath := range nodesConfigFiles {
imexLabeler, err := imexLabelerForConfigFile(configFilePath, devices)
if err != nil {
// Remember the failure and move on to the next candidate.
errs = errors.Join(errs, err)
continue
}
// A nil labeler with a nil error means this candidate produced nothing
// usable; fall through to the next candidate.
if imexLabeler != nil {
klog.Infof("Using labeler for IMEX config %v", configFilePath)
return imexLabeler, nil
}
}
// Errors are only surfaced when no candidate produced a labeler.
if errs != nil {
return nil, errs
}

return empty{}, nil
}

func imexLabelerForConfigFile(configFilePath string, devices []resource.Device) (Labeler, error) {
imexConfigFile, err := os.Open(configFilePath)
if os.IsNotExist(err) {
// No imex config file, return empty labels
return empty{}, nil
return nil, nil
} else if err != nil {
return nil, fmt.Errorf("failed to open imex config file: %v", err)
}
Expand All @@ -52,15 +79,15 @@ func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, er
return nil, err
}
if clusterUUID == "" || cliqueID == "" {
return empty{}, nil
return nil, nil
}

imexDomainID, err := getImexDomainID(imexConfigFile)
if err != nil {
return nil, err
}
if imexDomainID == "" {
return empty{}, nil
return nil, nil
}

labels := Labels{
Expand Down

0 comments on commit 620aaed

Please sign in to comment.