Skip to content

Commit

Permalink
Replace k8s-client.go with client sets
Browse files Browse the repository at this point in the history
Signed-off-by: Evan Lezar <[email protected]>
  • Loading branch information
elezar committed Feb 29, 2024
1 parent 0727338 commit 4fa5123
Show file tree
Hide file tree
Showing 8 changed files with 285 additions and 108 deletions.
80 changes: 59 additions & 21 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"k8s.io/klog/v2"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/flags"
"github.com/NVIDIA/k8s-device-plugin/internal/info"
"github.com/NVIDIA/k8s-device-plugin/internal/lm"
"github.com/NVIDIA/k8s-device-plugin/internal/logger"
Expand All @@ -22,18 +23,29 @@ import (
"github.com/NVIDIA/k8s-device-plugin/internal/watch"
)

// Config represents a collection of config options for GFD.
type Config struct {
configFile string

kubeClientConfig flags.KubeClientConfig
nodeConfig flags.NodeConfig

// flags stores the CLI flags for later processing.
flags []cli.Flag
}

func main() {
var configFile string
config := &Config{}

c := cli.NewApp()
c.Name = "GPU Feature Discovery"
c.Usage = "generate labels for NVIDIA devices"
c.Version = info.GetVersionString()
c.Action = func(ctx *cli.Context) error {
return start(ctx, c.Flags)
return start(ctx, config)
}

c.Flags = []cli.Flag{
config.flags = []cli.Flag{
&cli.StringFlag{
Name: "mig-strategy",
Value: spec.MigStrategyNone,
Expand Down Expand Up @@ -79,7 +91,7 @@ func main() {
&cli.StringFlag{
Name: "config-file",
Usage: "the path to a config file as an alternative to command line options or environment variables",
Destination: &configFile,
Destination: &config.configFile,
EnvVars: []string{"GFD_CONFIG_FILE", "CONFIG_FILE"},
},
&cli.BoolFlag{
Expand All @@ -89,6 +101,11 @@ func main() {
},
}

config.flags = append(config.flags, config.kubeClientConfig.Flags()...)
config.flags = append(config.flags, config.nodeConfig.Flags()...)

c.Flags = config.flags

if err := c.Run(os.Args); err != nil {
klog.Error(err)
os.Exit(1)
Expand All @@ -99,8 +116,9 @@ func validateFlags(config *spec.Config) error {
return nil
}

func loadConfig(c *cli.Context, flags []cli.Flag) (*spec.Config, error) {
config, err := spec.NewConfig(c, flags)
// loadConfig loads the config from the spec file.
func (cfg *Config) loadConfig(c *cli.Context) (*spec.Config, error) {
config, err := spec.NewConfig(c, cfg.flags)
if err != nil {
return nil, fmt.Errorf("unable to finalize config: %v", err)
}
Expand All @@ -113,7 +131,7 @@ func loadConfig(c *cli.Context, flags []cli.Flag) (*spec.Config, error) {
return config, nil
}

func start(c *cli.Context, flags []cli.Flag) error {
func start(c *cli.Context, cfg *Config) error {
defer func() {
klog.Info("Exiting")
}()
Expand All @@ -124,7 +142,7 @@ func start(c *cli.Context, flags []cli.Flag) error {
for {
// Load the configuration file
klog.Info("Loading configuration.")
config, err := loadConfig(c, flags)
config, err := cfg.loadConfig(c)
if err != nil {
return fmt.Errorf("unable to load config: %v", err)
}
Expand All @@ -140,8 +158,19 @@ func start(c *cli.Context, flags []cli.Flag) error {
manager := resource.NewManager(config)
vgpul := vgpu.NewVGPULib(vgpu.NewNvidiaPCILib())

clientSets, err := cfg.kubeClientConfig.NewClientSets()
if err != nil {
return fmt.Errorf("failed to create clientsets: %w", err)
}
klog.Info("Start running")
restart, err := run(manager, vgpul, config, sigs)
d := &gfd{
manager: manager,
vgpu: vgpul,
config: config,
clientsets: clientSets,
nodeconfig: cfg.nodeConfig,
}
restart, err := d.run(sigs)
if err != nil {
return err
}
Expand All @@ -152,26 +181,35 @@ func start(c *cli.Context, flags []cli.Flag) error {
}
}

func run(manager resource.Manager, vgpu vgpu.Interface, config *spec.Config, sigs chan os.Signal) (bool, error) {
type gfd struct {
manager resource.Manager
vgpu vgpu.Interface
config *spec.Config

clientsets flags.ClientSets
nodeconfig flags.NodeConfig
}

func (d *gfd) run(sigs chan os.Signal) (bool, error) {
defer func() {
if config.Flags.UseNodeFeatureAPI != nil && *config.Flags.UseNodeFeatureAPI {
if d.config.Flags.UseNodeFeatureAPI != nil && *d.config.Flags.UseNodeFeatureAPI {
return
}
if config.Flags.GFD.Oneshot != nil && *config.Flags.GFD.Oneshot {
if d.config.Flags.GFD.Oneshot != nil && *d.config.Flags.GFD.Oneshot {
return
}
if config.Flags.GFD.OutputFile != nil && *config.Flags.GFD.OutputFile == "" {
if d.config.Flags.GFD.OutputFile != nil && *d.config.Flags.GFD.OutputFile == "" {
return
}
err := removeOutputFile(*config.Flags.GFD.OutputFile)
err := removeOutputFile(*d.config.Flags.GFD.OutputFile)
if err != nil {
klog.Warningf("Error removing output file: %v", err)
}
}()

timestampLabeler := lm.NewTimestampLabeler(config)
timestampLabeler := lm.NewTimestampLabeler(d.config)
rerun:
loopLabelers, err := lm.NewLabelers(manager, vgpu, config)
loopLabelers, err := lm.NewLabelers(d.manager, d.vgpu, d.config)
if err != nil {
return false, err
}
Expand All @@ -191,18 +229,18 @@ rerun:
}

klog.Info("Creating Labels")
useNodeFeatureAPI := config.Flags.UseNodeFeatureAPI != nil && *config.Flags.UseNodeFeatureAPI
err = labels.Output(*config.Flags.GFD.OutputFile, useNodeFeatureAPI)
useNodeFeatureAPI := d.config.Flags.UseNodeFeatureAPI != nil && *d.config.Flags.UseNodeFeatureAPI
err = labels.Output(*d.config.Flags.GFD.OutputFile, useNodeFeatureAPI, d.nodeconfig, d.clientsets)
if err != nil {
return false, err
}

if *config.Flags.GFD.Oneshot {
if *d.config.Flags.GFD.Oneshot {
return false, nil
}

klog.Info("Sleeping for ", *config.Flags.GFD.SleepInterval)
rerunTimeout := time.After(time.Duration(*config.Flags.GFD.SleepInterval))
klog.Info("Sleeping for ", *d.config.Flags.GFD.SleepInterval)
rerunTimeout := time.After(time.Duration(*d.config.Flags.GFD.SleepInterval))

for {
select {
Expand Down
28 changes: 24 additions & 4 deletions cmd/gpu-feature-discovery/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,12 @@ func TestRunOneshot(t *testing.T) {
setupMachineFile(t)
defer removeMachineFile(t)

restart, err := run(nvmlMock, vgpuMock, conf, nil)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
require.False(t, restart)

Expand Down Expand Up @@ -158,7 +163,12 @@ func TestRunWithNoTimestamp(t *testing.T) {
setupMachineFile(t)
defer removeMachineFile(t)

restart, err := run(nvmlMock, vgpuMock, conf, nil)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
require.False(t, restart)

Expand Down Expand Up @@ -216,7 +226,12 @@ func TestRunSleep(t *testing.T) {
var runRestart bool
var runError error
go func() {
runRestart, runError = run(nvmlMock, vgpuMock, conf, sigs)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
}
runRestart, runError = d.run(sigs)
}()

outFileModificationTime := make([]int64, 2)
Expand Down Expand Up @@ -370,7 +385,12 @@ func TestFailOnNVMLInitError(t *testing.T) {

nvmlMock := rt.NewManagerMockWithDevices(rt.NewFullGPU()).WithErrorOnInit(tc.errorOnInit)

restart, err := run(resource.WithConfig(nvmlMock, conf), vgpuMock, conf, nil)
d := gfd{
manager: resource.WithConfig(nvmlMock, conf),
vgpu: vgpuMock,
config: conf,
}
restart, err := d.run(nil)
if tc.expectError {
require.Error(t, err)
} else {
Expand Down
35 changes: 30 additions & 5 deletions cmd/gpu-feature-discovery/mig_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,12 @@ func TestMigStrategyNone(t *testing.T) {
setupMachineFile(t)
defer removeMachineFile(t)

restart, err := run(nvmlMock, vgpuMock, conf, nil)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
require.False(t, restart)

Expand Down Expand Up @@ -97,7 +102,12 @@ func TestMigStrategySingleForNoMigDevices(t *testing.T) {
setupMachineFile(t)
defer removeMachineFile(t)

restart, err := run(nvmlMock, vgpuMock, conf, nil)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
require.False(t, restart)

Expand Down Expand Up @@ -156,7 +166,12 @@ func TestMigStrategySingleForMigDeviceMigDisabled(t *testing.T) {
setupMachineFile(t)
defer removeMachineFile(t)

restart, err := run(nvmlMock, vgpuMock, conf, nil)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
require.False(t, restart)

Expand Down Expand Up @@ -215,7 +230,12 @@ func TestMigStrategySingle(t *testing.T) {
setupMachineFile(t)
defer removeMachineFile(t)

restart, err := run(nvmlMock, vgpuMock, conf, nil)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
require.False(t, restart)

Expand Down Expand Up @@ -275,7 +295,12 @@ func TestMigStrategyMixed(t *testing.T) {
setupMachineFile(t)
defer removeMachineFile(t)

restart, err := run(nvmlMock, vgpuMock, conf, nil)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
require.False(t, restart)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,11 @@ spec:
- name: NODE_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: spec.nodeName
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
{{- if typeIs "string" .Values.migStrategy }}
- name: MIG_STRATEGY
value: "{{ .Values.migStrategy }}"
Expand Down
Loading

0 comments on commit 4fa5123

Please sign in to comment.