diff --git a/cmd/gpu-feature-discovery/main.go b/cmd/gpu-feature-discovery/main.go index 050ee7e5e..66672bd12 100644 --- a/cmd/gpu-feature-discovery/main.go +++ b/cmd/gpu-feature-discovery/main.go @@ -164,11 +164,15 @@ func start(c *cli.Context, cfg *Config) error { } klog.Info("Start running") d := &gfd{ - manager: manager, - vgpu: vgpul, - config: config, - clientsets: clientSets, - nodeconfig: cfg.nodeConfig, + manager: manager, + vgpu: vgpul, + config: config, + + labelOutputer: lm.NewOutputer( + config, + cfg.nodeConfig, + clientSets, + ), } restart, err := d.run(sigs) if err != nil { @@ -186,8 +190,7 @@ type gfd struct { vgpu vgpu.Interface config *spec.Config - clientsets flags.ClientSets - nodeconfig flags.NodeConfig + labelOutputer lm.Outputer } func (d *gfd) run(sigs chan os.Signal) (bool, error) { @@ -229,9 +232,7 @@ rerun: } klog.Info("Creating Labels") - useNodeFeatureAPI := d.config.Flags.UseNodeFeatureAPI != nil && *d.config.Flags.UseNodeFeatureAPI - err = labels.Output(*d.config.Flags.GFD.OutputFile, useNodeFeatureAPI, d.nodeconfig, d.clientsets) - if err != nil { + if err := d.labelOutputer.Output(labels); err != nil { return false, err } diff --git a/cmd/gpu-feature-discovery/main_test.go b/cmd/gpu-feature-discovery/main_test.go index e188194c1..7e67a24b5 100644 --- a/cmd/gpu-feature-discovery/main_test.go +++ b/cmd/gpu-feature-discovery/main_test.go @@ -18,6 +18,8 @@ import ( "github.com/stretchr/testify/require" spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/NVIDIA/k8s-device-plugin/internal/flags" + "github.com/NVIDIA/k8s-device-plugin/internal/lm" "github.com/NVIDIA/k8s-device-plugin/internal/resource" rt "github.com/NVIDIA/k8s-device-plugin/internal/resource/testing" "github.com/NVIDIA/k8s-device-plugin/internal/vgpu" @@ -113,9 +115,10 @@ func TestRunOneshot(t *testing.T) { defer removeMachineFile(t) d := gfd{ - manager: nvmlMock, - vgpu: vgpuMock, - config: conf, + manager: nvmlMock, + vgpu: vgpuMock, + config: conf, + labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}), } restart, err := d.run(nil) require.NoError(t, err, "Error from run function") @@ -164,9 +167,10 @@ func TestRunWithNoTimestamp(t *testing.T) { defer removeMachineFile(t) d := gfd{ - manager: nvmlMock, - vgpu: vgpuMock, - config: conf, + manager: nvmlMock, + vgpu: vgpuMock, + config: conf, + labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}), } restart, err := d.run(nil) require.NoError(t, err, "Error from run function") @@ -227,9 +231,10 @@ func TestRunSleep(t *testing.T) { var runError error go func() { d := gfd{ - manager: nvmlMock, - vgpu: vgpuMock, - config: conf, + manager: nvmlMock, + vgpu: vgpuMock, + config: conf, + labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}), } runRestart, runError = d.run(sigs) }() @@ -386,9 +391,10 @@ func TestFailOnNVMLInitError(t *testing.T) { nvmlMock := rt.NewManagerMockWithDevices(rt.NewFullGPU()).WithErrorOnInit(tc.errorOnInit) d := gfd{ - manager: resource.WithConfig(nvmlMock, conf), - vgpu: vgpuMock, - config: conf, + manager: resource.WithConfig(nvmlMock, conf), + vgpu: vgpuMock, + config: conf, + labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}), } restart, err := d.run(nil) if tc.expectError { diff --git a/cmd/gpu-feature-discovery/mig_test.go b/cmd/gpu-feature-discovery/mig_test.go index c46cf211e..0aa1ad6cc 100644 --- a/cmd/gpu-feature-discovery/mig_test.go +++ b/cmd/gpu-feature-discovery/mig_test.go @@ -11,6 +11,8 @@ import ( "github.com/stretchr/testify/require" spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/NVIDIA/k8s-device-plugin/internal/flags" + "github.com/NVIDIA/k8s-device-plugin/internal/lm" "github.com/NVIDIA/k8s-device-plugin/internal/resource" rt "github.com/NVIDIA/k8s-device-plugin/internal/resource/testing" ) @@ -46,9 +48,10 @@ func TestMigStrategyNone(t *testing.T) { defer removeMachineFile(t) d := gfd{ - manager: nvmlMock, - vgpu: vgpuMock, - config: conf, + manager: nvmlMock, + vgpu: vgpuMock, + config: conf, + labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}), } restart, err := d.run(nil) require.NoError(t, err, "Error from run function") @@ -103,9 +106,10 @@ func TestMigStrategySingleForNoMigDevices(t *testing.T) { defer removeMachineFile(t) d := gfd{ - manager: nvmlMock, - vgpu: vgpuMock, - config: conf, + manager: nvmlMock, + vgpu: vgpuMock, + config: conf, + labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}), } restart, err := d.run(nil) require.NoError(t, err, "Error from run function") @@ -167,9 +171,10 @@ func TestMigStrategySingleForMigDeviceMigDisabled(t *testing.T) { defer removeMachineFile(t) d := gfd{ - manager: nvmlMock, - vgpu: vgpuMock, - config: conf, + manager: nvmlMock, + vgpu: vgpuMock, + config: conf, + labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}), } restart, err := d.run(nil) require.NoError(t, err, "Error from run function") @@ -231,9 +236,10 @@ func TestMigStrategySingle(t *testing.T) { defer removeMachineFile(t) d := gfd{ - manager: nvmlMock, - vgpu: vgpuMock, - config: conf, + manager: nvmlMock, + vgpu: vgpuMock, + config: conf, + labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}), } restart, err := d.run(nil) require.NoError(t, err, "Error from run function") @@ -296,9 +302,10 @@ func TestMigStrategyMixed(t *testing.T) { defer removeMachineFile(t) d := gfd{ - manager: nvmlMock, - vgpu: vgpuMock, - config: conf, + manager: nvmlMock, + vgpu: vgpuMock, + config: conf, + labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}), } restart, err := d.run(nil) require.NoError(t, err, "Error from run function") diff --git a/internal/lm/labels.go b/internal/lm/labels.go index 8190df66c..8283b6a3a 100644 --- a/internal/lm/labels.go +++ b/internal/lm/labels.go @@ -16,28 +16,6 @@ package lm -import ( - "bytes" - "context" - "fmt" - "io" - "log" - "os" - "path/filepath" - "strings" - - "github.com/NVIDIA/k8s-device-plugin/internal/flags" - - nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1" - nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned" - - apiequality "k8s.io/apimachinery/pkg/api/equality" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -const nodeFeatureVendorPrefix = "nvidia-features-for" - // Labels defines a type for labels type Labels map[string]string @@ -45,136 +23,3 @@ type Labels map[string]string func (labels Labels) Labels() (Labels, error) { return labels, nil } - -// Output creates labels according to the specified output format. -func (labels Labels) Output(path string, nodeFeatureAPI bool, nodeConifg flags.NodeConfig, clientSets flags.ClientSets) error { - if nodeFeatureAPI { - log.Print("Writing labels to NodeFeature CR") - return labels.UpdateNodeFeatureObject(nodeConifg, clientSets.NFD) - } - - return labels.UpdateFile(path) -} - -// UpdateFile writes labels to the specified path. The file is written atomocally -func (labels Labels) UpdateFile(path string) error { - log.Printf("Writing labels to output file %s", path) - - if path == "" { - _, err := labels.WriteTo(os.Stdout) - return err - } - - output := new(bytes.Buffer) - if _, err := labels.WriteTo(output); err != nil { - return fmt.Errorf("error writing labels to buffer: %v", err) - } - err := writeFileAtomically(path, output.Bytes(), 0644) - if err != nil { - return fmt.Errorf("error atomically writing file '%s': %v", path, err) - } - return nil -} - -// WriteTo writes labels to the specified writer -func (labels Labels) WriteTo(output io.Writer) (int64, error) { - var total int64 - for k, v := range labels { - n, err := fmt.Fprintf(output, "%s=%s\n", k, v) - total += int64(n) - if err != nil { - return total, err - } - } - - return total, nil -} - -func writeFileAtomically(path string, contents []byte, perm os.FileMode) error { - absPath, err := filepath.Abs(path) - if err != nil { - return fmt.Errorf("failed to retrieve absolute path of output file: %v", err) - } - - absDir := filepath.Dir(absPath) - tmpDir := filepath.Join(absDir, "gfd-tmp") - - err = os.MkdirAll(tmpDir, os.ModePerm) - if err != nil && !os.IsExist(err) { - return fmt.Errorf("failed to create temporary directory: %v", err) - } - defer func() { - if err != nil { - os.RemoveAll(tmpDir) - } - }() - - tmpFile, err := os.CreateTemp(tmpDir, "gfd-") - if err != nil { - return fmt.Errorf("fail to create temporary output file: %v", err) - } - defer func() { - if err != nil { - tmpFile.Close() - os.Remove(tmpFile.Name()) - } - }() - - err = os.WriteFile(tmpFile.Name(), contents, perm) - if err != nil { - return fmt.Errorf("error writing temporary file '%v': %v", tmpFile.Name(), err) - } - - err = os.Rename(tmpFile.Name(), path) - if err != nil { - return fmt.Errorf("error moving temporary file to '%v': %v", path, err) - } - - err = os.Chmod(path, perm) - if err != nil { - return fmt.Errorf("error setting permissions on '%v': %v", path, err) - } - - return nil -} - -// UpdateNodeFeatureObject creates/updates the node-specific NodeFeature custom resource. -func (labels Labels) UpdateNodeFeatureObject(nodeConfig flags.NodeConfig, cli nfdclientset.Interface) error { - nodename := nodeConfig.Name - namespace := nodeConfig.Namespace - nodeFeatureName := strings.Join([]string{nodeFeatureVendorPrefix, nodename}, "-") - - if nfr, err := cli.NfdV1alpha1().NodeFeatures(namespace).Get(context.TODO(), nodeFeatureName, metav1.GetOptions{}); errors.IsNotFound(err) { - log.Printf("creating NodeFeature object %s", nodeFeatureName) - nfr = &nfdv1alpha1.NodeFeature{ - TypeMeta: metav1.TypeMeta{}, - ObjectMeta: metav1.ObjectMeta{Name: nodeFeatureName, Labels: map[string]string{nfdv1alpha1.NodeFeatureObjNodeNameLabel: nodename}}, - Spec: nfdv1alpha1.NodeFeatureSpec{Features: *nfdv1alpha1.NewFeatures(), Labels: labels}, - } - - nfrCreated, err := cli.NfdV1alpha1().NodeFeatures(namespace).Create(context.TODO(), nfr, metav1.CreateOptions{}) - if err != nil { - return fmt.Errorf("failed to create NodeFeature object %q: %w", nfr.Name, err) - } - - log.Printf("NodeFeature object created: %v", nfrCreated) - } else if err != nil { - return fmt.Errorf("failed to get NodeFeature object: %w", err) - } else { - nfrUpdated := nfr.DeepCopy() - nfrUpdated.Labels = map[string]string{nfdv1alpha1.NodeFeatureObjNodeNameLabel: nodename} - nfrUpdated.Spec = nfdv1alpha1.NodeFeatureSpec{Features: *nfdv1alpha1.NewFeatures(), Labels: labels} - - if !apiequality.Semantic.DeepEqual(nfr, nfrUpdated) { - log.Printf("updating NodeFeature object %s", nodeFeatureName) - nfrUpdated, err = cli.NfdV1alpha1().NodeFeatures(namespace).Update(context.TODO(), nfrUpdated, metav1.UpdateOptions{}) - if err != nil { - return fmt.Errorf("failed to update NodeFeature object %q: %w", nfr.Name, err) - } - log.Printf("NodeFeature object updated: %v", nfrUpdated) - } else { - log.Printf("no changes in NodeFeature object, not updating") - } - } - return nil -} diff --git a/internal/lm/output.go b/internal/lm/output.go new file mode 100644 index 000000000..2a3f8e291 --- /dev/null +++ b/internal/lm/output.go @@ -0,0 +1,192 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package lm + +import ( + "bytes" + "context" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + apiequality "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1" + nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned" + + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/NVIDIA/k8s-device-plugin/internal/flags" +) + +// Outputer defines a mechanism to output labels. +type Outputer interface { + Output(Labels) error +} + +// TODO: Replace this with functional options. +func NewOutputer(config *spec.Config, nodeConfig flags.NodeConfig, clientSets flags.ClientSets) Outputer { + if config.Flags.UseNodeFeatureAPI == nil || !*config.Flags.UseNodeFeatureAPI { + return ToFile(*config.Flags.GFD.OutputFile) + } + + return &nodeFeatureObject{ + nodeConfig: nodeConfig, + nfdClientset: clientSets.NFD, + } +} + +func ToFile(path string) Outputer { + if path == "" { + return &toWriter{os.Stdout} + } + + o := toFile(path) + return &o +} + +// toFile writes to the specified file. +type toFile string + +// toWriter writes to the specified writer +type toWriter struct { + io.Writer +} + +func (path *toFile) Output(labels Labels) error { + klog.Infof("Writing labels to output file %v", *path) + + buffer := new(bytes.Buffer) + output := &toWriter{buffer} + if err := output.Output(labels); err != nil { + return fmt.Errorf("error writing labels to buffer: %v", err) + } + err := writeFileAtomically(string(*path), buffer.Bytes(), 0644) + if err != nil { + return fmt.Errorf("error atomically writing file '%s': %w", *path, err) + } + return nil +} + +func (output *toWriter) Output(labels Labels) error { + for k, v := range labels { + _, err := fmt.Fprintf(output, "%s=%s\n", k, v) + if err != nil { + return err + } + } + return nil +} + +func writeFileAtomically(path string, contents []byte, perm os.FileMode) error { + absPath, err := filepath.Abs(path) + if err != nil { + return fmt.Errorf("failed to retrieve absolute path of output file: %v", err) + } + + absDir := filepath.Dir(absPath) + tmpDir := filepath.Join(absDir, "gfd-tmp") + + err = os.MkdirAll(tmpDir, os.ModePerm) + if err != nil && !os.IsExist(err) { + return fmt.Errorf("failed to create temporary directory: %v", err) + } + defer func() { + if err != nil { + os.RemoveAll(tmpDir) + } + }() + + tmpFile, err := os.CreateTemp(tmpDir, "gfd-") + if err != nil { + return fmt.Errorf("fail to create temporary output file: %v", err) + } + defer func() { + if err != nil { + tmpFile.Close() + os.Remove(tmpFile.Name()) + } + }() + + err = os.WriteFile(tmpFile.Name(), contents, perm) + if err != nil { + return fmt.Errorf("error writing temporary file '%v': %v", tmpFile.Name(), err) + } + + err = os.Rename(tmpFile.Name(), path) + if err != nil { + return fmt.Errorf("error moving temporary file to '%v': %v", path, err) + } + + err = os.Chmod(path, perm) + if err != nil { + return fmt.Errorf("error setting permissions on '%v': %v", path, err) + } + + return nil +} + +const nodeFeatureVendorPrefix = "nvidia-features-for" + +type nodeFeatureObject struct { + nodeConfig flags.NodeConfig + nfdClientset nfdclientset.Interface +} + +// UpdateNodeFeatureObject creates/updates the node-specific NodeFeature custom resource. +func (n *nodeFeatureObject) Output(labels Labels) error { + nodename := n.nodeConfig.Name + namespace := n.nodeConfig.Namespace + nodeFeatureName := strings.Join([]string{nodeFeatureVendorPrefix, nodename}, "-") + + if nfr, err := n.nfdClientset.NfdV1alpha1().NodeFeatures(namespace).Get(context.TODO(), nodeFeatureName, metav1.GetOptions{}); errors.IsNotFound(err) { + klog.Infof("creating NodeFeature object %s", nodeFeatureName) + nfr = &nfdv1alpha1.NodeFeature{ + TypeMeta: metav1.TypeMeta{}, + ObjectMeta: metav1.ObjectMeta{Name: nodeFeatureName, Labels: map[string]string{nfdv1alpha1.NodeFeatureObjNodeNameLabel: nodename}}, + Spec: nfdv1alpha1.NodeFeatureSpec{Features: *nfdv1alpha1.NewFeatures(), Labels: labels}, + } + + nfrCreated, err := n.nfdClientset.NfdV1alpha1().NodeFeatures(namespace).Create(context.TODO(), nfr, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create NodeFeature object %q: %w", nfr.Name, err) + } + + klog.Infof("NodeFeature object created: %v", nfrCreated) + } else if err != nil { + return fmt.Errorf("failed to get NodeFeature object: %w", err) + } else { + nfrUpdated := nfr.DeepCopy() + nfrUpdated.Labels = map[string]string{nfdv1alpha1.NodeFeatureObjNodeNameLabel: nodename} + nfrUpdated.Spec = nfdv1alpha1.NodeFeatureSpec{Features: *nfdv1alpha1.NewFeatures(), Labels: labels} + + if !apiequality.Semantic.DeepEqual(nfr, nfrUpdated) { + klog.Infof("updating NodeFeature object %s", nodeFeatureName) + nfrUpdated, err = n.nfdClientset.NfdV1alpha1().NodeFeatures(namespace).Update(context.TODO(), nfrUpdated, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update NodeFeature object %q: %w", nfr.Name, err) + } + klog.Infof("NodeFeature object updated: %v", nfrUpdated) + } else { + klog.Infof("no changes in NodeFeature object, not updating") + } + } + return nil +}