From af676207dccd7fc5c1255f8d8b2a234c4bf2b7e8 Mon Sep 17 00:00:00 2001 From: Ivan Kolodiazhnyi Date: Sun, 22 Sep 2024 22:19:11 +0300 Subject: [PATCH] feat: Implement firmware version check --- cmd/manager/main.go | 5 +- cmd/nic-configuration-daemon/main.go | 16 +++++ config/configmap/configmap.yaml | 14 +++++ config/configmap/kustomization.yaml | 2 + config/default/kustomization.yaml | 1 + .../supported-nic-firmware-configmap.yaml | 13 ++++ .../controller/devicediscovery_controller.go | 30 +++++++++ .../devicediscovery_controller_test.go | 2 + pkg/consts/consts.go | 6 ++ pkg/helper/helper.go | 62 +++++++++++++++++++ pkg/host/host.go | 11 ++++ pkg/host/mocks/HostManager.go | 28 +++++++++ pkg/host/mocks/HostUtils.go | 28 +++++++++ pkg/host/utils.go | 12 ++++ 14 files changed, 227 insertions(+), 3 deletions(-) create mode 100644 config/configmap/configmap.yaml create mode 100644 config/configmap/kustomization.yaml create mode 100644 deployment/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml create mode 100644 pkg/helper/helper.go diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 2c2503c..04c6d96 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -22,9 +22,6 @@ import ( "fmt" "os" - "github.com/Mellanox/nic-configuration-operator/pkg/ncolog" - "github.com/Mellanox/nic-configuration-operator/pkg/version" - // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. _ "k8s.io/client-go/plugin/pkg/client/auth" @@ -39,6 +36,8 @@ import ( configurationnetv1alpha1 "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" "github.com/Mellanox/nic-configuration-operator/internal/controller" + "github.com/Mellanox/nic-configuration-operator/pkg/ncolog" + "github.com/Mellanox/nic-configuration-operator/pkg/version" //+kubebuilder:scaffold:imports ) diff --git a/cmd/nic-configuration-daemon/main.go b/cmd/nic-configuration-daemon/main.go index c6d426f..061ccf8 100644 --- a/cmd/nic-configuration-daemon/main.go +++ b/cmd/nic-configuration-daemon/main.go @@ -7,6 +7,7 @@ import ( maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/kubernetes" clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -14,6 +15,7 @@ import ( "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" "github.com/Mellanox/nic-configuration-operator/internal/controller" + "github.com/Mellanox/nic-configuration-operator/pkg/helper" "github.com/Mellanox/nic-configuration-operator/pkg/host" "github.com/Mellanox/nic-configuration-operator/pkg/maintenance" "github.com/Mellanox/nic-configuration-operator/pkg/ncolog" @@ -57,6 +59,11 @@ func main() { hostManager := host.NewHostManager(nodeName, hostUtils) maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace) + if err := initNicFwMap(namespace); err != nil { + log.Log.Error(err, "unable to init NicFwMap") + os.Exit(1) + } + deviceDiscovery := controller.NewDeviceRegistry(mgr.GetClient(), hostManager, nodeName, namespace) if err = mgr.Add(deviceDiscovery); err != nil { log.Log.Error(err, "unable to add device discovery runnable") @@ -92,3 +99,12 @@ func main() { os.Exit(1) } } + +func initNicFwMap(namespace string) error { + kubeclient := kubernetes.NewForConfigOrDie(ctrl.GetConfigOrDie()) + if err := helper.InitNicFwMapFromConfigMap(kubeclient, namespace); err != nil { + return err + } + + return nil +} diff --git a/config/configmap/configmap.yaml b/config/configmap/configmap.yaml new file mode 100644 index 0000000..a03335c --- /dev/null +++ b/config/configmap/configmap.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: supported-nic-firmware + namespace: system +data: + Nvidia_mlx5_ConnectX-4: "1013 24.07-0.6.1 12.28.2006" + Nvidia_mlx5_ConnectX-5: "1017 24.07-0.6.1 16.35.4030" + Nvidia_mlx5_ConnectX-5_Ex: "1019 24.07-0.6.1 16.35.4030" + Nvidia_mlx5_ConnectX-6: "101b 24.07-0.6.1 20.42.1000" + Nvidia_mlx5_ConnectX-6_Dx: "101d 24.07-0.6.1 22.42.1000" + Nvidia_mlx5_ConnectX-6_Lx: "101f 24.07-0.6.1 26.42.1000" + Nvidia_mlx5_ConnectX-7: "1021 24.07-0.6.1 28.42.1000" + Nvidia_mlx5_MT42822_BlueField-2_integrated_ConnectX-6_Dx: "a2d6 24.07-0.6.1 24.42.1000" diff --git a/config/configmap/kustomization.yaml b/config/configmap/kustomization.yaml new file mode 100644 index 0000000..5b0c161 --- /dev/null +++ b/config/configmap/kustomization.yaml @@ -0,0 +1,2 @@ +resources: + - configmap.yaml diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index 979a5cf..9b8d314 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -17,6 +17,7 @@ namePrefix: nic-configuration-operator- resources: - ../crd - ../rbac +- ../configmap - ../manager - ../daemon # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in diff --git a/deployment/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml b/deployment/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml new file mode 100644 index 0000000..84f9a33 --- /dev/null +++ b/deployment/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: supported-nic-firmware +data: + Nvidia_mlx5_ConnectX-4: "1013 24.07-0.6.1 12.28.2006" + Nvidia_mlx5_ConnectX-5: "1017 24.07-0.6.1 16.35.4030" + Nvidia_mlx5_ConnectX-5_Ex: "1019 24.07-0.6.1 16.35.4030" + Nvidia_mlx5_ConnectX-6: "101b 24.07-0.6.1 20.42.1000" + Nvidia_mlx5_ConnectX-6_Dx: "101d 24.07-0.6.1 22.42.1000" + Nvidia_mlx5_ConnectX-6_Lx: "101f 24.07-0.6.1 26.42.1000" + Nvidia_mlx5_ConnectX-7: "1021 24.07-0.6.1 28.42.1000" + Nvidia_mlx5_MT42822_BlueField-2_integrated_ConnectX-6_Dx: "a2d6 24.07-0.6.1 24.42.1000" diff --git a/internal/controller/devicediscovery_controller.go b/internal/controller/devicediscovery_controller.go index 8a81915..caa01f5 100644 --- a/internal/controller/devicediscovery_controller.go +++ b/internal/controller/devicediscovery_controller.go @@ -17,6 +17,7 @@ package controller import ( "context" + "fmt" "reflect" "strings" "time" @@ -33,6 +34,7 @@ import ( "github.com/Mellanox/nic-configuration-operator/api/v1alpha1" "github.com/Mellanox/nic-configuration-operator/pkg/consts" + "github.com/Mellanox/nic-configuration-operator/pkg/helper" "github.com/Mellanox/nic-configuration-operator/pkg/host" ) @@ -63,6 +65,28 @@ func setInitialsConditionsForDevice(device *v1alpha1.NicDevice) { meta.SetStatusCondition(&device.Status.Conditions, condition) } +func setFwConfigConditionsForDevice(device *v1alpha1.NicDevice, recommendedFirmware string) { + currentFirmware := device.Status.FirmwareVersion + + if currentFirmware == recommendedFirmware { + condition := metav1.Condition{ + Type: consts.FimwareConfigMatchCondition, + Status: metav1.ConditionTrue, + Reason: consts.DeviceFwMatchReason, + Message: fmt.Sprintf("Device firmware '%s' matches to recommended version '%s'", currentFirmware, recommendedFirmware), + } + meta.SetStatusCondition(&device.Status.Conditions, condition) + } else { + condition := metav1.Condition{ + Type: consts.FimwareConfigMatchCondition, + Status: metav1.ConditionFalse, + Reason: consts.DeviceFwMismatchReason, + Message: fmt.Sprintf("Device firmware '%s' doesn't match to recommended version '%s'", currentFirmware, recommendedFirmware), + } + meta.SetStatusCondition(&device.Status.Conditions, condition) + } +} + // reconcile reconciles the devices on the host by comparing the observed devices with the existing NicDevice custom resources (CRs). // It deletes CRs that do not represent observed devices, updates the CRs if the status of the device changes, // and creates new CRs for devices that do not have a CR representation. @@ -160,6 +184,12 @@ func (d *DeviceDiscovery) reconcile(ctx context.Context) error { device.Status.Node = d.nodeName setInitialsConditionsForDevice(device) + ofedVersion, err := d.hostManager.DiscoverOfedVersion() + if err == nil { + recommendedFirmware := helper.GetRecommendedFwVersion(device.Status.Type, ofedVersion) + setFwConfigConditionsForDevice(device, recommendedFirmware) + } + err = d.Client.Status().Update(ctx, device) if err != nil { log.Log.Error(err, "failed to update NicDevice CR status", "device", deviceName) diff --git a/internal/controller/devicediscovery_controller_test.go b/internal/controller/devicediscovery_controller_test.go index 1d28fe6..8635192 100644 --- a/internal/controller/devicediscovery_controller_test.go +++ b/internal/controller/devicediscovery_controller_test.go @@ -165,6 +165,7 @@ var _ = Describe("DeviceDiscovery", func() { FirmwareVersion: fwVersion, }, }, nil) + hostManager.On("DiscoverOfedVersion").Return("00.00-0.0.0", nil) startManager() @@ -220,6 +221,7 @@ var _ = Describe("DeviceDiscovery", func() { Ports: []v1alpha1.NicDevicePortSpec{{PCI: "0000:81:00.0"}}, }, }, nil) + hostManager.On("DiscoverOfedVersion").Return("00.00-0.0.0", nil) startManager() diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 1d35862..e176daf 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -22,6 +22,7 @@ const ( Infiniband = "Infiniband" ConfigUpdateInProgressCondition = "ConfigUpdateInProgress" + FimwareConfigMatchCondition = "FirmwareConfigMatch" IncorrectSpecReason = "IncorrectSpec" UpdateStartedReason = "UpdateStarted" PendingRebootReason = "PendingReboot" @@ -31,6 +32,8 @@ const ( SpecValidationFailed = "SpecValidationFailed" DeviceConfigSpecEmptyReason = "DeviceConfigSpecEmpty" + DeviceFwMatchReason = "DeviceFirmwareConfigMatch" + DeviceFwMismatchReason = "DeviceFirmwareConfigMismatch" PartNumberPrefix = "pn:" SerialNumberPrefix = "sn:" @@ -72,4 +75,7 @@ const ( MaintenanceRequestName = "nic-configuration-operator-maintenance" HostPath = "/host" + + SupportedNicFirmwareConfigmap = "supported-nic-firmware" + Mlx5ModuleVersionPath = "/sys/bus/pci/drivers/mlx5_core/module/version" ) diff --git a/pkg/helper/helper.go b/pkg/helper/helper.go new file mode 100644 index 0000000..dca798b --- /dev/null +++ b/pkg/helper/helper.go @@ -0,0 +1,62 @@ +/* +2024 NVIDIA CORPORATION & AFFILIATES +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package helper + +import ( + "context" + "strings" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/log" + + "github.com/Mellanox/nic-configuration-operator/pkg/consts" +) + +// NicFirmwareMap contains supported mapping of NIC firmware with each in the format of: +// NIC ID, Firmware version +var NicFirmwareMap = []string{} + +func InitNicFwMapFromConfigMap(client kubernetes.Interface, namespace string) error { + cm, err := client.CoreV1().ConfigMaps(namespace).Get( + context.Background(), + consts.SupportedNicFirmwareConfigmap, + metav1.GetOptions{}, + ) + // if the configmap does not exist, return false + if err != nil { + return err + } + for _, v := range cm.Data { + NicFirmwareMap = append(NicFirmwareMap, v) + } + + return nil +} + +func GetRecommendedFwVersion(deviceId, ofed string) string { + for _, n := range NicFirmwareMap { + fw := strings.Split(n, " ") + if len(fw) < 3 { + log.Log.Info("incorrect NicFirmwareMap value", "fw", fw) + return "" + } + if deviceId == fw[0] && ofed == fw[1] { + return fw[2] + } + } + return "" +} diff --git a/pkg/host/host.go b/pkg/host/host.go index 648ce1e..28ec623 100644 --- a/pkg/host/host.go +++ b/pkg/host/host.go @@ -43,6 +43,10 @@ type HostManager interface { // ApplyDeviceRuntimeSpec calculates device's missing runtime spec configuration and applies it to the device on the host // returns error - there were errors while applying nv configuration ApplyDeviceRuntimeSpec(device *v1alpha1.NicDevice) error + // DiscoverOfedVersion retrieves installed OFED version + // returns string - installed OFED version + // returns error - OFED isn't installed or version couldn't be determined + DiscoverOfedVersion() (string, error) } type hostManager struct { @@ -317,6 +321,13 @@ func (h hostManager) ApplyDeviceRuntimeSpec(device *v1alpha1.NicDevice) error { return nil } +// DiscoverOfedVersion retrieves installed OFED version +// returns string - installed OFED version +// returns error - OFED isn't installed or version couldn't be determined +func (h hostManager) DiscoverOfedVersion() (string, error) { + return h.hostUtils.GetOfedVersion() +} + func NewHostManager(nodeName string, hostUtils HostUtils) HostManager { return hostManager{nodeName: nodeName, hostUtils: hostUtils, configValidation: newConfigValidation(hostUtils)} } diff --git a/pkg/host/mocks/HostManager.go b/pkg/host/mocks/HostManager.go index 1f1a909..eaa5409 100644 --- a/pkg/host/mocks/HostManager.go +++ b/pkg/host/mocks/HostManager.go @@ -91,6 +91,34 @@ func (_m *HostManager) DiscoverNicDevices() (map[string]v1alpha1.NicDeviceStatus return r0, r1 } +// DiscoverOfedVersion provides a mock function with given fields: +func (_m *HostManager) DiscoverOfedVersion() (string, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for DiscoverOfedVersion") + } + + var r0 string + var r1 error + if rf, ok := ret.Get(0).(func() (string, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // ValidateDeviceNvSpec provides a mock function with given fields: ctx, device func (_m *HostManager) ValidateDeviceNvSpec(ctx context.Context, device *v1alpha1.NicDevice) (bool, bool, error) { ret := _m.Called(ctx, device) diff --git a/pkg/host/mocks/HostUtils.go b/pkg/host/mocks/HostUtils.go index 8d12541..323ab45 100644 --- a/pkg/host/mocks/HostUtils.go +++ b/pkg/host/mocks/HostUtils.go @@ -116,6 +116,34 @@ func (_m *HostUtils) GetMaxReadRequestSize(pciAddr string) (int, error) { return r0, r1 } +// GetOfedVersion provides a mock function with given fields: +func (_m *HostUtils) GetOfedVersion() (string, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetOfedVersion") + } + + var r0 string + var r1 error + if rf, ok := ret.Get(0).(func() (string, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // GetPCIDevices provides a mock function with given fields: func (_m *HostUtils) GetPCIDevices() ([]*pci.Device, error) { ret := _m.Called() diff --git a/pkg/host/utils.go b/pkg/host/utils.go index 887e72d..23190f2 100644 --- a/pkg/host/utils.go +++ b/pkg/host/utils.go @@ -78,6 +78,8 @@ type HostUtils interface { SetTrustAndPFC(interfaceName string, trust string, pfc string) error // ScheduleReboot schedules reboot on the host ScheduleReboot() error + // GetOfedVersion retrieves installed OFED version + GetOfedVersion() (string, error) } type hostUtils struct { @@ -605,6 +607,16 @@ func (h *hostUtils) ScheduleReboot() error { return nil } +// GetOfedVersion retrieves installed OFED version +func (h *hostUtils) GetOfedVersion() (string, error) { + version, err := os.ReadFile(filepath.Join(consts.HostPath, consts.Mlx5ModuleVersionPath)) + if err != nil { + log.Log.Error(err, "GetOfedVersion(): failed to read mlx5_core version file") + return "", err + } + return string(version), nil +} + func NewHostUtils() HostUtils { return &hostUtils{execInterface: execUtils.New()} }