Skip to content

Commit

Permalink
Merge pull request #16 from e0ne/fw-mismatch
Browse files Browse the repository at this point in the history
  • Loading branch information
almaslennikov authored Oct 1, 2024
2 parents c6379f4 + af67620 commit 0463dbc
Show file tree
Hide file tree
Showing 14 changed files with 227 additions and 3 deletions.
5 changes: 2 additions & 3 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ import (
"fmt"
"os"

"github.com/Mellanox/nic-configuration-operator/pkg/ncolog"
"github.com/Mellanox/nic-configuration-operator/pkg/version"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
_ "k8s.io/client-go/plugin/pkg/client/auth"
Expand All @@ -39,6 +36,8 @@ import (

configurationnetv1alpha1 "github.com/Mellanox/nic-configuration-operator/api/v1alpha1"
"github.com/Mellanox/nic-configuration-operator/internal/controller"
"github.com/Mellanox/nic-configuration-operator/pkg/ncolog"
"github.com/Mellanox/nic-configuration-operator/pkg/version"
//+kubebuilder:scaffold:imports
)

Expand Down
16 changes: 16 additions & 0 deletions cmd/nic-configuration-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ import (
maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/client-go/kubernetes"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/Mellanox/nic-configuration-operator/api/v1alpha1"
"github.com/Mellanox/nic-configuration-operator/internal/controller"
"github.com/Mellanox/nic-configuration-operator/pkg/helper"
"github.com/Mellanox/nic-configuration-operator/pkg/host"
"github.com/Mellanox/nic-configuration-operator/pkg/maintenance"
"github.com/Mellanox/nic-configuration-operator/pkg/ncolog"
Expand Down Expand Up @@ -57,6 +59,11 @@ func main() {
hostManager := host.NewHostManager(nodeName, hostUtils)
maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace)

if err := initNicFwMap(namespace); err != nil {
log.Log.Error(err, "unable to init NicFwMap")
os.Exit(1)
}

deviceDiscovery := controller.NewDeviceRegistry(mgr.GetClient(), hostManager, nodeName, namespace)
if err = mgr.Add(deviceDiscovery); err != nil {
log.Log.Error(err, "unable to add device discovery runnable")
Expand Down Expand Up @@ -92,3 +99,12 @@ func main() {
os.Exit(1)
}
}

func initNicFwMap(namespace string) error {
kubeclient := kubernetes.NewForConfigOrDie(ctrl.GetConfigOrDie())
if err := helper.InitNicFwMapFromConfigMap(kubeclient, namespace); err != nil {
return err
}

return nil
}
14 changes: 14 additions & 0 deletions config/configmap/configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Supported NIC firmware table consumed by the nic-configuration daemon.
apiVersion: v1
kind: ConfigMap
metadata:
name: supported-nic-firmware
namespace: system
# Each value is a space-separated triple:
#   "<device ID> <OFED version> <recommended firmware version>"
# Only the values are read by the daemon; the keys are descriptive labels.
data:
Nvidia_mlx5_ConnectX-4: "1013 24.07-0.6.1 12.28.2006"
Nvidia_mlx5_ConnectX-5: "1017 24.07-0.6.1 16.35.4030"
Nvidia_mlx5_ConnectX-5_Ex: "1019 24.07-0.6.1 16.35.4030"
Nvidia_mlx5_ConnectX-6: "101b 24.07-0.6.1 20.42.1000"
Nvidia_mlx5_ConnectX-6_Dx: "101d 24.07-0.6.1 22.42.1000"
Nvidia_mlx5_ConnectX-6_Lx: "101f 24.07-0.6.1 26.42.1000"
Nvidia_mlx5_ConnectX-7: "1021 24.07-0.6.1 28.42.1000"
Nvidia_mlx5_MT42822_BlueField-2_integrated_ConnectX-6_Dx: "a2d6 24.07-0.6.1 24.42.1000"
2 changes: 2 additions & 0 deletions config/configmap/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
resources:
- configmap.yaml
1 change: 1 addition & 0 deletions config/default/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ namePrefix: nic-configuration-operator-
resources:
- ../crd
- ../rbac
- ../configmap
- ../manager
- ../daemon
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Supported NIC firmware table consumed by the nic-configuration daemon.
apiVersion: v1
kind: ConfigMap
metadata:
name: supported-nic-firmware
# Each value is a space-separated triple:
#   "<device ID> <OFED version> <recommended firmware version>"
# Only the values are read by the daemon; the keys are descriptive labels.
data:
Nvidia_mlx5_ConnectX-4: "1013 24.07-0.6.1 12.28.2006"
Nvidia_mlx5_ConnectX-5: "1017 24.07-0.6.1 16.35.4030"
Nvidia_mlx5_ConnectX-5_Ex: "1019 24.07-0.6.1 16.35.4030"
Nvidia_mlx5_ConnectX-6: "101b 24.07-0.6.1 20.42.1000"
Nvidia_mlx5_ConnectX-6_Dx: "101d 24.07-0.6.1 22.42.1000"
Nvidia_mlx5_ConnectX-6_Lx: "101f 24.07-0.6.1 26.42.1000"
Nvidia_mlx5_ConnectX-7: "1021 24.07-0.6.1 28.42.1000"
Nvidia_mlx5_MT42822_BlueField-2_integrated_ConnectX-6_Dx: "a2d6 24.07-0.6.1 24.42.1000"
30 changes: 30 additions & 0 deletions internal/controller/devicediscovery_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package controller

import (
"context"
"fmt"
"reflect"
"strings"
"time"
Expand All @@ -33,6 +34,7 @@ import (

"github.com/Mellanox/nic-configuration-operator/api/v1alpha1"
"github.com/Mellanox/nic-configuration-operator/pkg/consts"
"github.com/Mellanox/nic-configuration-operator/pkg/helper"
"github.com/Mellanox/nic-configuration-operator/pkg/host"
)

Expand Down Expand Up @@ -63,6 +65,28 @@ func setInitialsConditionsForDevice(device *v1alpha1.NicDevice) {
meta.SetStatusCondition(&device.Status.Conditions, condition)
}

// setFwConfigConditionsForDevice records on the device status whether the
// firmware version reported in device.Status.FirmwareVersion equals
// recommendedFirmware.
// The firmware match condition is set to True on equality and to False
// otherwise, with a message naming both versions.
func setFwConfigConditionsForDevice(device *v1alpha1.NicDevice, recommendedFirmware string) {
	installed := device.Status.FirmwareVersion

	// Start from the mismatch outcome and overwrite it when the versions agree.
	fwCondition := metav1.Condition{
		Type:    consts.FimwareConfigMatchCondition,
		Status:  metav1.ConditionFalse,
		Reason:  consts.DeviceFwMismatchReason,
		Message: fmt.Sprintf("Device firmware '%s' doesn't match to recommended version '%s'", installed, recommendedFirmware),
	}
	if installed == recommendedFirmware {
		fwCondition.Status = metav1.ConditionTrue
		fwCondition.Reason = consts.DeviceFwMatchReason
		fwCondition.Message = fmt.Sprintf("Device firmware '%s' matches to recommended version '%s'", installed, recommendedFirmware)
	}

	meta.SetStatusCondition(&device.Status.Conditions, fwCondition)
}

// reconcile reconciles the devices on the host by comparing the observed devices with the existing NicDevice custom resources (CRs).
// It deletes CRs that do not represent observed devices, updates the CRs if the status of the device changes,
// and creates new CRs for devices that do not have a CR representation.
Expand Down Expand Up @@ -160,6 +184,12 @@ func (d *DeviceDiscovery) reconcile(ctx context.Context) error {
device.Status.Node = d.nodeName
setInitialsConditionsForDevice(device)

ofedVersion, err := d.hostManager.DiscoverOfedVersion()
if err == nil {
recommendedFirmware := helper.GetRecommendedFwVersion(device.Status.Type, ofedVersion)
setFwConfigConditionsForDevice(device, recommendedFirmware)
}

err = d.Client.Status().Update(ctx, device)
if err != nil {
log.Log.Error(err, "failed to update NicDevice CR status", "device", deviceName)
Expand Down
2 changes: 2 additions & 0 deletions internal/controller/devicediscovery_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ var _ = Describe("DeviceDiscovery", func() {
FirmwareVersion: fwVersion,
},
}, nil)
hostManager.On("DiscoverOfedVersion").Return("00.00-0.0.0", nil)

startManager()

Expand Down Expand Up @@ -220,6 +221,7 @@ var _ = Describe("DeviceDiscovery", func() {
Ports: []v1alpha1.NicDevicePortSpec{{PCI: "0000:81:00.0"}},
},
}, nil)
hostManager.On("DiscoverOfedVersion").Return("00.00-0.0.0", nil)

startManager()

Expand Down
6 changes: 6 additions & 0 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const (
Infiniband = "Infiniband"

ConfigUpdateInProgressCondition = "ConfigUpdateInProgress"
FimwareConfigMatchCondition = "FirmwareConfigMatch"
IncorrectSpecReason = "IncorrectSpec"
UpdateStartedReason = "UpdateStarted"
PendingRebootReason = "PendingReboot"
Expand All @@ -31,6 +32,8 @@ const (
SpecValidationFailed = "SpecValidationFailed"

DeviceConfigSpecEmptyReason = "DeviceConfigSpecEmpty"
DeviceFwMatchReason = "DeviceFirmwareConfigMatch"
DeviceFwMismatchReason = "DeviceFirmwareConfigMismatch"

PartNumberPrefix = "pn:"
SerialNumberPrefix = "sn:"
Expand Down Expand Up @@ -72,4 +75,7 @@ const (
MaintenanceRequestName = "nic-configuration-operator-maintenance"

HostPath = "/host"

SupportedNicFirmwareConfigmap = "supported-nic-firmware"
Mlx5ModuleVersionPath = "/sys/bus/pci/drivers/mlx5_core/module/version"
)
62 changes: 62 additions & 0 deletions pkg/helper/helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
2024 NVIDIA CORPORATION & AFFILIATES
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package helper

import (
"context"
"strings"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/Mellanox/nic-configuration-operator/pkg/consts"
)

// NicFirmwareMap holds the raw values of the supported NIC firmware ConfigMap.
// Each entry is a space-separated string in the format of:
// "<device ID> <OFED version> <firmware version>"
var NicFirmwareMap = []string{}

// InitNicFwMapFromConfigMap fetches the ConfigMap named
// consts.SupportedNicFirmwareConfigmap from the given namespace and stores its
// values in NicFirmwareMap. ConfigMap keys are ignored; only values are kept.
//
// NicFirmwareMap is replaced rather than appended to, so calling this function
// again refreshes the data instead of accumulating duplicate entries.
//
// Returns the error from the API call if the ConfigMap cannot be fetched; in
// that case NicFirmwareMap is left untouched.
func InitNicFwMapFromConfigMap(client kubernetes.Interface, namespace string) error {
	cm, err := client.CoreV1().ConfigMaps(namespace).Get(
		context.Background(),
		consts.SupportedNicFirmwareConfigmap,
		metav1.GetOptions{},
	)
	if err != nil {
		return err
	}

	// Build the new slice first and publish it in one assignment.
	entries := make([]string, 0, len(cm.Data))
	for _, v := range cm.Data {
		entries = append(entries, v)
	}
	NicFirmwareMap = entries

	return nil
}

// GetRecommendedFwVersion returns the recommended firmware version for the
// given device ID and OFED version, or an empty string when NicFirmwareMap
// contains no matching entry.
//
// Each NicFirmwareMap entry is expected to have the form
// "<device ID> <OFED version> <firmware version>", separated by whitespace.
// Malformed entries are logged and skipped so that a single bad entry cannot
// hide valid ones (the entries originate from map iteration, so their order
// is not stable).
// Surrounding whitespace in ofed is ignored, because a version read from a
// file usually carries a trailing newline.
func GetRecommendedFwVersion(deviceId, ofed string) string {
	ofed = strings.TrimSpace(ofed)

	for _, entry := range NicFirmwareMap {
		// Fields tolerates repeated spaces and tabs between the columns.
		fields := strings.Fields(entry)
		if len(fields) < 3 {
			log.Log.Info("incorrect NicFirmwareMap value", "fw", entry)
			continue
		}
		if deviceId == fields[0] && ofed == fields[1] {
			return fields[2]
		}
	}
	return ""
}
11 changes: 11 additions & 0 deletions pkg/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ type HostManager interface {
// ApplyDeviceRuntimeSpec calculates device's missing runtime spec configuration and applies it to the device on the host
// returns error - there were errors while applying nv configuration
ApplyDeviceRuntimeSpec(device *v1alpha1.NicDevice) error
// DiscoverOfedVersion retrieves installed OFED version
// returns string - installed OFED version
// returns error - OFED isn't installed or version couldn't be determined
DiscoverOfedVersion() (string, error)
}

type hostManager struct {
Expand Down Expand Up @@ -317,6 +321,13 @@ func (h hostManager) ApplyDeviceRuntimeSpec(device *v1alpha1.NicDevice) error {
return nil
}

// DiscoverOfedVersion reports the OFED version installed on the host by
// delegating to the host utils.
// returns string - the version as reported by hostUtils.GetOfedVersion
// returns error - the error reported by hostUtils.GetOfedVersion, if any
func (h hostManager) DiscoverOfedVersion() (string, error) {
	ofedVersion, err := h.hostUtils.GetOfedVersion()
	return ofedVersion, err
}

// NewHostManager builds a HostManager for the given node name that uses
// hostUtils both directly and for its config validation.
func NewHostManager(nodeName string, hostUtils HostUtils) HostManager {
	manager := hostManager{
		nodeName:         nodeName,
		hostUtils:        hostUtils,
		configValidation: newConfigValidation(hostUtils),
	}
	return manager
}
28 changes: 28 additions & 0 deletions pkg/host/mocks/HostManager.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions pkg/host/mocks/HostUtils.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions pkg/host/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ type HostUtils interface {
SetTrustAndPFC(interfaceName string, trust string, pfc string) error
// ScheduleReboot schedules reboot on the host
ScheduleReboot() error
// GetOfedVersion retrieves installed OFED version
GetOfedVersion() (string, error)
}

type hostUtils struct {
Expand Down Expand Up @@ -605,6 +607,16 @@ func (h *hostUtils) ScheduleReboot() error {
return nil
}

// GetOfedVersion retrieves installed OFED version.
// The version is read from the mlx5_core module version file located at
// consts.Mlx5ModuleVersionPath under consts.HostPath.
// returns string - file contents with trailing whitespace removed
// returns error - the version file could not be read
func (h *hostUtils) GetOfedVersion() (string, error) {
	version, err := os.ReadFile(filepath.Join(consts.HostPath, consts.Mlx5ModuleVersionPath))
	if err != nil {
		log.Log.Error(err, "GetOfedVersion(): failed to read mlx5_core version file")
		return "", err
	}

	// The version file normally ends with a newline; drop trailing whitespace
	// so callers can compare the result with plain string equality.
	end := len(version)
	for end > 0 && (version[end-1] == '\n' || version[end-1] == '\r' || version[end-1] == ' ' || version[end-1] == '\t') {
		end--
	}
	return string(version[:end]), nil
}

// NewHostUtils builds the default HostUtils implementation, wired with a
// fresh exec interface.
func NewHostUtils() HostUtils {
	utils := &hostUtils{execInterface: execUtils.New()}
	return utils
}

0 comments on commit 0463dbc

Please sign in to comment.