Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,9 @@ type DriverSpec struct {
// +kubebuilder:default=true
Enable *bool `json:"enable,omitempty"`

// blacklist amdgpu drivers on the host
// blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes.
// Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
// Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BlacklistDrivers",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:blacklistDrivers"}
Blacklist *bool `json:"blacklist,omitempty"`

Expand All @@ -117,7 +119,7 @@ type DriverSpec struct {
// example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"}
// +optional
// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
Image string `json:"image,omitempty"`

// driver image registry TLS setting for the container image
Expand Down Expand Up @@ -251,12 +253,11 @@ type DevicePluginSpec struct {
// +optional
DevicePluginTolerations []v1.Toleration `json:"devicePluginTolerations,omitempty"`

// resource naming strategy for device plugin
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ResourceNamingStrategy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy"}
// +kubebuilder:validation:Enum=single;mixed
// +kubebuilder:default:="single"
// device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
// supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="DevicePluginArguments",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments"}
// +optional
ResourceNamingStrategy string `json:"resourceNamingStrategy,omitempty"`
DevicePluginArguments map[string]string `json:"devicePluginArguments,omitempty"`

// node labeller image
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeLabellerImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerImage"}
Expand Down
7 changes: 7 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 50 additions & 14 deletions bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,30 @@ metadata:
}
}
]
capabilities: Basic Install
createdAt: "2025-03-20T06:06:57Z"
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/gpu-operator:v1.2.0
createdAt: "2025-04-02T23:22:18Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest
features.operators.openshift.io/disconnected: "true"
features.operators.openshift.io/fips-compliant: "false"
features.operators.openshift.io/proxy-aware: "true"
features.operators.openshift.io/tls-profiles: "false"
features.operators.openshift.io/token-auth-aws: "false"
features.operators.openshift.io/token-auth-azure: "false"
features.operators.openshift.io/token-auth-gcp: "false"
metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0
nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest
operatorframework.io/cluster-monitoring: "true"
operatorframework.io/suggested-namespace: openshift-amd-gpu
operators.openshift.io/valid-subscription: '[]'
operators.operatorframework.io/builder: operator-sdk-v1.32.0
operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
repository: https://github.com/ROCm/gpu-operator
support: Advanced Micro Devices, Inc.
name: amd-gpu-operator.v1.2.0
namespace: placeholder
spec:
Expand Down Expand Up @@ -152,6 +170,13 @@ spec:
path: devicePlugin
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin
- description: 'device plugin arguments is used to pass supported flags and
their values while starting device plugin daemonset supported flag values:
{"resource_naming_strategy": {"single", "mixed"}}'
displayName: DevicePluginArguments
path: devicePlugin.devicePluginArguments
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments
- description: device plugin image
displayName: DevicePluginImage
path: devicePlugin.devicePluginImage
Expand Down Expand Up @@ -192,11 +217,6 @@ spec:
path: devicePlugin.nodeLabellerTolerations
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations
- description: resource naming strategy for device plugin
displayName: ResourceNamingStrategy
path: devicePlugin.resourceNamingStrategy
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy
- description: upgrade policy for device plugin and node labeller daemons
displayName: UpgradePolicy
path: devicePlugin.upgradePolicy
Expand Down Expand Up @@ -227,7 +247,10 @@ spec:
path: driver.amdgpuInstallerRepoURL
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL
- description: blacklist amdgpu drivers on the host
- description: blacklist amdgpu drivers on the host. Node reboot is required
to apply the blacklist on the worker nodes. Not working for OpenShift cluster.
OpenShift users please use the Machine Config Operator (MCO) resource to
configure amdgpu blacklist. Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
displayName: BlacklistDrivers
path: driver.blacklist
x-descriptors:
Expand Down Expand Up @@ -606,7 +629,7 @@ spec:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus
version: v1alpha1
description: |-
Operator responsible for deploying AMD GPU kernel drivers and device plugin
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
displayName: amd-gpu-operator
icon:
Expand Down Expand Up @@ -1110,11 +1133,24 @@ spec:
- supported: true
type: AllNamespaces
keywords:
- amd-gpu-operator
- AMD
- GPU
- AI
- Deep Learning
- Hardware
- Driver
- Monitoring
links:
- name: Amd Gpu Operator
url: https://amd-gpu-operator.domain
maturity: alpha
- name: AMD GPU Operator
url: https://github.com/ROCm/gpu-operator
maintainers:
- email: [email protected]
name: Yan Sun
- email: [email protected]
name: Farshad Ghodsian
- email: [email protected]
name: Shrey Ajmera
maturity: stable
provider:
name: amd-gpu-operator
name: Advanced Micro Devices, Inc.
version: 1.2.0
21 changes: 12 additions & 9 deletions bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,13 @@ spec:
devicePlugin:
description: device plugin
properties:
devicePluginArguments:
additionalProperties:
type: string
description: |-
device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
type: object
devicePluginImage:
description: device plugin image
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
Expand Down Expand Up @@ -306,13 +313,6 @@ spec:
type: string
type: object
type: array
resourceNamingStrategy:
default: single
description: resource naming strategy for device plugin
enum:
- single
- mixed
type: string
upgradePolicy:
description: upgrade policy for device plugin and node labeller
daemons
Expand Down Expand Up @@ -342,7 +342,10 @@ spec:
installer URL is https://repo.radeon.com/amdgpu-install by default
type: string
blacklist:
description: blacklist amdgpu drivers on the host
description: |-
blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes.
Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
type: boolean
enable:
default: true
Expand All @@ -357,7 +360,7 @@ spec:
for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imageRegistrySecret:
description: secrets used for pull/push images from/to private
Expand Down
12 changes: 7 additions & 5 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@ package main
import (
"flag"

"github.com/ROCm/gpu-operator/internal/configmanager"
"github.com/ROCm/gpu-operator/internal/metricsexporter"
"github.com/ROCm/gpu-operator/internal/testrunner"
kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
Expand All @@ -51,11 +48,15 @@ import (
_ "k8s.io/client-go/plugin/pkg/client/auth"

gpuev1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1"
utils "github.com/ROCm/gpu-operator/internal"
"github.com/ROCm/gpu-operator/internal/cmd"
"github.com/ROCm/gpu-operator/internal/config"
"github.com/ROCm/gpu-operator/internal/configmanager"
"github.com/ROCm/gpu-operator/internal/controllers"
"github.com/ROCm/gpu-operator/internal/kmmmodule"
"github.com/ROCm/gpu-operator/internal/metricsexporter"
"github.com/ROCm/gpu-operator/internal/nodelabeller"
"github.com/ROCm/gpu-operator/internal/testrunner"
//+kubebuilder:scaffold:imports
)

Expand Down Expand Up @@ -107,8 +108,9 @@ func main() {
}

client := mgr.GetClient()
kmmHandler := kmmmodule.NewKMMModule(client, scheme)
nlHandler := nodelabeller.NewNodeLabeller(scheme)
isOpenShift := utils.IsOpenShift(setupLogger)
kmmHandler := kmmmodule.NewKMMModule(client, scheme, isOpenShift)
nlHandler := nodelabeller.NewNodeLabeller(scheme, isOpenShift)
metricsHandler := metricsexporter.NewMetricsExporter(scheme)
testrunnerHandler := testrunner.NewTestRunner(scheme)
configmanagerHandler := configmanager.NewConfigManager(scheme)
Expand Down
21 changes: 12 additions & 9 deletions config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,13 @@ spec:
devicePlugin:
description: device plugin
properties:
devicePluginArguments:
additionalProperties:
type: string
description: |-
device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
type: object
devicePluginImage:
description: device plugin image
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
Expand Down Expand Up @@ -302,13 +309,6 @@ spec:
type: string
type: object
type: array
resourceNamingStrategy:
default: single
description: resource naming strategy for device plugin
enum:
- single
- mixed
type: string
upgradePolicy:
description: upgrade policy for device plugin and node labeller
daemons
Expand Down Expand Up @@ -338,7 +338,10 @@ spec:
installer URL is https://repo.radeon.com/amdgpu-install by default
type: string
blacklist:
description: blacklist amdgpu drivers on the host
description: |-
blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes.
Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
type: boolean
enable:
default: true
Expand All @@ -353,7 +356,7 @@ spec:
for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imageRegistrySecret:
description: secrets used for pull/push images from/to private
Expand Down
Loading