state_manager.go
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package controllers
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/go-logr/logr"
apiconfigv1 "github.com/openshift/api/config/v1"
configv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
"golang.org/x/mod/semver"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/discovery"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
)
const (
commonGPULabelKey = "nvidia.com/gpu.present"
commonGPULabelValue = "true"
commonOperandsLabelKey = "nvidia.com/gpu.deploy.operands"
commonOperandsLabelValue = "true"
migManagerLabelKey = "nvidia.com/gpu.deploy.mig-manager"
migManagerLabelValue = "true"
migCapableLabelKey = "nvidia.com/mig.capable"
migCapableLabelValue = "true"
migConfigLabelKey = "nvidia.com/mig.config"
migConfigDisabledValue = "all-disabled"
vgpuHostDriverLabelKey = "nvidia.com/vgpu.host-driver-version"
gpuProductLabelKey = "nvidia.com/gpu.product"
nfdLabelPrefix = "feature.node.kubernetes.io/"
nfdKernelLabelKey = "feature.node.kubernetes.io/kernel-version.full"
nfdOSTreeVersionLabelKey = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION"
nfdOSReleaseIDLabelKey = "feature.node.kubernetes.io/system-os_release.ID"
nfdOSVersionIDLabelKey = "feature.node.kubernetes.io/system-os_release.VERSION_ID"
ocpDriverToolkitVersionLabel = "openshift.driver-toolkit.rhcos"
ocpDriverToolkitIdentificationLabel = "openshift.driver-toolkit"
appLabelKey = "app"
ocpDriverToolkitIdentificationValue = "true"
ocpNamespaceMonitoringLabelKey = "openshift.io/cluster-monitoring"
ocpNamespaceMonitoringLabelValue = "true"
precompiledIdentificationLabelKey = "nvidia.com/precompiled"
precompiledIdentificationLabelValue = "true"
// see bundle/manifests/gpu-operator.clusterserviceversion.yaml
// --> ClusterServiceVersion.metadata.annotations.operatorframework.io/suggested-namespace
ocpSuggestedNamespace = "nvidia-gpu-operator"
gpuWorkloadConfigLabelKey = "nvidia.com/gpu.workload.config"
gpuWorkloadConfigContainer = "container"
gpuWorkloadConfigVMPassthrough = "vm-passthrough"
gpuWorkloadConfigVMVgpu = "vm-vgpu"
podSecurityLabelPrefix = "pod-security.kubernetes.io/"
podSecurityLevelPrivileged = "privileged"
driverAutoUpgradeAnnotationKey = "nvidia.com/gpu-driver-upgrade-enabled"
commonDriverDaemonsetName = "nvidia-driver-daemonset"
commonVGPUManagerDaemonsetName = "nvidia-vgpu-manager-daemonset"
)
var (
defaultGPUWorkloadConfig = gpuWorkloadConfigContainer
podSecurityModes = []string{"enforce", "audit", "warn"}
)
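// gpuStateLabels maps each supported GPU workload configuration to the set of
// "nvidia.com/gpu.deploy.*" node labels that enable its operands.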
var gpuStateLabels = map[string]map[string]string{
gpuWorkloadConfigContainer: {
"nvidia.com/gpu.deploy.driver": "true",
"nvidia.com/gpu.deploy.gpu-feature-discovery": "true",
"nvidia.com/gpu.deploy.container-toolkit": "true",
"nvidia.com/gpu.deploy.device-plugin": "true",
"nvidia.com/gpu.deploy.dcgm": "true",
"nvidia.com/gpu.deploy.dcgm-exporter": "true",
"nvidia.com/gpu.deploy.node-status-exporter": "true",
"nvidia.com/gpu.deploy.operator-validator": "true",
},
gpuWorkloadConfigVMPassthrough: {
"nvidia.com/gpu.deploy.sandbox-device-plugin": "true",
"nvidia.com/gpu.deploy.sandbox-validator": "true",
"nvidia.com/gpu.deploy.vfio-manager": "true",
"nvidia.com/gpu.deploy.kata-manager": "true",
"nvidia.com/gpu.deploy.cc-manager": "true",
},
gpuWorkloadConfigVMVgpu: {
"nvidia.com/gpu.deploy.sandbox-device-plugin": "true",
"nvidia.com/gpu.deploy.vgpu-manager": "true",
"nvidia.com/gpu.deploy.vgpu-device-manager": "true",
"nvidia.com/gpu.deploy.sandbox-validator": "true",
"nvidia.com/gpu.deploy.cc-manager": "true",
},
}
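// gpuNodeLabels are the NFD PCI labels that identify a node with an NVIDIA GPU
// (PCI vendor ID 10de, VGA and 3D controller device classes).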
var gpuNodeLabels = map[string]string{
"feature.node.kubernetes.io/pci-10de.present": "true",
"feature.node.kubernetes.io/pci-0302_10de.present": "true",
"feature.node.kubernetes.io/pci-0300_10de.present": "true",
}
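// gpuWorkloadConfiguration associates a GPU workload config with the node it
// applies to and a logger used when updating that node's state labels.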
type gpuWorkloadConfiguration struct {
config string
node string
log logr.Logger
}
// OpenShiftDriverToolkit contains the values required to deploy
// the OpenShift DriverToolkit DaemonSet.
type OpenShiftDriverToolkit struct {
// true if the cluster runs OpenShift and
// Operator.UseOpenShiftDriverToolkit is turned on in the
// ClusterPolicy
requested bool
// true if the DriverToolkit is requested and the cluster has all
// the required components (NFD RHCOS OSTree label + OCP
// DriverToolkit imagestream)
enabled bool
currentRhcosVersion string
rhcosVersions map[string]bool
rhcosDriverToolkitImages map[string]string
}
// ClusterPolicyController represents clusterpolicy controller spec for GPU operator
type ClusterPolicyController struct {
client client.Client
ctx context.Context
singleton *gpuv1.ClusterPolicy
logger logr.Logger
scheme *runtime.Scheme
operatorNamespace string
resources []Resources
controls []controlFunc
stateNames []string
operatorMetrics *OperatorMetrics
idx int
kernelVersionMap map[string]string
currentKernelVersion string
k8sVersion string
openshift string
ocpDriverToolkit OpenShiftDriverToolkit
runtime gpuv1.Runtime
hasGPUNodes bool
hasNFDLabels bool
sandboxEnabled bool
}
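// addState loads the manifests at the given path and appends the resulting
// resources, control functions, and state name to the controller.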
func addState(n *ClusterPolicyController, path string) {
// TODO check for path
res, ctrl := addResourcesControls(n, path)
n.controls = append(n.controls, ctrl)
n.resources = append(n.resources, res)
n.stateNames = append(n.stateNames, filepath.Base(path))
}
// OpenshiftVersion fetches OCP version
func OpenshiftVersion(ctx context.Context) (string, error) {
cfg := config.GetConfigOrDie()
client, err := configv1.NewForConfig(cfg)
if err != nil {
return "", err
}
v, err := client.ClusterVersions().Get(ctx, "version", metav1.GetOptions{})
if err != nil {
return "", err
}
for _, condition := range v.Status.History {
if condition.State != "Completed" {
continue
}
ocpV := strings.Split(condition.Version, ".")
if len(ocpV) > 1 {
return ocpV[0] + "." + ocpV[1], nil
}
return ocpV[0], nil
}
return "", fmt.Errorf("Failed to find Completed Cluster Version")
}
// KubernetesVersion fetches the Kubernetes API server version
func KubernetesVersion() (string, error) {
cfg := config.GetConfigOrDie()
discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg)
if err != nil {
return "", fmt.Errorf("error building discovery client: %v", err)
}
info, err := discoveryClient.ServerVersion()
if err != nil {
return "", fmt.Errorf("unable to fetch server version information: %v", err)
}
return info.GitVersion, nil
}
// GetClusterWideProxy returns cluster wide proxy object setup in OCP
func GetClusterWideProxy(ctx context.Context) (*apiconfigv1.Proxy, error) {
cfg := config.GetConfigOrDie()
client, err := configv1.NewForConfig(cfg)
if err != nil {
return nil, err
}
proxy, err := client.Proxies().Get(ctx, "cluster", metav1.GetOptions{})
if err != nil {
return nil, err
}
return proxy, nil
}
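// hasMIGConfigLabel returns true if a non-empty MIG config label is present in the node labels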
func hasMIGConfigLabel(labels map[string]string) bool {
if _, ok := labels[migConfigLabelKey]; ok {
if labels[migConfigLabelKey] != "" {
return true
}
}
return false
}
// hasCommonGPULabel returns true if common Nvidia GPU label exists among provided node labels
func hasCommonGPULabel(labels map[string]string) bool {
if _, ok := labels[commonGPULabelKey]; ok {
if labels[commonGPULabelKey] == commonGPULabelValue {
// node is already labelled with common label
return true
}
}
return false
}
// hasGPULabels returns true if node labels contain Nvidia GPU labels
func hasGPULabels(labels map[string]string) bool {
for key, val := range labels {
if _, ok := gpuNodeLabels[key]; ok {
if gpuNodeLabels[key] == val {
return true
}
}
}
return false
}
// hasNFDLabels returns true if node labels contain NFD labels
func hasNFDLabels(labels map[string]string) bool {
for key := range labels {
if strings.HasPrefix(key, nfdLabelPrefix) {
return true
}
}
return false
}
// hasMIGCapableGPU returns true if this node has GPU capable of MIG partitioning.
func hasMIGCapableGPU(labels map[string]string) bool {
if value, exists := labels[vgpuHostDriverLabelKey]; exists && value != "" {
// vGPU node
return false
}
if value, exists := labels[migCapableLabelKey]; exists {
return value == migCapableLabelValue
}
// check product label if mig.capable label does not exist
if value, exists := labels[gpuProductLabelKey]; exists {
if strings.Contains(strings.ToLower(value), "h100") ||
strings.Contains(strings.ToLower(value), "a100") ||
strings.Contains(strings.ToLower(value), "a30") {
return true
}
}
return false
}
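// hasMIGManagerLabel returns true if the mig-manager deploy label is present in the node labels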
func hasMIGManagerLabel(labels map[string]string) bool {
for key := range labels {
if key == migManagerLabelKey {
return true
}
}
return false
}
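// hasOperandsDisabled returns true if operands have been explicitly disabled on the node
// via the "nvidia.com/gpu.deploy.operands=false" label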
func hasOperandsDisabled(labels map[string]string) bool {
if value, ok := labels[commonOperandsLabelKey]; ok {
if value == "false" {
return true
}
}
return false
}
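// isValidWorkloadConfig returns true if the given string is a supported GPU workload
// configuration (container, vm-passthrough, or vm-vgpu)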
func isValidWorkloadConfig(workloadConfig string) bool {
_, ok := gpuStateLabels[workloadConfig]
return ok
}
// getWorkloadConfig returns the GPU workload configured for the node.
// If an error occurs when searching for the workload config,
// return defaultGPUWorkloadConfig.
func getWorkloadConfig(labels map[string]string, sandboxEnabled bool) (string, error) {
if !sandboxEnabled {
return gpuWorkloadConfigContainer, nil
}
if workloadConfig, ok := labels[gpuWorkloadConfigLabelKey]; ok {
if isValidWorkloadConfig(workloadConfig) {
return workloadConfig, nil
}
return defaultGPUWorkloadConfig, fmt.Errorf("Invalid GPU workload config: %v", workloadConfig)
}
return defaultGPUWorkloadConfig, fmt.Errorf("No GPU workload config found")
}
// removeAllGPUStateLabels removes all gpuStateLabels from the provided map of node labels.
// removeAllGPUStateLabels returns true if the labels map has been modified.
func removeAllGPUStateLabels(labels map[string]string) bool {
modified := false
for _, labelsMap := range gpuStateLabels {
for key := range labelsMap {
if _, ok := labels[key]; ok {
delete(labels, key)
modified = true
}
}
}
if _, ok := labels[migManagerLabelKey]; ok {
delete(labels, migManagerLabelKey)
modified = true
}
return modified
}
// updateGPUStateLabels applies the correct GPU state labels for the GPU workload configuration.
// updateGPUStateLabels returns true if the input labels map is modified.
func (w *gpuWorkloadConfiguration) updateGPUStateLabels(labels map[string]string) bool {
if hasOperandsDisabled(labels) {
// Operands are disabled, delete all GPU state labels
w.log.Info("Operands are disabled for node", "NodeName", w.node, "Label", commonOperandsLabelKey, "Value", "false")
w.log.Info("Disabling all operands for node", "NodeName", w.node)
return removeAllGPUStateLabels(labels)
}
removed := w.removeGPUStateLabels(labels)
added := w.addGPUStateLabels(labels)
return removed || added
}
// addGPUStateLabels adds GPU state labels needed for the GPU workload configuration.
// If a required state label already exists on the node, honor the current value.
func (w *gpuWorkloadConfiguration) addGPUStateLabels(labels map[string]string) bool {
modified := false
for key, value := range gpuStateLabels[w.config] {
if _, ok := labels[key]; !ok {
w.log.Info("Setting node label", "NodeName", w.node, "Label", key, "Value", value)
labels[key] = value
modified = true
}
}
if w.config == gpuWorkloadConfigContainer && hasMIGCapableGPU(labels) && !hasMIGManagerLabel(labels) {
w.log.Info("Setting node label", "NodeName", w.node, "Label", migManagerLabelKey, "Value", migManagerLabelValue)
labels[migManagerLabelKey] = migManagerLabelValue
modified = true
}
return modified
}
// removeGPUStateLabels removes GPU state labels not needed for the GPU workload configuration
func (w *gpuWorkloadConfiguration) removeGPUStateLabels(labels map[string]string) bool {
modified := false
for workloadConfig, labelsMap := range gpuStateLabels {
if workloadConfig == w.config {
continue
}
for key := range labelsMap {
if _, ok := gpuStateLabels[w.config][key]; ok {
// skip label if it is in the set of states for workloadConfig
continue
}
if _, ok := labels[key]; ok {
w.log.Info("Deleting node label", "NodeName", w.node, "Label", key)
delete(labels, key)
modified = true
}
}
}
if w.config != gpuWorkloadConfigContainer {
if _, ok := labels[migManagerLabelKey]; ok {
w.log.Info("Deleting node label", "NodeName", w.node, "Label", migManagerLabelKey)
delete(labels, migManagerLabelKey)
modified = true
}
}
return modified
}
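// applyDriverAutoUpgradeAnnotation annotates all GPU nodes with the driver auto-upgrade
// annotation when automatic driver upgrades are enabled in the ClusterPolicy, and removes
// the annotation otherwise.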
func (n *ClusterPolicyController) applyDriverAutoUpgradeAnnotation() error {
// fetch all nodes
opts := []client.ListOption{}
list := &corev1.NodeList{}
err := n.client.List(n.ctx, list, opts...)
if err != nil {
return fmt.Errorf("Unable to list nodes to check annotations, err %s", err.Error())
}
for _, node := range list.Items {
node := node
labels := node.GetLabels()
if !hasCommonGPULabel(labels) {
// not a gpu node
continue
}
// set node annotation for driver auto-upgrade
updateRequired := false
value := "true"
annotationValue, annotationExists := node.ObjectMeta.Annotations[driverAutoUpgradeAnnotationKey]
if n.singleton.Spec.Driver.UpgradePolicy != nil &&
n.singleton.Spec.Driver.UpgradePolicy.AutoUpgrade &&
!n.sandboxEnabled {
// check if we need to add the annotation
if !annotationExists {
updateRequired = true
} else if annotationValue != "true" {
updateRequired = true
}
} else {
// check if we need to remove the annotation
if annotationExists {
updateRequired = true
}
value = "null"
}
if !updateRequired {
continue
}
// update annotation
node.ObjectMeta.Annotations[driverAutoUpgradeAnnotationKey] = value
if value == "null" {
// remove annotation if value is null
delete(node.ObjectMeta.Annotations, driverAutoUpgradeAnnotationKey)
}
err := n.client.Update(n.ctx, &node)
if err != nil {
n.logger.Info("Failed to update node state annotation on a node",
"node", node.Name,
"annotationKey", driverAutoUpgradeAnnotationKey,
"annotationValue", value, "error", err)
return err
}
}
return nil
}
// labelGPUNodes labels nodes that have GPUs with the NVIDIA common label.
// It returns clusterHasNFDLabels (bool), gpuNodesTotal (int), and an error.
func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) {
ctx := n.ctx
// fetch all nodes
opts := []client.ListOption{}
list := &corev1.NodeList{}
err := n.client.List(ctx, list, opts...)
if err != nil {
return false, 0, fmt.Errorf("Unable to list nodes to check labels, err %s", err.Error())
}
clusterHasNFDLabels := false
updateLabels := false
gpuNodesTotal := 0
for _, node := range list.Items {
node := node
// get node labels
labels := node.GetLabels()
if !clusterHasNFDLabels {
clusterHasNFDLabels = hasNFDLabels(labels)
}
config, err := getWorkloadConfig(labels, n.sandboxEnabled)
if err != nil {
n.logger.Info("WARNING: failed to get GPU workload config for node; using default",
"NodeName", node.ObjectMeta.Name, "SandboxEnabled", n.sandboxEnabled,
"Error", err, "defaultGPUWorkloadConfig", defaultGPUWorkloadConfig)
}
n.logger.Info("GPU workload configuration", "NodeName", node.ObjectMeta.Name, "GpuWorkloadConfig", config)
gpuWorkloadConfig := &gpuWorkloadConfiguration{config, node.ObjectMeta.Name, n.logger}
if !hasCommonGPULabel(labels) && hasGPULabels(labels) {
n.logger.Info("Node has GPU(s)", "NodeName", node.ObjectMeta.Name)
// label the node with common Nvidia GPU label
n.logger.Info("Setting node label", "NodeName", node.ObjectMeta.Name, "Label", commonGPULabelKey, "Value", commonGPULabelValue)
labels[commonGPULabelKey] = commonGPULabelValue
// update node labels
node.SetLabels(labels)
updateLabels = true
} else if hasCommonGPULabel(labels) && !hasGPULabels(labels) {
// previously labelled node that no longer has GPUs
// label node to reset common Nvidia GPU label
n.logger.Info("Node no longer has GPUs", "NodeName", node.ObjectMeta.Name)
n.logger.Info("Setting node label", "Label", commonGPULabelKey, "Value", "false")
labels[commonGPULabelKey] = "false"
n.logger.Info("Disabling all operands for node", "NodeName", node.ObjectMeta.Name)
removeAllGPUStateLabels(labels)
// update node labels
node.SetLabels(labels)
updateLabels = true
}
if hasCommonGPULabel(labels) {
// If node has GPU, then add state labels as per the workload type
n.logger.Info("Checking GPU state labels on the node", "NodeName", node.ObjectMeta.Name)
if gpuWorkloadConfig.updateGPUStateLabels(labels) {
n.logger.Info("Applying correct GPU state labels to the node", "NodeName", node.ObjectMeta.Name)
node.SetLabels(labels)
updateLabels = true
}
// Disable MIG on the node explicitly where no MIG config is specified
if n.singleton.Spec.MIGManager.IsEnabled() && hasMIGCapableGPU(labels) && !hasMIGConfigLabel(labels) {
if n.singleton.Spec.MIGManager.Config != nil && n.singleton.Spec.MIGManager.Config.Default == migConfigDisabledValue {
n.logger.Info("Setting MIG config label", "NodeName", node.ObjectMeta.Name, "Label", migConfigLabelKey, "Value", migConfigDisabledValue)
labels[migConfigLabelKey] = migConfigDisabledValue
node.SetLabels(labels)
updateLabels = true
}
}
// increment GPU node count
gpuNodesTotal++
// add GPU node CoreOS version for OCP
if n.ocpDriverToolkit.requested {
rhcosVersion, ok := labels[nfdOSTreeVersionLabelKey]
if ok {
n.ocpDriverToolkit.rhcosVersions[rhcosVersion] = true
n.logger.V(1).Info("GPU node running RHCOS",
"nodeName", node.ObjectMeta.Name,
"RHCOS version", rhcosVersion,
)
} else {
n.logger.Info("node doesn't have the proper NFD RHCOS version label.",
"nodeName", node.ObjectMeta.Name,
"nfdLabel", nfdOSTreeVersionLabelKey,
)
}
}
}
// update node with the latest labels
if updateLabels {
err = n.client.Update(ctx, &node)
if err != nil {
return false, 0, fmt.Errorf("Unable to label node %s for the GPU Operator deployment, err %s",
node.ObjectMeta.Name, err.Error())
}
}
} // end node loop
n.logger.Info("Number of nodes with GPU label", "NodeCount", gpuNodesTotal)
n.operatorMetrics.gpuNodesTotal.Set(float64(gpuNodesTotal))
return clusterHasNFDLabels, gpuNodesTotal, nil
}
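// getRuntimeString parses the node's ContainerRuntimeVersion ("<runtime>://<x.y.z>")
// and returns the corresponding gpuv1.Runtime value.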
func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
// ContainerRuntimeVersion string will look like <runtime>://<x.y.z>
runtimeVer := node.Status.NodeInfo.ContainerRuntimeVersion
var runtime gpuv1.Runtime
switch {
case strings.HasPrefix(runtimeVer, "docker"):
runtime = gpuv1.Docker
case strings.HasPrefix(runtimeVer, "containerd"):
runtime = gpuv1.Containerd
case strings.HasPrefix(runtimeVer, "cri-o"):
runtime = gpuv1.CRIO
default:
return "", fmt.Errorf("runtime not recognized: %s", runtimeVer)
}
return runtime, nil
}
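// setPodSecurityLabelsForNamespace labels the operator namespace with the privileged
// Pod Security Admission level for the enforce, audit, and warn modes. On OpenShift,
// the labels are only applied when the operator runs in the suggested namespace.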
func (n *ClusterPolicyController) setPodSecurityLabelsForNamespace() error {
ctx := n.ctx
namespaceName := clusterPolicyCtrl.operatorNamespace
if n.openshift != "" && namespaceName != ocpSuggestedNamespace {
// The GPU Operator is not installed in the suggested
// namespace, so the namespace may be shared with other
// untrusted operators. Do not set Pod Security Admission labels.
n.logger.Info("GPU Operator is not installed in the suggested namespace. Not setting Pod Security Admission labels for namespace",
"namespace", namespaceName,
"suggested namespace", ocpSuggestedNamespace)
return nil
}
ns := &corev1.Namespace{}
opts := client.ObjectKey{Name: namespaceName}
err := n.client.Get(ctx, opts, ns)
if err != nil {
return fmt.Errorf("ERROR: could not get Namespace %s from client: %v", namespaceName, err)
}
patch := client.MergeFrom(ns.DeepCopy())
modified := false
// On K8s<1.21, namespaces are not automatically labeled with an immutable label. Initialize
// a labels map if needed before adding PSA labels.
// https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/#automatic-labelling
if ns.ObjectMeta.Labels == nil {
ns.ObjectMeta.Labels = make(map[string]string)
modified = true
}
for _, mode := range podSecurityModes {
key := podSecurityLabelPrefix + mode
if val, ok := ns.ObjectMeta.Labels[key]; !ok || (val != podSecurityLevelPrivileged) {
ns.ObjectMeta.Labels[key] = podSecurityLevelPrivileged
modified = true
}
}
if !modified {
return nil
}
err = n.client.Patch(ctx, ns, patch)
if err != nil {
return fmt.Errorf("unable to label namespace %s with pod security levels: %v", namespaceName, err)
}
return nil
}
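// ocpEnsureNamespaceMonitoring enables OpenShift cluster monitoring on the operator
// namespace by setting the "openshift.io/cluster-monitoring=true" label, unless the
// label has already been set by the user.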
func (n *ClusterPolicyController) ocpEnsureNamespaceMonitoring() error {
ctx := n.ctx
namespaceName := clusterPolicyCtrl.operatorNamespace
if namespaceName != ocpSuggestedNamespace {
// The GPU Operator is not installed in the suggested
// namespace, so the namespace may be shared with other
// untrusted operators. Do not enable namespace monitoring in
// this case, as per OpenShift/Prometheus best practices.
n.logger.Info("GPU Operator not installed in the suggested namespace, skipping namespace monitoring verification",
"namespace", namespaceName,
"suggested namespace", ocpSuggestedNamespace)
return nil
}
ns := &corev1.Namespace{}
opts := client.ObjectKey{Name: namespaceName}
err := n.client.Get(ctx, opts, ns)
if err != nil {
return fmt.Errorf("ERROR: could not get Namespace %s from client: %v", namespaceName, err)
}
val, ok := ns.ObjectMeta.Labels[ocpNamespaceMonitoringLabelKey]
if ok {
// label already defined, do not change it
var msg string
if val == ocpNamespaceMonitoringLabelValue {
msg = "OpenShift monitoring is enabled on the GPU Operator namespace"
} else {
msg = "WARNING: OpenShift monitoring currently disabled on user request"
}
n.logger.Info(msg,
"namespace", namespaceName,
"label", ocpNamespaceMonitoringLabelKey,
"value", val,
"excepted value", ocpNamespaceMonitoringLabelValue)
return nil
}
// label not defined, enable monitoring
n.logger.Info("Enabling OpenShift monitoring")
n.logger.V(1).Info("Adding monitoring label to the operator namespace",
"namespace", namespaceName,
"label", ocpNamespaceMonitoringLabelKey,
"value", ocpNamespaceMonitoringLabelValue)
n.logger.Info("Monitoring can be disabled by setting the namespace label " +
ocpNamespaceMonitoringLabelKey + "=false")
patch := client.MergeFrom(ns.DeepCopy())
ns.ObjectMeta.Labels[ocpNamespaceMonitoringLabelKey] = ocpNamespaceMonitoringLabelValue
err = n.client.Patch(ctx, ns, patch)
if err != nil {
return fmt.Errorf("Unable to label namespace %s for the GPU Operator monitoring, err %s",
namespaceName, err.Error())
}
return nil
}
// getRuntime detects the container runtime used by GPU nodes in the cluster and
// sets ClusterPolicyController.runtime accordingly. On OpenShift the runtime is
// assumed to be cri-o. Otherwise, containerd is preferred: if at least one node
// reports containerd it is selected, and containerd is also used as the fallback
// when no runtime can be detected.
func (n *ClusterPolicyController) getRuntime() error {
ctx := n.ctx
// assume crio for openshift clusters
if n.openshift != "" {
n.runtime = gpuv1.CRIO
return nil
}
opts := []client.ListOption{
client.MatchingLabels{commonGPULabelKey: "true"},
}
list := &corev1.NodeList{}
err := n.client.List(ctx, list, opts...)
if err != nil {
return fmt.Errorf("Unable to list nodes prior to checking container runtime: %v", err)
}
var runtime gpuv1.Runtime
for _, node := range list.Items {
rt, err := getRuntimeString(node)
if err != nil {
n.logger.Info(fmt.Sprintf("Unable to get runtime info for node %s: %v", node.Name, err))
continue
}
runtime = rt
if runtime == gpuv1.Containerd {
// default to containerd if >=1 node running containerd
break
}
}
if runtime.String() == "" {
n.logger.Info("Unable to get runtime info from the cluster, defaulting to containerd")
runtime = gpuv1.Containerd
}
n.runtime = runtime
return nil
}
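// init (re)initializes the controller on each reconciliation: it detects the cluster
// flavor and versions, registers the operand states once, labels and annotates GPU
// nodes, detects the container runtime, and initializes OpenShift-specific parameters.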
func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterPolicyReconciler, clusterPolicy *gpuv1.ClusterPolicy) error {
n.singleton = clusterPolicy
n.ctx = ctx
n.idx = 0
n.logger = reconciler.Log
n.client = reconciler.Client
n.scheme = reconciler.Scheme
if len(n.controls) == 0 {
clusterPolicyCtrl.operatorNamespace = os.Getenv("OPERATOR_NAMESPACE")
if clusterPolicyCtrl.operatorNamespace == "" {
n.logger.Error(nil, "OPERATOR_NAMESPACE environment variable not set, cannot proceed")
// we cannot do anything without the operator namespace,
// let the operator Pod run into `CrashLoopBackOff`
os.Exit(1)
}
version, err := OpenshiftVersion(ctx)
if err != nil && !apierrors.IsNotFound(err) {
return err
}
n.openshift = version
k8sVersion, err := KubernetesVersion()
if err != nil {
return err
}
if !semver.IsValid(k8sVersion) {
return fmt.Errorf("k8s version detected '%s' is not a valid semantic version", k8sVersion)
}
n.k8sVersion = k8sVersion
n.logger.Info("Kubernetes version detected", "version", k8sVersion)
n.operatorMetrics = initOperatorMetrics(n)
n.logger.Info("Operator metrics initialized.")
addState(n, "/opt/gpu-operator/pre-requisites")
addState(n, "/opt/gpu-operator/state-operator-metrics")
addState(n, "/opt/gpu-operator/state-driver")
addState(n, "/opt/gpu-operator/state-container-toolkit")
addState(n, "/opt/gpu-operator/state-operator-validation")
addState(n, "/opt/gpu-operator/state-device-plugin")
addState(n, "/opt/gpu-operator/state-mps-control-daemon")
addState(n, "/opt/gpu-operator/state-dcgm")
addState(n, "/opt/gpu-operator/state-dcgm-exporter")
addState(n, "/opt/gpu-operator/gpu-feature-discovery")
addState(n, "/opt/gpu-operator/state-mig-manager")
addState(n, "/opt/gpu-operator/state-node-status-exporter")
// add sandbox workload states
addState(n, "/opt/gpu-operator/state-vgpu-manager")
addState(n, "/opt/gpu-operator/state-vgpu-device-manager")
addState(n, "/opt/gpu-operator/state-sandbox-validation")
addState(n, "/opt/gpu-operator/state-vfio-manager")
addState(n, "/opt/gpu-operator/state-sandbox-device-plugin")
addState(n, "/opt/gpu-operator/state-kata-manager")
addState(n, "/opt/gpu-operator/state-cc-manager")
}
if clusterPolicy.Spec.SandboxWorkloads.IsEnabled() {
n.sandboxEnabled = true
// defaultGPUWorkloadConfig is container, unless
// user overrides in ClusterPolicy with a valid GPU
// workload configuration
defaultWorkload := clusterPolicy.Spec.SandboxWorkloads.DefaultWorkload
if isValidWorkloadConfig(defaultWorkload) {
n.logger.Info("Default GPU workload is overridden in ClusterPolicy", "DefaultWorkload", defaultWorkload)
defaultGPUWorkloadConfig = defaultWorkload
}
} else {
n.sandboxEnabled = false
}
n.logger.Info("Sandbox workloads", "Enabled", n.sandboxEnabled, "DefaultWorkload", defaultGPUWorkloadConfig)
if n.openshift != "" && (n.singleton.Spec.Operator.UseOpenShiftDriverToolkit == nil ||
*n.singleton.Spec.Operator.UseOpenShiftDriverToolkit) {
// DTK is enabled by default on OpenShift
n.ocpDriverToolkit.requested = true
// mind that this is executed at every reconciliation loop,
// do not assume "permanent" data storage.
n.ocpDriverToolkit.rhcosVersions = make(map[string]bool)
n.ocpDriverToolkit.rhcosDriverToolkitImages = make(map[string]string)
} else {
// DTK is disabled on non-OpenShift deployments or when operator.use_ocp_driver_toolkit
// is explicitly set to 'false' in ClusterPolicy
n.ocpDriverToolkit.requested = false
n.ocpDriverToolkit.enabled = false
n.operatorMetrics.openshiftDriverToolkitEnabled.Set(openshiftDriverToolkitDisabled)
}
if clusterPolicy.Spec.PSA.IsEnabled() {
// label namespace with Pod Security Admission levels
n.logger.Info("Pod Security is enabled. Adding labels to GPU Operator namespace", "namespace", n.operatorNamespace)
err := n.setPodSecurityLabelsForNamespace()
if err != nil {
return err
}
n.logger.Info("Pod Security Admission labels added to GPU Operator namespace", "namespace", n.operatorNamespace)
}
// fetch all nodes and label gpu nodes
hasNFDLabels, gpuNodeCount, err := n.labelGPUNodes()
if err != nil {
return err
}
n.hasGPUNodes = gpuNodeCount != 0
n.hasNFDLabels = hasNFDLabels
// fetch all nodes and annotate gpu nodes
err = n.applyDriverAutoUpgradeAnnotation()
if err != nil {
return err
}
// detect the container runtime on worker nodes
err = n.getRuntime()
if err != nil {
return err
}
n.logger.Info(fmt.Sprintf("Using container runtime: %s", n.runtime.String()))
// fetch all kernel versions from the GPU nodes in the cluster
if n.singleton.Spec.Driver.IsEnabled() && n.singleton.Spec.Driver.UsePrecompiledDrivers() {
kernelVersionMap, err := n.getKernelVersionsMap()
if err != nil {
n.logger.Info("Unable to obtain all kernel versions of the GPU nodes in the cluster", "err", err)
return err
}
n.kernelVersionMap = kernelVersionMap
}
if n.openshift != "" {
// initialize openshift specific parameters
err = n.initOCPParams()
if err != nil {
return err
}
}
return nil
}
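// initOCPParams initializes OpenShift-specific state: it determines whether the
// Driver Toolkit can be used and ensures namespace monitoring is configured.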
func (n *ClusterPolicyController) initOCPParams() error {
// initialize openshift specific parameters
if n.singleton.Spec.Driver.UsePrecompiledDrivers() {
// disable DTK for OCP when already pre-compiled drivers are used
n.ocpDriverToolkit.enabled = false
} else if n.ocpDriverToolkit.requested {
hasImageStream, err := ocpHasDriverToolkitImageStream(n)
if err != nil {
n.logger.Info("ocpHasDriverToolkitImageStream", "err", err)
return err
}
hasCompatibleNFD := len(n.ocpDriverToolkit.rhcosVersions) != 0
n.ocpDriverToolkit.enabled = hasImageStream && hasCompatibleNFD
if n.ocpDriverToolkit.enabled {
n.operatorMetrics.openshiftDriverToolkitEnabled.Set(openshiftDriverToolkitEnabled)
} else {
n.operatorMetrics.openshiftDriverToolkitEnabled.Set(openshiftDriverToolkitNotPossible)
}
n.logger.Info("OpenShift Driver Toolkit requested",
"hasCompatibleNFD", hasCompatibleNFD,
"hasDriverToolkitImageStream", hasImageStream)
n.logger.Info("OpenShift Driver Toolkit",
"enabled", n.ocpDriverToolkit.enabled)
if hasImageStream {
n.operatorMetrics.openshiftDriverToolkitIsMissing.Set(0)
} else {
n.operatorMetrics.openshiftDriverToolkitIsMissing.Set(1)
}
if n.hasGPUNodes && !hasCompatibleNFD {
n.operatorMetrics.openshiftDriverToolkitNfdTooOld.Set(1)
} else {
n.operatorMetrics.openshiftDriverToolkitNfdTooOld.Set(0)
}
}
// enable monitoring for the gpu-operator namespace
if err := n.ocpEnsureNamespaceMonitoring(); err != nil {
return err
}
return nil
}
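// step executes the control functions for the current state, advances the state index,
// and returns the aggregated readiness of the deployed resources.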
func (n *ClusterPolicyController) step() (gpuv1.State, error) {
result := gpuv1.Ready
// Skip state-driver if NVIDIADriver CRD is enabled
// TODO:
// - Properly clean up any k8s object associated with 'state-driver'
// and owned by the Clusterpolicy controller.
// - In object_controls.go, check the OwnerRef for existing objects
// before managing them. Clusterpolicy controller should not be creating /
// updating / deleting objects owned by another controller.
if (n.stateNames[n.idx] == "state-driver" || n.stateNames[n.idx] == "state-vgpu-manager") &&
n.singleton.Spec.Driver.UseNvdiaDriverCRDType() {
n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy")
n.idx++
// Cleanup all driver daemonsets owned by ClusterPolicy, but orphan the dependent pod objects.
// This way, switching to the new NVIDIADriver API does not cause a cluster-wide disruption.
// NVIDIA driver pods owned by ClusterPolicy daemonsets will remain running until the NVIDIADriver
// controller migrates these pods to new ones owned by NVIDIADriver daemonsets.
deletePropagationOrphan := metav1.DeletePropagationOrphan
err := n.cleanupAllDriverDaemonSets(n.ctx, &client.DeleteOptions{PropagationPolicy: &deletePropagationOrphan})
if err != nil {
return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err)
}
return gpuv1.Disabled, nil
}
for _, fs := range n.controls[n.idx] {
stat, err := fs(*n)
if err != nil {
return stat, err
}
// successfully deployed resource, now check if it's ready
if stat != gpuv1.Ready {
// mark overall status of this component as not-ready and continue with other resources, while this becomes ready
result = stat
}
}
// move to next state
n.idx++
return result, nil
}
// TODO
// func (n ClusterPolicyController) validate() {
// add custom validation functions
// }
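// last returns true once all registered states have been processed.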
func (n ClusterPolicyController) last() bool {
return n.idx == len(n.controls)
}
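// isStateEnabled returns true if the operand corresponding to the given state name
// is enabled in the ClusterPolicy spec.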
func (n ClusterPolicyController) isStateEnabled(stateName string) bool {
clusterPolicySpec := &n.singleton.Spec
switch stateName {
case "state-driver":
return clusterPolicySpec.Driver.IsEnabled()
case "state-container-toolkit":