From 23bf664c2cde1327311daa0489233b7b132ba649 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Mon, 27 Oct 2025 16:19:42 -0600 Subject: [PATCH 1/2] fix: remove duplicates from imagePullSecrets Signed-off-by: Julien Mancuso --- .../operator/templates/deployment.yaml | 4 +- .../cloud/operator/internal/dynamo/graph.go | 17 +++++- .../operator/internal/dynamo/graph_test.go | 61 +++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml index 7fdd6f3787..b7194087b8 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml @@ -87,12 +87,12 @@ spec: {{- if .Values.natsAddr }} - --natsAddr={{ .Values.natsAddr }} {{- else }} - - --natsAddr=nats://{{ .Release.Name }}-nats.{{ .Release.Namespace }}:4222 + - --natsAddr=nats://{{ .Release.Name }}-nats.{{ .Release.Namespace }}.svc.cluster.local:4222 {{- end }} {{- if .Values.etcdAddr }} - --etcdAddr={{ .Values.etcdAddr }} {{- else }} - - --etcdAddr={{ .Release.Name }}-etcd.{{ .Release.Namespace }}:2379 + - --etcdAddr={{ .Release.Name }}-etcd.{{ .Release.Namespace }}.svc.cluster.local:2379 {{- end }} {{- if and .Values.dynamo.istio.enabled .Values.dynamo.istio.gateway }} - --istio-virtual-service-gateway={{ .Values.dynamo.istio.gateway }} diff --git a/deploy/cloud/operator/internal/dynamo/graph.go b/deploy/cloud/operator/internal/dynamo/graph.go index e1cb39c396..94c7f3cd57 100644 --- a/deploy/cloud/operator/internal/dynamo/graph.go +++ b/deploy/cloud/operator/internal/dynamo/graph.go @@ -853,12 +853,27 @@ func GenerateBasePodSpec( } podSpec.Containers = append(podSpec.Containers, container) podSpec.Volumes = append(podSpec.Volumes, volumes...) - podSpec.ImagePullSecrets = append(podSpec.ImagePullSecrets, imagePullSecrets...) + podSpec.ImagePullSecrets = ensureUniqueImagePullSecrets(podSpec.ImagePullSecrets, imagePullSecrets) backend.UpdatePodSpec(&podSpec, numberOfNodes, role, component, serviceName) return controller_common.CanonicalizePodSpec(&podSpec), nil } +func ensureUniqueImagePullSecrets(existing, new []corev1.LocalObjectReference) []corev1.LocalObjectReference { + if len(existing) == 0 && len(new) == 0 { + return nil + } + uniqueSecrets := make(map[string]corev1.LocalObjectReference) + for _, secret := range append(existing, new...) { + uniqueSecrets[secret.Name] = secret + } + uniqueSecretsList := make([]corev1.LocalObjectReference, 0, len(uniqueSecrets)) + for _, secret := range uniqueSecrets { + uniqueSecretsList = append(uniqueSecretsList, secret) + } + return uniqueSecretsList +} + func setMetricsLabels(labels map[string]string, dynamoGraphDeployment *v1alpha1.DynamoGraphDeployment) { // Convert user-provided metrics annotation into controller-managed label // By default (no annotation), metrics are enabled diff --git a/deploy/cloud/operator/internal/dynamo/graph_test.go b/deploy/cloud/operator/internal/dynamo/graph_test.go index 99d6b13dc0..860fe9868c 100644 --- a/deploy/cloud/operator/internal/dynamo/graph_test.go +++ b/deploy/cloud/operator/internal/dynamo/graph_test.go @@ -5205,3 +5205,64 @@ func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T) }) } } + +func Test_ensureUniqueImagePullSecrets(t *testing.T) { + tests := []struct { + name string + existing []corev1.LocalObjectReference + new []corev1.LocalObjectReference + want []corev1.LocalObjectReference + }{ + { + name: "no existing or new secrets", + existing: []corev1.LocalObjectReference{}, + new: []corev1.LocalObjectReference{}, + want: nil, + }, + { + name: "existing and new secrets", + existing: []corev1.LocalObjectReference{ + {Name: "secret1"}, + {Name: "secret2"}, + }, + new: []corev1.LocalObjectReference{ + {Name: "secret3"}, + }, + want: []corev1.LocalObjectReference{ + {Name: "secret1"}, + {Name: "secret2"}, + {Name: "secret3"}, + }, + }, + { + name: "existing secrets with duplicate new secret", + existing: []corev1.LocalObjectReference{ + {Name: "secret1"}, + {Name: "secret2"}, + }, + new: []corev1.LocalObjectReference{ + {Name: "secret2"}, + {Name: "secret3"}, + }, + want: []corev1.LocalObjectReference{ + {Name: "secret1"}, + {Name: "secret2"}, + {Name: "secret3"}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ensureUniqueImagePullSecrets(tt.existing, tt.new) + sort.Slice(got, func(i, j int) bool { + return got[i].Name < got[j].Name + }) + sort.Slice(tt.want, func(i, j int) bool { + return tt.want[i].Name < tt.want[j].Name + }) + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("ensureUniqueImagePullSecrets() = %v, want %v", got, tt.want) + } + }) + } +} From 7d6b737dba1f7ef91d9a27d40e9a57abaf932947 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Tue, 28 Oct 2025 10:39:05 -0600 Subject: [PATCH 2/2] fix: remove duplicates from imagePullSecrets Signed-off-by: Julien Mancuso --- ...ographdeploymentrequest_controller_test.go | 186 +++++++++--------- .../internal/controller_common/pod.go | 21 +- .../internal/controller_common/pod_test.go | 10 + .../cloud/operator/internal/dynamo/graph.go | 17 +- .../operator/internal/dynamo/graph_test.go | 61 ------ 5 files changed, 125 insertions(+), 170 deletions(-) diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go index c5dfb39bb3..35307bcaa5 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go @@ -37,6 +37,10 @@ import ( "sigs.k8s.io/yaml" ) +const ( + defaultNamespace = "default" +) + // MockRBACManager implements RBACManager for testing type MockRBACManager struct { EnsureServiceAccountWithRBACFunc func(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error @@ -88,7 +92,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { It("Should validate spec and transition to Pending", func() { ctx := context.Background() dgdrName := "test-dgdr-initial" - namespace := "default" + namespace := defaultNamespace dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -96,10 +100,10 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -120,7 +124,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // First reconcile: Empty -> Pending _, err := reconciler.Reconcile(ctx, reconcile.Request{ @@ -134,20 +138,20 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { // Check status Eventually(func() string { var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest - k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) + _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) return updated.Status.State }, timeout, interval).Should(Equal(StatePending)) // Verify observedGeneration is set var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest - k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) + _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation)) }) It("Should pass validation with minimal config", func() { ctx := context.Background() dgdrName := "test-dgdr-minimal" - namespace := "default" + namespace := defaultNamespace dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -155,10 +159,10 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "sla": map[string]interface{}{ "ttft": 100.0, @@ -170,7 +174,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile - should succeed with minimal config _, err := reconciler.Reconcile(ctx, reconcile.Request{ @@ -184,7 +188,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { // Check status transitions to Pending (not Failed) Eventually(func() string { var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest - k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) + _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) return updated.Status.State }, timeout, interval).Should(Equal(StatePending)) }) @@ -194,7 +198,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { It("Should create online profiling job", func() { ctx := context.Background() dgdrName := "test-dgdr-profiling-online" - namespace := "default" + namespace := defaultNamespace // Create ConfigMap for DGD base config configMap := &corev1.ConfigMap{ @@ -207,7 +211,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { }, } Expect(k8sClient.Create(ctx, configMap)).Should(Succeed()) - defer k8sClient.Delete(ctx, configMap) + defer func() { _ = k8sClient.Delete(ctx, configMap) }() // Create ServiceAccount sa := &corev1.ServiceAccount{ @@ -217,7 +221,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { }, } Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) - defer k8sClient.Delete(ctx, sa) + defer func() { _ = k8sClient.Delete(ctx, sa) }() dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -225,10 +229,10 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "profiler_image": "test-profiler:latest", @@ -253,7 +257,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile multiple times to move through states _, err := reconciler.Reconcile(ctx, reconcile.Request{ @@ -278,7 +282,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { // Verify job has correct labels jobName := getProfilingJobName(dgdr) job := &batchv1.Job{} - k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job) + _ = k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job) Expect(job.Labels[LabelApp]).Should(Equal(LabelValueDynamoProfiler)) Expect(job.Labels[LabelDGDR]).Should(Equal(dgdrName)) @@ -300,13 +304,13 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { )) // Clean up job - k8sClient.Delete(ctx, job) + _ = k8sClient.Delete(ctx, job) }) It("Should create offline (AIC) profiling job", func() { ctx := context.Background() dgdrName := "test-dgdr-profiling-aic" - namespace := "default" + namespace := defaultNamespace // Create ServiceAccount sa := &corev1.ServiceAccount{ @@ -315,8 +319,8 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { Namespace: namespace, }, } - _ = k8sClient.Create(ctx, sa) - defer k8sClient.Delete(ctx, sa) + Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) + defer func() { _ = k8sClient.Delete(ctx, sa) }() dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -324,10 +328,10 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "trtllm", + Model: "test-model", + Backend: "trtllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -355,7 +359,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile _, err := reconciler.Reconcile(ctx, reconcile.Request{ @@ -382,7 +386,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { jobName := getProfilingJobName(dgdr) job := &batchv1.Job{} if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job); err == nil { - k8sClient.Delete(ctx, job) + _ = k8sClient.Delete(ctx, job) } }) }) @@ -391,7 +395,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { It("Should generate DGD spec from ConfigMap", func() { ctx := context.Background() dgdrName := "test-dgdr-profiling-complete" - namespace := "default" + namespace := defaultNamespace dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -399,10 +403,10 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -419,7 +423,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Update status to Profiling using Status subresource dgdr.Status.State = StateProfiling @@ -451,7 +455,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { }, } Expect(k8sClient.Create(ctx, job)).Should(Succeed()) - defer k8sClient.Delete(ctx, job) + defer func() { _ = k8sClient.Delete(ctx, job) }() // Update job status to completed using Status subresource job.Status.Conditions = []batchv1.JobCondition{{ @@ -481,7 +485,7 @@ spec: }, } Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) - defer k8sClient.Delete(ctx, cm) + defer func() { _ = k8sClient.Delete(ctx, cm) }() // Reconcile to process the profiling completion _, err := reconciler.Reconcile(ctx, reconcile.Request{ @@ -505,7 +509,7 @@ spec: It("Should create DGD after profiling", func() { ctx := context.Background() dgdrName := "test-dgdr-autoapply" - namespace := "default" + namespace := defaultNamespace dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -513,10 +517,10 @@ spec: Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -534,7 +538,7 @@ spec: } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Update status to Profiling using Status subresource dgdr.Status.State = StateProfiling @@ -566,7 +570,7 @@ spec: }, } Expect(k8sClient.Create(ctx, job)).Should(Succeed()) - defer k8sClient.Delete(ctx, job) + defer func() { _ = k8sClient.Delete(ctx, job) }() // Update job status to completed using Status subresource job.Status.Conditions = []batchv1.JobCondition{{ @@ -596,7 +600,7 @@ spec: }, } Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) - defer k8sClient.Delete(ctx, cm) + defer func() { _ = k8sClient.Delete(ctx, cm) }() // Reconcile to generate spec (transitions to Deploying because autoApply=true) _, err := reconciler.Reconcile(ctx, reconcile.Request{ @@ -620,14 +624,14 @@ spec: Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed()) // Get final DGDR status - k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.Deployment).NotTo(BeNil()) Expect(updated.Status.Deployment.Created).Should(BeTrue()) Expect(updated.Status.Deployment.Name).Should(Equal("test-dgd-auto")) // Clean up DGD - k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd) - k8sClient.Delete(ctx, dgd) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed()) + _ = k8sClient.Delete(ctx, dgd) }) }) @@ -635,7 +639,7 @@ spec: It("Should reject spec changes after profiling starts", func() { ctx := context.Background() dgdrName := "test-dgdr-immutable" - namespace := "default" + namespace := defaultNamespace dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -643,10 +647,10 @@ spec: Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -663,7 +667,7 @@ spec: } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile to initialize _, err := reconciler.Reconcile(ctx, reconcile.Request{ @@ -673,22 +677,22 @@ spec: // Get current generation var current nvidiacomv1alpha1.DynamoGraphDeploymentRequest - k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, ¤t) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, ¤t)).Should(Succeed()) initialGeneration := current.Generation observedGeneration := current.Status.ObservedGeneration // Manually set state to Profiling to simulate in-progress profiling current.Status.State = StateProfiling - k8sClient.Status().Update(ctx, ¤t) + Expect(k8sClient.Status().Update(ctx, ¤t)).Should(Succeed()) // Try to modify spec - k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, ¤t) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, ¤t)).Should(Succeed()) // Unmarshal config, modify it, and marshal back var config map[string]interface{} - yaml.Unmarshal(current.Spec.ProfilingConfig.Config.Raw, &config) + Expect(yaml.Unmarshal(current.Spec.ProfilingConfig.Config.Raw, &config)).Should(Succeed()) config["sla"].(map[string]interface{})["ttft"] = 200.0 current.Spec.ProfilingConfig.Config = createTestConfig(config) - k8sClient.Update(ctx, ¤t) + Expect(k8sClient.Update(ctx, ¤t)).Should(Succeed()) // Reconcile _, err = reconciler.Reconcile(ctx, reconcile.Request{ @@ -697,7 +701,7 @@ spec: Expect(err).NotTo(HaveOccurred()) // Verify generation changed but observedGeneration stayed the same - k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, ¤t) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, ¤t)).Should(Succeed()) Expect(current.Generation).Should(BeNumerically(">", initialGeneration)) Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration)) Expect(current.Status.State).Should(Equal(StateProfiling)) // State unchanged @@ -718,7 +722,7 @@ spec: It("Should transition to DeploymentDeleted state", func() { ctx := context.Background() dgdrName := "test-dgdr-dgd-deleted" - namespace := "default" + namespace := defaultNamespace dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -726,10 +730,10 @@ spec: Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -747,7 +751,7 @@ spec: } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Update status to Ready with Deployment info using Status subresource dgdr.Status.State = StateReady @@ -873,10 +877,10 @@ var _ = Describe("DGDR Validation", func() { ctx := context.Background() dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -900,10 +904,10 @@ var _ = Describe("DGDR Validation", func() { ctx := context.Background() dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "sla": map[string]interface{}{ "ttft": 100.0, @@ -948,8 +952,8 @@ var _ = Describe("DGDR Profiler Arguments", func() { Namespace: namespace, }, } - _ = k8sClient.Create(ctx, sa) - defer k8sClient.Delete(ctx, sa) + Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) + defer func() { _ = k8sClient.Delete(ctx, sa) }() dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -957,10 +961,10 @@ var _ = Describe("DGDR Profiler Arguments", func() { Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "trtllm", + Model: "test-model", + Backend: "trtllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -986,7 +990,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Re-fetch DGDR to get proper metadata from API server var fetchedDGDR nvidiacomv1alpha1.DynamoGraphDeploymentRequest @@ -1009,12 +1013,12 @@ var _ = Describe("DGDR Profiler Arguments", func() { Expect(args).Should(ContainElement("--profile-config")) // Clean up - k8sClient.Delete(ctx, job) + _ = k8sClient.Delete(ctx, job) }) It("Should pass config with AI Configurator settings for offline profiling", func() { ctx := context.Background() - namespace := "default" + namespace := defaultNamespace dgdrName := "test-args-offline" // Create ServiceAccount @@ -1024,8 +1028,8 @@ var _ = Describe("DGDR Profiler Arguments", func() { Namespace: namespace, }, } - _ = k8sClient.Create(ctx, sa) - defer k8sClient.Delete(ctx, sa) + Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) + defer func() { _ = k8sClient.Delete(ctx, sa) }() dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ @@ -1033,10 +1037,10 @@ var _ = Describe("DGDR Profiler Arguments", func() { Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "trtllm", + Model: "test-model", + Backend: "trtllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -1065,7 +1069,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Re-fetch DGDR to get proper metadata from API server var fetchedDGDR nvidiacomv1alpha1.DynamoGraphDeploymentRequest @@ -1088,7 +1092,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { Expect(args).Should(ContainElement("--profile-config")) // Clean up - k8sClient.Delete(ctx, job) + _ = k8sClient.Delete(ctx, job) }) }) }) @@ -1112,7 +1116,7 @@ var _ = Describe("DGDR Error Handling", func() { Context("When profiling job fails", func() { It("Should capture detailed error from pod termination state", func() { ctx := context.Background() - namespace := "default" + namespace := defaultNamespace dgdrName := "test-error-capture" dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ @@ -1121,10 +1125,10 @@ var _ = Describe("DGDR Error Handling", func() { Namespace: namespace, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ - Model: "test-model", - ProfilerImage: "test-profiler:latest", - Backend: "vllm", + Model: "test-model", + Backend: "vllm", ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ + ProfilerImage: "test-profiler:latest", Config: createTestConfig(map[string]interface{}{ "engine": map[string]interface{}{ "config": "/tmp/test-config.yaml", @@ -1145,7 +1149,7 @@ var _ = Describe("DGDR Error Handling", func() { } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) - defer k8sClient.Delete(ctx, dgdr) + defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Set status to Profiling dgdr.Status.State = StateProfiling @@ -1178,7 +1182,7 @@ var _ = Describe("DGDR Error Handling", func() { }, } Expect(k8sClient.Create(ctx, job)).Should(Succeed()) - defer k8sClient.Delete(ctx, job) + defer func() { _ = k8sClient.Delete(ctx, job) }() // Update job status job.Status.Conditions = []batchv1.JobCondition{{ @@ -1219,7 +1223,7 @@ var _ = Describe("DGDR Error Handling", func() { }, } Expect(k8sClient.Create(ctx, pod)).Should(Succeed()) - defer k8sClient.Delete(ctx, pod) + defer func() { _ = k8sClient.Delete(ctx, pod) }() // Reconcile - should capture error details _, err := reconciler.Reconcile(ctx, reconcile.Request{ diff --git a/deploy/cloud/operator/internal/controller_common/pod.go b/deploy/cloud/operator/internal/controller_common/pod.go index dc5ea92563..48415b4451 100644 --- a/deploy/cloud/operator/internal/controller_common/pod.go +++ b/deploy/cloud/operator/internal/controller_common/pod.go @@ -101,9 +101,11 @@ func CanonicalizePodSpec(podSpec *corev1.PodSpec) *corev1.PodSpec { // Sort image pull secrets if len(podSpec.ImagePullSecrets) > 1 { - sort.Slice(podSpec.ImagePullSecrets, func(i, j int) bool { - return podSpec.ImagePullSecrets[i].Name < podSpec.ImagePullSecrets[j].Name + uniqueSecrets := ensureUniqueImagePullSecrets(podSpec.ImagePullSecrets) + sort.Slice(uniqueSecrets, func(i, j int) bool { + return uniqueSecrets[i].Name < uniqueSecrets[j].Name }) + podSpec.ImagePullSecrets = uniqueSecrets } // Sort volumes and their nested items @@ -275,3 +277,18 @@ func CanonicalizePodSpec(podSpec *corev1.PodSpec) *corev1.PodSpec { return podSpec } + +func ensureUniqueImagePullSecrets(secrets []corev1.LocalObjectReference) []corev1.LocalObjectReference { + if len(secrets) == 0 { + return nil + } + uniqueSecrets := make(map[string]corev1.LocalObjectReference) + for _, secret := range secrets { + uniqueSecrets[secret.Name] = secret + } + uniqueSecretsList := make([]corev1.LocalObjectReference, 0, len(uniqueSecrets)) + for secretName := range uniqueSecrets { + uniqueSecretsList = append(uniqueSecretsList, corev1.LocalObjectReference{Name: secretName}) + } + return uniqueSecretsList +} diff --git a/deploy/cloud/operator/internal/controller_common/pod_test.go b/deploy/cloud/operator/internal/controller_common/pod_test.go index 8f555ebf60..94a646cf67 100644 --- a/deploy/cloud/operator/internal/controller_common/pod_test.go +++ b/deploy/cloud/operator/internal/controller_common/pod_test.go @@ -208,6 +208,7 @@ func TestCanonicalizePodSpec(t *testing.T) { {Name: "registry-z"}, {Name: "registry-a"}, {Name: "registry-b"}, + {Name: "registry-a"}, }, }, expected: &corev1.PodSpec{ @@ -218,6 +219,15 @@ func TestCanonicalizePodSpec(t *testing.T) { }, }, }, + { + name: "sorts nil image pull secrets", + input: &corev1.PodSpec{ + ImagePullSecrets: nil, + }, + expected: &corev1.PodSpec{ + ImagePullSecrets: nil, + }, + }, { name: "sorts volumes by name", input: &corev1.PodSpec{ diff --git a/deploy/cloud/operator/internal/dynamo/graph.go b/deploy/cloud/operator/internal/dynamo/graph.go index 94c7f3cd57..e1cb39c396 100644 --- a/deploy/cloud/operator/internal/dynamo/graph.go +++ b/deploy/cloud/operator/internal/dynamo/graph.go @@ -853,27 +853,12 @@ func GenerateBasePodSpec( } podSpec.Containers = append(podSpec.Containers, container) podSpec.Volumes = append(podSpec.Volumes, volumes...) - podSpec.ImagePullSecrets = ensureUniqueImagePullSecrets(podSpec.ImagePullSecrets, imagePullSecrets) + podSpec.ImagePullSecrets = append(podSpec.ImagePullSecrets, imagePullSecrets...) backend.UpdatePodSpec(&podSpec, numberOfNodes, role, component, serviceName) return controller_common.CanonicalizePodSpec(&podSpec), nil } -func ensureUniqueImagePullSecrets(existing, new []corev1.LocalObjectReference) []corev1.LocalObjectReference { - if len(existing) == 0 && len(new) == 0 { - return nil - } - uniqueSecrets := make(map[string]corev1.LocalObjectReference) - for _, secret := range append(existing, new...) { - uniqueSecrets[secret.Name] = secret - } - uniqueSecretsList := make([]corev1.LocalObjectReference, 0, len(uniqueSecrets)) - for _, secret := range uniqueSecrets { - uniqueSecretsList = append(uniqueSecretsList, secret) - } - return uniqueSecretsList -} - func setMetricsLabels(labels map[string]string, dynamoGraphDeployment *v1alpha1.DynamoGraphDeployment) { // Convert user-provided metrics annotation into controller-managed label // By default (no annotation), metrics are enabled diff --git a/deploy/cloud/operator/internal/dynamo/graph_test.go b/deploy/cloud/operator/internal/dynamo/graph_test.go index 860fe9868c..99d6b13dc0 100644 --- a/deploy/cloud/operator/internal/dynamo/graph_test.go +++ b/deploy/cloud/operator/internal/dynamo/graph_test.go @@ -5205,64 +5205,3 @@ func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T) }) } } - -func Test_ensureUniqueImagePullSecrets(t *testing.T) { - tests := []struct { - name string - existing []corev1.LocalObjectReference - new []corev1.LocalObjectReference - want []corev1.LocalObjectReference - }{ - { - name: "no existing or new secrets", - existing: []corev1.LocalObjectReference{}, - new: []corev1.LocalObjectReference{}, - want: nil, - }, - { - name: "existing and new secrets", - existing: []corev1.LocalObjectReference{ - {Name: "secret1"}, - {Name: "secret2"}, - }, - new: []corev1.LocalObjectReference{ - {Name: "secret3"}, - }, - want: []corev1.LocalObjectReference{ - {Name: "secret1"}, - {Name: "secret2"}, - {Name: "secret3"}, - }, - }, - { - name: "existing secrets with duplicate new secret", - existing: []corev1.LocalObjectReference{ - {Name: "secret1"}, - {Name: "secret2"}, - }, - new: []corev1.LocalObjectReference{ - {Name: "secret2"}, - {Name: "secret3"}, - }, - want: []corev1.LocalObjectReference{ - {Name: "secret1"}, - {Name: "secret2"}, - {Name: "secret3"}, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := ensureUniqueImagePullSecrets(tt.existing, tt.new) - sort.Slice(got, func(i, j int) bool { - return got[i].Name < got[j].Name - }) - sort.Slice(tt.want, func(i, j int) bool { - return tt.want[i].Name < tt.want[j].Name - }) - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("ensureUniqueImagePullSecrets() = %v, want %v", got, tt.want) - } - }) - } -}