opea-project · poussa · Jun 26, 2025 · Jun 4, 2025 · Jun 4, 2025 · Jun 5, 2025
@@ -44,11 +44,6 @@ tolerations: []
 
 affinity: {}
 
-# This is just to avoid Helm errors when HPA is NOT used
-# (use hpa-values.yaml files to actually enable HPA).
-horizontalPodAutoscaler:
-  enabled: false
-
 docretriever:
   image:
     repository: opea/doc-index-retriever

@@ -14,6 +14,15 @@ dashboard:
 
 autoscaling:
   enabled: true
+  minReplicas: 1
+  maxReplicas: 4
+  # ChatQnA becomes a scaling bottleneck when it gets close to 100% CPU usage
+  targetCPUUtilizationPercentage: 80
+resources:
+  # CPU side HPA won't work without resource requests
+  requests:
+    # ChatQnA does not currently use multiple threads
+    cpu: 1
 
 global:
   # K8s custom metrics (used for scaling thresholds) are based on metrics from service monitoring

@@ -0,0 +1,25 @@
+{{- if .Values.autoscaling.enabled }}
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "chatqna.fullname" . }}
+  labels:
+    {{- include "chatqna.labels" . | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "chatqna.fullname" . }}
+  minReplicas: {{ .Values.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      target:
+        type: Utilization
+        averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
+{{- end }}
@@ -1690,6 +1690,30 @@ data:
               "legendFormat": "vLLM: used",
               "range": true,
               "refId": "H"
+            },
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${Metrics}"
+              },
+              "editorMode": "code",
+              "expr": "count(up{service=\"$release\",namespace=\"$namespace\"})",
+              "hide": false,
+              "legendFormat": "MegaService: instances",
+              "range": true,
+              "refId": "I"
+            },
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "${Metrics}"
+              },
+              "editorMode": "code",
+              "expr": "count(megaservice_first_token_latency_count{service=\"$release\",namespace=\"$namespace\"})",
+              "hide": false,
+              "legendFormat": "MegaService: used",
+              "range": true,
+              "refId": "J"
             }
           ],
           "title": "Replicas",