31 changes: 24 additions & 7 deletions helm-charts/chatqna/hpa-values.yaml
@@ -3,8 +3,9 @@

# Enable HorizontalPodAutoscaler (HPA)
#
# That will overwrite named PrometheusAdapter configMap with ChatQnA specific
# custom metric queries for embedding, reranking, and LLM services.
# Will create a configMap with ChatQnA-specific custom metric queries for the embedding,
# reranking, and LLM services, which can be used to overwrite the current PrometheusAdapter
# rules. These provide the custom metrics used by each service's HorizontalPodAutoscaler rules.
#
# Default upstream configMap is in:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
@@ -20,27 +21,43 @@ global:
monitoring: true

# Override values in specific subcharts
#
# Note: enabling "autoscaling" for any of the subcharts requires enabling it also above!

# Enabling "autoscaling" for any of the subcharts requires enabling it also above!
vllm:
# vLLM startup takes too long for autoscaling, especially with Gaudi
VLLM_SKIP_WARMUP: "true"
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 4
enabled: true
queueSizeTarget:
accel: 10
cpu: 10

tgi:
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 4
enabled: true
queueSizeTarget:
accel: 10
cpu: 10

teirerank:
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 3
enabled: true
queueSizeTarget:
accel: 10
cpu: 10

tei:
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 2
enabled: true
queueSizeTarget:
accel: 10
cpu: 10
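Note that installing the generated rules into Prometheus-adapter remains a manual step. A minimal sketch of one way to wire it up, assuming the adapter was installed from the prometheus-community Helm chart and the release is named "chatqna" (the rules.existing value and the rendered ConfigMap name are assumptions, not verified chart API; if the adapter runs in a different namespace than the ConfigMap, e.g. "monitoring", the ConfigMap first needs to be copied there):

# prometheus-adapter chart values override (a sketch under the above assumptions)
rules:
  default: false
  # name of an existing ConfigMap (with a "config.yaml" key) holding the rules
  existing: chatqna-custom-metrics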
56 changes: 14 additions & 42 deletions helm-charts/chatqna/templates/custom-metrics-configmap.yaml
@@ -1,30 +1,28 @@
# Copyright (C) 2024 Intel Corporation
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
# easy to find for the required manual step
# easy to find for the manual step required to install these rules into Prometheus-adapter
namespace: default
name: {{ include "chatqna.fullname" . }}-custom-metrics
labels:
app.kubernetes.io/name: prometheus-adapter
data:
config.yaml: |
rules:
{{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }}
# check metric with:
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric> | jq
#
- seriesQuery: '{__name__="vllm:time_per_output_token_seconds_sum",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}'
# Average output token latency from vLLM histograms, over 1 min
# (interval should be at least 4x serviceMonitor query interval,
# 0.001 divider add is to make sure there's always a valid value)
metricsQuery: 'rate(vllm:time_per_output_token_seconds_sum{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(vllm:time_per_output_token_seconds_count{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]))'
{{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }}
- seriesQuery: '{__name__="vllm:num_requests_waiting",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}'
# Sum of requests waiting to be processed in pods
metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
name:
matches: ^vllm:time_per_output_token_seconds_sum
as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_token_latency"
matches: ^vllm:num_requests_waiting
as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_queue_size_sum"
resources:
# HPA needs both namespace + suitable object resource for its query paths:
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric>
@@ -34,63 +32,37 @@ data:
service: {resource: "service"}
{{- end }}
{{- if and .Values.tgi.enabled .Values.tgi.autoscaling.enabled }}
{{- if .Values.tgi.accelDevice }}
- seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# TGI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>})'
# - GroupBy/LabelMatchers provide labels from the resources section
metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
name:
matches: ^tgi_queue_size
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# Average request latency from TGI histograms, over 1 min
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^tgi_request_inference_duration_sum
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- if .Values.teirerank.autoscaling.enabled }}
{{- if .Values.teirerank.accelDevice }}
{{- if and .Values.teirerank.enabled .Values.teirerank.autoscaling.enabled }}
- seriesQuery: '{__name__="te_queue_size",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# TEI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>})'
metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
name:
matches: ^te_queue_size
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- if .Values.tei.autoscaling.enabled }}
{{- if .Values.tei.accelDevice }}
- seriesQuery: '{__name__="te_queue_size",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# TEI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>})'
metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
name:
matches: ^te_queue_size
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
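For illustration, with a Helm release named "chatqna" the vLLM rule above would render roughly as follows (the release name and the rendered metric prefix "vllm" are assumptions):

- seriesQuery: '{__name__="vllm:num_requests_waiting",service="chatqna-vllm"}'
  # sum of waiting requests across all pods behind the service
  metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
  name:
    matches: ^vllm:num_requests_waiting
    as: "vllm_queue_size_sum"
  resources:
    overrides:
      namespace: {resource: "namespace"}
      service: {resource: "service"}

Prometheus-adapter then serves the renamed metric at /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/vllm_queue_size_sum, which is what the HPA objects below query.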
21 changes: 8 additions & 13 deletions helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml
@@ -1,7 +1,7 @@
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
@@ -22,24 +22,19 @@ spec:
kind: Service
name: {{ include "tei.fullname" . }}
target:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# the custom metrics API by the number of Pods before comparing to the target
# (pods need to become Ready within the specified stabilization window):
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "tei.metricPrefix" . }}_queue_size_sum
{{- if .Values.accelDevice }}
averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }}
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4 # seconds
metric:
name: {{ include "tei.metricPrefix" . }}_request_latency
averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }}
{{- end }}
metric:
name: {{ include "tei.metricPrefix" . }}_queue_size_sum
behavior:
scaleDown:
stabilizationWindowSeconds: 180
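To make the AverageValue math concrete: HPA divides the summed queue size by the number of ready pods and compares the result to the target, i.e. desiredReplicas = ceil(currentReplicas * perPodAverage / target). With a target of 10 and te_queue_size_sum of 45 across 3 pods, the per-pod average is 15, so HPA requests ceil(3 * 15 / 10) = 5 replicas (subject to maxReplicas and the scaling behavior rules). A sketch of the rendered target, assuming accelDevice is set and a rendered metric prefix of "tei":

target:
  # sum across pods, divided by ready pod count before the comparison
  type: AverageValue
  averageValue: 10   # .Values.autoscaling.queueSizeTarget.accel
metric:
  name: tei_queue_size_sum   # hypothetical rendered metricPrefix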
5 changes: 4 additions & 1 deletion helm-charts/common/tei/values.yaml
@@ -12,9 +12,12 @@ replicaCount: 1
# - Requires custom metrics ConfigMap available in the main application chart
# - https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 2
enabled: false
queueSizeTarget:
accel: 10
cpu: 10

port: 2081
shmSize: 1Gi
helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml
@@ -1,7 +1,7 @@
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
@@ -22,24 +22,19 @@ spec:
kind: Service
name: {{ include "teirerank.fullname" . }}
target:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# the custom metrics API by the number of Pods before comparing to the target
# (pods need to become Ready within the specified stabilization window):
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum
{{- if .Values.accelDevice }}
averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }}
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4 # seconds
metric:
name: {{ include "teirerank.metricPrefix" . }}_request_latency
averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }}
{{- end }}
metric:
name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum
behavior:
scaleDown:
stabilizationWindowSeconds: 180
5 changes: 4 additions & 1 deletion helm-charts/common/teirerank/values.yaml
@@ -12,9 +12,12 @@ replicaCount: 1
# - Requires custom metrics ConfigMap available in the main application chart
# - https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 3
enabled: false
queueSizeTarget:
accel: 10
cpu: 10

port: 2082
shmSize: 1Gi
21 changes: 8 additions & 13 deletions helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml
@@ -1,7 +1,7 @@
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
@@ -22,24 +22,19 @@ spec:
kind: Service
name: {{ include "tgi.fullname" . }}
target:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# the custom metrics API by the number of Pods before comparing to the target
# (pods need to become Ready within the specified stabilization window):
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "tgi.metricPrefix" . }}_queue_size_sum
{{- if .Values.accelDevice }}
averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }}
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4 # seconds
metric:
name: {{ include "tgi.metricPrefix" . }}_request_latency
averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }}
{{- end }}
metric:
name: {{ include "tgi.metricPrefix" . }}_queue_size_sum
behavior:
scaleDown:
stabilizationWindowSeconds: 180
5 changes: 4 additions & 1 deletion helm-charts/common/tgi/values.yaml
@@ -12,9 +12,12 @@ replicaCount: 1
# - Requires custom metrics ConfigMap available in the main application chart
# - https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 4
enabled: false
queueSizeTarget:
accel: 10
cpu: 10

port: 2080
shmSize: 1Gi
helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml
@@ -1,7 +1,7 @@
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
@@ -23,18 +23,18 @@ spec:
name: {{ include "vllm.fullname" . }}
target:
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# the custom metrics API by the number of Pods before comparing to the target
# (pods need to become Ready within the specified stabilization window):
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
{{- if .Values.accelDevice }}
averageValue: 0.1
averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }}
{{- else }}
# allow larger latencies with unaccelerated service
averageValue: 1.0
averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }}
{{- end }}
metric:
name: {{ include "vllm.metricPrefix" . }}_token_latency
name: {{ include "vllm.metricPrefix" . }}_queue_size_sum
behavior:
scaleDown:
stabilizationWindowSeconds: 180
5 changes: 4 additions & 1 deletion helm-charts/common/vllm/values.yaml
@@ -12,9 +12,12 @@ replicaCount: 1
# - Requires custom metrics ConfigMap available in the main application chart
# - https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 4
enabled: false
queueSizeTarget:
accel: 10
cpu: 10

# empty for CPU (longer latencies are tolerated before HPA scaling unaccelerated service)
accelDevice: ""
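The new queueSizeTarget knobs can also be tuned per deployment at install time, without editing the charts. An illustrative values override (the numbers are examples, not recommendations), used together with hpa-values.yaml above so that global.monitoring and the top-level autoscaling stay enabled:

vllm:
  autoscaling:
    enabled: true
    maxReplicas: 8
    queueSizeTarget:
      accel: 5   # scale up earlier on accelerated instances
      cpu: 10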