From e0452d5e11b8dd69e70b35f755a2cbf6e1d960ed Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Fri, 2 May 2025 19:06:14 +0300
Subject: [PATCH 1/7] Disable vLLM warmup with HPA autoscaling

Signed-off-by: Eero Tamminen
---
 helm-charts/chatqna/hpa-values.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/helm-charts/chatqna/hpa-values.yaml b/helm-charts/chatqna/hpa-values.yaml
index ccf17454b..29fb50a16 100644
--- a/helm-charts/chatqna/hpa-values.yaml
+++ b/helm-charts/chatqna/hpa-values.yaml
@@ -23,6 +23,8 @@ global:
 # Enabling "autoscaling" for any of the subcharts requires enabling it also above!
 vllm:
+  # vLLM startup takes too long for autoscaling, especially with Gaudi
+  VLLM_SKIP_WARMUP: "true"
   autoscaling:
     minReplicas: 1
     maxReplicas: 4

From 00bbd13d2ff2c3746ef3a4ac1a8decc2d0e9bf2e Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Mon, 19 May 2025 21:42:13 +0300
Subject: [PATCH 2/7] Simplify HPA rules + add HPA queue size target variable

* Use .Series, .GroupBy and .LabelMatchers to simplify rules
* Drop request latency metric for TGI/TEI. Because it depends on the number
  of generated tokens, it is unsuitable as a generic metric
  - With that, support for HPA Value type could also be dropped (leaving
    only queue size AverageValue)
* Because the vLLM mean token latency metric does not react much to vLLM
  load, and for consistency with TGI/TEI, switch vLLM also to be scaled
  based on queue size
  - KubeAI scales vLLM also based on queue size
* Add queue size target Helm variables for all inferencing engines

Signed-off-by: Eero Tamminen
---
 helm-charts/chatqna/hpa-values.yaml           | 31 +++++++---
 .../templates/custom-metrics-configmap.yaml   | 56 +++++--------------
 .../templates/horizontal-pod-autoscaler.yaml  | 21 +++----
 helm-charts/common/tei/values.yaml            |  5 +-
 .../templates/horizontal-pod-autoscaler.yaml  | 21 +++----
 helm-charts/common/teirerank/values.yaml      |  5 +-
 .../templates/horizontal-pod-autoscaler.yaml  | 21 +++----
 helm-charts/common/tgi/values.yaml            |  5 +-
 .../templates/horizontal-pod-autoscaler.yaml  | 12 ++--
 helm-charts/common/vllm/values.yaml           |  5 +-
 10 files changed, 84 insertions(+), 98 deletions(-)

diff --git a/helm-charts/chatqna/hpa-values.yaml b/helm-charts/chatqna/hpa-values.yaml
index 29fb50a16..8e31fe426 100644
--- a/helm-charts/chatqna/hpa-values.yaml
+++ b/helm-charts/chatqna/hpa-values.yaml
@@ -3,8 +3,9 @@
 # Enable HorizontalPodAutoscaler (HPA)
 #
-# That will overwrite named PrometheusAdapter configMap with ChatQnA specific
-# custom metric queries for embedding, reranking, and LLM services.
+# Will create configMap with ChatQnA specific custom metric queries for embedding, reranking,
+# and LLM services, which can be used to overwrite current PrometheusAdapter rules. This
+# will then provide custom metrics used by HorizontalPodAutoscaler rules of each service.
 #
 # Default upstream configMap is in:
 # - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
@@ -20,27 +21,43 @@ global:
   monitoring: true

 # Override values in specific subcharts
+#
+# Note: enabling "autoscaling" for any of the subcharts requires enabling it also above!

-# Enabling "autoscaling" for any of the subcharts requires enabling it also above!
vllm: # vLLM startup takes too long for autoscaling, especially with Gaudi VLLM_SKIP_WARMUP: "true" autoscaling: + enabled: true minReplicas: 1 maxReplicas: 4 - enabled: true + queueSizeTarget: + accel: 10 + cpu: 10 + tgi: autoscaling: + enabled: true minReplicas: 1 maxReplicas: 4 - enabled: true + queueSizeTarget: + accel: 10 + cpu: 10 + teirerank: autoscaling: + enabled: true minReplicas: 1 maxReplicas: 3 - enabled: true + queueSizeTarget: + accel: 10 + cpu: 10 + tei: autoscaling: + enabled: true minReplicas: 1 maxReplicas: 2 - enabled: true + queueSizeTarget: + accel: 10 + cpu: 10 diff --git a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml index 416b8910b..6a87ae7d0 100644 --- a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml +++ b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml @@ -1,11 +1,11 @@ -# Copyright (C) 2024 Intel Corporation +{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} apiVersion: v1 kind: ConfigMap metadata: - # easy to find for the required manual step + # easy to find for the manual step required to install this for Prometheus-adapter namespace: default name: {{ include "chatqna.fullname" . }}-custom-metrics labels: @@ -13,18 +13,16 @@ metadata: data: config.yaml: | rules: - {{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }} # check metric with: # kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/ | jq # - - seriesQuery: '{__name__="vllm:time_per_output_token_seconds_sum",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}' - # Average output token latency from vLLM histograms, over 1 min - # (interval should be at least 4x serviceMonitor query interval, - # 0.001 divider add is to make sure there's always a valid value) - metricsQuery: 'rate(vllm:time_per_output_token_seconds_sum{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(vllm:time_per_output_token_seconds_count{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]))' + {{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }} + - seriesQuery: '{__name__="vllm:num_requests_waiting",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}' + # Sum of requests waiting to be processed in pods + metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})' name: - matches: ^vllm:time_per_output_token_seconds_sum - as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_token_latency" + matches: ^vllm:num_requests_waiting + as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_queue_size_sum" resources: # HPA needs both namespace + suitable object resource for its query paths: # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/ @@ -34,63 +32,37 @@ data: service: {resource: "service"} {{- end }} {{- if and .Values.tgi.enabled .Values.tgi.autoscaling.enabled }} - {{- if .Values.tgi.accelDevice }} - seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}' # TGI instances queue_size sum - metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>})' + # - GroupBy/LabelMatches provide labels from resources section + metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})' 
name: matches: ^tgi_queue_size as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_queue_size_sum" - {{- else }} - - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}' - # Average request latency from TGI histograms, over 1 min - metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))' - name: - matches: ^tgi_request_inference_duration_sum - as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency" - {{- end }} resources: overrides: namespace: {resource: "namespace"} service: {resource: "service"} {{- end }} - {{- if .Values.teirerank.autoscaling.enabled }} - {{- if .Values.teirerank.accelDevice }} + {{- if and .Values.teirerank.enabled .Values.teirerank.autoscaling.enabled }} - seriesQuery: '{__name__="te_queue_size",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}' # TEI instances queue_size sum - metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>})' + metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})' name: matches: ^te_queue_size as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_queue_size_sum" - {{- else }} - - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}' - # Average request latency from TEI histograms, over 1 min - metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))' - name: - matches: ^te_request_inference_duration_sum - as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency" - {{- end }} resources: overrides: namespace: {resource: "namespace"} service: {resource: "service"} {{- end }} {{- if .Values.tei.autoscaling.enabled }} - {{- if .Values.tei.accelDevice }} - seriesQuery: '{__name__="te_queue_size",service="{{ include "tei.fullname" .Subcharts.tei }}"}' # TEI instances queue_size sum - metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>})' + metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})' name: matches: ^te_queue_size as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_queue_size_sum" - {{- else }} - - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}' - # Average request latency from TEI histograms, over 1 min - metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))' - name: - matches: ^te_request_inference_duration_sum - as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency" - {{- end }} resources: overrides: namespace: {resource: "namespace"} diff --git a/helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml index 92a295728..19b240222 100644 --- 
a/helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml +++ b/helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml @@ -1,7 +1,7 @@ +{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: @@ -22,24 +22,19 @@ spec: kind: Service name: {{ include "tei.fullname" . }} target: -{{- if .Values.accelDevice }} # Metric is sum from all pods. "AverageValue" divides value returned from - # the custom metrics API by the number of Pods before comparing to the target: + # the custom metrics API by the number of Pods before comparing to the target + # (pods need to be in Ready state faster than specified stabilization window): # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics type: AverageValue - averageValue: 15 - metric: - name: {{ include "tei.metricPrefix" . }}_queue_size_sum +{{- if .Values.accelDevice }} + averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }} {{- else }} - # Metric is average for all the pods. To avoid replica fluctuation when pod - # startup + request processing takes longer than HPA evaluation period, this uses - # "Value" (replicas = metric.value / target.value), instead of "AverageValue" type. - type: Value - value: 4 # seconds - metric: - name: {{ include "tei.metricPrefix" . }}_request_latency + averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }} {{- end }} + metric: + name: {{ include "tei.metricPrefix" . }}_queue_size_sum behavior: scaleDown: stabilizationWindowSeconds: 180 diff --git a/helm-charts/common/tei/values.yaml b/helm-charts/common/tei/values.yaml index 652882646..351f4fc58 100644 --- a/helm-charts/common/tei/values.yaml +++ b/helm-charts/common/tei/values.yaml @@ -12,9 +12,12 @@ replicaCount: 1 # - Requires custom metrics ConfigMap available in the main application chart # - https://kubernetes.io/docs/concepts/workloads/autoscaling/ autoscaling: + enabled: false minReplicas: 1 maxReplicas: 2 - enabled: false + queueSizeTarget: + accel: 10 + cpu: 10 port: 2081 shmSize: 1Gi diff --git a/helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml index 0bf47a288..7a046108f 100644 --- a/helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml +++ b/helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml @@ -1,7 +1,7 @@ +{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: @@ -22,24 +22,19 @@ spec: kind: Service name: {{ include "teirerank.fullname" . }} target: -{{- if .Values.accelDevice }} # Metric is sum from all pods. 
"AverageValue" divides value returned from - # the custom metrics API by the number of Pods before comparing to the target: + # the custom metrics API by the number of Pods before comparing to the target + # (pods need to be in Ready state faster than specified stabilization window): # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics type: AverageValue - averageValue: 15 - metric: - name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum +{{- if .Values.accelDevice }} + averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }} {{- else }} - # Metric is average for all the pods. To avoid replica fluctuation when pod - # startup + request processing takes longer than HPA evaluation period, this uses - # "Value" (replicas = metric.value / target.value), instead of "AverageValue" type. - type: Value - value: 4 # seconds - metric: - name: {{ include "teirerank.metricPrefix" . }}_request_latency + averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }} {{- end }} + metric: + name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum behavior: scaleDown: stabilizationWindowSeconds: 180 diff --git a/helm-charts/common/teirerank/values.yaml b/helm-charts/common/teirerank/values.yaml index 79117bc38..b40116ede 100644 --- a/helm-charts/common/teirerank/values.yaml +++ b/helm-charts/common/teirerank/values.yaml @@ -12,9 +12,12 @@ replicaCount: 1 # - Requires custom metrics ConfigMap available in the main application chart # - https://kubernetes.io/docs/concepts/workloads/autoscaling/ autoscaling: + enabled: false minReplicas: 1 maxReplicas: 3 - enabled: false + queueSizeTarget: + accel: 10 + cpu: 10 port: 2082 shmSize: 1Gi diff --git a/helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml index 279aa636e..bc1554245 100644 --- a/helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml +++ b/helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml @@ -1,7 +1,7 @@ +{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: @@ -22,24 +22,19 @@ spec: kind: Service name: {{ include "tgi.fullname" . }} target: -{{- if .Values.accelDevice }} # Metric is sum from all pods. "AverageValue" divides value returned from - # the custom metrics API by the number of Pods before comparing to the target: + # the custom metrics API by the number of Pods before comparing to the target + # (pods need to be in Ready state faster than specified stabilization window): # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics type: AverageValue - averageValue: 15 - metric: - name: {{ include "tgi.metricPrefix" . }}_queue_size_sum +{{- if .Values.accelDevice }} + averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }} {{- else }} - # Metric is average for all the pods. 
To avoid replica fluctuation when pod - # startup + request processing takes longer than HPA evaluation period, this uses - # "Value" (replicas = metric.value / target.value), instead of "AverageValue" type. - type: Value - value: 4 # seconds - metric: - name: {{ include "tgi.metricPrefix" . }}_request_latency + averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }} {{- end }} + metric: + name: {{ include "tgi.metricPrefix" . }}_queue_size_sum behavior: scaleDown: stabilizationWindowSeconds: 180 diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml index 74c0ad2d8..c50a3cc7a 100644 --- a/helm-charts/common/tgi/values.yaml +++ b/helm-charts/common/tgi/values.yaml @@ -12,9 +12,12 @@ replicaCount: 1 # - Requires custom metrics ConfigMap available in the main application chart # - https://kubernetes.io/docs/concepts/workloads/autoscaling/ autoscaling: + enabled: false minReplicas: 1 maxReplicas: 4 - enabled: false + queueSizeTarget: + accel: 10 + cpu: 10 port: 2080 shmSize: 1Gi diff --git a/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml index aeb6fe383..4f866fb3e 100644 --- a/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml +++ b/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml @@ -1,7 +1,7 @@ +{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -{{- if and .Values.global.monitoring .Values.autoscaling.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: @@ -23,18 +23,18 @@ spec: name: {{ include "vllm.fullname" . }} target: # Metric is sum from all pods. "AverageValue" divides value returned from - # the custom metrics API by the number of Pods before comparing to the target: + # the custom metrics API by the number of Pods before comparing to the target + # (pods need to be in Ready state faster than specified stabilization window): # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics type: AverageValue {{- if .Values.accelDevice }} - averageValue: 0.1 + averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }} {{- else }} - # allow larger latencies with unaccelerated service - averageValue: 1.0 + averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }} {{- end }} metric: - name: {{ include "vllm.metricPrefix" . }}_token_latency + name: {{ include "vllm.metricPrefix" . 
}}_queue_size_sum
   behavior:
     scaleDown:
       stabilizationWindowSeconds: 180
diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml
index 2e24029e4..8a605cdaa 100644
--- a/helm-charts/common/vllm/values.yaml
+++ b/helm-charts/common/vllm/values.yaml
@@ -12,9 +12,12 @@ replicaCount: 1
 # - Requires custom metrics ConfigMap available in the main application chart
 # - https://kubernetes.io/docs/concepts/workloads/autoscaling/
 autoscaling:
+  enabled: false
   minReplicas: 1
   maxReplicas: 4
-  enabled: false
+  queueSizeTarget:
+    accel: 10
+    cpu: 10

 # empty for CPU (longer latencies are tolerated before HPA scaling unaccelerated service)
 accelDevice: ""

From f479ea051bddeffda9c7d85300650e040fe45179 Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Wed, 28 May 2025 22:18:16 +0300
Subject: [PATCH 3/7] Fix metrics dashboard next tokens rate query for TGI

Signed-off-by: Eero Tamminen
---
 helm-charts/common/dashboard/templates/configmap-metrics.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helm-charts/common/dashboard/templates/configmap-metrics.yaml b/helm-charts/common/dashboard/templates/configmap-metrics.yaml
index 081da18bd..db8f82387 100644
--- a/helm-charts/common/dashboard/templates/configmap-metrics.yaml
+++ b/helm-charts/common/dashboard/templates/configmap-metrics.yaml
@@ -1137,7 +1137,7 @@ data:
                 "uid": "${Metrics}"
               },
               "editorMode": "code",
-              "expr": "sum by (service)(rate(tgi_request_mean_time_per_token_duration_count{service=\"$release-tgi\",namespace=\"$namespace\"}[$__rate_interval]))",
+              "expr": "sum by (service)(rate(tgi_request_generated_tokens_sum{service=\"$release-tgi\",namespace=\"$namespace\"}[$__rate_interval]))",
              "hide": false,
              "instant": false,
              "legendFormat": "TGI",

From 456481e5f2c0c2a6a7af957a307a77ee17bd7161 Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Fri, 30 May 2025 17:00:16 +0300
Subject: [PATCH 4/7] Change vLLM custom metric from queue size to active requests sum

The TGI and TEI queue size metric covers only requests that are waiting to
be processed, but that number can fluctuate a lot, as it's non-zero only
when a pod is fully utilized and needs to buffer requests. On the plus
side, it's agnostic to how fast an engine instance can process the queries
for a given model.

vLLM also provides a gauge metric for how many requests are currently
being processed (running). Adding that to the waiting requests count
(queue size) makes the resulting metric much more stable, and allows
scaling up extra replicas before current ones are full.

KubeAI autoscaling is also based on the number of active requests, so
results will be more comparable. However, this means that the suitable
threshold will be model and engine config specific (depending on how many
request batches the HW can run in parallel).
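For illustration (example label values only, not part of this change): once
PrometheusAdapter expands the templated rule below, the query it runs against
Prometheus should look roughly like

    sum by (namespace, service) (
        vllm:num_requests_running{namespace="default", service="chatqna-vllm"}
      + vllm:num_requests_waiting{namespace="default", service="chatqna-vllm"}
    )

i.e. a single gauge tracking all requests currently held per vLLM service; the
exact label matchers are generated by the adapter from its resources mapping,
so they may differ slightly in practice.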
Signed-off-by: Eero Tamminen --- helm-charts/chatqna/hpa-values.yaml | 4 ++-- helm-charts/chatqna/templates/custom-metrics-configmap.yaml | 6 +++--- .../common/vllm/templates/horizontal-pod-autoscaler.yaml | 6 +++--- helm-charts/common/vllm/values.yaml | 5 +++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/helm-charts/chatqna/hpa-values.yaml b/helm-charts/chatqna/hpa-values.yaml index 8e31fe426..4899eb4c5 100644 --- a/helm-charts/chatqna/hpa-values.yaml +++ b/helm-charts/chatqna/hpa-values.yaml @@ -31,8 +31,8 @@ vllm: enabled: true minReplicas: 1 maxReplicas: 4 - queueSizeTarget: - accel: 10 + activeRequestsTarget: + accel: 120 cpu: 10 tgi: diff --git a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml index 6a87ae7d0..0774c17c5 100644 --- a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml +++ b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml @@ -18,11 +18,11 @@ data: # {{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }} - seriesQuery: '{__name__="vllm:num_requests_waiting",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}' - # Sum of requests waiting to be processed in pods - metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})' + # Sum of active requests in pods, both ones already being processed, and ones waiting to be processed + metricsQuery: 'sum by (<<.GroupBy>>)(vllm:num_requests_running{<<.LabelMatchers>>} + <<.Series>>{<<.LabelMatchers>>})' name: matches: ^vllm:num_requests_waiting - as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_queue_size_sum" + as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_active_request_sum" resources: # HPA needs both namespace + suitable object resource for its query paths: # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/ diff --git a/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml index 4f866fb3e..fb6c41aa1 100644 --- a/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml +++ b/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml @@ -29,12 +29,12 @@ spec: # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics type: AverageValue {{- if .Values.accelDevice }} - averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }} + averageValue: {{ .Values.autoscaling.activeRequestsTarget.accel }} {{- else }} - averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }} + averageValue: {{ .Values.autoscaling.activeRequestsTarget.cpu }} {{- end }} metric: - name: {{ include "vllm.metricPrefix" . }}_queue_size_sum + name: {{ include "vllm.metricPrefix" . 
}}_active_request_sum
   behavior:
     scaleDown:
       stabilizationWindowSeconds: 180
diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml
index 8a605cdaa..5b692cbcd 100644
--- a/helm-charts/common/vllm/values.yaml
+++ b/helm-charts/common/vllm/values.yaml
@@ -15,8 +15,9 @@ autoscaling:
   enabled: false
   minReplicas: 1
   maxReplicas: 4
-  queueSizeTarget:
-    accel: 10
+  # targeted active requests average per engine pod instance
+  activeRequestsTarget:
+    accel: 100
     cpu: 10

 # empty for CPU (longer latencies are tolerated before HPA scaling unaccelerated service)
 accelDevice: ""

From 30a3265c5505e131f8369a44539d1d666cfc080f Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Tue, 3 Jun 2025 21:16:50 +0300
Subject: [PATCH 5/7] Add autoscaling metric documentation

Signed-off-by: Eero Tamminen
---
 helm-charts/HPA.md | 65 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/helm-charts/HPA.md b/helm-charts/HPA.md
index c0daf61e7..12a09205e 100644
--- a/helm-charts/HPA.md
+++ b/helm-charts/HPA.md
@@ -12,6 +12,10 @@
 - [Install](#install)
 - [Post-install](#post-install)
 - [Verify](#verify)
+- [Scaling metric considerations](#scaling-metric-considerations)
+  - [Autoscaling principles](#autoscaling-principles)
+  - [Current scaling metrics](#current-scaling-metrics)
+  - [Other potential metrics](#other-potential-metrics)

 ## Introduction

@@ -133,3 +137,64 @@ watch -n 5 scale-monitor-helm.sh default chatqna
 **NOTE**: inferencing services provide metrics only after they've processed their first request.
 The reranking service is used only after the query context data has been uploaded. Until then,
 no metrics will be available for them.
+
+## Scaling metric considerations
+
+### Autoscaling principles
+
+The used model, underlying HW and engine parameters are supposed to be selected so that an engine
+instance can satisfy service SLA (Service Level Agreement) requirements for its own requests,
+even when it is becoming saturated. Autoscaling is then intended to scale up the service so that
+requests can be directed to unsaturated instances.
+
+Problem is finding a good metric, and its threshold, for indicating this saturation point.
+Preferably it should be something that can anticipate this point, so that startup delay for
+the new engine instances does not cause SLA breakage (or in worse case requests being rejected,
+if engine queue fills up).
+
+Note: Another problem is Kubernetes service routing sending requests (also) to already saturated
+instances, instead of idle ones. Using [KubeAI](../kubeai/#readme) (instead of HPA) to manage
+both engine scaling + query routing can solve that.
+
+### Current scaling metrics
+
+Currently following inference engine metrics are used to autoscale their replica counts:
+
+- vLLM: Active requests i.e. count of waiting (queued) + (already) running requests
+  - Good overall scaling metric, used also by [KubeAI](../kubeai/#readme) for scaling vLLM
+  - Threshold depends on how many requests underlying HW / engine config can process for given model in parallel
+- TGI / TEI: Queue size, i.e.
how many requests are waiting to be processed + - Used because TGI and TEI do not offer metric for (already) running requests, just waiting ones + - Independent of the used model, so works well as an example, but not that good for production because + scaling happens late and fluctuates a lot (due to metric dropping to zero when engine is not saturated) + +### Other potential metrics + +All the metrics provided by the inference engines are listed in their documentation: + +- [vLLM metrics](https://docs.vllm.ai/en/v0.8.5/serving/metrics.html) + - [Metric design](https://docs.vllm.ai/en/v0.8.5/design/v1/metrics.html) +- [TGI metrics](https://huggingface.co/docs/text-generation-inference/en/reference/metrics) + - TEI (embed and reranking) services provide a subset of these TGI metrics + +OPEA application [dashboard](monitoring.md#dashboards) provides (Prometheus query) examples +for deriving service performance metrics out of engine Histogram metrics. + +Their suitability for autoscaling: + +- Request latency, request per second (RPS) - not suitable + - Depends completely on input and output token counts and is an indicator for past performance, not incoming load +- First token latency (TTFT) - potential + - Relevancy depends on use-case; number of used tokens and what's important +- Next token latency (TPOT, ITL), tokens per second (TPS) - potential + - Relevancy depends on use-case; number of used tokens and what's important + +Performance metrics will be capped by the performance of the underlying engine setup +=> at some point, they stop corresponding to incoming load / how much scaling would be needed. + +Therefore such metrics could be used in production _when_ their thresholds are carefully +fine-tuned and rechecked every time underlying setup (model, HW, engine config) changes. +In OPEA Helm charts that setup is user selectable, so such metrics are unsuitable for +autoscaling examples. + +(General [explanation](https://docs.nvidia.com/nim/benchmarking/llm/latest/metrics.html) on how these metrics are measured.) From a39b5c023048979729c1366c09db304b8e87154a Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Wed, 4 Jun 2025 20:56:58 +0300 Subject: [PATCH 6/7] Improve ChatQnA HPA enabling comments Signed-off-by: Eero Tamminen --- helm-charts/chatqna/hpa-values.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/helm-charts/chatqna/hpa-values.yaml b/helm-charts/chatqna/hpa-values.yaml index 4899eb4c5..5a20350fc 100644 --- a/helm-charts/chatqna/hpa-values.yaml +++ b/helm-charts/chatqna/hpa-values.yaml @@ -1,29 +1,30 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Enable HorizontalPodAutoscaler (HPA) +# Enable HorizontalPodAutoscaler (HPA) for ChatQnA and its components # # Will create configMap with ChatQnA specific custom metric queries for embedding, reranking, -# and LLM services, which can be used to overwrite current PrometheusAdapter rules. This -# will then provide custom metrics used by HorizontalPodAutoscaler rules of each service. +# and LLM inferencing services, which can be used to overwrite current PrometheusAdapter rules. +# This will then provide custom metrics used by HorizontalPodAutoscaler rules of each service. 
#
-# Default upstream configMap is in:
+# Default upstream adapter configMap is in:
 # - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml

-dashboard:
-  scaling: true
-
 autoscaling:
   enabled: true

 global:
-  # K8s custom metrics (used for scaling thresholds) are based on metrics from service
+  # Both Grafana dashboards and k8s custom metrics need (Prometheus) metrics for services
   monitoring: true

 # Override values in specific subcharts
 #
 # Note: enabling "autoscaling" for any of the subcharts requires enabling it also above!

+dashboard:
+  # add also scaling metrics dashboard to Grafana
+  scaling: true
+
 vllm:
   # vLLM startup takes too long for autoscaling, especially with Gaudi
   VLLM_SKIP_WARMUP: "true"

From f2370c0742526e4f6a27de1dca06f210c4d48021 Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Mon, 23 Jun 2025 15:09:07 +0300
Subject: [PATCH 7/7] HPA document improvement suggestions from Markus

Also use the same formatting for all notes.

Signed-off-by: Eero Tamminen
---
 helm-charts/HPA.md | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/helm-charts/HPA.md b/helm-charts/HPA.md
index 12a09205e..695623079 100644
--- a/helm-charts/HPA.md
+++ b/helm-charts/HPA.md
@@ -66,8 +66,8 @@ $ helm install prometheus-adapter prometheus-community/prometheus-adapter --ver
   --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false
 ```

-NOTE: the service name given above in `prometheus.url` must match the listed Prometheus service name,
-otherwise adapter cannot access it!
+> **NOTE**: the service name given above in `prometheus.url` must match the listed Prometheus
+> service name, otherwise the adapter cannot access it!

 (Alternative for setting the above `prometheusSpec` variable to `false` is making sure that
 `prometheusRelease` value in top-level chart matches the release name given to the Prometheus
@@ -134,9 +134,9 @@ watch -n 5 scale-monitor-helm.sh default chatqna

 (Assumes that HPA scaled chart is installed to `default` namespace with `chatqna` release name.)

-**NOTE**: inferencing services provide metrics only after they've processed their first request.
-The reranking service is used only after the query context data has been uploaded. Until then,
-no metrics will be available for them.
+> **NOTE**: inferencing services provide metrics only after they've processed their first request.
+> The reranking service is used only after the query context data has been uploaded. Until then,
+> no metrics will be available for them.

 ## Scaling metric considerations

@@ -149,16 +149,16 @@ requests can be directed to unsaturated instances.

 Problem is finding a good metric, and its threshold, for indicating this saturation point.
 Preferably it should be something that can anticipate this point, so that startup delay for
-the new engine instances does not cause SLA breakage (or in worse case requests being rejected,
-if engine queue fills up).
+the new engine instances does not cause SLA breakage (or in the worst case requests being
+rejected, if the engine queue fills up).

-Note: Another problem is Kubernetes service routing sending requests (also) to already saturated
-instances, instead of idle ones. Using [KubeAI](../kubeai/#readme) (instead of HPA) to manage
-both engine scaling + query routing can solve that.
+> **NOTE**: Another problem is Kubernetes service routing sending requests (also) to already saturated
+> instances, instead of idle ones.
Using [KubeAI](../kubeai/#readme) (instead of HPA) to manage +> both engine scaling + query routing can solve that. ### Current scaling metrics -Currently following inference engine metrics are used to autoscale their replica counts: +The following inference engine metrics are used to autoscale their replica counts: - vLLM: Active requests i.e. count of waiting (queued) + (already) running requests - Good overall scaling metric, used also by [KubeAI](../kubeai/#readme) for scaling vLLM @@ -166,7 +166,7 @@ Currently following inference engine metrics are used to autoscale their replica - TGI / TEI: Queue size, i.e. how many requests are waiting to be processed - Used because TGI and TEI do not offer metric for (already) running requests, just waiting ones - Independent of the used model, so works well as an example, but not that good for production because - scaling happens late and fluctuates a lot (due to metric dropping to zero when engine is not saturated) + scaling happens late and fluctuates a lot (due to metric being zero when engine is not saturated) ### Other potential metrics @@ -189,8 +189,9 @@ Their suitability for autoscaling: - Next token latency (TPOT, ITL), tokens per second (TPS) - potential - Relevancy depends on use-case; number of used tokens and what's important -Performance metrics will be capped by the performance of the underlying engine setup -=> at some point, they stop corresponding to incoming load / how much scaling would be needed. +Performance metrics will be capped by the performance of the underlying engine setup. +Beyond a certain point, they no longer reflect the actual incoming load or indicate how +much scaling is needed. Therefore such metrics could be used in production _when_ their thresholds are carefully fine-tuned and rechecked every time underlying setup (model, HW, engine config) changes.
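To make the threshold tuning discussed above concrete, the per-engine targets introduced in this
series can be overridden at install time. A minimal sketch, with a hypothetical file name and
placeholder numbers rather than recommendations:

```yaml
# hpa-targets.yaml -- hypothetical override applied on top of hpa-values.yaml
vllm:
  autoscaling:
    enabled: true
    maxReplicas: 6
    activeRequestsTarget:
      # how many in-flight requests one engine instance handles well for the chosen model/HW
      accel: 64
      cpu: 8
tgi:
  autoscaling:
    enabled: true
    queueSizeTarget:
      accel: 10
      cpu: 10
```

Such a file would be passed to Helm with an additional `-f` flag after `hpa-values.yaml`, and its
values re-checked whenever the model, HW or engine configuration changes.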