31 changes: 24 additions & 7 deletions helm-charts/chatqna/hpa-values.yaml
@@ -3,8 +3,9 @@

# Enable HorizontalPodAutoscaler (HPA)
#
# That will overwrite named PrometheusAdapter configMap with ChatQnA specific
# custom metric queries for embedding, reranking, and LLM services.
# Will create a configMap with ChatQnA-specific custom metric queries for the embedding,
# reranking, and LLM services, which can be used to overwrite the current PrometheusAdapter
# rules. These provide the custom metrics used by each service's HorizontalPodAutoscaler rules.
#
# Default upstream configMap is in:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
@@ -20,27 +21,43 @@ global:
monitoring: true

# Override values in specific subcharts
#
# Note: enabling "autoscaling" for any of the subcharts requires enabling it also above!

# Enabling "autoscaling" for any of the subcharts requires enabling it also above!
vllm:
# vLLM startup takes too long for autoscaling, especially with Gaudi
VLLM_SKIP_WARMUP: "true"
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 4
enabled: true
queueSizeTarget:
accel: 10
cpu: 10

tgi:
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 4
enabled: true
queueSizeTarget:
accel: 10
cpu: 10

teirerank:
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 3
enabled: true
queueSizeTarget:
accel: 10
cpu: 10

tei:
autoscaling:
enabled: true
minReplicas: 1
maxReplicas: 2
enabled: true
queueSizeTarget:
accel: 10
cpu: 10
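Note that installing the generated rules into Prometheus-adapter remains a manual step. A minimal sketch of one way to wire it up, assuming the adapter was installed from the prometheus-community Helm chart and the release is named "chatqna" (the rules.existing value and the rendered ConfigMap name are assumptions, not verified chart API; if the adapter runs in a different namespace than the ConfigMap, e.g. "monitoring", the ConfigMap first needs to be copied there):

# prometheus-adapter chart values override (a sketch under the above assumptions)
rules:
  default: false
  # name of an existing ConfigMap (with a "config.yaml" key) holding the rules
  existing: chatqna-custom-metrics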
56 changes: 14 additions & 42 deletions helm-charts/chatqna/templates/custom-metrics-configmap.yaml
@@ -1,30 +1,28 @@
# Copyright (C) 2024 Intel Corporation
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
# easy to find for the required manual step
# easy to find for the manual step required to install these rules into Prometheus-adapter
namespace: default
name: {{ include "chatqna.fullname" . }}-custom-metrics
labels:
app.kubernetes.io/name: prometheus-adapter
data:
config.yaml: |
rules:
{{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }}
# check metric with:
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric> | jq
#
- seriesQuery: '{__name__="vllm:time_per_output_token_seconds_sum",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}'
# Average output token latency from vLLM histograms, over 1 min
# (interval should be at least 4x serviceMonitor query interval,
# 0.001 divider add is to make sure there's always a valid value)
metricsQuery: 'rate(vllm:time_per_output_token_seconds_sum{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(vllm:time_per_output_token_seconds_count{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]))'
{{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }}
- seriesQuery: '{__name__="vllm:num_requests_waiting",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}'
# Sum of requests waiting to be processed in pods
metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
name:
matches: ^vllm:time_per_output_token_seconds_sum
as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_token_latency"
matches: ^vllm:num_requests_waiting
as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_queue_size_sum"
resources:
# HPA needs both namespace + suitable object resource for its query paths:
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric>
@@ -34,63 +32,37 @@ data:
service: {resource: "service"}
{{- end }}
{{- if and .Values.tgi.enabled .Values.tgi.autoscaling.enabled }}
{{- if .Values.tgi.accelDevice }}
- seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# TGI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>})'
# - GroupBy/LabelMatchers provide labels from the resources section
metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
name:
matches: ^tgi_queue_size
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# Average request latency from TGI histograms, over 1 min
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^tgi_request_inference_duration_sum
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- if .Values.teirerank.autoscaling.enabled }}
{{- if .Values.teirerank.accelDevice }}
{{- if and .Values.teirerank.enabled .Values.teirerank.autoscaling.enabled }}
- seriesQuery: '{__name__="te_queue_size",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# TEI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>})'
metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
name:
matches: ^te_queue_size
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- if .Values.tei.autoscaling.enabled }}
{{- if .Values.tei.accelDevice }}
- seriesQuery: '{__name__="te_queue_size",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# TEI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>})'
metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
name:
matches: ^te_queue_size
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
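For illustration, with a Helm release named "chatqna" the vLLM rule above would render roughly as follows (the release name and the rendered metric prefix "vllm" are assumptions):

- seriesQuery: '{__name__="vllm:num_requests_waiting",service="chatqna-vllm"}'
  # sum of waiting requests across all pods behind the service
  metricsQuery: 'sum by (<<.GroupBy>>)(<<.Series>>{<<.LabelMatchers>>})'
  name:
    matches: ^vllm:num_requests_waiting
    as: "vllm_queue_size_sum"
  resources:
    overrides:
      namespace: {resource: "namespace"}
      service: {resource: "service"}

Prometheus-adapter then serves the renamed metric at /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/vllm_queue_size_sum, which is what the HPA objects below query.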
21 changes: 8 additions & 13 deletions helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml
@@ -1,7 +1,7 @@
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
@@ -22,24 +22,19 @@ spec:
kind: Service
name: {{ include "tei.fullname" . }}
target:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# the custom metrics API by the number of Pods before comparing to the target
# (pods need to become Ready within the specified stabilization window):
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "tei.metricPrefix" . }}_queue_size_sum
{{- if .Values.accelDevice }}
averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }}
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4 # seconds
metric:
name: {{ include "tei.metricPrefix" . }}_request_latency
averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }}
{{- end }}
metric:
name: {{ include "tei.metricPrefix" . }}_queue_size_sum
behavior:
scaleDown:
stabilizationWindowSeconds: 180
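To make the AverageValue math concrete: HPA divides the summed queue size by the number of ready pods and compares the result to the target, i.e. desiredReplicas = ceil(currentReplicas * perPodAverage / target). With a target of 10 and te_queue_size_sum of 45 across 3 pods, the per-pod average is 15, so HPA requests ceil(3 * 15 / 10) = 5 replicas (subject to maxReplicas and the scaling behavior rules). A sketch of the rendered target, assuming accelDevice is set and a rendered metric prefix of "tei":

target:
  # sum across pods, divided by ready pod count before the comparison
  type: AverageValue
  averageValue: 10   # .Values.autoscaling.queueSizeTarget.accel
metric:
  name: tei_queue_size_sum   # hypothetical rendered metricPrefix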
5 changes: 4 additions & 1 deletion helm-charts/common/tei/values.yaml
@@ -12,9 +12,12 @@ replicaCount: 1
# - Requires custom metrics ConfigMap available in the main application chart
# - https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 2
enabled: false
queueSizeTarget:
accel: 10
cpu: 10

port: 2081
shmSize: 1Gi
helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml
@@ -1,7 +1,7 @@
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
@@ -22,24 +22,19 @@ spec:
kind: Service
name: {{ include "teirerank.fullname" . }}
target:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# the custom metrics API by the number of Pods before comparing to the target
# (pods need to become Ready within the specified stabilization window):
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum
{{- if .Values.accelDevice }}
averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }}
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4 # seconds
metric:
name: {{ include "teirerank.metricPrefix" . }}_request_latency
averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }}
{{- end }}
metric:
name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum
behavior:
scaleDown:
stabilizationWindowSeconds: 180
5 changes: 4 additions & 1 deletion helm-charts/common/teirerank/values.yaml
@@ -12,9 +12,12 @@ replicaCount: 1
# - Requires custom metrics ConfigMap available in the main application chart
# - https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 3
enabled: false
queueSizeTarget:
accel: 10
cpu: 10

port: 2082
shmSize: 1Gi
21 changes: 8 additions & 13 deletions helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml
@@ -1,7 +1,7 @@
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
@@ -22,24 +22,19 @@ spec:
kind: Service
name: {{ include "tgi.fullname" . }}
target:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# the custom metrics API by the number of Pods before comparing to the target
# (pods need to become Ready within the specified stabilization window):
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "tgi.metricPrefix" . }}_queue_size_sum
{{- if .Values.accelDevice }}
averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }}
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4 # seconds
metric:
name: {{ include "tgi.metricPrefix" . }}_request_latency
averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }}
{{- end }}
metric:
name: {{ include "tgi.metricPrefix" . }}_queue_size_sum
behavior:
scaleDown:
stabilizationWindowSeconds: 180
5 changes: 4 additions & 1 deletion helm-charts/common/tgi/values.yaml
@@ -12,9 +12,12 @@ replicaCount: 1
# - Requires custom metrics ConfigMap available in the main application chart
# - https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 4
enabled: false
queueSizeTarget:
accel: 10
cpu: 10

port: 2080
shmSize: 1Gi
helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml
@@ -1,7 +1,7 @@
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
@@ -23,18 +23,18 @@ spec:
name: {{ include "vllm.fullname" . }}
target:
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# the custom metrics API by the number of Pods before comparing to the target
# (pods need to become Ready within the specified stabilization window):
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
{{- if .Values.accelDevice }}
averageValue: 0.1
averageValue: {{ .Values.autoscaling.queueSizeTarget.accel }}
{{- else }}
# allow larger latencies with unaccelerated service
averageValue: 1.0
averageValue: {{ .Values.autoscaling.queueSizeTarget.cpu }}
{{- end }}
metric:
name: {{ include "vllm.metricPrefix" . }}_token_latency
name: {{ include "vllm.metricPrefix" . }}_queue_size_sum
behavior:
scaleDown:
stabilizationWindowSeconds: 180
5 changes: 4 additions & 1 deletion helm-charts/common/vllm/values.yaml
@@ -12,9 +12,12 @@ replicaCount: 1
# - Requires custom metrics ConfigMap available in the main application chart
# - https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 4
enabled: false
queueSizeTarget:
accel: 10
cpu: 10

# empty for CPU (longer latencies are tolerated before HPA scaling unaccelerated service)
accelDevice: ""
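The new queueSizeTarget knobs can also be tuned per deployment at install time, without editing the charts. An illustrative values override (the numbers are examples, not recommendations), used together with hpa-values.yaml above so that global.monitoring and the top-level autoscaling stay enabled:

vllm:
  autoscaling:
    enabled: true
    maxReplicas: 8
    queueSizeTarget:
      accel: 5   # scale up earlier on accelerated instances
      cpu: 10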