diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-and-redis-lookup-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-and-redis-lookup-preset.yaml index e6a2074..5d9085a 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-and-redis-lookup-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-and-redis-lookup-preset.yaml @@ -54,6 +54,15 @@ data: - "--port=8000" - "--vllm-port=8001" - "--connector=nixlv2" + env: + - name: OTEL_TRACING_ENABLED + value: '{{ if and .Values.tracing.enabled .Values.tracing.components.routingProxy }}true{{ else }}false{{ end }}' + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: '{{ .Values.tracing.otelCollectorEndpoint }}' + - name: OTEL_SERVICE_NAME + value: "llm-d-routing-sidecar" + - name: OTEL_SAMPLING_RATE + value: '{{ .Values.tracing.samplingRate }}' ports: - containerPort: 8000 protocol: TCP diff --git a/charts/llm-d/values.schema.json b/charts/llm-d/values.schema.json index 47a46fa..a93eecd 100644 --- a/charts/llm-d/values.schema.json +++ b/charts/llm-d/values.schema.json @@ -6867,6 +6867,70 @@ "required": [], "title": "nameOverride" }, + "tracing": { + "additionalProperties": false, + "default": { + "enabled": false + }, + "description": "Distributed tracing configuration for llm-d components", + "properties": { + "alwaysPropagateContext": { + "default": true, + "description": "Always propagate trace context even when tracing is disabled", + "type": "boolean" + }, + "apiToken": { + "default": "", + "description": "API token for trace export (if required by collector)", + "type": "string" + }, + "components": { + "additionalProperties": false, + "description": "Per-component tracing configuration", + "properties": { + "eppInferenceScheduler": { + "default": true, + "description": "Enable tracing for EPP inference scheduler (includes kv-cache-manager)", + "type": "boolean" + }, + "inferenceGateway": { + "default": true, + 
"description": "Enable tracing for inference gateway", + "type": "boolean" + }, + "routingProxy": { + "default": true, + "description": "Enable tracing for routing proxy (llm-d-routing-sidecar)", + "type": "boolean" + }, + "vllm": { + "default": true, + "description": "Enable tracing for vLLM instances", + "type": "boolean" + } + }, + "type": "object" + }, + "enabled": { + "default": false, + "description": "Global tracing enablement (can be overridden per component)", + "type": "boolean" + }, + "otelCollectorEndpoint": { + "default": "http://otel-collector:4317", + "description": "OpenTelemetry collector endpoint", + "type": "string" + }, + "samplingRate": { + "default": 0.1, + "description": "Sampling rate for traces (0.0 to 1.0)", + "type": "number" + } + }, + "required": [], + "title": "tracing", + "type": "object" + }, "redis": { "$schema": "http://json-schema.org/schema#", "properties": { diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml index cd62337..4e64e68 100644 --- a/charts/llm-d/values.yaml +++ b/charts/llm-d/values.yaml @@ -607,6 +607,14 @@ modelservice: value: "false" - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT value: "1" + - name: OTEL_TRACING_ENABLED + value: '{{ if and .Values.tracing.enabled .Values.tracing.components.eppInferenceScheduler }}true{{ else }}false{{ end }}' + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: '{{ .Values.tracing.otelCollectorEndpoint }}' + - name: OTEL_SERVICE_NAME + value: "llm-d-kv-cache-manager" + - name: OTEL_SAMPLING_RATE + value: '{{ .Values.tracing.samplingRate }}' # @schema # items: @@ -927,6 +935,35 @@ modelservice: # -- Enable the creation of RBAC resources create: true +# -- Distributed tracing configuration for llm-d components +# @default -- Tracing disabled by default +tracing: + # -- Global tracing enablement (can be overridden per component) + enabled: false + + # -- OpenTelemetry collector endpoint + otelCollectorEndpoint: "http://otel-collector:4317" + + # -- API token for trace export 
(if required by collector) + apiToken: "" + + # -- Sampling rate for traces (0.0 to 1.0) + samplingRate: 0.1 + + # -- Per-component tracing configuration + components: + # -- Enable tracing for EPP inference scheduler (includes kv-cache-manager) + eppInferenceScheduler: true + + # -- Enable tracing for inference gateway + inferenceGateway: true + + # -- Enable tracing for routing proxy (llm-d-routing-sidecar) + routingProxy: true + + # -- Enable tracing for vLLM instances + vllm: true + # @schema # $ref: https://raw.githubusercontent.com/bitnami/charts/refs/tags/redis/20.13.4/bitnami/redis/values.schema.json # @schema diff --git a/quickstart/examples/choose-adventure.yaml b/quickstart/examples/choose-adventure.yaml new file mode 100644 index 0000000..093581a --- /dev/null +++ b/quickstart/examples/choose-adventure.yaml @@ -0,0 +1,83 @@ +# Tested on AWS g6.12xlarge, minikube setup +# ./llmd-installer.sh --minikube --values-file examples/choose-adventure.yaml \ + +sampleApplication: + baseConfigMapRefName: basic-gpu-with-nixl-and-redis-lookup-preset + model: + modelArtifactURI: hf://meta-llama/Llama-3.2-3B-Instruct + modelName: "meta-llama/Llama-3.2-3B-Instruct" + resources: + limits: + nvidia.com/gpu: 2 + requests: + nvidia.com/gpu: 2 + prefill: + replicas: 1 + extraArgs: + - "--tensor-parallel-size" + - "2" + - "--distributed-executor-backend" + - "mp" + - "--max-model-len" + - "20000" + - '--enable-auto-tool-choice' + - '--tool-call-parser' + - llama3_json + - '--chat-template' + - /workspace/vllm/examples/tool_chat_template_llama3.2_json.jinja + decode: + replicas: 1 + extraArgs: + - "--tensor-parallel-size" + - "2" + - "--distributed-executor-backend" + - "mp" + - '--enable-auto-tool-choice' + - '--tool-call-parser' + - llama3_json + - '--chat-template' + - /workspace/vllm/examples/tool_chat_template_llama3.2_json.jinja + - "--max-model-len" + - "20000" +redis: + enabled: true +modelservice: + routingProxy: + image: + registry: quay.io + repository: 
sallyom/llm-d-routing-sidecar + tag: tracing-dev + epp: + image: + registry: quay.io + repository: sallyom/llm-d-inference-scheduler + tag: tracing-dev-1 + defaultEnvVarsOverride: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "true" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "true" + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "true" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "true" +tracing: + enabled: true + otelCollectorEndpoint: "otel-collector-collector.tracing.svc.cluster.local:4317" + samplingRate: 0.1 + alwaysPropagateContext: true + components: + eppInferenceScheduler: true + routingProxy: true