Commit ef35e8c

ilana-n (Ilana Nguyen) authored and committed
fix: fixed container stopping problem and added tags for vllm in gpu telemetry documentation

1 parent 1d031b8

2 files changed: +22 −3 lines

docs/tutorials/gpu-telemetry.md (6 additions, 0 deletions)

````diff
@@ -141,6 +141,7 @@ This path works with **vLLM, SGLang, TRT-LLM, or any inference server**. We'll u
 
 The setup includes three steps: creating a custom metrics configuration, starting the DCGM Exporter, and launching the vLLM server.
 
+<!-- setup-vllm-gpu-telemetry-default-openai-endpoint-server -->
 ```bash
 # Step 1: Create a custom metrics configuration
 cat > custom_gpu_metrics.csv << 'EOF'
@@ -204,6 +205,7 @@ docker run -d --name vllm-server \
   --host 0.0.0.0 \
   --port 8000
 ```
+<!-- /setup-vllm-gpu-telemetry-default-openai-endpoint-server -->
 
 > [!TIP]
 > You can customize the `custom_gpu_metrics.csv` file by commenting out metrics you don't need. Lines starting with `#` are ignored.
@@ -246,6 +248,7 @@ uv pip install ./aiperf
 
 ## Verify Everything is Running
 
+<!-- health-check-vllm-gpu-telemetry-default-openai-endpoint-server -->
 ```bash
 # Wait for vLLM inference server to be ready (up to 15 minutes)
 timeout 900 bash -c 'while [ "$(curl -s -o /dev/null -w "%{http_code}" localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d "{\"model\":\"Qwen/Qwen3-0.6B\",\"messages\":[{\"role\":\"user\",\"content\":\"test\"}],\"max_tokens\":1}")" != "200" ]; do sleep 2; done' || { echo "vLLM not ready after 15min"; exit 1; }
@@ -255,9 +258,11 @@ echo "vLLM ready, waiting for DCGM metrics to be available..."
 timeout 120 bash -c 'while true; do OUTPUT=$(curl -s localhost:9401/metrics); if echo "$OUTPUT" | grep -q "DCGM_FI_DEV_GPU_UTIL"; then break; fi; echo "Waiting for DCGM metrics..."; sleep 5; done' || { echo "GPU utilization metrics not found after 2min"; exit 1; }
 echo "DCGM GPU metrics are now available"
 ```
+<!-- /health-check-vllm-gpu-telemetry-default-openai-endpoint-server -->
 
 ## Run AIPerf Benchmark
 
+<!-- aiperf-run-vllm-gpu-telemetry-default-openai-endpoint-server -->
 ```bash
 aiperf profile \
   --model Qwen/Qwen3-0.6B \
@@ -278,6 +283,7 @@ aiperf profile \
   --random-seed 100 \
   --gpu-telemetry
 ```
+<!-- /aiperf-run-vllm-gpu-telemetry-default-openai-endpoint-server -->
 
 ## Multi-Node GPU Telemetry Example
````
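The paired HTML comment tags added to the tutorial (e.g. `<!-- setup-... -->` / `<!-- /setup-... -->`) suggest that the docs CI extracts the enclosed bash blocks and runs them end to end. A minimal sketch of how such extraction could work in Python; the function name and regexes are assumptions for illustration, not the actual test-runner code:

```python
import re

FENCE = "`" * 3  # build the markdown fence string to keep this example self-contained


def extract_tagged_bash(markdown: str, tag: str) -> list[str]:
    """Return bash snippets wrapped between <!-- tag --> and <!-- /tag --> markers.

    Hypothetical helper: the real test runner's extraction logic may differ.
    """
    # Capture everything between the opening and closing marker pair.
    section_re = re.compile(
        rf"<!--\s*{re.escape(tag)}\s*-->(.*?)<!--\s*/{re.escape(tag)}\s*-->",
        re.DOTALL,
    )
    # Then pull the bodies of any ```bash fences inside that section.
    fence_re = re.compile(rf"{FENCE}bash\n(.*?){FENCE}", re.DOTALL)
    snippets = []
    for section in section_re.findall(markdown):
        snippets.extend(body.strip() for body in fence_re.findall(section))
    return snippets


tag = "setup-vllm-gpu-telemetry-default-openai-endpoint-server"
doc = "\n".join([
    f"<!-- {tag} -->",
    f"{FENCE}bash",
    'echo "step 1"',
    FENCE,
    f"<!-- /{tag} -->",
])
print(extract_tagged_bash(doc, tag))  # → ['echo "step 1"']
```

Pairing an explicit closing marker with each opening marker keeps the extraction robust when a page contains several unrelated code blocks between sections.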

tests/ci/test_docs_end_to_end/test_runner.py (16 additions, 3 deletions)

```diff
@@ -325,6 +325,7 @@ def _graceful_server_shutdown(self, server_name: str):
 timeout 30 bash -c '
 echo "Stopping Docker Compose services..."
 docker compose -f docker-compose.yml down 2>/dev/null || true
+sleep 3
 
 echo "Stopping Dynamo containers..."
 # Stop containers by Dynamo image
@@ -344,12 +345,19 @@ def _graceful_server_shutdown(self, server_name: str):
 logger.info("Executing vLLM graceful shutdown...")
 shutdown_cmd = """
 timeout 30 bash -c '
+echo "Stopping DCGM exporter containers..."
+# Stop DCGM exporter containers explicitly since they are brought up separately
+docker stop dcgm-exporter 2>/dev/null || true
+docker rm dcgm-exporter 2>/dev/null || true
+docker ps --filter ancestor=*dcgm-exporter* --format "{{.ID}}" | xargs -r docker stop 2>/dev/null || true
+docker ps -aq --filter ancestor=*dcgm-exporter* | xargs -r docker rm 2>/dev/null || true
+
 echo "Stopping vLLM containers..."
-# Stop containers by vLLM image
-docker ps --filter ancestor=*vllm* --format "{{.ID}}" | xargs -r docker stop 2>/dev/null || true
+# Stop containers with vllm in image name
+docker ps --format "{{.ID}} {{.Image}}" | grep vllm | awk "{print \$1}" | xargs -r docker stop 2>/dev/null || true
 
 # Remove containers
-docker ps -aq --filter ancestor=*vllm* | xargs -r docker rm 2>/dev/null || true
+docker ps -aq --format "{{.ID}} {{.Image}}" | grep vllm | awk "{print \$1}" | xargs -r docker rm 2>/dev/null || true
 
 echo "vLLM graceful shutdown completed"
 '
@@ -363,6 +371,11 @@ def _graceful_server_shutdown(self, server_name: str):
 echo "Stopping containers for {server_name}..."
 docker ps --filter name={server_name} --format "{{.ID}}" | xargs -r docker stop 2>/dev/null || true
 docker ps -aq --filter name={server_name} | xargs -r docker rm 2>/dev/null || true
+
+echo "Stopping DCGM containers..."
+docker stop dcgm-exporter 2>/dev/null || true
+docker rm dcgm-exporter 2>/dev/null || true
+
 echo "Generic server shutdown completed"
 '
 """
```
