Hey folks,
I am using the inference benchmarker to benchmark gemma-3-27B. I've made the following baseline calculation for the case of 2x H100 GPUs: about 4k tokens/second. My assumption was that if I spawn 4x vLLM instances of gemma (each vLLM instance getting 2 GPUs, 8 GPUs in total), I should get around 16k tokens/second. I'm using nginx as my load balancer. This assumption did not materialize: with 4 vLLM containers behind the load balancer, I am getting around 6k tokens/second. My nginx config and Docker Compose file are below. Any help is much appreciated.
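Before each run I sanity-check that the balancer and all four backends answer. A minimal sketch, assuming nginx is reachable on localhost:80 and the backends on the host ports from the compose file below:

# Minimal sketch: confirm the balancer and each vLLM backend answer /v1/models.
# Assumes nginx on localhost:80 and backends on host ports 5858-5861
# (see the compose file below).
import json
import urllib.request

endpoints = {
    "nginx": "http://localhost:80",
    "gemma3_01": "http://localhost:5858",
    "gemma3_23": "http://localhost:5859",
    "gemma3_45": "http://localhost:5860",
    "gemma3_67": "http://localhost:5861",
}

for name, base in endpoints.items():
    try:
        with urllib.request.urlopen(f"{base}/v1/models", timeout=5) as resp:
            models = [m["id"] for m in json.loads(resp.read())["data"]]
            print(f"{name}: OK, serves {models}")
    except Exception as exc:  # any failure just means "unreachable" here
        print(f"{name}: FAILED ({exc})")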
nginx config:
worker_processes auto;

events {
    worker_connections 1024;
}

http {
    upstream backend {
        random two least_conn;
        server gemma3_benchmark_01:8000 max_fails=3 fail_timeout=10000s;
        server gemma3_benchmark_23:8000 max_fails=3 fail_timeout=10000s;
        server gemma3_benchmark_45:8000 max_fails=3 fail_timeout=10000s;
        server gemma3_benchmark_67:8000 max_fails=3 fail_timeout=10000s;
    }

    server {
        listen 80;

        location / {
            proxy_pass http://backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }
    }
}
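One thing I'm not sure about is connection handling: by default nginx proxies with HTTP/1.0 and opens a fresh upstream connection per request. A variant I'm considering (a sketch, not verified to close the gap) enables upstream keepalive and disables response buffering for streamed tokens:

upstream backend {
    random two least_conn;
    server gemma3_benchmark_01:8000 max_fails=3 fail_timeout=10000s;
    server gemma3_benchmark_23:8000 max_fails=3 fail_timeout=10000s;
    server gemma3_benchmark_45:8000 max_fails=3 fail_timeout=10000s;
    server gemma3_benchmark_67:8000 max_fails=3 fail_timeout=10000s;
    keepalive 64;  # pool of idle upstream connections kept open per worker
}

server {
    listen 80;

    location / {
        proxy_pass http://backend;
        proxy_http_version 1.1;          # keepalive requires HTTP/1.1
        proxy_set_header Connection "";  # don't forward "Connection: close"
        proxy_buffering off;             # don't buffer streamed token responses
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }
}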
Docker Compose file:
services:
  gemma3_01:
    image: vllm/vllm-openai
    container_name: gemma3_benchmark_01
    restart: always
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '1']
              capabilities: [gpu]
    volumes:
      - /data/models/gemma-3-27b-it:/data/docker_path/gemma3
    ports:
      - "5858:8000"
    ipc: host
    command: >
      --model /data/docker_path/gemma3
      -tp 2
      --gpu-memory-utilization 0.95
      --served-model-name gemma3
      --disable-fastapi-docs
      --max-num-batched-tokens 8192
      --max-model-len 50000

  gemma3_23:
    image: vllm/vllm-openai
    container_name: gemma3_benchmark_23
    restart: always
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['2', '3']
              capabilities: [gpu]
    volumes:
      - /data/models/gemma-3-27b-it:/data/docker_path/gemma3
    ports:
      - "5859:8000"
    ipc: host
    command: >
      --model /data/docker_path/gemma3
      -tp 2
      --gpu-memory-utilization 0.95
      --served-model-name gemma3
      --disable-fastapi-docs
      --max-num-batched-tokens 8192
      --max-model-len 50000

  gemma3_45:
    image: vllm/vllm-openai
    container_name: gemma3_benchmark_45
    restart: always
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['4', '5']
              capabilities: [gpu]
    volumes:
      - /data/models/gemma-3-27b-it:/data/docker_path/gemma3
    ports:
      - "5860:8000"
    ipc: host
    command: >
      --model /data/docker_path/gemma3
      -tp 2
      --gpu-memory-utilization 0.95
      --served-model-name gemma3
      --disable-fastapi-docs
      --max-num-batched-tokens 8192
      --max-model-len 50000

  gemma3_67:
    image: vllm/vllm-openai
    container_name: gemma3_benchmark_67
    restart: always
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['6', '7']
              capabilities: [gpu]
    volumes:
      - /data/models/gemma-3-27b-it:/data/docker_path/gemma3
    ports:
      - "5861:8000"
    ipc: host
    command: >
      --model /data/docker_path/gemma3
      -tp 2
      --gpu-memory-utilization 0.95
      --served-model-name gemma3
      --disable-fastapi-docs
      --max-num-batched-tokens 8192
      --max-model-len 50000
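To isolate whether nginx or the backends are the bottleneck, my plan is to probe one backend directly and compare against the balancer. A rough sketch (prompt, concurrency, and max_tokens are arbitrary; it only counts the completion tokens the server reports in the usage field):

# Rough throughput probe: fire `total` completion requests at a base URL with
# `concurrency` workers and report completion tokens/second from the usage field.
# Compare http://localhost:5858 (one backend, bypassing nginx) against
# http://localhost:80 (through the balancer).
import json
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor

def one_request(base_url: str) -> int:
    payload = json.dumps({
        "model": "gemma3",
        "prompt": "Write a short story about a benchmark.",
        "max_tokens": 256,
    }).encode()
    req = urllib.request.Request(
        f"{base_url}/v1/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=300) as resp:
        return json.loads(resp.read())["usage"]["completion_tokens"]

def probe(base_url: str, concurrency: int = 64, total: int = 256) -> None:
    start = time.time()
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        tokens = sum(pool.map(one_request, [base_url] * total))
    elapsed = time.time() - start
    print(f"{base_url}: {tokens} tokens in {elapsed:.1f}s "
          f"-> {tokens / elapsed:.0f} tok/s")

if __name__ == "__main__":
    probe("http://localhost:5858")  # single backend, bypassing nginx
    probe("http://localhost:80")    # through the load balancer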