
multiple vLLM instances with nginx #14

Description

@mhamiri96

Hey folks,
I am using the inference benchmarker to benchmark gemma-3-27B. My baseline measurement for a single vLLM instance on 2x H100 GPUs is about 4k tokens/second. My assumption was that if I spawn 4 vLLM instances of gemma (each instance getting 2 GPUs, so 8 GPUs in total), I should get around 16k tokens/second. I'm using nginx as my load balancer. That assumption did not materialize: with 4 vLLM containers behind the load balancer, I am getting only around 6k tokens/second. I'll post my nginx config and my docker compose below. Any help is much appreciated.
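
As a sanity check, each backend can also be queried directly on its published host port (5858 through 5861 in the compose file below), bypassing nginx. A minimal sketch, assuming the requests package and that the containers run on the same host:

    # Sketch: confirm all four vLLM backends are up and serving the model,
    # bypassing nginx. Host ports are taken from the compose file below;
    # adjust if your setup differs.
    import requests

    for port in (5858, 5859, 5860, 5861):
        resp = requests.get(f"http://localhost:{port}/v1/models", timeout=10)
        resp.raise_for_status()
        print(port, [m["id"] for m in resp.json()["data"]])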

nginx config:

worker_processes auto;
events {
    worker_connections 1024;
}

http {
    upstream backend {
        random two least_conn;
        server gemma3_benchmark_01:8000 max_fails=3 fail_timeout=10000s;
        server gemma3_benchmark_23:8000 max_fails=3 fail_timeout=10000s;
        server gemma3_benchmark_45:8000 max_fails=3 fail_timeout=10000s;
        server gemma3_benchmark_67:8000 max_fails=3 fail_timeout=10000s;
    }

    server {
        listen 80;
        location / {
            proxy_pass http://backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }
    }
}
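
One thing I still want to rule out on the nginx side (a sketch under assumptions, not a confirmed fix): by default nginx buffers proxied responses and opens a new upstream TCP connection for every request, both of which can throttle streaming completions. Upstream keepalive plus disabled buffering would look roughly like this:

http {
    upstream backend {
        random two least_conn;
        keepalive 64;                  # reuse connections to the vLLM backends
        server gemma3_benchmark_01:8000 max_fails=3 fail_timeout=10000s;
        # ... the other three servers as above ...
    }

    server {
        listen 80;
        location / {
            proxy_http_version 1.1;    # upstream keepalive needs HTTP/1.1
            proxy_set_header Connection "";
            proxy_buffering off;       # pass tokens through as they arrive
            proxy_pass http://backend;
        }
    }
}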

docker compose:

services:
  gemma3_01:
    image: vllm/vllm-openai
    container_name: gemma3_benchmark_01
    restart: always
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0','1']
              capabilities: [gpu]
    volumes:
      - /data/models/gemma-3-27b-it:/data/docker_path/gemma3
    ports:
      - "5858:8000"
    ipc: host
    command: >
      --model /data/docker_path/gemma3
      -tp 2
      --gpu-memory-utilization 0.95
      --served-model-name gemma3
      --disable-fastapi-docs
      --max-num-batched-tokens 8192
      --max-model-len 50000
  gemma3_23:
    image: vllm/vllm-openai
    container_name: gemma3_benchmark_23
    restart: always
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['2','3']
              capabilities: [gpu]
    volumes:
      - /data/models/gemma-3-27b-it:/data/docker_path/gemma3
    ports:
      - "5859:8000"
    ipc: host
    command: >
      --model /data/docker_path/gemma3
      -tp 2
      --gpu-memory-utilization 0.95
      --served-model-name gemma3
      --disable-fastapi-docs
      --max-num-batched-tokens 8192
      --max-model-len 50000
  gemma3_45:
    image: vllm/vllm-openai
    container_name: gemma3_benchmark_45
    restart: always
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['4','5']
              capabilities: [gpu]
    volumes:
      - /data/models/gemma-3-27b-it:/data/docker_path/gemma3
    ports:
      - "5860:8000"
    ipc: host
    command: >
      --model /data/docker_path/gemma3
      -tp 2
      --gpu-memory-utilization 0.95
      --served-model-name gemma3
      --disable-fastapi-docs
      --max-num-batched-tokens 8192
      --max-model-len 50000
  gemma3_67:
    image: vllm/vllm-openai
    container_name: gemma3_benchmark_67
    restart: always
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['6','7']
              capabilities: [gpu]
    volumes:
      - /data/models/gemma-3-27b-it:/data/docker_path/gemma3
    ports:
      - "5861:8000"
    ipc: host
    command: >
      --model /data/docker_path/gemma3
      -tp 2
      --gpu-memory-utilization 0.95
      --served-model-name gemma3
      --disable-fastapi-docs
      --max-num-batched-tokens 8192
      --max-model-len 50000
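
For reference, this is the kind of quick cross-check I am running next to the inference benchmarker (a minimal sketch, assuming the requests package, the host ports above, the served model name gemma3, and nginx reachable on localhost:80; adjust as needed):

    # Sketch: compare output tokens/second for one vLLM instance hit
    # directly versus all four instances behind nginx.
    import time
    from concurrent.futures import ThreadPoolExecutor

    import requests

    PROMPT = "Explain tensor parallelism in two sentences."

    def one_request(base_url):
        """Send one completion request and return the completion token count."""
        resp = requests.post(
            f"{base_url}/v1/completions",
            json={
                "model": "gemma3",   # matches --served-model-name above
                "prompt": PROMPT,
                "max_tokens": 256,
            },
            timeout=300,
        )
        resp.raise_for_status()
        return resp.json()["usage"]["completion_tokens"]

    def tokens_per_second(base_url, n_requests=64, concurrency=32):
        """Fire n_requests with bounded concurrency; return tokens/second."""
        start = time.time()
        with ThreadPoolExecutor(max_workers=concurrency) as pool:
            total = sum(pool.map(lambda _: one_request(base_url), range(n_requests)))
        return total / (time.time() - start)

    if __name__ == "__main__":
        print("one instance, direct:", tokens_per_second("http://localhost:5858"))
        print("four instances, nginx:", tokens_per_second("http://localhost:80"))

If the direct number scales with concurrency but the nginx number does not, the bottleneck is on the load-balancer side; if neither scales, it is on the vLLM/GPU side.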
