diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml
index 9b4002b5bf..50e2f00591 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml
@@ -97,7 +97,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
     container_name: chatqna-gaudi-backend-server
diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml
index 715db8976f..eabff7f865 100644
--- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml
+++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml
@@ -1286,7 +1286,6 @@ spec:
             type: RuntimeDefault
         image: "opea/vllm-gaudi:latest"
         args:
-        - "--enforce-eager"
        - "--model"
        - "$(MODEL_ID)"
        - "--tensor-parallel-size"
diff --git a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh
index 3b1efa8547..263a17a0d5 100644
--- a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh
@@ -39,7 +39,7 @@ function start_services() {
     # Start Docker Containers
     docker compose -f compose_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
     n=0
-    until [[ "$n" -ge 100 ]]; do
+    until [[ "$n" -ge 160 ]]; do
        echo "n=$n"
        docker logs vllm-gaudi-server > vllm_service_start.log
        if grep -q "Warmup finished" vllm_service_start.log; then
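
The three hunks are related: `--enforce-eager` makes vLLM run in eager mode, skipping HPU graph capture on Gaudi. With the flag removed, the server compiles graphs during warmup and takes longer to log `Warmup finished`, which is presumably why the test's retry budget grows from 100 to 160. Below is a minimal sketch of that log-polling pattern as a reusable helper; the `wait_for_log` name and the 5-second sleep are illustrative assumptions (the original script's sleep statement is outside this hunk), not part of the changed code.

```bash
#!/usr/bin/env bash
# Hypothetical helper mirroring the loop in
# ChatQnA/tests/test_compose_vllm_on_gaudi.sh: poll a container's logs
# until a marker string appears or a retry budget is exhausted.
wait_for_log() {
    local container=$1 marker=$2 max_retries=${3:-160}
    local n=0
    until [[ "$n" -ge "$max_retries" ]]; do
        # grep -q exits 0 on the first match, so this succeeds as soon
        # as the marker shows up anywhere in the logs so far.
        if docker logs "$container" 2>&1 | grep -q "$marker"; then
            echo "Found '$marker' in $container logs after $n checks."
            return 0
        fi
        n=$((n + 1))
        sleep 5  # assumed interval; the original script's sleep is not shown
    done
    echo "Timed out waiting for '$marker' in $container logs." >&2
    return 1
}

# Usage matching the vLLM Gaudi test:
#   wait_for_log vllm-gaudi-server "Warmup finished" 160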