Adapting AgentQnA applications for deployment in the K8S environment using AMD GPU using Helm #975
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged: chensuyue merged 53 commits into opea-project:main from chyundunovDatamonsters:feature/AgentQnA_k8s on May 19, 2025.
Commits (53), authored by chyundunovDatamonsters, with auto-fix commits from pre-commit-ci[bot]; messages are truncated as rendered on the page:

- cb9687d ChatQnA - Adding files for deploy application on ROCm vLLM and ROCm T…
- 60cd5ee Adapting AgentQnA applications for deployment in the K8S environment …
- b7e16ab Adapting AgentQnA applications for deployment in the K8S environment …
- ea9d079 Merge branch 'main' of https://github.com/opea-project/GenAIInfra int…
- 6a22c66 Adapting AgentQnA applications for deployment in the K8S environment …
- 983e592 [pre-commit.ci] auto fixes from pre-commit.com hooks
- 72693c3 Adapting AgentQnA applications for deployment in the K8S environment …
- fdbb4d9 Merge remote-tracking branch 'origin/feature/AgentQnA_k8s' into featu…
- b5a7cd5 [pre-commit.ci] auto fixes from pre-commit.com hooks
- 20db4a2 Adapting ChatQnA applications for deployment in the K8S environment u…
- 808788e Adapting ChatQnA applications for deployment in the K8S environment u…
- efe2356 Adapting ChatQnA applications for deployment in the K8S environment u…
- 4db2445 Merge branch 'main' of https://github.com/opea-project/GenAIInfra int…
- 39f730e Adapting ChatQnA applications for deployment in the K8S environment u…
- 9511da4 Merge branch 'main' of https://github.com/opea-project/GenAIInfra int…
- 92d02d2 Adapting ChatQnA applications for deployment in the K8S environment u…
- 180f16f Adapting ChatQnA applications for deployment in the K8S environment u…
- 1298c18 Adapting ChatQnA applications for deployment in the K8S environment u…
- 70e2f6d Merge branch 'main' into feature/ChatQnA_k8s
- 76d47c2 ChatQnA - Adding files for deploy application on ROCm vLLM and ROCm T…
- 7a6380d Adapting ChatQnA applications for deployment in the K8S environment u…
- fe61582 [pre-commit.ci] auto fixes from pre-commit.com hooks
- 50d466d Adapting ChatQnA applications for deployment in the K8S environment u…
- 26fad2f Merge remote-tracking branch 'origin/feature/ChatQnA_k8s' into featur…
- 4faa135 Adapting ChatQnA applications for deployment in the K8S environment u…
- 7e4c5f4 [pre-commit.ci] auto fixes from pre-commit.com hooks
- e6a5c7f Adapting ChatQnA applications for deployment in the K8S environment u…
- ce469f3 Merge remote-tracking branch 'origin/feature/ChatQnA_k8s' into featur…
- 80dc84c Adapting ChatQnA applications for deployment in the K8S environment u…
- 5055b00 Adapting ChatQnA applications for deployment in the K8S environment u…
- 5e3d4e8 Adapting ChatQnA applications for deployment in the K8S environment u…
- 7876371 Adapting ChatQnA applications for deployment in the K8S environment u…
- 2bf5991 Adapting ChatQnA applications for deployment in the K8S environment u…
- efb97b4 Adapting ChatQnA applications for deployment in the K8S environment u…
- d112b67 Adapting ChatQnA applications for deployment in the K8S environment u…
- 73ed118 Merge branch 'main' into feature/ChatQnA_k8s
- 9dc514e Adapting ChatQnA applications for deployment in the K8S environment u…
- 14a4e7a Merge remote-tracking branch 'origin/feature/ChatQnA_k8s' into featur…
- 99fdebc [pre-commit.ci] auto fixes from pre-commit.com hooks
- 7dffc9b Merge branch 'main' of https://github.com/opea-project/GenAIInfra int…
- af9238c Merge branch 'feature/ChatQnA_k8s' of https://github.com/chyundunovDa…
- b6af376 Adapting AgentQnA applications for deployment in the K8S environment …
- 7e57ac4 Adapting ChatQnA applications for deployment in the K8S environment u…
- e8a0553 Adapting ChatQnA applications for deployment in the K8S environment u…
- e9bcf29 Adapting AgentQnA applications for deployment in the K8S environment …
- ddc79c3 Adapting AgentQnA applications for deployment in the K8S environment …
- 774f273 Adapting AgentQnA applications for deployment in the K8S environment …
- 7719383 [pre-commit.ci] auto fixes from pre-commit.com hooks
- 6f4d0a1 Adapting AgentQnA applications for deployment in the K8S environment …
- 1641f76 Merge remote-tracking branch 'origin/feature/AgentQnA_k8s' into featu…
- 33d93de Adapting AgentQnA applications for deployment in the K8S environment …
- 66652c7 Adapting AgentQnA applications for deployment in the K8S environment …
- 327c1d3 Adapting AgentQnA applications for deployment in the K8S environment …
The first values file added by the diff (56 lines) overrides the chart's subchart values to serve the agents with TGI on AMD ROCm GPUs:

```yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
vllm:
  enabled: false
tgi:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: ghcr.io/huggingface/text-generation-inference
    tag: "3.0.0-rocm"
  LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
  MAX_INPUT_LENGTH: "1024"
  MAX_TOTAL_TOKENS: "2048"
  USE_FLASH_ATTENTION: "false"
  FLASH_ATTENTION_RECOMPUTE: "false"
  HIP_VISIBLE_DEVICES: "0,1"
  MAX_BATCH_SIZE: "4"
  extraCmdArgs: ["--num-shard", "2"]
  resources:
    limits:
      amd.com/gpu: "2"
    requests:
      cpu: 1
      memory: 16Gi
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0
    capabilities:
      add:
        - SYS_PTRACE
  readinessProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
  startupProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
supervisor:
  llm_endpoint_url: http://{{ .Release.Name }}-tgi
  llm_engine: tgi
  model: "meta-llama/Llama-3.3-70B-Instruct"
ragagent:
  llm_endpoint_url: http://{{ .Release.Name }}-tgi
  llm_engine: tgi
  model: "meta-llama/Llama-3.3-70B-Instruct"
sqlagent:
  llm_endpoint_url: http://{{ .Release.Name }}-tgi
  llm_engine: tgi
  model: "meta-llama/Llama-3.3-70B-Instruct"
```
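The generous probe settings matter here: a 70B model can take minutes to load onto two GPUs, and Kubernetes allows roughly `initialDelaySeconds + periodSeconds * failureThreshold` seconds before the probe is declared failed. A minimal sketch of that arithmetic (plain Python, no Kubernetes dependency; the helper name is ours, not part of the chart):

```python
# Rough worst-case time Kubernetes waits on a probe before giving up:
# initialDelaySeconds + periodSeconds * failureThreshold.
def probe_budget_seconds(initial_delay: int, period: int, failure_threshold: int) -> int:
    """Approximate upper bound, ignoring per-attempt timeoutSeconds overlap."""
    return initial_delay + period * failure_threshold

# Values from the startupProbe block above: 60 + 5 * 120 = 660 seconds,
# i.e. the TGI pod gets about 11 minutes to come up.
print(probe_budget_seconds(60, 5, 120))
```

If model load regularly exceeds this budget, raising `failureThreshold` (rather than `initialDelaySeconds`) keeps the pod responsive once it is actually ready.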
The second values file added by the diff (52 lines) enables vLLM on AMD ROCm GPUs instead:

```yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

tgi:
  enabled: false
vllm:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: opea/vllm-rocm
    tag: latest
  LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
  env:
    HIP_VISIBLE_DEVICES: "0,1"
    TENSOR_PARALLEL_SIZE: "2"
    HF_HUB_DISABLE_PROGRESS_BARS: "1"
    HF_HUB_ENABLE_HF_TRANSFER: "0"
    VLLM_USE_TRITON_FLASH_ATTN: "0"
    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
    PYTORCH_JIT: "0"
    HF_HOME: "/data"
  extraCmd:
    command: ["python3", "/workspace/api_server.py"]
  extraCmdArgs: ["--swap-space", "16",
    "--disable-log-requests",
    "--dtype", "float16",
    "--num-scheduler-steps", "1",
    "--distributed-executor-backend", "mp"]
  resources:
    limits:
      amd.com/gpu: "2"
  startupProbe:
    failureThreshold: 180
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0
supervisor:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
ragagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
sqlagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
```

Note that `TENSOR_PARALLEL_SIZE: "2"` is kept consistent with the two devices in `HIP_VISIBLE_DEVICES` and the `amd.com/gpu: "2"` resource limit.
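All three agent services (supervisor, ragagent, sqlagent) point at the same in-cluster vLLM Service via `llm_endpoint_url`, which Helm renders as `http://<release-name>-vllm`. As a hedged sketch of how that wiring resolves, the snippet below builds the OpenAI-style request body such an agent would send; the release name `agentqna` and the `/v1/chat/completions` path of vLLM's OpenAI-compatible server are illustrative assumptions, not taken from this PR:

```python
import json

def build_chat_request(release: str, model: str, prompt: str) -> tuple[str, str]:
    # Mirrors the values above: llm_endpoint_url = http://{{ .Release.Name }}-vllm
    url = f"http://{release}-vllm/v1/chat/completions"
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 256,
    }
    return url, json.dumps(body)

url, body = build_chat_request(
    "agentqna", "meta-llama/Llama-3.3-70B-Instruct", "ping"
)
print(url)  # http://agentqna-vllm/v1/chat/completions
```

Because the URL is derived from the release name, installing the chart under a different release automatically repoints all three agents without editing the values file.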