diff --git a/helm-charts/agentqna/README.md b/helm-charts/agentqna/README.md
index 5a2d3d249..030c0bfda 100644
--- a/helm-charts/agentqna/README.md
+++ b/helm-charts/agentqna/README.md
@@ -54,6 +54,18 @@ If you want to try with latest version, use `helm pull oci://ghcr.io/opea-projec
 export HUGGINGFACEHUB_API_TOKEN="YourOwnToken"
 helm pull oci://ghcr.io/opea-project/charts/agentqna --untar
 helm install agentqna agentqna -f agentqna/gaudi-values.yaml --set global.HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+
+# To deploy on AMD ROCm devices
+cd GenAIInfra/helm-charts/
+./update_dependency.sh
+helm dependency update agentqna
+export HFTOKEN="your_huggingface_token"
+export MODELDIR="/mnt/opea-models"
+# with vLLM
+helm upgrade --install agentqna agentqna -f agentqna/rocm-values.yaml --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR}
+
+# with TGI
+helm upgrade --install agentqna agentqna -f agentqna/rocm-tgi-values.yaml --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR}
 ```

 ## Verify
@@ -81,5 +93,5 @@ Open another terminal and run the following command to verify the service if wor
 curl http://localhost:9090/v1/chat/completions \
   -X POST \
   -H "Content-Type: application/json" \
-  -d '{"messages": "How many albums does Iron Maiden have?"}'
+  -d '{"model": "meta-llama/Llama-3.3-70B-Instruct","messages": "How many albums does Iron Maiden have?"}'
 ```
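Note: the ROCm install steps above assume the cluster already runs the AMD GPU device plugin and exposes `amd.com/gpu` as an allocatable resource. A minimal pre-install sanity check, assuming `kubectl` and `jq` are available on the admin host, is:

```bash
# Each ROCm values file in this change requests 2 AMD GPUs (amd.com/gpu: "2"),
# so the target node must advertise at least that many as allocatable.
kubectl get nodes -o json \
  | jq '.items[] | {node: .metadata.name, gpus: .status.allocatable["amd.com/gpu"]}'
```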
diff --git a/helm-charts/agentqna/rocm-tgi-values.yaml b/helm-charts/agentqna/rocm-tgi-values.yaml
new file mode 100644
index 000000000..9c2bc98eb
--- /dev/null
+++ b/helm-charts/agentqna/rocm-tgi-values.yaml
@@ -0,0 +1,57 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inference in the heaviest components to improve performance
+# by overriding their subchart values
+vllm:
+  enabled: false
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "3.0.0-rocm"
+  LLM_MODEL_ID: meta-llama/Llama-3.3-70B-Instruct
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
+  PYTORCH_TUNABLEOP_ENABLED: "0"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "false"
+  HIP_VISIBLE_DEVICES: "0,1"
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard", "2" ]
+  resources:
+    limits:
+      amd.com/gpu: "2"
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+  model: "meta-llama/Llama-3.3-70B-Instruct"
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+  model: "meta-llama/Llama-3.3-70B-Instruct"
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+  model: "meta-llama/Llama-3.3-70B-Instruct"
diff --git a/helm-charts/agentqna/rocm-values.yaml b/helm-charts/agentqna/rocm-values.yaml
new file mode 100644
index 000000000..0d5393b70
--- /dev/null
+++ b/helm-charts/agentqna/rocm-values.yaml
@@ -0,0 +1,52 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inference in the heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  env:
+    LLM_MODEL_ID: meta-llama/Llama-3.3-70B-Instruct
+    HIP_VISIBLE_DEVICES: "0,1"
+    TENSOR_PARALLEL_SIZE: "2"
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "2"
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Llama-3.3-70B-Instruct"
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Llama-3.3-70B-Instruct"
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Llama-3.3-70B-Instruct"
diff --git a/helm-charts/common/tgi/rocm-values.yaml b/helm-charts/common/tgi/rocm-values.yaml
index 62094666a..6fc8689f2 100644
--- a/helm-charts/common/tgi/rocm-values.yaml
+++ b/helm-charts/common/tgi/rocm-values.yaml
@@ -3,9 +3,10 @@
 accelDevice: "rocm"
 image:
   repository: ghcr.io/huggingface/text-generation-inference
-  tag: "2.4.1-rocm"
-MAX_INPUT_LENGTH: "1024"
-MAX_TOTAL_TOKENS: "2048"
+  tag: "3.0.0-rocm"
+MAX_INPUT_LENGTH: "2048"
+MAX_TOTAL_TOKENS: "4096"
+PYTORCH_TUNABLEOP_ENABLED: "0"
 USE_FLASH_ATTENTION: "false"
 FLASH_ATTENTION_RECOMPUTE: "false"
 HIP_VISIBLE_DEVICES: "0"
diff --git a/helm-charts/common/tgi/templates/configmap.yaml b/helm-charts/common/tgi/templates/configmap.yaml
index 13633b205..c5a5132c9 100644
--- a/helm-charts/common/tgi/templates/configmap.yaml
+++ b/helm-charts/common/tgi/templates/configmap.yaml
@@ -70,3 +70,6 @@ data:
   {{- if .Values.MAX_BATCH_SIZE }}
   MAX_BATCH_SIZE: {{ .Values.MAX_BATCH_SIZE | quote }}
   {{- end }}
+  {{- if .Values.PYTORCH_TUNABLEOP_ENABLED }}
+  PYTORCH_TUNABLEOP_ENABLED: {{ .Values.PYTORCH_TUNABLEOP_ENABLED | quote }}
+  {{- end }}
diff --git a/helm-charts/valuefiles.yaml b/helm-charts/valuefiles.yaml
index e91aa923e..49a877ae9 100644
--- a/helm-charts/valuefiles.yaml
+++ b/helm-charts/valuefiles.yaml
@@ -32,6 +32,8 @@ agentqna:
   values:
     - cpu-values.yaml
     - gaudi-values.yaml
+    - rocm-values.yaml
+    - rocm-tgi-values.yaml
 audioqna:
   src_repo: GenAIInfra
   src_dir: helm-charts/audioqna
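The overrides in these values files can also be spot-checked without a live cluster by rendering the chart locally. A minimal sketch (the `dummy` token is a placeholder, not a working credential):

```bash
cd GenAIInfra/helm-charts/
helm dependency update agentqna
# Render the manifests and confirm the amd.com/gpu limits and the
# 3.0.0-rocm / opea/vllm-rocm images made it into the generated specs.
helm template agentqna agentqna -f agentqna/rocm-tgi-values.yaml \
  --set global.HUGGINGFACEHUB_API_TOKEN=dummy \
  | grep -E 'amd.com/gpu|image:'
```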