diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md
index cdf36512a..ebad203d3 100644
--- a/helm-charts/chatqna/README.md
+++ b/helm-charts/chatqna/README.md
@@ -19,35 +19,40 @@ To install the chart, run the following:
 ```console
 cd GenAIInfra/helm-charts/
-scripts/update_dependency.sh
+./update_dependency.sh
 helm dependency update chatqna
 export HFTOKEN="insert-your-huggingface-token-here"
 export MODELDIR="/mnt/opea-models"
 export MODELNAME="meta-llama/Meta-Llama-3-8B-Instruct"
 # To use CPU with vLLM
-helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/cpu-values.yaml
+helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME}
 # To use Gaudi device with vLLM
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-vllm-values.yaml
 # To use CPU with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/cpu-tgi-values.yaml
 # To use Gaudi device with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-tgi-values.yaml
 # To use Nvidia GPU with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml
-# To use CPU with Ollama
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set ollama.LLM_MODEL_ID=${MODELNAME} -f chatqna/cpu-ollama-values.yaml
-# To include guardrail component in chatqna on Gaudi with vLLM
+# To include guardrail component in chatqna on Gaudi with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml
 # To run chatqna with Intel TDX feature
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set vllm.LLM_MODEL_ID=${MODELNAME} --set redis-vector-db.tdxEnabled=true --set redis-vector-db.resources.limits.memory=4Gi --set retriever-usvc.tdxEnabled=true --set retriever-usvc.resources.limits.memory=7Gi --set tei.tdxEnabled=true --set tei.resources.limits.memory=4Gi --set teirerank.tdxEnabled=true --set teirerank.resources.limits.memory=6Gi --set nginx.tdxEnabled=true --set chatqna-ui.tdxEnabled=true --set chatqna-ui.resources.limits.memory=2Gi --set data-prep.tdxEnabled=true --set data-prep.resources.limits.memory=11Gi --set vllm.tdxEnabled=true --set vllm.resources.limits.memory=80Gi
-
 # To use CPU with vLLM with Qdrant DB
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/cpu-qdrant-values.yaml
-# To use CPU with vLLM with Milvus DB
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/cpu-milvus-values.yaml
+# To use AMD ROCm device with vLLM
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/rocm-values.yaml
+# To use AMD ROCm device with TGI
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/rocm-tgi-values.yaml
 # To deploy FaqGen
 #helm install faqgen chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/faqgen-cpu-values.yaml
+
+# To deploy a FaqGen-based application on AMD ROCm device with vLLM
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/faqgen-rocm-values.yaml
+# To deploy a FaqGen-based application on AMD ROCm device with TGI
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/faqgen-rocm-tgi-values.yaml
+
 ```
 
 ### IMPORTANT NOTE
diff --git a/helm-charts/chatqna/faqgen-rocm-tgi-values.yaml b/helm-charts/chatqna/faqgen-rocm-tgi-values.yaml
new file mode 100644
index 000000000..3951386cb
--- /dev/null
+++ b/helm-charts/chatqna/faqgen-rocm-tgi-values.yaml
@@ -0,0 +1,66 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+CHATQNA_TYPE: "CHATQNA_FAQGEN"
+llm-uservice:
+  enabled: true
+  image:
+    repository: opea/llm-faqgen
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  FAQGEN_BACKEND: "TGI"
+  service:
+    port: 80
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "3.0.0-rocm"
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "false"
+  PYTORCH_TUNABLEOP_ENABLED: "0"
+  HIP_VISIBLE_DEVICES: "0,1"
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard","2" ]
+  resources:
+    limits:
+      amd.com/gpu: "2"
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+vllm:
+  enabled: false
+
+# Reranking: second largest bottleneck when reranking is in use
+# (i.e. query context docs have been uploaded with data-prep)
+#
+# TODO: could vLLM be used also for reranking / embedding?
+teirerank:
+  accelDevice: "cpu"
+  image:
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: cpu-1.5
+  # securityContext:
+  #   readOnlyRootFilesystem: false
+  readinessProbe:
+    timeoutSeconds: 1
diff --git a/helm-charts/chatqna/faqgen-rocm-values.yaml b/helm-charts/chatqna/faqgen-rocm-values.yaml
new file mode 100644
index 000000000..279c59721
--- /dev/null
+++ b/helm-charts/chatqna/faqgen-rocm-values.yaml
@@ -0,0 +1,59 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+CHATQNA_TYPE: "CHATQNA_FAQGEN"
+llm-uservice:
+  enabled: true
+  image:
+    repository: opea/llm-faqgen
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  FAQGEN_BACKEND: "vLLM"
+  service:
+    port: 80
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  env:
+    HIP_VISIBLE_DEVICES: "0"
+    TENSOR_PARALLEL_SIZE: "1"
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "1"
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+
+# Reranking: second largest bottleneck when reranking is in use
+# (i.e. query context docs have been uploaded with data-prep)
+#
+# TODO: could vLLM be used also for reranking / embedding?
+teirerank:
+  accelDevice: "cpu"
+  image:
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: cpu-1.5
+  # securityContext:
+  #   readOnlyRootFilesystem: false
+  readinessProbe:
+    timeoutSeconds: 1
diff --git a/helm-charts/chatqna/rocm-tgi-values.yaml b/helm-charts/chatqna/rocm-tgi-values.yaml
new file mode 100644
index 000000000..35690fbf0
--- /dev/null
+++ b/helm-charts/chatqna/rocm-tgi-values.yaml
@@ -0,0 +1,61 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "3.0.0-rocm"
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
+  PYTORCH_TUNABLEOP_ENABLED: "0"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  HIP_VISIBLE_DEVICES: "0,1"
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard","2" ]
+  resources:
+    limits:
+      amd.com/gpu: "2"
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+
+vllm:
+  enabled: false
+
+# Reranking: second largest bottleneck when reranking is in use
+# (i.e. query context docs have been uploaded with data-prep)
+#
+# TODO: could vLLM be used also for reranking / embedding?
+teirerank:
+  accelDevice: "cpu"
+  image:
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: cpu-1.5
+  securityContext:
+    readOnlyRootFilesystem: false
+  readinessProbe:
+    timeoutSeconds: 1
diff --git a/helm-charts/chatqna/rocm-values.yaml b/helm-charts/chatqna/rocm-values.yaml
new file mode 100644
index 000000000..085b04408
--- /dev/null
+++ b/helm-charts/chatqna/rocm-values.yaml
@@ -0,0 +1,53 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  env:
+    HIP_VISIBLE_DEVICES: "0"
+    TENSOR_PARALLEL_SIZE: "1"
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "1"
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+
+# Reranking: second largest bottleneck when reranking is in use
+# (i.e. query context docs have been uploaded with data-prep)
+#
+# TODO: could vLLM be used also for reranking / embedding?
+teirerank:
+  accelDevice: "cpu"
+  image:
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: cpu-1.5
+  securityContext:
+    readOnlyRootFilesystem: false
+  readinessProbe:
+    timeoutSeconds: 1
diff --git a/helm-charts/common/tgi/rocm-values.yaml b/helm-charts/common/tgi/rocm-values.yaml
index 62094666a..6fc8689f2 100644
--- a/helm-charts/common/tgi/rocm-values.yaml
+++ b/helm-charts/common/tgi/rocm-values.yaml
@@ -3,9 +3,10 @@
 accelDevice: "rocm"
 image:
   repository: ghcr.io/huggingface/text-generation-inference
-  tag: "2.4.1-rocm"
-MAX_INPUT_LENGTH: "1024"
-MAX_TOTAL_TOKENS: "2048"
+  tag: "3.0.0-rocm"
+MAX_INPUT_LENGTH: "2048"
+MAX_TOTAL_TOKENS: "4096"
+PYTORCH_TUNABLEOP_ENABLED: "0"
 USE_FLASH_ATTENTION: "false"
 FLASH_ATTENTION_RECOMPUTE: "false"
 HIP_VISIBLE_DEVICES: "0"
diff --git a/helm-charts/common/tgi/templates/configmap.yaml b/helm-charts/common/tgi/templates/configmap.yaml
index 13633b205..c5a5132c9 100644
--- a/helm-charts/common/tgi/templates/configmap.yaml
+++ b/helm-charts/common/tgi/templates/configmap.yaml
@@ -70,3 +70,6 @@ data:
   {{- if .Values.MAX_BATCH_SIZE }}
   MAX_BATCH_SIZE: {{ .Values.MAX_BATCH_SIZE | quote }}
   {{- end }}
+  {{- if .Values.PYTORCH_TUNABLEOP_ENABLED }}
+  PYTORCH_TUNABLEOP_ENABLED: {{ .Values.PYTORCH_TUNABLEOP_ENABLED | quote }}
+  {{- end }}
diff --git a/helm-charts/valuefiles.yaml b/helm-charts/valuefiles.yaml
index e91aa923e..4d02f90f9 100644
--- a/helm-charts/valuefiles.yaml
+++ b/helm-charts/valuefiles.yaml
@@ -24,6 +24,10 @@ chatqna:
     - faqgen-cpu-tgi-values.yaml
     - faqgen-gaudi-values.yaml
     - faqgen-gaudi-tgi-values.yaml
+    - rocm-values.yaml
+    - rocm-tgi-values.yaml
+    - faqgen-rocm-values.yaml
+    - faqgen-rocm-tgi-values.yaml
 agentqna:
   src_repo: GenAIInfra
   src_dir: helm-charts/agentqna
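For reference, a minimal sketch of exercising the new ROCm value files and the `PYTORCH_TUNABLEOP_ENABLED` passthrough added above. The install command follows the README snippet in this diff; the release name `chatqna` and the resulting `chatqna-tgi` deployment name are assumptions based on the chart's usual `<release>-<subchart>` naming, not something this diff guarantees.

```console
# Deploy ChatQnA on an AMD ROCm node with TGI serving, using the new rocm-tgi-values.yaml
helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/rocm-tgi-values.yaml

# Check that PYTORCH_TUNABLEOP_ENABLED reached the TGI container via the ConfigMap
# (deployment name assumed to be <release>-tgi)
kubectl exec deploy/chatqna-tgi -- env | grep PYTORCH_TUNABLEOP_ENABLED
```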