diff --git a/CodeGen/kubernetes/helm/README.md b/CodeGen/kubernetes/helm/README.md
index d496fbe51b..e67341126e 100644
--- a/CodeGen/kubernetes/helm/README.md
+++ b/CodeGen/kubernetes/helm/README.md
@@ -131,3 +131,150 @@ Optionally, delete the namespace if it's no longer needed and empty:
 ```bash
 # kubectl delete ns codegen
 ```
+
+## Deploy on AMD ROCm using Helm charts from the binary Helm repository
+
+### Creating working dirs
+
+```bash
+mkdir ~/codegen-k8s-install && cd ~/codegen-k8s-install
+```
+
+### Cloning repos
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples.git
+```
+
+### Go to the installation directory
+
+```bash
+cd GenAIExamples/CodeGen/kubernetes/helm
+```
+
+### Setting system variables
+
+```bash
+export HFTOKEN="your_huggingface_token"
+export MODELDIR="/mnt/opea-models"
+export MODELNAME="Qwen/Qwen2.5-Coder-7B-Instruct"
+```
+
+### Setting variables in Values files
+
+#### If ROCm vLLM is used
+
+```bash
+nano ~/codegen-k8s-install/GenAIExamples/CodeGen/kubernetes/helm/rocm-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; either a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- TENSOR_PARALLEL_SIZE - must match the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
+
+#### If ROCm TGI is used
+
+```bash
+nano ~/codegen-k8s-install/GenAIExamples/CodeGen/kubernetes/helm/rocm-tgi-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; either a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used (see the combined example below)
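+
+All GPU-related settings must stay consistent with each other. The following is a minimal sketch for a hypothetical node with four GPUs - the keys are the ones from rocm-values.yaml above, only the values change:
+
+```yaml
+vllm:
+  env:
+    # expose four GPUs to the container
+    HIP_VISIBLE_DEVICES: "0,1,2,3"
+    # tensor parallelism must match the number of exposed GPUs
+    TENSOR_PARALLEL_SIZE: "4"
+  resources:
+    limits:
+      # request the same number of GPUs from the device plugin
+      amd.com/gpu: "4"
+```
+
+For the TGI variant the same rule applies, with extraCmdArgs: [ "--num-shard","4" ] taking the place of TENSOR_PARALLEL_SIZE.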
+
+### Installing the Helm Chart
+
+#### If ROCm vLLM is used
+
+```bash
+helm upgrade --install codegen oci://ghcr.io/opea-project/charts/codegen \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values rocm-values.yaml
+```
+
+#### If ROCm TGI is used
+
+```bash
+helm upgrade --install codegen oci://ghcr.io/opea-project/charts/codegen \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values rocm-tgi-values.yaml
+```
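+
+After the install, wait until all pods are ready; the first start can take a while because the model weights are downloaded. A minimal smoke test follows - the codegen service name, port 7778 and the /v1/codegen route are assumptions based on the chart defaults, so adjust them to your deployment:
+
+```bash
+# check that all pods of the release are Running/Ready
+kubectl get pods
+
+# forward the gateway service to localhost (service name and port are chart-default assumptions)
+kubectl port-forward svc/codegen 7778:7778 &
+
+# send a simple request to the CodeGen endpoint
+curl http://localhost:7778/v1/codegen \
+  -H "Content-Type: application/json" \
+  -d '{"messages": "Write a Python function that adds two numbers."}'
+```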
+
+## Deploy on AMD ROCm using Helm charts from Git repositories
+
+### Creating working dirs
+
+```bash
+mkdir ~/codegen-k8s-install && cd ~/codegen-k8s-install
+```
+
+### Cloning repos
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples.git
+git clone https://github.com/opea-project/GenAIInfra.git
+```
+
+### Go to the installation directory
+
+```bash
+cd GenAIExamples/CodeGen/kubernetes/helm
+```
+
+### Setting system variables
+
+```bash
+export HFTOKEN="your_huggingface_token"
+export MODELDIR="/mnt/opea-models"
+export MODELNAME="Qwen/Qwen2.5-Coder-7B-Instruct"
+```
+
+### Setting variables in Values files
+
+#### If ROCm vLLM is used
+
+```bash
+nano ~/codegen-k8s-install/GenAIExamples/CodeGen/kubernetes/helm/rocm-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; either a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- TENSOR_PARALLEL_SIZE - must match the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
+
+#### If ROCm TGI is used
+
+```bash
+nano ~/codegen-k8s-install/GenAIExamples/CodeGen/kubernetes/helm/rocm-tgi-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; either a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
+
+### Installing the Helm Chart
+
+#### If ROCm vLLM is used
+
+```bash
+cd ~/codegen-k8s-install/GenAIInfra/helm-charts
+./update_dependency.sh
+helm dependency update codegen
+helm upgrade --install codegen codegen \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values ../../GenAIExamples/CodeGen/kubernetes/helm/rocm-values.yaml
+```
+
+#### If ROCm TGI is used
+
+```bash
+cd ~/codegen-k8s-install/GenAIInfra/helm-charts
+./update_dependency.sh
+helm dependency update codegen
+helm upgrade --install codegen codegen \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values ../../GenAIExamples/CodeGen/kubernetes/helm/rocm-tgi-values.yaml
+```
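+
+To follow the rollout and inspect the model server while it starts up (the deployment name codegen-vllm is an assumption - list the actual names first):
+
+```bash
+# watch the pods come up; the model server is only ready after the weights are downloaded
+kubectl get pods -w
+
+# find the model-server deployment and tail its logs (codegen-vllm is an assumed name)
+kubectl get deploy
+kubectl logs deploy/codegen-vllm --tail=100 -f
+
+# remove the release again when it is no longer needed
+helm uninstall codegen
+```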
diff --git a/CodeGen/kubernetes/helm/rocm-tgi-values.yaml b/CodeGen/kubernetes/helm/rocm-tgi-values.yaml
new file mode 100644
index 0000000000..40b81a9bd9
--- /dev/null
+++ b/CodeGen/kubernetes/helm/rocm-tgi-values.yaml
@@ -0,0 +1,45 @@
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+
+
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "2.4.1-rocm"
+  LLM_MODEL_ID: "Qwen/Qwen2.5-Coder-7B-Instruct"
+  MAX_INPUT_LENGTH: "1024"
+  MAX_TOTAL_TOKENS: "2048"
+  USE_FLASH_ATTENTION: "false"
+  FLASH_ATTENTION_RECOMPUTE: "false"
+  HIP_VISIBLE_DEVICES: "0" # GPU ID(s) to use; a single ID or a comma-separated list
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard","1" ] # must match the number of GPUs used
+  resources:
+    limits:
+      amd.com/gpu: "1" # must match the number of GPUs used
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+vllm:
+  enabled: false
+llm-uservice:
+  TEXTGEN_BACKEND: TGI
+  LLM_MODEL_ID: "Qwen/Qwen2.5-Coder-7B-Instruct"
diff --git a/CodeGen/kubernetes/helm/rocm-values.yaml b/CodeGen/kubernetes/helm/rocm-values.yaml
new file mode 100644
index 0000000000..0c96b475c2
--- /dev/null
+++ b/CodeGen/kubernetes/helm/rocm-values.yaml
@@ -0,0 +1,41 @@
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+
+
+tgi:
+  enabled: false
+
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  env:
+    HIP_VISIBLE_DEVICES: "0" # GPU ID(s) to use; a single ID or a comma-separated list
+    TENSOR_PARALLEL_SIZE: "1" # must match the number of GPUs in HIP_VISIBLE_DEVICES
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "1" # must match the number of GPUs used
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
+  retryTimeoutSeconds: 720
diff --git a/CodeTrans/kubernetes/helm/README.md b/CodeTrans/kubernetes/helm/README.md
index f4a14e5422..647589b950 100644
--- a/CodeTrans/kubernetes/helm/README.md
+++ b/CodeTrans/kubernetes/helm/README.md
@@ -16,3 +16,150 @@ helm install codetrans oci://ghcr.io/opea-project/charts/codetrans --set global
 export HFTOKEN="insert-your-huggingface-token-here"
 helm install codetrans oci://ghcr.io/opea-project/charts/codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
 ```
+
+## Deploy on AMD ROCm using Helm charts from the binary Helm repository
+
+### Creating working dirs
+
+```bash
+mkdir ~/codetrans-k8s-install && cd ~/codetrans-k8s-install
+```
+
+### Cloning repos
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples.git
+```
+
+### Go to the installation directory
+
+```bash
+cd GenAIExamples/CodeTrans/kubernetes/helm
+```
+
+### Setting system variables
+
+```bash
+export HFTOKEN="your_huggingface_token"
+export MODELDIR="/mnt/opea-models"
+export MODELNAME="mistralai/Mistral-7B-Instruct-v0.3"
+```
+
+### Setting variables in Values files
+
+#### If ROCm vLLM is used
+
+```bash
+nano ~/codetrans-k8s-install/GenAIExamples/CodeTrans/kubernetes/helm/rocm-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; either a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- TENSOR_PARALLEL_SIZE - must match the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
+
+#### If ROCm TGI is used
+
+```bash
+nano ~/codetrans-k8s-install/GenAIExamples/CodeTrans/kubernetes/helm/rocm-tgi-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; either a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
+
+### Installing the Helm Chart
+
+#### If ROCm vLLM is used
+
+```bash
+helm upgrade --install codetrans oci://ghcr.io/opea-project/charts/codetrans \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values rocm-values.yaml
+```
+
+#### If ROCm TGI is used
+
+```bash
+helm upgrade --install codetrans oci://ghcr.io/opea-project/charts/codetrans \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values rocm-tgi-values.yaml
+```
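+
+After the install, wait until the pods are ready before testing. A minimal smoke test follows - the codetrans service name, port 7777 and the /v1/codetrans route are assumptions based on the chart defaults, so adjust them to your deployment:
+
+```bash
+# check that all pods of the release are Running/Ready
+kubectl get pods
+
+# forward the gateway service to localhost (service name and port are chart-default assumptions)
+kubectl port-forward svc/codetrans 7777:7777 &
+
+# translate a small Go snippet to Python
+curl http://localhost:7777/v1/codetrans \
+  -H "Content-Type: application/json" \
+  -d '{"language_from": "Golang", "language_to": "Python", "source_code": "package main\nimport \"fmt\"\nfunc main() { fmt.Println(\"Hello\") }"}'
+```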
+
+## Deploy on AMD ROCm using Helm charts from Git repositories
+
+### Creating working dirs
+
+```bash
+mkdir ~/codetrans-k8s-install && cd ~/codetrans-k8s-install
+```
+
+### Cloning repos
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples.git
+git clone https://github.com/opea-project/GenAIInfra.git
+```
+
+### Go to the installation directory
+
+```bash
+cd GenAIExamples/CodeTrans/kubernetes/helm
+```
+
+### Setting system variables
+
+```bash
+export HFTOKEN="your_huggingface_token"
+export MODELDIR="/mnt/opea-models"
+export MODELNAME="mistralai/Mistral-7B-Instruct-v0.3"
+```
+
+### Setting variables in Values files
+
+#### If ROCm vLLM is used
+
+```bash
+nano ~/codetrans-k8s-install/GenAIExamples/CodeTrans/kubernetes/helm/rocm-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; either a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- TENSOR_PARALLEL_SIZE - must match the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
+
+#### If ROCm TGI is used
+
+```bash
+nano ~/codetrans-k8s-install/GenAIExamples/CodeTrans/kubernetes/helm/rocm-tgi-values.yaml
+```
+
+- HIP_VISIBLE_DEVICES - the ID(s) of the GPU(s) to use; either a single ID or a comma-separated list, e.g. "0" or "0,1,2,3"
+- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used
+- resources:
+    limits:
+      amd.com/gpu: "1" - replace "1" with the number of GPUs used
+
+### Installing the Helm Chart
+
+#### If ROCm vLLM is used
+
+```bash
+cd ~/codetrans-k8s-install/GenAIInfra/helm-charts
+./update_dependency.sh
+helm dependency update codetrans
+helm upgrade --install codetrans codetrans \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values ../../GenAIExamples/CodeTrans/kubernetes/helm/rocm-values.yaml
+```
+
+#### If ROCm TGI is used
+
+```bash
+cd ~/codetrans-k8s-install/GenAIInfra/helm-charts
+./update_dependency.sh
+helm dependency update codetrans
+helm upgrade --install codetrans codetrans \
+  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
+  --values ../../GenAIExamples/CodeTrans/kubernetes/helm/rocm-tgi-values.yaml
+```
diff --git a/CodeTrans/kubernetes/helm/rocm-tgi-values.yaml b/CodeTrans/kubernetes/helm/rocm-tgi-values.yaml
new file mode 100644
index 0000000000..3529c00e71
--- /dev/null
+++ b/CodeTrans/kubernetes/helm/rocm-tgi-values.yaml
@@ -0,0 +1,44 @@
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "2.4.1-rocm"
+  LLM_MODEL_ID: "Qwen/Qwen2.5-Coder-7B-Instruct"
+  MAX_INPUT_LENGTH: "1024"
+  MAX_TOTAL_TOKENS: "2048"
+  USE_FLASH_ATTENTION: "false"
+  FLASH_ATTENTION_RECOMPUTE: "false"
+  HIP_VISIBLE_DEVICES: "0" # GPU ID(s) to use; a single ID or a comma-separated list
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard","1" ] # must match the number of GPUs used
+  resources:
+    limits:
+      amd.com/gpu: "1" # must match the number of GPUs used
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+vllm:
+  enabled: false
+llm-uservice:
+  TEXTGEN_BACKEND: TGI
+  LLM_MODEL_ID: "Qwen/Qwen2.5-Coder-7B-Instruct"
diff --git a/CodeTrans/kubernetes/helm/rocm-values.yaml b/CodeTrans/kubernetes/helm/rocm-values.yaml
new file mode 100644
index 0000000000..48a99d7e6f
--- /dev/null
+++ b/CodeTrans/kubernetes/helm/rocm-values.yaml
@@ -0,0 +1,42 @@
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+
+tgi:
+  enabled: false
+
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  LLM_MODEL_ID: "Qwen/Qwen2.5-Coder-7B-Instruct"
+  env:
+    HIP_VISIBLE_DEVICES: "0" # GPU ID(s) to use; a single ID or a comma-separated list
+    TENSOR_PARALLEL_SIZE: "1" # must match the number of GPUs in HIP_VISIBLE_DEVICES
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "1" # must match the number of GPUs used
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
+  retryTimeoutSeconds: 720
+  LLM_MODEL_ID: "Qwen/Qwen2.5-Coder-7B-Instruct"