diff --git a/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml b/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml
new file mode 100644
index 000000000..a694a4439
--- /dev/null
+++ b/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml
@@ -0,0 +1,25 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: deepseek-r1-distill-llama-70b-gaudi
+spec:
+  features: [TextGeneration]
+  url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+  cacheProfile: nfs
+  engine: VLLM
+  args:
+  - --tensor-parallel-size=8
+  env:
+    OMPI_MCA_btl_vader_single_copy_mechanism: none
+    PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+    # vLLM startup takes too long for autoscaling, especially with Gaudi
+    VLLM_SKIP_WARMUP: "true"
+
+  # scale-from-zero avoids an idle instance occupying a node, but causes a long delay
+  minReplicas: 0
+  maxReplicas: 1
+  resourceProfile: gaudi-for-text-generation:8
diff --git a/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml b/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml
new file mode 100644
index 000000000..0266dc261
--- /dev/null
+++ b/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml
@@ -0,0 +1,26 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: deepseek-r1-distill-llama-8b-gaudi
+spec:
+  features: [TextGeneration]
+  url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  cacheProfile: nfs
+  engine: VLLM
+  args:
+  - --tensor-parallel-size=1
+  - --block-size=128
+  - --max-num-seqs=256
+  - --max-seq-len-to-capture=2048
+  env:
+    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+    # vLLM startup takes too long for autoscaling, especially with Gaudi
+    VLLM_SKIP_WARMUP: "true"
+  minReplicas: 1
+  maxReplicas: 4
+  targetRequests: 120
+  resourceProfile: gaudi-for-text-generation:1
diff --git a/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml
new file mode 100644
index 000000000..7079bfb4c
--- /dev/null
+++ b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml
@@ -0,0 +1,25 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: qwen2.5-72b-instruct-gaudi
+spec:
+  features: [TextGeneration]
+  url: hf://Qwen/Qwen2.5-72B-Instruct
+  cacheProfile: nfs
+  engine: VLLM
+  args:
+  - --tensor-parallel-size=4
+  env:
+    OMPI_MCA_btl_vader_single_copy_mechanism: none
+    PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+    # vLLM startup takes too long for autoscaling, especially with Gaudi
+    VLLM_SKIP_WARMUP: "true"
+
+  # scale-from-zero avoids an idle instance occupying half a node, but causes a long delay
+  minReplicas: 0
+  maxReplicas: 2
+  resourceProfile: gaudi-for-text-generation:4
diff --git a/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml
new file mode 100644
index 000000000..ec1772366
--- /dev/null
+++ b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml
@@ -0,0 +1,26 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: qwen2.5-7b-instruct-gaudi
+spec:
+  features: [TextGeneration]
+  url: hf://Qwen/Qwen2.5-7B-Instruct
+  cacheProfile: nfs
+  engine: VLLM
+  args:
+  - --tensor-parallel-size=1
+  - --block-size=128
+  - --max-num-seqs=256
+  - --max-seq-len-to-capture=2048
+  env:
+    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+    # vLLM startup takes too long for autoscaling, especially with Gaudi
+    VLLM_SKIP_WARMUP: "true"
+  minReplicas: 1
+  maxReplicas: 4
+  targetRequests: 120
+  resourceProfile: gaudi-for-text-generation:1
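
The `resourceProfile: gaudi-for-text-generation:<N>` and `cacheProfile: nfs` references above are not defined in this change; both must exist in the KubeAI Helm values for these Models to schedule. A minimal sketch of what those definitions might look like follows, assuming the Habana device plugin's `habana.ai/gaudi` resource name; the image name, node selector label, and storage class below are illustrative assumptions, not values taken from this repository.

# Sketch of the KubeAI Helm values these Models depend on (names are assumptions).
# The ":<N>" suffix on resourceProfile multiplies the profile's requests/limits,
# e.g. gaudi-for-text-generation:8 requests 8 Gaudi cards for the 70B model.
resourceProfiles:
  gaudi-for-text-generation:
    imageName: vllm-gaudi                # assumption: a vLLM image built for Gaudi/HPU
    requests:
      habana.ai/gaudi: "1"               # Habana device plugin resource, scaled by :<N>
    limits:
      habana.ai/gaudi: "1"
    nodeSelector:
      habana.ai/gaudi.present: "true"    # assumption: label advertised on Gaudi nodes

cacheProfiles:
  nfs:
    sharedFilesystem:
      storageClassName: nfs-csi          # assumption: an RWX-capable NFS storage class

With profiles like these in place, `minReplicas: 0` on the 70B and 72B Models releases a whole or half node while idle, at the cost of a cold vLLM start on the first request, while the 7B/8B Models keep one warm replica and scale out on `targetRequests`.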
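
Once applied, each Model is served under its `metadata.name` through KubeAI's OpenAI-compatible API. A throwaway pod along these lines can smoke-test one model end to end; it assumes the KubeAI service is reachable as `kubeai` in the same namespace, with the `/openai/v1` path from the KubeAI documentation.

# Hypothetical smoke-test pod; check its logs for a completion, then delete it.
apiVersion: v1
kind: Pod
metadata:
  name: gaudi-model-smoke-test
spec:
  restartPolicy: Never
  containers:
  - name: curl
    image: curlimages/curl:latest
    args:
    - -sS
    - http://kubeai/openai/v1/completions
    - -H
    - "Content-Type: application/json"
    - -d
    - '{"model": "qwen2.5-7b-instruct-gaudi", "prompt": "Hello", "max_tokens": 16}'

Note that the first request against a scale-from-zero model (the 70B/72B entries) blocks until a replica is scheduled and vLLM has loaded the weights, which is exactly the delay the `minReplicas` comments above trade off.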