25 changes: 25 additions & 0 deletions kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml
@@ -0,0 +1,25 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: deepseek-r1-distill-llama-70b-gaudi
spec:
  features: [TextGeneration]
  url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-70B
  cacheProfile: nfs
  engine: VLLM
  args:
    - --tensor-parallel-size=8
  env:
    OMPI_MCA_btl_vader_single_copy_mechanism: none
    PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
    # vLLM startup takes too long for autoscaling, especially with Gaudi
    VLLM_SKIP_WARMUP: "true"

  # scale-from-zero avoids an idle instance occupying a node, but causes a long delay on the first request
  minReplicas: 0
  maxReplicas: 1
  resourceProfile: gaudi-for-text-generation:8
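These manifests reference a gaudi-for-text-generation resource profile; the :8 suffix asks KubeAI for eight units of that profile, matching the eight-way tensor parallelism of this 70B deployment. The profile itself is defined in the cluster's KubeAI Helm values rather than in this PR; a minimal sketch of what it might look like follows, where the node selector label is a hypothetical placeholder:

resourceProfiles:
  gaudi-for-text-generation:
    # Illustrative sketch only; the real profile lives in the KubeAI Helm values
    # for this deployment and may also set CPU/memory requests and an image name.
    requests:
      habana.ai/gaudi: "1"
    limits:
      habana.ai/gaudi: "1"
    nodeSelector:
      habana.ai/gaudi.present: "true"   # hypothetical node label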
26 changes: 26 additions & 0 deletions kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml
@@ -0,0 +1,26 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: deepseek-r1-distill-llama-8b-gaudi
spec:
  features: [TextGeneration]
  url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  cacheProfile: nfs
  engine: VLLM
  args:
    - --tensor-parallel-size=1
    - --block-size=128
    - --max-num-seqs=256
    - --max-seq-len-to-capture=2048
  env:
    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
    # vLLM startup takes too long for autoscaling, especially with Gaudi
    VLLM_SKIP_WARMUP: "true"
  minReplicas: 1
  maxReplicas: 4
  targetRequests: 120
  resourceProfile: gaudi-for-text-generation:1
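All four models set cacheProfile: nfs, so the Hugging Face weights are downloaded once to a shared filesystem instead of being pulled on every pod start. The profile is also defined in the KubeAI Helm values; a hedged sketch, assuming KubeAI's sharedFilesystem cache profile type and a hypothetical RWX-capable storage class name:

cacheProfiles:
  nfs:
    sharedFilesystem:
      storageClassName: "nfs-csi"   # assumption: any ReadWriteMany storage class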
25 changes: 25 additions & 0 deletions kubeai/models/qwen2.5-72b-instruct-gaudi.yaml
@@ -0,0 +1,25 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: qwen2.5-72b-instruct-gaudi
spec:
  features: [TextGeneration]
  url: hf://Qwen/Qwen2.5-72B-Instruct
  cacheProfile: nfs
  engine: VLLM
  args:
    - --tensor-parallel-size=4
  env:
    OMPI_MCA_btl_vader_single_copy_mechanism: none
    PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
    # vLLM startup takes too long for autoscaling, especially with Gaudi
    VLLM_SKIP_WARMUP: "true"

  # scale-from-zero avoids an idle instance occupying half a node, but causes a long delay on the first request
  minReplicas: 0
  maxReplicas: 2
  resourceProfile: gaudi-for-text-generation:4
26 changes: 26 additions & 0 deletions kubeai/models/qwen2.5-7b-instruct-gaudi.yaml
@@ -0,0 +1,26 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: qwen2.5-7b-instruct-gaudi
spec:
  features: [TextGeneration]
  url: hf://Qwen/Qwen2.5-7B-Instruct
  cacheProfile: nfs
  engine: VLLM
  args:
    - --tensor-parallel-size=1
    - --block-size=128
    - --max-num-seqs=256
    - --max-seq-len-to-capture=2048
  env:
    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
    # vLLM startup takes too long for autoscaling, especially with Gaudi
    VLLM_SKIP_WARMUP: "true"
  minReplicas: 1
  maxReplicas: 4
  targetRequests: 120
  resourceProfile: gaudi-for-text-generation:1
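Once these Model resources are applied, the deployments can be exercised through KubeAI's OpenAI-compatible API, where the model field in the request matches the Model's metadata.name. A minimal smoke-test sketch, assuming the default in-cluster Service name kubeai and the /openai/v1 path; the pod name and image tag are illustrative:

apiVersion: v1
kind: Pod
metadata:
  name: kubeai-gaudi-smoke-test   # hypothetical helper pod, not part of this PR
spec:
  restartPolicy: Never
  containers:
    - name: curl
      image: curlimages/curl:8.5.0
      command: ["sh", "-c"]
      args:
        - >
          curl -sf http://kubeai/openai/v1/chat/completions
          -H 'Content-Type: application/json'
          -d '{"model": "qwen2.5-7b-instruct-gaudi", "messages": [{"role": "user", "content": "Say hello."}]}'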