25 changes: 25 additions & 0 deletions kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml
@@ -0,0 +1,25 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: deepseek-r1-distill-llama-70b-gaudi
spec:
  features: [TextGeneration]
  url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-70B
  cacheProfile: nfs
  engine: VLLM
  args:
    - --tensor-parallel-size=8
  env:
    OMPI_MCA_btl_vader_single_copy_mechanism: none
    PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
    # vLLM startup takes too long for autoscaling, especially with Gaudi
    VLLM_SKIP_WARMUP: "true"

  # scale-from-zero avoids an idle instance occupying a node, but causes a long delay on the first request
  minReplicas: 0
  maxReplicas: 1
  resourceProfile: gaudi-for-text-generation:8
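These manifests reference a gaudi-for-text-generation resource profile; the :8 suffix asks KubeAI for eight units of that profile, matching the eight-way tensor parallelism of this 70B deployment. The profile itself is defined in the cluster's KubeAI Helm values rather than in this PR; a minimal sketch of what it might look like follows, where the node selector label is a hypothetical placeholder:

resourceProfiles:
  gaudi-for-text-generation:
    # Illustrative sketch only; the real profile lives in the KubeAI Helm values
    # for this deployment and may also set CPU/memory requests and an image name.
    requests:
      habana.ai/gaudi: "1"
    limits:
      habana.ai/gaudi: "1"
    nodeSelector:
      habana.ai/gaudi.present: "true"   # hypothetical node label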
26 changes: 26 additions & 0 deletions kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml
@@ -0,0 +1,26 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: deepseek-r1-distill-llama-8b-gaudi
spec:
  features: [TextGeneration]
  url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  cacheProfile: nfs
  engine: VLLM
  args:
    - --tensor-parallel-size=1
    - --block-size=128
    - --max-num-seqs=256
    - --max-seq-len-to-capture=2048
  env:
    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
    # vLLM startup takes too long for autoscaling, especially with Gaudi
    VLLM_SKIP_WARMUP: "true"
  minReplicas: 1
  maxReplicas: 4
  targetRequests: 120
  resourceProfile: gaudi-for-text-generation:1
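All four models set cacheProfile: nfs, so the Hugging Face weights are downloaded once to a shared filesystem instead of being pulled on every pod start. The profile is also defined in the KubeAI Helm values; a hedged sketch, assuming KubeAI's sharedFilesystem cache profile type and a hypothetical RWX-capable storage class name:

cacheProfiles:
  nfs:
    sharedFilesystem:
      storageClassName: "nfs-csi"   # assumption: any ReadWriteMany storage class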
25 changes: 25 additions & 0 deletions kubeai/models/qwen2.5-72b-instruct-gaudi.yaml
@@ -0,0 +1,25 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: qwen2.5-72b-instruct-gaudi
spec:
  features: [TextGeneration]
  url: hf://Qwen/Qwen2.5-72B-Instruct
  cacheProfile: nfs
  engine: VLLM
  args:
    - --tensor-parallel-size=4
  env:
    OMPI_MCA_btl_vader_single_copy_mechanism: none
    PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
    # vLLM startup takes too long for autoscaling, especially with Gaudi
    VLLM_SKIP_WARMUP: "true"

  # scale-from-zero avoids an idle instance occupying half a node, but causes a long delay on the first request
  minReplicas: 0
  maxReplicas: 2
  resourceProfile: gaudi-for-text-generation:4
26 changes: 26 additions & 0 deletions kubeai/models/qwen2.5-7b-instruct-gaudi.yaml
@@ -0,0 +1,26 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: qwen2.5-7b-instruct-gaudi
spec:
  features: [TextGeneration]
  url: hf://Qwen/Qwen2.5-7B-Instruct
  cacheProfile: nfs
  engine: VLLM
  args:
    - --tensor-parallel-size=1
    - --block-size=128
    - --max-num-seqs=256
    - --max-seq-len-to-capture=2048
  env:
    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
    # vLLM startup takes too long for autoscaling, especially with Gaudi
    VLLM_SKIP_WARMUP: "true"
  minReplicas: 1
  maxReplicas: 4
  targetRequests: 120
  resourceProfile: gaudi-for-text-generation:1
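Once these Model resources are applied, the deployments can be exercised through KubeAI's OpenAI-compatible API, where the model field in the request matches the Model's metadata.name. A minimal smoke-test sketch, assuming the default in-cluster Service name kubeai and the /openai/v1 path; the pod name and image tag are illustrative:

apiVersion: v1
kind: Pod
metadata:
  name: kubeai-gaudi-smoke-test   # hypothetical helper pod, not part of this PR
spec:
  restartPolicy: Never
  containers:
    - name: curl
      image: curlimages/curl:8.5.0
      command: ["sh", "-c"]
      args:
        - >
          curl -sf http://kubeai/openai/v1/chat/completions
          -H 'Content-Type: application/json'
          -d '{"model": "qwen2.5-7b-instruct-gaudi", "messages": [{"role": "user", "content": "Say hello."}]}'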