diff --git a/kubeai/README.md b/kubeai/README.md
index 465589267..8c0244be6 100644
--- a/kubeai/README.md
+++ b/kubeai/README.md
@@ -13,7 +13,7 @@ The following features are available at the moment.
 - Persistent Volume cache for models - tested/working
 - Model downloading & inference engine deployment - tested/working
 - Scaling pods to/from zero - tested/working
-- Load based autoscaling - not tested/included
+- Load-based autoscaling - tested/working
 - Integration with OPEA application - missing
 
 The following models are included.
@@ -60,7 +60,7 @@ kubectl explain models.kubeai.org
 
 # Deploying the Models
 
-This section describes how to deploy various models. All the examples below use Kubernetes Persistent Volumes and Claims (PV/PVC) to store the models. The Kubernetes Storage Class (SC) is called `standard`. You can tune the storage configuration to match your environment during the installation (see `opea-values.yaml`, `cacheProfiles` for more information).
+This section describes how to deploy various models. All the examples below use Kubernetes Persistent Volumes and Claims (PV/PVC) to store the models. The Kubernetes Storage Class (SC) is called `standard`. You can tune the storage configuration to match your environment at installation time (see `cacheProfiles` in `opea-values.yaml`).
 
 The models in the examples below are deployed to `$NAMESPACE`. Please set that according to your needs.
 
@@ -98,7 +98,28 @@
 kubect apply -f models/llama-3.1-8b-instruct-gaudi.yaml -n $NAMESPACE
 kubect apply -f models/llama-3.3-70b-instruct-gaudi.yaml -n $NAMESPACE
 ```
 
-The rest is the same as in the previous example. You should see a pod running with the name `model-llama-3.1-8b-instruct-gpu-xxxx` and/or `model-llama-3.3-70b-instruct-gpu-xxxx`.
+The rest is the same as in the previous example. You should see a pod running with the name `model-llama-3.1-8b-instruct-gaudi-xxxx`. When the request load for that model grows large enough, KubeAI automatically deploys additional instances (the model's `maxReplicas` is greater than its `minReplicas`).
+
+The latter model is configured to scale from zero (`minReplicas` = 0), so `model-llama-3.3-70b-instruct-gaudi-xxxx` pod(s) exist only while KubeAI is receiving requests for that model. This avoids multiple devices being exclusively reserved for an idle pod, but significantly slows down the first response.
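+
+For example, you can generate sustained load for the 8B model and watch KubeAI add replicas (and, with the 70B model, watch pods appear from zero on the first request). The commands below are only a sketch: they assume the default `kubeai` Service and its OpenAI-compatible `/openai/v1` API, so adjust the Service name, port and namespaces to your installation.
+
+```sh
+# In one terminal, watch model pods come and go:
+#   kubectl get pods -n $NAMESPACE -w
+#
+# In another, expose the KubeAI proxy locally:
+#   kubectl port-forward svc/kubeai 8000:80
+
+# Then send a burst of concurrent completion requests to the 8B model:
+for i in $(seq 1 200); do
+  curl -sS http://localhost:8000/openai/v1/completions \
+    -H 'Content-Type: application/json' \
+    -d '{"model": "llama-3.1-8b-instruct-gaudi", "prompt": "Hello", "max_tokens": 256}' \
+    -o /dev/null &
+done
+wait
+```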
 
 ## Text Embeddings with BGE on CPU
diff --git a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
index 37908426f..34bdbeac7 100644
--- a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
+++ b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
@@ -17,6 +17,10 @@ spec:
     - --max-num-seqs=256
     - --max-seq-len-to-capture=2048
   env:
-    OMPI_MCA_btl_vader_single_copy_mechanism: none
+    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+    # vLLM startup takes too long for autoscaling, especially with Gaudi
+    VLLM_SKIP_WARMUP: "true"
   minReplicas: 1
+  maxReplicas: 4
+  targetRequests: 120
   resourceProfile: gaudi-for-text-generation:1
diff --git a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml
index 350d04eaf..86d967e7b 100644
--- a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml
+++ b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml
@@ -19,8 +19,10 @@ spec:
   env:
     OMPI_MCA_btl_vader_single_copy_mechanism: none
     PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+    # vLLM startup takes too long for autoscaling, especially with Gaudi
     VLLM_SKIP_WARMUP: "true"
-  minReplicas: 1
+  # scale-from-zero avoids an idle instance occupying half a node, but adds a long delay to the first response
+  minReplicas: 0
   maxReplicas: 2
   resourceProfile: gaudi-for-text-generation:4
diff --git a/kubeai/opea-values.yaml b/kubeai/opea-values.yaml
index f4fc29637..6f0ff7bfb 100644
--- a/kubeai/opea-values.yaml
+++ b/kubeai/opea-values.yaml
@@ -28,3 +28,5 @@ resourceProfiles:
     requests:
       cpu: "2"
       memory: "2Gi"
+    nodeSelector:
+      #kubeai-inference: "true"
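Note on how the autoscaling fields added above interact. This is only a rough sketch of KubeAI's load-based autoscaling as configured here; the request-averaging window and scale-down delay are KubeAI defaults that this change does not touch:

```yaml
# llama-3.1-8b-instruct-gaudi (values from this change)
minReplicas: 1        # one replica always stays up, so the first request avoids a cold start
maxReplicas: 4        # load can add up to three more replicas
targetRequests: 120   # in-flight requests the autoscaler aims to keep per replica
# e.g. a sustained ~300 in-flight requests -> roughly ceil(300 / 120) = 3 replicas (clamped to 1..4)

# llama-3.3-70b-instruct-gaudi (values from this change)
minReplicas: 0        # scale-from-zero: no pods (and no reserved Gaudi devices) while idle
maxReplicas: 2        # the first request triggers a scale-up and waits for model startup
```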