diff --git a/kubeai/README.md b/kubeai/README.md
index 79b9c7f5a..20d26c56f 100644
--- a/kubeai/README.md
+++ b/kubeai/README.md
@@ -79,7 +79,7 @@ kubectl explain models.kubeai.org
 
 This section describes how to deploy various models. All the examples below use Kubernetes Persistent Volumes and Claims (PV/PVC) to store the models. The Kubernetes Storage Class (SC) is called `standard`. You can tune the storage configuration to match your environment during the installation (see `cacheProfiles` in `opea-values.yaml`).
 
-The models in the examples below are deployed to `$NAMESPACE`. Please set that according to your needs.
+The models in the examples below are deployed to `$NAMESPACE`. Please set that according to your needs. The model configurations are documented in the [models README](models/README.md).
 
 ```
 export NAMESPACE="kubeai"
diff --git a/kubeai/models/README.md b/kubeai/models/README.md
new file mode 100644
index 000000000..d78f5eb99
--- /dev/null
+++ b/kubeai/models/README.md
@@ -0,0 +1,24 @@
+# Model Files
+
+This directory contains YAML configuration files for various AI models designed to run on Kubernetes clusters. These files define the specifications, arguments, and resource profiles required for deploying and running the models efficiently.
+
+## Benchmarking
+
+The parameters for the models were determined with the KubeAI benchmarking tool `benchmark_serving.py`. The script can be found [here](https://github.com/substratusai/kubeai/blob/main/benchmarks/chat-py/benchmark_serving.py).
+
+The following arguments were used during benchmarking:
+
+- `--request-rate=800`
+- `--max-concurrency=800`
+- `--num-prompts=8000`
+- `--max-conversations=800`
+
+These parameters were chosen to optimize the models for throughput.
+
+## Additional Notes
+
+- The `cacheProfile` is set to `default`.
+- The `targetRequests` value matches `--max-num-seqs` (the vLLM batch size).
+- Most models have autoscaling enabled (`maxReplicas` > `minReplicas`), so vLLM warmup is disabled; leaving it on would slow down the startup of new Gaudi vLLM instances too much.
+
+For more details, refer to the individual YAML files in this directory.
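+
+## Deploying a Model
+
+As a minimal sketch (assuming KubeAI is already installed and `$NAMESPACE` is set as described in the parent [kubeai README](../README.md)), a model file from this directory can be applied and checked with plain `kubectl`. The file name below is just one example from this directory:
+
+```
+# Create the Model resource in the chosen namespace.
+kubectl apply -n $NAMESPACE -f qwen2.5-7b-instruct-gaudi.yaml
+
+# List the Model resources and confirm the new one shows up.
+kubectl get models.kubeai.org -n $NAMESPACE
+```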
diff --git a/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml b/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml
index a694a4439..7d2ba9cc9 100644
--- a/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml
+++ b/kubeai/models/deepseek-r1-distill-llama-70b-gaudi.yaml
@@ -9,17 +9,24 @@ metadata:
 spec:
   features: [TextGeneration]
   url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-70B
-  cacheProfile: nfs
+  cacheProfile: default
   engine: VLLM
   args:
-    - --tensor-parallel-size=8
+    - --tensor-parallel-size=2
+    - --max-model-len=2048
+    - --max-seq-len-to-capture=2048
+    - --max-num-batched-tokens=16000
+    - --max-num-seqs=64
+    - --gpu-memory-utilization=0.9
+    - --enable-auto-tool-choice
+    - --tool-call-parser=llama3_json
+    - --disable-log-requests
   env:
     OMPI_MCA_btl_vader_single_copy_mechanism: none
     PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-    # vLLM startup takes too long for autoscaling, especially with Gaudi
     VLLM_SKIP_WARMUP: "true"
-
-  # scale-from-zero avoids idle instance occupying a node, but causes long delay
-  minReplicas: 0
-  maxReplicas: 1
-  resourceProfile: gaudi-for-text-generation:8
+  minReplicas: 1
+  maxReplicas: 4
+  # same as max-num-seqs (batch size)
+  targetRequests: 64
+  resourceProfile: gaudi-for-text-generation:2
diff --git a/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml b/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml
index 0266dc261..ac3b763fa 100644
--- a/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml
+++ b/kubeai/models/deepseek-r1-distill-llama-8b-gaudi.yaml
@@ -9,18 +9,21 @@ metadata:
 spec:
   features: [TextGeneration]
   url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  cacheProfile: nfs
   engine: VLLM
+  cacheProfile: default
   args:
     - --tensor-parallel-size=1
-    - --block-size=128
-    - --max-num-seqs=256
+    - --max-model-len=2048
     - --max-seq-len-to-capture=2048
+    - --max-num-batched-tokens=2048
+    - --max-num-seqs=512
+    - --gpu-memory-utilization=0.9
+    - --disable-log-requests
   env:
-    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-    # vLLM startup takes too long for autoscaling, especially with Gaudi
+    OMPI_MCA_btl_vader_single_copy_mechanism: none
     VLLM_SKIP_WARMUP: "true"
   minReplicas: 1
-  maxReplicas: 4
-  targetRequests: 120
+  maxReplicas: 8
+  # same as max-num-seqs (batch size)
+  targetRequests: 512
   resourceProfile: gaudi-for-text-generation:1
diff --git a/kubeai/models/llama-3.1-8b-instruct-cpu.yaml b/kubeai/models/llama-3.1-8b-instruct-cpu.yaml
index 6530af689..03e40ff06 100644
--- a/kubeai/models/llama-3.1-8b-instruct-cpu.yaml
+++ b/kubeai/models/llama-3.1-8b-instruct-cpu.yaml
@@ -9,7 +9,7 @@ metadata:
 spec:
   features: [TextGeneration]
   url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct
-  cacheProfile: standard
+  cacheProfile: default
   engine: VLLM
   args:
     - --max-model-len=32768
diff --git a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
index 34bdbeac7..7b8c475ad 100644
--- a/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
+++ b/kubeai/models/llama-3.1-8b-instruct-gaudi.yaml
@@ -1,6 +1,5 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-
 # Source: models/templates/models.yaml
 apiVersion: kubeai.org/v1
 kind: Model
@@ -9,18 +8,20 @@ metadata:
 spec:
   features: [TextGeneration]
   url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct
-  cacheProfile: nfs
+  cacheProfile: default
   engine: VLLM
   args:
     - --tensor-parallel-size=1
     - --block-size=128
     - --max-num-seqs=256
     - --max-seq-len-to-capture=2048
+    - --max-model-len=2048
+    - --max-num-batched-tokens=16000
   env:
-    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-    # vLLM startup takes too long for autoscaling, especially with Gaudi
+    OMPI_MCA_btl_vader_single_copy_mechanism: none
     VLLM_SKIP_WARMUP: "true"
   minReplicas: 1
-  maxReplicas: 4
-  targetRequests: 120
+  maxReplicas: 8
+  # same as max-num-seqs (batch size)
+  targetRequests: 256
   resourceProfile: gaudi-for-text-generation:1
diff --git a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml
index 86d967e7b..0b9babade 100644
--- a/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml
+++ b/kubeai/models/llama-3.3-70b-instruct-gaudi.yaml
@@ -9,20 +9,23 @@ metadata:
 spec:
   features: [TextGeneration]
   url: hf://meta-llama/Llama-3.3-70B-Instruct
-  cacheProfile: nfs
+  cacheProfile: default
   engine: VLLM
   args:
-    - --tensor-parallel-size=4
-    - --max-seq-len-to-capture=16384
+    - --tensor-parallel-size=2
+    - --max-num-seqs=64
+    - --max-seq-len-to-capture=2048
+    - --max-model-len=2048
+    - --max-num-batched-tokens=16000
     - --enable-auto-tool-choice
     - --tool-call-parser=llama3_json
+    - --disable-log-requests
   env:
     OMPI_MCA_btl_vader_single_copy_mechanism: none
     PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-    # vLLM startup takes too long for autoscaling, especially with Gaudi
     VLLM_SKIP_WARMUP: "true"
-
-  # scale-from-zero avoids idle instance occupying half a node, but causes long delay
-  minReplicas: 0
-  maxReplicas: 2
-  resourceProfile: gaudi-for-text-generation:4
+  minReplicas: 1
+  maxReplicas: 4
+  # same as max-num-seqs (batch size)
+  targetRequests: 64
+  resourceProfile: gaudi-for-text-generation:2
diff --git a/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml b/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml
new file mode 100644
index 000000000..fb9681df5
--- /dev/null
+++ b/kubeai/models/mistral-7b-instruct-v0.3-gaudi.yaml
@@ -0,0 +1,32 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: mistral-7b-instruct-v0.3-gaudi
+spec:
+  features: [TextGeneration]
+  url: hf://mistralai/Mistral-7B-Instruct-v0.3
+  cacheProfile: default
+  engine: VLLM
+  args:
+    - --model=mistralai/Mistral-7B-Instruct-v0.3
+    - --load_format=mistral
+    - --config_format=mistral
+    - --tensor-parallel-size=1
+    - --block-size=128
+    - --max-num-seqs=512
+    - --max-seq-len-to-capture=2048
+    - --max-model-len=2048
+    - --max-num-batched-tokens=2048
+    - --disable-log-requests
+  env:
+    OMPI_MCA_btl_vader_single_copy_mechanism: none
+    VLLM_SKIP_WARMUP: "true"
+  minReplicas: 1
+  maxReplicas: 8
+  # same as max-num-seqs (batch size)
+  targetRequests: 512
+  resourceProfile: gaudi-for-text-generation:1
diff --git a/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml b/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml
new file mode 100644
index 000000000..907a2347f
--- /dev/null
+++ b/kubeai/models/mixtral-8x7b-instruct-v0.1-gaudi.yaml
@@ -0,0 +1,31 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: mixtral-8x7b-instruct-v0.1-gaudi
+spec:
+  features: [TextGeneration]
+  url: hf://mistralai/Mixtral-8x7B-Instruct-v0.1
+  cacheProfile: default
+  engine: VLLM
+  args:
+    - --model=mistralai/Mixtral-8x7B-Instruct-v0.1
+    - --tensor-parallel-size=2
+    - --block-size=128
+    - --max-num-seqs=512
+    - --max-model-len=32000
+    - --max-seq-len-to-capture=32000
+    - --max-num-batched-tokens=64000
+    - --disable-log-requests
+  env:
+    OMPI_MCA_btl_vader_single_copy_mechanism: none
+    PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+    VLLM_SKIP_WARMUP: "true"
+  minReplicas: 1
+  maxReplicas: 4
+  # same as max-num-seqs (batch size)
+  targetRequests: 512
+  resourceProfile: gaudi-for-text-generation:2
diff --git a/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml
index 7079bfb4c..87898c1a7 100644
--- a/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml
+++ b/kubeai/models/qwen2.5-72b-instruct-gaudi.yaml
@@ -1,6 +1,5 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-
 # Source: models/templates/models.yaml
 apiVersion: kubeai.org/v1
 kind: Model
@@ -9,17 +8,21 @@ metadata:
 spec:
   features: [TextGeneration]
   url: hf://Qwen/Qwen2.5-72B-Instruct
-  cacheProfile: nfs
+  cacheProfile: default
   engine: VLLM
   args:
-    - --tensor-parallel-size=4
+    - --tensor-parallel-size=2
+    - --max-model-len=2048
+    - --max-seq-len-to-capture=2048
+    - --max-num-batched-tokens=16000
+    - --max-num-seqs=128
+    - --gpu-memory-utilization=0.9
+    - --disable-log-requests
   env:
     OMPI_MCA_btl_vader_single_copy_mechanism: none
-    PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-    # vLLM startup takes too long for autoscaling, especially with Gaudi
     VLLM_SKIP_WARMUP: "true"
-
-  # scale-from-zero avoids idle instance occupying half a node, but causes long delay
-  minReplicas: 0
-  maxReplicas: 2
-  resourceProfile: gaudi-for-text-generation:4
+  minReplicas: 1
+  maxReplicas: 4
+  # same as max-num-seqs (batch size)
+  targetRequests: 128
+  resourceProfile: gaudi-for-text-generation:2
diff --git a/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml
index ec1772366..ec83e51af 100644
--- a/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml
+++ b/kubeai/models/qwen2.5-7b-instruct-gaudi.yaml
@@ -1,6 +1,5 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-
 # Source: models/templates/models.yaml
 apiVersion: kubeai.org/v1
 kind: Model
@@ -9,18 +8,21 @@ metadata:
 spec:
   features: [TextGeneration]
   url: hf://Qwen/Qwen2.5-7B-Instruct
-  cacheProfile: nfs
+  cacheProfile: default
  engine: VLLM
   args:
     - --tensor-parallel-size=1
-    - --block-size=128
-    - --max-num-seqs=256
+    - --max-model-len=2048
     - --max-seq-len-to-capture=2048
+    - --max-num-batched-tokens=2048
+    - --max-num-seqs=512
+    - --gpu-memory-utilization=0.9
+    - --disable-log-requests
   env:
-    OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-    # vLLM startup takes too long for autoscaling, especially with Gaudi
+    OMPI_MCA_btl_vader_single_copy_mechanism: none
     VLLM_SKIP_WARMUP: "true"
   minReplicas: 1
-  maxReplicas: 4
-  targetRequests: 120
-  resourceProfile: gaudi-for-text-generation:1
+  maxReplicas: 8
+  # same as max-num-seqs (batch size)
+  targetRequests: 512
+  resourceProfile: gaudi-for-text-generation:1