diff --git a/language/llama2-70b/api-endpoint-artifacts/model.yaml b/language/llama2-70b/api-endpoint-artifacts/model.yaml
index 301b73631b..5c21a38364 100644
--- a/language/llama2-70b/api-endpoint-artifacts/model.yaml
+++ b/language/llama2-70b/api-endpoint-artifacts/model.yaml
@@ -8,6 +8,8 @@ metadata:
   name: llama-2-70b-chat-isvc
 spec:
   predictor:
+    minReplicas: 1
+    maxReplicas: 1
     apiVersion: serving.kserve.io/v1alpha2
     serviceAccountName: sa
     timeout: 240
diff --git a/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml
index 1958ae9270..f8e05fe5da 100644
--- a/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml
+++ b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml
@@ -62,11 +62,11 @@ spec:
 #           value: float16
         # Dynamic batch size changes
         - name: MAX_BATCH_SIZE
-          value: "256"
+          value: "128"
         - name: MAX_CONCURRENT_REQUESTS
-          value: "256"
+          value: "200"
         - name: MAX_BATCH_WEIGHT
-          value: "540000"
+          value: "550000"
         - name: MAX_SEQUENCE_LENGTH
           value: "2048"
         - name: MAX_PREFILL_WEIGHT
@@ -79,8 +79,8 @@ spec:
           value: hf_custom_tp
       resources: # configure as required
         requests:
-          cpu: 36
-          memory: 700Gi
+          cpu: 64
+          memory: 900Gi
           nvidia.com/gpu: 8
         limits:
           nvidia.com/gpu: 8
@@ -88,7 +88,7 @@ spec:
       image: quay.io/opendatahub/caikit-tgis-serving:fast
       env:
         - name: RUNTIME_GRPC_SERVER_THREAD_POOL_SIZE
-          value: "160"
+          value: "200"
       volumeMounts:
         - name: config-volume
           mountPath: /caikit/config/