# Git unified diff touching two manifests under
# language/llama2-70b/api-endpoint-artifacts/:
#   * benchmark.yaml - the image listed after `name: mlperf-env` moves from
#     quay.io/meyceoz/mlperf-inference:v3-greedy to
#     quay.io/meyceoz/mlperf-inference:v4.
#   * serving-runtime.yaml - the value listed after `name: MAX_SEQUENCE_LENGTH`
#     moves from "4096" to "2048"; the neighbouring MAX_BATCH_WEIGHT and
#     MAX_PREFILL_WEIGHT entries are unchanged context.
# NOTE(review): in this copy the whole patch sits on one physical line, although
# each hunk header (-6,7 +6,7 and -68,7 +68,7) declares seven lines. The
# per-line breaks and the YAML indentation cannot be recovered from this text;
# restore them from the original commit before applying - TODO confirm.
diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index f77a40ca8b..012e78c332 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v3-greedy + image: quay.io/meyceoz/mlperf-inference:v4 resources: requests: memory: 20000Mi diff --git a/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml index 2b7e25184b..1958ae9270 100644 --- a/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml @@ -68,7 +68,7 @@ spec: - name: MAX_BATCH_WEIGHT value: "540000" - name: MAX_SEQUENCE_LENGTH - value: "4096" + value: "2048" - name: MAX_PREFILL_WEIGHT value: "0" - name: MAX_NEW_TOKENS