diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 6fbc756561..8065023493 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -112,7 +112,7 @@ def __init__(self, if not batch_size: if device == "cpu": - batch_size = 512 + batch_size = 2000 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. self.batch_size = batch_size diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index a9e10c15b1..402342a4e6 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v6 + image: quay.io/meyceoz/mlperf-inference:v7 resources: requests: memory: 20000Mi