From 207a29b3987d8987dbf623d79e3510a40b44dd49 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz
Date: Thu, 25 Jan 2024 15:42:08 -0500
Subject: [PATCH] Fully functional, updated README

---
 language/llama2-70b/SUT.py                 |  2 +-
 .../api-endpoint-artifacts/README.md       | 30 ++++++++++++++++++-
 .../api-endpoint-artifacts/benchmark.yaml  |  2 +-
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py
index 06a4d9a02f..cd169a7abd 100644
--- a/language/llama2-70b/SUT.py
+++ b/language/llama2-70b/SUT.py
@@ -376,7 +376,7 @@ def __del__(self):
 
 
 class SUTServer(SUT):
-    def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1):
+    def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1):
 
         super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers)
 
diff --git a/language/llama2-70b/api-endpoint-artifacts/README.md b/language/llama2-70b/api-endpoint-artifacts/README.md
index 6a7260b23d..30a5cd889f 100644
--- a/language/llama2-70b/api-endpoint-artifacts/README.md
+++ b/language/llama2-70b/api-endpoint-artifacts/README.md
@@ -3,11 +3,39 @@
 Prerequisites:
 - Install the OpenShift AI model serving stack
 - Add your AWS credentials to `secret.yaml` access the model files
-  - Apply `secret.yaml`, `sa.yaml`, `serving-runtime.yaml`, then finally `model.yaml`
+  - Apply `secret.yaml`, `sa.yaml`
+  - FOR CAIKIT: Apply `serving-runtime.yaml`, then finally `model.yaml`
+  - FOR TGIS STANDALONE: Apply `serving-tgis.yaml`, then finally `model-tgis.yaml`
 - Create a benchmark pod using `benchmark.yaml`
 
 In the pod, before any benchmark, first run `cd inference/language/llama2-70b`
 
+## STANDALONE TGIS INSTRUCTIONS
+For the full accuracy benchmark (offline), run in the pod:
+```
+python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --accuracy --grpc --batch-grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log
+```
+You can then run the same evaluation/consolidation scripts as the regular benchmark
+Example API host: `https://llama-2-70b-chat-isvc-predictor-llama-service.apps.h100serving.perf.lab.eng.bos.redhat.com`
+
+
+For the performance benchmark (offline), run in the pod:
+```
+python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --grpc --batch-grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log
+```
+(It is the same, just with `--accuracy` removed)
+
+
+For the performance benchmark (server), run in the pod:
+```
+python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log
+```
+(Configure target qps in `user.conf`)
+
+
+NOTE: Hyperparams are currently configured for 8xH100
+
+## CAIKIT INSTRUCTIONS
 For the full accuracy benchmark (offline), run in the pod:
 ```
 python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf-caikit --accuracy --mlperf-conf mlperf.conf --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log
diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml
index 9ddd0c3842..a9e10c15b1 100644
--- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml
+++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml
@@ -6,7 +6,7 @@ spec:
   restartPolicy: Never
   containers:
     - name: mlperf-env
-      image: quay.io/meyceoz/mlperf-inference:v5
+      image: quay.io/meyceoz/mlperf-inference:v6
       resources:
         requests:
           memory: 20000Mi
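For reference, the apply order the updated README describes maps to a short CLI flow. A minimal sketch for the standalone TGIS path, assuming the `oc` OpenShift client is logged in to the serving project and the manifests live in `language/llama2-70b/api-endpoint-artifacts/`; the pod name `mlperf-env` is inferred from the container name in `benchmark.yaml` and may differ:

```
# Standalone TGIS setup, following the README's apply order.
# Assumptions: `oc` is logged in to the target cluster/project, and the
# benchmark pod created by benchmark.yaml is named mlperf-env (hypothetical).
cd language/llama2-70b/api-endpoint-artifacts

oc apply -f secret.yaml        # AWS credentials for fetching the model files
oc apply -f sa.yaml            # service account used by the serving runtime
oc apply -f serving-tgis.yaml  # standalone TGIS serving runtime
oc apply -f model-tgis.yaml    # InferenceService for Llama-2-70b-chat-hf
oc apply -f benchmark.yaml     # pod that runs the MLPerf client

oc rsh mlperf-env              # inside the pod: cd inference/language/llama2-70b
```

For the Caikit path, the same flow applies with `serving-runtime.yaml` and `model.yaml` in place of the two TGIS manifests.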
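The server scenario's note to configure target qps in `user.conf` refers to the LoadGen settings file passed via `--user-conf`. A minimal sketch with placeholder rates; the key prefix and values are assumptions to tune per deployment:

```
# user.conf — placeholder target rates, adjust to the endpoint's capacity
llama2-70b.Offline.target_qps = 1.0
llama2-70b.Server.target_qps = 0.5
```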