From 207a29b3987d8987dbf623d79e3510a40b44dd49 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz
Date: Thu, 25 Jan 2024 15:42:08 -0500
Subject: [PATCH] Fully functional, updated README

---
 language/llama2-70b/SUT.py                 |  2 +-
 .../api-endpoint-artifacts/README.md       | 30 ++++++++++++++++++-
 .../api-endpoint-artifacts/benchmark.yaml  |  2 +-
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py
index 06a4d9a02f..cd169a7abd 100644
--- a/language/llama2-70b/SUT.py
+++ b/language/llama2-70b/SUT.py
@@ -376,7 +376,7 @@ def __del__(self):
 
 
 class SUTServer(SUT):
-    def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1):
+    def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1):
 
         super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers)
 
diff --git a/language/llama2-70b/api-endpoint-artifacts/README.md b/language/llama2-70b/api-endpoint-artifacts/README.md
index 6a7260b23d..30a5cd889f 100644
--- a/language/llama2-70b/api-endpoint-artifacts/README.md
+++ b/language/llama2-70b/api-endpoint-artifacts/README.md
@@ -3,11 +3,39 @@
 Prerequisites:
 - Install the OpenShift AI model serving stack
 - Add your AWS credentials to `secret.yaml` access the model files
-  - Apply `secret.yaml`, `sa.yaml`, `serving-runtime.yaml`, then finally `model.yaml`
+  - Apply `secret.yaml`, `sa.yaml`
+  - FOR CAIKIT: Apply `serving-runtime.yaml`, then finally `model.yaml`
+  - FOR TGIS STANDALONE: Apply `serving-tgis.yaml`, then finally `model-tgis.yaml`
 - Create a benchmark pod using `benchmark.yaml`
 
 In the pod, before any benchmark, first run `cd inference/language/llama2-70b`
 
+## STANDALONE TGIS INSTRUCTIONS
+For the full accuracy benchmark (offline), run in the pod:
+```
+python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --accuracy --grpc --batch-grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log
+```
+You can then run the same evaluation/consolidation scripts as the regular benchmark
+Example API host: `https://llama-2-70b-chat-isvc-predictor-llama-service.apps.h100serving.perf.lab.eng.bos.redhat.com`
+
+
+For the performance benchmark (offline), run in the pod:
+```
+python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --grpc --batch-grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log
+```
+(It is the same, just with `--accuracy` removed)
+
+
+For the performance benchmark (server), run in the pod:
+```
+python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log
+```
+(Configure target qps in `user.conf`)
+
+
+NOTE: Hyperparams are currently configured for 8xH100
+
+## CAIKIT INSTRUCTIONS
 For the full accuracy benchmark (offline), run in the pod:
 ```
 python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf-caikit --accuracy --mlperf-conf mlperf.conf --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log
diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml
index 9ddd0c3842..a9e10c15b1 100644
--- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml
+++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml
@@ -6,7 +6,7 @@ spec:
   restartPolicy: Never
   containers:
     - name: mlperf-env
-      image: quay.io/meyceoz/mlperf-inference:v5
+      image: quay.io/meyceoz/mlperf-inference:v6
       resources:
         requests:
           memory: 20000Mi
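For reference, the apply order the updated README describes maps to a short CLI flow. A minimal sketch for the standalone TGIS path, assuming the `oc` OpenShift client is logged in to the serving project and the manifests live in `language/llama2-70b/api-endpoint-artifacts/`; the pod name `mlperf-env` is inferred from the container name in `benchmark.yaml` and may differ:

```
# Standalone TGIS setup, following the README's apply order.
# Assumptions: `oc` is logged in to the target cluster/project, and the
# benchmark pod created by benchmark.yaml is named mlperf-env (hypothetical).
cd language/llama2-70b/api-endpoint-artifacts

oc apply -f secret.yaml        # AWS credentials for fetching the model files
oc apply -f sa.yaml            # service account used by the serving runtime
oc apply -f serving-tgis.yaml  # standalone TGIS serving runtime
oc apply -f model-tgis.yaml    # InferenceService for Llama-2-70b-chat-hf
oc apply -f benchmark.yaml     # pod that runs the MLPerf client

oc rsh mlperf-env              # inside the pod: cd inference/language/llama2-70b
```

For the Caikit path, the same flow applies with `serving-runtime.yaml` and `model.yaml` in place of the two TGIS manifests.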
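The server scenario's note to configure target qps in `user.conf` refers to the LoadGen settings file passed via `--user-conf`. A minimal sketch with placeholder rates; the key prefix and values are assumptions to tune per deployment:

```
# user.conf — placeholder target rates, adjust to the endpoint's capacity
llama2-70b.Offline.target_qps = 1.0
llama2-70b.Server.target_qps = 0.5
```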