From 4d0e24659edc0c96623b9325efe813dc065092aa Mon Sep 17 00:00:00 2001 From: nvyihengz <109830892+nvyihengz@users.noreply.github.com> Date: Tue, 23 Jan 2024 08:55:46 -0800 Subject: [PATCH 01/75] Add TEST01 for stable diffusion XL (#1574) --- .../nvidia/TEST01/stable-diffusion-xl/audit.config | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 compliance/nvidia/TEST01/stable-diffusion-xl/audit.config diff --git a/compliance/nvidia/TEST01/stable-diffusion-xl/audit.config b/compliance/nvidia/TEST01/stable-diffusion-xl/audit.config new file mode 100644 index 000000000..7e7cfdf55 --- /dev/null +++ b/compliance/nvidia/TEST01/stable-diffusion-xl/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 128 From 901ce67aad45c62a2cb840cc5826b805a442cbf3 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 25 Jan 2024 00:31:46 +0000 Subject: [PATCH 02/75] Fixes for report generation and submission checker for models without compliance tests (#1576) * Fix offline_min_samples in submission checker and #1569 * Removed mlperf.conf from llama2 directory to avoid confusion * Update submission_checker.py * Fixes for 4.0 * Cleanup compliance dir check for models without compliance tests --- tools/submission/generate_final_report.py | 4 +++ tools/submission/preprocess_submission.py | 2 +- tools/submission/submission_checker.py | 35 ++++++++++++----------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py index 7d1200a30..70ca9affc 100644 --- a/tools/submission/generate_final_report.py +++ b/tools/submission/generate_final_report.py @@ -127,6 +127,9 @@ def main(): '3d-unet-99.9': ['Offline'], 'gptj-99': ['Server', 'Offline'], 'gptj-99.9': ['Server', 'Offline'], + 'stable-diffusion-xl': ['Server', 'Offline'], + 'llama2-70b-99': ['Server', 'Offline'], + 'llama2-70b-99.9': ['Server', 'Offline'], }, 'edge': { 'resnet': ['SingleStream', 'MultiStream', 'Offline'], @@ -140,6 +143,7 @@ def main(): '3d-unet-99.9': ['SingleStream', 'Offline'], 'gptj-99': ['SingleStream', 'Offline'], 'gptj-99.9': ['SingleStream', 'Offline'], + 'stable-diffusion-xl': ['SingleStream', 'Offline'], } } diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index f81cfae0a..d6e021253 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -44,7 +44,7 @@ def get_args(): default=False, action="store_true") parser.add_argument( "--version", - default="v3.1", + default="v4.0", choices=list(checker.MODEL_CONFIG.keys()), help="mlperf version") parser.add_argument("--submitter", help="filter to submitter") diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index e61590e34..2d00b6a0e 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -3071,24 +3071,21 @@ def log_result( model_name, scenario, ) - if not os.path.exists(compliance_dir) and "gptj" not in model_name: - log.error("no compliance dir for %s", name) + if not check_compliance_dir( + compliance_dir, + 
mlperf_model, + scenario_fixed, + config, + division, + system_json, + name + ): + log.error( + "compliance dir %s has issues", compliance_dir + ) results[name] = None else: - if not check_compliance_dir( - compliance_dir, - mlperf_model, - scenario_fixed, - config, - division, - system_json, - ): - log.error( - "compliance dir %s has issues", compliance_dir - ) - results[name] = None - else: - compliance = 1 + compliance = 1 if results.get(name): if accuracy_is_valid: @@ -3476,7 +3473,7 @@ def check_compliance_acc_dir(test_dir, model, config): def check_compliance_dir( - compliance_dir, model, scenario, config, division, system_json + compliance_dir, model, scenario, config, division, system_json, name ): compliance_perf_pass = True compliance_perf_dir_pass = True @@ -3517,6 +3514,10 @@ def check_compliance_dir( ]: test_list.append("TEST06") + if test_list and not os.path.exists(compliance_dir): + log.error("no compliance dir for %s: %s", name, compliance_dir) + return False + # Check performance of all Tests for test in test_list: test_dir = os.path.join(compliance_dir, test) From 190413d97b5a75b167a88e2edf5933a3f015fb0d Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Thu, 25 Jan 2024 09:57:00 -0500 Subject: [PATCH 03/75] Add Llama2 checks and log additional values (#1578) --- loadgen/bindings/python_api.cc | 2 ++ loadgen/results.cc | 16 ++++++++++++-- loadgen/test_settings.h | 3 +++ loadgen/test_settings_internal.cc | 18 +++++++++++++++- loadgen/test_settings_internal.h | 2 ++ mlperf.conf | 1 + tools/submission/submission_checker.py | 29 ++++++++++++++++++++++++++ 7 files changed, 68 insertions(+), 3 deletions(-) diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 816f7a4f6..cfe24bd3c 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -336,6 +336,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::test05_sample_index_rng_seed) .def_readwrite("test05_schedule_rng_seed", &TestSettings::test05_schedule_rng_seed) .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies) + .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency) + .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency) .def("FromConfig", &TestSettings::FromConfig, "FromConfig."); pybind11::enum_(m, "LoggingMode") diff --git a/loadgen/results.cc b/loadgen/results.cc index 21fd2c90a..5048ac72e 100644 --- a/loadgen/results.cc +++ b/loadgen/results.cc @@ -636,16 +636,28 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { MLPERF_LOG(detail, "result_first_token_mean_latency_ns", first_token_latency_mean); for (auto& lp : token_latency_percentiles) { MLPERF_LOG(detail, - "result_" + DoubleToString(lp.percentile * 100) + + "result_first_token_" + DoubleToString(lp.percentile * 100) + "_percentile_latency_ns", lp.sample_latency); + if ((lp.percentile == .999) & (lp.sample_latency > settings.server_ttft_latency)){ + MLPERF_LOG_WARNING(detail, "warning_generic_message", + "Value for result_first_token_" + DoubleToString(lp.percentile * 100) + + "_percentile_latency_ns greater than target time_to_first_token" + ); + } } double tps_w_lg = ((double)token_count) / pr.final_query_issued_time; double tps_wo_lg= ((double)token_count) / (sample_latency_mean * sample_count); MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead", tps_w_lg); MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg); - double tpot = sample_count * (sample_latency_mean - first_token_latency_mean) / ((double)token_count); + uint64_t tpot 
= sample_count * (sample_latency_mean - first_token_latency_mean) / (token_count); MLPERF_LOG(detail, "result_time_to_output_token", tpot); + if (tpot > settings.server_tpot_latency){ + MLPERF_LOG_WARNING(detail, "warning_generic_message", + "Value for result_time_to_output_token " + "greater than target time_to_output_token" + ); + } } else { double tokens_per_second = token_count / pr.max_latency; MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second); diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index ccae5327a..b0018380d 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -264,6 +264,9 @@ struct TestSettings { uint64_t performance_sample_count_override = 0; /// \brief Measure token latencies bool use_token_latencies = false; + /// Token latency parameters + uint64_t server_ttft_latency = 100000000; + uint64_t server_tpot_latency = 100000000; /**@}*/ }; diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 0a4b9af6e..95e53b731 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -49,7 +49,9 @@ TestSettingsInternal::TestSettingsInternal( performance_issue_same_index(requested.performance_issue_same_index), performance_sample_count(0), sample_concatenate_permutation(false), - use_token_latencies(requested.use_token_latencies){ + use_token_latencies(requested.use_token_latencies), + server_ttft_latency(requested.server_ttft_latency), + server_tpot_latency(requested.server_tpot_latency){ // Target QPS, target latency, and max_async_queries. switch (requested.scenario) { case TestScenario::SingleStream: @@ -330,6 +332,14 @@ void LogRequestedTestSettings(const TestSettings &s) { s.performance_issue_same_index); MLPERF_LOG(detail, "requested_performance_sample_count_override", s.performance_sample_count_override); + // Token latencies specific values + if (s.use_token_latencies){ + MLPERF_LOG(detail, "requested_use_token_latencies", s.use_token_latencies); + if (s.scenario != TestScenario::Offline){ + MLPERF_LOG(detail, "requested_server_ttft_latency", s.server_ttft_latency); + MLPERF_LOG(detail, "requested_server_tpot_latency", s.server_tpot_latency); + } + } #else detail(""); detail("Requested Settings:"); @@ -673,8 +683,14 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, lookupkv(model, scenario, "test05_sample_index_rng_seed", &test05_sample_index_rng_seed, nullptr); lookupkv(model, scenario, "test05_schedule_rng_seed", &test05_schedule_rng_seed, nullptr); + + // keys that apply to token metrics if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)) use_token_latencies = (val == 1) ? 
true : false; + if (use_token_latencies){ + lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr, 1000 * 1000); + lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr, 1000 * 1000); + } // keys that apply to SingleStream lookupkv(model, "SingleStream", "target_latency_percentile", nullptr, diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index 6bc1c3cf3..3392dd59d 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -83,6 +83,8 @@ struct TestSettingsInternal { bool sample_concatenate_permutation; bool use_token_latencies = false; + uint64_t server_ttft_latency; + uint64_t server_tpot_latency; }; /// \brief A namespace of collections of FindPeakPerformance helper functions, diff --git a/mlperf.conf b/mlperf.conf index fe743a8dd..8c6f53849 100644 --- a/mlperf.conf +++ b/mlperf.conf @@ -58,6 +58,7 @@ rnnt.Server.target_latency = 1000 gptj.Server.target_latency = 20000 stable-diffusion-xl.Server.target_latency = 20000 # Falcon Server scenario requires two latency constraints +llama2-70b.*.use_token_latencies = 1 llama2-70b.Server.target_latency = 2000 llama2-70b.Server.ttft_latency = 2000 llama2-70b.Server.tpot_latency = 200 diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 2d00b6a0e..bd716fbd0 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1389,6 +1389,17 @@ } } +LLAMA2_LATENCY_LIMITS = { + "interactive": { + "ttft": 500, + "tpot": 50 + }, + "conversational": { + "ttft": 2000, + "tpot": 200 + } +} + ACC_PATTERN = { "acc": r"^accuracy=([\d\.]+).*", "AUC": r"^AUC=([\d\.]+).*", @@ -1891,6 +1902,19 @@ def check_accuracy_dir(config, model, path, verbose): return is_valid, result_acc +def extra_check_llama2(mlperf_log, scenario): + if (mlperf_log["use_token_latencies"]): + if scenario == "Offline": + # For offline no further checks are necessary + return None, True + else: + for constraint, limits in LLAMA2_LATENCY_LIMITS.items(): + if mlperf_log["result_first_token_99.9_percentile_latency_ns"] < limits["ttft"] and mlperf_log["result_time_to_output_token"] < limits["tpot"]: + return constraint, True + else: + return None, False + + def get_performance_metric( config, model, path, scenario_fixed, division, system_json, has_power=False ): @@ -1911,6 +1935,8 @@ def get_performance_metric( ) res = float(mlperf_log[RESULT_FIELD_NEW[config.version][scenario_for_res]]) + if model in RESULT_FIELD_BENCHMARK_OVERWRITE and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[model]: + res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[model][scenario_for_res]]) inferred = False if scenario_fixed != scenario: @@ -1946,6 +1972,9 @@ def check_performance_dir( res = float(mlperf_log[RESULT_FIELD_NEW[config.version][scenario_for_res]]) if model in RESULT_FIELD_BENCHMARK_OVERWRITE and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[model]: res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[model][scenario_for_res]]) + + if model in ["llama2-70b-99", "llama2-70b-99.9"]: + llama_constraint, is_valid = extra_check_llama2(mlperf_log, scenario_fixed) latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"] latency_mean = mlperf_log["result_mean_latency_ns"] From 27ef43ae504f11b40878473e4fabafc0e1f01066 Mon Sep 17 00:00:00 2001 From: mlcommons-bot <74634038+mlcommons-bot@users.noreply.github.com> Date: Thu, 25 Jan 2024 18:13:38 -0600 Subject: [PATCH 04/75] 
=?UTF-8?q?=F0=9F=94=84=20synced=20local=20'tools/su?= =?UTF-8?q?bmission/power/sources=5Fchecksums.json'=20with=20remote=20'com?= =?UTF-8?q?pliance/sources=5Fchecksums.json'=20(#1582)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: mlcommons-bot --- tools/submission/power/sources_checksums.json | 38 +------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/tools/submission/power/sources_checksums.json b/tools/submission/power/sources_checksums.json index 0ae4cc020..78a240f1a 100644 --- a/tools/submission/power/sources_checksums.json +++ b/tools/submission/power/sources_checksums.json @@ -1,38 +1,4 @@ [ - { - "server.py": "c3f90f2f7eeb4db30727556d0c815ebc89b3d28b", - "__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709", - "client.py": "33ca4f26368777ac06e01f9567b714a4b8063886", - "tests/unit/test_source_hashes.py": "00468a2907583c593e6574a1f6b404e4651c221a", - "tests/unit/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709", - "tests/unit/test_server.py": "948c1995d4008bc2aa6c4046a34ffa3858d6d671", - "lib/time_sync.py": "3210db56eb0ff0df57bf4293dc4d4b03fffd46f1", - "lib/source_hashes.py": "60a2e02193209e8d392803326208d5466342da18", - "lib/common.py": "611d8b29633d331eb19c9455ea3b5fa3284ed6df", - "lib/server.py": "8054263a14dedddcf8e1c01adc19596c21bad591", - "lib/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709", - "lib/summary.py": "aa92f0a3f975eecd44d3c0cd0236342ccc9f941d", - "lib/client.py": "c146491755e219a28d440b31f83998dbd5532483", - "lib/external/ntplib.py": "4da8f970656505a40483206ef2b5d3dd5e81711d", - "lib/external/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709" - }, - { - "__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709", - "client.py": "33ca4f26368777ac06e01f9567b714a4b8063886", - "lib/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709", - "lib/client.py": "ac2aa093c8e8bbc9569b9e2a3471bc64e58a2258", - "lib/common.py": "611d8b29633d331eb19c9455ea3b5fa3284ed6df", - "lib/external/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709", - "lib/external/ntplib.py": "4da8f970656505a40483206ef2b5d3dd5e81711d", - "lib/server.py": "c7af63c31bb2fbedea4345f571f6e3507d268ada", - "lib/source_hashes.py": "60a2e02193209e8d392803326208d5466342da18", - "lib/summary.py": "aa92f0a3f975eecd44d3c0cd0236342ccc9f941d", - "lib/time_sync.py": "122eba67a9abc85635223e054def53be1367ade2", - "server.py": "c3f90f2f7eeb4db30727556d0c815ebc89b3d28b", - "tests/unit/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709", - "tests/unit/test_server.py": "948c1995d4008bc2aa6c4046a34ffa3858d6d671", - "tests/unit/test_source_hashes.py": "00468a2907583c593e6574a1f6b404e4651c221a" - }, { "__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709", "client.py": "33ca4f26368777ac06e01f9567b714a4b8063886", @@ -41,7 +7,7 @@ "lib/common.py": "611d8b29633d331eb19c9455ea3b5fa3284ed6df", "lib/external/__init__.py": "da39a3ee5e6b4b0d3255bfef95601890afd80709", "lib/external/ntplib.py": "4da8f970656505a40483206ef2b5d3dd5e81711d", - "lib/server.py": "c7af63c31bb2fbedea4345f571f6e3507d268ada", + "lib/server.py": "99303c836c683aa9017ec565104e636161d02acb", "lib/source_hashes.py": "60a2e02193209e8d392803326208d5466342da18", "lib/summary.py": "aa92f0a3f975eecd44d3c0cd0236342ccc9f941d", "lib/time_sync.py": "80894ef2389e540781ff78de94db16aa4203a14e", @@ -50,4 +16,4 @@ "tests/unit/test_server.py": "948c1995d4008bc2aa6c4046a34ffa3858d6d671", "tests/unit/test_source_hashes.py": "00468a2907583c593e6574a1f6b404e4651c221a" 
} -] +] \ No newline at end of file From 9b8006ff2f0ade58cbc1935fbda967a19b2e363b Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Thu, 25 Jan 2024 19:26:57 -0500 Subject: [PATCH 05/75] Fix image list mismatch (#1579) Co-authored-by: Miro --- text_to_image/coco.py | 4 ++-- tools/submission/submission_checker.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/text_to_image/coco.py b/text_to_image/coco.py index fa89ba215..b2c9d6dfc 100644 --- a/text_to_image/coco.py +++ b/text_to_image/coco.py @@ -174,9 +174,9 @@ def save_images(self, ids, ds): for id in ids: caption = ds.get_caption(id) generated = Image.fromarray(self.results[idx[id]]) - image_path_tmp = f"images/{self.content_ids[id]}.png" + image_path_tmp = f"images/{self.content_ids[idx[id]]}.png" generated.save(image_path_tmp) - info.append((self.content_ids[id], caption)) + info.append((self.content_ids[idx[id]], caption)) with open("images/captions.txt", "w+") as f: for id, caption in info: f.write(f"{id} {caption}\n") diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index bd716fbd0..c4c66a4aa 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1785,6 +1785,8 @@ def check_extra_files(path, target_files): if target_file not in files: check_pass = False missing_files.append(f"{os.path.join(path, dir, target_file)}.png") + if "captions" not in files: + missing_files.append(f"{os.path.join(path, dir, 'captions.txt')}") return check_pass, missing_files From 180014ad5724de84fd16a42bef69e8f58faf9e4f Mon Sep 17 00:00:00 2001 From: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com> Date: Thu, 25 Jan 2024 19:08:54 -0800 Subject: [PATCH 06/75] #1558 update llama2 reference fp32 accuracy (#1583) Co-authored-by: Miro --- language/llama2-70b/README.md | 12 ++++++------ tools/submission/submission_checker.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index a9a150b4c..2614749a7 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -195,15 +195,15 @@ if [ -e ${ACCURACY_LOG_FILE} ]; then fi ``` -The ServerSUT was not tested for GPU runs. You can try setting `--device cuda:0`, but YMMV. +The ServerSUT was not tested for GPU runs. ## Accuracy Target Running the GPU implementation in FP32 precision resulted in the following FP32 accuracy targets (normalized to a 0-100 scale from a 0.0-1.0 scale): -- Rouge1: 43.88 -- Rouge2: 21.7108 -- RougeL: 28.2502 -- RougeLsum: 41.4821 +- Rouge1: 44.4312 +- Rouge2: 22.0352 +- RougeL: 28.6162 +- Tokens per sample: 294.45 -This was run an 8xH100 node. Total runtime was ~4.5 days. +This was run on a DGX-H100 node. Total runtime was ~4.5 days. 
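The submission_checker.py hunk below encodes these reference values as pass thresholds: each ROUGE metric must reach 99% of the FP32 reference (99.9% for llama2-70b-99.9), and TOKENS_PER_SAMPLE must reach 90% of the reference. A minimal sketch of that arithmetic follows; the dictionary and helper names are illustrative only and are not part of the checker. The checker additionally enforces a separate `accuracy-upper-limit` table, which is not sketched here.

```python
# Sketch only: mirrors the lower-bound threshold arithmetic used for
# llama2-70b in submission_checker.py; names here are illustrative.
LLAMA2_FP32_REFERENCE = {
    "ROUGE1": 44.4312,
    "ROUGE2": 22.0352,
    "ROUGEL": 28.6162,
    "TOKENS_PER_SAMPLE": 294.45,
}

def llama2_thresholds(high_accuracy=False):
    """Derive lower-bound pass thresholds from the FP32 reference run."""
    rouge_factor = 0.999 if high_accuracy else 0.99
    return {
        metric: ref * (0.9 if metric == "TOKENS_PER_SAMPLE" else rouge_factor)
        for metric, ref in LLAMA2_FP32_REFERENCE.items()
    }

print(llama2_thresholds())      # llama2-70b-99 targets
print(llama2_thresholds(True))  # llama2-70b-99.9 targets
```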
diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index c4c66a4aa..b6cd599df 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1124,8 +1124,8 @@ "3d-unet-99.9": ("DICE", 0.86170 * 0.999), "gptj-99" : ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878*0.9), "gptj-99.9" : ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878*0.9), - "llama2-70b-99" : ("ROUGE1", 43.88 * 0.99, "ROUGE2", 21.7108 * 0.99, "ROUGEL", 28.2502 * 0.99, "TOKENS_PER_SAMPLE", 293.3*0.9), - "llama2-70b-99.9" : ("ROUGE1", 43.88 * 0.999, "ROUGE2", 21.7108 * 0.999, "ROUGEL", 28.2502 * 0.999, "TOKENS_PER_SAMPLE", 293.3*0.9), + "llama2-70b-99" : ("ROUGE1", 44.4312 * 0.99, "ROUGE2", 22.0352 * 0.99, "ROUGEL", 28.6162 * 0.99, "TOKENS_PER_SAMPLE", 294.45*0.9), + "llama2-70b-99.9" : ("ROUGE1", 44.4312 * 0.999, "ROUGE2", 22.0352 * 0.999, "ROUGEL", 28.6162 * 0.999, "TOKENS_PER_SAMPLE", 294.45*0.9), "stable-diffusion-xl": ("CLIP_SCORE", 31.68631873, "FID_SCORE", 23.01085758) }, "accuracy-upper-limit": { From 523316e5a49f458945e8c559de597df48abbf272 Mon Sep 17 00:00:00 2001 From: mlcommons-bot <74634038+mlcommons-bot@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:34:08 -0600 Subject: [PATCH 07/75] =?UTF-8?q?=F0=9F=94=84=20synced=20local=20'tools/su?= =?UTF-8?q?bmission/power/power=5Fchecker.py'=20with=20remote=20'complianc?= =?UTF-8?q?e/check.py'=20(#1587)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: mlcommons-bot --- tools/submission/power/power_checker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/submission/power/power_checker.py b/tools/submission/power/power_checker.py index 5adcd197c..93d9d6fb9 100755 --- a/tools/submission/power/power_checker.py +++ b/tools/submission/power/power_checker.py @@ -408,6 +408,8 @@ def get_avg_power(power_path: str, run_path: str) -> Tuple[float, float]: with open(spl_fname) as f: for line in f: + if not line.startswith("Time"): + continue timestamp = ( datetime.strptime(line.split(",")[1], datetime_format) ).replace(tzinfo=timezone.utc) From 3ad853426528a3d692ea34a72b81a7c4fed0346e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 26 Jan 2024 19:10:13 +0000 Subject: [PATCH 08/75] Update the main README.md for 4.0 (#1586) --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c635c73b0..f1cfa2721 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,23 @@ Please see the [MLPerf Inference benchmark paper](https://arxiv.org/abs/1911.025 ``` ## MLPerf Inference v4.0 (submission deadline February 23, 2024) -Code freeze coming soon... +There is an extra one-week extension allowed only for the llama2-70b submissions. For submissions, please use the master branch and any commit since the [4.0 seed release](https://github.com/mlcommons/inference/commit/8e36925bd36a503e39fcbbc488e9e46126f079ed) although it is best to use the latest commit. v4.0 tag will be created from the master branch after the result publication. 
+ +For power submissions please use [SPEC PTD 1.10](https://github.com/mlcommons/power/tree/main/inference_v1.0) (needs special access) and any commit of the power-dev repository after the [code-freeze](https://github.com/mlcommons/power-dev/commit/4e026f43481f46ad57d2464d28924018444b0428) + +| model | reference app | framework | dataset | category +| ---- | ---- | ---- | ---- | ---- | +| resnet50-v1.5 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection) | tensorflow, onnx, tvm, ncnn | imagenet2012 | edge,datacenter | +| retinanet 800x800 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection) | pytorch, onnx | openimages resized to 800x800| edge,datacenter | +| bert | [language/bert](https://github.com/mlcommons/inference/tree/master/language/bert) | tensorflow, pytorch, onnx | squad-1.1 | edge,datacenter | +| dlrm-v2 | [recommendation/dlrm_v2](https://github.com/mlcommons/inference/tree/master/recommendation/dlrm_v2/pytorch) | pytorch | Multihot Criteo Terabyte | datacenter | +| 3d-unet | [vision/medical_imaging/3d-unet-kits19](https://github.com/mlcommons/inference/tree/master/vision/medical_imaging/3d-unet-kits19) | pytorch, tensorflow, onnx | KiTS19 | edge,datacenter | +| rnnt | [speech_recognition/rnnt](https://github.com/mlcommons/inference/tree/master/speech_recognition/rnnt) | pytorch | OpenSLR LibriSpeech Corpus | edge,datacenter | +| gpt-j | [language/gpt-j](https://github.com/mlcommons/inference/tree/master/language/gpt-j)| pytorch | CNN-Daily Mail | edge,datacenter | +| stable-diffusion-xl | [text_to_image] (https://github.com/mlcommons/inference/tree/master/text_to_image) | pytorch | COCO 2014| edge,datacenter | +| llama2-70b | [language/llama2-70b](https://github.com/mlcommons/inference/tree/master/language/llama2-70b) | pytorch | OpenOrca | datacenter | + +* Framework here is given for the reference implementation. Submitters are free to use their own frameworks to run the benchmark. ## MLPerf Inference v3.1 (submission August 18, 2023) Please use [v3.1 tag](https://github.com/mlcommons/inference/releases/tag/v3.1) (```git checkout v3.1```) if you would like to reproduce the v3.1 results. From a04b1f52c2b0baa628bd4b2d1ddec245b6e65008 Mon Sep 17 00:00:00 2001 From: Anton Lokhmotov Date: Tue, 30 Jan 2024 18:01:24 +0000 Subject: [PATCH 09/75] Ignore trailing whitespace lines in spl.txt files (#1584) * Ignore trailing whitespace lines in spl.txt files. * Remove fix from sync'ed power_checker.py. * Reformat according to black. 
--- tools/submission/submission_checker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index b6cd599df..ac76c5dea 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -2261,6 +2261,8 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): power_list = [] with open(spl_fname) as f: for line in f: + if not line.startswith("Time"): + continue timestamp = ( datetime.datetime.strptime(line.split(",")[1], datetime_format) + server_timezone From 4bdf56f883bafa08cd571cc0f515a7362afa0b79 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 1 Feb 2024 00:33:13 +0000 Subject: [PATCH 10/75] Add stable diffusion and llama2 to the final spreadsheet (#1589) --- tools/submission/generate_final_report.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py index 70ca9affc..13a01fe55 100644 --- a/tools/submission/generate_final_report.py +++ b/tools/submission/generate_final_report.py @@ -14,8 +14,8 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', required=True, help='results csv from checker') - parser.add_argument('--version', default='3.1', help='mlperf version') - parser.add_argument('--repository', default='submissions_inference_3.1', help='mlperf repository') + parser.add_argument('--version', default='4.0', help='mlperf version') + parser.add_argument('--repository', default='submissions_inference_4.0', help='mlperf repository') args = parser.parse_args() return args @@ -104,7 +104,7 @@ def main(): [ 'resnet', 'retinanet', '3d-unet-99', '3d-unet-99.9', 'rnnt', 'bert-99', 'bert-99.9', 'dlrm-v2-99', 'dlrm-v2-99.9', - 'gptj-99', 'gptj-99.9' + 'gptj-99', 'gptj-99.9', 'stable-diffusion-xl', 'llama2-70b-99', 'llama2-70b-99.9' ], ['SingleStream', 'MultiStream', 'Server', 'Offline'], [ 'Latency (ms)', From 3a902e598e5a17013c26fea975c0a60e6d7feae1 Mon Sep 17 00:00:00 2001 From: nvyihengz <109830892+nvyihengz@users.noreply.github.com> Date: Thu, 1 Feb 2024 06:29:59 -0800 Subject: [PATCH 11/75] Add support to dump 10 compliance images during accuracy run for SDXL (#1591) * Add support to dump 10 compliance images during accuracy run for SDXL * Fix typo * Dump caption.txt in the same path --- text_to_image/tools/accuracy_coco.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index ad5c93c0c..f831c4b6e 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -26,6 +26,7 @@ def get_args(): parser.add_argument("--statistics-path", default=None, help="path to statistics") parser.add_argument("--verbose", action="store_true", help="verbose messages") parser.add_argument("--output-file", default="coco-results.json", help="path to output file") + parser.add_argument("--compliance-images-path", required=False, help="path to dump 10 stable diffusion xl compliance images") parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"]) args = parser.parse_args() return args @@ -43,7 +44,6 @@ def preprocess_image(img_dir, file_name): def main(): args = get_args() - result_dict = {} # Load dataset annotations @@ -63,6 +63,22 @@ def main(): if args.statistics_path is None: statistics_path = os.path.join(os.path.dirname(__file__), "val2014.npz") + # 
Set compliance images path + dump_compliance_images = False + if args.compliance_images_path: + if not os.path.exists(args.compliance_images_path): + os.makedirs(args.compliance_images_path) + dump_compliance_images = True + compliance_images_idx_list = [] + with open(os.path.join(os.path.dirname(__file__), "sample_ids.txt"), 'r') as compliance_id_file: + for line in compliance_id_file: + idx = int(line.strip()) + compliance_images_idx_list.append(idx) + # Dump caption.txt + with open(os.path.join(args.compliance_images_path, "captions.txt"), "w+") as caption_file: + for idx in compliance_images_idx_list: + caption_file.write(f"{idx} {df_captions.iloc[idx]['caption']}\n") + # Load torchmetrics modules clip = CLIPEncoder(device=device) clip_scores = [] @@ -78,6 +94,11 @@ def main(): generated_img = np.frombuffer(bytes.fromhex(j['data']), np.uint8).reshape(1024, 1024, 3) result_list.append(generated_img) generated_img = Image.fromarray(generated_img) + + # Dump compliance images + if dump_compliance_images and idx in compliance_images_idx_list: + generated_img.save(os.path.join(args.compliance_images_path, f"{idx}.png")) + # generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device) # Load Ground Truth caption = df_captions.iloc[idx]["caption"] From cc3daae3c7f18d7eedc5ffbc271d324e5aca6233 Mon Sep 17 00:00:00 2001 From: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com> Date: Thu, 1 Feb 2024 10:56:34 -0800 Subject: [PATCH 12/75] #1598: fix token and sample logging for Llama2 when accuracy_log_sampling_target is enabled (#1599) --- compliance/nvidia/TEST06/run_verification.py | 1 - loadgen/loadgen.cc | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/compliance/nvidia/TEST06/run_verification.py b/compliance/nvidia/TEST06/run_verification.py index 9d4cd1ad7..afa4081d3 100644 --- a/compliance/nvidia/TEST06/run_verification.py +++ b/compliance/nvidia/TEST06/run_verification.py @@ -61,7 +61,6 @@ def first_token_check(acc_data, dtype): for sample in acc_data: data = np.frombuffer(bytes.fromhex(sample["data"]), dtype=dtype) token_data = np.frombuffer(bytes.fromhex(sample["token_data"]), dtype=dtype) - print(token_data) for t1, t2 in zip(data, token_data): if t1 != t2: return False diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 7db118613..c9785b78f 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -140,7 +140,12 @@ struct ResponseDelegateDetailed : public ResponseDelegate { // For some reason, using std::unique_ptr wasn't moving // into the lambda; even with C++14. std::vector* token_data_copy = nullptr; - if (mode == TestMode::AccuracyOnly) { + double accuracy_log_val = + sample->accuracy_log_val + accuracy_log_offset < 1.0 + ? 
sample->accuracy_log_val + accuracy_log_offset + : sample->accuracy_log_val + accuracy_log_offset - 1.0; + if (mode == TestMode::AccuracyOnly || + accuracy_log_val <= accuracy_log_prob) { uint8_t* src_begin = reinterpret_cast(response->data); uint8_t* src_end = src_begin + response->size; token_data_copy = new std::vector(src_begin, src_end); From 473053fc9f1b8c5edf60e11ec74607126938fbc4 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Fri, 2 Feb 2024 09:25:07 -0500 Subject: [PATCH 13/75] Fix loadgen token metrics latency constrains (#1596) * Fix loadgen token metrics latency constrains * Update perf constraints check for token metrics * Add equal issue mode for LLMs models --- language/llama2-70b/user.conf | 2 + loadgen/loadgen.cc | 4 + loadgen/logging.cc | 38 ++++++- loadgen/logging.h | 3 + loadgen/results.cc | 162 +++++++++++++++++++++++------- loadgen/results.h | 12 ++- loadgen/test_settings_internal.cc | 7 +- mlperf.conf | 4 +- 8 files changed, 193 insertions(+), 39 deletions(-) diff --git a/language/llama2-70b/user.conf b/language/llama2-70b/user.conf index bb97c437a..945082fe9 100644 --- a/language/llama2-70b/user.conf +++ b/language/llama2-70b/user.conf @@ -9,3 +9,5 @@ *.Server.target_qps = 0.5 *.Server.min_duration = 120000 *.Server.min_query_count = 100 + +llama2-70b.Server.sample_concatenate_permutation = 1 \ No newline at end of file diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index c9785b78f..1d7d7420d 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -529,6 +529,9 @@ PerformanceResult IssueQueries(SystemUnderTest* sut, std::vector first_token_latencies( GlobalLogger().GetTokenLatencies(expected_latencies)); + std::vector time_per_output_token_arr( + GlobalLogger().GetTimePerOutputToken(expected_latencies)); + std::vector tokens_per_sample( GlobalLogger().GetTokensPerSample(expected_latencies)); @@ -588,6 +591,7 @@ PerformanceResult IssueQueries(SystemUnderTest* sut, final_query_all_samples_done_time, TokenPerformanceResults{ first_token_latencies, + time_per_output_token_arr, tokens_per_sample } }; diff --git a/loadgen/logging.cc b/loadgen/logging.cc index d990e7c4d..bd8e377fa 100644 --- a/loadgen/logging.cc +++ b/loadgen/logging.cc @@ -349,6 +349,7 @@ void AsyncLog::RestartLatencyRecording(uint64_t first_sample_sequence_id, latencies_.reserve(latencies_to_reserve); token_latencies_.reserve(latencies_to_reserve); tokens_per_sample_.reserve(latencies_to_reserve); + time_per_output_token_.reserve(latencies_to_reserve); } void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id, @@ -430,15 +431,40 @@ void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id, // If the SUT recorded the wrong sample, the test will hang and see // the error above. return; - } else if (n_tokens == 0){ + } + if (n_tokens == 0){ MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", "n_tokens argument missing or attempted to record 0 as number of tokens"); } else if (n_tokens < 0){ MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", "Attempted to record a negative number of tokens"); n_tokens = 0; + } else if (n_tokens == 1){ + MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", + "Number of tokens need to be greater than 1"); + n_tokens = 0; + } + if (time_per_output_token_.size() <= i){ + time_per_output_token_.resize(i + 1, kInvalidLatency); + } else if (time_per_output_token_[i] != kInvalidLatency) { + // Call LogErrorSync here since this kind of error could result in a + // segfault in the near future. 
+ #if USE_NEW_LOGGING_FORMAT + MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", + "Attempted to complete a sample twice."); + #else + GlobalLogger().LogErrorSync("Attempted to complete a sample twice."); + #endif + + // Return without recording the latency again to avoid potentially + // ending the test before the SUT is actually done, which could result + // in a segfault. + // If the SUT recorded the wrong sample, the test will hang and see + // the error above. + return; } tokens_per_sample_[i] = n_tokens; + time_per_output_token_[i] = (latency - token_latencies_[i]) / (n_tokens - 1); } latencies_[i] = latency; latencies_recorded_++; @@ -572,6 +598,12 @@ std::vector AsyncLog::GetTokenLatencies(size_t expected_coun return token_latencies; } +std::vector AsyncLog::GetTimePerOutputToken(size_t expected_count){ + std::vector tpot_latencies; + tpot_latencies.swap(time_per_output_token_); + return tpot_latencies; +} + std::vector AsyncLog::GetTokensPerSample(size_t expected_count) { std::vector tokens_per_sample; tokens_per_sample.swap(tokens_per_sample_); @@ -908,6 +940,10 @@ std::vector Logger::GetTokenLatencies( size_t expected_count) { return async_logger_.GetTokenLatencies(expected_count); } +std::vector Logger::GetTimePerOutputToken( + size_t expected_count) { + return async_logger_.GetTimePerOutputToken(expected_count); +} std::vector Logger::GetTokensPerSample( size_t expected_count) { return async_logger_.GetTokensPerSample(expected_count); diff --git a/loadgen/logging.h b/loadgen/logging.h index d10f574d6..b9c032a72 100644 --- a/loadgen/logging.h +++ b/loadgen/logging.h @@ -322,6 +322,7 @@ class AsyncLog { QuerySampleLatency latency); std::vector GetLatenciesBlocking(size_t expected_count); std::vector GetTokenLatencies(size_t expected_count); + std::vector GetTimePerOutputToken(size_t expected_count); std::vector GetTokensPerSample(size_t expected_count); PerfClock::time_point GetMaxCompletionTime(); QuerySampleLatency GetMaxLatencySoFar(); @@ -386,6 +387,7 @@ class AsyncLog { uint64_t latencies_first_sample_sequence_id_ = 0; std::vector latencies_; std::vector token_latencies_; + std::vector time_per_output_token_; std::vector token_records_; std::vector tokens_per_sample_; QuerySampleLatency max_latency_ = 0; @@ -421,6 +423,7 @@ class Logger { size_t latencies_to_reserve); std::vector GetLatenciesBlocking(size_t expected_count); std::vector GetTokenLatencies(size_t expected_count); + std::vector GetTimePerOutputToken(size_t expected_count); std::vector GetTokensPerSample(size_t expected_count); PerfClock::time_point GetMaxCompletionTime(); QuerySampleLatency GetMaxLatencySoFar(); diff --git a/loadgen/results.cc b/loadgen/results.cc index 5048ac72e..d5173f441 100644 --- a/loadgen/results.cc +++ b/loadgen/results.cc @@ -97,8 +97,15 @@ void PerformanceSummary::ProcessTokenLatencies() { accumulated_first_token_latency += latency; } first_token_latency_mean = accumulated_first_token_latency / sample_count; + QuerySampleLatency accumulated_tpot = 0; + for (auto latency : pr.token_results.time_per_output_token_arr) { + accumulated_tpot += latency; + } + time_per_output_token_mean = accumulated_tpot / sample_count; std::sort(pr.token_results.first_token_latencies.begin(), pr.token_results.first_token_latencies.end()); + std::sort(pr.token_results.time_per_output_token_arr.begin(), + pr.token_results.time_per_output_token_arr.end()); token_target_latency_percentile.sample_latency = pr.token_results.first_token_latencies[sample_count * token_target_latency_percentile.percentile]; @@ 
-110,6 +117,16 @@ void PerformanceSummary::ProcessTokenLatencies() { lp.sample_latency = pr.token_results.first_token_latencies[sample_count * lp.percentile]; } + target_tpot_percentile.sample_latency = + pr.token_results.time_per_output_token_arr[sample_count * target_tpot_percentile.percentile]; + time_per_output_token_min = pr.token_results.time_per_output_token_arr.front(); + time_per_output_token_max = pr.token_results.time_per_output_token_arr.back(); + for (auto& lp : tpot_percentiles) { + assert(lp.percentile >= 0.0); + assert(lp.percentile < 1.0); + lp.sample_latency = pr.token_results.time_per_output_token_arr[sample_count * lp.percentile]; + } + if (settings.scenario == TestScenario::Server) { // TODO: Maybe another target latency needs to be added? QuerySampleLatency max_latency = settings.target_latency.count() + 1; @@ -121,10 +138,12 @@ void PerformanceSummary::ProcessTokenLatencies() { } -bool PerformanceSummary::EarlyStopping(std::string* recommendation) { +bool PerformanceSummary::EarlyStopping(std::string* recommendation, int64_t queries_issued, + std::vector* sample_latencies, + std::vector* query_latencies, + std::chrono::nanoseconds target_latency) { recommendation->clear(); - int64_t queries_issued = pr.queries_issued; MinPassingQueriesFinder find_min_passing; double confidence = 0.99; double tolerance = 0.0; @@ -155,7 +174,7 @@ bool PerformanceSummary::EarlyStopping(std::string* recommendation) { } } QuerySampleLatency percentile_estimate = - pr.sample_latencies[queries_issued - t]; + (*sample_latencies)[queries_issued - t]; *recommendation = " * Processed at least " + std::to_string(h_min + 1) + " queries (" + std::to_string(queries_issued) + ").\n" + " * Would discard " + @@ -187,7 +206,7 @@ bool PerformanceSummary::EarlyStopping(std::string* recommendation) { break; } } - percentile_estimate = pr.sample_latencies[queries_issued - t]; + percentile_estimate = (*sample_latencies)[queries_issued - t]; *recommendation += "\n * Early stopping " + DoubleToString(multi_stream_percentile * 100, 0) + @@ -198,9 +217,9 @@ bool PerformanceSummary::EarlyStopping(std::string* recommendation) { } case TestScenario::Server: { int64_t t = - std::count_if(pr.sample_latencies.begin(), pr.sample_latencies.end(), + std::count_if((*sample_latencies).begin(), (*sample_latencies).end(), [=](auto const& latency) { - return latency > settings.target_latency.count(); + return latency > target_latency.count(); }); int64_t h = find_min_passing(t, target_latency_percentile.percentile, tolerance, confidence); @@ -239,7 +258,7 @@ bool PerformanceSummary::EarlyStopping(std::string* recommendation) { } } QuerySampleLatency percentile_estimate = - pr.query_latencies[queries_issued - t]; + (*query_latencies)[queries_issued - t]; *recommendation = " * Processed at least " + std::to_string(h_min + 1) + " queries (" + std::to_string(queries_issued) + ").\n" + " * Would discard " + @@ -317,10 +336,28 @@ bool PerformanceSummary::PerfConstraintsMet(std::string* recommendation) { break; case TestScenario::Server: ProcessLatencies(); - if (target_latency_percentile.sample_latency > - settings.target_latency.count()) { - *recommendation = "Reduce target QPS to improve latency."; - perf_constraints_met = false; + if (!settings.use_token_latencies){ + if (target_latency_percentile.sample_latency > + settings.target_latency.count()) { + *recommendation = "Reduce target QPS to improve latency."; + perf_constraints_met = false; + } + } else { + if ( token_target_latency_percentile.sample_latency > + 
settings.server_ttft_latency) { + *recommendation = "TTFT constrain not met: Reduce target QPS to improve latency."; + perf_constraints_met = false; + } + + if ( target_tpot_percentile.sample_latency > + settings.server_tpot_latency) { + if (recommendation->empty()){ + *recommendation = "TPOT constrain not met: Reduce target QPS to improve latency."; + } else { + recommendation->append("\n * TPOT constrain not met: Reduce target QPS to improve latency."); + } + perf_constraints_met = false; + } } break; case TestScenario::Offline: @@ -400,10 +437,30 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { std::string min_duration_recommendation; std::string perf_constraints_recommendation; std::string early_stopping_recommendation; + std::string early_stopping_ttft_recommendation; + std::string early_stopping_tpot_recommendation; bool min_duration_met = MinDurationMet(&min_duration_recommendation); bool min_queries_met = MinQueriesMet() && MinSamplesMet(); - bool early_stopping_met = EarlyStopping(&early_stopping_recommendation); + bool early_stopping_met = true; + if (!settings.use_token_latencies){ + early_stopping_met = EarlyStopping(&early_stopping_recommendation, + pr.queries_issued, + &pr.sample_latencies, + &pr.query_latencies, + settings.target_latency); + } else { + early_stopping_met = EarlyStopping(&early_stopping_tpot_recommendation, + pr.queries_issued, + &pr.token_results.time_per_output_token_arr, + &pr.query_latencies, + std::chrono::nanoseconds(settings.server_tpot_latency)) && + EarlyStopping(&early_stopping_ttft_recommendation, + pr.queries_issued, + &pr.token_results.first_token_latencies, + &pr.query_latencies, + std::chrono::nanoseconds(settings.server_ttft_latency)); + } bool perf_constraints_met = PerfConstraintsMet(&perf_constraints_recommendation); bool all_constraints_met = min_duration_met && min_queries_met && @@ -435,8 +492,15 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { if (settings.scenario == TestScenario::SingleStream || settings.scenario == TestScenario::Server || settings.scenario == TestScenario::MultiStream) { - summary("Early Stopping Result:"); - summary(early_stopping_recommendation); + if (!settings.use_token_latencies){ + summary("Early Stopping Result:"); + summary(early_stopping_recommendation); + } else { + summary("TTFT Early Stopping Result:"); + summary(early_stopping_ttft_recommendation); + summary("TPOT Early Stopping Result:"); + summary(early_stopping_tpot_recommendation); + } } summary( @@ -490,17 +554,26 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { } else if (settings.scenario == TestScenario::Server) { double tps_as_completed = token_count / pr.final_query_all_samples_done_time; - summary("Completed tokens per second : ", + summary("Completed tokens per second : ", DoubleToString(tps_as_completed)); } if (settings.scenario != TestScenario::Offline) { - summary("Min First Token latency (ns) : ", first_token_latency_min); - summary("Max First Token latency (ns) : ", first_token_latency_max); - summary("Mean First Token latency (ns) : ", first_token_latency_mean); + summary("Min First Token latency (ns) : ", first_token_latency_min); + summary("Max First Token latency (ns) : ", first_token_latency_max); + summary("Mean First Token latency (ns) : ", first_token_latency_mean); for (auto& lp : token_latency_percentiles) { summary( - DoubleToString(lp.percentile * 100) + " percentile latency (ns) : ", + DoubleToString(lp.percentile * 100) + " percentile first token latency (ns) : ", + 
lp.sample_latency); + } + summary(""); + summary("Min Time to Output Token (ns) : ", time_per_output_token_min); + summary("Max Time to Output Token (ns) : ", time_per_output_token_max); + summary("Mean Time to Output Token (ns) : ", time_per_output_token_mean); + for (auto& lp : tpot_percentiles) { + summary( + DoubleToString(lp.percentile * 100) + " percentile time to output token (ns) : ", lp.sample_latency); } } @@ -522,11 +595,31 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { std::string min_duration_recommendation; std::string perf_constraints_recommendation; std::string early_stopping_recommendation; + std::string early_stopping_ttft_recommendation; + std::string early_stopping_tpot_recommendation; bool min_duration_met = MinDurationMet(&min_duration_recommendation); bool min_queries_met = MinQueriesMet() && MinSamplesMet(); bool perf_constraints_met = PerfConstraintsMet(&perf_constraints_recommendation); - bool early_stopping_met = EarlyStopping(&early_stopping_recommendation); + bool early_stopping_met = true; + if (!settings.use_token_latencies){ + early_stopping_met = EarlyStopping(&early_stopping_recommendation, + pr.queries_issued, + &pr.sample_latencies, + &pr.query_latencies, + settings.target_latency); + } else { + early_stopping_met = EarlyStopping(&early_stopping_tpot_recommendation, + pr.queries_issued, + &pr.token_results.time_per_output_token_arr, + &pr.query_latencies, + std::chrono::nanoseconds(settings.server_tpot_latency)) && + EarlyStopping(&early_stopping_ttft_recommendation, + pr.queries_issued, + &pr.token_results.first_token_latencies, + &pr.query_latencies, + std::chrono::nanoseconds(settings.server_ttft_latency)); + } bool all_constraints_met = min_duration_met && min_queries_met && perf_constraints_met && early_stopping_met; @@ -554,8 +647,12 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { } std::replace(early_stopping_recommendation.begin(), early_stopping_recommendation.end(), '\n', ' '); - MLPERF_LOG(detail, "early_stopping_result", early_stopping_recommendation); - + if (!settings.use_token_latencies){ + MLPERF_LOG(detail, "early_stopping_result", early_stopping_recommendation); + } else{ + MLPERF_LOG(detail, "early_stopping_ttft_result", early_stopping_ttft_recommendation); + MLPERF_LOG(detail, "early_stopping_tpot_result", early_stopping_tpot_recommendation); + } // Report number of queries MLPERF_LOG(detail, "result_query_count", query_count); if (settings.scenario == TestScenario::Server) { @@ -639,25 +736,20 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { "result_first_token_" + DoubleToString(lp.percentile * 100) + "_percentile_latency_ns", lp.sample_latency); - if ((lp.percentile == .999) & (lp.sample_latency > settings.server_ttft_latency)){ - MLPERF_LOG_WARNING(detail, "warning_generic_message", - "Value for result_first_token_" + DoubleToString(lp.percentile * 100) + - "_percentile_latency_ns greater than target time_to_first_token" - ); - } } double tps_w_lg = ((double)token_count) / pr.final_query_issued_time; double tps_wo_lg= ((double)token_count) / (sample_latency_mean * sample_count); MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead", tps_w_lg); MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg); - uint64_t tpot = sample_count * (sample_latency_mean - first_token_latency_mean) / (token_count); - MLPERF_LOG(detail, "result_time_to_output_token", tpot); - if (tpot > settings.server_tpot_latency){ - MLPERF_LOG_WARNING(detail, "warning_generic_message", - "Value for 
result_time_to_output_token " - "greater than target time_to_output_token" - ); + for (auto& lp : tpot_percentiles) { + MLPERF_LOG(detail, + "result_time_per_output_token_" + DoubleToString(lp.percentile * 100) + + "_percentile_ns", + lp.sample_latency); } + MLPERF_LOG(detail, "result_time_to_output_token_min", time_per_output_token_min); + MLPERF_LOG(detail, "result_time_to_output_token_max", time_per_output_token_max); + MLPERF_LOG(detail, "result_time_to_output_token_mean", time_per_output_token_mean); } else { double tokens_per_second = token_count / pr.max_latency; MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second); diff --git a/loadgen/results.h b/loadgen/results.h index 69825a9d3..38bbe32d4 100644 --- a/loadgen/results.h +++ b/loadgen/results.h @@ -29,6 +29,7 @@ namespace loadgen { /// token based metrics struct TokenPerformanceResults { std::vector first_token_latencies; + std::vector time_per_output_token_arr; std::vector tokens_per_sample; }; @@ -86,11 +87,17 @@ struct PerformanceSummary { QuerySampleLatency first_token_latency_min; QuerySampleLatency first_token_latency_max; QuerySampleLatency first_token_latency_mean; + QuerySampleLatency time_per_output_token_min; + QuerySampleLatency time_per_output_token_max; + QuerySampleLatency time_per_output_token_mean; // Latency token target percentile PercentileEntry token_target_latency_percentile{settings.target_latency_percentile}; PercentileEntry token_latency_percentiles[6] = {{.50}, {.90}, {.95}, {.97}, {.99}, {.999}}; + PercentileEntry target_tpot_percentile{settings.target_latency_percentile}; + PercentileEntry tpot_percentiles[6] = {{.50}, {.90}, {.95}, + {.97}, {.99}, {.999}}; #if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64) // MSVC complains if there is no explicit constructor. 
@@ -104,7 +111,10 @@ struct PerformanceSummary { void ProcessTokenLatencies(); bool MinDurationMet(std::string* recommendation); - bool EarlyStopping(std::string* recommendation); + bool EarlyStopping(std::string* recommendation, int64_t queries_issued, + std::vector* sample_latencies, + std::vector* query_latencies, + std::chrono::nanoseconds target_latency); bool MinQueriesMet(); bool MinSamplesMet(); bool HasPerfConstraints(); diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 95e53b731..814814df8 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -482,7 +482,12 @@ void TestSettingsInternal::LogAllSettings() const { void TestSettingsInternal::LogSummary(AsyncSummary &summary) const { summary("samples_per_query : ", samples_per_query); summary("target_qps : ", target_qps); - summary("target_latency (ns): ", target_latency.count()); + if (!use_token_latencies){ + summary("target_latency (ns): ", target_latency.count()); + } else { + summary("ttft_latency (ns): ", server_ttft_latency); + summary("tpot_latency (ns): ", server_tpot_latency); + } summary("max_async_queries : ", max_async_queries); summary("min_duration (ms): ", min_duration.count()); summary("max_duration (ms): ", max_duration.count()); diff --git a/mlperf.conf b/mlperf.conf index 8c6f53849..c02222668 100644 --- a/mlperf.conf +++ b/mlperf.conf @@ -43,6 +43,7 @@ retinanet.MultiStream.target_latency = 528 # GPT-J uses equal issue mode for Single-Stream gptj.SingleStream.sample_concatenate_permutation = 1 +gptj.Server.sample_concatenate_permutation = 1 *.Server.target_latency = 10 *.Server.target_latency_percentile = 99 @@ -59,7 +60,8 @@ gptj.Server.target_latency = 20000 stable-diffusion-xl.Server.target_latency = 20000 # Falcon Server scenario requires two latency constraints llama2-70b.*.use_token_latencies = 1 -llama2-70b.Server.target_latency = 2000 +# Only ttft and tpot are tracked for the llama2-70b benchmark therefore target_latency = 0 +llama2-70b.Server.target_latency = 0 llama2-70b.Server.ttft_latency = 2000 llama2-70b.Server.tpot_latency = 200 From 104d85531af7ae2a07ded36dd6e8c7f2c7746d06 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Tue, 6 Feb 2024 12:38:31 -0500 Subject: [PATCH 14/75] Add sample length check to test06 (#1603) * Add sample length check to test06 * Remove spaces in token metrics recomendation * Add important item to Llama readme * Fix Bug: number of tokens logged before computing them * Fix typo: lenght -> length --- compliance/nvidia/TEST06/README.md | 5 ++++- compliance/nvidia/TEST06/run_verification.py | 16 +++++++++++++++- language/llama2-70b/README.md | 4 +++- language/llama2-70b/SUT.py | 4 ++-- loadgen/loadgen.cc | 2 +- loadgen/logging.cc | 14 ++++++++++---- loadgen/logging.h | 2 +- loadgen/results.cc | 4 ++++ 8 files changed, 40 insertions(+), 11 deletions(-) diff --git a/compliance/nvidia/TEST06/README.md b/compliance/nvidia/TEST06/README.md index a93312f94..a867e7527 100644 --- a/compliance/nvidia/TEST06/README.md +++ b/compliance/nvidia/TEST06/README.md @@ -8,9 +8,10 @@ This repository provides the config files and scripts to run and verify TEST 06 ## Introduction -The purpose of this test is to ensure the consistency of the output of the Llama2 model and avoid a potential EOS exploit. This test will make a performance run, with a limit of 100 samples and logging them into `mlperf_log_accuracy.json`. 
To achieve a passing result in this test, two criteria must be met: +The purpose of this test is to ensure the consistency of the output of the Llama2 model and avoid a potential EOS exploit. This test will make a performance run, with a limit of 100 samples and logging them into `mlperf_log_accuracy.json`. To achieve a passing result in this test, three criteria must be met: - In the case the first token is reported independently (not applicable for Offline scenario), it should match for every query with the first token of the model output. - For each query, the model output should only end with zero or one EOS token +- The number of reported tokens should match with the length of it's ## Requisites @@ -36,6 +37,7 @@ Expected output ``` First token check pass: True EOS check pass: True +Sample length check pass: True TEST06 verification complete ``` @@ -44,5 +46,6 @@ Or: ``` First token check pass: Skipped EOS check pass: True +Sample length check pass: True TEST06 verification complete ``` \ No newline at end of file diff --git a/compliance/nvidia/TEST06/run_verification.py b/compliance/nvidia/TEST06/run_verification.py index afa4081d3..698d5d6dc 100644 --- a/compliance/nvidia/TEST06/run_verification.py +++ b/compliance/nvidia/TEST06/run_verification.py @@ -67,6 +67,15 @@ def first_token_check(acc_data, dtype): return True +def sample_len_check(acc_data, dtype): + for sample in acc_data: + data = np.frombuffer(bytes.fromhex(sample["data"]), dtype=dtype) + token_count = int(sample["token_count"]) + if len(data) != token_count: + return False + return True + + def main(): args = get_args() accuracy_file = os.path.join(args.compliance_dir, "mlperf_log_accuracy.json") @@ -89,6 +98,8 @@ def main(): print("Unexpected error occured while doing the first token check") first_token_pass = False + sample_len_pass = sample_len_check(acc_data, DTYPE_MAP[args.dtype]) + # Construct output based on the results of checks output = "" # Add first token check @@ -100,7 +111,10 @@ def main(): # Add EOS check output += f"EOS check pass: {eos_pass}\n" - if eos_pass and first_token_pass: + # Add sample length check + output += f"Sample length check pass: {sample_len_pass}\n" + + if eos_pass and first_token_pass and sample_len_pass: output += "TEST06 verification complete\n" else: output += "TEST06 verification failed\n" diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index 2614749a7..979488e91 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -4,7 +4,9 @@ + Processing of Validation dataset is not finalized yet. Decision on input token lengths is pending + Streamer for communicating with loadgen has quite some overhead. This is only meant to provide functional implementation - ++ For custom/optimized implementations of this benchmark it is important to include the : + - For server scenario, it is necesary to call `lg.FirstTokenComplete(response)` for each query. This way the first token will be reported and it's latency will be measured. + - For all scenarios, when calling `lg.QuerySamplesComplete(response)`, it is necessary that each of the elements in response is a `lg.QuerySampleResponse` that contains the number of tokens (can be create this way: `lg.QuerySampleResponse(qitem.id, bi[0], bi[1], n_tokens)`). 
The number of tokens reported should match with the number of tokens on your answer and this will be checked in [TEST06](../../compliance/nvidia/TEST06/) ## Prepare environment diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 30fb21f8f..228324d73 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -342,11 +342,11 @@ def process_queries(self): ) output_tokens = tokens_streamer.get_out_tokens() - + n_tokens = len(output_tokens) response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) bi = response_array.buffer_info() response = [lg.QuerySampleResponse( - qitem.id, bi[0], bi[1])] + qitem.id, bi[0], bi[1], n_tokens)] lg.QuerySamplesComplete(response) diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 1d7d7420d..7b4fcbbd2 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -120,7 +120,7 @@ struct ResponseDelegateDetailed : public ResponseDelegate { if (sample_data_copy) { log.LogAccuracy(sample->sequence_id, sample->sample_index, - LogBinaryAsHexString{sample_data_copy}); + LogBinaryAsHexString{sample_data_copy}, n_tokens); delete sample_data_copy; } diff --git a/loadgen/logging.cc b/loadgen/logging.cc index bd8e377fa..d33074c01 100644 --- a/loadgen/logging.cc +++ b/loadgen/logging.cc @@ -279,19 +279,25 @@ void AsyncLog::StopTrace() { } void AsyncLog::LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx, - const LogBinaryAsHexString& response) { + const LogBinaryAsHexString& response, + int64_t n_tokens = 0) { std::unique_lock lock(log_mutex_); if (!accuracy_out_) { return; } *accuracy_out_ << (accuracy_needs_comma_ ? ",\n{ " : "\n{ "); - if (!use_tokens_ || !needs_first_token_){ + if (!use_tokens_){ LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", response); - } else { + } else if (!needs_first_token_) + { + LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", + response, "token_count", n_tokens); + } + else { const size_t i = seq_id - latencies_first_sample_sequence_id_; LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", - response, "token_data", token_records_[i]); + response, "token_data", token_records_[i], "token_count", n_tokens); } *accuracy_out_ << " }"; diff --git a/loadgen/logging.h b/loadgen/logging.h index b9c032a72..e62825859 100644 --- a/loadgen/logging.h +++ b/loadgen/logging.h @@ -226,7 +226,7 @@ class AsyncLog { void SetCurrentPidTid(uint64_t pid, uint64_t tid); void LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx, - const LogBinaryAsHexString& response); + const LogBinaryAsHexString& response, int64_t n_tokens); void CacheToken(uint64_t seq_id, const LogBinaryAsHexString& response); template diff --git a/loadgen/results.cc b/loadgen/results.cc index d5173f441..4fbfa74b1 100644 --- a/loadgen/results.cc +++ b/loadgen/results.cc @@ -650,6 +650,10 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { if (!settings.use_token_latencies){ MLPERF_LOG(detail, "early_stopping_result", early_stopping_recommendation); } else{ + std::replace(early_stopping_ttft_recommendation.begin(), + early_stopping_ttft_recommendation.end(), '\n', ' '); + std::replace(early_stopping_tpot_recommendation.begin(), + early_stopping_tpot_recommendation.end(), '\n', ' '); MLPERF_LOG(detail, "early_stopping_ttft_result", early_stopping_ttft_recommendation); MLPERF_LOG(detail, "early_stopping_tpot_result", early_stopping_tpot_recommendation); } From 357ccefca4db55ff373e029d5d1e3e6246bde5d4 Mon Sep 17 00:00:00 2001 From: Zhihan Jiang 
<68881590+nvzhihanj@users.noreply.github.com> Date: Tue, 6 Feb 2024 20:24:06 -0800 Subject: [PATCH 15/75] Enable equal issue mode for LLM benchmarks (#1610) * Enable equal issue mode for LLM benchmarks * Reduce min_query_count to 1 for server/MS/SS * Remove scenario * Remove min_query_count so default is used; revoke padding change for equal issue offline * Pad min_queries, not samples_per_query for non-offline * Add documentation to the sample equal issue --- loadgen/loadgen.cc | 45 +++++++++++++++++++++++++++------------------ mlperf.conf | 13 ++++++------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 7b4fcbbd2..464f2a741 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -227,10 +227,8 @@ auto SampleDistribution(size_t sample_count, auto& gen) mutable { return dist(gen); }; } -/// \brief SampleDistribution for 3D-UNet SingleStream, for v2.0 -// FIXME: meant for 3D UNet SingleStream only at the moment but the logic should -// work for others -// TODO: consolidate the distribution generator after v2.0 +/// \brief Sample across the dataset, and ensure coverage of each of the samples. +// Useful for non-uniform dataset (e.g. Llama2, GPTJ, 3d-unet) auto SampleDistributionEqualIssue(size_t sample_count, size_t set_size, std::mt19937* rng) { std::vector indices; @@ -306,8 +304,6 @@ std::vector GenerateQueries( auto sample_distribution_unique = SampleDistribution( loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng); - // FIXME: Only used for v2.0 3D-UNet KiTS19 SingleStream - // TODO: Need to consolidate the code for any generic usage after v2.0 auto sample_distribution_equal_issue = SampleDistributionEqualIssue(min_queries, loaded_samples.size(), @@ -317,14 +313,24 @@ std::vector GenerateQueries( ScheduleDistribution(settings.target_qps); // When sample_concatenate_permutation is turned on, pad to a multiple of the - // complete dataset to ensure complete fairness. - // FIXME: Only override this for Offline; fix after v2.0 - if (settings.sample_concatenate_permutation && - scenario == TestScenario::Offline && - samples_per_query % loaded_samples.size() != 0) { - size_t pad_size = + // complete dataset to ensure fairness. + auto enable_equal_issue = settings.sample_concatenate_permutation; + if (enable_equal_issue) + { + if (scenario == TestScenario::Offline && + samples_per_query % loaded_samples.size() != 0) + { + // In offline mode, we pad samples_per_query + size_t pad_size = (loaded_samples.size() - samples_per_query % loaded_samples.size()); - samples_per_query += pad_size; + samples_per_query += pad_size; + } + else if (min_queries % loaded_samples.size() != 0) + { + // In Server, SingleStream, MultiStream mode, the min_queries should be padded + size_t pad_size = (loaded_samples.size() - min_queries % loaded_samples.size()); + min_queries += pad_size; + } } std::vector samples(samples_per_query); @@ -380,16 +386,12 @@ std::vector GenerateQueries( } } } else { - // FIXME: only used for v2.0 3D-UNet KiTS19 SingleStream - // TODO: consolidate after v2.0 - auto equal_issue = settings.sample_concatenate_permutation && - scenario == TestScenario::SingleStream; for (auto& s : samples) { s = loaded_samples[settings.performance_issue_unique ? sample_distribution_unique(sample_rng) : settings.performance_issue_same ? same_sample - : equal_issue + : enable_equal_issue ? 
sample_distribution_equal_issue(sample_rng) : sample_distribution(sample_rng)]; } @@ -397,6 +399,13 @@ std::vector GenerateQueries( queries.emplace_back(samples, timestamp, response_delegate, sequence_gen); prev_timestamp = timestamp; timestamp += schedule_distribution(schedule_rng); + // In equal_issue mode, the min_queries will be bumped up by a multiple of the dataset size + // if the test time has not met the threshold. + if (enable_equal_issue && (queries.size() >= min_queries) && + (prev_timestamp < gen_duration) && (scenario != TestScenario::Offline)) + { + min_queries += loaded_samples.size(); + } } // See if we need to create a "remainder" query for offline+accuracy to diff --git a/mlperf.conf b/mlperf.conf index c02222668..7e286b565 100644 --- a/mlperf.conf +++ b/mlperf.conf @@ -12,6 +12,8 @@ bert.*.performance_sample_count_override = 10833 dlrm.*.performance_sample_count_override = 204800 dlrm-v2.*.performance_sample_count_override = 204800 rnnt.*.performance_sample_count_override = 2513 +gptj.*.performance_sample_count_override = 13368 +llama2-70b.*.performance_sample_count_override = 24576 stable-diffusion-xl.*.performance_sample_count_override = 5000 # set to 0 to let entire sample set to be performance sample 3d-unet.*.performance_sample_count_override = 0 @@ -28,28 +30,25 @@ stable-diffusion-xl.*.performance_sample_count_override = 5000 *.SingleStream.target_latency_percentile = 90 *.SingleStream.min_duration = 600000 -#*.SingleStream.min_query_count = 1024 *.MultiStream.target_latency_percentile = 99 *.MultiStream.samples_per_query = 8 *.MultiStream.min_duration = 600000 -#*.MultiStream.min_query_count = 270336 *.MultiStream.min_query_count = 662 retinanet.MultiStream.target_latency = 528 - -# 3D-UNet uses equal issue mode +# 3D-UNet uses equal issue mode because it has non-uniform inputs 3d-unet.*.sample_concatenate_permutation = 1 -# GPT-J uses equal issue mode for Single-Stream -gptj.SingleStream.sample_concatenate_permutation = 1 +# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario gptj.Server.sample_concatenate_permutation = 1 +gptj.SingleStream.sample_concatenate_permutation = 1 +llama2-70b.Server.sample_concatenate_permutation = 1 *.Server.target_latency = 10 *.Server.target_latency_percentile = 99 *.Server.target_duration = 0 *.Server.min_duration = 600000 -#*.Server.min_query_count = 270336 resnet50.Server.target_latency = 15 retinanet.Server.target_latency = 100 bert.Server.target_latency = 130 From 44285d9a44041c3d4f7df0acf1e72c375e9fcbd2 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Tue, 6 Feb 2024 23:47:02 -0500 Subject: [PATCH 16/75] Set completed samples per second as llama metric (#1613) --- loadgen/results.cc | 22 +++++++++++++++++++--- tools/submission/submission_checker.py | 4 ++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/loadgen/results.cc b/loadgen/results.cc index 4fbfa74b1..55cc83a36 100644 --- a/loadgen/results.cc +++ b/loadgen/results.cc @@ -399,10 +399,18 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { // the 1 second time point; but that would be the 1001th sample in // the stream. Given the first 1001 queries, the QPS is // 1000 queries / 1 second. 
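As an editorial aside (hypothetical numbers, not taken from any real log), the difference between the scheduled rate described in the comment above and the completed rate that this patch reports for token-latency workloads can be illustrated as follows:

```python
# Illustrative only: the two Server-scenario rates computed from the same run.
sample_count = 10001                      # samples completed in the run (hypothetical)
final_query_scheduled_time = 10.0         # s: when the last query was scheduled
final_query_all_samples_done_time = 10.8  # s: when its last sample finished

scheduled_per_sec = (sample_count - 1) / final_query_scheduled_time         # 1000.0
completed_per_sec = (sample_count - 1) / final_query_all_samples_done_time  # ~925.9
```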
+ // TODO: make a more permanent solution + if (settings.use_token_latencies){ + double qps_as_completed = + (sample_count - 1) / pr.final_query_all_samples_done_time; + summary("Completed samples per second : ", + DoubleToString(qps_as_completed)); + } else { double qps_as_scheduled = (sample_count - 1) / pr.final_query_scheduled_time; summary("Scheduled samples per second : ", DoubleToString(qps_as_scheduled)); + } break; } case TestScenario::Offline: { @@ -516,10 +524,18 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { summary("QPS w/o loadgen overhead : " + DoubleToString(qps_wo_lg)); summary(""); } else if (settings.scenario == TestScenario::Server) { - double qps_as_completed = + // TODO: make a more permanent solution + if (!settings.use_token_latencies){ + double qps_as_completed = (sample_count - 1) / pr.final_query_all_samples_done_time; - summary("Completed samples per second : ", - DoubleToString(qps_as_completed)); + summary("Completed samples per second : ", + DoubleToString(qps_as_completed)); + } else { + double qps_as_scheduled = + (sample_count - 1) / pr.final_query_scheduled_time; + summary("Scheduled samples per second : ", + DoubleToString(qps_as_scheduled)); + } summary(""); } else if (settings.scenario == TestScenario::MultiStream) { summary("Per-query latency: "); diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index ac76c5dea..5d75da9b7 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1381,11 +1381,11 @@ RESULT_FIELD_BENCHMARK_OVERWRITE = { "llama2-70b-99": { "Offline": "result_tokens_per_second", - "Server": "result_scheduled_samples_per_sec", + "Server": "result_completed_samples_per_sec", }, "llama2-70b-99.9": { "Offline": "result_tokens_per_second", - "Server": "result_scheduled_samples_per_sec", + "Server": "result_completed_samples_per_sec", } } From d45a66c33bb0b6548d6172b38645bf94ffe24b39 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 7 Feb 2024 09:27:20 -0500 Subject: [PATCH 17/75] Add upper limit to tokens per sample (#1612) --- tools/submission/submission_checker.py | 27 ++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 5d75da9b7..a4b9a1421 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1129,7 +1129,9 @@ "stable-diffusion-xl": ("CLIP_SCORE", 31.68631873, "FID_SCORE", 23.01085758) }, "accuracy-upper-limit": { - "stable-diffusion-xl": ("CLIP_SCORE", 31.81331801, "FID_SCORE", 23.95007626) + "stable-diffusion-xl": ("CLIP_SCORE", 31.81331801, "FID_SCORE", 23.95007626), + "llama2-70b-99" : ("TOKENS_PER_SAMPLE", 294.45*1.1), + "llama2-70b-99.9" : ("TOKENS_PER_SAMPLE", 294.45*1.1) }, "performance-sample-count": { "resnet": 1024, @@ -1847,13 +1849,17 @@ def check_accuracy_dir(config, model, path, verbose): acc_targets = [] if acc_upper_limit is not None: acc_limits = [] + up_patterns = [] acc_limit_check = True + for i in range(0, len(acc_upper_limit), 2): + acc_type, acc_target = acc_upper_limit[i:i+2] + acc_limits.append(acc_target) + up_patterns.append(ACC_PATTERN[acc_type]) + for i in range(0, len(target), 2): acc_type, acc_target = target[i:i+2] patterns.append(ACC_PATTERN[acc_type]) acc_targets.append(acc_target) - if acc_upper_limit is not None: - acc_limits.append(acc_upper_limit[i+1]) acc_seen = [False for _ in acc_targets] with open(os.path.join(path, 
"accuracy.txt"), "r", encoding="utf-8") as f: for line in f: @@ -1870,12 +1876,21 @@ def check_accuracy_dir(config, model, path, verbose): elif acc is not None: all_accuracy_valid = False log.warning("%s accuracy not met: expected=%f, found=%s", path, acc_target, acc) - if acc is not None and acc_upper_limit is not None and float(acc) > acc_limits[i]: - acc_limit_check = False - log.warning("%s accuracy not met: upper limit=%f, found=%s", path, acc_limits[i], acc) if i == 0 and acc: result_acc = acc acc = None + if acc_upper_limit is not None: + for i, (pattern, acc_limit) in enumerate(zip(up_patterns, acc_limits)): + m = re.match(pattern, line) + if m: + acc = m.group(1) + m = re.match(r"^hash=([\w\d]+)$", line) + if m: + hash_val = m.group(1) + if acc is not None and acc_upper_limit is not None and float(acc) > acc_limit: + acc_limit_check = False + log.warning("%s accuracy not met: upper limit=%f, found=%s", path, acc_limit, acc) + acc = None if all(acc_seen) and hash_val: break; is_valid = all_accuracy_valid & all(acc_seen) From d7dba0826ac96a0e8bcae525086961e47c409a38 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 7 Feb 2024 09:50:16 -0500 Subject: [PATCH 18/75] Remove loadgen warnings (#1608) Co-authored-by: Miro --- loadgen/results.cc | 2 ++ loadgen/test_settings_internal.cc | 3 ++- loadgen/test_settings_internal.h | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/loadgen/results.cc b/loadgen/results.cc index 55cc83a36..445de8901 100644 --- a/loadgen/results.cc +++ b/loadgen/results.cc @@ -439,6 +439,8 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { summary("Tokens per second: ", tokens_per_second); break; } + case TestScenario::Server: + break; } } diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 814814df8..2bcf62c29 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -690,12 +690,13 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, lookupkv(model, scenario, "test05_schedule_rng_seed", &test05_schedule_rng_seed, nullptr); // keys that apply to token metrics - if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)) + if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)){ use_token_latencies = (val == 1) ? 
true : false; if (use_token_latencies){ lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr, 1000 * 1000); lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr, 1000 * 1000); } + } // keys that apply to SingleStream lookupkv(model, "SingleStream", "target_latency_percentile", nullptr, diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index 3392dd59d..5222f3156 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -83,8 +83,8 @@ struct TestSettingsInternal { bool sample_concatenate_permutation; bool use_token_latencies = false; - uint64_t server_ttft_latency; - uint64_t server_tpot_latency; + int64_t server_ttft_latency; + int64_t server_tpot_latency; }; /// \brief A namespace of collections of FindPeakPerformance helper functions, From b0777f0bd3b7e5cd6a51d670d8b4df656cf6fb29 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 7 Feb 2024 15:04:47 +0000 Subject: [PATCH 19/75] Update README.md - remove unwanted lines in CM commands (#1601) * Update README.md No longer need custom fork as the relevant changes are in the inference repository * Update dataset.py --------- Co-authored-by: Miro --- language/bert/README.md | 4 ---- language/llama2-70b/dataset.py | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/language/bert/README.md b/language/bert/README.md index 1c5fefe3a..144e6d9d0 100644 --- a/language/bert/README.md +++ b/language/bert/README.md @@ -51,8 +51,6 @@ The below CM command will launch the SUT server ``` cm run script --tags=generate-run-cmds,inference --model=bert-99 --backend=pytorch \ ---rerun --adr.mlperf-implementation.version=custom \ ---adr.mlperf-implementation.tags=_repo.https://github.com/GATEOVerflow/inference \ --mode=performance --device=cuda --quiet --test_query_count=1000 --network=sut ``` @@ -61,8 +59,6 @@ Once the SUT server is launched, the below command can be run on the loadgen nod ``` cm run script --tags=generate-run-cmds,inference --model=bert-99 --backend=pytorch --rerun \ ---adr.mlperf-implementation.version=custom \ ---adr.mlperf-implementation.tags=_repo.https://github.com/GATEOVerflow/inference \ --mode=performance --device=cuda --quiet --test_query_count=1000 \ --sut_servers,=http://localhost:8000 --network=lon ``` diff --git a/language/llama2-70b/dataset.py b/language/llama2-70b/dataset.py index a59fd7f55..4b1b1bb91 100644 --- a/language/llama2-70b/dataset.py +++ b/language/llama2-70b/dataset.py @@ -84,6 +84,8 @@ def postProcess(self, out_tokens, input_seq_lens=None, query_id_list=None, sampl assert len(query_id_list) == output_seq.shape[0] # Save outputs + if not os.path.exists("run_outputs"): + os.makedirs("run_outputs") fname = "q" + "_".join([str(i) for i in query_id_list]) fname = f"run_outputs/{fname}.pkl" with open(fname, mode='wb') as f: From 3190d09a148bc1d72c9d946e4cf3b54db0d0e60a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 7 Feb 2024 15:12:27 +0000 Subject: [PATCH 20/75] Typo fix in README.md (#1588) Co-authored-by: Miro --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1cfa2721..dde466980 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ For power submissions please use [SPEC PTD 1.10](https://github.com/mlcommons/po | 3d-unet | [vision/medical_imaging/3d-unet-kits19](https://github.com/mlcommons/inference/tree/master/vision/medical_imaging/3d-unet-kits19) | pytorch, tensorflow, onnx | KiTS19 | edge,datacenter | | rnnt | 
[speech_recognition/rnnt](https://github.com/mlcommons/inference/tree/master/speech_recognition/rnnt) | pytorch | OpenSLR LibriSpeech Corpus | edge,datacenter | | gpt-j | [language/gpt-j](https://github.com/mlcommons/inference/tree/master/language/gpt-j)| pytorch | CNN-Daily Mail | edge,datacenter | -| stable-diffusion-xl | [text_to_image] (https://github.com/mlcommons/inference/tree/master/text_to_image) | pytorch | COCO 2014| edge,datacenter | +| stable-diffusion-xl | [text_to_image](https://github.com/mlcommons/inference/tree/master/text_to_image) | pytorch | COCO 2014| edge,datacenter | | llama2-70b | [language/llama2-70b](https://github.com/mlcommons/inference/tree/master/language/llama2-70b) | pytorch | OpenOrca | datacenter | * Framework here is given for the reference implementation. Submitters are free to use their own frameworks to run the benchmark. From 840435a1abe8be823e0a5a18ad3173bc520aa24f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 7 Feb 2024 15:13:05 +0000 Subject: [PATCH 21/75] Update README.md with CM commands to download stable-diffusion, gptj and dlrmv2 models (#1604) * Update README.md Add CM commands to download Stable diffusion models * Update README.md * Update README.md --- language/gpt-j/README.md | 8 ++++++++ recommendation/dlrm_v2/pytorch/README.md | 9 +++++++++ text_to_image/README.md | 8 ++++++++ 3 files changed, 25 insertions(+) diff --git a/language/gpt-j/README.md b/language/gpt-j/README.md index cc46135f2..58af37ba3 100644 --- a/language/gpt-j/README.md +++ b/language/gpt-j/README.md @@ -74,6 +74,14 @@ Please download the fine-tuned GPT-J checkpoint from [here](https://cloud.mlcomm wget https://cloud.mlcommons.org/index.php/s/QAZ2oM94MkFtbQx/download --output-document checkpoint.zip ``` +The following MLCommons CM commands can be used to programmatically download the model checkpoint. + +``` +pip install cmind +cm pull repo mlcommons@ck +cm run script --tags=get,ml-model,gptj,_pytorch,_rclone -j +``` + ### Running the Benchmark Replace the model and dataset path arguments with your corresponding paths. For evaluating the ROUGE score after the run, include --accuracy as shown below. For user specific target qps, please include user.conf. ``` diff --git a/recommendation/dlrm_v2/pytorch/README.md b/recommendation/dlrm_v2/pytorch/README.md index 313bf1e1f..64d2af249 100755 --- a/recommendation/dlrm_v2/pytorch/README.md +++ b/recommendation/dlrm_v2/pytorch/README.md @@ -79,6 +79,15 @@ You can download the weights by running: wget https://cloud.mlcommons.org/index.php/s/XzfSeLgW8FYfR3S/download -O weights.zip unzip weights.zip ``` + +The following MLCommons CM commands can be used to programmatically download the model checkpoint. + +``` +pip install cmind +cm pull repo mlcommons@ck +cm run script --tags=get,ml-model,dlrm,_pytorch,_weight_sharded,_rclone -j +``` + (optional) To speed up future downloads, we recommend you save the weights in a bucket (E.g GCP, AWS). 
For example, after saving the checkpoint in a GCP bucket, you can download the weights faster by running: ``` export BUCKET_NAME= diff --git a/text_to_image/README.md b/text_to_image/README.md index 26393f54f..b8573624c 100644 --- a/text_to_image/README.md +++ b/text_to_image/README.md @@ -8,6 +8,14 @@ This is the reference implementation for MLPerf Inference text to image | ---- | ---- | ---- | ---- | ---- | ---- | ---- | | StableDiffusion | - | Coco2014 | [fp32](https://cloud.mlcommons.org/index.php/s/DjnCSGyNBkWA4Ro) and [f16](https://cloud.mlcommons.org/index.php/s/LCdW5RM6wgGWbxC) | [Hugging Face](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) | fp32 | NCHW | +The following MLCommons CM commands can be used to programmatically download the model checkpoints. + +``` +pip install cmind +cm pull repo mlcommons@ck +cm run script --tags=get,ml-model,sdxl,_fp16,_rclone -j +cm run script --tags=get,ml-model,sdxl,_fp32,_rclone -j +``` ## Dataset | Data | Description | From 817dd961221d3ca284b024368130bd5dc5e4b54a Mon Sep 17 00:00:00 2001 From: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com> Date: Wed, 7 Feb 2024 20:50:01 -0800 Subject: [PATCH 22/75] Turn equal issue mode off for TEST06 (#1615) * Turn equal issue mode off for Llama2 TEST06 * Add TEST06 to the output dir --- compliance/nvidia/TEST06/audit.config | 4 +++- compliance/nvidia/TEST06/run_verification.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/compliance/nvidia/TEST06/audit.config b/compliance/nvidia/TEST06/audit.config index c8888be50..090ec03b1 100644 --- a/compliance/nvidia/TEST06/audit.config +++ b/compliance/nvidia/TEST06/audit.config @@ -8,4 +8,6 @@ *.*.accuracy_log_rng_seed = 720381539243781796 *.*.accuracy_log_sampling_target = 100 *.*.min_query_count = 100 -*.*.min_duration = 0 \ No newline at end of file +*.*.min_duration = 0 +# Turn off equal issue mode for TEST06 +*.*.sample_concatenate_permutation = 0 \ No newline at end of file diff --git a/compliance/nvidia/TEST06/run_verification.py b/compliance/nvidia/TEST06/run_verification.py index 698d5d6dc..6cdb082e6 100644 --- a/compliance/nvidia/TEST06/run_verification.py +++ b/compliance/nvidia/TEST06/run_verification.py @@ -120,7 +120,7 @@ def main(): output += "TEST06 verification failed\n" # Output test output to console and folder - output_dir = args.output_dir + output_dir = os.path.join(args.output_dir, "TEST06") output_accuracy_dir = os.path.join(args.output_dir, "accuracy") if not os.path.isdir(output_dir): From 0ed5190e82e1ca1cf95626e50e61cb6678c7f2ff Mon Sep 17 00:00:00 2001 From: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:29:58 -0800 Subject: [PATCH 23/75] Fix submission checker and TEST06 for Llama2 (#1616) * Fix submission checker and TEST06 for Llama2 * Remove redundant line * Move test_dir check --- compliance/nvidia/TEST06/run_verification.py | 2 +- tools/submission/submission_checker.py | 212 ++++++++++--------- 2 files changed, 118 insertions(+), 96 deletions(-) diff --git a/compliance/nvidia/TEST06/run_verification.py b/compliance/nvidia/TEST06/run_verification.py index 6cdb082e6..03c60c2ef 100644 --- a/compliance/nvidia/TEST06/run_verification.py +++ b/compliance/nvidia/TEST06/run_verification.py @@ -121,7 +121,7 @@ def main(): # Output test output to console and folder output_dir = os.path.join(args.output_dir, "TEST06") - output_accuracy_dir = os.path.join(args.output_dir, "accuracy") + output_accuracy_dir = os.path.join(output_dir, "accuracy") if not 
os.path.isdir(output_dir): os.makedirs(output_dir) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index a4b9a1421..0bae5b0d4 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1392,13 +1392,10 @@ } LLAMA2_LATENCY_LIMITS = { - "interactive": { - "ttft": 500, - "tpot": 50 - }, + # We might add interactive in the next round. Latency in ns "conversational": { - "ttft": 2000, - "tpot": 200 + "ttft": 2000 * 1000000, + "tpot": 200 * 1000000 } } @@ -1920,16 +1917,20 @@ def check_accuracy_dir(config, model, path, verbose): def extra_check_llama2(mlperf_log, scenario): - if (mlperf_log["use_token_latencies"]): + if (mlperf_log["requested_use_token_latencies"]): if scenario == "Offline": # For offline no further checks are necessary return None, True else: for constraint, limits in LLAMA2_LATENCY_LIMITS.items(): - if mlperf_log["result_first_token_99.9_percentile_latency_ns"] < limits["ttft"] and mlperf_log["result_time_to_output_token"] < limits["tpot"]: + if mlperf_log["result_first_token_99.00_percentile_latency_ns"] < limits["ttft"] and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] < limits["tpot"]: return constraint, True else: + log.error(f'use_token_latencies flag needs to be enabled for Llama2 benchmark') return None, False + + log.error(f'Failed Llama2 extra check for TTFT and TPOT. TTFT 99-tile: {mlperf_log["result_first_token_99.00_percentile_latency_ns"]}, TPOT 99-tile: {mlperf_log["result_time_per_output_token_99.00_percentile_ns"]}') + return None, False def get_performance_metric( @@ -2986,6 +2987,7 @@ def log_result( n = ["run_1"] for i in n: + is_valid = True perf_path = os.path.join(name, "performance", i) if not os.path.exists(perf_path): log.error("%s is missing", perf_path) @@ -3432,90 +3434,106 @@ def check_compliance_acc_dir(test_dir, model, config): if not os.path.exists(fname): log.error("%s is missing in %s", fname, test_dir) else: - # Accuracy can fail for TEST01 - is_valid = True - with open(fname, "r") as f: - for line in f: - # look for: TEST PASS - if "TEST PASS" in line: - acc_passed = True - break - if acc_passed == False: - log.info( - "Compliance test accuracy check (deterministic mode) in %s failed", - test_dir, - ) + if "TEST01" in test_dir: + # Accuracy can fail for TEST01 + is_valid = True + with open(fname, "r") as f: + for line in f: + # look for: TEST PASS + if "TEST PASS" in line: + acc_passed = True + break + if acc_passed == False: + log.info( + "Compliance test accuracy check (deterministic mode) in %s failed", + test_dir, + ) - # Check Accuracy dir - test_acc_path = os.path.join(test_dir, "accuracy") - if not os.path.exists(test_acc_path): - log.error("%s has no accuracy directory", test_dir) - is_valid = False - else: - diff = files_diff( - list_files(test_acc_path), - REQUIRED_TEST01_ACC_FILES_1 - if acc_passed - else REQUIRED_TEST01_ACC_FILES, - ) - if diff: - log.error("%s has file list mismatch (%s)", test_acc_path, diff) + # Check Accuracy dir + test_acc_path = os.path.join(test_dir, "accuracy") + if not os.path.exists(test_acc_path): + log.error("%s has no accuracy directory", test_dir) is_valid = False - elif not acc_passed: - target = config.get_accuracy_target(model) - patterns = [] - acc_types = [] - for i in range(0, len(target), 2): - acc_type = target[i:i+2] - acc_types.append(acc_type) - patterns.append(ACC_PATTERN[acc_type[0]]) - acc_seen = [False for _ in acc_type] - - - - more_accurate = model.find("99.9") - if more_accurate 
== -1: - required_delta_perc = 1 - else: - required_delta_perc = 0.1 - - acc_baseline = { - acc_type: 0 for acc_type in acc_types - } - acc_compliance = { - acc_type: 0 for acc_type in acc_types - } - with open( - os.path.join(test_acc_path, "baseline_accuracy.txt"), - "r", - encoding="utf-8", - ) as f: - for line in f: - for acc_type, pattern in zip(acc_types, patterns): - m = re.match(pattern, line) - if m: - acc_baseline[acc_type] = float(m.group(1)) - with open( - os.path.join(test_acc_path, "compliance_accuracy.txt"), - "r", - encoding="utf-8", - ) as f: - for line in f: - for acc_type, pattern in zip(acc_types, patterns): - m = re.match(pattern, line) - if m: - acc_compliance[acc_type] = float(m.group(1)) - for acc_type in acc_types: - if acc_baseline[acc_type] == 0 or acc_compliance[acc_type] == 0: - is_valid = False - break + else: + diff = files_diff( + list_files(test_acc_path), + REQUIRED_TEST01_ACC_FILES_1 + if acc_passed + else REQUIRED_TEST01_ACC_FILES, + ) + if diff: + log.error("%s has file list mismatch (%s)", test_acc_path, diff) + is_valid = False + elif not acc_passed: + target = config.get_accuracy_target(model) + patterns = [] + acc_types = [] + for i in range(0, len(target), 2): + acc_type = target[i:i+2] + acc_types.append(acc_type) + patterns.append(ACC_PATTERN[acc_type[0]]) + acc_seen = [False for _ in acc_type] + + more_accurate = model.find("99.9") + if more_accurate == -1: + required_delta_perc = 1 else: - delta_perc = abs(1 - acc_baseline[acc_type] / acc_compliance[acc_type]) * 100 - if delta_perc <= required_delta_perc: - is_valid = True - else: + required_delta_perc = 0.1 + acc_baseline = { + acc_type: 0 for acc_type in acc_types + } + acc_compliance = { + acc_type: 0 for acc_type in acc_types + } + with open( + os.path.join(test_acc_path, "baseline_accuracy.txt"), + "r", + encoding="utf-8", + ) as f: + for line in f: + for acc_type, pattern in zip(acc_types, patterns): + m = re.match(pattern, line) + if m: + acc_baseline[acc_type] = float(m.group(1)) + with open( + os.path.join(test_acc_path, "compliance_accuracy.txt"), + "r", + encoding="utf-8", + ) as f: + for line in f: + for acc_type, pattern in zip(acc_types, patterns): + m = re.match(pattern, line) + if m: + acc_compliance[acc_type] = float(m.group(1)) + for acc_type in acc_types: + if acc_baseline[acc_type] == 0 or acc_compliance[acc_type] == 0: is_valid = False break + else: + delta_perc = abs(1 - acc_baseline[acc_type] / acc_compliance[acc_type]) * 100 + if delta_perc <= required_delta_perc: + is_valid = True + else: + is_valid = False + break + elif "TEST06" in test_dir: + """ + Expected output + First token check pass: True (or First token check pass: Skipped) + EOS check pass: True + TEST06 verification complete + """ + with open(fname, "r") as f: + lines = f.readlines() + lines = [line.strip() for line in lines] + first_token_pass = "First token check pass: True" in lines or "First token check pass: Skipped" in lines + eos_pass = "EOS check pass: True" in lines + length_check_pass = "Sample length check pass: True" in lines + is_valid = first_token_pass and eos_pass and length_check_pass + if not is_valid: + log.error(f"TEST06 accuracy check failed. 
first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}.") + else: + raise NotImplemented(f"{test_dir} is neither TEST01 and TEST06, which doesn't require accuracy check") return is_valid @@ -3566,13 +3584,16 @@ def check_compliance_dir( log.error("no compliance dir for %s: %s", name, compliance_dir) return False - # Check performance of all Tests + # Check performance of all Tests (except for TEST06) for test in test_list: test_dir = os.path.join(compliance_dir, test) if not os.path.exists(test_dir): log.error("Missing %s in compliance dir %s", test, compliance_dir) compliance_perf_dir_pass = False else: + # TEST06 has no performance test. + if "TEST06" in test_list: + continue try: compliance_perf_dir = os.path.join( compliance_dir, test, "performance", "run_1" @@ -3595,13 +3616,14 @@ def check_compliance_dir( and compliance_perf_valid ) - if "TEST01" in test_list: - # Check accuracy for TEST01 - compliance_acc_pass = check_compliance_acc_dir( - os.path.join(compliance_dir, "TEST01"), model, config - ) - else: - compliance_acc_pass= True + compliance_acc_pass= True + for test in ["TEST01", "TEST06"]: + if test in test_list: + # Check accuracy for TEST01 + compliance_acc_pass &= check_compliance_acc_dir( + os.path.join(compliance_dir, test), model, config + ) + return compliance_perf_pass and compliance_acc_pass and compliance_perf_dir_pass From f06b920eac2170df38b5502c7443f50c9afa6045 Mon Sep 17 00:00:00 2001 From: Jinho Suh <83969361+nv-jinhosuh@users.noreply.github.com> Date: Mon, 12 Feb 2024 13:47:52 -0600 Subject: [PATCH 24/75] Bugfix: equal-issue mode on offline causing accuracy run to fail (3D-UNet) (#1624) Currently 3D-UNet is the only workload using equal-issue mode on Offline scenario. Recent code change on LLM equal-issue mode caused 3D-UNet accuracy run to run more than 1 queries, causing the accuracy log to bloat and fail the accuracy checking script. This change fixes the problem described above. --- loadgen/loadgen.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 464f2a741..76e59151b 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -315,7 +315,7 @@ std::vector GenerateQueries( // When sample_concatenate_permutation is turned on, pad to a multiple of the // complete dataset to ensure fairness. 
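The padding referred to in the comment above is plain round-up arithmetic; a small editorial sketch with hypothetical counts, mirroring the pad_size computation introduced earlier in this series:

```python
# Illustrative only: round a count up to the next multiple of the loaded sample
# set, as equal-issue padding does for samples_per_query (Offline) or min_queries.
def pad_to_multiple(count: int, set_size: int) -> int:
    remainder = count % set_size
    return count if remainder == 0 else count + (set_size - remainder)

assert pad_to_multiple(100, 24576) == 24576     # e.g. Server min_queries
assert pad_to_multiple(50000, 24576) == 73728   # e.g. Offline samples_per_query
```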
auto enable_equal_issue = settings.sample_concatenate_permutation; - if (enable_equal_issue) + if (mode != TestMode::AccuracyOnly && enable_equal_issue) { if (scenario == TestScenario::Offline && samples_per_query % loaded_samples.size() != 0) From f9a643c0a0e920588da1b51a1d822e1071a9dbec Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Mon, 12 Feb 2024 14:49:27 -0500 Subject: [PATCH 25/75] Add number of tokens to offline (#1623) --- language/llama2-70b/SUT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 228324d73..4ff78c38c 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -207,9 +207,10 @@ def process_queries(self): query_id_list=query_ids) for i in range(len(qitem)): + n_tokens = processed_output[i].shape[0] response_array = array.array("B", processed_output[i].tobytes()) bi = response_array.buffer_info() - response = [lg.QuerySampleResponse(qitem[i].id, bi[0], bi[1])] + response = [lg.QuerySampleResponse(qitem[i].id, bi[0], bi[1], n_tokens)] lg.QuerySamplesComplete(response) tok = time.time() From 486a629ea4d5c5150f452d0b0a196bf71fd2021e Mon Sep 17 00:00:00 2001 From: Jinho Suh <83969361+nv-jinhosuh@users.noreply.github.com> Date: Thu, 15 Feb 2024 14:46:50 -0600 Subject: [PATCH 26/75] Hotfix: DLRMv2 Audit Test01 fallback failure (#1626) * Hotfix: DLRMv2 Audit Test01 fallback failure DLRMv2 Audit TEST01 may go to fallback route and the accuracy check script (accuracy-dlrm.py) didn't expect this to happen. It always expects entire sample set to be in the accuracy log while Audit TEST01 would generate subset only. This fixes the Audit TEST01 failure described above. * typo fix --- recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py b/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py index 873f6a0e1..ce662071f 100644 --- a/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py +++ b/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py @@ -43,7 +43,9 @@ def get_targets(args, qsl_indices): with open(args.aggregation_trace_file) as f: for line in f: sample_boundaries.append(sample_boundaries[-1] + int(line.split(", ")[2])) - assert len(sample_boundaries) == len(qsl_indices) + 1, "Number of samples in trace file does not match number of samples in loadgen accuracy log!" 
+ if len(sample_boundaries) != len(qsl_indices) + 1: + print("Warning: number of samples in trace file ({}) does not match number of samples ({}) in " + "loadgen accuracy log!".format(len(sample_boundaries)-1, len(qsl_indices))) # Get all the ground truth labels in the original order in day_23 print("Parsing ground truth labels from day_23 file...") ground_truths = [] From de31ee2cc35db7704f11e615cf5ee63a82f17b0a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 20 Feb 2024 17:25:36 +0000 Subject: [PATCH 27/75] Fix preprocess_submission script to copy the code path while inferring low accuracy results (#1627) --- tools/submission/preprocess_submission.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index d6e021253..fee2aadde 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -247,6 +247,13 @@ def infer_scenario_results(filter_submitter, noinfer_low_accuracy_results, confi shutil.copytree(high_accuracy_model_path, \ low_accuracy_model_path) + high_accuracy_model_code_path = os.path.join(log_path, "..", \ + "code", model) + low_accuracy_model_code_path = os.path.join(log_path, "..", \ + "code", low_accuracy_model) + if not os.path.exists(low_accuracy_model_code_path): + shutil.copytree(high_accuracy_model_code_path, \ + low_accuracy_model_code_path) From 268bc9dc8a3c0a96bbb7d38482c0ce5016507633 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Tue, 13 Feb 2024 11:20:33 -0500 Subject: [PATCH 28/75] Add batching to SD reference --- text_to_image/main.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/text_to_image/main.py b/text_to_image/main.py index 0b93ceb21..32425762f 100644 --- a/text_to_image/main.py +++ b/text_to_image/main.py @@ -394,11 +394,18 @@ def main(): count = ds.get_item_count() # warmup - ds.load_query_samples([0]) + syntetic_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit" + latents_pt = torch.rand(ds.latents.shape, dtype=dtype).to(args.device) + warmup_samples = [ + { + "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), + "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), + "latents": latents_pt, + } + for _ in range(args.max_batchsize) + ] for i in range(5): - captions, _ = ds.get_samples([0]) - _ = backend.predict(captions) - ds.unload_query_samples(None) + _ = backend.predict(warmup_samples) scenario = SCENARIO_MAP[args.scenario] runner_map = { From 5d0c221255b3c86759bbf7e20df521c4bc7c8469 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Wed, 21 Feb 2024 12:13:25 -0600 Subject: [PATCH 29/75] Add Rclone-Cloudflare download instructions to README.md --- text_to_image/README.md | 51 ++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/text_to_image/README.md b/text_to_image/README.md index b8573624c..243067459 100644 --- a/text_to_image/README.md +++ b/text_to_image/README.md @@ -4,18 +4,11 @@ This is the reference implementation for MLPerf Inference text to image ## Supported Models -| model | accuracy | dataset | model link | model source | precision | notes | -| ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| StableDiffusion | - | Coco2014 | [fp32](https://cloud.mlcommons.org/index.php/s/DjnCSGyNBkWA4Ro) and [f16](https://cloud.mlcommons.org/index.php/s/LCdW5RM6wgGWbxC) | [Hugging Face](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) | fp32 | NCHW | +| 
model | accuracy | dataset | model source | precision | notes | +| ---- | ---- | ---- | ---- | ---- | ---- | +| StableDiffusion | - | Coco2014 | [Hugging Face](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) | fp32 | NCHW | -The following MLCommons CM commands can be used to programmatically download the model checkpoints. -``` -pip install cmind -cm pull repo mlcommons@ck -cm run script --tags=get,ml-model,sdxl,_fp16,_rclone -j -cm run script --tags=get,ml-model,sdxl,_fp32,_rclone -j -``` ## Dataset | Data | Description | @@ -55,7 +48,43 @@ CFLAGS="-std=c++14" python setup.py install ### Download model -We host two checkpoints ([fp32](https://cloud.mlcommons.org/index.php/s/DjnCSGyNBkWA4Ro) and [f16](https://cloud.mlcommons.org/index.php/s/LCdW5RM6wgGWbxC)) that are a snapshot of the [Hugging Face](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) pipeline at the time of the release of the benchmark. Download them and move them to your model path. +We host two checkpoints (fp32 and fp16) that are a snapshot of the [Hugging Face](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) pipeline at the time of the release of the benchmark. Download them and move them to your model path. + +#### CM method + +The following MLCommons CM commands can be used to programmatically download the model checkpoints. + +``` +pip install cmind +cm pull repo mlcommons@ck +cm run script --tags=get,ml-model,sdxl,_fp16,_rclone -j +cm run script --tags=get,ml-model,sdxl,_fp32,_rclone -j +``` +#### Manual method + +The above command automatically runs a set of Rclone commands to download the data from a Cloudflare R2 bucket. However, if you'd like to run the Rclone commands manually, you can do so as follows: + +To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). 
+To install Rclone on Linux/macOS/BSD systems, run: +``` +sudo -v ; curl https://rclone.org/install.sh | sudo bash +``` +Once Rclone is installed, run the following command to authenticate with the bucket: +``` +rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com +``` +You can then navigate in the terminal to your desired download directory and run the following commands to download the dataset and checkpoints: + +**`fp32`** +``` +rclone copy mlc-inference:mlcommons-inference-wg-public/stable_diffusion_fp32 ./stable_diffusion_fp32 -P +``` +**`fp16`** +``` +rclone copy mlc-inference:mlcommons-inference-wg-public/stable_diffusion_fp16 ./stable_diffusion_fp16 -P +``` + +#### Move to model path ```bash mkdir $MODEL_PATH From dc94ae31918b68aaaf2071b8bc1e3a10548521d4 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Wed, 21 Feb 2024 12:33:02 -0600 Subject: [PATCH 30/75] Add Rclone-Cloudflare download instructiosn to README.md --- recommendation/dlrm_v2/pytorch/README.md | 45 +++++++++++++----------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/recommendation/dlrm_v2/pytorch/README.md b/recommendation/dlrm_v2/pytorch/README.md index 64d2af249..6de850b81 100755 --- a/recommendation/dlrm_v2/pytorch/README.md +++ b/recommendation/dlrm_v2/pytorch/README.md @@ -67,18 +67,15 @@ cd $HOME/mlcommons/inference/loadgen CFLAGS="-std=c++14" python setup.py develop --user ``` + ### Downloading model weights -File name | framework | Size in bytes (`du *`) | MD5 hash (`md5sum *`) --|-|-|- +framework | Size in bytes (`du *`) | MD5 hash (`md5sum *`) +-|-|- N/A | pytorch | <2GB | - -[weight_sharded](https://cloud.mlcommons.org/index.php/s/XzfSeLgW8FYfR3S/download) | pytorch | 97.31GB | - + pytorch | 97.31GB | - -You can download the weights by running: -``` -wget https://cloud.mlcommons.org/index.php/s/XzfSeLgW8FYfR3S/download -O weights.zip -unzip weights.zip -``` +#### CM method The following MLCommons CM commands can be used to programmatically download the model checkpoint. @@ -88,24 +85,32 @@ cm pull repo mlcommons@ck cm run script --tags=get,ml-model,dlrm,_pytorch,_weight_sharded,_rclone -j ``` -(optional) To speed up future downloads, we recommend you save the weights in a bucket (E.g GCP, AWS). For example, after saving the checkpoint in a GCP bucket, you can download the weights faster by running: +#### Manual method + +The above command automatically runs a set of Rclone commands to download the data from a Cloudflare R2 bucket. However, if you'd like to run the Rclone commands manually, you can do so as follows: + +To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). +To install Rclone on Linux/macOS/BSD systems, run: ``` -export BUCKET_NAME= -cd $HOME/mlcommons/inference/recommendation/dlrm_v2/pytorch/model/ -gsutil -m cp -r "gs://$BUCKET_NAME/model_weights/*" . 
+sudo -v ; curl https://rclone.org/install.sh | sudo bash ``` +Once Rclone is installed, run the following command to authenticate with the bucket: +``` +rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com +``` +You can then navigate in the terminal to your desired download directory and run the following commands to download the model weights: -### Downloading dataset -| Original dataset | download link | -| ---- | ---- | -| Criteo Terabyte (day 23) | https://labs.criteo.com/2013/12/download-terabyte-click-logs/ | +``` +rclone copy mlc-inference:mlcommons-inference-wg-public/model_weights ./model_weights -P +``` +#### (optional) -1. The Criteo fake dataset can be created in place of the real datasets in order to facilitate debugging and testing. We provide a fake (random) data generator that can be used to quickly generate data samples in a format compatible with the original dataset. Please use the following script in `./tools` to quickly create random samples for the corresponding models, which will be placed into `./fake_criteo` directory +To speed up future downloads, we recommend you save the weights in a bucket (E.g GCP, AWS). For example, after saving the checkpoint in a GCP bucket, you can download the weights faster by running: ``` -./make_fake_criteo.sh -mv ./fake_criteo .. && cd .. -export DATA_DIR=./fake_criteo +export BUCKET_NAME= +cd $HOME/mlcommons/inference/recommendation/dlrm_v2/pytorch/model/ +gsutil -m cp -r "gs://$BUCKET_NAME/model_weights/*" . ``` From ab747c42bc233f0331a8c673cf5ba2ab068dc95d Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Wed, 21 Feb 2024 12:33:40 -0600 Subject: [PATCH 31/75] Minor wording edit to README.md --- text_to_image/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_to_image/README.md b/text_to_image/README.md index 243067459..e353bfac0 100644 --- a/text_to_image/README.md +++ b/text_to_image/README.md @@ -73,7 +73,7 @@ Once Rclone is installed, run the following command to authenticate with the buc ``` rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com ``` -You can then navigate in the terminal to your desired download directory and run the following commands to download the dataset and checkpoints: +You can then navigate in the terminal to your desired download directory and run the following commands to download the checkpoints: **`fp32`** ``` From d037f22f408db7e0e6b0fa98baea04ef159955a7 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Wed, 21 Feb 2024 12:47:23 -0600 Subject: [PATCH 32/75] Add Rclone-Cloudflare download instructions to README.md --- language/gpt-j/README.md | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/language/gpt-j/README.md b/language/gpt-j/README.md index 58af37ba3..cbb5b3e3c 100644 --- a/language/gpt-j/README.md +++ b/language/gpt-j/README.md @@ -68,11 +68,9 @@ pip install datasets python prepare-calibration.py --calibration-list-file calibration-list.txt --output-dir ``` ### Download GPT-J model -Please download the fine-tuned GPT-J checkpoint from [here](https://cloud.mlcommons.org/index.php/s/QAZ2oM94MkFtbQx) and 
extract it as model/. The download_gptj.py only downloads the default huggingface model which is not fine-tuned on CNN-Daily mail dataset. +Please download the fine-tuned GPT-J checkpoint using the instructions below. The download_gptj.py only downloads the default huggingface model which is not fine-tuned on CNN-Daily mail dataset. -``` -wget https://cloud.mlcommons.org/index.php/s/QAZ2oM94MkFtbQx/download --output-document checkpoint.zip -``` +#### CM method The following MLCommons CM commands can be used to programmatically download the model checkpoint. @@ -82,6 +80,26 @@ cm pull repo mlcommons@ck cm run script --tags=get,ml-model,gptj,_pytorch,_rclone -j ``` +#### Manual method + +The above command automatically runs a set of Rclone commands to download the data from a Cloudflare R2 bucket. However, if you'd like to run the Rclone commands manually, you can do so as follows: + +To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). +To install Rclone on Linux/macOS/BSD systems, run: +``` +sudo -v ; curl https://rclone.org/install.sh | sudo bash +``` +Once Rclone is installed, run the following command to authenticate with the bucket: +``` +rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com +``` +You can then navigate in the terminal to your desired download directory and run the following commands to download the model checkpoint: + +``` +rclone copy mlc-inference:mlcommons-inference-wg-public/gpt-j ./model -P +``` + + ### Running the Benchmark Replace the model and dataset path arguments with your corresponding paths. For evaluating the ROUGE score after the run, include --accuracy as shown below. For user specific target qps, please include user.conf. ``` From 147a91a69fb9857dea70e4be0cc9fd19961cea21 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Wed, 21 Feb 2024 13:03:58 -0600 Subject: [PATCH 33/75] Add Rclone-GDrive download instructions to README.md --- language/llama2-70b/README.md | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index 979488e91..8c6a0c27c 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -63,32 +63,8 @@ Inside the container, set up the environment with `bash build.sh`. This will ins CPU-only setup, as well as any GPU versions for applicable libraries like PyTorch. -## Get Model -+ For now, MLCommons is not hosting the checkpoint, so you must first go to [llama2-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, sign in to huggingface (if you don't have account, you'd need to create one). 
**Please note your authentication credentials** as you may be required to provide them when cloninng below -+ Requires Git Large Files Storage -``` -export CHECKPOINT_PATH=${PWD}/Llama-2-70b-chat-hf -git lfs install -git clone https://huggingface.co/meta-llama/Llama-2-70b-chat-hf ${CHECKPOINT_PATH} - -``` - -## Get Dataset - -``` -# First get the `open-orca` parquet from huggingface -export OPENORCA_DATASET=${PWD}/open-orca -git clone https://huggingface.co/datasets/Open-Orca/OpenOrca ${OPENORCA_DATASET} - -export OPENORCA_PARQUET=${OPENORCA_DATASET}/1M-GPT4-Augmented.parquet -EXPORT_DIR=${PWD}/processed-openorca -export DATASET_PATH=${PWD}/processed-data.pkl - -# Process the dataset according the Taskforce's agreed criteria -python3 processorca.py --dataset_pq_path=${OPENORCA_PARQUET} --model_dir=${CHECKPOINT_PATH} --seqlen_limit=1024 --export_dir=${EXPORT_DIR} --num_total_samples=24576 - -mv ${EXPORT_DIR}/open_orca_gpt4_tokenized_llama.sampled_24576.pkl ${DATASET_PATH} -``` +## Get Model and Dataset +MLCommons hosts the model and preprocessed dataset for download. You must first agree to the [confidentiality notice](https://docs.google.com/forms/d/e/1FAIpQLSc_8VIvRmXM3I8KQaYnKf7gy27Z63BBoI_I1u02f4lw6rBp3g/viewform), then follow the link to a directory containing Rclone download instructions. ## Run Performance Benchmarks From 15d14c95c86c7a42215c162827bab2daaec61da1 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Wed, 21 Feb 2024 16:23:00 -0600 Subject: [PATCH 34/75] Add new and old instructions to README.md --- language/llama2-70b/README.md | 56 +++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index 8c6a0c27c..f648b1b5d 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -63,8 +63,60 @@ Inside the container, set up the environment with `bash build.sh`. This will ins CPU-only setup, as well as any GPU versions for applicable libraries like PyTorch. -## Get Model and Dataset -MLCommons hosts the model and preprocessed dataset for download. You must first agree to the [confidentiality notice](https://docs.google.com/forms/d/e/1FAIpQLSc_8VIvRmXM3I8KQaYnKf7gy27Z63BBoI_I1u02f4lw6rBp3g/viewform), then follow the link to a directory containing Rclone download instructions. +## Get Model +### MLCommons Members Download +MLCommons hosts the model and preprocessed dataset for download exclusively by MLCommons Members. You must first agree to the [confidentiality notice](https://docs.google.com/forms/d/e/1FAIpQLSc_8VIvRmXM3I8KQaYnKf7gy27Z63BBoI_I1u02f4lw6rBp3g/viewform), then follow the link to a directory containing Rclone download instructions. + + +### External Download ++ First go to [llama2-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, sign in to HuggingFace (if you don't have account, you'll need to create one). **Please note your authentication credentials** as you may be required to provide them when cloninng below. ++ Requires Git Large Files Storage +``` +export CHECKPOINT_PATH=${PWD}/Llama-2-70b-chat-hf +git lfs install +git clone https://huggingface.co/meta-llama/Llama-2-70b-chat-hf ${CHECKPOINT_PATH} + +``` + +## Get Dataset + +### Preprocessed + +You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket. + +To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). 
+To install Rclone on Linux/macOS/BSD systems, run: +``` +sudo -v ; curl https://rclone.org/install.sh | sudo bash +``` +Once Rclone is installed, run the following command to authenticate with the bucket: +``` +rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com +``` +You can then navigate in the terminal to your desired download directory and run the following commands to download the model checkpoint: + +``` +rclone copy mlc-inference:mlcommons-inference-wg-public/open_orca ./open_orca -P +``` + +### Unprocessed + +You can also download and process the dataset yourself as follows: + +``` +# First get the `open-orca` parquet from huggingface +export OPENORCA_DATASET=${PWD}/open-orca +git clone https://huggingface.co/datasets/Open-Orca/OpenOrca ${OPENORCA_DATASET} + +export OPENORCA_PARQUET=${OPENORCA_DATASET}/1M-GPT4-Augmented.parquet +EXPORT_DIR=${PWD}/processed-openorca +export DATASET_PATH=${PWD}/processed-data.pkl + +# Process the dataset according the Taskforce's agreed criteria +python3 processorca.py --dataset_pq_path=${OPENORCA_PARQUET} --model_dir=${CHECKPOINT_PATH} --seqlen_limit=1024 --export_dir=${EXPORT_DIR} --num_total_samples=24576 + +mv ${EXPORT_DIR}/open_orca_gpt4_tokenized_llama.sampled_24576.pkl ${DATASET_PATH} +``` ## Run Performance Benchmarks From 46a35c2e5535252a300f98b4c238a9f1e96ccb7d Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Wed, 21 Feb 2024 17:09:29 -0600 Subject: [PATCH 35/75] Tweak language in README.md --- language/llama2-70b/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index f648b1b5d..3fdf71ce2 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -93,7 +93,7 @@ Once Rclone is installed, run the following command to authenticate with the buc ``` rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com ``` -You can then navigate in the terminal to your desired download directory and run the following commands to download the model checkpoint: +You can then navigate in the terminal to your desired download directory and run the following command to download the dataset: ``` rclone copy mlc-inference:mlcommons-inference-wg-public/open_orca ./open_orca -P From c0bd844fd6d1f11c0ec035fd52f07895d4146f3e Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Wed, 21 Feb 2024 17:10:08 -0600 Subject: [PATCH 36/75] Language tweak in README.md --- language/gpt-j/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/gpt-j/README.md b/language/gpt-j/README.md index cbb5b3e3c..061067027 100644 --- a/language/gpt-j/README.md +++ b/language/gpt-j/README.md @@ -93,7 +93,7 @@ Once Rclone is installed, run the following command to authenticate with the buc ``` rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com ``` -You can then navigate in the terminal to your desired download directory and run the 
following commands to download the model checkpoint: +You can then navigate in the terminal to your desired download directory and run the following command to download the model checkpoint: ``` rclone copy mlc-inference:mlcommons-inference-wg-public/gpt-j ./model -P From 8219069f83c2396be1c1b4b4c8f80cca756db5ca Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Wed, 21 Feb 2024 17:11:15 -0600 Subject: [PATCH 37/75] Minor language tweak in README.md --- recommendation/dlrm_v2/pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recommendation/dlrm_v2/pytorch/README.md b/recommendation/dlrm_v2/pytorch/README.md index 6de850b81..12937c0c8 100755 --- a/recommendation/dlrm_v2/pytorch/README.md +++ b/recommendation/dlrm_v2/pytorch/README.md @@ -98,7 +98,7 @@ Once Rclone is installed, run the following command to authenticate with the buc ``` rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com ``` -You can then navigate in the terminal to your desired download directory and run the following commands to download the model weights: +You can then navigate in the terminal to your desired download directory and run the following command to download the model weights: ``` rclone copy mlc-inference:mlcommons-inference-wg-public/model_weights ./model_weights -P From e39003a9c4c89a2215db0ca57ad7a57b16f9a785 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Fri, 23 Feb 2024 13:55:22 -0600 Subject: [PATCH 38/75] Fix typo in README.md --- language/llama2-70b/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index 3fdf71ce2..b539b3e3c 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -69,7 +69,7 @@ MLCommons hosts the model and preprocessed dataset for download exclusively by M ### External Download -+ First go to [llama2-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, sign in to HuggingFace (if you don't have account, you'll need to create one). **Please note your authentication credentials** as you may be required to provide them when cloninng below. ++ First go to [llama2-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, sign in to HuggingFace (if you don't have account, you'll need to create one). **Please note your authentication credentials** as you may be required to provide them when cloning below. 
+ Requires Git Large Files Storage ``` export CHECKPOINT_PATH=${PWD}/Llama-2-70b-chat-hf From 396d3f85a27f8732f9fa6b82649f97af11aa3957 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 10 Jan 2024 15:52:18 -0500 Subject: [PATCH 39/75] TGI support first pass --- language/llama2-70b/SUT.py | 67 ++++++++++++++++++++++++------------- language/llama2-70b/main.py | 2 ++ 2 files changed, 46 insertions(+), 23 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 4ff78c38c..f617bdf1c 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -14,6 +14,9 @@ import tqdm import queue +from huggingface_hub import InferenceClient +import concurrent + import logging from typing import TYPE_CHECKING, Optional, List from pathlib import Path @@ -80,6 +83,7 @@ def get_out_tokens(self): class SUT(): def __init__(self, model_path=None, + api_server=None, dtype="bfloat16", device="cpu", batch_size=None, @@ -89,6 +93,7 @@ def __init__(self, workers=1): self.model_path = model_path or "meta-llama/Llama-2-70b-chat-hf" + self.api_server = api_server self.device = device if not batch_size: @@ -191,20 +196,33 @@ def process_queries(self): assert input_ids_tensor.shape == input_masks_tensor.shape assert input_ids_tensor.shape[0] <= self.batch_size + if self.api_server: + decoded = self.tokenizer.batch_decode(input_ids_tensor) + cleaned = [entry.replace('','').replace('','') for entry in decoded] + bs = len(cleaned) + tik2 = time.time() - pred_output_tokens = self.model.generate( - input_ids=input_ids_tensor, - attention_mask=input_masks_tensor, - pad_token_id=self.tokenizer.pad_token_id, - **gen_kwargs - ) + if self.api_server: + def api_gen_text(text): return self.client.generate(text,max_new_tokens=1024).generated_text + with concurrent.futures.ThreadPoolExecutor(max_workers=bs) as executor: + output = list(executor.map(api_gen_text,cleaned)) + else: + pred_output_tokens = self.model.generate( + input_ids=input_ids_tensor, + attention_mask=input_masks_tensor, + pad_token_id=self.tokenizer.pad_token_id, + **gen_kwargs + ) tik3 = time.time() - processed_output = self.data_object.postProcess(pred_output_tokens, - input_seq_lens=input_len, - query_id_list=query_ids) + if self.api_server: + processed_output = np.array(self.tokenizer(output)) + else: + processed_output = self.data_object.postProcess(pred_output_tokens, + input_seq_lens=input_len, + query_id_list=query_ids) for i in range(len(qitem)): n_tokens = processed_output[i].shape[0] @@ -228,26 +246,29 @@ def process_queries(self): def load_model(self): - self.model = LlamaForCausalLM.from_pretrained( - self.model_path, - device_map="auto", - low_cpu_mem_usage=True, - torch_dtype=self.amp_dtype - ) - print("Loaded model") + if self.api_server: + self.client = InferenceClient(model=self.api_server) + else: + self.model = LlamaForCausalLM.from_pretrained( + self.model_path, + device_map="auto", + low_cpu_mem_usage=True, + torch_dtype=self.amp_dtype + ) + print("Loaded model") - self.device = torch.device(self.device) - if self.device == "cpu": - self.model = self.model.to(self.device) # Force CPU if your system has GPU and you specifically want CPU-only run + self.device = torch.device(self.device) + if self.device == "cpu": + self.model = self.model.to(self.device) # Force CPU if your system has GPU and you specifically want CPU-only run - self.model.eval() - self.model = self.model.to(memory_format=torch.channels_last) + self.model.eval() + self.model = self.model.to(memory_format=torch.channels_last) 
self.tokenizer = AutoTokenizer.from_pretrained( self.model_path, model_max_length=1024, padding_side="left", - use_fast=False,) + use_fast=True,) #changed from false self.tokenizer.pad_token = self.tokenizer.eos_token print("Loaded tokenizer") @@ -285,7 +306,7 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + def __init__(self, model_path=None, api_server=None, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): super().__init__(model_path=model_path, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py index bf1def806..35624bbab 100644 --- a/language/llama2-70b/main.py +++ b/language/llama2-70b/main.py @@ -16,6 +16,7 @@ def get_args(): parser.add_argument("--scenario", type=str, choices=["Offline", "Server"], default="Offline", help="Scenario") parser.add_argument("--model-path", type=str, default="meta-llama/Llama-2-70b-chat-hf", help="Model name") parser.add_argument("--dataset-path", type=str, default=None, help="") + parser.add_argument("--api-server", type=str, default=None, help="Specify an api endpoint to use api mode") parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") parser.add_argument("--dtype", type=str, default="float32", help="data type of the model, choose from float16, bfloat16 and float32") parser.add_argument("--device", type=str, choices=["cpu", "cuda:0"], default="cpu", help="device to use") @@ -68,6 +69,7 @@ def main(): sut = sut_cls( model_path=args.model_path, + api_server=args.api_server, dtype=args.dtype, dataset_path=args.dataset_path, total_sample_count=args.total_sample_count, From 84c9673dd045e0554daa7b6f3f6d07c609b6e84e Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 11 Jan 2024 15:44:49 -0500 Subject: [PATCH 40/75] Added functional API querying --- language/llama2-70b/SUT.py | 48 +++++++++++++++++++++++++++++++------ language/llama2-70b/main.py | 4 +++- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index f617bdf1c..12828cd91 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -1,4 +1,5 @@ import os +import sys import time import numpy as np import array @@ -14,8 +15,12 @@ import tqdm import queue -from huggingface_hub import InferenceClient -import concurrent +from concurrent.futures.thread import ThreadPoolExecutor +import requests +from urllib3.exceptions import InsecureRequestWarning +import json + +requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) import logging from typing import TYPE_CHECKING, Optional, List @@ -84,6 +89,7 @@ class SUT(): def __init__(self, model_path=None, api_server=None, + api_model_name=None, dtype="bfloat16", device="cpu", batch_size=None, @@ -94,6 +100,7 @@ def __init__(self, self.model_path = model_path or "meta-llama/Llama-2-70b-chat-hf" self.api_server = api_server + self.api_model_name = api_model_name self.device = device if not batch_size: @@ -151,6 +158,30 @@ def stop(self): worker.join() + def query_api(self, input): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model_id': self.api_model_name, + 'inputs': input, + 'parameters': { + 'max_new_tokens': 1024, + 'min_new_tokens': 1, + }, + } + + response = requests.post( + 'https://' + self.api_server, 
+ headers=headers, + json=json_data, + verify=False, + ) + + return json.loads(response.text)["generated_text"] + + def process_queries(self): """Processor of the queued queries. User may choose to add batching logic """ @@ -204,9 +235,8 @@ def process_queries(self): tik2 = time.time() if self.api_server: - def api_gen_text(text): return self.client.generate(text,max_new_tokens=1024).generated_text - with concurrent.futures.ThreadPoolExecutor(max_workers=bs) as executor: - output = list(executor.map(api_gen_text,cleaned)) + with ThreadPoolExecutor(max_workers=bs) as executor: + output = list(executor.map(self.query_api,cleaned)) else: pred_output_tokens = self.model.generate( input_ids=input_ids_tensor, @@ -218,7 +248,7 @@ def api_gen_text(text): return self.client.generate(text,max_new_tokens=1024).ge tik3 = time.time() if self.api_server: - processed_output = np.array(self.tokenizer(output)) + processed_output = np.array(self.tokenizer(output, padding='longest')['input_ids']) else: processed_output = self.data_object.postProcess(pred_output_tokens, input_seq_lens=input_len, @@ -247,7 +277,11 @@ def api_gen_text(text): return self.client.generate(text,max_new_tokens=1024).ge def load_model(self): if self.api_server: - self.client = InferenceClient(model=self.api_server) + if not "http" in self.api_server: + self.api_server = "http://" + self.api_server + + if not self.api_model_name: + sys.exit("API Server was specified but no model name was provided") else: self.model = LlamaForCausalLM.from_pretrained( self.model_path, diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py index 35624bbab..424d48df7 100644 --- a/language/llama2-70b/main.py +++ b/language/llama2-70b/main.py @@ -16,7 +16,8 @@ def get_args(): parser.add_argument("--scenario", type=str, choices=["Offline", "Server"], default="Offline", help="Scenario") parser.add_argument("--model-path", type=str, default="meta-llama/Llama-2-70b-chat-hf", help="Model name") parser.add_argument("--dataset-path", type=str, default=None, help="") - parser.add_argument("--api-server", type=str, default=None, help="Specify an api endpoint to use api mode") + parser.add_argument("--api-server", type=str, default=None, help="Specify an api endpoint call to use api mode") + parser.add_argument("--api-model-name", type=str, default=None, help="Specify a model name to use api mode") parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") parser.add_argument("--dtype", type=str, default="float32", help="data type of the model, choose from float16, bfloat16 and float32") parser.add_argument("--device", type=str, choices=["cpu", "cuda:0"], default="cpu", help="device to use") @@ -70,6 +71,7 @@ def main(): sut = sut_cls( model_path=args.model_path, api_server=args.api_server, + api_model_name=args.api_model_name, dtype=args.dtype, dataset_path=args.dataset_path, total_sample_count=args.total_sample_count, From dbab9f064cd8f53dd946d520a89df9601de4d09d Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 12 Jan 2024 01:27:32 -0500 Subject: [PATCH 41/75] Changed batch size and str bug fix --- language/llama2-70b/SUT.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 12828cd91..f6b5448e0 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -105,7 +105,7 @@ def __init__(self, if not batch_size: if device == "cpu": - batch_size = 1 + batch_size = 32 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. 
self.batch_size = batch_size @@ -173,7 +173,7 @@ def query_api(self, input): } response = requests.post( - 'https://' + self.api_server, + self.api_server, headers=headers, json=json_data, verify=False, From ffcbc0e0729a1bb19babba3e8d4578efaf98754b Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 15 Jan 2024 01:40:29 -0500 Subject: [PATCH 42/75] Added v1 offline artifacts --- language/llama2-70b/SUT.py | 2 +- .../api-endpoint-artifacts/Dockerfile-API | 15 +++ .../api-endpoint-artifacts/README.md | 24 ++++ .../api-endpoint-artifacts/benchmark.yaml | 21 ++++ .../api-endpoint-artifacts/model.yaml | 18 +++ .../llama2-70b/api-endpoint-artifacts/sa.yaml | 6 + .../api-endpoint-artifacts/secret.yaml | 12 ++ .../serving-runtime.yaml | 107 ++++++++++++++++++ 8 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 language/llama2-70b/api-endpoint-artifacts/Dockerfile-API create mode 100644 language/llama2-70b/api-endpoint-artifacts/README.md create mode 100644 language/llama2-70b/api-endpoint-artifacts/benchmark.yaml create mode 100644 language/llama2-70b/api-endpoint-artifacts/model.yaml create mode 100644 language/llama2-70b/api-endpoint-artifacts/sa.yaml create mode 100644 language/llama2-70b/api-endpoint-artifacts/secret.yaml create mode 100644 language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index f6b5448e0..0eb777066 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -105,7 +105,7 @@ def __init__(self, if not batch_size: if device == "cpu": - batch_size = 32 + batch_size = 110 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. self.batch_size = batch_size diff --git a/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API b/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API new file mode 100644 index 000000000..ae2bcd455 --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API @@ -0,0 +1,15 @@ +FROM pytorch/pytorch:latest + +COPY inference ./inference +COPY processed-openorca/processed-data.pkl ./processed-data.pkl +COPY llama-model-info ./llama-model-info + +RUN apt-get update && apt install build-essential -y +RUN conda install pybind11==2.10.4 -c conda-forge -y +RUN pip install --upgrade pip +RUN pip install transformers==4.31.0 nltk==3.8.1 evaluate==0.4.0 absl-py==1.4.0 rouge-score==0.1.2 sentencepiece==0.1.99 accelerate==0.21.0 +RUN cd inference/loadgen && python -m pip install . 
+ +ENV DATASET_PATH=/workspace/processed-data.pkl +ENV CHECKPOINT_PATH=/workspace/llama-model-info +ENV ACCURACY_LOG_FILE=/workspace/inference/language/llama2-70b/offline-logs/mlperf_log_accuracy.json diff --git a/language/llama2-70b/api-endpoint-artifacts/README.md b/language/llama2-70b/api-endpoint-artifacts/README.md new file mode 100644 index 000000000..72c573949 --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/README.md @@ -0,0 +1,24 @@ +# Using OpenShift AI Model Serving (Caikit/TGIS) with MLPerf Inference + +Prerequisites: + - Install the OpenShift AI model serving stack + - Add your AWS credentials to `secret.yaml` access the model files + - Apply `secret.yaml`, `sa.yaml`, `serving-runtime.yaml`, then finally `model.yaml` + - Create a benchmark pod using `benchmark.yaml` + + +For the full accuracy benchmark (offline), run in the pod: +``` +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf-caikit --accuracy --mlperf-conf mlperf.conf --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +``` +You can then run the same evaluation/consolidation scripts as the regular benchmark + + +For the performance benchmark (offline), run in the pod: +``` +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf-caikit --mlperf-conf mlperf.conf --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +``` +(It is the same, just with `--accuracy` removed) + + +NOTE: Hyperparams are currently configured for 8xH100 diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml new file mode 100644 index 000000000..57dbd6a36 --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: mlperf-inference +spec: + restartPolicy: Never + containers: + - name: mlperf-test + image: quay.io/meyceoz/mlperf-inference:v1 + resources: + requests: + memory: 20000Mi + volumeMounts: + - mountPath: /dev/shm + name: dshm + command: [ "/bin/sh", "-c" ] + args: [ "sleep infinity" ] + volumes: + - name: dshm + emptyDir: + medium: Memory \ No newline at end of file diff --git a/language/llama2-70b/api-endpoint-artifacts/model.yaml b/language/llama2-70b/api-endpoint-artifacts/model.yaml new file mode 100644 index 000000000..301b73631 --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/model.yaml @@ -0,0 +1,18 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + annotations: + serving.knative.openshift.io/enablePassthrough: "true" + sidecar.istio.io/inject: "true" + sidecar.istio.io/rewriteAppHTTPProbers: "true" + name: llama-2-70b-chat-isvc +spec: + predictor: + apiVersion: serving.kserve.io/v1alpha2 + serviceAccountName: sa + timeout: 240 + model: + modelFormat: + name: caikit + runtime: caikit-runtime + storageUri: s3://mlperf-inference-models/Llama-2-70b-chat-hf-caikit \ No newline at end of file diff --git a/language/llama2-70b/api-endpoint-artifacts/sa.yaml b/language/llama2-70b/api-endpoint-artifacts/sa.yaml new file mode 100644 index 000000000..6ccfd13b7 --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/sa.yaml @@ -0,0 +1,6 @@ 
+apiVersion: v1 +kind: ServiceAccount +metadata: + name: sa +secrets: +- name: storage-config \ No newline at end of file diff --git a/language/llama2-70b/api-endpoint-artifacts/secret.yaml b/language/llama2-70b/api-endpoint-artifacts/secret.yaml new file mode 100644 index 000000000..1b90db6c0 --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/secret.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Secret +metadata: + annotations: + serving.kserve.io/s3-endpoint: "s3.amazonaws.com" # replace with your s3 endpoint e.g minio-service.kubeflow:9000 + serving.kserve.io/s3-usehttps: "1" # by default 1, if testing with minio you can set to 0 + serving.kserve.io/s3-region: "us-east-1" + serving.kserve.io/s3-useanoncredential: "false" # omitting this is the same as false, if true will ignore provided credential and use anonymous credentials + name: storage-config +stringData: + "AWS_ACCESS_KEY_ID": "XXXXXXXXX" + "AWS_SECRET_ACCESS_KEY": "XXXXXXXXX" \ No newline at end of file diff --git a/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml new file mode 100644 index 000000000..2b7e25184 --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml @@ -0,0 +1,107 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: caikit-tgis-config +data: + caikit.yml: | + runtime: + library: caikit_nlp + local_models_dir: /mnt/models/ + lazy_load_local_models: true + model_management: + finders: + default: + type: MULTI + config: + finder_priority: + - tgis-auto + tgis-auto: + type: TGIS-AUTO + config: + test_connection: true + initializers: + default: + type: LOCAL + config: + backend_priority: + - type: TGIS + config: + local: + num_gpus: 8 + load_timeout: 2000 + connection: + hostname: localhost:8033 +--- +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + name: caikit-runtime +spec: + multiModel: false + supportedModelFormats: + # Note: this currently *only* supports caikit format models + - autoSelect: true + name: caikit + containers: + - name: kserve-container + image: quay.io/opendatahub/text-generation-inference:fast + command: ["text-generation-launcher"] + args: ["--model-name=/mnt/models/artifacts/"] + env: + - name: TRANSFORMERS_CACHE + value: /tmp/transformers_cache + #- name: RUNTIME_LOCAL_MODELS_DIR + #value: /mnt/models + - name: NUM_GPUS + value: "8" + #- name: TRANSFORMERS_CACHE + # value: /shared_model_storage/transformers_cache + #- name: HUGGINGFACE_HUB_CACHE + # value: /shared_model_storage/transformers_cache + #- name: DTYPE_STR + # value: float16 + # Dynamic batch size changes + - name: MAX_BATCH_SIZE + value: "256" + - name: MAX_CONCURRENT_REQUESTS + value: "256" + - name: MAX_BATCH_WEIGHT + value: "540000" + - name: MAX_SEQUENCE_LENGTH + value: "4096" + - name: MAX_PREFILL_WEIGHT + value: "0" + - name: MAX_NEW_TOKENS + value: "1024" + - name: FLASH_ATTENTION + value: "true" + - name: DEPLOYMENT_FRAMEWORK + value: hf_custom_tp + resources: # configure as required + requests: + cpu: 36 + memory: 700Gi + nvidia.com/gpu: 8 + limits: + nvidia.com/gpu: 8 + - name: transformer-container + image: quay.io/opendatahub/caikit-tgis-serving:fast + env: + - name: RUNTIME_GRPC_SERVER_THREAD_POOL_SIZE + value: "160" + volumeMounts: + - name: config-volume + mountPath: /caikit/config/ + readOnly: true + ports: + - containerPort: 8080 + #name: h2c + protocol: TCP + resources: # configure as required + requests: + cpu: 2 + memory: 4Gi + volumes: + - name: config-volume + 
configMap: + name: caikit-tgis-config From 86c594ef415aa708aca320ee4f6be7254f8a15f9 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Tue, 16 Jan 2024 17:29:02 -0500 Subject: [PATCH 43/75] server scenario first pass --- language/llama2-70b/SUT.py | 73 ++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 0eb777066..fbeeefadb 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -340,11 +340,16 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, api_server=None, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + def __init__(self, model_path=None, api_server=None, api_model_name=None, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): - super().__init__(model_path=model_path, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) + super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) + + with open(f"{self.model_path}/tokenizer.json", 'r') as token_file: + llama_tokenizer = json.load(token_file) + self.llama_vocab = llama_tokenizer["model"]["vocab"] self.first_token_queue = queue.Queue() + def start(self): @@ -375,6 +380,44 @@ def process_first_tokens(self): response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])] lg.FirstTokenComplete(response) + def stream_api(self, input, response_ids): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model_id': 'Llama-2-70b-chat-hf-caikit', + 'inputs': input, + 'parameters': { + 'max_new_tokens': 1024, + 'min_new_tokens': 1, + }, + } + + token_cache = [] + s = requests.Session() + first = True + with s.post( + self.api_server, + headers=headers, + json=json_data, + verify=False, + stream=True + ) as resp: + for line in resp.iter_lines(): + if line: + decoded = line.decode() + if decoded.startswith("data"): + token_l = json.loads(decoded[6:])["tokens"] + if token_l: + token = self.llama_vocab[token_l[0]["text"]] + if first: + self.first_token.put((token, response_ids[0])) + first = False + else: + token_cache.append(token) + return token_cache + def process_queries(self): """Processor of the queued queries. User may choose to add batching logic """ while True: @@ -386,18 +429,26 @@ def process_queries(self): input_ids_tensor = self.data_object.input_ids[qitem.index] input_masks_tensor = self.data_object.attention_masks[qitem.index] + if self.api_server: + decoded = self.tokenizer.decode(input_ids_tensor) + tokens_cache = [] + response_ids = [qitem.id] + output_tokens = self.stream_api(decoded, response_ids) + + else: #TODO: This PoC is super slow with significant overhead. 
Best to create a patch to `generate` - tokens_cache = [] - tokens_streamer = FirstTokenStreamer(self.first_token_queue, tokens_cache=tokens_cache, is_first_token=True, response_ids=[qitem.id]) + tokens_cache = [] + tokens_streamer = FirstTokenStreamer(self.first_token_queue, tokens_cache=tokens_cache, is_first_token=True, response_ids=[qitem.id]) + + _ = self.model.generate( input_ids=input_ids_tensor, + attention_mask=input_masks_tensor, + pad_token_id=self.tokenizer.pad_token_id, + streamer = tokens_streamer, + **gen_kwargs + ) - _ = self.model.generate( input_ids=input_ids_tensor, - attention_mask=input_masks_tensor, - pad_token_id=self.tokenizer.pad_token_id, - streamer = tokens_streamer, - **gen_kwargs - ) + output_tokens = tokens_streamer.get_out_tokens() - output_tokens = tokens_streamer.get_out_tokens() n_tokens = len(output_tokens) response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) bi = response_array.buffer_info() From 8751a35f02c9c9c6a001f39566116e20211fc06d Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 17 Jan 2024 15:01:38 -0500 Subject: [PATCH 44/75] Funcional server scenario --- language/llama2-70b/SUT.py | 6 +++--- language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index fbeeefadb..ca4d6bb26 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -412,7 +412,7 @@ def stream_api(self, input, response_ids): if token_l: token = self.llama_vocab[token_l[0]["text"]] if first: - self.first_token.put((token, response_ids[0])) + self.first_token_queue.put((token, response_ids[0])) first = False else: token_cache.append(token) @@ -430,13 +430,13 @@ def process_queries(self): input_masks_tensor = self.data_object.attention_masks[qitem.index] if self.api_server: - decoded = self.tokenizer.decode(input_ids_tensor) + decoded = self.tokenizer.decode(input_ids_tensor[0]) tokens_cache = [] response_ids = [qitem.id] output_tokens = self.stream_api(decoded, response_ids) else: - #TODO: This PoC is super slow with significant overhead. Best to create a patch to `generate` + #TODO: This PoC is super slow with significant overhead. 
Best to create a patch to `generate` tokens_cache = [] tokens_streamer = FirstTokenStreamer(self.first_token_queue, tokens_cache=tokens_cache, is_first_token=True, response_ids=[qitem.id]) diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 57dbd6a36..36d1636d7 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-test - image: quay.io/meyceoz/mlperf-inference:v1 + image: quay.io/meyceoz/mlperf-inference:v2 resources: requests: memory: 20000Mi From 6ff40906907ffb0c8363fd6907fcef6038ec6b79 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 18 Jan 2024 13:14:45 -0500 Subject: [PATCH 45/75] Added concurrent request support for server scenario --- language/llama2-70b/SUT.py | 31 ++++++++++++------- .../api-endpoint-artifacts/benchmark.yaml | 4 +-- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index ca4d6bb26..e748a5ec2 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -418,6 +418,19 @@ def stream_api(self, input, response_ids): token_cache.append(token) return token_cache + def async_process_query(self, input_ids_tensor, qitem_id): + decoded = self.tokenizer.decode(input_ids_tensor[0]) + response_ids = [qitem_id] + output_tokens = self.stream_api(decoded, response_ids) + + n_tokens = len(output_tokens) + response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse( + qitem_id, bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) + sys.exit() + def process_queries(self): """Processor of the queued queries. User may choose to add batching logic """ while True: @@ -430,11 +443,7 @@ def process_queries(self): input_masks_tensor = self.data_object.attention_masks[qitem.index] if self.api_server: - decoded = self.tokenizer.decode(input_ids_tensor[0]) - tokens_cache = [] - response_ids = [qitem.id] - output_tokens = self.stream_api(decoded, response_ids) - + threading.Thread(target=self.async_process_query, args=(input_ids_tensor, qitem.id)).start() else: #TODO: This PoC is super slow with significant overhead. 
Best to create a patch to `generate` tokens_cache = [] @@ -449,12 +458,12 @@ def process_queries(self): output_tokens = tokens_streamer.get_out_tokens() - n_tokens = len(output_tokens) - response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) - bi = response_array.buffer_info() - response = [lg.QuerySampleResponse( - qitem.id, bi[0], bi[1], n_tokens)] - lg.QuerySamplesComplete(response) + n_tokens = len(output_tokens) + response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse( + qitem.id, bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) def issue_queries(self, query_samples): diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 36d1636d7..e0bd53fd7 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -5,8 +5,8 @@ metadata: spec: restartPolicy: Never containers: - - name: mlperf-test - image: quay.io/meyceoz/mlperf-inference:v2 + - name: mlperf-env + image: quay.io/meyceoz/mlperf-inference:v3-base resources: requests: memory: 20000Mi From 2225a459e85c43992f13a8ecd64982a2d5896c32 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 18 Jan 2024 15:47:58 -0500 Subject: [PATCH 46/75] Added explicit greedy and updated readme --- language/llama2-70b/SUT.py | 2 ++ language/llama2-70b/api-endpoint-artifacts/README.md | 8 ++++++++ language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index e748a5ec2..54b32e558 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -169,6 +169,7 @@ def query_api(self, input): 'parameters': { 'max_new_tokens': 1024, 'min_new_tokens': 1, + 'decoding_method': "GREEDY" }, } @@ -391,6 +392,7 @@ def stream_api(self, input, response_ids): 'parameters': { 'max_new_tokens': 1024, 'min_new_tokens': 1, + 'decoding_method': "GREEDY" }, } diff --git a/language/llama2-70b/api-endpoint-artifacts/README.md b/language/llama2-70b/api-endpoint-artifacts/README.md index 72c573949..6a7260b23 100644 --- a/language/llama2-70b/api-endpoint-artifacts/README.md +++ b/language/llama2-70b/api-endpoint-artifacts/README.md @@ -6,6 +6,7 @@ Prerequisites: - Apply `secret.yaml`, `sa.yaml`, `serving-runtime.yaml`, then finally `model.yaml` - Create a benchmark pod using `benchmark.yaml` +In the pod, before any benchmark, first run `cd inference/language/llama2-70b` For the full accuracy benchmark (offline), run in the pod: ``` @@ -21,4 +22,11 @@ python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-serv (It is the same, just with `--accuracy` removed) +For the performance benchmark (server), run in the pod: +``` +python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf-caikit --mlperf-conf mlperf.conf --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log +``` +(Configure target qps in `user.conf`) + + NOTE: Hyperparams are currently configured for 8xH100 diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index e0bd53fd7..f77a40ca8 100644 --- 
a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v3-base + image: quay.io/meyceoz/mlperf-inference:v3-greedy resources: requests: memory: 20000Mi From 22eb5742b459b63b513b141ff102e62da3f05c2e Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 22 Jan 2024 12:24:53 -0500 Subject: [PATCH 47/75] Update image to include ommitted mlperf conf --- language/llama2-70b/api-endpoint-artifacts/Dockerfile-API | 1 + 1 file changed, 1 insertion(+) diff --git a/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API b/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API index ae2bcd455..e7231ba0e 100644 --- a/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API +++ b/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API @@ -9,6 +9,7 @@ RUN conda install pybind11==2.10.4 -c conda-forge -y RUN pip install --upgrade pip RUN pip install transformers==4.31.0 nltk==3.8.1 evaluate==0.4.0 absl-py==1.4.0 rouge-score==0.1.2 sentencepiece==0.1.99 accelerate==0.21.0 RUN cd inference/loadgen && python -m pip install . +RUN cp inference/mlperf.conf inference/language/llama2-70b/mlperf.conf ENV DATASET_PATH=/workspace/processed-data.pkl ENV CHECKPOINT_PATH=/workspace/llama-model-info From 7bb4c1b55ec9b666b460c63ce5fb3d3615e1bf46 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 22 Jan 2024 15:04:39 -0500 Subject: [PATCH 48/75] Update for new image version --- language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index f77a40ca8..012e78c33 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v3-greedy + image: quay.io/meyceoz/mlperf-inference:v4 resources: requests: memory: 20000Mi diff --git a/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml index 2b7e25184..1958ae927 100644 --- a/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml @@ -68,7 +68,7 @@ spec: - name: MAX_BATCH_WEIGHT value: "540000" - name: MAX_SEQUENCE_LENGTH - value: "4096" + value: "2048" - name: MAX_PREFILL_WEIGHT value: "0" - name: MAX_NEW_TOKENS From e96c8a64bf5cb3869e3881032e82f8deeec80041 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 22 Jan 2024 18:28:05 -0500 Subject: [PATCH 49/75] Updated model serving yamls --- .../llama2-70b/api-endpoint-artifacts/model.yaml | 2 ++ .../api-endpoint-artifacts/serving-runtime.yaml | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/language/llama2-70b/api-endpoint-artifacts/model.yaml b/language/llama2-70b/api-endpoint-artifacts/model.yaml index 301b73631..5c21a3836 100644 --- a/language/llama2-70b/api-endpoint-artifacts/model.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/model.yaml @@ -8,6 +8,8 @@ metadata: name: llama-2-70b-chat-isvc spec: predictor: + minReplicas: 1 + maxReplicas: 1 apiVersion: serving.kserve.io/v1alpha2 
serviceAccountName: sa timeout: 240 diff --git a/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml index 1958ae927..f8e05fe5d 100644 --- a/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/serving-runtime.yaml @@ -62,11 +62,11 @@ spec: # value: float16 # Dynamic batch size changes - name: MAX_BATCH_SIZE - value: "256" + value: "128" - name: MAX_CONCURRENT_REQUESTS - value: "256" + value: "200" - name: MAX_BATCH_WEIGHT - value: "540000" + value: "550000" - name: MAX_SEQUENCE_LENGTH value: "2048" - name: MAX_PREFILL_WEIGHT @@ -79,8 +79,8 @@ spec: value: hf_custom_tp resources: # configure as required requests: - cpu: 36 - memory: 700Gi + cpu: 64 + memory: 900Gi nvidia.com/gpu: 8 limits: nvidia.com/gpu: 8 @@ -88,7 +88,7 @@ spec: image: quay.io/opendatahub/caikit-tgis-serving:fast env: - name: RUNTIME_GRPC_SERVER_THREAD_POOL_SIZE - value: "160" + value: "200" volumeMounts: - name: config-volume mountPath: /caikit/config/ From 84475d791c51f62769574dc7bf38d0450f3683b1 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 24 Jan 2024 19:13:41 -0500 Subject: [PATCH 50/75] First pass: standalone TGIS, grpc, batching --- language/llama2-70b/SUT.py | 62 ++++-- .../api-endpoint-artifacts/Dockerfile-API | 2 +- .../api-endpoint-artifacts/benchmark.yaml | 2 +- .../api-endpoint-artifacts/model-tgis.yaml | 20 ++ .../api-endpoint-artifacts/serving-tgis.yaml | 67 +++++++ language/llama2-70b/generation_pb2.py | 70 +++++++ language/llama2-70b/generation_pb2_grpc.py | 169 ++++++++++++++++ language/llama2-70b/inference.py | 186 ++++++++++++++++++ language/llama2-70b/main.py | 4 + 9 files changed, 567 insertions(+), 15 deletions(-) create mode 100644 language/llama2-70b/api-endpoint-artifacts/model-tgis.yaml create mode 100644 language/llama2-70b/api-endpoint-artifacts/serving-tgis.yaml create mode 100644 language/llama2-70b/generation_pb2.py create mode 100644 language/llama2-70b/generation_pb2_grpc.py create mode 100644 language/llama2-70b/inference.py diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 54b32e558..b7793563b 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -1,6 +1,7 @@ import os import sys import time +import re import numpy as np import array import torch @@ -20,6 +21,8 @@ from urllib3.exceptions import InsecureRequestWarning import json +from inference import GrpcClient + requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) import logging @@ -90,6 +93,8 @@ def __init__(self, model_path=None, api_server=None, api_model_name=None, + grpc=False, + batch_grpc=False, dtype="bfloat16", device="cpu", batch_size=None, @@ -101,11 +106,13 @@ def __init__(self, self.model_path = model_path or "meta-llama/Llama-2-70b-chat-hf" self.api_server = api_server self.api_model_name = api_model_name + self.grpc = grpc + self.batch_grpc = batch_grpc self.device = device if not batch_size: if device == "cpu": - batch_size = 110 + batch_size = 300 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. 
self.batch_size = batch_size @@ -173,15 +180,28 @@ def query_api(self, input): }, } - response = requests.post( - self.api_server, - headers=headers, - json=json_data, - verify=False, - ) - + response_code = 0 + while response_code != 200: + try: + response = requests.post( + self.api_server, + headers=headers, + json=json_data, + verify=False, + ) + response_code = response.status_code + except: + print("connection failure") return json.loads(response.text)["generated_text"] + + + def query_api_grpc(self, input): + resp = self.grpc_client.make_request([input], model_id=self.api_model_name) + return resp.responses[0].text + def query_api_batch_grpc(self, inputs): + resps = self.grpc_client.make_request(inputs, model_id=self.api_model_name) + return [resp.text for resp in resps.responses] def process_queries(self): """Processor of the queued queries. User may choose to add batching logic """ @@ -236,8 +256,15 @@ def process_queries(self): tik2 = time.time() if self.api_server: - with ThreadPoolExecutor(max_workers=bs) as executor: - output = list(executor.map(self.query_api,cleaned)) + if self.grpc: + if self.batch_grpc: + output = self.query_api_batch_grpc(cleaned) + else: + with ThreadPoolExecutor(max_workers=bs) as executor: + output = list(executor.map(self.query_api_grpc,cleaned)) + else: + with ThreadPoolExecutor(max_workers=bs) as executor: + output = list(executor.map(self.query_api,cleaned)) else: pred_output_tokens = self.model.generate( input_ids=input_ids_tensor, @@ -278,7 +305,16 @@ def process_queries(self): def load_model(self): if self.api_server: - if not "http" in self.api_server: + if self.grpc: + hostname = re.sub("https://|http://", "", self.api_server) + if hostname[-1] == "/": + hostname = hostname[:-1] + self.grpc_client = GrpcClient( + hostname, + 443, + verify=False, + ) + elif not "http" in self.api_server: self.api_server = "http://" + self.api_server if not self.api_model_name: @@ -341,9 +377,9 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, api_server=None, api_model_name=None, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): - super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) + super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) with open(f"{self.model_path}/tokenizer.json", 'r') as token_file: llama_tokenizer = json.load(token_file) diff --git a/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API b/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API index e7231ba0e..77385d42e 100644 --- a/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API +++ b/language/llama2-70b/api-endpoint-artifacts/Dockerfile-API @@ -7,7 +7,7 @@ COPY llama-model-info ./llama-model-info RUN apt-get update && apt install build-essential -y RUN conda install pybind11==2.10.4 -c conda-forge -y RUN pip install --upgrade pip -RUN pip install transformers==4.31.0 nltk==3.8.1 evaluate==0.4.0 absl-py==1.4.0 rouge-score==0.1.2 sentencepiece==0.1.99 accelerate==0.21.0 +RUN pip install transformers==4.31.0 nltk==3.8.1 
evaluate==0.4.0 absl-py==1.4.0 rouge-score==0.1.2 sentencepiece==0.1.99 accelerate==0.21.0 grpcio-tools RUN cd inference/loadgen && python -m pip install . RUN cp inference/mlperf.conf inference/language/llama2-70b/mlperf.conf diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 012e78c33..79001774d 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v4 + image: quay.io/meyceoz/mlperf-inference:grpc-batch resources: requests: memory: 20000Mi diff --git a/language/llama2-70b/api-endpoint-artifacts/model-tgis.yaml b/language/llama2-70b/api-endpoint-artifacts/model-tgis.yaml new file mode 100644 index 000000000..6f920658b --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/model-tgis.yaml @@ -0,0 +1,20 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + annotations: + serving.knative.openshift.io/enablePassthrough: "true" + sidecar.istio.io/inject: "true" + sidecar.istio.io/rewriteAppHTTPProbers: "true" + name: llama-2-70b-chat-isvc +spec: + predictor: + minReplicas: 1 + maxReplicas: 1 + #apiVersion: serving.kserve.io/v1alpha2 + serviceAccountName: sa + #timeout: 240 + model: + modelFormat: + name: pytorch + runtime: tgis-runtime-grpc + storageUri: s3://mlperf-inference-models/Llama-2-70b-chat-hf/ \ No newline at end of file diff --git a/language/llama2-70b/api-endpoint-artifacts/serving-tgis.yaml b/language/llama2-70b/api-endpoint-artifacts/serving-tgis.yaml new file mode 100644 index 000000000..b46148133 --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/serving-tgis.yaml @@ -0,0 +1,67 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + name: tgis-runtime-grpc +spec: + multiModel: false + supportedModelFormats: + - autoSelect: true + name: pytorch + containers: + - name: kserve-container + image: quay.io/opendatahub/text-generation-inference:fast + command: ["text-generation-launcher"] + args: + - "--model-name=/mnt/models/" + - "--port=3000" + - "--grpc-port=8033" + env: + - name: TRANSFORMERS_CACHE + value: /tmp/transformers_cache + - name: NUM_GPUS + value: "8" + #- name: DTYPE_STR + # value: float16 + # Dynamic batch size changes + - name: MAX_BATCH_SIZE + value: "256" + - name: MAX_CONCURRENT_REQUESTS + value: "300" + - name: MAX_BATCH_WEIGHT + value: "550000" + - name: MAX_SEQUENCE_LENGTH + value: "2048" + - name: MAX_PREFILL_WEIGHT + value: "0" + - name: MAX_NEW_TOKENS + value: "1024" + - name: FLASH_ATTENTION + value: "true" + - name: DEPLOYMENT_FRAMEWORK + value: hf_custom_tp + - name: LOG_GPU_USAGE_INTERVAL + value: "5" + resources: # configure as required + requests: + cpu: 64 + memory: 900Gi + nvidia.com/gpu: 8 + limits: + nvidia.com/gpu: 8 + readinessProbe: # Use exec probes instad of httpGet since the probes' port gets rewritten to the containerPort + exec: + command: + - curl + - localhost:3000/health + initialDelaySeconds: 5 + livenessProbe: + exec: + command: + - curl + - localhost:3000/health + initialDelaySeconds: 5 + # periodSeconds: 5 + ports: + - containerPort: 8033 + name: h2c + protocol: TCP \ No newline at end of file diff --git a/language/llama2-70b/generation_pb2.py b/language/llama2-70b/generation_pb2.py new file mode 100644 index 000000000..544e1ec7d --- /dev/null +++ b/language/llama2-70b/generation_pb2.py @@ 
-0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: generation.proto +# Protobuf Python Version: 4.25.0 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10generation.proto\x12\x05\x66maas\"\xa1\x01\n\x18\x42\x61tchedGenerationRequest\x12\x10\n\x08model_id\x18\x01 \x01(\t\x12\x16\n\tprefix_id\x18\x02 \x01(\tH\x00\x88\x01\x01\x12*\n\x08requests\x18\x03 \x03(\x0b\x32\x18.fmaas.GenerationRequest\x12!\n\x06params\x18\n \x01(\x0b\x32\x11.fmaas.ParametersB\x0c\n\n_prefix_id\"\x9f\x01\n\x17SingleGenerationRequest\x12\x10\n\x08model_id\x18\x01 \x01(\t\x12\x16\n\tprefix_id\x18\x02 \x01(\tH\x00\x88\x01\x01\x12)\n\x07request\x18\x03 \x01(\x0b\x32\x18.fmaas.GenerationRequest\x12!\n\x06params\x18\n \x01(\x0b\x32\x11.fmaas.ParametersB\x0c\n\n_prefix_id\"I\n\x19\x42\x61tchedGenerationResponse\x12,\n\tresponses\x18\x01 \x03(\x0b\x32\x19.fmaas.GenerationResponse\"!\n\x11GenerationRequest\x12\x0c\n\x04text\x18\x02 \x01(\t\"\xf3\x01\n\x12GenerationResponse\x12\x19\n\x11input_token_count\x18\x06 \x01(\r\x12\x1d\n\x15generated_token_count\x18\x02 \x01(\r\x12\x0c\n\x04text\x18\x04 \x01(\t\x12&\n\x0bstop_reason\x18\x07 \x01(\x0e\x32\x11.fmaas.StopReason\x12\x15\n\rstop_sequence\x18\x0b \x01(\t\x12\x0c\n\x04seed\x18\n \x01(\x04\x12 \n\x06tokens\x18\x08 \x03(\x0b\x32\x10.fmaas.TokenInfo\x12&\n\x0cinput_tokens\x18\t \x03(\x0b\x32\x10.fmaas.TokenInfo\"\x81\x02\n\nParameters\x12%\n\x06method\x18\x01 \x01(\x0e\x32\x15.fmaas.DecodingMethod\x12+\n\x08sampling\x18\x02 \x01(\x0b\x32\x19.fmaas.SamplingParameters\x12)\n\x08stopping\x18\x03 \x01(\x0b\x32\x17.fmaas.StoppingCriteria\x12(\n\x08response\x18\x04 \x01(\x0b\x32\x16.fmaas.ResponseOptions\x12+\n\x08\x64\x65\x63oding\x18\x05 \x01(\x0b\x32\x19.fmaas.DecodingParameters\x12\x1d\n\x15truncate_input_tokens\x18\x06 \x01(\r\"\xc5\x01\n\x12\x44\x65\x63odingParameters\x12\x1a\n\x12repetition_penalty\x18\x01 \x01(\x02\x12\x44\n\x0elength_penalty\x18\x02 \x01(\x0b\x32\'.fmaas.DecodingParameters.LengthPenaltyH\x00\x88\x01\x01\x1a:\n\rLengthPenalty\x12\x13\n\x0bstart_index\x18\x01 \x01(\r\x12\x14\n\x0c\x64\x65\x63\x61y_factor\x18\x02 \x01(\x02\x42\x11\n\x0f_length_penalty\"v\n\x12SamplingParameters\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_k\x18\x02 \x01(\r\x12\r\n\x05top_p\x18\x03 \x01(\x02\x12\x11\n\ttypical_p\x18\x04 \x01(\x02\x12\x11\n\x04seed\x18\x05 \x01(\x04H\x00\x88\x01\x01\x42\x07\n\x05_seed\"\xb3\x01\n\x10StoppingCriteria\x12\x16\n\x0emax_new_tokens\x18\x01 \x01(\r\x12\x16\n\x0emin_new_tokens\x18\x02 \x01(\r\x12\x19\n\x11time_limit_millis\x18\x03 \x01(\r\x12\x16\n\x0estop_sequences\x18\x04 \x03(\t\x12\"\n\x15include_stop_sequence\x18\x05 \x01(\x08H\x00\x88\x01\x01\x42\x18\n\x16_include_stop_sequence\"\x98\x01\n\x0fResponseOptions\x12\x12\n\ninput_text\x18\x01 \x01(\x08\x12\x18\n\x10generated_tokens\x18\x02 \x01(\x08\x12\x14\n\x0cinput_tokens\x18\x03 \x01(\x08\x12\x16\n\x0etoken_logprobs\x18\x04 \x01(\x08\x12\x13\n\x0btoken_ranks\x18\x05 \x01(\x08\x12\x14\n\x0ctop_n_tokens\x18\x06 \x01(\r\"\x92\x01\n\tTokenInfo\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x0f\n\x07logprob\x18\x03 \x01(\x02\x12\x0c\n\x04rank\x18\x04 
\x01(\r\x12-\n\ntop_tokens\x18\x05 \x03(\x0b\x32\x19.fmaas.TokenInfo.TopToken\x1a)\n\x08TopToken\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x0f\n\x07logprob\x18\x03 \x01(\x02\"k\n\x16\x42\x61tchedTokenizeRequest\x12\x10\n\x08model_id\x18\x01 \x01(\t\x12(\n\x08requests\x18\x02 \x03(\x0b\x32\x16.fmaas.TokenizeRequest\x12\x15\n\rreturn_tokens\x18\x03 \x01(\x08\"E\n\x17\x42\x61tchedTokenizeResponse\x12*\n\tresponses\x18\x01 \x03(\x0b\x32\x17.fmaas.TokenizeResponse\"\x1f\n\x0fTokenizeRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\"7\n\x10TokenizeResponse\x12\x13\n\x0btoken_count\x18\x01 \x01(\r\x12\x0e\n\x06tokens\x18\x02 \x03(\t\"$\n\x10ModelInfoRequest\x12\x10\n\x08model_id\x18\x01 \x01(\t\"\xb4\x01\n\x11ModelInfoResponse\x12\x36\n\nmodel_kind\x18\x01 \x01(\x0e\x32\".fmaas.ModelInfoResponse.ModelKind\x12\x1b\n\x13max_sequence_length\x18\x02 \x01(\r\x12\x16\n\x0emax_new_tokens\x18\x03 \x01(\r\"2\n\tModelKind\x12\x10\n\x0c\x44\x45\x43ODER_ONLY\x10\x00\x12\x13\n\x0f\x45NCODER_DECODER\x10\x01*(\n\x0e\x44\x65\x63odingMethod\x12\n\n\x06GREEDY\x10\x00\x12\n\n\x06SAMPLE\x10\x01*\x8b\x01\n\nStopReason\x12\x10\n\x0cNOT_FINISHED\x10\x00\x12\x0e\n\nMAX_TOKENS\x10\x01\x12\r\n\tEOS_TOKEN\x10\x02\x12\r\n\tCANCELLED\x10\x03\x12\x0e\n\nTIME_LIMIT\x10\x04\x12\x11\n\rSTOP_SEQUENCE\x10\x05\x12\x0f\n\x0bTOKEN_LIMIT\x10\x06\x12\t\n\x05\x45RROR\x10\x07\x32\xc4\x02\n\x11GenerationService\x12O\n\x08Generate\x12\x1f.fmaas.BatchedGenerationRequest\x1a .fmaas.BatchedGenerationResponse\"\x00\x12O\n\x0eGenerateStream\x12\x1e.fmaas.SingleGenerationRequest\x1a\x19.fmaas.GenerationResponse\"\x00\x30\x01\x12K\n\x08Tokenize\x12\x1d.fmaas.BatchedTokenizeRequest\x1a\x1e.fmaas.BatchedTokenizeResponse\"\x00\x12@\n\tModelInfo\x12\x17.fmaas.ModelInfoRequest\x1a\x18.fmaas.ModelInfoResponse\"\x00\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'generation_pb2', _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _globals['_DECODINGMETHOD']._serialized_start=2266 + _globals['_DECODINGMETHOD']._serialized_end=2306 + _globals['_STOPREASON']._serialized_start=2309 + _globals['_STOPREASON']._serialized_end=2448 + _globals['_BATCHEDGENERATIONREQUEST']._serialized_start=28 + _globals['_BATCHEDGENERATIONREQUEST']._serialized_end=189 + _globals['_SINGLEGENERATIONREQUEST']._serialized_start=192 + _globals['_SINGLEGENERATIONREQUEST']._serialized_end=351 + _globals['_BATCHEDGENERATIONRESPONSE']._serialized_start=353 + _globals['_BATCHEDGENERATIONRESPONSE']._serialized_end=426 + _globals['_GENERATIONREQUEST']._serialized_start=428 + _globals['_GENERATIONREQUEST']._serialized_end=461 + _globals['_GENERATIONRESPONSE']._serialized_start=464 + _globals['_GENERATIONRESPONSE']._serialized_end=707 + _globals['_PARAMETERS']._serialized_start=710 + _globals['_PARAMETERS']._serialized_end=967 + _globals['_DECODINGPARAMETERS']._serialized_start=970 + _globals['_DECODINGPARAMETERS']._serialized_end=1167 + _globals['_DECODINGPARAMETERS_LENGTHPENALTY']._serialized_start=1090 + _globals['_DECODINGPARAMETERS_LENGTHPENALTY']._serialized_end=1148 + _globals['_SAMPLINGPARAMETERS']._serialized_start=1169 + _globals['_SAMPLINGPARAMETERS']._serialized_end=1287 + _globals['_STOPPINGCRITERIA']._serialized_start=1290 + _globals['_STOPPINGCRITERIA']._serialized_end=1469 + _globals['_RESPONSEOPTIONS']._serialized_start=1472 + _globals['_RESPONSEOPTIONS']._serialized_end=1624 + _globals['_TOKENINFO']._serialized_start=1627 + 
_globals['_TOKENINFO']._serialized_end=1773 + _globals['_TOKENINFO_TOPTOKEN']._serialized_start=1732 + _globals['_TOKENINFO_TOPTOKEN']._serialized_end=1773 + _globals['_BATCHEDTOKENIZEREQUEST']._serialized_start=1775 + _globals['_BATCHEDTOKENIZEREQUEST']._serialized_end=1882 + _globals['_BATCHEDTOKENIZERESPONSE']._serialized_start=1884 + _globals['_BATCHEDTOKENIZERESPONSE']._serialized_end=1953 + _globals['_TOKENIZEREQUEST']._serialized_start=1955 + _globals['_TOKENIZEREQUEST']._serialized_end=1986 + _globals['_TOKENIZERESPONSE']._serialized_start=1988 + _globals['_TOKENIZERESPONSE']._serialized_end=2043 + _globals['_MODELINFOREQUEST']._serialized_start=2045 + _globals['_MODELINFOREQUEST']._serialized_end=2081 + _globals['_MODELINFORESPONSE']._serialized_start=2084 + _globals['_MODELINFORESPONSE']._serialized_end=2264 + _globals['_MODELINFORESPONSE_MODELKIND']._serialized_start=2214 + _globals['_MODELINFORESPONSE_MODELKIND']._serialized_end=2264 + _globals['_GENERATIONSERVICE']._serialized_start=2451 + _globals['_GENERATIONSERVICE']._serialized_end=2775 +# @@protoc_insertion_point(module_scope) diff --git a/language/llama2-70b/generation_pb2_grpc.py b/language/llama2-70b/generation_pb2_grpc.py new file mode 100644 index 000000000..9ab1e4eb5 --- /dev/null +++ b/language/llama2-70b/generation_pb2_grpc.py @@ -0,0 +1,169 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +import generation_pb2 as generation__pb2 + + +class GenerationServiceStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.Generate = channel.unary_unary( + '/fmaas.GenerationService/Generate', + request_serializer=generation__pb2.BatchedGenerationRequest.SerializeToString, + response_deserializer=generation__pb2.BatchedGenerationResponse.FromString, + ) + self.GenerateStream = channel.unary_stream( + '/fmaas.GenerationService/GenerateStream', + request_serializer=generation__pb2.SingleGenerationRequest.SerializeToString, + response_deserializer=generation__pb2.GenerationResponse.FromString, + ) + self.Tokenize = channel.unary_unary( + '/fmaas.GenerationService/Tokenize', + request_serializer=generation__pb2.BatchedTokenizeRequest.SerializeToString, + response_deserializer=generation__pb2.BatchedTokenizeResponse.FromString, + ) + self.ModelInfo = channel.unary_unary( + '/fmaas.GenerationService/ModelInfo', + request_serializer=generation__pb2.ModelInfoRequest.SerializeToString, + response_deserializer=generation__pb2.ModelInfoResponse.FromString, + ) + + +class GenerationServiceServicer(object): + """Missing associated documentation comment in .proto file.""" + + def Generate(self, request, context): + """Generates text given a text prompt, for one or more inputs + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def GenerateStream(self, request, context): + """Generates text given a single input prompt, streaming the response + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Tokenize(self, request, context): + """Tokenize text + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise 
NotImplementedError('Method not implemented!') + + def ModelInfo(self, request, context): + """Model info + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_GenerationServiceServicer_to_server(servicer, server): + rpc_method_handlers = { + 'Generate': grpc.unary_unary_rpc_method_handler( + servicer.Generate, + request_deserializer=generation__pb2.BatchedGenerationRequest.FromString, + response_serializer=generation__pb2.BatchedGenerationResponse.SerializeToString, + ), + 'GenerateStream': grpc.unary_stream_rpc_method_handler( + servicer.GenerateStream, + request_deserializer=generation__pb2.SingleGenerationRequest.FromString, + response_serializer=generation__pb2.GenerationResponse.SerializeToString, + ), + 'Tokenize': grpc.unary_unary_rpc_method_handler( + servicer.Tokenize, + request_deserializer=generation__pb2.BatchedTokenizeRequest.FromString, + response_serializer=generation__pb2.BatchedTokenizeResponse.SerializeToString, + ), + 'ModelInfo': grpc.unary_unary_rpc_method_handler( + servicer.ModelInfo, + request_deserializer=generation__pb2.ModelInfoRequest.FromString, + response_serializer=generation__pb2.ModelInfoResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'fmaas.GenerationService', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + + # This class is part of an EXPERIMENTAL API. +class GenerationService(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def Generate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/fmaas.GenerationService/Generate', + generation__pb2.BatchedGenerationRequest.SerializeToString, + generation__pb2.BatchedGenerationResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def GenerateStream(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_stream(request, target, '/fmaas.GenerationService/GenerateStream', + generation__pb2.SingleGenerationRequest.SerializeToString, + generation__pb2.GenerationResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def Tokenize(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/fmaas.GenerationService/Tokenize', + generation__pb2.BatchedTokenizeRequest.SerializeToString, + generation__pb2.BatchedTokenizeResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def ModelInfo(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/fmaas.GenerationService/ModelInfo', + 
generation__pb2.ModelInfoRequest.SerializeToString, + generation__pb2.ModelInfoResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/language/llama2-70b/inference.py b/language/llama2-70b/inference.py new file mode 100644 index 000000000..e62a51331 --- /dev/null +++ b/language/llama2-70b/inference.py @@ -0,0 +1,186 @@ +from typing import Optional, Union + +import grpc +import generation_pb2_grpc +import socket +import ssl +import sys + + +def get_server_certificate(host: str, port: int) -> str: + """connect to host:port and get the certificate it presents + + This is almost the same as `ssl.get_server_certificate`, but + when opening the TLS socket, `server_hostname` is also provided. + + This retrieves the correct certificate for hosts using name-based + virtual hosting. + """ + if sys.version_info >= (3, 10): + # ssl.get_server_certificate supports TLS SNI only above 3.10 + # https://github.com/python/cpython/pull/16820 + return ssl.get_server_certificate((host, port)) + + context = ssl.SSLContext() + + with socket.create_connection((host, port)) as sock, context.wrap_socket( + sock, server_hostname=host + ) as ssock: + cert_der = ssock.getpeercert(binary_form=True) + + assert cert_der + return ssl.DER_cert_to_PEM_cert(cert_der) + + +class GrpcClient: + def __init__( + self, + host: str, + port: int, + *, + insecure: bool = False, + verify: Optional[bool] = None, + ca_cert: Union[None, bytes, str] = None, + client_cert: Union[None, bytes, str] = None, + client_key: Union[None, bytes, str] = None, + ) -> None: + self._channel = self._make_channel( + host, + port, + insecure=insecure, + verify=verify, + client_key=client_key, + client_cert=client_cert, + ca_cert=ca_cert, + ) + self.generation_service_stub = generation_pb2_grpc.GenerationServiceStub( + self._channel + ) + + def make_request(self, texts: str, model_id: str = "flan-t5-small"): + request = generation_pb2_grpc.generation__pb2.BatchedGenerationRequest( + model_id=model_id, + requests=[generation_pb2_grpc.generation__pb2.GenerationRequest(text=text) for text in texts], + params=generation_pb2_grpc.generation__pb2.Parameters( + method=generation_pb2_grpc.generation__pb2.GREEDY, + stopping=generation_pb2_grpc.generation__pb2.StoppingCriteria( + max_new_tokens=1024, + min_new_tokens=1 + ) + ) + ) + result = self.generation_service_stub.Generate(request=request) + return result + + def __enter__(self): + return self + + def __exit__(self, *exc_info): + self._close() + return False + + def _close(self): + try: + if hasattr(self, "_channel") and self._channel: + self._channel.close() + except Exception as exc: + print(f"Unexpected exception while closing client: {exc}") + + def __del__(self): + self._close() + + def _make_channel( + self, + host: str, + port: int, + *, + insecure: bool = False, + verify: Optional[bool] = None, + ca_cert: Union[None, bytes, str] = None, + client_key: Union[None, bytes, str] = None, + client_cert: Union[None, bytes, str] = None, + ) -> grpc.Channel: + """Creates a grpc channel + + Args: + - host: str + - port: str + - (optional) insecure: use a plaintext connection (default=False) + - (optional) verify: set to False to disable remote host certificate(s) + verification. 
Cannot be used with `plaintext` or with MTLS + - (optional) ca_cert: certificate authority to use + - (optional) client_key: client key for mTLS mode + - (optional) client_cert: client cert for mTLS mode + + """ + if not host.strip(): + raise ValueError("A non empty host name is required") + if int(port) <= 0: + raise ValueError("A non zero port number is required") + if insecure and any( + (val is not None) for val in (ca_cert, client_key, client_cert) + ): + raise ValueError("cannot use insecure with TLS/mTLS certificates") + if insecure and verify: + raise ValueError("insecure cannot be used with verify") + + client_key_bytes = self._try_load_certificate(client_key) + client_cert_bytes = self._try_load_certificate(client_cert) + ca_cert_bytes = self._try_load_certificate(ca_cert) + + connection = f"{host}:{port}" + if insecure: + print("Connecting over an insecure plaintext grpc channel") + return grpc.insecure_channel(connection) + + credentials_kwargs: dict[str, bytes] = {} + if ca_cert_bytes and not (any((client_cert_bytes, client_key_bytes))): + print("Connecting using provided CA certificate for secure channel") + credentials_kwargs.update(root_certificates=ca_cert_bytes) + elif client_cert_bytes and client_key_bytes and ca_cert_bytes: + print("Connecting using mTLS for secure channel") + credentials_kwargs.update( + root_certificates=ca_cert_bytes, + private_key=client_key_bytes, + certificate_chain=client_cert_bytes, + ) + elif verify is False: + cert = get_server_certificate(host, port).encode() + credentials_kwargs.update(root_certificates=cert) + + return grpc.secure_channel( + connection, grpc.ssl_channel_credentials(**credentials_kwargs) + ) + + @staticmethod + def _try_load_certificate(certificate: Union[None, bytes, str]) -> Optional[bytes]: + """If the certificate points to a file, return the contents (plaintext reads). 
+ Else return the bytes""" + if not certificate: + return None + + if isinstance(certificate, bytes): + return certificate + + if isinstance(certificate, str): + with open(certificate, "rb") as secret_file: + return secret_file.read() + raise ValueError( + f"{certificate=} should be a path to a certificate files or bytes" + ) + + +if __name__ == "__main__": + # host = "flan-t5-small-predictor-caikit-testing.apps.aisrhods-wx.8goc.p1.openshiftapps.com" + # port = 443 + host = "localhost" + port = 8033 + print(f"connecting to {host=}:{port}") + client = GrpcClient( + host, + port, + insecure=True, + # verify=False, + ) + + client.make_request("this is the query text", model_id="flan-t5-small") diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py index 424d48df7..6ccddd0e7 100644 --- a/language/llama2-70b/main.py +++ b/language/llama2-70b/main.py @@ -19,6 +19,8 @@ def get_args(): parser.add_argument("--api-server", type=str, default=None, help="Specify an api endpoint call to use api mode") parser.add_argument("--api-model-name", type=str, default=None, help="Specify a model name to use api mode") parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") + parser.add_argument("--grpc", action="store_true", help="Enable grpc for api endpoint") + parser.add_argument("--batch-grpc", action="store_true", help="Enable batch requests for grpc") parser.add_argument("--dtype", type=str, default="float32", help="data type of the model, choose from float16, bfloat16 and float32") parser.add_argument("--device", type=str, choices=["cpu", "cuda:0"], default="cpu", help="device to use") parser.add_argument("--audit-conf", type=str, default="audit.conf", help="audit config for LoadGen settings during compliance runs") @@ -72,6 +74,8 @@ def main(): model_path=args.model_path, api_server=args.api_server, api_model_name=args.api_model_name, + grpc=args.grpc, + batch_grpc=args.batch_grpc, dtype=args.dtype, dataset_path=args.dataset_path, total_sample_count=args.total_sample_count, From 58e16ea7ee0022887848e4559c86fd9f139d5ae0 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 25 Jan 2024 14:23:41 -0500 Subject: [PATCH 51/75] Streaming first pass --- language/llama2-70b/SUT.py | 22 +++++++++++++++++-- .../api-endpoint-artifacts/benchmark.yaml | 2 +- .../api-endpoint-artifacts/serving-tgis.yaml | 10 ++++----- language/llama2-70b/inference.py | 15 +++++++++++++ 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index b7793563b..bf7c195e3 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -112,7 +112,7 @@ def __init__(self, if not batch_size: if device == "cpu": - batch_size = 300 + batch_size = 512 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. 
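        # Sizing note (a sketch of the relationship, not a tuning rule): this
        # client-side batch_size only controls how many LoadGen samples are grouped
        # per batch on the client before being fanned out to the endpoint, e.g.
        # 1,000 Offline samples with batch_size = 512 become chunks of 512 and 488.
        # The server-side limits live separately in
        # api-endpoint-artifacts/serving-tgis.yaml (MAX_BATCH_SIZE,
        # MAX_CONCURRENT_REQUESTS), which this commit raises alongside the new
        # client-side value.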
self.batch_size = batch_size @@ -456,10 +456,28 @@ def stream_api(self, input, response_ids): token_cache.append(token) return token_cache + def stream_api_grpc(self, input, response_ids): + token_cache = [] + first = True + resps = self.grpc_client.make_request_stream(input, model_id=self.api_model_name) + for resp in resps: + if resp.text: + tokens = self.tokenizer(resp.text)["input_ids"][1:] + if first: + self.first_token_queue.put((tokens[0], response_ids[0])) + token_cache.extend(tokens[1:]) + first = False + else: + token_cache.extend(tokens) + return token_cache + def async_process_query(self, input_ids_tensor, qitem_id): decoded = self.tokenizer.decode(input_ids_tensor[0]) response_ids = [qitem_id] - output_tokens = self.stream_api(decoded, response_ids) + if self.grpc: + output_tokens = self.stream_api_grpc(decoded, response_ids) + else: + output_tokens = self.stream_api(decoded, response_ids) n_tokens = len(output_tokens) response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 79001774d..0fe72ded5 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:grpc-batch + image: quay.io/meyceoz/mlperf-inference:grpc-stream resources: requests: memory: 20000Mi diff --git a/language/llama2-70b/api-endpoint-artifacts/serving-tgis.yaml b/language/llama2-70b/api-endpoint-artifacts/serving-tgis.yaml index b46148133..fa0644af5 100644 --- a/language/llama2-70b/api-endpoint-artifacts/serving-tgis.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/serving-tgis.yaml @@ -24,11 +24,11 @@ spec: # value: float16 # Dynamic batch size changes - name: MAX_BATCH_SIZE - value: "256" + value: "512" - name: MAX_CONCURRENT_REQUESTS - value: "300" + value: "5000" - name: MAX_BATCH_WEIGHT - value: "550000" + value: "1000000" - name: MAX_SEQUENCE_LENGTH value: "2048" - name: MAX_PREFILL_WEIGHT @@ -43,8 +43,8 @@ spec: value: "5" resources: # configure as required requests: - cpu: 64 - memory: 900Gi + cpu: 96 + memory: 1000Gi nvidia.com/gpu: 8 limits: nvidia.com/gpu: 8 diff --git a/language/llama2-70b/inference.py b/language/llama2-70b/inference.py index e62a51331..c7893c2ab 100644 --- a/language/llama2-70b/inference.py +++ b/language/llama2-70b/inference.py @@ -71,6 +71,21 @@ def make_request(self, texts: str, model_id: str = "flan-t5-small"): ) result = self.generation_service_stub.Generate(request=request) return result + + def make_request_stream(self, text: str, model_id: str = "flan-t5-small"): + request = generation_pb2_grpc.generation__pb2.SingleGenerationRequest( + model_id=model_id, + request=generation_pb2_grpc.generation__pb2.GenerationRequest(text=text), + params=generation_pb2_grpc.generation__pb2.Parameters( + method=generation_pb2_grpc.generation__pb2.GREEDY, + stopping=generation_pb2_grpc.generation__pb2.StoppingCriteria( + max_new_tokens=1024, + min_new_tokens=1 + ) + ) + ) + result = self.generation_service_stub.GenerateStream(request=request) + return result def __enter__(self): return self From f35c17e3decd7bb24053ce02ab31222c78658517 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 25 Jan 2024 15:04:58 -0500 Subject: [PATCH 52/75] Updated server impl --- language/llama2-70b/SUT.py | 9 ++++----- 
.../llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- language/llama2-70b/inference.py | 3 +++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index bf7c195e3..14fdaf08f 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -461,14 +461,13 @@ def stream_api_grpc(self, input, response_ids): first = True resps = self.grpc_client.make_request_stream(input, model_id=self.api_model_name) for resp in resps: - if resp.text: - tokens = self.tokenizer(resp.text)["input_ids"][1:] + if resp.tokens: + token = self.llama_vocab[resp.tokens[0].text] if first: - self.first_token_queue.put((tokens[0], response_ids[0])) - token_cache.extend(tokens[1:]) + self.first_token_queue.put((token, response_ids[0])) first = False else: - token_cache.extend(tokens) + token_cache.append(token) return token_cache def async_process_query(self, input_ids_tensor, qitem_id): diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 0fe72ded5..9ddd0c384 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:grpc-stream + image: quay.io/meyceoz/mlperf-inference:v5 resources: requests: memory: 20000Mi diff --git a/language/llama2-70b/inference.py b/language/llama2-70b/inference.py index c7893c2ab..51de405ef 100644 --- a/language/llama2-70b/inference.py +++ b/language/llama2-70b/inference.py @@ -81,6 +81,9 @@ def make_request_stream(self, text: str, model_id: str = "flan-t5-small"): stopping=generation_pb2_grpc.generation__pb2.StoppingCriteria( max_new_tokens=1024, min_new_tokens=1 + ), + response=generation_pb2_grpc.generation__pb2.ResponseOptions( + generated_tokens=True ) ) ) From 132f72566a626974c67fc7d3d0303593907a94cc Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 25 Jan 2024 15:42:08 -0500 Subject: [PATCH 53/75] Fully functional, updated README --- language/llama2-70b/SUT.py | 2 +- .../api-endpoint-artifacts/README.md | 30 ++++++++++++++++++- .../api-endpoint-artifacts/benchmark.yaml | 2 +- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 14fdaf08f..6fbc75656 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -377,7 +377,7 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) diff --git a/language/llama2-70b/api-endpoint-artifacts/README.md b/language/llama2-70b/api-endpoint-artifacts/README.md index 6a7260b23..30a5cd889 100644 --- a/language/llama2-70b/api-endpoint-artifacts/README.md +++ b/language/llama2-70b/api-endpoint-artifacts/README.md @@ -3,11 +3,39 @@ Prerequisites: - Install the OpenShift AI model serving stack - Add your AWS credentials to `secret.yaml` access the 
model files - - Apply `secret.yaml`, `sa.yaml`, `serving-runtime.yaml`, then finally `model.yaml` + - Apply `secret.yaml`, `sa.yaml` + - FOR CAIKIT: Apply `serving-runtime.yaml`, then finally `model.yaml` + - FOR TGIS STANDALONE: Apply `serving-tgis.yaml`, then finally `model-tgis.yaml` - Create a benchmark pod using `benchmark.yaml` In the pod, before any benchmark, first run `cd inference/language/llama2-70b` +## STANDALONE TGIS INSTRUCTIONS +For the full accuracy benchmark (offline), run in the pod: +``` +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --accuracy --grpc --batch-grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +``` +You can then run the same evaluation/consolidation scripts as the regular benchmark +Example API host: `https://llama-2-70b-chat-isvc-predictor-llama-service.apps.h100serving.perf.lab.eng.bos.redhat.com` + + +For the performance benchmark (offline), run in the pod: +``` +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --grpc --batch-grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +``` +(It is the same, just with `--accuracy` removed) + + +For the performance benchmark (server), run in the pod: +``` +python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log +``` +(Configure target qps in `user.conf`) + + +NOTE: Hyperparams are currently configured for 8xH100 + +## CAIKIT INSTRUCTIONS For the full accuracy benchmark (offline), run in the pod: ``` python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf-caikit --accuracy --mlperf-conf mlperf.conf --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 9ddd0c384..a9e10c15b 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v5 + image: quay.io/meyceoz/mlperf-inference:v6 resources: requests: memory: 20000Mi From 9f31f19fcae4b829466b4b67391ea1df68c666d5 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 12 Feb 2024 09:56:56 -0500 Subject: [PATCH 54/75] Update default client-side batches --- language/llama2-70b/SUT.py | 2 +- language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 6fbc75656..806502349 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -112,7 +112,7 @@ def __init__(self, if not batch_size: if device == 
"cpu": - batch_size = 512 + batch_size = 2000 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. self.batch_size = batch_size diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index a9e10c15b..402342a4e 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v6 + image: quay.io/meyceoz/mlperf-inference:v7 resources: requests: memory: 20000Mi From 654dda5bcee0e73af14455836c0d5f36a4f67307 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 12 Feb 2024 15:12:40 -0500 Subject: [PATCH 55/75] v8 Update --- language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 402342a4e..71d8e912d 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v7 + image: quay.io/meyceoz/mlperf-inference:v8 resources: requests: memory: 20000Mi From c7f699d6888edda70e332aa7e1ef34831dce9aff Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 15 Feb 2024 13:27:50 -0500 Subject: [PATCH 56/75] GPT-J first pass --- language/gpt-j/SUT.py | 527 ++++++++++++++++++ .../api-endpoint-artifacts/Dockerfile-API | 27 + .../gpt-j/api-endpoint-artifacts/README.md | 37 ++ .../api-endpoint-artifacts/benchmark.yaml | 21 + .../gpt-j/api-endpoint-artifacts/model.yaml | 20 + .../offline_performance_log.log | 1 + language/gpt-j/api-endpoint-artifacts/sa.yaml | 6 + .../gpt-j/api-endpoint-artifacts/secret.yaml | 12 + .../api-endpoint-artifacts/serving-tgis.yaml | 67 +++ language/gpt-j/dataset.py | 21 +- language/gpt-j/generation_pb2.py | 70 +++ language/gpt-j/generation_pb2_grpc.py | 169 ++++++ language/gpt-j/inference.py | 210 +++++++ language/gpt-j/main.py | 116 ++-- language/gpt-j/old_main.py | 94 ++++ 15 files changed, 1336 insertions(+), 62 deletions(-) create mode 100644 language/gpt-j/SUT.py create mode 100644 language/gpt-j/api-endpoint-artifacts/Dockerfile-API create mode 100644 language/gpt-j/api-endpoint-artifacts/README.md create mode 100644 language/gpt-j/api-endpoint-artifacts/benchmark.yaml create mode 100644 language/gpt-j/api-endpoint-artifacts/model.yaml create mode 100644 language/gpt-j/api-endpoint-artifacts/offline_performance_log.log create mode 100644 language/gpt-j/api-endpoint-artifacts/sa.yaml create mode 100644 language/gpt-j/api-endpoint-artifacts/secret.yaml create mode 100644 language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml create mode 100644 language/gpt-j/generation_pb2.py create mode 100644 language/gpt-j/generation_pb2_grpc.py create mode 100644 language/gpt-j/inference.py create mode 100644 language/gpt-j/old_main.py diff --git a/language/gpt-j/SUT.py b/language/gpt-j/SUT.py new file mode 100644 index 000000000..331a5e495 --- /dev/null +++ b/language/gpt-j/SUT.py @@ -0,0 +1,527 @@ +import os +import sys +import time +import re +import numpy as np +import array +import torch +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer +from 
transformers.generation.streamers import BaseStreamer + +import pickle +import time +import threading +import tqdm +import queue + +from concurrent.futures.thread import ThreadPoolExecutor +import requests +from urllib3.exceptions import InsecureRequestWarning +import json + +from inference import GrpcClient + +requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) + +import logging +from typing import TYPE_CHECKING, Optional, List +from pathlib import Path + +import mlperf_loadgen as lg +from dataset import Dataset + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("GPT-J-SUT") + +gen_kwargs = { + "early_stopping": True, + "max_new_tokens": 128, + "min_new_tokens": 30, + "num_beams": 4, +} + + + +class FirstTokenStreamer(BaseStreamer): + """ Streams first tokens to a 'holder' """ + + def __init__(self, first_token, tokens_cache=[], is_first_token=True, response_ids=[] ): + """ Response ids added to 'sign' the first token""" + + self.first_token = first_token # Queue for first token + self.is_first_token = is_first_token + + # Cache for subsequent generated tokens + self.tokens_cache = tokens_cache + + self.response_ids = response_ids + + self.is_prompt = True # The first tokens sent to the streamer are actually the input prompts + + def put(self, value): + """ Caches the tokens as they're generated. Assumes bs=1 """ + + # Prompts are streamed first so we need to skip the first time value that arrives + if self.is_prompt: + self.is_prompt = False + return + + value = value.item() + if self.is_first_token: + + # Add generated first token together with its query response_id to first tokens queue + self.first_token.put((value, self.response_ids[0])) + + self.is_first_token = False + return + + self.tokens_cache.append(value) + + + def end(self): + pass + + def get_out_tokens(self): + return self.tokens_cache + + +class SUT(): + def __init__(self, + model_path=None, + api_server=None, + api_model_name=None, + grpc=False, + batch_grpc=False, + dtype="bfloat16", + device="cpu", + batch_size=None, + total_sample_count=13368, + dataset_path=None, + use_cached_outputs=False, # Set this to True *only for test accuracy runs* in case your prior session was killed partway through + workers=1): + + self.model_path = model_path or "EleutherAI/gpt-j-6B" + self.api_server = api_server + self.api_model_name = api_model_name + self.grpc = grpc + self.batch_grpc = batch_grpc + self.device = device + + if not batch_size: + if device == "cpu": + batch_size = 2 + else: + batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. + self.batch_size = batch_size + + # dtype + if dtype == 'bfloat16': + self.amp_enabled = True + self.amp_dtype = torch.bfloat16 + elif dtype == 'float16': + self.amp_enabled = True + self.amp_dtype = torch.float16 + else: + self.amp_enabled = False + self.amp_dtype = torch.float32 + + if 'cuda' in self.device: + assert torch.cuda.is_available(), "torch gpu is not available, exiting..." 
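        # The remainder of __init__ below wires up the LoadGen side: the Dataset/QSL
        # pair built from dataset_path, load_model() (which configures the gRPC/REST
        # API client or loads the local HuggingFace model, plus the tokenizer), and
        # the thread-safe query_queue consumed by the worker threads started in
        # start(). Note that the module-level gen_kwargs above apply only to the
        # local generate() path; the API paths send their own decoding parameters
        # with each request.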
+ + self.dataset_path = dataset_path + self.data_object = Dataset(dataset_path=self.dataset_path, + total_count_override=total_sample_count) + self.qsl = lg.ConstructQSL(self.data_object.count, self.data_object.perf_count, + self.data_object.LoadSamplesToRam, self.data_object.UnloadSamplesFromRam) + self.load_model() + + self.num_workers = workers + self.worker_threads = [None] * self.num_workers + self.query_queue = queue.Queue() + + self.use_cached_outputs = use_cached_outputs + self.sample_counter = 0 + self.sample_counter_lock = threading.Lock() + + + def start(self): + # Create worker threads + for j in range(self.num_workers): + worker = threading.Thread(target=self.process_queries) + worker.start() + self.worker_threads[j] = worker + + def stop(self): + for _ in range(self.num_workers): + self.query_queue.put(None) + + for worker in self.worker_threads: + worker.join() + + + def query_api(self, input): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model_id': self.api_model_name, + 'inputs': input, + 'parameters': { + 'max_new_tokens': 1024, + 'min_new_tokens': 1, + 'decoding_method': "GREEDY" + }, + } + + response_code = 0 + while response_code != 200: + try: + response = requests.post( + self.api_server, + headers=headers, + json=json_data, + verify=False, + ) + response_code = response.status_code + except: + print("connection failure") + return json.loads(response.text)["generated_text"] + + + def query_api_grpc(self, input): + resp = self.grpc_client.make_request([input], model_id=self.api_model_name) + return resp.responses[0].text + + def query_api_batch_grpc(self, inputs): + resps = self.grpc_client.make_request(inputs, model_id=self.api_model_name) + return [resp.text for resp in resps.responses] + + def process_queries(self): + """Processor of the queued queries. 
User may choose to add batching logic """ + + while True: + qitem = self.query_queue.get() + if qitem is None: + break + + query_ids = [q.index for q in qitem] + + fname = "q" + "_".join([str(i) for i in query_ids]) + fname = f"run_outputs/{fname}.pkl" + _p = Path(fname) + if self.use_cached_outputs and _p.exists(): + # Read cache + with _p.open(mode="rb") as f: + d = pickle.load(f) + processed_output = d["outputs"] + tik1 = None + tik2 = None + tik3 = None + tok = None + else: + # Construct / collate batch + tik1 = time.time() + + input_ids_tensor = [] + input_masks_tensor = [] + input_len = [] + for q in qitem: + input_ids_tensor.append(self.data_object.source_encoded_input_ids[q.index]) + #input_masks_tensor.append(self.data_object.source_encoded_attn_masks[q.index]) + #input_len.append(self.data_object.input_lens[q.index]) + #input_ids_tensor = torch.cat(input_ids_tensor) + #input_masks_tensor = torch.cat(input_masks_tensor) + + #assert input_ids_tensor.shape == input_masks_tensor.shape + #assert input_ids_tensor.shape[0] <= self.batch_size + + if self.api_server: + #decoded = self.tokenizer.batch_decode(input_ids_tensor) + #cleaned = [entry.replace('','').replace('','') for entry in decoded] + bs = len(input_ids_tensor) + + tik2 = time.time() + + if self.api_server: + if self.grpc: + if self.batch_grpc: + output = self.query_api_batch_grpc(input_ids_tensor) + else: + with ThreadPoolExecutor(max_workers=bs) as executor: + output = list(executor.map(self.query_api_grpc,input_ids_tensor)) + else: + with ThreadPoolExecutor(max_workers=bs) as executor: + output = list(executor.map(self.query_api,input_ids_tensor)) + else: + pred_output_tokens = self.model.generate( + input_ids=input_ids_tensor, + attention_mask=input_masks_tensor, + pad_token_id=self.tokenizer.pad_token_id, + **gen_kwargs + ) + + tik3 = time.time() + + if self.api_server: + processed_output = np.array(self.tokenizer(output, padding='longest')['input_ids']) + else: + processed_output = self.data_object.postProcess(pred_output_tokens, + input_seq_lens=input_len, + query_id_list=query_ids) + + for i in range(len(qitem)): + n_tokens = processed_output[i].shape[0] + response_array = array.array("B", processed_output[i].tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse(qitem[i].id, bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) + + tok = time.time() + + with self.sample_counter_lock: + self.sample_counter += len(qitem) + print(f"Samples run: {self.sample_counter}") + if tik1: + print(f"\tBatchMaker time: {tik2 - tik1}") + print(f"\tInference time: {tik3 - tik2}") + print(f"\tPostprocess time: {tok - tik3}") + print(f"\t==== Total time: {tok - tik1}") + else: + print(f"\tLoaded from cache: {_p}") + + + def load_model(self): + if self.api_server: + if self.grpc: + hostname = re.sub("https://|http://", "", self.api_server) + if hostname[-1] == "/": + hostname = hostname[:-1] + self.grpc_client = GrpcClient( + hostname, + 443, + verify=False, + ) + elif not "http" in self.api_server: + self.api_server = "http://" + self.api_server + + if not self.api_model_name: + sys.exit("API Server was specified but no model name was provided") + else: + self.model = AutoModelForCausalLM.from_pretrained( + self.model_path, + device_map="auto", + low_cpu_mem_usage=True, + torch_dtype=self.amp_dtype + ) + print("Loaded model") + + self.device = torch.device(self.device) + if self.device == "cpu": + self.model = self.model.to(self.device) # Force CPU if your system has GPU and you specifically want 
CPU-only run + + self.model.eval() + self.model = self.model.to(memory_format=torch.channels_last) + + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, + model_max_length=1024, + padding_side="left", + use_fast=True,) #changed from false + + self.tokenizer.pad_token = self.tokenizer.eos_token + print("Loaded tokenizer") + + def get_sut(self): + self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) + return self.sut + + def get_qsl(self): + return self.qsl + + + def predict(self,**kwargs): + raise NotImplementedError + + + def issue_queries(self, query_samples): + """ Receives samples from loadgen and adds them to queue. Users may choose to batch here""" + + list_prompts_tokens = [] + list_prompts_attn_masks = [] + + print(f"IssueQuery started with {len(query_samples)} samples") + while len(query_samples) > 0: + self.query_queue.put(query_samples[:self.batch_size]) + query_samples = query_samples[self.batch_size:] + print(f"IssueQuery done") + + + def flush_queries(self): + pass + + def __del__(self): + pass + + +class SUTServer(SUT): + def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + + super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) + + with open(f"{self.model_path}/tokenizer.json", 'r') as token_file: + gpt_tokenizer = json.load(token_file) + self.gpt_vocab = gpt_tokenizer["model"]["vocab"] + + self.first_token_queue = queue.Queue() + + + def start(self): + + # Create worker threads + for j in range(self.num_workers): + worker = threading.Thread(target=self.process_queries) + worker.start() + self.worker_threads[j] = worker + + # Create first token response thread + self.ft_response_thread = threading.Thread(target=self.process_first_tokens) + self.ft_response_thread.start() + + + def process_first_tokens(self): + + while True: + first_token_item = self.first_token_queue.get() + + if first_token_item is None: + log.info("Exiting First token response thread") + break + + first_tokens, response_id = first_token_item + + response_data = array.array("B", np.array(first_tokens, np.float32).tobytes()) + bi = response_data.buffer_info() + response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])] + lg.FirstTokenComplete(response) + + def stream_api(self, input, response_ids): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model_id': 'GPT-J', + 'inputs': input, + 'parameters': { + 'max_new_tokens': 1024, + 'min_new_tokens': 1, + 'decoding_method': "GREEDY" + }, + } + + token_cache = [] + s = requests.Session() + first = True + with s.post( + self.api_server, + headers=headers, + json=json_data, + verify=False, + stream=True + ) as resp: + for line in resp.iter_lines(): + if line: + decoded = line.decode() + if decoded.startswith("data"): + token_l = json.loads(decoded[6:])["tokens"] + if token_l: + token = self.gpt_vocab[token_l[0]["text"]] + if first: + self.first_token_queue.put((token, response_ids[0])) + first = False + else: + token_cache.append(token) + return token_cache + + def stream_api_grpc(self, input, response_ids): + token_cache = [] + first = True + resps = self.grpc_client.make_request_stream(input, model_id=self.api_model_name) + for resp in resps: + if resp.tokens: + token = 
self.gpt_vocab[resp.tokens[0].text] + if first: + self.first_token_queue.put((token, response_ids[0])) + first = False + else: + token_cache.append(token) + return token_cache + + def async_process_query(self, input_ids_tensor, qitem_id): + decoded = self.tokenizer.decode(input_ids_tensor[0]) + response_ids = [qitem_id] + if self.grpc: + output_tokens = self.stream_api_grpc(decoded, response_ids) + else: + output_tokens = self.stream_api(decoded, response_ids) + + n_tokens = len(output_tokens) + response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse( + qitem_id, bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) + sys.exit() + + def process_queries(self): + """Processor of the queued queries. User may choose to add batching logic """ + while True: + + qitem = self.query_queue.get() + if qitem is None: + break + + input_ids_tensor = self.data_object.source_encoded_input_ids[qitem.index] + input_masks_tensor = self.data_object.source_encoded_attn_masks[qitem.index] + + if self.api_server: + threading.Thread(target=self.async_process_query, args=(input_ids_tensor, qitem.id)).start() + else: + #TODO: This PoC is super slow with significant overhead. Best to create a patch to `generate` + tokens_cache = [] + tokens_streamer = FirstTokenStreamer(self.first_token_queue, tokens_cache=tokens_cache, is_first_token=True, response_ids=[qitem.id]) + + _ = self.model.generate( input_ids=input_ids_tensor, + attention_mask=input_masks_tensor, + pad_token_id=self.tokenizer.pad_token_id, + streamer = tokens_streamer, + **gen_kwargs + ) + + output_tokens = tokens_streamer.get_out_tokens() + + n_tokens = len(output_tokens) + response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse( + qitem.id, bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) + + + def issue_queries(self, query_samples): + + self.query_queue.put(query_samples[0]) + + + def stop(self): + for _ in range(self.num_workers): + self.query_queue.put(None) + + for worker in self.worker_threads: + worker.join() + + self.first_token_queue.put(None) + self.ft_response_thread.join() diff --git a/language/gpt-j/api-endpoint-artifacts/Dockerfile-API b/language/gpt-j/api-endpoint-artifacts/Dockerfile-API new file mode 100644 index 000000000..163974c7f --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/Dockerfile-API @@ -0,0 +1,27 @@ +FROM pytorch/pytorch:latest + +COPY inference ./inference +COPY cnn_eval.json ./cnn_eval.json +COPY gpt-model-info ./gpt-model-info + +RUN apt-get update && apt install build-essential -y +RUN conda install pybind11==2.10.4 -c conda-forge -y +#RUN conda install mkl mkl-include -y +#RUN conda install gperftools jemalloc==5.2.1 -c conda-forge -y +RUN pip install --upgrade pip +RUN pip install transformers==4.31.0 nltk==3.8.1 evaluate==0.4.0 absl-py==1.4.0 rouge-score==0.1.2 sentencepiece==0.1.99 accelerate==0.21.0 grpcio-tools datasets simplejson +RUN cd inference/loadgen && python -m pip install . +#RUN cd inference/loadgen && CFLAGS="-std=c++14 -O3" python setup.py bdist_wheel && cd .. 
&& pip install --force-reinstall loadgen/dist/`ls -r loadgen/dist/ | head -n1` ; +RUN cp inference/mlperf.conf inference/language/gpt-j/mlperf.conf + +ENV DATASET_PATH=/workspace/cnn_eval.json +ENV CHECKPOINT_PATH=/workspace/gpt-model-info +ENV ACCURACY_LOG_FILE=/workspace/inference/language/gpt-j/offline-logs/mlperf_log_accuracy.json + +#ENV KMP_BLOCKTIME=1 +#ENV KMP_SETTINGS=1 +#ENV KMP_AFFINITY=granularity=fine,compact,1,0 +# IOMP +#ENV LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so +# Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support. +#ENV LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so diff --git a/language/gpt-j/api-endpoint-artifacts/README.md b/language/gpt-j/api-endpoint-artifacts/README.md new file mode 100644 index 000000000..956f9fd2e --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/README.md @@ -0,0 +1,37 @@ +# Using OpenShift AI Model Serving (Caikit/TGIS) with MLPerf Inference + +Prerequisites: + - Install the OpenShift AI model serving stack + - Add your AWS credentials to `secret.yaml` access the model files + - Apply `secret.yaml`, `sa.yaml` + - FOR CAIKIT: Apply `serving-runtime.yaml`, then finally `model.yaml` + - FOR TGIS STANDALONE: Apply `serving-tgis.yaml`, then finally `model-tgis.yaml` + - Create a benchmark pod using `benchmark.yaml` + +In the pod, before any benchmark, first run `cd inference/language/gpt-j` + +## STANDALONE TGIS INSTRUCTIONS +For the full accuracy benchmark (offline), run in the pod: +``` +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server https://gpt-j-isvc-predictor-gpt-service.apps.gdr-perf.perf.eng.bos2.dc.redhat.com --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --accuracy --grpc --batch-grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +``` +You can then run the same evaluation/consolidation scripts as the regular benchmark +Example API host: `https://gpt-j-isvc-predictor-gpt-service.apps.gdr-perf.perf.eng.bos2.dc.redhat.com` + + +For the performance benchmark (offline), run in the pod: +``` +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server https://gpt-j-isvc-predictor-gpt-service.apps.gdr-perf.perf.eng.bos2.dc.redhat.com --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --grpc --batch-grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +``` +(It is the same, just with `--accuracy` removed) + + +TBD +For the performance benchmark (server), run in the pod: +``` +python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log +``` +(Configure target qps in `user.conf`) + + +NOTE: Hyperparams are currently configured for 1xA100 40GB diff --git a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml new file mode 100644 index 000000000..322543537 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: mlperf-inference-gpt +spec: + restartPolicy: Never + containers: + - 
name: mlperf-env + image: quay.io/meyceoz/mlperf-inference-gpt:offline + resources: + requests: + memory: 20000Mi + volumeMounts: + - mountPath: /dev/shm + name: dshm + command: [ "/bin/sh", "-c" ] + args: [ "sleep infinity" ] + volumes: + - name: dshm + emptyDir: + medium: Memory \ No newline at end of file diff --git a/language/gpt-j/api-endpoint-artifacts/model.yaml b/language/gpt-j/api-endpoint-artifacts/model.yaml new file mode 100644 index 000000000..69ce8ae25 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/model.yaml @@ -0,0 +1,20 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + annotations: + serving.knative.openshift.io/enablePassthrough: "true" + sidecar.istio.io/inject: "true" + sidecar.istio.io/rewriteAppHTTPProbers: "true" + name: gpt-j-isvc +spec: + predictor: + minReplicas: 1 + maxReplicas: 1 + #apiVersion: serving.kserve.io/v1alpha2 + serviceAccountName: sa + #timeout: 240 + model: + modelFormat: + name: pytorch + runtime: tgis-runtime-grpc + storageUri: s3://mlperf-inference-models/gpt-j-cnn/ \ No newline at end of file diff --git a/language/gpt-j/api-endpoint-artifacts/offline_performance_log.log b/language/gpt-j/api-endpoint-artifacts/offline_performance_log.log new file mode 100644 index 000000000..4e4fdb9e2 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/offline_performance_log.log @@ -0,0 +1 @@ +/Users/meyceoz/Documents/model-serving/conversion-zone/venv/bin/python3: can't open file '/Users/meyceoz/Documents/model-serving/benchmarking/inference/language/gpt-j/api-endpoint-artifacts/main.py': [Errno 2] No such file or directory diff --git a/language/gpt-j/api-endpoint-artifacts/sa.yaml b/language/gpt-j/api-endpoint-artifacts/sa.yaml new file mode 100644 index 000000000..6ccfd13b7 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/sa.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: sa +secrets: +- name: storage-config \ No newline at end of file diff --git a/language/gpt-j/api-endpoint-artifacts/secret.yaml b/language/gpt-j/api-endpoint-artifacts/secret.yaml new file mode 100644 index 000000000..436eaa057 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/secret.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Secret +metadata: + annotations: + serving.kserve.io/s3-endpoint: "s3.amazonaws.com" # replace with your s3 endpoint e.g minio-service.kubeflow:9000 + serving.kserve.io/s3-usehttps: "1" # by default 1, if testing with minio you can set to 0 + serving.kserve.io/s3-region: "us-east-1" + serving.kserve.io/s3-useanoncredential: "false" # omitting this is the same as false, if true will ignore provided credential and use anonymous credentials + name: storage-config +stringData: + "AWS_ACCESS_KEY_ID": "XXXXXXXXXXXXXXXXXX" + "AWS_SECRET_ACCESS_KEY": "XXXXXXXXXXXXXXXXXXXXXX" \ No newline at end of file diff --git a/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml b/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml new file mode 100644 index 000000000..d3e1e99d0 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml @@ -0,0 +1,67 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + name: tgis-runtime-grpc +spec: + multiModel: false + supportedModelFormats: + - autoSelect: true + name: pytorch + containers: + - name: kserve-container + image: quay.io/opendatahub/text-generation-inference:fast + command: ["text-generation-launcher"] + args: + - "--model-name=/mnt/models/" + - "--port=3000" + - "--grpc-port=8033" + env: + - name: 
TRANSFORMERS_CACHE + value: /tmp/transformers_cache + - name: NUM_GPUS + value: "1" + #- name: DTYPE_STR + # value: float16 + # Dynamic batch size changes + - name: MAX_BATCH_SIZE + value: "4" + - name: MAX_CONCURRENT_REQUESTS + value: "50" + - name: MAX_BATCH_WEIGHT + value: "10000" + - name: MAX_SEQUENCE_LENGTH + value: "2048" + - name: MAX_PREFILL_WEIGHT + value: "0" + - name: MAX_NEW_TOKENS + value: "128" +# - name: FLASH_ATTENTION +# value: "false" +# - name: DEPLOYMENT_FRAMEWORK +# value: hf_custom_tp + - name: LOG_GPU_USAGE_INTERVAL + value: "5" + resources: # configure as required + requests: + cpu: 12 + memory: 100Gi + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + readinessProbe: # Use exec probes instad of httpGet since the probes' port gets rewritten to the containerPort + exec: + command: + - curl + - localhost:3000/health + initialDelaySeconds: 5 + livenessProbe: + exec: + command: + - curl + - localhost:3000/health + initialDelaySeconds: 5 + # periodSeconds: 5 + ports: + - containerPort: 8033 + name: h2c + protocol: TCP \ No newline at end of file diff --git a/language/gpt-j/dataset.py b/language/gpt-j/dataset.py index 37d9cf354..1ec37ae1d 100644 --- a/language/gpt-j/dataset.py +++ b/language/gpt-j/dataset.py @@ -51,7 +51,7 @@ def __init__(self, dataset_path, batch_size=1, pad_val=1, pad_max=196, total_cou self.targets = [ f"{example['output']}" for example in self.list_data_dict] - self.source_encoded_input_ids, self.source_encoded_attn_masks = self.encode_samples() + self.source_encoded_input_ids = self.encode_samples()#, self.source_encoded_attn_masks = self.encode_samples() self.count = total_count_override or len(self.sources) self.perf_count = perf_count_override or self.count @@ -62,16 +62,19 @@ def encode_samples(self): total_samples = len(self.sources) source_encoded_input_ids = [] - source_encoded_attn_masks = [] + #source_encoded_attn_masks = [] for i in range(total_samples): - source_encoded = self.tokenizer(self.sources[i], return_tensors="pt", - padding=True, truncation=True, - max_length=1919) - source_encoded_input_ids.append(source_encoded.input_ids) - source_encoded_attn_masks.append(source_encoded.attention_mask) - - return source_encoded_input_ids, source_encoded_attn_masks + #source_encoded = self.tokenizer(self.sources[i], return_tensors="pt", + # padding=True, truncation=True, + # max_length=1919) + tok = self.tokenizer(self.sources[i])["input_ids"] + if len(tok) > 1920: + self.sources[i] = self.sources[i][:-16 - (len(tok) - 1920)*4] + self.sources[i][-16:] + source_encoded_input_ids.append(self.sources[i]) + #source_encoded_attn_masks.append(source_encoded.attention_mask) + + return source_encoded_input_ids#, source_encoded_attn_masks def LoadSamplesToRam(self, sample_list): pass diff --git a/language/gpt-j/generation_pb2.py b/language/gpt-j/generation_pb2.py new file mode 100644 index 000000000..544e1ec7d --- /dev/null +++ b/language/gpt-j/generation_pb2.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
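# Regeneration note (assumes the generation.proto source is available next to
# these files): this module and generation_pb2_grpc.py can be rebuilt with the
# grpcio-tools package already installed in the accompanying Dockerfile, e.g.
#   python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. generation.proto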
+# source: generation.proto +# Protobuf Python Version: 4.25.0 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10generation.proto\x12\x05\x66maas\"\xa1\x01\n\x18\x42\x61tchedGenerationRequest\x12\x10\n\x08model_id\x18\x01 \x01(\t\x12\x16\n\tprefix_id\x18\x02 \x01(\tH\x00\x88\x01\x01\x12*\n\x08requests\x18\x03 \x03(\x0b\x32\x18.fmaas.GenerationRequest\x12!\n\x06params\x18\n \x01(\x0b\x32\x11.fmaas.ParametersB\x0c\n\n_prefix_id\"\x9f\x01\n\x17SingleGenerationRequest\x12\x10\n\x08model_id\x18\x01 \x01(\t\x12\x16\n\tprefix_id\x18\x02 \x01(\tH\x00\x88\x01\x01\x12)\n\x07request\x18\x03 \x01(\x0b\x32\x18.fmaas.GenerationRequest\x12!\n\x06params\x18\n \x01(\x0b\x32\x11.fmaas.ParametersB\x0c\n\n_prefix_id\"I\n\x19\x42\x61tchedGenerationResponse\x12,\n\tresponses\x18\x01 \x03(\x0b\x32\x19.fmaas.GenerationResponse\"!\n\x11GenerationRequest\x12\x0c\n\x04text\x18\x02 \x01(\t\"\xf3\x01\n\x12GenerationResponse\x12\x19\n\x11input_token_count\x18\x06 \x01(\r\x12\x1d\n\x15generated_token_count\x18\x02 \x01(\r\x12\x0c\n\x04text\x18\x04 \x01(\t\x12&\n\x0bstop_reason\x18\x07 \x01(\x0e\x32\x11.fmaas.StopReason\x12\x15\n\rstop_sequence\x18\x0b \x01(\t\x12\x0c\n\x04seed\x18\n \x01(\x04\x12 \n\x06tokens\x18\x08 \x03(\x0b\x32\x10.fmaas.TokenInfo\x12&\n\x0cinput_tokens\x18\t \x03(\x0b\x32\x10.fmaas.TokenInfo\"\x81\x02\n\nParameters\x12%\n\x06method\x18\x01 \x01(\x0e\x32\x15.fmaas.DecodingMethod\x12+\n\x08sampling\x18\x02 \x01(\x0b\x32\x19.fmaas.SamplingParameters\x12)\n\x08stopping\x18\x03 \x01(\x0b\x32\x17.fmaas.StoppingCriteria\x12(\n\x08response\x18\x04 \x01(\x0b\x32\x16.fmaas.ResponseOptions\x12+\n\x08\x64\x65\x63oding\x18\x05 \x01(\x0b\x32\x19.fmaas.DecodingParameters\x12\x1d\n\x15truncate_input_tokens\x18\x06 \x01(\r\"\xc5\x01\n\x12\x44\x65\x63odingParameters\x12\x1a\n\x12repetition_penalty\x18\x01 \x01(\x02\x12\x44\n\x0elength_penalty\x18\x02 \x01(\x0b\x32\'.fmaas.DecodingParameters.LengthPenaltyH\x00\x88\x01\x01\x1a:\n\rLengthPenalty\x12\x13\n\x0bstart_index\x18\x01 \x01(\r\x12\x14\n\x0c\x64\x65\x63\x61y_factor\x18\x02 \x01(\x02\x42\x11\n\x0f_length_penalty\"v\n\x12SamplingParameters\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_k\x18\x02 \x01(\r\x12\r\n\x05top_p\x18\x03 \x01(\x02\x12\x11\n\ttypical_p\x18\x04 \x01(\x02\x12\x11\n\x04seed\x18\x05 \x01(\x04H\x00\x88\x01\x01\x42\x07\n\x05_seed\"\xb3\x01\n\x10StoppingCriteria\x12\x16\n\x0emax_new_tokens\x18\x01 \x01(\r\x12\x16\n\x0emin_new_tokens\x18\x02 \x01(\r\x12\x19\n\x11time_limit_millis\x18\x03 \x01(\r\x12\x16\n\x0estop_sequences\x18\x04 \x03(\t\x12\"\n\x15include_stop_sequence\x18\x05 \x01(\x08H\x00\x88\x01\x01\x42\x18\n\x16_include_stop_sequence\"\x98\x01\n\x0fResponseOptions\x12\x12\n\ninput_text\x18\x01 \x01(\x08\x12\x18\n\x10generated_tokens\x18\x02 \x01(\x08\x12\x14\n\x0cinput_tokens\x18\x03 \x01(\x08\x12\x16\n\x0etoken_logprobs\x18\x04 \x01(\x08\x12\x13\n\x0btoken_ranks\x18\x05 \x01(\x08\x12\x14\n\x0ctop_n_tokens\x18\x06 \x01(\r\"\x92\x01\n\tTokenInfo\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x0f\n\x07logprob\x18\x03 \x01(\x02\x12\x0c\n\x04rank\x18\x04 \x01(\r\x12-\n\ntop_tokens\x18\x05 
\x03(\x0b\x32\x19.fmaas.TokenInfo.TopToken\x1a)\n\x08TopToken\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x0f\n\x07logprob\x18\x03 \x01(\x02\"k\n\x16\x42\x61tchedTokenizeRequest\x12\x10\n\x08model_id\x18\x01 \x01(\t\x12(\n\x08requests\x18\x02 \x03(\x0b\x32\x16.fmaas.TokenizeRequest\x12\x15\n\rreturn_tokens\x18\x03 \x01(\x08\"E\n\x17\x42\x61tchedTokenizeResponse\x12*\n\tresponses\x18\x01 \x03(\x0b\x32\x17.fmaas.TokenizeResponse\"\x1f\n\x0fTokenizeRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\"7\n\x10TokenizeResponse\x12\x13\n\x0btoken_count\x18\x01 \x01(\r\x12\x0e\n\x06tokens\x18\x02 \x03(\t\"$\n\x10ModelInfoRequest\x12\x10\n\x08model_id\x18\x01 \x01(\t\"\xb4\x01\n\x11ModelInfoResponse\x12\x36\n\nmodel_kind\x18\x01 \x01(\x0e\x32\".fmaas.ModelInfoResponse.ModelKind\x12\x1b\n\x13max_sequence_length\x18\x02 \x01(\r\x12\x16\n\x0emax_new_tokens\x18\x03 \x01(\r\"2\n\tModelKind\x12\x10\n\x0c\x44\x45\x43ODER_ONLY\x10\x00\x12\x13\n\x0f\x45NCODER_DECODER\x10\x01*(\n\x0e\x44\x65\x63odingMethod\x12\n\n\x06GREEDY\x10\x00\x12\n\n\x06SAMPLE\x10\x01*\x8b\x01\n\nStopReason\x12\x10\n\x0cNOT_FINISHED\x10\x00\x12\x0e\n\nMAX_TOKENS\x10\x01\x12\r\n\tEOS_TOKEN\x10\x02\x12\r\n\tCANCELLED\x10\x03\x12\x0e\n\nTIME_LIMIT\x10\x04\x12\x11\n\rSTOP_SEQUENCE\x10\x05\x12\x0f\n\x0bTOKEN_LIMIT\x10\x06\x12\t\n\x05\x45RROR\x10\x07\x32\xc4\x02\n\x11GenerationService\x12O\n\x08Generate\x12\x1f.fmaas.BatchedGenerationRequest\x1a .fmaas.BatchedGenerationResponse\"\x00\x12O\n\x0eGenerateStream\x12\x1e.fmaas.SingleGenerationRequest\x1a\x19.fmaas.GenerationResponse\"\x00\x30\x01\x12K\n\x08Tokenize\x12\x1d.fmaas.BatchedTokenizeRequest\x1a\x1e.fmaas.BatchedTokenizeResponse\"\x00\x12@\n\tModelInfo\x12\x17.fmaas.ModelInfoRequest\x1a\x18.fmaas.ModelInfoResponse\"\x00\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'generation_pb2', _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _globals['_DECODINGMETHOD']._serialized_start=2266 + _globals['_DECODINGMETHOD']._serialized_end=2306 + _globals['_STOPREASON']._serialized_start=2309 + _globals['_STOPREASON']._serialized_end=2448 + _globals['_BATCHEDGENERATIONREQUEST']._serialized_start=28 + _globals['_BATCHEDGENERATIONREQUEST']._serialized_end=189 + _globals['_SINGLEGENERATIONREQUEST']._serialized_start=192 + _globals['_SINGLEGENERATIONREQUEST']._serialized_end=351 + _globals['_BATCHEDGENERATIONRESPONSE']._serialized_start=353 + _globals['_BATCHEDGENERATIONRESPONSE']._serialized_end=426 + _globals['_GENERATIONREQUEST']._serialized_start=428 + _globals['_GENERATIONREQUEST']._serialized_end=461 + _globals['_GENERATIONRESPONSE']._serialized_start=464 + _globals['_GENERATIONRESPONSE']._serialized_end=707 + _globals['_PARAMETERS']._serialized_start=710 + _globals['_PARAMETERS']._serialized_end=967 + _globals['_DECODINGPARAMETERS']._serialized_start=970 + _globals['_DECODINGPARAMETERS']._serialized_end=1167 + _globals['_DECODINGPARAMETERS_LENGTHPENALTY']._serialized_start=1090 + _globals['_DECODINGPARAMETERS_LENGTHPENALTY']._serialized_end=1148 + _globals['_SAMPLINGPARAMETERS']._serialized_start=1169 + _globals['_SAMPLINGPARAMETERS']._serialized_end=1287 + _globals['_STOPPINGCRITERIA']._serialized_start=1290 + _globals['_STOPPINGCRITERIA']._serialized_end=1469 + _globals['_RESPONSEOPTIONS']._serialized_start=1472 + _globals['_RESPONSEOPTIONS']._serialized_end=1624 + _globals['_TOKENINFO']._serialized_start=1627 + _globals['_TOKENINFO']._serialized_end=1773 + 
_globals['_TOKENINFO_TOPTOKEN']._serialized_start=1732 + _globals['_TOKENINFO_TOPTOKEN']._serialized_end=1773 + _globals['_BATCHEDTOKENIZEREQUEST']._serialized_start=1775 + _globals['_BATCHEDTOKENIZEREQUEST']._serialized_end=1882 + _globals['_BATCHEDTOKENIZERESPONSE']._serialized_start=1884 + _globals['_BATCHEDTOKENIZERESPONSE']._serialized_end=1953 + _globals['_TOKENIZEREQUEST']._serialized_start=1955 + _globals['_TOKENIZEREQUEST']._serialized_end=1986 + _globals['_TOKENIZERESPONSE']._serialized_start=1988 + _globals['_TOKENIZERESPONSE']._serialized_end=2043 + _globals['_MODELINFOREQUEST']._serialized_start=2045 + _globals['_MODELINFOREQUEST']._serialized_end=2081 + _globals['_MODELINFORESPONSE']._serialized_start=2084 + _globals['_MODELINFORESPONSE']._serialized_end=2264 + _globals['_MODELINFORESPONSE_MODELKIND']._serialized_start=2214 + _globals['_MODELINFORESPONSE_MODELKIND']._serialized_end=2264 + _globals['_GENERATIONSERVICE']._serialized_start=2451 + _globals['_GENERATIONSERVICE']._serialized_end=2775 +# @@protoc_insertion_point(module_scope) diff --git a/language/gpt-j/generation_pb2_grpc.py b/language/gpt-j/generation_pb2_grpc.py new file mode 100644 index 000000000..9ab1e4eb5 --- /dev/null +++ b/language/gpt-j/generation_pb2_grpc.py @@ -0,0 +1,169 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +import generation_pb2 as generation__pb2 + + +class GenerationServiceStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.Generate = channel.unary_unary( + '/fmaas.GenerationService/Generate', + request_serializer=generation__pb2.BatchedGenerationRequest.SerializeToString, + response_deserializer=generation__pb2.BatchedGenerationResponse.FromString, + ) + self.GenerateStream = channel.unary_stream( + '/fmaas.GenerationService/GenerateStream', + request_serializer=generation__pb2.SingleGenerationRequest.SerializeToString, + response_deserializer=generation__pb2.GenerationResponse.FromString, + ) + self.Tokenize = channel.unary_unary( + '/fmaas.GenerationService/Tokenize', + request_serializer=generation__pb2.BatchedTokenizeRequest.SerializeToString, + response_deserializer=generation__pb2.BatchedTokenizeResponse.FromString, + ) + self.ModelInfo = channel.unary_unary( + '/fmaas.GenerationService/ModelInfo', + request_serializer=generation__pb2.ModelInfoRequest.SerializeToString, + response_deserializer=generation__pb2.ModelInfoResponse.FromString, + ) + + +class GenerationServiceServicer(object): + """Missing associated documentation comment in .proto file.""" + + def Generate(self, request, context): + """Generates text given a text prompt, for one or more inputs + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def GenerateStream(self, request, context): + """Generates text given a single input prompt, streaming the response + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Tokenize(self, request, context): + """Tokenize text + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def ModelInfo(self, request, 
context): + """Model info + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_GenerationServiceServicer_to_server(servicer, server): + rpc_method_handlers = { + 'Generate': grpc.unary_unary_rpc_method_handler( + servicer.Generate, + request_deserializer=generation__pb2.BatchedGenerationRequest.FromString, + response_serializer=generation__pb2.BatchedGenerationResponse.SerializeToString, + ), + 'GenerateStream': grpc.unary_stream_rpc_method_handler( + servicer.GenerateStream, + request_deserializer=generation__pb2.SingleGenerationRequest.FromString, + response_serializer=generation__pb2.GenerationResponse.SerializeToString, + ), + 'Tokenize': grpc.unary_unary_rpc_method_handler( + servicer.Tokenize, + request_deserializer=generation__pb2.BatchedTokenizeRequest.FromString, + response_serializer=generation__pb2.BatchedTokenizeResponse.SerializeToString, + ), + 'ModelInfo': grpc.unary_unary_rpc_method_handler( + servicer.ModelInfo, + request_deserializer=generation__pb2.ModelInfoRequest.FromString, + response_serializer=generation__pb2.ModelInfoResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'fmaas.GenerationService', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + + # This class is part of an EXPERIMENTAL API. +class GenerationService(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def Generate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/fmaas.GenerationService/Generate', + generation__pb2.BatchedGenerationRequest.SerializeToString, + generation__pb2.BatchedGenerationResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def GenerateStream(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_stream(request, target, '/fmaas.GenerationService/GenerateStream', + generation__pb2.SingleGenerationRequest.SerializeToString, + generation__pb2.GenerationResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def Tokenize(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/fmaas.GenerationService/Tokenize', + generation__pb2.BatchedTokenizeRequest.SerializeToString, + generation__pb2.BatchedTokenizeResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def ModelInfo(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/fmaas.GenerationService/ModelInfo', + generation__pb2.ModelInfoRequest.SerializeToString, + generation__pb2.ModelInfoResponse.FromString, + options, 
channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/language/gpt-j/inference.py b/language/gpt-j/inference.py new file mode 100644 index 000000000..a643ea0f2 --- /dev/null +++ b/language/gpt-j/inference.py @@ -0,0 +1,210 @@ +from typing import Optional, Union + +import grpc +import generation_pb2_grpc +import socket +import ssl +import sys + + +def get_server_certificate(host: str, port: int) -> str: + """connect to host:port and get the certificate it presents + + This is almost the same as `ssl.get_server_certificate`, but + when opening the TLS socket, `server_hostname` is also provided. + + This retrieves the correct certificate for hosts using name-based + virtual hosting. + """ + if sys.version_info >= (3, 10): + # ssl.get_server_certificate supports TLS SNI only above 3.10 + # https://github.com/python/cpython/pull/16820 + return ssl.get_server_certificate((host, port)) + + context = ssl.SSLContext() + + with socket.create_connection((host, port)) as sock, context.wrap_socket( + sock, server_hostname=host + ) as ssock: + cert_der = ssock.getpeercert(binary_form=True) + + assert cert_der + return ssl.DER_cert_to_PEM_cert(cert_der) + + +class GrpcClient: + def __init__( + self, + host: str, + port: int, + *, + insecure: bool = False, + verify: Optional[bool] = None, + ca_cert: Union[None, bytes, str] = None, + client_cert: Union[None, bytes, str] = None, + client_key: Union[None, bytes, str] = None, + ) -> None: + self._channel = self._make_channel( + host, + port, + insecure=insecure, + verify=verify, + client_key=client_key, + client_cert=client_cert, + ca_cert=ca_cert, + ) + self.generation_service_stub = generation_pb2_grpc.GenerationServiceStub( + self._channel + ) + + def make_request(self, texts: str, model_id: str = "flan-t5-small"): + request = generation_pb2_grpc.generation__pb2.BatchedGenerationRequest( + model_id=model_id, + requests=[generation_pb2_grpc.generation__pb2.GenerationRequest(text=text) for text in texts], + params=generation_pb2_grpc.generation__pb2.Parameters( + method=generation_pb2_grpc.generation__pb2.SAMPLE, + stopping=generation_pb2_grpc.generation__pb2.StoppingCriteria( + max_new_tokens=128, + min_new_tokens=30, + ), + sampling=generation_pb2_grpc.generation__pb2.SamplingParameters( + top_k=4 + ) + ) + ) + result = self.generation_service_stub.Generate(request=request) + return result + + def make_request_stream(self, text: str, model_id: str = "flan-t5-small"): + request = generation_pb2_grpc.generation__pb2.SingleGenerationRequest( + model_id=model_id, + request=generation_pb2_grpc.generation__pb2.GenerationRequest(text=text), + params=generation_pb2_grpc.generation__pb2.Parameters( + method=generation_pb2_grpc.generation__pb2.SAMPLE, + stopping=generation_pb2_grpc.generation__pb2.StoppingCriteria( + max_new_tokens=128, + min_new_tokens=30, + ), + sampling=generation_pb2_grpc.generation__pb2.SamplingParameters( + top_k=4 + ), + response=generation_pb2_grpc.generation__pb2.ResponseOptions( + generated_tokens=True + ) + ) + ) + result = self.generation_service_stub.GenerateStream(request=request) + return result + + def __enter__(self): + return self + + def __exit__(self, *exc_info): + self._close() + return False + + def _close(self): + try: + if hasattr(self, "_channel") and self._channel: + self._channel.close() + except Exception as exc: + print(f"Unexpected exception while closing client: {exc}") + + def __del__(self): + self._close() + + def _make_channel( + self, + host: str, + port: 
int, + *, + insecure: bool = False, + verify: Optional[bool] = None, + ca_cert: Union[None, bytes, str] = None, + client_key: Union[None, bytes, str] = None, + client_cert: Union[None, bytes, str] = None, + ) -> grpc.Channel: + """Creates a grpc channel + + Args: + - host: str + - port: str + - (optional) insecure: use a plaintext connection (default=False) + - (optional) verify: set to False to disable remote host certificate(s) + verification. Cannot be used with `plaintext` or with MTLS + - (optional) ca_cert: certificate authority to use + - (optional) client_key: client key for mTLS mode + - (optional) client_cert: client cert for mTLS mode + + """ + if not host.strip(): + raise ValueError("A non empty host name is required") + if int(port) <= 0: + raise ValueError("A non zero port number is required") + if insecure and any( + (val is not None) for val in (ca_cert, client_key, client_cert) + ): + raise ValueError("cannot use insecure with TLS/mTLS certificates") + if insecure and verify: + raise ValueError("insecure cannot be used with verify") + + client_key_bytes = self._try_load_certificate(client_key) + client_cert_bytes = self._try_load_certificate(client_cert) + ca_cert_bytes = self._try_load_certificate(ca_cert) + + connection = f"{host}:{port}" + if insecure: + print("Connecting over an insecure plaintext grpc channel") + return grpc.insecure_channel(connection) + + credentials_kwargs: dict[str, bytes] = {} + if ca_cert_bytes and not (any((client_cert_bytes, client_key_bytes))): + print("Connecting using provided CA certificate for secure channel") + credentials_kwargs.update(root_certificates=ca_cert_bytes) + elif client_cert_bytes and client_key_bytes and ca_cert_bytes: + print("Connecting using mTLS for secure channel") + credentials_kwargs.update( + root_certificates=ca_cert_bytes, + private_key=client_key_bytes, + certificate_chain=client_cert_bytes, + ) + elif verify is False: + cert = get_server_certificate(host, port).encode() + credentials_kwargs.update(root_certificates=cert) + + return grpc.secure_channel( + connection, grpc.ssl_channel_credentials(**credentials_kwargs) + ) + + @staticmethod + def _try_load_certificate(certificate: Union[None, bytes, str]) -> Optional[bytes]: + """If the certificate points to a file, return the contents (plaintext reads). 
+ Else return the bytes""" + if not certificate: + return None + + if isinstance(certificate, bytes): + return certificate + + if isinstance(certificate, str): + with open(certificate, "rb") as secret_file: + return secret_file.read() + raise ValueError( + f"{certificate=} should be a path to a certificate files or bytes" + ) + + +if __name__ == "__main__": + # host = "flan-t5-small-predictor-caikit-testing.apps.aisrhods-wx.8goc.p1.openshiftapps.com" + # port = 443 + host = "localhost" + port = 8033 + print(f"connecting to {host=}:{port}") + client = GrpcClient( + host, + port, + insecure=True, + # verify=False, + ) + + client.make_request("this is the query text", model_id="flan-t5-small") diff --git a/language/gpt-j/main.py b/language/gpt-j/main.py index 367af5212..edb8ec84e 100644 --- a/language/gpt-j/main.py +++ b/language/gpt-j/main.py @@ -2,91 +2,101 @@ import mlperf_loadgen as lg import argparse import os - +import logging import sys -from backend import get_SUT +from SUT import SUT, SUTServer + sys.path.insert(0, os.getcwd()) +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("GPT-J-MAIN") def get_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--backend", choices=["pytorch"], default="pytorch", help="Backend") - parser.add_argument("--scenario", choices=["SingleStream", "Offline", - "Server"], default="Offline", help="Scenario") - parser.add_argument("--model-path", default="EleutherAI/gpt-j-6B", help="") - parser.add_argument( - "--dataset-path", default="./data/cnn_eval.json", help="") - parser.add_argument("--accuracy", action="store_true", - help="enable accuracy pass") - parser.add_argument("--dtype", default="float32", help="data type of the model, choose from float16, bfloat16 and float32") - parser.add_argument("--quantized", action="store_true", - help="use quantized model (only valid for onnxruntime backend)") - parser.add_argument("--profile", action="store_true", - help="enable profiling (only valid for onnxruntime backend)") - parser.add_argument("--gpu", action="store_true", - help="use GPU instead of CPU for the inference") - parser.add_argument("--audit_conf", default="audit.conf", - help="audit config for LoadGen settings during compliance runs") - parser.add_argument( - "--mlperf_conf", default="mlperf.conf", help="mlperf rules config") - parser.add_argument("--user_conf", default="user.conf", - help="user config for user LoadGen settings such as target QPS") - parser.add_argument("--max_examples", type=int, default=13368, - help="Maximum number of examples to consider (not limited by default)") + parser.add_argument("--scenario", type=str, choices=["Offline", "Server"], default="Offline", help="Scenario") + parser.add_argument("--model-path", type=str, default="EleutherAI/gpt-j-6B", help="Model name") + parser.add_argument("--dataset-path", type=str, default=None, help="") + parser.add_argument("--api-server", type=str, default=None, help="Specify an api endpoint call to use api mode") + parser.add_argument("--api-model-name", type=str, default=None, help="Specify a model name to use api mode") + parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") + parser.add_argument("--grpc", action="store_true", help="Enable grpc for api endpoint") + parser.add_argument("--batch-grpc", action="store_true", help="Enable batch requests for grpc") + parser.add_argument("--dtype", type=str, default="float32", help="data type of the model, choose from float16, bfloat16 and float32") + parser.add_argument("--device", 
type=str, choices=["cpu", "cuda:0"], default="cpu", help="device to use") + parser.add_argument("--audit-conf", type=str, default="audit.conf", help="audit config for LoadGen settings during compliance runs") + parser.add_argument("--mlperf-conf", type=str, default="mlperf.conf", help="mlperf rules config") + parser.add_argument("--user-conf", type=str, default="user.conf", help="user config for user LoadGen settings such as target QPS") + parser.add_argument("--total-sample-count", type=int, default=13368, help="Number of samples to use in benchmark.") # TODO: This interpretation of 'total-sample-count' is a little misleading. Fix it + parser.add_argument("--output-log-dir", type=str, default="output-logs", help="Where logs are saved") + parser.add_argument("--enable-log-trace", action="store_true", help="Enable log tracing. This file can become quite large") + parser.add_argument("--num-workers", type=int, default=1, help="Number of workers to process queries") + args = parser.parse_args() return args scenario_map = { - "SingleStream": lg.TestScenario.SingleStream, - "Offline": lg.TestScenario.Offline, - "Server": lg.TestScenario.Server, - "MultiStream": lg.TestScenario.MultiStream -} + "offline": lg.TestScenario.Offline, + "server": lg.TestScenario.Server, + } +sut_map = { + "offline": SUT, + "server": SUTServer + } def main(): args = get_args() - sut = get_SUT( - model_path=args.model_path, - scenario=args.scenario, - dtype=args.dtype, - dataset_path=args.dataset_path, - max_examples=args.max_examples, - use_gpu=args.gpu, - ) - settings = lg.TestSettings() - settings.scenario = scenario_map[args.scenario] + settings.scenario = scenario_map[args.scenario.lower()] # Need to update the conf settings.FromConfig(args.mlperf_conf, "gptj", args.scenario) settings.FromConfig(args.user_conf, "gptj", args.scenario) if args.accuracy: settings.mode = lg.TestMode.AccuracyOnly + log.warning("Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet") else: settings.mode = lg.TestMode.PerformanceOnly - log_path = os.environ.get("LOG_PATH") - if not log_path: - log_path = "build/logs" - if not os.path.exists(log_path): - os.makedirs(log_path) + + os.makedirs(args.output_log_dir, exist_ok=True) log_output_settings = lg.LogOutputSettings() - log_output_settings.outdir = log_path + log_output_settings.outdir = args.output_log_dir log_output_settings.copy_summary_to_stdout = True log_settings = lg.LogSettings() log_settings.log_output = log_output_settings - log_settings.enable_trace = True + log_settings.enable_trace = args.enable_log_trace + + sut_cls = sut_map[args.scenario.lower()] + + sut = sut_cls( + model_path=args.model_path, + api_server=args.api_server, + api_model_name=args.api_model_name, + grpc=args.grpc, + batch_grpc=args.batch_grpc, + dtype=args.dtype, + dataset_path=args.dataset_path, + total_sample_count=args.total_sample_count, + device=args.device, + ) + + # Start sut before loadgen starts + sut.start() + lgSUT = lg.ConstructSUT(sut.issue_queries, sut.flush_queries) + log.info("Starting Benchmark run") + lg.StartTestWithLogSettings(lgSUT, sut.qsl, settings, log_settings, args.audit_conf) + + # Stop sut after completion + sut.stop() - lg.StartTestWithLogSettings(sut.sut, sut.qsl, settings, log_settings, args.audit_conf) - print("Test Done!") + log.info("Run Completed!") - print("Destroying SUT...") - lg.DestroySUT(sut.sut) + log.info("Destroying SUT...") + lg.DestroySUT(lgSUT) - print("Destroying QSL...") + log.info("Destroying QSL...") 
lg.DestroyQSL(sut.qsl) diff --git a/language/gpt-j/old_main.py b/language/gpt-j/old_main.py new file mode 100644 index 000000000..367af5212 --- /dev/null +++ b/language/gpt-j/old_main.py @@ -0,0 +1,94 @@ +import subprocess +import mlperf_loadgen as lg +import argparse +import os + +import sys +from backend import get_SUT +sys.path.insert(0, os.getcwd()) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--backend", choices=["pytorch"], default="pytorch", help="Backend") + parser.add_argument("--scenario", choices=["SingleStream", "Offline", + "Server"], default="Offline", help="Scenario") + parser.add_argument("--model-path", default="EleutherAI/gpt-j-6B", help="") + parser.add_argument( + "--dataset-path", default="./data/cnn_eval.json", help="") + parser.add_argument("--accuracy", action="store_true", + help="enable accuracy pass") + parser.add_argument("--dtype", default="float32", help="data type of the model, choose from float16, bfloat16 and float32") + parser.add_argument("--quantized", action="store_true", + help="use quantized model (only valid for onnxruntime backend)") + parser.add_argument("--profile", action="store_true", + help="enable profiling (only valid for onnxruntime backend)") + parser.add_argument("--gpu", action="store_true", + help="use GPU instead of CPU for the inference") + parser.add_argument("--audit_conf", default="audit.conf", + help="audit config for LoadGen settings during compliance runs") + parser.add_argument( + "--mlperf_conf", default="mlperf.conf", help="mlperf rules config") + parser.add_argument("--user_conf", default="user.conf", + help="user config for user LoadGen settings such as target QPS") + parser.add_argument("--max_examples", type=int, default=13368, + help="Maximum number of examples to consider (not limited by default)") + args = parser.parse_args() + return args + + +scenario_map = { + "SingleStream": lg.TestScenario.SingleStream, + "Offline": lg.TestScenario.Offline, + "Server": lg.TestScenario.Server, + "MultiStream": lg.TestScenario.MultiStream +} + + +def main(): + args = get_args() + + sut = get_SUT( + model_path=args.model_path, + scenario=args.scenario, + dtype=args.dtype, + dataset_path=args.dataset_path, + max_examples=args.max_examples, + use_gpu=args.gpu, + ) + + settings = lg.TestSettings() + settings.scenario = scenario_map[args.scenario] + # Need to update the conf + settings.FromConfig(args.mlperf_conf, "gptj", args.scenario) + settings.FromConfig(args.user_conf, "gptj", args.scenario) + + if args.accuracy: + settings.mode = lg.TestMode.AccuracyOnly + else: + settings.mode = lg.TestMode.PerformanceOnly + log_path = os.environ.get("LOG_PATH") + if not log_path: + log_path = "build/logs" + if not os.path.exists(log_path): + os.makedirs(log_path) + log_output_settings = lg.LogOutputSettings() + log_output_settings.outdir = log_path + log_output_settings.copy_summary_to_stdout = True + log_settings = lg.LogSettings() + log_settings.log_output = log_output_settings + log_settings.enable_trace = True + + lg.StartTestWithLogSettings(sut.sut, sut.qsl, settings, log_settings, args.audit_conf) + print("Test Done!") + + print("Destroying SUT...") + lg.DestroySUT(sut.sut) + + print("Destroying QSL...") + lg.DestroyQSL(sut.qsl) + + +if __name__ == "__main__": + main() From 081024f6b4e0446c889b45b564821cbda92afba6 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 15 Feb 2024 16:12:28 -0500 Subject: [PATCH 57/75] Offline functional, now testing server --- language/gpt-j/SUT.py | 3 ++- 
language/gpt-j/api-endpoint-artifacts/benchmark.yaml | 2 +- language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/language/gpt-j/SUT.py b/language/gpt-j/SUT.py index 331a5e495..c1c6a7faa 100644 --- a/language/gpt-j/SUT.py +++ b/language/gpt-j/SUT.py @@ -310,6 +310,7 @@ def load_model(self): if not self.api_model_name: sys.exit("API Server was specified but no model name was provided") else: + sys.exit("ONLY API SERVER MODE SUPPORTED FOR GPT-J") self.model = AutoModelForCausalLM.from_pretrained( self.model_path, device_map="auto", @@ -461,7 +462,7 @@ def stream_api_grpc(self, input, response_ids): return token_cache def async_process_query(self, input_ids_tensor, qitem_id): - decoded = self.tokenizer.decode(input_ids_tensor[0]) + decoded = [input_ids_tensor] response_ids = [qitem_id] if self.grpc: output_tokens = self.stream_api_grpc(decoded, response_ids) diff --git a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml index 322543537..2ba7d6824 100644 --- a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml +++ b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference-gpt:offline + image: quay.io/meyceoz/mlperf-inference-gpt:server-test resources: requests: memory: 20000Mi diff --git a/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml b/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml index d3e1e99d0..e3f505604 100644 --- a/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml +++ b/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml @@ -24,9 +24,9 @@ spec: # value: float16 # Dynamic batch size changes - name: MAX_BATCH_SIZE - value: "4" + value: "64" - name: MAX_CONCURRENT_REQUESTS - value: "50" + value: "128" - name: MAX_BATCH_WEIGHT value: "10000" - name: MAX_SEQUENCE_LENGTH From 43afdea915218716ed277d9661acbe559572a110 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 15 Feb 2024 16:54:24 -0500 Subject: [PATCH 58/75] v1 full implementation --- language/gpt-j/SUT.py | 6 +++--- language/gpt-j/api-endpoint-artifacts/README.md | 11 +++++------ language/gpt-j/api-endpoint-artifacts/benchmark.yaml | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/language/gpt-j/SUT.py b/language/gpt-j/SUT.py index c1c6a7faa..99e80653b 100644 --- a/language/gpt-j/SUT.py +++ b/language/gpt-j/SUT.py @@ -111,7 +111,7 @@ def __init__(self, if not batch_size: if device == "cpu": - batch_size = 2 + batch_size = 128 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. 
self.batch_size = batch_size @@ -462,7 +462,7 @@ def stream_api_grpc(self, input, response_ids): return token_cache def async_process_query(self, input_ids_tensor, qitem_id): - decoded = [input_ids_tensor] + decoded = input_ids_tensor response_ids = [qitem_id] if self.grpc: output_tokens = self.stream_api_grpc(decoded, response_ids) @@ -486,7 +486,7 @@ def process_queries(self): break input_ids_tensor = self.data_object.source_encoded_input_ids[qitem.index] - input_masks_tensor = self.data_object.source_encoded_attn_masks[qitem.index] + input_masks_tensor = []#self.data_object.source_encoded_attn_masks[qitem.index] if self.api_server: threading.Thread(target=self.async_process_query, args=(input_ids_tensor, qitem.id)).start() diff --git a/language/gpt-j/api-endpoint-artifacts/README.md b/language/gpt-j/api-endpoint-artifacts/README.md index 956f9fd2e..8f3a87abe 100644 --- a/language/gpt-j/api-endpoint-artifacts/README.md +++ b/language/gpt-j/api-endpoint-artifacts/README.md @@ -1,11 +1,10 @@ -# Using OpenShift AI Model Serving (Caikit/TGIS) with MLPerf Inference +# Using OpenShift AI Model Serving (TGIS) with MLPerf Inference Prerequisites: - Install the OpenShift AI model serving stack - Add your AWS credentials to `secret.yaml` access the model files - Apply `secret.yaml`, `sa.yaml` - - FOR CAIKIT: Apply `serving-runtime.yaml`, then finally `model.yaml` - - FOR TGIS STANDALONE: Apply `serving-tgis.yaml`, then finally `model-tgis.yaml` + - FOR TGIS STANDALONE: Apply `serving-tgis.yaml`, then finally `model.yaml` - Create a benchmark pod using `benchmark.yaml` In the pod, before any benchmark, first run `cd inference/language/gpt-j` @@ -13,7 +12,7 @@ In the pod, before any benchmark, first run `cd inference/language/gpt-j` ## STANDALONE TGIS INSTRUCTIONS For the full accuracy benchmark (offline), run in the pod: ``` -python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server https://gpt-j-isvc-predictor-gpt-service.apps.gdr-perf.perf.eng.bos2.dc.redhat.com --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --accuracy --grpc --batch-grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --accuracy --grpc --batch-grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log ``` You can then run the same evaluation/consolidation scripts as the regular benchmark Example API host: `https://gpt-j-isvc-predictor-gpt-service.apps.gdr-perf.perf.eng.bos2.dc.redhat.com` @@ -21,7 +20,7 @@ Example API host: `https://gpt-j-isvc-predictor-gpt-service.apps.gdr-perf.perf.e For the performance benchmark (offline), run in the pod: ``` -python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server https://gpt-j-isvc-predictor-gpt-service.apps.gdr-perf.perf.eng.bos2.dc.redhat.com --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --grpc --batch-grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --grpc --batch-grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs 
--dtype float32 --device cpu 2>&1 | tee offline_performance_log.log ``` (It is the same, just with `--accuracy` removed) @@ -29,7 +28,7 @@ python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-serv TBD For the performance benchmark (server), run in the pod: ``` -python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --grpc --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log +python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log ``` (Configure target qps in `user.conf`) diff --git a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml index 2ba7d6824..f49144b4d 100644 --- a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml +++ b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference-gpt:server-test + image: quay.io/meyceoz/mlperf-inference-gpt:v1 resources: requests: memory: 20000Mi From f8cc5ba8d9829c2b7457e37f1c431ec1169ec2af Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 16 Feb 2024 10:53:25 -0500 Subject: [PATCH 59/75] Update README for gpt-j --- language/gpt-j/api-endpoint-artifacts/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/gpt-j/api-endpoint-artifacts/README.md b/language/gpt-j/api-endpoint-artifacts/README.md index 8f3a87abe..db4f1cf9f 100644 --- a/language/gpt-j/api-endpoint-artifacts/README.md +++ b/language/gpt-j/api-endpoint-artifacts/README.md @@ -15,6 +15,7 @@ For the full accuracy benchmark (offline), run in the pod: python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --accuracy --grpc --batch-grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log ``` You can then run the same evaluation/consolidation scripts as the regular benchmark + Example API host: `https://gpt-j-isvc-predictor-gpt-service.apps.gdr-perf.perf.eng.bos2.dc.redhat.com` @@ -25,7 +26,6 @@ python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-serv (It is the same, just with `--accuracy` removed) -TBD For the performance benchmark (server), run in the pod: ``` python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log From 8ae2bf42a26d744de73473a6497f71ffd0dce046 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 16 Feb 2024 15:00:39 -0500 Subject: [PATCH 60/75] First pass multi-endpoint --- language/llama2-70b/SUT.py | 107 ++++++++++++++++++++++-------------- language/llama2-70b/main.py | 2 + 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 806502349..1add8119f 100644 --- a/language/llama2-70b/SUT.py +++ 
b/language/llama2-70b/SUT.py @@ -22,6 +22,8 @@ import json from inference import GrpcClient +import more_itertools as mit +from itertools import repeat requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) @@ -93,6 +95,7 @@ def __init__(self, model_path=None, api_server=None, api_model_name=None, + additional_servers=[], grpc=False, batch_grpc=False, dtype="bfloat16", @@ -104,7 +107,13 @@ def __init__(self, workers=1): self.model_path = model_path or "meta-llama/Llama-2-70b-chat-hf" - self.api_server = api_server + self.api_servers = [] + if api_server: + self.api_servers.append(api_server) + if additional_servers and not api_server: + sys.exit("Additional servers cannot be used without primary api server") + for server in additional_servers: + self.api_servers.append(server) self.api_model_name = api_model_name self.grpc = grpc self.batch_grpc = batch_grpc @@ -165,7 +174,7 @@ def stop(self): worker.join() - def query_api(self, input): + def query_api(self, input, idx): headers = { 'Content-Type': 'application/json', } @@ -184,7 +193,7 @@ def query_api(self, input): while response_code != 200: try: response = requests.post( - self.api_server, + self.api_servers[idx], headers=headers, json=json_data, verify=False, @@ -195,13 +204,25 @@ def query_api(self, input): return json.loads(response.text)["generated_text"] - def query_api_grpc(self, input): - resp = self.grpc_client.make_request([input], model_id=self.api_model_name) + def query_api_grpc(self, input, idx): + resp = self.grpc_clients[idx].make_request([input], model_id=self.api_model_name) return resp.responses[0].text - def query_api_batch_grpc(self, inputs): - resps = self.grpc_client.make_request(inputs, model_id=self.api_model_name) + def query_api_batch_grpc(self, inputs, idx): + resps = self.grpc_clients[idx].make_request(inputs, model_id=self.api_model_name) return [resp.text for resp in resps.responses] + + def api_action_handler(self, chunk, server_idx): + if self.grpc: + if self.batch_grpc: + output = self.query_api_batch_grpc(chunk, server_idx) + else: + with ThreadPoolExecutor(max_workers=len(chunk)) as executor: + output = list(executor.map(self.query_api_grpc,chunk, repeat(server_idx))) + else: + with ThreadPoolExecutor(max_workers=len(chunk)) as executor: + output = list(executor.map(self.query_api,chunk, repeat(server_idx))) + return output def process_queries(self): """Processor of the queued queries. 
User may choose to add batching logic """ @@ -248,23 +269,20 @@ def process_queries(self): assert input_ids_tensor.shape == input_masks_tensor.shape assert input_ids_tensor.shape[0] <= self.batch_size - if self.api_server: + if self.api_servers: decoded = self.tokenizer.batch_decode(input_ids_tensor) cleaned = [entry.replace('','').replace('','') for entry in decoded] - bs = len(cleaned) + cleaned_chunks = [list(c) for c in mit.divide(len(self.api_servers), cleaned)] tik2 = time.time() - if self.api_server: - if self.grpc: - if self.batch_grpc: - output = self.query_api_batch_grpc(cleaned) - else: - with ThreadPoolExecutor(max_workers=bs) as executor: - output = list(executor.map(self.query_api_grpc,cleaned)) - else: - with ThreadPoolExecutor(max_workers=bs) as executor: - output = list(executor.map(self.query_api,cleaned)) + if self.api_servers: + with ThreadPoolExecutor(max_workers=len(self.api_servers)) as executor: + #needs to be tested + output_chunks = list(executor.map(self.api_action_handler,cleaned_chunks,range(len(self.api_servers)))) + output = [] + for row in output_chunks: + output += row else: pred_output_tokens = self.model.generate( input_ids=input_ids_tensor, @@ -275,7 +293,7 @@ def process_queries(self): tik3 = time.time() - if self.api_server: + if self.api_servers: processed_output = np.array(self.tokenizer(output, padding='longest')['input_ids']) else: processed_output = self.data_object.postProcess(pred_output_tokens, @@ -304,21 +322,24 @@ def process_queries(self): def load_model(self): - if self.api_server: - if self.grpc: - hostname = re.sub("https://|http://", "", self.api_server) - if hostname[-1] == "/": - hostname = hostname[:-1] - self.grpc_client = GrpcClient( - hostname, - 443, - verify=False, - ) - elif not "http" in self.api_server: - self.api_server = "http://" + self.api_server - + if self.api_servers: if not self.api_model_name: sys.exit("API Server was specified but no model name was provided") + self.grpc_clients = [] + for server in self.api_servers: + if self.grpc: + hostname = re.sub("https://|http://", "", server) + if hostname[-1] == "/": + hostname = hostname[:-1] + grpc_client = GrpcClient( + hostname, + 443, + verify=False, + ) + self.grpc_clients.append(grpc_client) + elif not "http" in server: + server = "http://" + server + else: self.model = LlamaForCausalLM.from_pretrained( self.model_path, @@ -417,7 +438,7 @@ def process_first_tokens(self): response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])] lg.FirstTokenComplete(response) - def stream_api(self, input, response_ids): + def stream_api(self, input, response_ids, idx): headers = { 'Content-Type': 'application/json', } @@ -436,7 +457,7 @@ def stream_api(self, input, response_ids): s = requests.Session() first = True with s.post( - self.api_server, + self.api_servers[idx], headers=headers, json=json_data, verify=False, @@ -456,10 +477,10 @@ def stream_api(self, input, response_ids): token_cache.append(token) return token_cache - def stream_api_grpc(self, input, response_ids): + def stream_api_grpc(self, input, response_ids, idx): token_cache = [] first = True - resps = self.grpc_client.make_request_stream(input, model_id=self.api_model_name) + resps = self.grpc_clients[idx].make_request_stream(input, model_id=self.api_model_name) for resp in resps: if resp.tokens: token = self.llama_vocab[resp.tokens[0].text] @@ -470,13 +491,13 @@ def stream_api_grpc(self, input, response_ids): token_cache.append(token) return token_cache - def async_process_query(self, input_ids_tensor, qitem_id): 
+ def async_process_query(self, input_ids_tensor, qitem_id, idx): decoded = self.tokenizer.decode(input_ids_tensor[0]) response_ids = [qitem_id] if self.grpc: - output_tokens = self.stream_api_grpc(decoded, response_ids) + output_tokens = self.stream_api_grpc(decoded, response_ids, idx) else: - output_tokens = self.stream_api(decoded, response_ids) + output_tokens = self.stream_api(decoded, response_ids, idx) n_tokens = len(output_tokens) response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) @@ -488,6 +509,7 @@ def async_process_query(self, input_ids_tensor, qitem_id): def process_queries(self): """Processor of the queued queries. User may choose to add batching logic """ + server_idx = 0 while True: qitem = self.query_queue.get() @@ -497,8 +519,9 @@ def process_queries(self): input_ids_tensor = self.data_object.input_ids[qitem.index] input_masks_tensor = self.data_object.attention_masks[qitem.index] - if self.api_server: - threading.Thread(target=self.async_process_query, args=(input_ids_tensor, qitem.id)).start() + if self.api_servers: + threading.Thread(target=self.async_process_query, args=(input_ids_tensor, qitem.id, server_idx)).start() + server_idx = (server_idx + 1) % len(self.api_servers) else: #TODO: This PoC is super slow with significant overhead. Best to create a patch to `generate` tokens_cache = [] diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py index 6ccddd0e7..fc05cf882 100644 --- a/language/llama2-70b/main.py +++ b/language/llama2-70b/main.py @@ -18,6 +18,7 @@ def get_args(): parser.add_argument("--dataset-path", type=str, default=None, help="") parser.add_argument("--api-server", type=str, default=None, help="Specify an api endpoint call to use api mode") parser.add_argument("--api-model-name", type=str, default=None, help="Specify a model name to use api mode") + parser.add_argument("--additional-servers", type=list, default=[], help="Specify additional endpoints for load splitting") parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") parser.add_argument("--grpc", action="store_true", help="Enable grpc for api endpoint") parser.add_argument("--batch-grpc", action="store_true", help="Enable batch requests for grpc") @@ -74,6 +75,7 @@ def main(): model_path=args.model_path, api_server=args.api_server, api_model_name=args.api_model_name, + secondary_server=args.additional_servers, grpc=args.grpc, batch_grpc=args.batch_grpc, dtype=args.dtype, From 0ad354ed1766cf2b9bf6c447532008875074752d Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Mon, 19 Feb 2024 12:44:57 -0500 Subject: [PATCH 61/75] Full multiple endpoint support --- language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml | 4 ++-- language/gpt-j/dataset.py | 3 ++- language/llama2-70b/SUT.py | 4 ++-- language/llama2-70b/api-endpoint-artifacts/README.md | 2 ++ language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- language/llama2-70b/main.py | 4 ++-- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml b/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml index e3f505604..43719baf2 100644 --- a/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml +++ b/language/gpt-j/api-endpoint-artifacts/serving-tgis.yaml @@ -20,8 +20,8 @@ spec: value: /tmp/transformers_cache - name: NUM_GPUS value: "1" - #- name: DTYPE_STR - # value: float16 + - name: DTYPE_STR + value: float16 # Dynamic batch size changes - name: MAX_BATCH_SIZE value: "64" diff --git 
a/language/gpt-j/dataset.py b/language/gpt-j/dataset.py index 1ec37ae1d..1f0aec786 100644 --- a/language/gpt-j/dataset.py +++ b/language/gpt-j/dataset.py @@ -69,8 +69,9 @@ def encode_samples(self): # padding=True, truncation=True, # max_length=1919) tok = self.tokenizer(self.sources[i])["input_ids"] - if len(tok) > 1920: + while len(tok) > 1920: self.sources[i] = self.sources[i][:-16 - (len(tok) - 1920)*4] + self.sources[i][-16:] + tok = self.tokenizer(self.sources[i])["input_ids"] source_encoded_input_ids.append(self.sources[i]) #source_encoded_attn_masks.append(source_encoded.attention_mask) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 1add8119f..fcc302f53 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -398,9 +398,9 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + def __init__(self, model_path=None, api_server=None, additional_servers=[], api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): - super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) + super().__init__(model_path=model_path, api_server=api_server, additional_servers=additional_servers, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) with open(f"{self.model_path}/tokenizer.json", 'r') as token_file: llama_tokenizer = json.load(token_file) diff --git a/language/llama2-70b/api-endpoint-artifacts/README.md b/language/llama2-70b/api-endpoint-artifacts/README.md index 30a5cd889..24b392d33 100644 --- a/language/llama2-70b/api-endpoint-artifacts/README.md +++ b/language/llama2-70b/api-endpoint-artifacts/README.md @@ -25,6 +25,8 @@ python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-serv ``` (It is the same, just with `--accuracy` removed) + - For multiple endpoints, add `--additional-servers ...` + For the performance benchmark (server), run in the pod: ``` diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 71d8e912d..df258ae55 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v8 + image: quay.io/meyceoz/mlperf-inference:v9-multi resources: requests: memory: 20000Mi diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py index fc05cf882..b526d805c 100644 --- a/language/llama2-70b/main.py +++ b/language/llama2-70b/main.py @@ -18,7 +18,7 @@ def get_args(): parser.add_argument("--dataset-path", type=str, default=None, help="") parser.add_argument("--api-server", type=str, default=None, help="Specify an api endpoint call to use api mode") parser.add_argument("--api-model-name", type=str, default=None, help="Specify a model name to use api mode") - parser.add_argument("--additional-servers", type=list, default=[], help="Specify additional endpoints for load splitting") + parser.add_argument("--additional-servers", nargs='+', 
default=[], help="Specify additional endpoints for load splitting") parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") parser.add_argument("--grpc", action="store_true", help="Enable grpc for api endpoint") parser.add_argument("--batch-grpc", action="store_true", help="Enable batch requests for grpc") @@ -75,7 +75,7 @@ def main(): model_path=args.model_path, api_server=args.api_server, api_model_name=args.api_model_name, - secondary_server=args.additional_servers, + additional_servers=args.additional_servers, grpc=args.grpc, batch_grpc=args.batch_grpc, dtype=args.dtype, From 6b117e050d37c115db8916008e0606a1c80a8c47 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 21 Feb 2024 16:16:48 -0500 Subject: [PATCH 62/75] Random gpt-j vllm experimental bits --- language/gpt-j/SUT.py | 21 + language/gpt-j/SUT_old.py | 557 ++++++++++++++++++ .../api-endpoint-artifacts/benchmark.yaml | 2 +- .../api-endpoint-artifacts/model-vllm.yaml | 20 + .../api-endpoint-artifacts/serving-vllm.yaml | 47 ++ .../gpt-j/api-endpoint-artifacts/super.yaml | 27 + language/gpt-j/main.py | 2 + language/gpt-j/user.conf | 6 + 8 files changed, 681 insertions(+), 1 deletion(-) create mode 100644 language/gpt-j/SUT_old.py create mode 100644 language/gpt-j/api-endpoint-artifacts/model-vllm.yaml create mode 100644 language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml create mode 100644 language/gpt-j/api-endpoint-artifacts/super.yaml diff --git a/language/gpt-j/SUT.py b/language/gpt-j/SUT.py index 99e80653b..769c2b33a 100644 --- a/language/gpt-j/SUT.py +++ b/language/gpt-j/SUT.py @@ -23,6 +23,8 @@ from inference import GrpcClient +from vllm import LLM, SamplingParams + requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) import logging @@ -94,6 +96,7 @@ def __init__(self, api_model_name=None, grpc=False, batch_grpc=False, + vllm=False, dtype="bfloat16", device="cpu", batch_size=None, @@ -107,6 +110,9 @@ def __init__(self, self.api_model_name = api_model_name self.grpc = grpc self.batch_grpc = batch_grpc + self.vllm = vllm + if self.vllm and (self.grpc or self.batch_grpc): + sys.exit("vllm does not support grpc") self.device = device if not batch_size: @@ -190,6 +196,17 @@ def query_api(self, input): print("connection failure") return json.loads(response.text)["generated_text"] + def query_vllm(self, inputs): + sampling_params = SamplingParams( + max_tokens=128, + use_beam_search=True, + best_of=4, + temperature=0, + early_stopping=True, + ) + + outputs = self.llm.generate(inputs, sampling_params) + return [output.outputs[0].text for output in outputs] def query_api_grpc(self, input): resp = self.grpc_client.make_request([input], model_id=self.api_model_name) @@ -252,6 +269,8 @@ def process_queries(self): else: with ThreadPoolExecutor(max_workers=bs) as executor: output = list(executor.map(self.query_api_grpc,input_ids_tensor)) + elif self.vllm: + output = self.query_vllm(input_ids_tensor) else: with ThreadPoolExecutor(max_workers=bs) as executor: output = list(executor.map(self.query_api,input_ids_tensor)) @@ -304,6 +323,8 @@ def load_model(self): 443, verify=False, ) + elif self.vllm: + self.llm = LLM(model="/workspace/gpt-model-info/", dtype="float16") elif not "http" in self.api_server: self.api_server = "http://" + self.api_server diff --git a/language/gpt-j/SUT_old.py b/language/gpt-j/SUT_old.py new file mode 100644 index 000000000..ae65f825d --- /dev/null +++ b/language/gpt-j/SUT_old.py @@ -0,0 +1,557 @@ +import os +import sys +import time +import re +import numpy 
as np +import array +import torch +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation.streamers import BaseStreamer + +import pickle +import time +import threading +import tqdm +import queue + +from concurrent.futures.thread import ThreadPoolExecutor +import requests +from urllib3.exceptions import InsecureRequestWarning +import json + +from inference import GrpcClient + +requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) + +import logging +from typing import TYPE_CHECKING, Optional, List +from pathlib import Path + +import mlperf_loadgen as lg +from dataset import Dataset + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("GPT-J-SUT") + +gen_kwargs = { + "early_stopping": True, + "max_new_tokens": 128, + "min_new_tokens": 30, + "num_beams": 4, +} + + + +class FirstTokenStreamer(BaseStreamer): + """ Streams first tokens to a 'holder' """ + + def __init__(self, first_token, tokens_cache=[], is_first_token=True, response_ids=[] ): + """ Response ids added to 'sign' the first token""" + + self.first_token = first_token # Queue for first token + self.is_first_token = is_first_token + + # Cache for subsequent generated tokens + self.tokens_cache = tokens_cache + + self.response_ids = response_ids + + self.is_prompt = True # The first tokens sent to the streamer are actually the input prompts + + def put(self, value): + """ Caches the tokens as they're generated. Assumes bs=1 """ + + # Prompts are streamed first so we need to skip the first time value that arrives + if self.is_prompt: + self.is_prompt = False + return + + value = value.item() + if self.is_first_token: + + # Add generated first token together with its query response_id to first tokens queue + self.first_token.put((value, self.response_ids[0])) + + self.is_first_token = False + return + + self.tokens_cache.append(value) + + + def end(self): + pass + + def get_out_tokens(self): + return self.tokens_cache + + +class SUT(): + def __init__(self, + model_path=None, + api_server=None, + api_model_name=None, + grpc=False, + batch_grpc=False, + vllm=False, + dtype="bfloat16", + device="cpu", + batch_size=None, + total_sample_count=13368, + dataset_path=None, + use_cached_outputs=False, # Set this to True *only for test accuracy runs* in case your prior session was killed partway through + workers=1): + + self.model_path = model_path or "EleutherAI/gpt-j-6B" + self.api_server = api_server + self.api_model_name = api_model_name + self.grpc = grpc + self.batch_grpc = batch_grpc + self.vllm = vllm + if self.vllm and (self.grpc or self.batch_grpc): + sys.exit("vllm does not support grpc") + self.device = device + + if not batch_size: + if device == "cpu": + batch_size = 128 + else: + batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. + self.batch_size = batch_size + + # dtype + if dtype == 'bfloat16': + self.amp_enabled = True + self.amp_dtype = torch.bfloat16 + elif dtype == 'float16': + self.amp_enabled = True + self.amp_dtype = torch.float16 + else: + self.amp_enabled = False + self.amp_dtype = torch.float32 + + if 'cuda' in self.device: + assert torch.cuda.is_available(), "torch gpu is not available, exiting..." 
+ + self.dataset_path = dataset_path + self.data_object = Dataset(dataset_path=self.dataset_path, + total_count_override=total_sample_count) + self.qsl = lg.ConstructQSL(self.data_object.count, self.data_object.perf_count, + self.data_object.LoadSamplesToRam, self.data_object.UnloadSamplesFromRam) + self.load_model() + + self.num_workers = workers + self.worker_threads = [None] * self.num_workers + self.query_queue = queue.Queue() + + self.use_cached_outputs = use_cached_outputs + self.sample_counter = 0 + self.sample_counter_lock = threading.Lock() + + + def start(self): + # Create worker threads + for j in range(self.num_workers): + worker = threading.Thread(target=self.process_queries) + worker.start() + self.worker_threads[j] = worker + + def stop(self): + for _ in range(self.num_workers): + self.query_queue.put(None) + + for worker in self.worker_threads: + worker.join() + + + def query_api(self, input): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model_id': self.api_model_name, + 'inputs': input, + 'parameters': { + 'max_new_tokens': 1024, + 'min_new_tokens': 1, + 'decoding_method': "GREEDY" + }, + } + + response_code = 0 + while response_code != 200: + try: + response = requests.post( + self.api_server, + headers=headers, + json=json_data, + verify=False, + ) + response_code = response.status_code + except: + print("connection failure") + return json.loads(response.text)["generated_text"] + + def query_api_vllm(self, inputs): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model': '/mnt/models/', + 'prompt': inputs, + 'max_tokens': 128, + 'use_beam_search': True, + 'best_of': 4, + 'temperature': 0, + 'early_stopping': True + } + + response_code = 0 + while response_code != 200: + try: + response = requests.post(f'{self.api_server}/v1/completions', headers=headers, json=json_data, verify=False) + response_code = response.status_code + except: + print("connection failure") + return [resp["text"] for resp in json.loads(response.text)["choices"]] + + def query_api_grpc(self, input): + resp = self.grpc_client.make_request([input], model_id=self.api_model_name) + return resp.responses[0].text + + def query_api_batch_grpc(self, inputs): + resps = self.grpc_client.make_request(inputs, model_id=self.api_model_name) + return [resp.text for resp in resps.responses] + + def process_queries(self): + """Processor of the queued queries. 
User may choose to add batching logic """ + + while True: + qitem = self.query_queue.get() + if qitem is None: + break + + query_ids = [q.index for q in qitem] + + fname = "q" + "_".join([str(i) for i in query_ids]) + fname = f"run_outputs/{fname}.pkl" + _p = Path(fname) + if self.use_cached_outputs and _p.exists(): + # Read cache + with _p.open(mode="rb") as f: + d = pickle.load(f) + processed_output = d["outputs"] + tik1 = None + tik2 = None + tik3 = None + tok = None + else: + # Construct / collate batch + tik1 = time.time() + + input_ids_tensor = [] + input_masks_tensor = [] + input_len = [] + for q in qitem: + input_ids_tensor.append(self.data_object.source_encoded_input_ids[q.index]) + #input_masks_tensor.append(self.data_object.source_encoded_attn_masks[q.index]) + #input_len.append(self.data_object.input_lens[q.index]) + #input_ids_tensor = torch.cat(input_ids_tensor) + #input_masks_tensor = torch.cat(input_masks_tensor) + + #assert input_ids_tensor.shape == input_masks_tensor.shape + #assert input_ids_tensor.shape[0] <= self.batch_size + + if self.api_server: + #decoded = self.tokenizer.batch_decode(input_ids_tensor) + #cleaned = [entry.replace('','').replace('','') for entry in decoded] + bs = len(input_ids_tensor) + + tik2 = time.time() + + if self.api_server: + if self.grpc: + if self.batch_grpc: + output = self.query_api_batch_grpc(input_ids_tensor) + else: + with ThreadPoolExecutor(max_workers=bs) as executor: + output = list(executor.map(self.query_api_grpc,input_ids_tensor)) + elif self.vllm: + output = self.query_api_vllm(input_ids_tensor) + else: + with ThreadPoolExecutor(max_workers=bs) as executor: + output = list(executor.map(self.query_api,input_ids_tensor)) + else: + pred_output_tokens = self.model.generate( + input_ids=input_ids_tensor, + attention_mask=input_masks_tensor, + pad_token_id=self.tokenizer.pad_token_id, + **gen_kwargs + ) + + tik3 = time.time() + + if self.api_server: + processed_output = np.array(self.tokenizer(output, padding='longest')['input_ids']) + else: + processed_output = self.data_object.postProcess(pred_output_tokens, + input_seq_lens=input_len, + query_id_list=query_ids) + + for i in range(len(qitem)): + n_tokens = processed_output[i].shape[0] + response_array = array.array("B", processed_output[i].tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse(qitem[i].id, bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) + + tok = time.time() + + with self.sample_counter_lock: + self.sample_counter += len(qitem) + print(f"Samples run: {self.sample_counter}") + if tik1: + print(f"\tBatchMaker time: {tik2 - tik1}") + print(f"\tInference time: {tik3 - tik2}") + print(f"\tPostprocess time: {tok - tik3}") + print(f"\t==== Total time: {tok - tik1}") + else: + print(f"\tLoaded from cache: {_p}") + + + def load_model(self): + if self.api_server: + if self.grpc: + hostname = re.sub("https://|http://", "", self.api_server) + if hostname[-1] == "/": + hostname = hostname[:-1] + self.grpc_client = GrpcClient( + hostname, + 443, + verify=False, + ) + elif not "http" in self.api_server: + self.api_server = "http://" + self.api_server + + if not self.api_model_name: + sys.exit("API Server was specified but no model name was provided") + else: + sys.exit("ONLY API SERVER MODE SUPPORTED FOR GPT-J") + self.model = AutoModelForCausalLM.from_pretrained( + self.model_path, + device_map="auto", + low_cpu_mem_usage=True, + torch_dtype=self.amp_dtype + ) + print("Loaded model") + + self.device = torch.device(self.device) + if 
self.device == "cpu": + self.model = self.model.to(self.device) # Force CPU if your system has GPU and you specifically want CPU-only run + + self.model.eval() + self.model = self.model.to(memory_format=torch.channels_last) + + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, + model_max_length=1024, + padding_side="left", + use_fast=True,) #changed from false + + self.tokenizer.pad_token = self.tokenizer.eos_token + print("Loaded tokenizer") + + def get_sut(self): + self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) + return self.sut + + def get_qsl(self): + return self.qsl + + + def predict(self,**kwargs): + raise NotImplementedError + + + def issue_queries(self, query_samples): + """ Receives samples from loadgen and adds them to queue. Users may choose to batch here""" + + list_prompts_tokens = [] + list_prompts_attn_masks = [] + + print(f"IssueQuery started with {len(query_samples)} samples") + while len(query_samples) > 0: + self.query_queue.put(query_samples[:self.batch_size]) + query_samples = query_samples[self.batch_size:] + print(f"IssueQuery done") + + + def flush_queries(self): + pass + + def __del__(self): + pass + + +class SUTServer(SUT): + def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + + super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) + + with open(f"{self.model_path}/tokenizer.json", 'r') as token_file: + gpt_tokenizer = json.load(token_file) + self.gpt_vocab = gpt_tokenizer["model"]["vocab"] + + self.first_token_queue = queue.Queue() + + + def start(self): + + # Create worker threads + for j in range(self.num_workers): + worker = threading.Thread(target=self.process_queries) + worker.start() + self.worker_threads[j] = worker + + # Create first token response thread + self.ft_response_thread = threading.Thread(target=self.process_first_tokens) + self.ft_response_thread.start() + + + def process_first_tokens(self): + + while True: + first_token_item = self.first_token_queue.get() + + if first_token_item is None: + log.info("Exiting First token response thread") + break + + first_tokens, response_id = first_token_item + + response_data = array.array("B", np.array(first_tokens, np.float32).tobytes()) + bi = response_data.buffer_info() + response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])] + lg.FirstTokenComplete(response) + + def stream_api(self, input, response_ids): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model_id': 'GPT-J', + 'inputs': input, + 'parameters': { + 'max_new_tokens': 1024, + 'min_new_tokens': 1, + 'decoding_method': "GREEDY" + }, + } + + token_cache = [] + s = requests.Session() + first = True + with s.post( + self.api_server, + headers=headers, + json=json_data, + verify=False, + stream=True + ) as resp: + for line in resp.iter_lines(): + if line: + decoded = line.decode() + if decoded.startswith("data"): + token_l = json.loads(decoded[6:])["tokens"] + if token_l: + token = self.gpt_vocab[token_l[0]["text"]] + if first: + self.first_token_queue.put((token, response_ids[0])) + first = False + else: + token_cache.append(token) + return token_cache + + def stream_api_grpc(self, input, response_ids): + token_cache = [] + first = True + resps = 
self.grpc_client.make_request_stream(input, model_id=self.api_model_name) + for resp in resps: + if resp.tokens: + token = self.gpt_vocab[resp.tokens[0].text] + if first: + self.first_token_queue.put((token, response_ids[0])) + first = False + else: + token_cache.append(token) + return token_cache + + def async_process_query(self, input_ids_tensor, qitem_id): + decoded = input_ids_tensor + response_ids = [qitem_id] + if self.grpc: + output_tokens = self.stream_api_grpc(decoded, response_ids) + else: + output_tokens = self.stream_api(decoded, response_ids) + + n_tokens = len(output_tokens) + response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse( + qitem_id, bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) + sys.exit() + + def process_queries(self): + """Processor of the queued queries. User may choose to add batching logic """ + while True: + + qitem = self.query_queue.get() + if qitem is None: + break + + input_ids_tensor = self.data_object.source_encoded_input_ids[qitem.index] + input_masks_tensor = []#self.data_object.source_encoded_attn_masks[qitem.index] + + if self.api_server: + threading.Thread(target=self.async_process_query, args=(input_ids_tensor, qitem.id)).start() + else: + #TODO: This PoC is super slow with significant overhead. Best to create a patch to `generate` + tokens_cache = [] + tokens_streamer = FirstTokenStreamer(self.first_token_queue, tokens_cache=tokens_cache, is_first_token=True, response_ids=[qitem.id]) + + _ = self.model.generate( input_ids=input_ids_tensor, + attention_mask=input_masks_tensor, + pad_token_id=self.tokenizer.pad_token_id, + streamer = tokens_streamer, + **gen_kwargs + ) + + output_tokens = tokens_streamer.get_out_tokens() + + n_tokens = len(output_tokens) + response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) + bi = response_array.buffer_info() + response = [lg.QuerySampleResponse( + qitem.id, bi[0], bi[1], n_tokens)] + lg.QuerySamplesComplete(response) + + + def issue_queries(self, query_samples): + + self.query_queue.put(query_samples[0]) + + + def stop(self): + for _ in range(self.num_workers): + self.query_queue.put(None) + + for worker in self.worker_threads: + worker.join() + + self.first_token_queue.put(None) + self.ft_response_thread.join() diff --git a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml index f49144b4d..a39cd22b4 100644 --- a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml +++ b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference-gpt:v1 + image: quay.io/meyceoz/mlperf-inference-gpt:vllm-fix-offline resources: requests: memory: 20000Mi diff --git a/language/gpt-j/api-endpoint-artifacts/model-vllm.yaml b/language/gpt-j/api-endpoint-artifacts/model-vllm.yaml new file mode 100644 index 000000000..d314cf952 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/model-vllm.yaml @@ -0,0 +1,20 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + annotations: + serving.knative.openshift.io/enablePassthrough: "true" + sidecar.istio.io/inject: "true" + sidecar.istio.io/rewriteAppHTTPProbers: "true" + name: gpt-j-isvc +spec: + predictor: + minReplicas: 1 + maxReplicas: 1 + #apiVersion: serving.kserve.io/v1alpha2 + serviceAccountName: sa + #timeout: 240 + model: + modelFormat: + name: pytorch + 
runtime: vllm + storageUri: s3://mlperf-inference-models/gpt-j-cnn/ \ No newline at end of file diff --git a/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml b/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml new file mode 100644 index 000000000..51d300a41 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml @@ -0,0 +1,47 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +labels: + opendatahub.io/dashboard: "true" +metadata: + annotations: + openshift.io/display-name: vLLM + name: vllm +spec: + builtInAdapter: + modelLoadingTimeoutMillis: 90000 + containers: + - args: + - --model + - /mnt/models/ + - --download-dir + - /models-cache + - --port + - "8080" + - --dtype + - float16 + image: quay.io/rh-aiservices-bu/vllm-openai-ubi9:0.3.1-fix-2939 + name: kserve-container + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + resources: # configure as required + requests: + cpu: 96 + memory: 1024Gi + nvidia.com/gpu: 8 + limits: + cpu: 96 + memory: 1024Gi + nvidia.com/gpu: 8 + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + multiModel: false + supportedModelFormats: + - autoSelect: true + name: pytorch \ No newline at end of file diff --git a/language/gpt-j/api-endpoint-artifacts/super.yaml b/language/gpt-j/api-endpoint-artifacts/super.yaml new file mode 100644 index 000000000..9586f470d --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/super.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: Pod +metadata: + name: mlperf-inference-gpt +spec: + restartPolicy: Never + containers: + - name: mlperf-env + image: quay.io/meyceoz/mlperf-inference-gpt:vllm-fix-offline + resources: + requests: + cpu: 96 + memory: 1024Gi + nvidia.com/gpu: 8 + limits: + cpu: 96 + memory: 1024Gi + nvidia.com/gpu: 8 + volumeMounts: + - mountPath: /dev/shm + name: dshm + command: [ "/bin/sh", "-c" ] + args: [ "sleep infinity" ] + volumes: + - name: dshm + emptyDir: + medium: Memory \ No newline at end of file diff --git a/language/gpt-j/main.py b/language/gpt-j/main.py index edb8ec84e..304b717d3 100644 --- a/language/gpt-j/main.py +++ b/language/gpt-j/main.py @@ -21,6 +21,7 @@ def get_args(): parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") parser.add_argument("--grpc", action="store_true", help="Enable grpc for api endpoint") parser.add_argument("--batch-grpc", action="store_true", help="Enable batch requests for grpc") + parser.add_argument("--vllm", action="store_true", help="Switch runtime to vllm for api endpoint") parser.add_argument("--dtype", type=str, default="float32", help="data type of the model, choose from float16, bfloat16 and float32") parser.add_argument("--device", type=str, choices=["cpu", "cuda:0"], default="cpu", help="device to use") parser.add_argument("--audit-conf", type=str, default="audit.conf", help="audit config for LoadGen settings during compliance runs") @@ -76,6 +77,7 @@ def main(): api_model_name=args.api_model_name, grpc=args.grpc, batch_grpc=args.batch_grpc, + vllm=args.vllm, dtype=args.dtype, dataset_path=args.dataset_path, total_sample_count=args.total_sample_count, diff --git a/language/gpt-j/user.conf b/language/gpt-j/user.conf index 07a10bbe2..a69446c6b 100644 --- a/language/gpt-j/user.conf +++ b/language/gpt-j/user.conf @@ -2,3 +2,9 @@ # The key has the format 'model.scenario.key'. Value is mostly int64_t. # Model maybe '*' as wildcard. In that case the value applies to all models. 
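The `--vllm` flag added above routes requests to the OpenAI-compatible endpoint that the new `serving-vllm.yaml`/`model-vllm.yaml` pair stands up. As a minimal sketch of that request shape, mirroring `query_api_vllm` above, with a hypothetical route URL, prompt, and timeout standing in for real values:

```python
# Minimal sketch of the --vllm request path against the KServe-served endpoint.
# API_SERVER is a placeholder route; the payload mirrors query_api_vllm above.
import requests

API_SERVER = "https://gpt-j-isvc-predictor.example.com"  # hypothetical

def query_vllm_completions(prompts, max_tokens=128):
    """POST a batch of prompts to vLLM's /v1/completions and return the texts."""
    payload = {
        "model": "/mnt/models/",   # path the serving runtime mounts the model under
        "prompt": prompts,         # a list of prompts is sent as one request
        "max_tokens": max_tokens,
        "temperature": 0,          # greedy decoding
    }
    resp = requests.post(
        f"{API_SERVER}/v1/completions",
        headers={"Content-Type": "application/json"},
        json=payload,
        verify=False,              # cluster routes use self-signed certs here
        timeout=600,               # illustrative timeout, not from the repo
    )
    resp.raise_for_status()
    return [choice["text"] for choice in resp.json()["choices"]]

if __name__ == "__main__":
    print(query_vllm_completions(["Summarize: The quick brown fox jumps over the lazy dog."]))
```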
# All times are in milli seconds +*.Offline.min_duration = 60000 +*.Offline.min_query_count = 1000 + +*.Server.target_qps = 0.5 +*.Server.min_duration = 120000 +*.Server.min_query_count = 100 From ebf0710efbfe927e26a288369df6e503efaf771c Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 21 Feb 2024 16:55:19 -0500 Subject: [PATCH 63/75] Change file names --- language/gpt-j/SUT.py | 40 ++++++++++++--------- language/gpt-j/{SUT_old.py => SUT_local.py} | 40 +++++++++------------ 2 files changed, 40 insertions(+), 40 deletions(-) rename language/gpt-j/{SUT_old.py => SUT_local.py} (95%) diff --git a/language/gpt-j/SUT.py b/language/gpt-j/SUT.py index 769c2b33a..ae65f825d 100644 --- a/language/gpt-j/SUT.py +++ b/language/gpt-j/SUT.py @@ -23,8 +23,6 @@ from inference import GrpcClient -from vllm import LLM, SamplingParams - requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) import logging @@ -196,17 +194,29 @@ def query_api(self, input): print("connection failure") return json.loads(response.text)["generated_text"] - def query_vllm(self, inputs): - sampling_params = SamplingParams( - max_tokens=128, - use_beam_search=True, - best_of=4, - temperature=0, - early_stopping=True, - ) - - outputs = self.llm.generate(inputs, sampling_params) - return [output.outputs[0].text for output in outputs] + def query_api_vllm(self, inputs): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model': '/mnt/models/', + 'prompt': inputs, + 'max_tokens': 128, + 'use_beam_search': True, + 'best_of': 4, + 'temperature': 0, + 'early_stopping': True + } + + response_code = 0 + while response_code != 200: + try: + response = requests.post(f'{self.api_server}/v1/completions', headers=headers, json=json_data, verify=False) + response_code = response.status_code + except: + print("connection failure") + return [resp["text"] for resp in json.loads(response.text)["choices"]] def query_api_grpc(self, input): resp = self.grpc_client.make_request([input], model_id=self.api_model_name) @@ -270,7 +280,7 @@ def process_queries(self): with ThreadPoolExecutor(max_workers=bs) as executor: output = list(executor.map(self.query_api_grpc,input_ids_tensor)) elif self.vllm: - output = self.query_vllm(input_ids_tensor) + output = self.query_api_vllm(input_ids_tensor) else: with ThreadPoolExecutor(max_workers=bs) as executor: output = list(executor.map(self.query_api,input_ids_tensor)) @@ -323,8 +333,6 @@ def load_model(self): 443, verify=False, ) - elif self.vllm: - self.llm = LLM(model="/workspace/gpt-model-info/", dtype="float16") elif not "http" in self.api_server: self.api_server = "http://" + self.api_server diff --git a/language/gpt-j/SUT_old.py b/language/gpt-j/SUT_local.py similarity index 95% rename from language/gpt-j/SUT_old.py rename to language/gpt-j/SUT_local.py index ae65f825d..769c2b33a 100644 --- a/language/gpt-j/SUT_old.py +++ b/language/gpt-j/SUT_local.py @@ -23,6 +23,8 @@ from inference import GrpcClient +from vllm import LLM, SamplingParams + requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) import logging @@ -194,29 +196,17 @@ def query_api(self, input): print("connection failure") return json.loads(response.text)["generated_text"] - def query_api_vllm(self, inputs): - headers = { - 'Content-Type': 'application/json', - } - - json_data = { - 'model': '/mnt/models/', - 'prompt': inputs, - 'max_tokens': 128, - 'use_beam_search': True, - 'best_of': 4, - 'temperature': 0, - 'early_stopping': True - } - - response_code = 0 - while 
response_code != 200: - try: - response = requests.post(f'{self.api_server}/v1/completions', headers=headers, json=json_data, verify=False) - response_code = response.status_code - except: - print("connection failure") - return [resp["text"] for resp in json.loads(response.text)["choices"]] + def query_vllm(self, inputs): + sampling_params = SamplingParams( + max_tokens=128, + use_beam_search=True, + best_of=4, + temperature=0, + early_stopping=True, + ) + + outputs = self.llm.generate(inputs, sampling_params) + return [output.outputs[0].text for output in outputs] def query_api_grpc(self, input): resp = self.grpc_client.make_request([input], model_id=self.api_model_name) @@ -280,7 +270,7 @@ def process_queries(self): with ThreadPoolExecutor(max_workers=bs) as executor: output = list(executor.map(self.query_api_grpc,input_ids_tensor)) elif self.vllm: - output = self.query_api_vllm(input_ids_tensor) + output = self.query_vllm(input_ids_tensor) else: with ThreadPoolExecutor(max_workers=bs) as executor: output = list(executor.map(self.query_api,input_ids_tensor)) @@ -333,6 +323,8 @@ def load_model(self): 443, verify=False, ) + elif self.vllm: + self.llm = LLM(model="/workspace/gpt-model-info/", dtype="float16") elif not "http" in self.api_server: self.api_server = "http://" + self.api_server From a357cf4824f6d0e875f00e8611375757bc4def82 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 22 Feb 2024 14:24:42 -0500 Subject: [PATCH 64/75] Added vllm server + multi-endpoint for gpt-j --- language/gpt-j/SUT.py | 178 +++++++++++------- .../gpt-j/api-endpoint-artifacts/README.md | 11 +- .../api-endpoint-artifacts/benchmark.yaml | 2 +- .../offline_performance_log.log | 1 - .../api-endpoint-artifacts/serving-vllm.yaml | 12 +- .../gpt-j/api-endpoint-artifacts/super.yaml | 27 --- language/gpt-j/main.py | 2 + language/gpt-j/{backend.py => old_backend.py} | 0 8 files changed, 129 insertions(+), 104 deletions(-) delete mode 100644 language/gpt-j/api-endpoint-artifacts/offline_performance_log.log delete mode 100644 language/gpt-j/api-endpoint-artifacts/super.yaml rename language/gpt-j/{backend.py => old_backend.py} (100%) diff --git a/language/gpt-j/SUT.py b/language/gpt-j/SUT.py index ae65f825d..dbfa744c2 100644 --- a/language/gpt-j/SUT.py +++ b/language/gpt-j/SUT.py @@ -22,6 +22,8 @@ import json from inference import GrpcClient +import more_itertools as mit +from itertools import repeat requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) @@ -92,6 +94,7 @@ def __init__(self, model_path=None, api_server=None, api_model_name=None, + additional_servers=[], grpc=False, batch_grpc=False, vllm=False, @@ -104,8 +107,14 @@ def __init__(self, workers=1): self.model_path = model_path or "EleutherAI/gpt-j-6B" - self.api_server = api_server self.api_model_name = api_model_name + self.api_servers = [] + if api_server: + self.api_servers.append(api_server) + if additional_servers and not api_server: + sys.exit("Additional servers cannot be used without primary api server") + for server in additional_servers: + self.api_servers.append(server) self.grpc = grpc self.batch_grpc = batch_grpc self.vllm = vllm @@ -114,14 +123,14 @@ def __init__(self, self.device = device if not batch_size: - if device == "cpu": + if device == "cpu": # Also applies to API server mode batch_size = 128 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. 
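For contrast with the HTTP path, the renamed `SUT_local.py` keeps the in-process engine. A compressed sketch of that path, assuming the vLLM release pinned at the time (newer releases expose a different `SamplingParams` surface) and an illustrative model path:

```python
# In-process variant kept in SUT_local.py: load GPT-J into a local vLLM engine
# and decode with the same beam-search settings query_vllm uses.
from vllm import LLM, SamplingParams

llm = LLM(model="EleutherAI/gpt-j-6b", dtype="float16")   # model path is illustrative

sampling = SamplingParams(
    max_tokens=128,
    use_beam_search=True,   # accepted by the vLLM version used here
    best_of=4,
    temperature=0,
    early_stopping=True,
)

outputs = llm.generate(["Summarize the following news article: ..."], sampling)
print(outputs[0].outputs[0].text)
```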
self.batch_size = batch_size # dtype - if dtype == 'bfloat16': + if dtype == 'bfloat16': # Irrelevant for API server mode self.amp_enabled = True self.amp_dtype = torch.bfloat16 elif dtype == 'float16': @@ -165,7 +174,7 @@ def stop(self): worker.join() - def query_api(self, input): + def query_api(self, input, idx): headers = { 'Content-Type': 'application/json', } @@ -174,8 +183,8 @@ def query_api(self, input): 'model_id': self.api_model_name, 'inputs': input, 'parameters': { - 'max_new_tokens': 1024, - 'min_new_tokens': 1, + 'max_new_tokens': 128, + 'min_new_tokens': 30, 'decoding_method': "GREEDY" }, } @@ -184,7 +193,7 @@ def query_api(self, input): while response_code != 200: try: response = requests.post( - self.api_server, + self.api_servers[idx], headers=headers, json=json_data, verify=False, @@ -194,7 +203,7 @@ def query_api(self, input): print("connection failure") return json.loads(response.text)["generated_text"] - def query_api_vllm(self, inputs): + def query_api_vllm(self, inputs, idx): headers = { 'Content-Type': 'application/json', } @@ -203,28 +212,39 @@ def query_api_vllm(self, inputs): 'model': '/mnt/models/', 'prompt': inputs, 'max_tokens': 128, - 'use_beam_search': True, - 'best_of': 4, 'temperature': 0, - 'early_stopping': True } response_code = 0 while response_code != 200: try: - response = requests.post(f'{self.api_server}/v1/completions', headers=headers, json=json_data, verify=False) + response = requests.post(f'{self.api_servers[idx]}/v1/completions', headers=headers, json=json_data, verify=False) response_code = response.status_code except: print("connection failure") return [resp["text"] for resp in json.loads(response.text)["choices"]] - def query_api_grpc(self, input): - resp = self.grpc_client.make_request([input], model_id=self.api_model_name) + def query_api_grpc(self, input, idx): + resp = self.grpc_clients[idx].make_request([input], model_id=self.api_model_name) return resp.responses[0].text - def query_api_batch_grpc(self, inputs): - resps = self.grpc_client.make_request(inputs, model_id=self.api_model_name) + def query_api_batch_grpc(self, inputs, idx): + resps = self.grpc_clients[idx].make_request(inputs, model_id=self.api_model_name) return [resp.text for resp in resps.responses] + + def api_action_handler(self, chunk, server_idx): + if self.grpc: + if self.batch_grpc: + output = self.query_api_batch_grpc(chunk, server_idx) + else: + with ThreadPoolExecutor(max_workers=len(chunk)) as executor: + output = list(executor.map(self.query_api_grpc,chunk, repeat(server_idx))) + elif self.vllm: + output = self.query_api_vllm(chunk, server_idx) + else: + with ThreadPoolExecutor(max_workers=len(chunk)) as executor: + output = list(executor.map(self.query_api,chunk, repeat(server_idx))) + return output def process_queries(self): """Processor of the queued queries. 
User may choose to add batching logic """ @@ -257,33 +277,17 @@ def process_queries(self): input_len = [] for q in qitem: input_ids_tensor.append(self.data_object.source_encoded_input_ids[q.index]) - #input_masks_tensor.append(self.data_object.source_encoded_attn_masks[q.index]) - #input_len.append(self.data_object.input_lens[q.index]) - #input_ids_tensor = torch.cat(input_ids_tensor) - #input_masks_tensor = torch.cat(input_masks_tensor) - - #assert input_ids_tensor.shape == input_masks_tensor.shape - #assert input_ids_tensor.shape[0] <= self.batch_size - - if self.api_server: - #decoded = self.tokenizer.batch_decode(input_ids_tensor) - #cleaned = [entry.replace('','').replace('','') for entry in decoded] - bs = len(input_ids_tensor) + if self.api_servers: + cleaned_chunks = [list(c) for c in mit.divide(len(self.api_servers), input_ids_tensor)] tik2 = time.time() - if self.api_server: - if self.grpc: - if self.batch_grpc: - output = self.query_api_batch_grpc(input_ids_tensor) - else: - with ThreadPoolExecutor(max_workers=bs) as executor: - output = list(executor.map(self.query_api_grpc,input_ids_tensor)) - elif self.vllm: - output = self.query_api_vllm(input_ids_tensor) - else: - with ThreadPoolExecutor(max_workers=bs) as executor: - output = list(executor.map(self.query_api,input_ids_tensor)) + if self.api_servers: + with ThreadPoolExecutor(max_workers=len(self.api_servers)) as executor: + output_chunks = list(executor.map(self.api_action_handler,cleaned_chunks,range(len(self.api_servers)))) + output = [] + for row in output_chunks: + output += row else: pred_output_tokens = self.model.generate( input_ids=input_ids_tensor, @@ -294,7 +298,7 @@ def process_queries(self): tik3 = time.time() - if self.api_server: + if self.api_servers: processed_output = np.array(self.tokenizer(output, padding='longest')['input_ids']) else: processed_output = self.data_object.postProcess(pred_output_tokens, @@ -323,18 +327,23 @@ def process_queries(self): def load_model(self): - if self.api_server: - if self.grpc: - hostname = re.sub("https://|http://", "", self.api_server) - if hostname[-1] == "/": - hostname = hostname[:-1] - self.grpc_client = GrpcClient( - hostname, - 443, - verify=False, - ) - elif not "http" in self.api_server: - self.api_server = "http://" + self.api_server + if self.api_servers: + if not self.api_model_name: + sys.exit("API Server was specified but no model name was provided") + self.grpc_clients = [] + for server in self.api_servers: + if self.grpc: + hostname = re.sub("https://|http://", "", server) + if hostname[-1] == "/": + hostname = hostname[:-1] + grpc_client = GrpcClient( + hostname, + 443, + verify=False, + ) + self.grpc_clients.append(grpc_client) + elif not "http" in server: + server = "http://" + server if not self.api_model_name: sys.exit("API Server was specified but no model name was provided") @@ -397,9 +406,9 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, api_server=None, api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + def __init__(self, model_path=None, api_server=None, additional_servers=[], api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): - super().__init__(model_path=model_path, api_server=api_server, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, 
workers=workers) + super().__init__(model_path=model_path, api_server=api_server, additional_servers=additional_servers, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) with open(f"{self.model_path}/tokenizer.json", 'r') as token_file: gpt_tokenizer = json.load(token_file) @@ -437,7 +446,7 @@ def process_first_tokens(self): response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])] lg.FirstTokenComplete(response) - def stream_api(self, input, response_ids): + def stream_api(self, input, response_ids, idx): headers = { 'Content-Type': 'application/json', } @@ -456,7 +465,7 @@ def stream_api(self, input, response_ids): s = requests.Session() first = True with s.post( - self.api_server, + self.api_servers[idx], headers=headers, json=json_data, verify=False, @@ -476,10 +485,10 @@ def stream_api(self, input, response_ids): token_cache.append(token) return token_cache - def stream_api_grpc(self, input, response_ids): + def stream_api_grpc(self, input, response_ids, idx): token_cache = [] first = True - resps = self.grpc_client.make_request_stream(input, model_id=self.api_model_name) + resps = self.grpc_clients[idx].make_request_stream(input, model_id=self.api_model_name) for resp in resps: if resp.tokens: token = self.gpt_vocab[resp.tokens[0].text] @@ -489,14 +498,53 @@ def stream_api_grpc(self, input, response_ids): else: token_cache.append(token) return token_cache + + def stream_api_vllm(self, input, response_ids, idx): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model': '/mnt/models/', + 'prompt': input, + 'max_tokens': 128, + 'temperature': 0, + 'stream': True, + 'logprobs': 1 + } + + token_cache = [] + s = requests.Session() + first = True + with s.post( + f'{self.api_servers[idx]}/v1/completions', + headers=headers, + json=json_data, + verify=False, + stream=True + ) as resp: + for line in resp.iter_lines(): + if line: + decoded = line.decode() + if decoded.startswith("data") and "[DONE]" not in decoded: + inter = json.loads(decoded[6:])["choices"][0]["logprobs"] + if "top_logprobs" in inter: + token_s = list(inter["top_logprobs"][0].keys())[0] + token = self.gpt_vocab[token_s] + if first: + self.first_token_queue.put((token, response_ids[0])) + first = False + else: + token_cache.append(token) + return token_cache - def async_process_query(self, input_ids_tensor, qitem_id): + def async_process_query(self, input_ids_tensor, qitem_id, idx): decoded = input_ids_tensor response_ids = [qitem_id] if self.grpc: - output_tokens = self.stream_api_grpc(decoded, response_ids) + output_tokens = self.stream_api_grpc(decoded, response_ids, idx) else: - output_tokens = self.stream_api(decoded, response_ids) + output_tokens = self.stream_api(decoded, response_ids, idx) n_tokens = len(output_tokens) response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) @@ -508,6 +556,7 @@ def async_process_query(self, input_ids_tensor, qitem_id): def process_queries(self): """Processor of the queued queries. 
User may choose to add batching logic """ + server_idx = 0 while True: qitem = self.query_queue.get() @@ -517,8 +566,9 @@ def process_queries(self): input_ids_tensor = self.data_object.source_encoded_input_ids[qitem.index] input_masks_tensor = []#self.data_object.source_encoded_attn_masks[qitem.index] - if self.api_server: - threading.Thread(target=self.async_process_query, args=(input_ids_tensor, qitem.id)).start() + if self.api_servers: + threading.Thread(target=self.async_process_query, args=(input_ids_tensor, qitem.id, server_idx)).start() + server_idx = (server_idx + 1) % len(self.api_servers) else: #TODO: This PoC is super slow with significant overhead. Best to create a patch to `generate` tokens_cache = [] diff --git a/language/gpt-j/api-endpoint-artifacts/README.md b/language/gpt-j/api-endpoint-artifacts/README.md index db4f1cf9f..6f5b4a70f 100644 --- a/language/gpt-j/api-endpoint-artifacts/README.md +++ b/language/gpt-j/api-endpoint-artifacts/README.md @@ -4,7 +4,8 @@ Prerequisites: - Install the OpenShift AI model serving stack - Add your AWS credentials to `secret.yaml` access the model files - Apply `secret.yaml`, `sa.yaml` - - FOR TGIS STANDALONE: Apply `serving-tgis.yaml`, then finally `model.yaml` + - FOR TGIS: Apply `serving-tgis.yaml`, then finally `model.yaml` + - FOR VLLM: Apply `serving-vllm.yaml`, then finally `model-vllm.yaml` - Create a benchmark pod using `benchmark.yaml` In the pod, before any benchmark, first run `cd inference/language/gpt-j` @@ -12,7 +13,7 @@ In the pod, before any benchmark, first run `cd inference/language/gpt-j` ## STANDALONE TGIS INSTRUCTIONS For the full accuracy benchmark (offline), run in the pod: ``` -python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --accuracy --grpc --batch-grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --accuracy --vllm --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log ``` You can then run the same evaluation/consolidation scripts as the regular benchmark @@ -21,16 +22,16 @@ Example API host: `https://gpt-j-isvc-predictor-gpt-service.apps.gdr-perf.perf.e For the performance benchmark (offline), run in the pod: ``` -python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --grpc --batch-grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --vllm --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log ``` (It is the same, just with `--accuracy` removed) For the performance benchmark (server), run in the pod: ``` -python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --grpc --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee 
server_performance_log.log +python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name gpt-j-cnn --mlperf-conf mlperf.conf --vllm --user-conf user.conf --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log ``` (Configure target qps in `user.conf`) -NOTE: Hyperparams are currently configured for 1xA100 40GB +NOTE: Hyperparams are currently configured for N instance x H100 80GB diff --git a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml index a39cd22b4..253a71600 100644 --- a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml +++ b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference-gpt:vllm-fix-offline + image: quay.io/meyceoz/mlperf-inference-gpt:work-check resources: requests: memory: 20000Mi diff --git a/language/gpt-j/api-endpoint-artifacts/offline_performance_log.log b/language/gpt-j/api-endpoint-artifacts/offline_performance_log.log deleted file mode 100644 index 4e4fdb9e2..000000000 --- a/language/gpt-j/api-endpoint-artifacts/offline_performance_log.log +++ /dev/null @@ -1 +0,0 @@ -/Users/meyceoz/Documents/model-serving/conversion-zone/venv/bin/python3: can't open file '/Users/meyceoz/Documents/model-serving/benchmarking/inference/language/gpt-j/api-endpoint-artifacts/main.py': [Errno 2] No such file or directory diff --git a/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml b/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml index 51d300a41..0685a883b 100644 --- a/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml +++ b/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml @@ -27,13 +27,13 @@ spec: protocol: TCP resources: # configure as required requests: - cpu: 96 - memory: 1024Gi - nvidia.com/gpu: 8 + cpu: 24 + memory: 128Gi + nvidia.com/gpu: 1 limits: - cpu: 96 - memory: 1024Gi - nvidia.com/gpu: 8 + cpu: 24 + memory: 128Gi + nvidia.com/gpu: 1 volumeMounts: - mountPath: /dev/shm name: dshm diff --git a/language/gpt-j/api-endpoint-artifacts/super.yaml b/language/gpt-j/api-endpoint-artifacts/super.yaml deleted file mode 100644 index 9586f470d..000000000 --- a/language/gpt-j/api-endpoint-artifacts/super.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: mlperf-inference-gpt -spec: - restartPolicy: Never - containers: - - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference-gpt:vllm-fix-offline - resources: - requests: - cpu: 96 - memory: 1024Gi - nvidia.com/gpu: 8 - limits: - cpu: 96 - memory: 1024Gi - nvidia.com/gpu: 8 - volumeMounts: - - mountPath: /dev/shm - name: dshm - command: [ "/bin/sh", "-c" ] - args: [ "sleep infinity" ] - volumes: - - name: dshm - emptyDir: - medium: Memory \ No newline at end of file diff --git a/language/gpt-j/main.py b/language/gpt-j/main.py index 304b717d3..b8e99963f 100644 --- a/language/gpt-j/main.py +++ b/language/gpt-j/main.py @@ -18,6 +18,7 @@ def get_args(): parser.add_argument("--dataset-path", type=str, default=None, help="") parser.add_argument("--api-server", type=str, default=None, help="Specify an api endpoint call to use api mode") parser.add_argument("--api-model-name", type=str, default=None, help="Specify a model name to use api mode") + parser.add_argument("--additional-servers", nargs='+', default=[], help="Specify additional endpoints for load splitting") parser.add_argument("--accuracy", 
action="store_true", help="Run accuracy mode") parser.add_argument("--grpc", action="store_true", help="Enable grpc for api endpoint") parser.add_argument("--batch-grpc", action="store_true", help="Enable batch requests for grpc") @@ -75,6 +76,7 @@ def main(): model_path=args.model_path, api_server=args.api_server, api_model_name=args.api_model_name, + additional_servers=args.additional_servers, grpc=args.grpc, batch_grpc=args.batch_grpc, vllm=args.vllm, diff --git a/language/gpt-j/backend.py b/language/gpt-j/old_backend.py similarity index 100% rename from language/gpt-j/backend.py rename to language/gpt-j/old_backend.py From a505e83e9f6fe141d4698645ec28e551eba83868 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 22 Feb 2024 15:25:28 -0500 Subject: [PATCH 65/75] Minor adjustments --- language/gpt-j/SUT.py | 8 +++++--- language/gpt-j/api-endpoint-artifacts/README.md | 2 ++ language/gpt-j/api-endpoint-artifacts/benchmark.yaml | 2 +- language/gpt-j/user.conf | 7 +------ 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/language/gpt-j/SUT.py b/language/gpt-j/SUT.py index dbfa744c2..a3c524cb4 100644 --- a/language/gpt-j/SUT.py +++ b/language/gpt-j/SUT.py @@ -124,7 +124,7 @@ def __init__(self, if not batch_size: if device == "cpu": # Also applies to API server mode - batch_size = 128 + batch_size = 13368 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. self.batch_size = batch_size @@ -406,9 +406,9 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, api_server=None, additional_servers=[], api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + def __init__(self, model_path=None, api_server=None, additional_servers=[], api_model_name=None, grpc=False, batch_grpc=False, vllm=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): - super().__init__(model_path=model_path, api_server=api_server, additional_servers=additional_servers, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) + super().__init__(model_path=model_path, api_server=api_server, additional_servers=additional_servers, api_model_name=api_model_name, grpc=grpc, vllm=vllm, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) with open(f"{self.model_path}/tokenizer.json", 'r') as token_file: gpt_tokenizer = json.load(token_file) @@ -543,6 +543,8 @@ def async_process_query(self, input_ids_tensor, qitem_id, idx): response_ids = [qitem_id] if self.grpc: output_tokens = self.stream_api_grpc(decoded, response_ids, idx) + elif self.vllm: + output_tokens = self.stream_api_vllm(decoded, response_ids, idx) else: output_tokens = self.stream_api(decoded, response_ids, idx) diff --git a/language/gpt-j/api-endpoint-artifacts/README.md b/language/gpt-j/api-endpoint-artifacts/README.md index 6f5b4a70f..895ee47fa 100644 --- a/language/gpt-j/api-endpoint-artifacts/README.md +++ b/language/gpt-j/api-endpoint-artifacts/README.md @@ -26,6 +26,8 @@ python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-serv ``` (It is the same, just with `--accuracy` removed) + - For multiple endpoints, add `--additional-servers ...` + For the performance benchmark (server), run in the pod: ``` diff --git a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml 
b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml index 253a71600..44c2ba208 100644 --- a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml +++ b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference-gpt:work-check + image: quay.io/meyceoz/mlperf-inference-gpt:v2 resources: requests: memory: 20000Mi diff --git a/language/gpt-j/user.conf b/language/gpt-j/user.conf index a69446c6b..5ca6a1529 100644 --- a/language/gpt-j/user.conf +++ b/language/gpt-j/user.conf @@ -2,9 +2,4 @@ # The key has the format 'model.scenario.key'. Value is mostly int64_t. # Model maybe '*' as wildcard. In that case the value applies to all models. # All times are in milli seconds -*.Offline.min_duration = 60000 -*.Offline.min_query_count = 1000 - -*.Server.target_qps = 0.5 -*.Server.min_duration = 120000 -*.Server.min_query_count = 100 +*.Server.target_qps = 120 From ef6b3db30e309259c37da96bcb69c19a1a7e4127 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 22 Feb 2024 17:37:54 -0500 Subject: [PATCH 66/75] Updated for exact values --- language/gpt-j/SUT.py | 2 +- language/gpt-j/user.conf | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/language/gpt-j/SUT.py b/language/gpt-j/SUT.py index a3c524cb4..a836bdc38 100644 --- a/language/gpt-j/SUT.py +++ b/language/gpt-j/SUT.py @@ -124,7 +124,7 @@ def __init__(self, if not batch_size: if device == "cpu": # Also applies to API server mode - batch_size = 13368 + batch_size = 31192 else: batch_size = 32 # Reduce to 8 if using 4 GPUs, 16 for 8. self.batch_size = batch_size diff --git a/language/gpt-j/user.conf b/language/gpt-j/user.conf index 5ca6a1529..396e20244 100644 --- a/language/gpt-j/user.conf +++ b/language/gpt-j/user.conf @@ -3,3 +3,4 @@ # Model maybe '*' as wildcard. In that case the value applies to all models. 
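The `--additional-servers` option documented above relies on the fan-out added to `process_queries`: the Offline batch is split into one chunk per endpoint with `more_itertools.divide` and the chunks are queried in parallel. A small self-contained sketch of that pattern, with a stub standing in for `query_api_vllm`/`query_api_grpc` and made-up endpoint names:

```python
# Fan-out sketch matching the SUT's cleaned_chunks / api_action_handler logic.
from concurrent.futures import ThreadPoolExecutor
import more_itertools as mit

def fan_out(prompts, servers, query_one_server):
    # divide() keeps order and balances chunk sizes across the endpoints
    chunks = [list(c) for c in mit.divide(len(servers), prompts)]
    with ThreadPoolExecutor(max_workers=len(servers)) as pool:
        per_server = list(pool.map(query_one_server, chunks, range(len(servers))))
    # re-concatenate in submission order, as process_queries does
    return [text for chunk in per_server for text in chunk]

# Stub usage with two hypothetical endpoints:
servers = ["http://vllm-0:8000", "http://vllm-1:8000"]
echo = lambda chunk, idx: [f"[{servers[idx]}] {p}" for p in chunk]
print(fan_out(["a", "b", "c", "d", "e"], servers, echo))
```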
# All times are in milli seconds *.Server.target_qps = 120 +*.Offline.min_query_count = 93576 From 230d495794e9d510c6b4687a64cc426dd2bc273c Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Tue, 27 Feb 2024 14:52:26 -0500 Subject: [PATCH 67/75] Update llama-2 with vllm --- language/gpt-j/SUT.py | 63 +++++----- .../api-endpoint-artifacts/benchmark.yaml | 3 +- .../gpt-j/api-endpoint-artifacts/service.yaml | 20 ++++ .../api-endpoint-artifacts/serving-vllm.yaml | 4 +- .../api-endpoint-artifacts/standalone.yaml | 111 ++++++++++++++++++ language/gpt-j/user.conf | 2 +- language/llama2-70b/SUT.py | 77 +++++++++++- .../api-endpoint-artifacts/README.md | 5 +- .../api-endpoint-artifacts/benchmark.yaml | 2 +- .../api-endpoint-artifacts/model-vllm.yaml | 20 ++++ .../api-endpoint-artifacts/serving-vllm.yaml | 49 ++++++++ language/llama2-70b/main.py | 2 + language/llama2-70b/user.conf | 7 +- 13 files changed, 322 insertions(+), 43 deletions(-) create mode 100644 language/gpt-j/api-endpoint-artifacts/service.yaml create mode 100644 language/gpt-j/api-endpoint-artifacts/standalone.yaml create mode 100644 language/llama2-70b/api-endpoint-artifacts/model-vllm.yaml create mode 100644 language/llama2-70b/api-endpoint-artifacts/serving-vllm.yaml diff --git a/language/gpt-j/SUT.py b/language/gpt-j/SUT.py index a836bdc38..7233676c6 100644 --- a/language/gpt-j/SUT.py +++ b/language/gpt-j/SUT.py @@ -455,8 +455,8 @@ def stream_api(self, input, response_ids, idx): 'model_id': 'GPT-J', 'inputs': input, 'parameters': { - 'max_new_tokens': 1024, - 'min_new_tokens': 1, + 'max_new_tokens': 128, + 'min_new_tokens': 30, 'decoding_method': "GREEDY" }, } @@ -512,31 +512,38 @@ def stream_api_vllm(self, input, response_ids, idx): 'stream': True, 'logprobs': 1 } - - token_cache = [] - s = requests.Session() - first = True - with s.post( - f'{self.api_servers[idx]}/v1/completions', - headers=headers, - json=json_data, - verify=False, - stream=True - ) as resp: - for line in resp.iter_lines(): - if line: - decoded = line.decode() - if decoded.startswith("data") and "[DONE]" not in decoded: - inter = json.loads(decoded[6:])["choices"][0]["logprobs"] - if "top_logprobs" in inter: - token_s = list(inter["top_logprobs"][0].keys())[0] - token = self.gpt_vocab[token_s] - if first: - self.first_token_queue.put((token, response_ids[0])) - first = False - else: - token_cache.append(token) - return token_cache + + while True: + try: + token_cache = [] + s = requests.Session() + first = True + with s.post( + f'{self.api_servers[idx]}/v1/completions', + headers=headers, + json=json_data, + verify=False, + stream=True + ) as resp: + for line in resp.iter_lines(): + if line: + decoded = line.decode() + if decoded.startswith("data") and "[DONE]" not in decoded: + inter = json.loads(decoded[6:])["choices"][0]["logprobs"] + if "top_logprobs" in inter: + token_s = list(inter["top_logprobs"][0].keys())[0] + token = self.gpt_vocab[token_s] + if first: + self.first_token_queue.put((token, response_ids[0])) + first = False + else: + token_cache.append(token) + s.close() + return token_cache + except: + s.close() + print("Connection failure") + def async_process_query(self, input_ids_tensor, qitem_id, idx): decoded = input_ids_tensor @@ -554,7 +561,7 @@ def async_process_query(self, input_ids_tensor, qitem_id, idx): response = [lg.QuerySampleResponse( qitem_id, bi[0], bi[1], n_tokens)] lg.QuerySamplesComplete(response) - sys.exit() + return def process_queries(self): """Processor of the queued queries. 
User may choose to add batching logic """ diff --git a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml index 44c2ba208..b117ddbbc 100644 --- a/language/gpt-j/api-endpoint-artifacts/benchmark.yaml +++ b/language/gpt-j/api-endpoint-artifacts/benchmark.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: Pod metadata: - name: mlperf-inference-gpt + name: mlperf-inference-gpt-2 spec: restartPolicy: Never containers: @@ -9,6 +9,7 @@ spec: image: quay.io/meyceoz/mlperf-inference-gpt:v2 resources: requests: + cpu: 140 memory: 20000Mi volumeMounts: - mountPath: /dev/shm diff --git a/language/gpt-j/api-endpoint-artifacts/service.yaml b/language/gpt-j/api-endpoint-artifacts/service.yaml new file mode 100644 index 000000000..7d9a2c523 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/service.yaml @@ -0,0 +1,20 @@ +kind: Service +apiVersion: v1 +metadata: + name: vllm + labels: + app: vllm +spec: + clusterIP: None + ipFamilies: + - IPv4 + ports: + - name: http + protocol: TCP + port: 8000 + targetPort: http + type: ClusterIP + ipFamilyPolicy: SingleStack + sessionAffinity: None + selector: + app: vllm diff --git a/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml b/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml index 0685a883b..0f7d51556 100644 --- a/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml +++ b/language/gpt-j/api-endpoint-artifacts/serving-vllm.yaml @@ -27,11 +27,11 @@ spec: protocol: TCP resources: # configure as required requests: - cpu: 24 + cpu: 12 memory: 128Gi nvidia.com/gpu: 1 limits: - cpu: 24 + cpu: 12 memory: 128Gi nvidia.com/gpu: 1 volumeMounts: diff --git a/language/gpt-j/api-endpoint-artifacts/standalone.yaml b/language/gpt-j/api-endpoint-artifacts/standalone.yaml new file mode 100644 index 000000000..3ea95ac55 --- /dev/null +++ b/language/gpt-j/api-endpoint-artifacts/standalone.yaml @@ -0,0 +1,111 @@ +kind: Deployment +apiVersion: apps/v1 +metadata: + annotations: + deployment.kubernetes.io/revision: '8' + resourceVersion: '249593' + name: vllm + generation: 24 + namespace: gpt-service + labels: + app: vllm +spec: + replicas: 1 + selector: + matchLabels: + app: vllm + template: + metadata: + creationTimestamp: null + labels: + app: vllm + spec: + restartPolicy: Always + schedulerName: default-scheduler + affinity: {} + terminationGracePeriodSeconds: 120 + securityContext: {} + containers: + - resources: + limits: + cpu: '12' + memory: 128Gi + nvidia.com/gpu: '1' + requests: + cpu: '12' + memory: 128Gi + nvidia.com/gpu: '1' +# readinessProbe: +# httpGet: +# path: /health +# port: http +# scheme: HTTP +# timeoutSeconds: 5 +# periodSeconds: 30 +# successThreshold: 1 +# failureThreshold: 3 + terminationMessagePath: /dev/termination-log + name: server +# livenessProbe: +# httpGet: +# path: /health +# port: http +# scheme: HTTP +# timeoutSeconds: 8 +# periodSeconds: 100 +# successThreshold: 1 +# failureThreshold: 3 + securityContext: + capabilities: + drop: + - ALL + runAsNonRoot: true + allowPrivilegeEscalation: false + seccompProfile: + type: RuntimeDefault + ports: + - name: http + containerPort: 8000 + protocol: TCP + imagePullPolicy: IfNotPresent +# startupProbe: +# httpGet: +# path: /health +# port: http +# scheme: HTTP +# timeoutSeconds: 1 +# periodSeconds: 30 +# successThreshold: 1 +# failureThreshold: 24 + volumeMounts: + - name: models-cache + mountPath: /models-cache + - name: shm + mountPath: /dev/shm + terminationMessagePolicy: File + image: 
'quay.io/rh-aiservices-bu/vllm-openai-ubi9:0.3.1-fix-2939' + args: + - '--model' + - /models-cache/gpt-model-info/ +# - EleutherAI/gpt-j-6b + - '--download-dir' + - /models-cache + - '--dtype' + - float16 + volumes: + - name: models-cache + persistentVolumeClaim: + claimName: vllm-model-cache + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi + dnsPolicy: ClusterFirst + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + strategy: + type: Recreate + revisionHistoryLimit: 10 + progressDeadlineSeconds: 600 diff --git a/language/gpt-j/user.conf b/language/gpt-j/user.conf index 396e20244..ded93b822 100644 --- a/language/gpt-j/user.conf +++ b/language/gpt-j/user.conf @@ -2,5 +2,5 @@ # The key has the format 'model.scenario.key'. Value is mostly int64_t. # Model maybe '*' as wildcard. In that case the value applies to all models. # All times are in milli seconds -*.Server.target_qps = 120 +*.Server.target_qps = 132 *.Offline.min_query_count = 93576 diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index fcc302f53..a3a8d3a05 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -98,6 +98,7 @@ def __init__(self, additional_servers=[], grpc=False, batch_grpc=False, + vllm=False, dtype="bfloat16", device="cpu", batch_size=None, @@ -117,6 +118,9 @@ def __init__(self, self.api_model_name = api_model_name self.grpc = grpc self.batch_grpc = batch_grpc + self.vllm = vllm + if self.vllm and (self.grpc or self.batch_grpc): + sys.exit("vllm does not support grpc") self.device = device if not batch_size: @@ -203,6 +207,26 @@ def query_api(self, input, idx): print("connection failure") return json.loads(response.text)["generated_text"] + def query_api_vllm(self, inputs, idx): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model': '/mnt/models/', + 'prompt': inputs, + 'max_tokens': 128, + 'temperature': 0, + } + + response_code = 0 + while response_code != 200: + try: + response = requests.post(f'{self.api_servers[idx]}/v1/completions', headers=headers, json=json_data, verify=False) + response_code = response.status_code + except: + print("connection failure") + return [resp["text"] for resp in json.loads(response.text)["choices"]] def query_api_grpc(self, input, idx): resp = self.grpc_clients[idx].make_request([input], model_id=self.api_model_name) @@ -219,6 +243,8 @@ def api_action_handler(self, chunk, server_idx): else: with ThreadPoolExecutor(max_workers=len(chunk)) as executor: output = list(executor.map(self.query_api_grpc,chunk, repeat(server_idx))) + elif self.vllm: + output = self.query_api_vllm(chunk, server_idx) else: with ThreadPoolExecutor(max_workers=len(chunk)) as executor: output = list(executor.map(self.query_api,chunk, repeat(server_idx))) @@ -398,9 +424,9 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, api_server=None, additional_servers=[], api_model_name=None, grpc=False, batch_grpc=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): + def __init__(self, model_path=None, api_server=None, additional_servers=[], api_model_name=None, grpc=False, batch_grpc=False, vllm=False, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): - super().__init__(model_path=model_path, api_server=api_server, additional_servers=additional_servers, api_model_name=api_model_name, grpc=grpc, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, 
workers=workers) + super().__init__(model_path=model_path, api_server=api_server, additional_servers=additional_servers, api_model_name=api_model_name, grpc=grpc, vllm=vllm, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) with open(f"{self.model_path}/tokenizer.json", 'r') as token_file: llama_tokenizer = json.load(token_file) @@ -490,12 +516,59 @@ def stream_api_grpc(self, input, response_ids, idx): else: token_cache.append(token) return token_cache + + def stream_api_vllm(self, input, response_ids, idx): + headers = { + 'Content-Type': 'application/json', + } + + json_data = { + 'model': '/mnt/models/', + 'prompt': input, + 'max_tokens': 128, + 'temperature': 0, + 'stream': True, + 'logprobs': 1 + } + + while True: + try: + token_cache = [] + s = requests.Session() + first = True + with s.post( + f'{self.api_servers[idx]}/v1/completions', + headers=headers, + json=json_data, + verify=False, + stream=True + ) as resp: + for line in resp.iter_lines(): + if line: + decoded = line.decode() + if decoded.startswith("data") and "[DONE]" not in decoded: + inter = json.loads(decoded[6:])["choices"][0]["logprobs"] + if "top_logprobs" in inter: + token_s = list(inter["top_logprobs"][0].keys())[0] + token = self.gpt_vocab[token_s] + if first: + self.first_token_queue.put((token, response_ids[0])) + first = False + else: + token_cache.append(token) + s.close() + return token_cache + except: + s.close() + print("Connection failure") def async_process_query(self, input_ids_tensor, qitem_id, idx): decoded = self.tokenizer.decode(input_ids_tensor[0]) response_ids = [qitem_id] if self.grpc: output_tokens = self.stream_api_grpc(decoded, response_ids, idx) + elif self.vllm: + output_tokens = self.stream_api_vllm(decoded, response_ids, idx) else: output_tokens = self.stream_api(decoded, response_ids, idx) diff --git a/language/llama2-70b/api-endpoint-artifacts/README.md b/language/llama2-70b/api-endpoint-artifacts/README.md index 24b392d33..14e5987ab 100644 --- a/language/llama2-70b/api-endpoint-artifacts/README.md +++ b/language/llama2-70b/api-endpoint-artifacts/README.md @@ -4,8 +4,9 @@ Prerequisites: - Install the OpenShift AI model serving stack - Add your AWS credentials to `secret.yaml` access the model files - Apply `secret.yaml`, `sa.yaml` - - FOR CAIKIT: Apply `serving-runtime.yaml`, then finally `model.yaml` - - FOR TGIS STANDALONE: Apply `serving-tgis.yaml`, then finally `model-tgis.yaml` + - FOR CAIKIT+TGIS: Apply `serving-runtime.yaml`, then finally `model.yaml` + - FOR TGIS: Apply `serving-tgis.yaml`, then finally `model-tgis.yaml` + - FOR VLLM: Apply `serving-vllm.yaml`, then finally `model-vllm.yaml` (NOTE: we'll see if further instructions necessary based on performance) - Create a benchmark pod using `benchmark.yaml` In the pod, before any benchmark, first run `cd inference/language/llama2-70b` diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index df258ae55..77f8c3440 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v9-multi + image: quay.io/meyceoz/mlperf-inference:vllm-test resources: requests: memory: 20000Mi diff --git a/language/llama2-70b/api-endpoint-artifacts/model-vllm.yaml b/language/llama2-70b/api-endpoint-artifacts/model-vllm.yaml new 
file mode 100644 index 000000000..01459b7ce --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/model-vllm.yaml @@ -0,0 +1,20 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + annotations: + serving.knative.openshift.io/enablePassthrough: "true" + sidecar.istio.io/inject: "true" + sidecar.istio.io/rewriteAppHTTPProbers: "true" + name: llama-2-isvc +spec: + predictor: + minReplicas: 1 + maxReplicas: 1 + #apiVersion: serving.kserve.io/v1alpha2 + serviceAccountName: sa + #timeout: 240 + model: + modelFormat: + name: pytorch + runtime: vllm + storageUri: s3://mlperf-inference-models/Llama-2-70b-chat-hf/ \ No newline at end of file diff --git a/language/llama2-70b/api-endpoint-artifacts/serving-vllm.yaml b/language/llama2-70b/api-endpoint-artifacts/serving-vllm.yaml new file mode 100644 index 000000000..f14b1f78e --- /dev/null +++ b/language/llama2-70b/api-endpoint-artifacts/serving-vllm.yaml @@ -0,0 +1,49 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +labels: + opendatahub.io/dashboard: "true" +metadata: + annotations: + openshift.io/display-name: vLLM + name: vllm +spec: + builtInAdapter: + modelLoadingTimeoutMillis: 90000 + containers: + - args: + - --model + - /mnt/models/ + - --download-dir + - /models-cache + - --port + - "8080" + - --dtype + - float16 + - --tensor-parallel-size + - "4" + image: quay.io/rh-aiservices-bu/vllm-openai-ubi9:0.3.1-fix-2939 + name: kserve-container + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + resources: # configure as required + requests: + cpu: 16 + memory: 512Gi + nvidia.com/gpu: 4 + limits: + cpu: 16 + memory: 512Gi + nvidia.com/gpu: 4 + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + multiModel: false + supportedModelFormats: + - autoSelect: true + name: pytorch \ No newline at end of file diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py index b526d805c..5f26e3bcc 100644 --- a/language/llama2-70b/main.py +++ b/language/llama2-70b/main.py @@ -22,6 +22,7 @@ def get_args(): parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") parser.add_argument("--grpc", action="store_true", help="Enable grpc for api endpoint") parser.add_argument("--batch-grpc", action="store_true", help="Enable batch requests for grpc") + parser.add_argument("--vllm", action="store_true", help="Switch runtime to vllm for api endpoint") parser.add_argument("--dtype", type=str, default="float32", help="data type of the model, choose from float16, bfloat16 and float32") parser.add_argument("--device", type=str, choices=["cpu", "cuda:0"], default="cpu", help="device to use") parser.add_argument("--audit-conf", type=str, default="audit.conf", help="audit config for LoadGen settings during compliance runs") @@ -78,6 +79,7 @@ def main(): additional_servers=args.additional_servers, grpc=args.grpc, batch_grpc=args.batch_grpc, + vllm=args.vllm, dtype=args.dtype, dataset_path=args.dataset_path, total_sample_count=args.total_sample_count, diff --git a/language/llama2-70b/user.conf b/language/llama2-70b/user.conf index 945082fe9..6a9b4fb6d 100644 --- a/language/llama2-70b/user.conf +++ b/language/llama2-70b/user.conf @@ -3,11 +3,6 @@ # Model maybe '*' as wildcard. In that case the value applies to all models. 
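For the Server scenario, first-token latency is measured by streaming from the same completions route: `stream_api_vllm` reads the SSE events, recovers each emitted token string from `logprobs.top_logprobs`, reports the first one immediately, and buffers the rest. A condensed sketch of that loop, with `api_server`, the prompt, and the `on_first_token` callback as placeholders (the SUT pushes the first token to `first_token_queue` for `lg.FirstTokenComplete`):

```python
# Streaming sketch of the Server-scenario path (stream_api_vllm).
import json
import requests

def stream_completion(api_server, prompt, on_first_token, max_tokens=1024):
    payload = {
        "model": "/mnt/models/",
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0,
        "stream": True,     # server emits "data: {...}" SSE lines per token
        "logprobs": 1,      # each event then carries the emitted token text
    }
    rest, first = [], True
    with requests.post(f"{api_server}/v1/completions",
                       headers={"Content-Type": "application/json"},
                       json=payload, verify=False, stream=True) as resp:
        for line in resp.iter_lines():
            if not line:
                continue
            decoded = line.decode()
            if decoded.startswith("data") and "[DONE]" not in decoded:
                logprobs = json.loads(decoded[6:])["choices"][0]["logprobs"]
                if logprobs and logprobs.get("top_logprobs"):
                    token_text = next(iter(logprobs["top_logprobs"][0]))
                    if first:
                        on_first_token(token_text)   # e.g. first_token_queue.put(...)
                        first = False
                    else:
                        rest.append(token_text)
    return rest
```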
# All times are in milli seconds # -*.Offline.min_duration = 600000 -*.Offline.min_query_count = 2000 - -*.Server.target_qps = 0.5 -*.Server.min_duration = 120000 -*.Server.min_query_count = 100 +*.Server.target_qps = 20 llama2-70b.Server.sample_concatenate_permutation = 1 \ No newline at end of file From 48a439689c43be073d546854d9c903ead286ad89 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Tue, 27 Feb 2024 16:11:54 -0500 Subject: [PATCH 68/75] Fixed output cap bug --- language/llama2-70b/SUT.py | 4 ++-- language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index a3a8d3a05..5b0faffb9 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -215,7 +215,7 @@ def query_api_vllm(self, inputs, idx): json_data = { 'model': '/mnt/models/', 'prompt': inputs, - 'max_tokens': 128, + 'max_tokens': 1024, 'temperature': 0, } @@ -525,7 +525,7 @@ def stream_api_vllm(self, input, response_ids, idx): json_data = { 'model': '/mnt/models/', 'prompt': input, - 'max_tokens': 128, + 'max_tokens': 1024, 'temperature': 0, 'stream': True, 'logprobs': 1 diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 77f8c3440..e4edf790a 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:vllm-test + image: quay.io/meyceoz/mlperf-inference:vllm-fix resources: requests: memory: 20000Mi From 57e241dbad4d472ba4b38b9274bfd45cf6984f8a Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Tue, 27 Feb 2024 17:53:20 -0500 Subject: [PATCH 69/75] Fix llama server bug --- language/llama2-70b/SUT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 5b0faffb9..b470e00ca 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -550,7 +550,7 @@ def stream_api_vllm(self, input, response_ids, idx): inter = json.loads(decoded[6:])["choices"][0]["logprobs"] if "top_logprobs" in inter: token_s = list(inter["top_logprobs"][0].keys())[0] - token = self.gpt_vocab[token_s] + token = self.llama_vocab[token_s] if first: self.first_token_queue.put((token, response_ids[0])) first = False From 6ef5023523038bac6d0c23f1e3a6d3e0b477b3af Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Tue, 27 Feb 2024 17:54:08 -0500 Subject: [PATCH 70/75] Added v10 image for llama --- language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index e4edf790a..81aeee7e7 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:vllm-fix + image: quay.io/meyceoz/mlperf-inference:v10 resources: requests: memory: 20000Mi From 3bc09fa62647849210629e8e4dfee7bf14c11a64 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 28 Feb 2024 17:41:38 -0500 Subject: [PATCH 71/75] Updated token gen count for offline llama --- language/llama2-70b/SUT.py | 5 +++-- 
language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index b470e00ca..c393b0170 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -327,8 +327,9 @@ def process_queries(self): query_id_list=query_ids) for i in range(len(qitem)): - n_tokens = processed_output[i].shape[0] - response_array = array.array("B", processed_output[i].tobytes()) + unpadded = np.delete(processed_output[i], np.where(processed_output[i] == 2)) + n_tokens = unpadded.shape[0] + response_array = array.array("B", unpadded.tobytes()) bi = response_array.buffer_info() response = [lg.QuerySampleResponse(qitem[i].id, bi[0], bi[1], n_tokens)] lg.QuerySamplesComplete(response) diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 81aeee7e7..fc74f5545 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v10 + image: quay.io/meyceoz/mlperf-inference:v11 resources: requests: memory: 20000Mi From 38e3aeafdfcc5cdb74a578e455d888e0b5e5862b Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 28 Feb 2024 18:36:28 -0500 Subject: [PATCH 72/75] Updated READMEs --- language/llama2-70b/README.md | 6 ++++ .../api-endpoint-artifacts/README.md | 29 ++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index b539b3e3c..cd3af3184 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -1,3 +1,9 @@ +# Red Hat OpenShift Model Serving Stack Implementation + +Please see the `README.md` located in the `api-endpoint-artifacts` dir for instructions on running the model serving stack implementation. + +Below is the original guide for the reference implementation, which we retained backwards compatibility with: + # Reference Implementation for llama2-70b **Basic implementation for llama2-70b. 
Few noteworthy items:** diff --git a/language/llama2-70b/api-endpoint-artifacts/README.md b/language/llama2-70b/api-endpoint-artifacts/README.md index 14e5987ab..376601f55 100644 --- a/language/llama2-70b/api-endpoint-artifacts/README.md +++ b/language/llama2-70b/api-endpoint-artifacts/README.md @@ -6,11 +6,38 @@ Prerequisites: - Apply `secret.yaml`, `sa.yaml` - FOR CAIKIT+TGIS: Apply `serving-runtime.yaml`, then finally `model.yaml` - FOR TGIS: Apply `serving-tgis.yaml`, then finally `model-tgis.yaml` - - FOR VLLM: Apply `serving-vllm.yaml`, then finally `model-vllm.yaml` (NOTE: we'll see if further instructions necessary based on performance) + - FOR VLLM (Best Performing): Apply `serving-vllm.yaml`, then finally `model-vllm.yaml` - Create a benchmark pod using `benchmark.yaml` In the pod, before any benchmark, first run `cd inference/language/llama2-70b` +## VLLM INSTRUCTIONS +For the full accuracy benchmark (offline), run in the pod: +``` +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --accuracy --vllm --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +``` +You can then run the same evaluation/consolidation scripts as the regular benchmark +Example API host: `https://llama-2-70b-chat-isvc-predictor-llama-service.apps.h100serving.perf.lab.eng.bos.redhat.com` + + +For the performance benchmark (offline), run in the pod: +``` +python3 -u main.py --scenario Offline --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --vllm --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir offline-logs --dtype float32 --device cpu 2>&1 | tee offline_performance_log.log +``` +(It is the same, just with `--accuracy` removed) + + - For multiple endpoints, add `--additional-servers ...` + + +For the performance benchmark (server), run in the pod: +``` +python3 -u main.py --scenario Server --model-path ${CHECKPOINT_PATH} --api-server --api-model-name Llama-2-70b-chat-hf --mlperf-conf mlperf.conf --vllm --user-conf user.conf --total-sample-count 24576 --dataset-path ${DATASET_PATH} --output-log-dir server-logs --dtype float32 --device cpu 2>&1 | tee server_performance_log.log +``` +(Configure target qps in `user.conf`) + + +NOTE: Hyperparams are currently configured for 8xH100 + ## STANDALONE TGIS INSTRUCTIONS For the full accuracy benchmark (offline), run in the pod: ``` From 84f1aac3d4fa68eccaa006e6f2e33a7406c89595 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 29 Feb 2024 14:28:15 -0500 Subject: [PATCH 73/75] Updated server first token inclusion in first query --- language/llama2-70b/SUT.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index c393b0170..d26f3bb0f 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -500,8 +500,7 @@ def stream_api(self, input, response_ids, idx): if first: self.first_token_queue.put((token, response_ids[0])) first = False - else: - token_cache.append(token) + token_cache.append(token) return token_cache def stream_api_grpc(self, input, response_ids, idx): @@ -514,8 +513,7 @@ def stream_api_grpc(self, input, response_ids, idx): if first: self.first_token_queue.put((token, response_ids[0])) first = False - else: - 
token_cache.append(token) + token_cache.append(token) return token_cache def stream_api_vllm(self, input, response_ids, idx): @@ -555,10 +553,10 @@ def stream_api_vllm(self, input, response_ids, idx): if first: self.first_token_queue.put((token, response_ids[0])) first = False - else: - token_cache.append(token) + token_cache.append(token) s.close() - return token_cache + if token_cache: + return token_cache except: s.close() print("Connection failure") @@ -574,6 +572,10 @@ def async_process_query(self, input_ids_tensor, qitem_id, idx): output_tokens = self.stream_api(decoded, response_ids, idx) n_tokens = len(output_tokens) + if n_tokens <= 1: + print("WARNING: caught low token count") + print(input_ids_tensor) + print(output_tokens) response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) bi = response_array.buffer_info() response = [lg.QuerySampleResponse( From c7eeef946ebeb0587a79a7ced337e02d7f3ff2c9 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 29 Feb 2024 14:30:47 -0500 Subject: [PATCH 74/75] Updated image in yaml --- language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index fc74f5545..320045ea3 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v11 + image: quay.io/meyceoz/mlperf-inference:v12 resources: requests: memory: 20000Mi From 8ab599868793ac84c67e2d71ab5c516e4d23a087 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 29 Feb 2024 16:46:41 -0500 Subject: [PATCH 75/75] Fix first token dtype --- language/llama2-70b/SUT.py | 2 +- language/llama2-70b/api-endpoint-artifacts/benchmark.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index d26f3bb0f..ff263e961 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -460,7 +460,7 @@ def process_first_tokens(self): first_tokens, response_id = first_token_item - response_data = array.array("B", np.array(first_tokens, np.float32).tobytes()) + response_data = array.array("B", np.array(first_tokens, np.int32).tobytes()) bi = response_data.buffer_info() response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])] lg.FirstTokenComplete(response) diff --git a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml index 320045ea3..5959ca63d 100644 --- a/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml +++ b/language/llama2-70b/api-endpoint-artifacts/benchmark.yaml @@ -6,7 +6,7 @@ spec: restartPolicy: Never containers: - name: mlperf-env - image: quay.io/meyceoz/mlperf-inference:v12 + image: quay.io/meyceoz/mlperf-inference:v13 resources: requests: memory: 20000Mi
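Patches 71, 73, and 75 above all adjust the same detail: the byte buffer and token count the SUT hands back to LoadGen. After these commits the convention is to strip the id-2 padding before counting tokens, keep the first token in the full response, and serialize the buffer as int32 rather than float32. The helper below is a minimal standalone restatement of that convention, assuming only NumPy; the function name and the demo values are illustrative and do not appear in the repository.

```
# Illustrative helper (not repository code): the packing convention the
# SUT.py patches above converge on.
import array

import numpy as np

PAD_ID = 2  # the id deleted by np.delete(...) in the Offline path above


def pack_for_loadgen(token_ids):
    """Return (buffer_address, buffer_size, n_tokens) for a list of token ids."""
    tokens = np.asarray(token_ids, dtype=np.int32)
    # SUT.py uses np.delete + np.where; boolean masking does the same thing here.
    unpadded = tokens[tokens != PAD_ID]
    n_tokens = unpadded.shape[0]
    # LoadGen reads raw bytes, so the int32 cast above is the point of the
    # first-token dtype fix in the final patch.
    response_array = array.array("B", unpadded.tobytes())
    address, size = response_array.buffer_info()
    # In SUT.py this triple feeds lg.QuerySampleResponse(qitem_id, address, size, n_tokens).
    return address, size, n_tokens


if __name__ == "__main__":
    _, size, n_tokens = pack_for_loadgen([523, 9092, 13, 2, 2, 2])
    print(n_tokens, size)  # 3 tokens, 12 bytes (3 x 4-byte int32)
```

Because LoadGen's accuracy log stores exactly these bytes, the float32-to-int32 change in the last commit is not cosmetic: the dtype decides whether downstream accuracy tooling can decode the logged first tokens as valid ids.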
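The `stream_api_vllm` path added earlier in this series recovers token ids from vLLM's OpenAI-style `/v1/completions` stream: each `data:` chunk is parsed as JSON, the first key of the leading `top_logprobs` entry is taken as the token string, and that string is looked up in the SUT's `llama_vocab` table; the first id is pushed to `first_token_queue` for the first-token (TTFT) report, and after patch 73 it is also kept in `token_cache` along with the rest. Below is a sketch of just that parsing step so it can be exercised without a live endpoint; the sample chunk and the two-entry vocabulary are fabricated stand-ins shaped only by the fields the SUT code reads.

```
# Illustrative parser (not repository code): mirrors how stream_api_vllm()
# turns one streamed completions line into a token id.
import json

# Stand-in for the SUT's llama_vocab token-string -> id table.
toy_vocab = {"▁Hello": 15043, "▁world": 3186}


def token_from_stream_line(raw_line, vocab):
    """Return the token id carried by one `data: ...` SSE line, or None."""
    decoded = raw_line.decode()
    if not decoded.startswith("data") or "[DONE]" in decoded:
        return None
    payload = json.loads(decoded[6:])  # drop the leading "data: "
    logprobs = payload["choices"][0]["logprobs"]
    if not logprobs.get("top_logprobs"):
        return None
    token_s = list(logprobs["top_logprobs"][0].keys())[0]
    return vocab[token_s]


if __name__ == "__main__":
    body = {"choices": [{"logprobs": {"top_logprobs": [{"▁Hello": -0.01}]}}]}
    sample = ("data: " + json.dumps(body)).encode()
    print(token_from_stream_line(sample, toy_vocab))  # 15043
```

In the real SUT this logic sits inside a retrying `requests` streaming session; the session handling is left out here so the token-recovery step stays visible on its own.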