Add low latency checks to submission checker
pgmpablo157321 committed Jan 6, 2025
1 parent b9f22d6 commit 726be53
Showing 4 changed files with 65 additions and 18 deletions.
9 changes: 8 additions & 1 deletion language/llama2-70b/user.conf
@@ -10,4 +10,11 @@
*.Server.min_duration = 120000
*.Server.min_query_count = 100

llama2-70b.Server.sample_concatenate_permutation = 1

# Target latencies for the low-latency setting. To run in low-latency mode:
# 1. Uncomment this set of latencies
# 2. Comment out the set of latencies in mlperf.conf
# llama2-70b.Server.target_latency = 0
# llama2-70b.Server.ttft_latency = 450
# llama2-70b.Server.tpot_latency = 40
7 changes: 7 additions & 0 deletions loadgen/mlperf.conf
@@ -75,6 +75,13 @@ llama2-70b.Server.target_latency = 0
llama2-70b.Server.ttft_latency = 2000
llama2-70b.Server.tpot_latency = 200

# Target latencies for the low-latency setting. To run in low-latency mode:
# 1. Uncomment this set of latencies
# 2. Comment out the set of latencies above
# llama2-70b.Server.target_latency = 0
# llama2-70b.Server.ttft_latency = 450
# llama2-70b.Server.tpot_latency = 40

mixtral-8x7b.Server.target_latency = 0
mixtral-8x7b.Server.ttft_latency = 2000
mixtral-8x7b.Server.tpot_latency = 200
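
The latencies above are expressed in milliseconds, while the submission checker's LLM_LATENCY_LIMITS table (later in this diff) stores the same low-latency thresholds in nanoseconds. A minimal sketch of that conversion; the variable names here are illustrative, not from the repository:

```python
# Conf values are milliseconds; LLM_LATENCY_LIMITS entries are nanoseconds
# (450 * 1000000 and 40 * 1000000 in the checker).
MS_TO_NS = 1_000_000

low_latency_limits = {
    "ttft": 450 * MS_TO_NS,  # 450 ms time-to-first-token, 99th percentile
    "tpot": 40 * MS_TO_NS,   # 40 ms time-per-output-token, 99th percentile
}
print(low_latency_limits)  # {'ttft': 450000000, 'tpot': 40000000}
```
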
12 changes: 12 additions & 0 deletions tools/submission/generate_final_report.py
@@ -63,6 +63,14 @@ def main():
df.rename(columns={"Model": "UsedModel"}, inplace=True)
df.rename(columns={"MlperfModel": "Model"}, inplace=True)

# Replace low latency names
def SubLowLatencyModel(model, mlperf_model):
if model in ["llama2-70b-low-latency-99", "llama2-70b-low-latency-99.9"]:
return model
else:
return mlperf_model
df["Model"] = df.apply(lambda x: SubLowLatencyModel(x["UsedModel"], x["Model"]), axis = 1)

# fix issues with raw data
df["host_processor_core_count"] = df["host_processor_core_count"].apply(
lambda x: 2 if x == "2 (big); 4 (LITTLE)" else x
@@ -147,6 +155,8 @@ def main():
"stable-diffusion-xl",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-low-latency-99",
"llama2-70b-low-latency-99.9",
"mixtral-8x7b",
],
["SingleStream", "MultiStream", "Server", "Offline"],
@@ -209,6 +219,8 @@ def main():
"stable-diffusion-xl": ["Server", "Offline"],
"llama2-70b-99": ["Server", "Offline"],
"llama2-70b-99.9": ["Server", "Offline"],
"llama2-70b-low-latency-99": ["Server", "Offline"],
"llama2-70b-low-latency-99.9": ["Server", "Offline"],
"mixtral-8x7b": ["Server", "Offline"],
"rgat": ["Offline"],
"llama3.1-405b": ["Offline", "Server"]
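
To illustrate the SubLowLatencyModel mapping added above, here is a minimal, self-contained sketch with made-up rows; the real script applies the same logic to the results frame after renaming Model to UsedModel and MlperfModel to Model. The lower-case helper name is illustrative only:

```python
import pandas as pd

# Illustrative rows only; the real frame comes from the results summary.
df = pd.DataFrame({
    "UsedModel": ["llama2-70b-low-latency-99", "llama2-70b-99"],
    "Model": ["llama2-70b-99", "llama2-70b-99"],  # MlperfModel after renaming
})

def sub_low_latency_model(model, mlperf_model):
    # Keep the low-latency name in the final report; otherwise use the mapped model.
    if model in ["llama2-70b-low-latency-99", "llama2-70b-low-latency-99.9"]:
        return model
    return mlperf_model

df["Model"] = df.apply(lambda x: sub_low_latency_model(x["UsedModel"], x["Model"]), axis=1)
print(df["Model"].tolist())
# ['llama2-70b-low-latency-99', 'llama2-70b-99']
```
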
55 changes: 38 additions & 17 deletions tools/submission/submission_checker.py
@@ -194,7 +194,6 @@
"ssd-resnet34": "retinanet",
"mobilenet": "resnet",
"resnet50": "resnet",
"llama3_1-405b": "llama3.1-405b"
},
"seeds": {
"qsl_rng_seed": 3066443479025735752,
@@ -439,8 +438,12 @@
# not really needed
"model_mapping": {
# map model names to the official mlperf model class
"ssd-resnet34": "retinanet",
"mobilenet": "resnet",
"resnet50": "resnet",
"llama2-70b-low-latency-99": "llama2-70b-99",
"llama2-70b-low-latency-99.9": "llama2-70b-99.9",
"llama3_1-405b": "llama3.1-405b",
},
"seeds": {
# TODO: Update random seeds
@@ -666,9 +669,11 @@

LLM_LATENCY_LIMITS = {
"llama2-70b-99": {
"low-latency": {"ttft": 450 * 1000000, "tpot": 40 * 1000000},
"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}
},
"llama2-70b-99.9": {
"low-latency": {"ttft": 450 * 1000000, "tpot": 40 * 1000000},
"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}
},
"mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}},
@@ -883,6 +888,11 @@ def get_mlperf_model(self, model, extra_model_mapping=None):
# map again
mlperf_model = self.base["model_mapping"].get(model, model)
return mlperf_model

def get_llm_constraint(self, model_name):
if model_name in ["llama2-70b-low-latency-99", "llama2-70b-low-latency-99.9"]:
return "low-latency"
return None

def get_required(self, model):
model = self.get_mlperf_model(model)
@@ -1235,29 +1245,35 @@ def check_accuracy_dir(config, model, path, verbose):
return is_valid, result_acc


def extra_check_llm(mlperf_log, scenario, model):
def extra_check_llm(mlperf_log, scenario, model, llm_constraint):
if mlperf_log["requested_use_token_latencies"]:
if scenario == "Offline":
# For offline no further checks are necessary
return None, True
return True
else:
for constraint, limits in LLM_LATENCY_LIMITS[model].items():
if (
mlperf_log["result_first_token_99.00_percentile_latency_ns"]
< limits["ttft"]
and mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
< limits["tpot"]
):
return constraint, True
if llm_constraint is None:
llm_constraint = "conversational"
limits = LLM_LATENCY_LIMITS[model][llm_constraint]
if (
mlperf_log["result_first_token_99.00_percentile_latency_ns"]
< limits["ttft"]
and mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
< limits["tpot"]
):
return True
else:
log.error(
f"use_token_latencies flag needs to be enabled for Llama2 benchmark")
return None, False
return False

log.error(
f'Failed Llama2 extra check for TTFT and TPOT. TTFT 99-tile: {mlperf_log["result_first_token_99.00_percentile_latency_ns"]}, TPOT 99-tile: {mlperf_log["result_time_per_output_token_99.00_percentile_ns"]}'
'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
mlperf_log["result_first_token_99.00_percentile_latency_ns"],
mlperf_log["result_time_per_output_token_99.00_percentile_ns"],
limits["ttft"],
limits["tpot"]
)
return None, False
return False


def get_performance_metric(
@@ -1295,7 +1311,7 @@ def get_performance_metric(


def check_performance_dir(
config, model, path, scenario_fixed, division, system_json):
config, model, path, scenario_fixed, division, system_json, llm_constraint=None):
is_valid = False
rt = {}

@@ -1327,8 +1343,9 @@

if model in ["llama2-70b-99", "llama2-70b-99.9",
"mixtral-8x7b", "llama3.1-405b"]:
llama_constraint, is_valid = extra_check_llm(
mlperf_log, scenario_fixed, model)
llm_is_valid = extra_check_llm(
mlperf_log, scenario_fixed, model, llm_constraint)
is_valid = (llm_is_valid and is_valid)

latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
latency_mean = mlperf_log["result_mean_latency_ns"]
@@ -2168,6 +2185,7 @@ def log_result(
mlperf_model = config.get_mlperf_model(
model_name, extra_model_mapping
)
llm_constraint = config.get_llm_constraint(model_name)

if is_closed_or_network and mlperf_model not in config.models:
# for closed/network divisions we want the model name to match.
@@ -2385,6 +2403,7 @@ def log_result(
scenario_fixed,
division,
system_json,
llm_constraint
)
if is_inferred:
inferred = 1
@@ -3002,6 +3021,8 @@ def check_compliance_dir(
compliance_perf_dir = os.path.join(
compliance_dir, test, "performance", "run_1"
)
# WARNING: LLMs currently only have TEST06, so llm_constraint does not
# need to be passed to this call
compliance_perf_valid, r, is_inferred = check_performance_dir(
config, model, compliance_perf_dir, scenario, division, system_json
)
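
Taken together, the checker now derives an optional llm_constraint from the submission's model name and threads it through check_performance_dir into extra_check_llm, which now returns a single boolean. A minimal sketch of the new behavior, assuming submission_checker.py is importable (e.g. run from tools/submission/) and using a plain dict in place of the real MLPerf log object:

```python
from submission_checker import extra_check_llm

# Fake log values for illustration; real runs read these from the detail log.
fake_log = {
    "requested_use_token_latencies": True,
    "result_first_token_99.00_percentile_latency_ns": 400 * 1_000_000,   # 400 ms TTFT
    "result_time_per_output_token_99.00_percentile_ns": 35 * 1_000_000,  # 35 ms TPOT
}

# With the "low-latency" constraint the limits are 450 ms / 40 ms, so this passes.
print(extra_check_llm(fake_log, "Server", "llama2-70b-99", "low-latency"))  # True

# Without a constraint the check defaults to "conversational" (2000 ms / 200 ms).
print(extra_check_llm(fake_log, "Server", "llama2-70b-99", None))           # True
```
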
