Add low latency checks to submission checker
pgmpablo157321 committed Jan 6, 2025
1 parent b9f22d6 commit 726be53
Showing 4 changed files with 65 additions and 18 deletions.
9 changes: 8 additions & 1 deletion language/llama2-70b/user.conf
@@ -10,4 +10,11 @@
*.Server.min_duration = 120000
*.Server.min_query_count = 100

llama2-70b.Server.sample_concatenate_permutation = 1

# Target latencies for the low-latency setting. To run in low-latency mode:
# 1. Uncomment this set of latencies
# 2. Comment out the set of latencies in mlperf.conf
# llama2-70b.Server.target_latency = 0
# llama2-70b.Server.ttft_latency = 450
# llama2-70b.Server.tpot_latency = 40
7 changes: 7 additions & 0 deletions loadgen/mlperf.conf
@@ -75,6 +75,13 @@ llama2-70b.Server.target_latency = 0
llama2-70b.Server.ttft_latency = 2000
llama2-70b.Server.tpot_latency = 200

# Target latencies for the low-latency setting. To run in low-latency mode:
# 1. Uncomment this set of latencies
# 2. Comment out the set of latencies above
# llama2-70b.Server.target_latency = 0
# llama2-70b.Server.ttft_latency = 450
# llama2-70b.Server.tpot_latency = 40

mixtral-8x7b.Server.target_latency = 0
mixtral-8x7b.Server.ttft_latency = 2000
mixtral-8x7b.Server.tpot_latency = 200
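
The latencies above are expressed in milliseconds, while the submission checker's LLM_LATENCY_LIMITS table (later in this diff) stores the same low-latency thresholds in nanoseconds. A minimal sketch of that conversion; the variable names here are illustrative, not from the repository:

```python
# Conf values are milliseconds; LLM_LATENCY_LIMITS entries are nanoseconds
# (450 * 1000000 and 40 * 1000000 in the checker).
MS_TO_NS = 1_000_000

low_latency_limits = {
    "ttft": 450 * MS_TO_NS,  # 450 ms time-to-first-token, 99th percentile
    "tpot": 40 * MS_TO_NS,   # 40 ms time-per-output-token, 99th percentile
}
print(low_latency_limits)  # {'ttft': 450000000, 'tpot': 40000000}
```
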
12 changes: 12 additions & 0 deletions tools/submission/generate_final_report.py
@@ -63,6 +63,14 @@ def main():
df.rename(columns={"Model": "UsedModel"}, inplace=True)
df.rename(columns={"MlperfModel": "Model"}, inplace=True)

# Replace low latency names
def SubLowLatencyModel(model, mlperf_model):
if model in ["llama2-70b-low-latency-99", "llama2-70b-low-latency-99.9"]:
return model
else:
return mlperf_model
df["Model"] = df.apply(lambda x: SubLowLatencyModel(x["UsedModel"], x["Model"]), axis = 1)

# fix issues with raw data
df["host_processor_core_count"] = df["host_processor_core_count"].apply(
lambda x: 2 if x == "2 (big); 4 (LITTLE)" else x
@@ -147,6 +155,8 @@ def main():
"stable-diffusion-xl",
"llama2-70b-99",
"llama2-70b-99.9",
"llama2-70b-low-latency-99",
"llama2-70b-low-latency-99.9",
"mixtral-8x7b",
],
["SingleStream", "MultiStream", "Server", "Offline"],
@@ -209,6 +219,8 @@ def main():
"stable-diffusion-xl": ["Server", "Offline"],
"llama2-70b-99": ["Server", "Offline"],
"llama2-70b-99.9": ["Server", "Offline"],
"llama2-70b-low-latency-99": ["Server", "Offline"],
"llama2-70b-low-latency-99.9": ["Server", "Offline"],
"mixtral-8x7b": ["Server", "Offline"],
"rgat": ["Offline"],
"llama3.1-405b": ["Offline", "Server"]
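
To illustrate the SubLowLatencyModel mapping added above, here is a minimal, self-contained sketch with made-up rows; the real script applies the same logic to the results frame after renaming Model to UsedModel and MlperfModel to Model. The lower-case helper name is illustrative only:

```python
import pandas as pd

# Illustrative rows only; the real frame comes from the results summary.
df = pd.DataFrame({
    "UsedModel": ["llama2-70b-low-latency-99", "llama2-70b-99"],
    "Model": ["llama2-70b-99", "llama2-70b-99"],  # MlperfModel after renaming
})

def sub_low_latency_model(model, mlperf_model):
    # Keep the low-latency name in the final report; otherwise use the mapped model.
    if model in ["llama2-70b-low-latency-99", "llama2-70b-low-latency-99.9"]:
        return model
    return mlperf_model

df["Model"] = df.apply(lambda x: sub_low_latency_model(x["UsedModel"], x["Model"]), axis=1)
print(df["Model"].tolist())
# ['llama2-70b-low-latency-99', 'llama2-70b-99']
```
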
55 changes: 38 additions & 17 deletions tools/submission/submission_checker.py
@@ -194,7 +194,6 @@
"ssd-resnet34": "retinanet",
"mobilenet": "resnet",
"resnet50": "resnet",
"llama3_1-405b": "llama3.1-405b"
},
"seeds": {
"qsl_rng_seed": 3066443479025735752,
@@ -439,8 +438,12 @@
# not really needed
"model_mapping": {
# map model names to the official mlperf model class
"ssd-resnet34": "retinanet",
"mobilenet": "resnet",
"resnet50": "resnet",
"llama2-70b-low-latency-99": "llama2-70b-99",
"llama2-70b-low-latency-99.9": "llama2-70b-99.9",
"llama3_1-405b": "llama3.1-405b",
},
"seeds": {
# TODO: Update random seeds
@@ -666,9 +669,11 @@

LLM_LATENCY_LIMITS = {
"llama2-70b-99": {
"low-latency": {"ttft": 450 * 1000000, "tpot": 40 * 1000000},
"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}
},
"llama2-70b-99.9": {
"low-latency": {"ttft": 450 * 1000000, "tpot": 40 * 1000000},
"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}
},
"mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}},
@@ -883,6 +888,11 @@ def get_mlperf_model(self, model, extra_model_mapping=None):
# map again
mlperf_model = self.base["model_mapping"].get(model, model)
return mlperf_model

def get_llm_constraint(self, model_name):
if model_name in ["llama2-70b-low-latency-99", "llama2-70b-low-latency-99.9"]:
return "low-latency"
return None

def get_required(self, model):
model = self.get_mlperf_model(model)
@@ -1235,29 +1245,35 @@ def check_accuracy_dir(config, model, path, verbose):
return is_valid, result_acc


def extra_check_llm(mlperf_log, scenario, model):
def extra_check_llm(mlperf_log, scenario, model, llm_constraint):
if mlperf_log["requested_use_token_latencies"]:
if scenario == "Offline":
# For offline no further checks are necessary
return None, True
return True
else:
for constraint, limits in LLM_LATENCY_LIMITS[model].items():
if (
mlperf_log["result_first_token_99.00_percentile_latency_ns"]
< limits["ttft"]
and mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
< limits["tpot"]
):
return constraint, True
if llm_constraint is None:
llm_constraint = "conversational"
limits = LLM_LATENCY_LIMITS[model][llm_constraint]
if (
mlperf_log["result_first_token_99.00_percentile_latency_ns"]
< limits["ttft"]
and mlperf_log["result_time_per_output_token_99.00_percentile_ns"]
< limits["tpot"]
):
return True
else:
log.error(
f"use_token_latencies flag needs to be enabled for Llama2 benchmark")
return None, False
return False

log.error(
f'Failed Llama2 extra check for TTFT and TPOT. TTFT 99-tile: {mlperf_log["result_first_token_99.00_percentile_latency_ns"]}, TPOT 99-tile: {mlperf_log["result_time_per_output_token_99.00_percentile_ns"]}'
'Failed extra check for TTFT and TPOT. Obtained: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f. Required: TTFT 99-tile: %.4f, TPOT 99-tile: %.4f',
mlperf_log["result_first_token_99.00_percentile_latency_ns"],
mlperf_log["result_time_per_output_token_99.00_percentile_ns"],
limits["ttft"],
limits["tpot"]
)
return None, False
return False


def get_performance_metric(
@@ -1295,7 +1311,7 @@ def get_performance_metric(


def check_performance_dir(
config, model, path, scenario_fixed, division, system_json):
config, model, path, scenario_fixed, division, system_json, llm_constraint=None):
is_valid = False
rt = {}

@@ -1327,8 +1343,9 @@

if model in ["llama2-70b-99", "llama2-70b-99.9",
"mixtral-8x7b", "llama3.1-405b"]:
llama_constraint, is_valid = extra_check_llm(
mlperf_log, scenario_fixed, model)
llm_is_valid = extra_check_llm(
mlperf_log, scenario_fixed, model, llm_constraint)
is_valid = (llm_is_valid and is_valid)

latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
latency_mean = mlperf_log["result_mean_latency_ns"]
@@ -2168,6 +2185,7 @@ def log_result(
mlperf_model = config.get_mlperf_model(
model_name, extra_model_mapping
)
llm_constraint = config.get_llm_constraint(model_name)

if is_closed_or_network and mlperf_model not in config.models:
# for closed/network divisions we want the model name to match.
@@ -2385,6 +2403,7 @@ def log_result(
scenario_fixed,
division,
system_json,
llm_constraint
)
if is_inferred:
inferred = 1
@@ -3002,6 +3021,8 @@ def check_compliance_dir(
compliance_perf_dir = os.path.join(
compliance_dir, test, "performance", "run_1"
)
# WARNING: LLMs currently only have TEST06, so llm_constraint does not
# need to be passed to this call
compliance_perf_valid, r, is_inferred = check_performance_dir(
config, model, compliance_perf_dir, scenario, division, system_json
)
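
Taken together, the checker now derives an optional llm_constraint from the submission's model name and threads it through check_performance_dir into extra_check_llm, which now returns a single boolean. A minimal sketch of the new behavior, assuming submission_checker.py is importable (e.g. run from tools/submission/) and using a plain dict in place of the real MLPerf log object:

```python
from submission_checker import extra_check_llm

# Fake log values for illustration; real runs read these from the detail log.
fake_log = {
    "requested_use_token_latencies": True,
    "result_first_token_99.00_percentile_latency_ns": 400 * 1_000_000,   # 400 ms TTFT
    "result_time_per_output_token_99.00_percentile_ns": 35 * 1_000_000,  # 35 ms TPOT
}

# With the "low-latency" constraint the limits are 450 ms / 40 ms, so this passes.
print(extra_check_llm(fake_log, "Server", "llama2-70b-99", "low-latency"))  # True

# Without a constraint the check defaults to "conversational" (2000 ms / 200 ms).
print(extra_check_llm(fake_log, "Server", "llama2-70b-99", None))           # True
```
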
