Skip to content

Commit

Permalink
Merge pull request mlcommons#702 from guschmue/gs/fix_cas
Browse files Browse the repository at this point in the history
ignore certain loadgen errors
  • Loading branch information
christ1ne authored Aug 28, 2020
2 parents 54bd325 + cd33e07 commit 737ea9b
Showing 1 changed file with 22 additions and 20 deletions.
42 changes: 22 additions & 20 deletions tools/submission/submission-checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,19 @@
"sample_index_rng_seed": 665484352860916858,
"schedule_rng_seed": 3622009729038561421,
},
"ignore_errors": [
"check for ERROR in detailed",
"Loadgen built with uncommitted changes",
"Ran out of generated queries to issue before the minimum query count and test duration were reached",
"CAS failed",
],
},
"v0.7": {
"models": [
"ssd-small", "ssd-large", "resnet", "rnnt",
"bert-99", "bert-99.9",
"dlrm-99", "dlrm-99.9"
"3d-unet-99", "3d-unet-99.9"
"dlrm-99", "dlrm-99.9",
"3d-unet-99", "3d-unet-99.9",
],
"required-scenarios-datacenter": {
"resnet": ["Server", "Offline"],
Expand Down Expand Up @@ -131,6 +137,9 @@
"sample_index_rng_seed": 665484352860916858,
"schedule_rng_seed": 3622009729038561421,
},
"ignore_errors": [
"CAS failed",
],
"latency-constraint": {
"resnet": {"Server": 15000000, "MultiStream": 50000000},
"ssd-small": {"MultiStream": 50000000},
Expand Down Expand Up @@ -287,6 +296,12 @@ def get_performance_sample_count(self, model):
raise ValueError("model not known: " + model)
return self.performance_sample_count[model]

def ignore_errors(self, line):
    """Return True if ``line`` contains one of the ignorable error markers.

    The markers are the strings listed under the ``"ignore_errors"`` key of
    ``self.base`` (presumably the config dict of the selected version --
    confirm against the class constructor). Matching is a plain substring
    test.

    Args:
        line: One line of text from a log file.

    Returns:
        True when any configured marker occurs in ``line``, otherwise False.
    """
    # A config without an "ignore_errors" entry means nothing is ignorable;
    # fall back to an empty tuple instead of raising KeyError mid-check.
    markers = self.base.get("ignore_errors", ())
    return any(marker in line for marker in markers)

def get_min_query_count(self, model, scenario):
model = self.get_mlperf_model(model)
if model not in self.min_queries:
Expand Down Expand Up @@ -319,18 +334,6 @@ def split_path(m):
return m.replace("\\", "/").split("/")


def ignore_errors_for_v0_5(line):
    """Tell whether ``line`` mentions one of the errors tolerated for v0.5.

    Returns True when any of the known marker strings occurs as a substring
    of ``line``, and False otherwise.
    """
    tolerated_markers = (
        "check for ERROR in detailed",
        "Loadgen built with uncommitted changes",
        "Ran out of generated queries to issue before the minimum query count and test duration were reached",
        "CAS failed",
    )
    return any(marker in line for marker in tolerated_markers)


def check_accuracy_dir(config, model, path):
is_valid = False
acc = None
Expand Down Expand Up @@ -377,9 +380,8 @@ def check_accuracy_dir(config, model, path):
for line in f:
# look for: ERROR
if "ERROR" in line:
if config.version in ["v0.5"] and ignore_errors_for_v0_5(line):
if config.ignore_errors(line):
continue
# TODO: should this be a failed run?
log.error("%s contains error: %s", fname, line)
is_valid = False

Expand Down Expand Up @@ -412,7 +414,7 @@ def check_performance_dir(config, model, path):
for line in f:
# look for: ERROR
if "ERROR" in line:
if config.version in ["v0.5"] and ignore_errors_for_v0_5(line):
if config.ignore_errors(line):
continue
log.error("%s contains error: %s", fname, line)
is_valid = False
Expand Down Expand Up @@ -548,15 +550,15 @@ def check_results_dir(config, filter_submitter, csv):
# we are looking at ./$division/$submitter/results/$system_desc/$model,
# ie ./closed/mlperf_org/results/t4-ort/bert
name = os.path.join(results_path, system_desc, model_name)
mlperf_model = config.get_mlperf_model(model_name)

if is_closed and model_name not in config.models:
if is_closed and mlperf_model not in config.models:
# for closed division we want the model name to match.
# for open division the model_name might be different than the task
log.error("%s has a invalid model (%s) for closed division", name, model_name)
log.error("%s has a invalid model %s for closed division", name, model_name)
results[name] = None
continue

mlperf_model = config.get_mlperf_model(model_name)

#
# Look at each scenario
Expand Down

0 comments on commit 737ea9b

Please sign in to comment.