From 9e6fabf6845c2a401709e763d6b08d19bf821e46 Mon Sep 17 00:00:00 2001 From: George Yuan Date: Fri, 31 Jul 2020 09:52:07 -0700 Subject: [PATCH 01/22] adding preliminary loadgen changes for compliance testing as well as first test --- loadgen/loadgen.cc | 10 + loadgen/test_settings.h | 4 + loadgen/test_settings_internal.cc | 6 + loadgen/test_settings_internal.h | 1 + .../nvidia/TEST01/3d-unet/audit.config | 9 + v0.7/compliance/nvidia/TEST01/README.md | 44 +++++ .../nvidia/TEST01/bert/audit.config | 9 + .../nvidia/TEST01/dlrm/audit.config | 9 + .../nvidia/TEST01/resnet/audit.config | 9 + .../nvidia/TEST01/rnnt/audit.config | 9 + .../nvidia/TEST01/run_verification.py | 143 ++++++++++++++ .../nvidia/TEST01/ssd-large/audit.config | 9 + .../nvidia/TEST01/ssd-small/audit.config | 9 + .../nvidia/TEST01/verify_accuracy.py | 177 ++++++++++++++++++ .../nvidia/TEST01/verify_performance.py | 140 ++++++++++++++ 15 files changed, 588 insertions(+) create mode 100644 v0.7/compliance/nvidia/TEST01/3d-unet/audit.config create mode 100755 v0.7/compliance/nvidia/TEST01/README.md create mode 100644 v0.7/compliance/nvidia/TEST01/bert/audit.config create mode 100644 v0.7/compliance/nvidia/TEST01/dlrm/audit.config create mode 100644 v0.7/compliance/nvidia/TEST01/resnet/audit.config create mode 100644 v0.7/compliance/nvidia/TEST01/rnnt/audit.config create mode 100644 v0.7/compliance/nvidia/TEST01/run_verification.py create mode 100644 v0.7/compliance/nvidia/TEST01/ssd-large/audit.config create mode 100644 v0.7/compliance/nvidia/TEST01/ssd-small/audit.config create mode 100644 v0.7/compliance/nvidia/TEST01/verify_accuracy.py create mode 100644 v0.7/compliance/nvidia/TEST01/verify_performance.py diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index e524b95fa..07ded1b7d 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -358,6 +358,16 @@ PerformanceResult IssueQueries(SystemUnderTest* sut, auto sequence_id_start = sequence_gen->CurrentSampleId(); std::vector queries = GenerateQueries( settings, loaded_sample_set, sequence_gen, &response_logger); + + // Calculated expected number of queries + uint64_t expected_queries = settings.target_qps * settings.min_duration.count() / 1000; + if (scenario != TestScenario::Offline) { + expected_queries *= settings.samples_per_query; + } + + if (settings.accuracy_log_sampling_target > 0) { + response_logger.accuracy_log_prob = (double) settings.accuracy_log_sampling_target / expected_queries; + } auto sequence_id_end = sequence_gen->CurrentSampleId(); size_t max_latencies_to_record = sequence_id_end - sequence_id_start; diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 90a41ad31..d656d1ae1 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -262,6 +262,10 @@ struct TestSettings { /// accuracy log in performance mode double accuracy_log_probability = 0.0; + /// \brief Target number of samples that will have their results printed to + /// accuracy log in performance mode for compliance testing + uint64_t accuracy_log_sampling_target = 0; + /// \brief Load mlperf parameter config from file. 
int FromConfig(const std::string &path, const std::string &model, const std::string &scenario); diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 5f9094a32..3b19214d3 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -42,6 +42,7 @@ TestSettingsInternal::TestSettingsInternal( schedule_rng_seed(requested.schedule_rng_seed), accuracy_log_rng_seed(requested.accuracy_log_rng_seed), accuracy_log_probability(requested.accuracy_log_probability), + accuracy_log_sampling_target(requested.accuracy_log_sampling_target), print_timestamps(requested.print_timestamps), performance_issue_unique(requested.performance_issue_unique), performance_issue_same(requested.performance_issue_same), @@ -256,6 +257,7 @@ void LogRequestedTestSettings(const TestSettings &s) { detail("schedule_rng_seed : ", s.schedule_rng_seed); detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed); detail("accuracy_log_probability : ", s.accuracy_log_probability); + detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target); detail("print_timestamps : ", s.print_timestamps); detail("performance_issue_unique : ", s.performance_issue_unique); detail("performance_issue_same : ", s.performance_issue_same); @@ -290,6 +292,7 @@ void TestSettingsInternal::LogEffectiveSettings() const { detail("schedule_rng_seed : ", s.schedule_rng_seed); detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed); detail("accuracy_log_probability : ", s.accuracy_log_probability); + detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target); detail("print_timestamps : ", s.print_timestamps); detail("performance_issue_unique : ", s.performance_issue_unique); detail("performance_issue_same : ", s.performance_issue_same); @@ -317,6 +320,7 @@ void TestSettingsInternal::LogSummary(AsyncSummary &summary) const { summary("schedule_rng_seed : ", schedule_rng_seed); summary("accuracy_log_rng_seed : ", accuracy_log_rng_seed); summary("accuracy_log_probability : ", accuracy_log_probability); + summary("accuracy_log_sampling_target : ", accuracy_log_sampling_target); summary("print_timestamps : ", print_timestamps); summary("performance_issue_unique : ", performance_issue_unique); summary("performance_issue_same : ", performance_issue_same); @@ -462,6 +466,8 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, nullptr); lookupkv(model, scenario, "accuracy_log_probability", nullptr, &accuracy_log_probability, 0.01); + lookupkv(model, scenario, "accuracy_log_sampling_target", + &accuracy_log_sampling_target, nullptr); if (lookupkv(model, scenario, "print_timestamps", &val, nullptr)) print_timestamps = (val == 0) ? 
                                              false : true;
   if (lookupkv(model, scenario, "performance_issue_unique", &val, nullptr))
diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h
index 676b2fefe..df903dd91 100644
--- a/loadgen/test_settings_internal.h
+++ b/loadgen/test_settings_internal.h
@@ -74,6 +74,7 @@ struct TestSettingsInternal {
   uint64_t schedule_rng_seed;
   uint64_t accuracy_log_rng_seed;
   double accuracy_log_probability;
+  uint64_t accuracy_log_sampling_target;
   bool print_timestamps;
   bool performance_issue_unique;
   bool performance_issue_same;
diff --git a/v0.7/compliance/nvidia/TEST01/3d-unet/audit.config b/v0.7/compliance/nvidia/TEST01/3d-unet/audit.config
new file mode 100644
index 000000000..984895a24
--- /dev/null
+++ b/v0.7/compliance/nvidia/TEST01/3d-unet/audit.config
@@ -0,0 +1,9 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model maybe '*' as wildcard. In that case the value applies to all models.
+# All times are in milli seconds
+
+# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
+*.*.mode = 2
+*.*.accuracy_log_rng_seed = 720381539243781796
+*.*.accuracy_log_sampling_target = 64
diff --git a/v0.7/compliance/nvidia/TEST01/README.md b/v0.7/compliance/nvidia/TEST01/README.md
new file mode 100755
index 000000000..6e92f3277
--- /dev/null
+++ b/v0.7/compliance/nvidia/TEST01/README.md
@@ -0,0 +1,44 @@
+
+# Test 01 - Verify accuracy in performance mode
+## Introduction
+The purpose of this test is to ensure that valid inferences are being performed in performance mode. By default, the inference result returned from the SUT to LoadGen is not written to the accuracy JSON file and thus not checked for accuracy. In this test, the inference results of a subset of the total samples issued by LoadGen are written to the accuracy JSON. In order to pass this test, two criteria must be satisfied:
+
+ 1. The inference results in the accuracy JSON file must match the inference results in the accuracy JSON generated in accuracy mode in the submission run.
+ 2. The performance while running this test must match the performance of the submission within 10%.
+
+## Performance considerations
+The subset of sample results chosen to be written to the accuracy JSON is determined randomly, using a probability equal to `accuracy_log_sampling_target` (specified in the audit.config file) divided by the total expected number of completed samples in the test run. This total expected number of completed samples is derived from `min_duration`, `samples_per_query`, and `target_qps`. The goal is to ensure that a reasonable number of sample results gets written to the accuracy JSON regardless of the throughput of the system under test. Given that the number of actually completed samples may not match the expected number, the number of inference results written to the accuracy JSON may not exactly match `accuracy_log_sampling_target`.
+
+There is an audit.config file for each individual benchmark, located in the benchmark subdirectories of this test directory. The `accuracy_log_sampling_target` value for each benchmark is chosen taking into consideration the performance sample count and the size of each inference result. If performance with sampling enabled cannot meet the pass threshold set in verify_performance.py, `accuracy_log_sampling_target` may be reduced to check that performance approaches the submission score.
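As a rough illustration of the sampling probability described above, the sketch below mirrors the arithmetic of the loadgen change in this patch: compute the expected number of completed samples over the minimum duration, then divide the sampling target by that count. The function name and the numbers used are invented for illustration only.

```python
# Minimal sketch (not the loadgen source) of how the accuracy-log sampling
# probability is derived from accuracy_log_sampling_target.
def accuracy_log_prob(target_qps, min_duration_ms, samples_per_query,
                      sampling_target, offline):
    # Expected number of completed samples over the minimum test duration.
    expected_samples = target_qps * min_duration_ms / 1000.0
    if not offline:
        # Non-Offline scenarios issue samples_per_query samples per query.
        expected_samples *= samples_per_query
    # Probability that any one result is written to the accuracy JSON.
    return sampling_target / expected_samples if sampling_target > 0 else 0.0

# Example: an Offline run at 5000 samples/s with a 60 s minimum duration and
# accuracy_log_sampling_target = 4096 logs roughly 1.4% of the results.
print(accuracy_log_prob(5000, 60000, 1, 4096, offline=True))  # ~0.0137
```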
+
+## Log size
+3d-unet is unique in that its per-sample inference result is drastically larger than that of the other benchmarks. For all other benchmarks, the accuracy JSON results can be checked using Python JSON libraries; this path is enabled by passing `--fastmode` to the run_verification.py script. For 3d-unet, using fastmode will cause the accuracy verification to run out of memory, so the alternative path based on UNIX command-line utilities must be used instead, by not supplying the `--fastmode` switch.
+
+## Prerequisites
+This script works best with Python 3.3 or later. For 3d-unet, the accuracy verification script requires the `wc`, `sed`, `awk`, `head`, `tail`, `grep`, and `md5sum` UNIX command-line utilities.
+
+## Non-determinism
+Note that under MLPerf inference rules, certain forms of non-determinism are acceptable, which can cause inference results to differ across runs. It is therefore possible that the results obtained during the accuracy run differ from those obtained during the performance run, which will cause the accuracy checking script to report failure. Test failure will automatically result in an objection, but the objection can be overturned by comparing the quality of the results generated in performance mode to that obtained in accuracy mode. This can be done by using the accuracy measurement scripts provided as part of the repo to ensure that the accuracy score meets the target. An example is provided for GNMT in the gnmt folder.
+
+## Instructions
+
+### Part I
+Run the test with the provided audit.config from the corresponding benchmark subdirectory. Note that audit.config must be copied to the directory the benchmark is run from. To verify that audit.config was properly read, check that LoadGen reports finding audit.config in mlperf_log_detail.txt.
+
+### Part II
+Run the verification script:
+ `python3 run_verification.py -r RESULTS_DIR -c COMPLIANCE_DIR -o OUTPUT_DIR [--dtype {byte,float32,int32,int64}] [--fastmode]`
+
+RESULTS_DIR: Specifies the path to the corresponding results directory that contains the accuracy and performance subdirectories with the submission logs, e.g. `inference_results_v0.7/closed/NVIDIA/results/GPU/resnet/Offline`
+COMPLIANCE_DIR: Specifies the path to the directory containing the logs from the compliance test run.
+OUTPUT_DIR: Specifies the path to the output directory where compliance logs will be uploaded from, e.g. `inference_results_v0.7/closed/NVIDIA/compliance/GPU/resnet/Offline`
+
+Expected outcome:
+
+    Accuracy check pass: True
+    Performance check pass: True
+    TEST01 verification complete
+
+
+
diff --git a/v0.7/compliance/nvidia/TEST01/bert/audit.config b/v0.7/compliance/nvidia/TEST01/bert/audit.config
new file mode 100644
index 000000000..c861e2a3d
--- /dev/null
+++ b/v0.7/compliance/nvidia/TEST01/bert/audit.config
@@ -0,0 +1,9 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model maybe '*' as wildcard. In that case the value applies to all models.
+# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 4096 diff --git a/v0.7/compliance/nvidia/TEST01/dlrm/audit.config b/v0.7/compliance/nvidia/TEST01/dlrm/audit.config new file mode 100644 index 000000000..c861e2a3d --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/dlrm/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 4096 diff --git a/v0.7/compliance/nvidia/TEST01/resnet/audit.config b/v0.7/compliance/nvidia/TEST01/resnet/audit.config new file mode 100644 index 000000000..c861e2a3d --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/resnet/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 4096 diff --git a/v0.7/compliance/nvidia/TEST01/rnnt/audit.config b/v0.7/compliance/nvidia/TEST01/rnnt/audit.config new file mode 100644 index 000000000..c861e2a3d --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/rnnt/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 4096 diff --git a/v0.7/compliance/nvidia/TEST01/run_verification.py b/v0.7/compliance/nvidia/TEST01/run_verification.py new file mode 100644 index 000000000..8124ceb35 --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/run_verification.py @@ -0,0 +1,143 @@ +#! /usr/bin/env python3 +# Copyright 2018 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +import os +import sys +import shutil +import subprocess +import argparse +import json + +import numpy as np + +sys.path.append(os.getcwd()) + +dtype_map = { + "byte": np.byte, + "float32": np.float32, + "int32": np.int32, + "int64": np.int64 +} + +def main(): + + + py3 = sys.version_info >= (3,0) + # Parse arguments to identify the path to the accuracy logs from + # the accuracy and performance runs + parser = argparse.ArgumentParser() + parser.add_argument( + "--results_dir", "-r", + help="Specifies the path to the corresponding results directory that contains the accuracy and performance subdirectories containing the submission logs, i.e. inference_results_v0.7/closed/NVIDIA/results/T4x8/resnet/Offline.", + default="" + ) + parser.add_argument( + "--compliance_dir", "-c", + help="Specifies the path to the directory containing the logs from the compliance test run.", + default="" + ) + parser.add_argument( + "--output_dir", "-o", + help="Specifies the path to the output directory where compliance logs will be uploaded from, i.e. inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.", + default="" + ) + parser.add_argument( + "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"], help="data type of the label (only needed in fastmode") + parser.add_argument( + "--fastmode", action="store_true", + help="Use legacy method using python JSON library instead of unix commandline utilities (uses more memory but much faster.") + + args = parser.parse_args() + + print("Parsing arguments.") + results_dir = args.results_dir + compliance_dir = args.compliance_dir + output_dir = os.path.join(args.output_dir, "TEST01") + fastmode = "" + if args.fastmode: + fastmode = " --fastmode" + else: + for binary in ["wc", "md5sum", "grep", "awk", "sed", "head", "tail"]: + missing_binary = False + if shutil.which(binary) == None: + print("Error: This script requires the {:} commandline utility".format(binary)) + missing_binary = True + if missing_binary: + exit() + + dtype = args.dtype + + # run verify accuracy + verify_accuracy_command = "python3 verify_accuracy.py --dtype " + args.dtype + fastmode + " -r " + results_dir + "/accuracy/mlperf_log_accuracy.json" + " -t " + compliance_dir + "/mlperf_log_accuracy.json | tee verify_accuracy.txt" + try: + os.system(verify_accuracy_command) + except: + print("Exception occurred trying to execute:\n " + verify_accuracy_command) + # check if verify accuracy script passes + + accuracy_pass_command = "grep PASS verify_accuracy.txt" + accuracy_pass = "TEST PASS" in subprocess.check_output(accuracy_pass_command, shell=True).decode("utf-8") + + # run verify performance + verify_performance_command = "python3 verify_performance.py -r " + results_dir + "/performance/run_1/mlperf_log_summary.txt" + " -t " + compliance_dir + "/mlperf_log_summary.txt | tee verify_performance.txt" + try: + os.system(verify_performance_command) + except: + print("Exception occurred trying to execute:\n " + verify_performance_command) + + # check if verify performance script passes + performance_pass_command = "grep PASS verify_performance.txt" + performance_pass = "TEST PASS" in subprocess.check_output(performance_pass_command, shell=True).decode("utf-8") + + # setup output compliance directory structure + output_accuracy_dir = os.path.join(output_dir, "accuracy") + output_performance_dir = os.path.join(output_dir, "performance", "run_1") + try: + if not os.path.isdir(output_accuracy_dir): 
+ os.makedirs(output_accuracy_dir) + except: + print("Exception occurred trying to create " + output_accuracy_dir) + try: + if not os.path.isdir(output_performance_dir): + os.makedirs(output_performance_dir) + except: + print("Exception occurred trying to create " + output_performance_dir) + + # copy compliance logs to output compliance directory + shutil.copy2("verify_accuracy.txt",output_dir) + shutil.copy2("verify_performance.txt",output_dir) + accuracy_file = os.path.join(compliance_dir,"mlperf_log_accuracy.json") + summary_file = os.path.join(compliance_dir,"mlperf_log_summary.txt") + detail_file = os.path.join(compliance_dir,"mlperf_log_detail.txt") + + try: + shutil.copy2(accuracy_file,output_accuracy_dir) + except: + print("Exception occured trying to copy " + accuracy_file + " to " + output_accuracy_dir) + try: + shutil.copy2(summary_file,output_performance_dir) + except: + print("Exception occured trying to copy " + summary_file + " to " + output_performance_dir) + try: + shutil.copy2(detail_file,output_performance_dir) + except: + print("Exception occured trying to copy " + detail_file + " to " + output_performance_dir) + + print("Accuracy check pass: {:}".format(accuracy_pass)) + print("Performance check pass: {:}".format(performance_pass)) + print("TEST01 verification complete") + +if __name__ == '__main__': + main() diff --git a/v0.7/compliance/nvidia/TEST01/ssd-large/audit.config b/v0.7/compliance/nvidia/TEST01/ssd-large/audit.config new file mode 100644 index 000000000..03e70a4c7 --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/ssd-large/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 256 diff --git a/v0.7/compliance/nvidia/TEST01/ssd-small/audit.config b/v0.7/compliance/nvidia/TEST01/ssd-small/audit.config new file mode 100644 index 000000000..846c3d9da --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/ssd-small/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 1024 diff --git a/v0.7/compliance/nvidia/TEST01/verify_accuracy.py b/v0.7/compliance/nvidia/TEST01/verify_accuracy.py new file mode 100644 index 000000000..b7859b78e --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/verify_accuracy.py @@ -0,0 +1,177 @@ +#! /usr/bin/env python3 +# Copyright 2018 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +import os +import subprocess +import sys +import shutil +sys.path.append(os.getcwd()) + +import argparse +import json + +import numpy as np + +dtype_map = { + "byte": np.byte, + "float32": np.float32, + "int32": np.int32, + "int64": np.int64 +} + +def main(): + + # Parse arguments to identify the path to the accuracy logs from + # the accuracy and performance runs + parser = argparse.ArgumentParser() + parser.add_argument( + "--reference_accuracy", "-r", + help="Specifies the path to the accuracy log from a submission/accuracy run.", + default="" + ) + parser.add_argument( + "--test_accuracy", "-t", + help="Specifies the path to the accuracy log from a performance run with accuracy log sampling enabled.", + default="" + ) + parser.add_argument( + "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"], help="data type of the label") + + parser.add_argument( + "--fastmode", action="store_true", + help="Use legacy method using python JSON library instead of unix commandline utilities (uses more memory but much faster.") + args = parser.parse_args() + + print("Verifying accuracy. This might take a while...") + acc_log = args.reference_accuracy + perf_log = args.test_accuracy + + if args.fastmode: + with open(acc_log, "r") as acc_json: + acc_data = json.load(acc_json) + + with open(perf_log, "r") as perf_json: + perf_data = json.load(perf_json) + + # read accuracy log json and create a dictionary of qsl_idx/data pairs + results_dict = {} + num_acc_log_duplicate_keys = 0 + num_acc_log_data_mismatch = 0 + num_perf_log_qsl_idx_match = 0 + num_perf_log_data_mismatch = 0 + num_missing_qsl_idxs = 0 + + print("Reading accuracy mode results...") + for sample in acc_data: + #print sample["qsl_idx"] + qsl_idx = sample["qsl_idx"] + data = sample["data"] + if data == '': + data = "" + if qsl_idx in results_dict.keys(): + num_acc_log_duplicate_keys += 1 + if results_dict[qsl_idx] != data: + num_acc_log_data_mismatch += 1 + else: + results_dict[qsl_idx] = data + + print("Reading performance mode results...") + for sample in perf_data: + qsl_idx = sample["qsl_idx"] + data = np.frombuffer(bytes.fromhex(sample['data']), dtype_map[args.dtype]) if py33 == True \ + else np.frombuffer(bytearray.fromhex(sample['data']), dtype_map[args.dtype]) + + if qsl_idx in results_dict.keys(): + num_perf_log_qsl_idx_match += 1 + data_perf = np.frombuffer(bytes.fromhex(results_dict[qsl_idx]), dtype_map[args.dtype]) \ + if py33 == True else np.frombuffer(bytearray.fromhex(results_dict[qsl_idx]), dtype_map[args.dtype]) + if data_perf.size == 0 or data.size == 0: + if data_perf.size != data.size: + num_perf_log_data_mismatch += 1 + elif data[0] != data_perf[0]: + num_perf_log_data_mismatch += 1 + else: + num_missing_qsl_idxs += 1 + + results_dict[sample["qsl_idx"]] = sample["data"] + + + print("num_acc_log_entries = {:}".format(len(acc_data))) + print("num_acc_log_duplicate_keys = {:}".format(num_acc_log_duplicate_keys)) + print("num_acc_log_data_mismatch = {:}".format(num_acc_log_data_mismatch)) + print("num_perf_log_entries = {:}".format(len(perf_data))) + print("num_perf_log_qsl_idx_match = {:}".format(num_perf_log_qsl_idx_match)) + print("num_perf_log_data_mismatch = {:}".format(num_perf_log_data_mismatch)) + print("num_missing_qsl_idxs = {:}".format(num_missing_qsl_idxs)) + if num_perf_log_data_mismatch > 0 : + print("TEST 
FAIL\n"); + else : + print("TEST PASS\n"); + exit() + + py33 = sys.version_info >= (3,3) + + if not py33: + print("Error: This script requires Python v3.3 or later") + exit() + + + get_perf_lines_cmd = "wc -l " + perf_log + "| awk '{print $1}'" + num_perf_lines = int(subprocess.check_output(get_perf_lines_cmd, shell=True).decode("utf-8")) + + get_acc_lines_cmd = "wc -l " + acc_log + "| awk '{print $1}'" + num_acc_lines = int(subprocess.check_output(get_acc_lines_cmd, shell=True).decode("utf-8")) + + num_acc_log_entries = num_acc_lines - 2 + num_perf_log_entries = num_perf_lines - 2 + #print(perf_qsl_idx) + #print(get_perf_lines_cmd) + #print(num_perf_lines) + + num_perf_log_data_mismatch = 0 + for perf_line in range(0, num_perf_lines): + if perf_line % int(num_perf_lines/100) == 0: + print(".", end = "", flush=True) + # first and last line are brackets + if perf_line == 0 or perf_line == int(num_perf_lines)-1: + continue + + # calculate md5sum of line in perf mode accuracy_log + perf_md5sum_cmd = "head -n " + str(perf_line + 1) + " " + perf_log + "| tail -n 1| sed -r 's/,//g' | sed -r 's/\"seq_id\" : \S+//g' | md5sum" + #print(perf_md5sum_cmd) + perf_md5sum = subprocess.check_output(perf_md5sum_cmd, shell=True).decode("utf-8") + + # get qsl idx + get_qsl_idx_cmd = "head -n " + str(perf_line + 1) + " " + perf_log + "| tail -n 1| awk -F\": |,\" '{print $4}'" + qsl_idx = subprocess.check_output(get_qsl_idx_cmd, shell=True).decode("utf-8").rstrip() + + # calculate md5sum of line in acc mode accuracy_log + acc_md5sum_cmd = "grep \"qsl_idx\\\" : " + qsl_idx + ",\" " + acc_log + "| sed -r 's/,//g' | sed -r 's/\"seq_id\" : \S+//g' | md5sum" + acc_md5sum = subprocess.check_output(acc_md5sum_cmd, shell=True).decode("utf-8") + + if perf_md5sum != acc_md5sum: + num_perf_log_data_mismatch += 1 + + print("") + print("num_acc_log_entries = {:}".format(num_acc_log_entries)) + print("num_perf_log_data_mismatch = {:}".format(num_perf_log_data_mismatch)) + print("num_perf_log_entries = {:}".format(num_perf_log_entries)) + if num_perf_log_data_mismatch > 0 : + print("TEST FAIL\n"); + else : + print("TEST PASS\n"); + +if __name__ == '__main__': + main() diff --git a/v0.7/compliance/nvidia/TEST01/verify_performance.py b/v0.7/compliance/nvidia/TEST01/verify_performance.py new file mode 100644 index 000000000..000141f31 --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/verify_performance.py @@ -0,0 +1,140 @@ +#! /usr/bin/env python3 +# Copyright 2018 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +import os +import sys +import re +sys.path.append(os.getcwd()) + +import argparse +import json + +def main(): + # Parse arguments to identify the path to the accuracy logs from + # the accuracy and performance runs + parser = argparse.ArgumentParser() + parser.add_argument( + "--reference_summary", "-r", + help="Specifies the path to the summary log for TEST00.", + default="" + ) + parser.add_argument( + "--test_summary", "-t", + help="Specifies the path to the summary log for this test.", + default="" + ) + args = parser.parse_args() + + print("Verifying performance.") + ref_file = open(args.reference_summary, "r") + test_file = open(args.test_summary, "r") + ref_score = 0 + test_score = 0 + ref_mode = '' + test_mode = '' + + for line in ref_file: + if re.match("Scenario", line): + ref_mode = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Single Stream": + if re.match("90th percentile latency", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Multi Stream": + if re.match("Samples per query", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Server": + if re.match("Scheduled samples per second", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Offline": + if re.match("Samples per second", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if re.match("Result is", line): + valid = line.split(": ",1)[1].strip() + if valid == 'INVALID': + sys.exit("TEST FAIL: Reference results are invalid") + + if re.match("\d+ ERROR", line): + error = line.split(" ",1)[0].strip() + print("WARNING: " + error + " ERROR reported in reference results") + + + for line in test_file: + if re.match("Scenario", line): + test_mode = line.split(": ",1)[1].strip() + continue + + if test_mode == "Single Stream": + if re.match("90th percentile latency", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Multi Stream": + if re.match("Samples per query", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Server": + if re.match("Scheduled samples per second", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Offline": + if re.match("Samples per second", line): + test_score = line.split(": ",1)[1].strip() + continue + + if re.match("Result is", line): + valid = line.split(": ",1)[1].strip() + if valid == 'INVALID': + sys.exit("TEST FAIL: Test results are invalid") + + if re.match("\d+ ERROR", line): + error = line.split(" ",1)[0].strip() + print("WARNING: " + error + " ERROR reported in test results") + + if test_mode != ref_mode: + sys.exit("Test and reference scenarios do not match!") + + print("reference score = {}".format(ref_score)) + print("test score = {}".format(test_score)) + + + threshold = 0.10 + + # In single stream mode, latencies can be very short for high performance systems + # and run-to-run variation due to external disturbances (OS) can be significant. 
+ # In this case we relax pass threshold to 20% + + if ref_mode == "Single Stream" and float(ref_score) <= 200000: + threshold = 0.20 + + if float(test_score) < float(ref_score) * (1 + threshold) and float(test_score) > float(ref_score) * (1 - threshold): + print("TEST PASS") + else: + print("TEST FAIL: Test score invalid") + +if __name__ == '__main__': + main() + From bf3c0c59add3c49ce639c7d3a57ac121120c0f82 Mon Sep 17 00:00:00 2001 From: Zhihan Date: Fri, 31 Jul 2020 14:33:14 -0700 Subject: [PATCH 02/22] fix 3dunet syspath --- v0.7/medical_imaging/3d-unet/accuracy-brats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v0.7/medical_imaging/3d-unet/accuracy-brats.py b/v0.7/medical_imaging/3d-unet/accuracy-brats.py index 6fece895c..82c9ce6c7 100644 --- a/v0.7/medical_imaging/3d-unet/accuracy-brats.py +++ b/v0.7/medical_imaging/3d-unet/accuracy-brats.py @@ -21,7 +21,7 @@ import pickle import sys -sys.path.insert(0, os.path.join(os.getcwd(), "nnUnet")) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "nnUnet")) from multiprocessing import Pool from nnunet.evaluation.region_based_evaluation import evaluate_regions, get_brats_regions From cbb0f2ad8a256eadc112b27c0b1874266852b283 Mon Sep 17 00:00:00 2001 From: mnaumovfb Date: Mon, 3 Aug 2020 16:58:53 -0700 Subject: [PATCH 03/22] Adjusting recording of query response to avoid duplicating it for each query index. The accuracy and AUC metric checks are not affected because they are invariant under uniform scaling. --- v0.5/recommendation/python/criteo.py | 12 +++++++++--- v0.5/recommendation/python/main.py | 27 ++++++++++++++++----------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/v0.5/recommendation/python/criteo.py b/v0.5/recommendation/python/criteo.py index 16af23148..de08b82f1 100755 --- a/v0.5/recommendation/python/criteo.py +++ b/v0.5/recommendation/python/criteo.py @@ -274,9 +274,16 @@ def load_query_samples(self, sample_list): def get_samples(self, id_list): # build list tuples as need by the batch conversion routine + # index i from id_list corresponds to a particular query_id + idx_offsets = [0] ls = [] for i in id_list: + (_, _, _, T) = self.items_in_memory[i] + idx_offsets.append(idx_offsets[-1] + T.numel()) + ls.append(self.items_in_memory[i]) + # debug prints + # print(idx_offsets) # approach 1: collate a mini-batch of single samples ''' @@ -304,9 +311,8 @@ def get_samples(self, id_list): lS_i = torch.cat(ls_t[2], dim=1) T = torch.cat(ls_t[3]) # debug prints - # print('get_samples', (X, lS_o, lS_i, T)) - # print('get_samples', X.shape) - return (X, lS_o, lS_i, T) + # print('get_samples', (X, lS_o, lS_i, T, idx_offsets)) + return (X, lS_o, lS_i, T, idx_offsets) # Pre processing diff --git a/v0.5/recommendation/python/main.py b/v0.5/recommendation/python/main.py index d66d4f957..429d0a796 100755 --- a/v0.5/recommendation/python/main.py +++ b/v0.5/recommendation/python/main.py @@ -203,13 +203,14 @@ def get_backend(backend, dataset, max_ind_range, data_sub_sample_rate, use_gpu): class Item: """An item that we queue for processing by the thread pool.""" - def __init__(self, query_id, content_id, batch_dense_X, batch_lS_o, batch_lS_i, batch_T=None): + def __init__(self, query_id, content_id, batch_dense_X, batch_lS_o, batch_lS_i, batch_T=None, idx_offsets=None): self.query_id = query_id self.content_id = content_id self.batch_dense_X = batch_dense_X self.batch_lS_o = batch_lS_o self.batch_lS_i = batch_lS_i self.batch_T = batch_T + self.idx_offsets = idx_offsets self.start = time.time() 
class RunnerBase: @@ -252,7 +253,11 @@ def run_one_item(self, qitem): # result = processed_results[idx][0] and target = processed_results[idx][1] # also each idx might be a query of samples, rather than a single sample # depending on the --samples-to-aggregate* arguments. - response_array = array.array("B", np.array(processed_results, np.float32).tobytes()) + s_idx = qitem.idx_offsets[idx] + e_idx = qitem.idx_offsets[idx + 1] + # debug prints + # print("s,e:",s_idx,e_idx, len(processed_results)) + response_array = array.array("B", np.array(processed_results[s_idx:e_idx], np.float32).tobytes()) response_array_refs.append(response_array) bi = response_array.buffer_info() response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) @@ -264,14 +269,14 @@ def enqueue(self, query_samples): query_len = len(query_samples) if query_len < self.max_batchsize: - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = self.ds.get_samples(idx) - self.run_one_item(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx) + self.run_one_item(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) else: bs = self.max_batchsize for i in range(0, query_len, bs): ie = min(i + bs, query_len) - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = self.ds.get_samples(idx[i:ie]) - self.run_one_item(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx[i:ie]) + self.run_one_item(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) def finish(self): pass @@ -308,14 +313,14 @@ def enqueue(self, query_samples): query_len = len(query_samples) if query_len < self.max_batchsize: - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = self.ds.get_samples(idx) - self.tasks.put(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx) + self.tasks.put(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) else: bs = self.max_batchsize for i in range(0, query_len, bs): ie = min(i + bs, query_len) - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = self.ds.get_samples(idx[i:ie]) - self.tasks.put(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx[i:ie]) + self.tasks.put(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) def finish(self): # exit all threads @@ -416,7 +421,7 @@ def main(): ds.load_query_samples([0]) for _ in range(5): - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = ds.get_samples([0]) + batch_dense_X, batch_lS_o, batch_lS_i, _, _ = ds.get_samples([0]) _ = backend.predict(batch_dense_X, batch_lS_o, batch_lS_i) ds.unload_query_samples(None) From bb04fc96265d3067e9931854a45b1668e616832f Mon Sep 17 00:00:00 2001 From: Zhihan Date: Mon, 3 Aug 2020 22:45:26 -0700 Subject: [PATCH 04/22] Fix RNNT and BERT script --- v0.7/language/bert/accuracy-squad.py | 24 ++++- v0.7/language/bert/evaluate.py | 94 +++++++++++++++++++ v0.7/speech_recognition/rnnt/accuracy_eval.py | 16 +++- 3 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 v0.7/language/bert/evaluate.py diff --git a/v0.7/language/bert/accuracy-squad.py b/v0.7/language/bert/accuracy-squad.py index 
f1365f4e6..2723a2692 100644 --- a/v0.7/language/bert/accuracy-squad.py +++ b/v0.7/language/bert/accuracy-squad.py @@ -45,6 +45,16 @@ RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) +dtype_map = { + "int8": np.int8, + "int16": np.int16, + "int32": np.int32, + "int64": np.int64, + "float16": np.float16, + "float32": np.float32, + "float64": np.float64 +} + def get_final_text(pred_text, orig_text, do_lower_case): """Project the tokenized prediction back to the original text.""" @@ -302,7 +312,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, with open(output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") -def load_loadgen_log(log_path, eval_features, output_transposed=False): +def load_loadgen_log(log_path, eval_features, dtype=np.float32, output_transposed=False): with open(log_path) as f: predictions = json.load(f) @@ -310,10 +320,10 @@ def load_loadgen_log(log_path, eval_features, output_transposed=False): for prediction in predictions: qsl_idx = prediction["qsl_idx"] if output_transposed: - logits = np.frombuffer(bytes.fromhex(prediction["data"]), np.float32).reshape(2, -1) + logits = np.frombuffer(bytes.fromhex(prediction["data"]), dtype).reshape(2, -1) logits = np.transpose(logits) else: - logits = np.frombuffer(bytes.fromhex(prediction["data"]), np.float32).reshape(-1, 2) + logits = np.frombuffer(bytes.fromhex(prediction["data"]), dtype).reshape(-1, 2) # Pad logits to max_seq_length seq_length = logits.shape[0] start_logits = np.ones(max_seq_length) * -10000.0 @@ -336,8 +346,11 @@ def main(): parser.add_argument("--out_file", default="build/result/predictions.json", help="Path to output predictions file") parser.add_argument("--features_cache_file", default="eval_features.pickle", help="Path to features' cache file") parser.add_argument("--output_transposed", action="store_true", help="Transpose the output") + parser.add_argument("--output_dtype", default="float16", choices=dtype_map.keys(), help="Output data type") args = parser.parse_args() + output_dtype = dtype_map[args.output_dtype] + print("Reading examples...") eval_examples = read_squad_examples(input_file=args.val_data, is_training=False, version_2_with_negative=False) @@ -374,13 +387,14 @@ def append_feature(feature): pickle.dump(eval_features, cache_file) print("Loading LoadGen logs...") - results = load_loadgen_log(args.log_file, eval_features, args.output_transposed) + results = load_loadgen_log(args.log_file, eval_features, output_dtype, args.output_transposed) print("Post-processing predictions...") write_predictions(eval_examples, eval_features, results, 20, 30, True, args.out_file) print("Evaluating predictions...") - cmd = "python3 build/data/evaluate-v1.1.py build/data/dev-v1.1.json build/result/predictions.json" + cmd = "python3 {:}/evaluate.py {:} {:}".format(os.path.dirname(__file__), + args.val_data, args.out_file) subprocess.check_call(cmd, shell=True) if __name__ == "__main__": diff --git a/v0.7/language/bert/evaluate.py b/v0.7/language/bert/evaluate.py new file mode 100644 index 000000000..0137fbca0 --- /dev/null +++ b/v0.7/language/bert/evaluate.py @@ -0,0 +1,94 @@ +""" Official evaluation script for v1.1 of the SQuAD dataset. 
""" +from __future__ import print_function +from collections import Counter +import string +import re +import argparse +import json +import sys + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return (normalize_answer(prediction) == normalize_answer(ground_truth)) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article['paragraphs']: + for qa in paragraph['qas']: + total += 1 + if qa['id'] not in predictions: + message = 'Unanswered question ' + qa['id'] + \ + ' will receive score 0.' 
+ print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x['text'], qa['answers'])) + prediction = predictions[qa['id']] + exact_match += metric_max_over_ground_truths( + exact_match_score, prediction, ground_truths) + f1 += metric_max_over_ground_truths( + f1_score, prediction, ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + + return {'exact_match': exact_match, 'f1': f1} + + +if __name__ == '__main__': + expected_version = '1.1' + parser = argparse.ArgumentParser( + description='Evaluation for SQuAD ' + expected_version) + parser.add_argument('dataset_file', help='Dataset file') + parser.add_argument('prediction_file', help='Prediction File') + args = parser.parse_args() + with open(args.dataset_file) as dataset_file: + dataset_json = json.load(dataset_file) + if (dataset_json['version'] != expected_version): + print('Evaluation expects v-' + expected_version + + ', but got dataset with v-' + dataset_json['version'], + file=sys.stderr) + dataset = dataset_json['data'] + with open(args.prediction_file) as prediction_file: + predictions = json.load(prediction_file) + print(json.dumps(evaluate(dataset, predictions))) diff --git a/v0.7/speech_recognition/rnnt/accuracy_eval.py b/v0.7/speech_recognition/rnnt/accuracy_eval.py index a1a12a7ad..6ac13e98c 100644 --- a/v0.7/speech_recognition/rnnt/accuracy_eval.py +++ b/v0.7/speech_recognition/rnnt/accuracy_eval.py @@ -6,9 +6,9 @@ import sys import os -from QSL import AudioQSL +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pytorch")) -sys.path.insert(0, os.path.join(os.getcwd(), "pytorch")) +from QSL import AudioQSL from helpers import process_evaluation_epoch, __gather_predictions from parts.manifest import Manifest @@ -31,13 +31,19 @@ def main(): hypotheses = [] references = [] for result in results: - hypotheses.append(array.array('q', bytes.fromhex(result["data"])).tolist()) + hypotheses.append(array.array('b', bytes.fromhex(result["data"])).tolist()) references.append(manifest[result["qsl_idx"]]["transcript"]) - hypotheses = __gather_predictions([hypotheses], labels=labels) + + # Convert ASCII output into string + for idx in range(len(hypotheses)): + hypotheses[idx] = ''.join([chr(c) for c in hypotheses[idx]]) + references = __gather_predictions([references], labels=labels) + d = dict(predictions=hypotheses, transcripts=references) - print("Word Error Rate:", process_evaluation_epoch(d)) + wer = process_evaluation_epoch(d) + print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100)) if __name__ == '__main__': main() From 08fba1945ad5668aaf963db17da0d850ec7dfe85 Mon Sep 17 00:00:00 2001 From: Zhihan Date: Mon, 3 Aug 2020 23:58:57 -0700 Subject: [PATCH 05/22] Address review comment --- v0.7/language/bert/accuracy-squad.py | 4 ++-- .../bert/{evaluate.py => evaluate-v1.1.py} | 14 ++++++++++++++ v0.7/speech_recognition/rnnt/accuracy_eval.py | 9 ++++++++- 3 files changed, 24 insertions(+), 3 deletions(-) rename v0.7/language/bert/{evaluate.py => evaluate-v1.1.py} (84%) diff --git a/v0.7/language/bert/accuracy-squad.py b/v0.7/language/bert/accuracy-squad.py index 2723a2692..113e1c8d8 100644 --- a/v0.7/language/bert/accuracy-squad.py +++ b/v0.7/language/bert/accuracy-squad.py @@ -346,7 +346,7 @@ def main(): parser.add_argument("--out_file", default="build/result/predictions.json", help="Path to output predictions file") parser.add_argument("--features_cache_file", default="eval_features.pickle", help="Path to features' cache file") 
parser.add_argument("--output_transposed", action="store_true", help="Transpose the output") - parser.add_argument("--output_dtype", default="float16", choices=dtype_map.keys(), help="Output data type") + parser.add_argument("--output_dtype", default="float32", choices=dtype_map.keys(), help="Output data type") args = parser.parse_args() output_dtype = dtype_map[args.output_dtype] @@ -393,7 +393,7 @@ def append_feature(feature): write_predictions(eval_examples, eval_features, results, 20, 30, True, args.out_file) print("Evaluating predictions...") - cmd = "python3 {:}/evaluate.py {:} {:}".format(os.path.dirname(__file__), + cmd = "python3 {:}/evaluate-v1.1.py {:} {:}".format(os.path.dirname(__file__), args.val_data, args.out_file) subprocess.check_call(cmd, shell=True) diff --git a/v0.7/language/bert/evaluate.py b/v0.7/language/bert/evaluate-v1.1.py similarity index 84% rename from v0.7/language/bert/evaluate.py rename to v0.7/language/bert/evaluate-v1.1.py index 0137fbca0..c582e6877 100644 --- a/v0.7/language/bert/evaluate.py +++ b/v0.7/language/bert/evaluate-v1.1.py @@ -1,3 +1,17 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Source: https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py + """ Official evaluation script for v1.1 of the SQuAD dataset. """ from __future__ import print_function from collections import Counter diff --git a/v0.7/speech_recognition/rnnt/accuracy_eval.py b/v0.7/speech_recognition/rnnt/accuracy_eval.py index 6ac13e98c..efb6a7927 100644 --- a/v0.7/speech_recognition/rnnt/accuracy_eval.py +++ b/v0.7/speech_recognition/rnnt/accuracy_eval.py @@ -12,12 +12,19 @@ from helpers import process_evaluation_epoch, __gather_predictions from parts.manifest import Manifest +dtype_map = { + "int8": 'b', + "int16": 'h', + "int32": 'l', + "int64": 'q', +} def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", required=True) parser.add_argument("--dataset_dir", required=True) parser.add_argument("--manifest", required=True) + parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type") args = parser.parse_args() return args @@ -31,7 +38,7 @@ def main(): hypotheses = [] references = [] for result in results: - hypotheses.append(array.array('b', bytes.fromhex(result["data"])).tolist()) + hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) references.append(manifest[result["qsl_idx"]]["transcript"]) # Convert ASCII output into string From b3a33224d181c6e75b207b47bf4c5b034c2f650f Mon Sep 17 00:00:00 2001 From: mnaumovfb Date: Wed, 5 Aug 2020 01:01:44 -0700 Subject: [PATCH 06/22] Adjusting README with more precise benchmark commands. 
--- v0.5/recommendation/README.md | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/v0.5/recommendation/README.md b/v0.5/recommendation/README.md index b16d2f991..4e178aaab 100755 --- a/v0.5/recommendation/README.md +++ b/v0.5/recommendation/README.md @@ -6,7 +6,7 @@ This is the reference implementation for MLPerf Inference benchmarks. | name | framework | acc. | AUC | dataset | weights | size | prec. | notes | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| dlrm (debugging) | PyTorch | 78.9% | N/A | [Criteo KaggleDAC](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/) | N/A | ~1GB | fp32 | | +| dlrm (debugging) | PyTorch | 78.82% | N/A | [Criteo KaggleDAC](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/) | N/A | ~1GB | fp32 | | | dlrm (debugging) | PyTorch | 81.07% | N/A | [Criteo Terabyte](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) | [pytorch](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt), [onnx](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.onnx.tar) | ~10GB | fp32 | --max-ind-range=10000000 --data-sub-sample-rate=0.875 | | dlrm (official) | PyTorch | N/A | 80.25% | [Criteo Terabyte](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) | [pytorch](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt), [onnx](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.onnx.tar) | ~100GB | fp32 | --max-ind-range=40000000 | @@ -181,21 +181,38 @@ options are extra arguments that are passed along For example, to run on CPU you may choose to use: 1. Criteo Kaggle DAC +Offline scenario perf and accuracy modes +``` +./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-to-aggregate-fix=2048 --max-batchsize=2048 +./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-to-aggregate-fix=2048 --max-batchsize=2048 --samples-per-query-offline=1 --accuracy +``` +Server scenario perf mode ``` -./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy ./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 ``` 2. Criteo Terabyte (0.875) +Offline scenario perf and accuracy modes +``` +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-fix=2048 --max-batchsize=2048 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --samples-per-query-offline=1 --accuracy [--mlperf-bin-loader] +``` +Server scenario perf mode ``` -./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy [--mlperf-bin-loader] -./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 [--mlperf-bin-loader] ``` + 3. 
Criteo Terabyte +Offline scenario perf and accuracy modes +``` +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-fix=2048 --max-batchsize=2048 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --samples-per-query-offline=1 --accuracy [--mlperf-bin-loader] ``` -./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy [--mlperf-bin-loader] -./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 +Server scenario perf mode ``` +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 [--mlperf-bin-loader] +``` + Note that the code support (i) original and (ii) mlperf binary loader, that have slightly different performance characteristics. The latter loader can be enabled by adding `--mlperf-bin-loader` to the command line. Note that this script will pre-process the data during the first run and reuse it over sub-sequent runs. The pre-processing of data can take a significant amount of time during the first run. From be56bbde1172c7fe72c2285e003c72322305cae8 Mon Sep 17 00:00:00 2001 From: mnaumovfb Date: Wed, 5 Aug 2020 01:04:07 -0700 Subject: [PATCH 07/22] Adding blank lines for formating purposes. --- v0.5/recommendation/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/v0.5/recommendation/README.md b/v0.5/recommendation/README.md index 4e178aaab..1f60f54ac 100755 --- a/v0.5/recommendation/README.md +++ b/v0.5/recommendation/README.md @@ -181,6 +181,7 @@ options are extra arguments that are passed along For example, to run on CPU you may choose to use: 1. Criteo Kaggle DAC + Offline scenario perf and accuracy modes ``` ./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-to-aggregate-fix=2048 --max-batchsize=2048 @@ -192,6 +193,7 @@ Server scenario perf mode ``` 2. Criteo Terabyte (0.875) + Offline scenario perf and accuracy modes ``` ./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-fix=2048 --max-batchsize=2048 [--mlperf-bin-loader] @@ -203,6 +205,7 @@ Server scenario perf mode ``` 3. Criteo Terabyte + Offline scenario perf and accuracy modes ``` ./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-fix=2048 --max-batchsize=2048 [--mlperf-bin-loader] From 46b605fa1da2a0e7bbf479b5a271d103981b0fb3 Mon Sep 17 00:00:00 2001 From: mnaumovfb Date: Wed, 5 Aug 2020 21:35:40 -0700 Subject: [PATCH 08/22] Adding accuracy mode command to Server scenario per suggestions. 
--- v0.5/recommendation/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/v0.5/recommendation/README.md b/v0.5/recommendation/README.md index 1f60f54ac..615962f81 100755 --- a/v0.5/recommendation/README.md +++ b/v0.5/recommendation/README.md @@ -187,9 +187,10 @@ Offline scenario perf and accuracy modes ./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-to-aggregate-fix=2048 --max-batchsize=2048 ./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-to-aggregate-fix=2048 --max-batchsize=2048 --samples-per-query-offline=1 --accuracy ``` -Server scenario perf mode +Server scenario perf and accuracy modes ``` ./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 +./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --accuracy ``` 2. Criteo Terabyte (0.875) @@ -199,9 +200,10 @@ Offline scenario perf and accuracy modes ./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-fix=2048 --max-batchsize=2048 [--mlperf-bin-loader] ./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --samples-per-query-offline=1 --accuracy [--mlperf-bin-loader] ``` -Server scenario perf mode +Server scenario perf and accuracy modes ``` ./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --accuracy [--mlperf-bin-loader] ``` 3. Criteo Terabyte @@ -211,9 +213,10 @@ Offline scenario perf and accuracy modes ./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-fix=2048 --max-batchsize=2048 [--mlperf-bin-loader] ./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --samples-per-query-offline=1 --accuracy [--mlperf-bin-loader] ``` -Server scenario perf mode +Server scenario perf and accuracy modes ``` ./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --accuracy [--mlperf-bin-loader] ``` Note that the code support (i) original and (ii) mlperf binary loader, that have slightly different performance characteristics. The latter loader can be enabled by adding `--mlperf-bin-loader` to the command line. 
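For reference, the four Kaggle DAC runs above (performance and accuracy for each scenario) can be scripted with a small driver like the following. This is an illustrative sketch only, assuming it is launched from the `v0.5/recommendation` directory where `run_local.sh` lives; the flags are copied verbatim from the README commands:

```python
import subprocess

BASE = ["./run_local.sh", "pytorch", "dlrm", "kaggle", "cpu"]

RUNS = [
    # (scenario, extra flags) -- taken from the README commands above.
    ("Offline", ["--samples-to-aggregate-fix=2048", "--max-batchsize=2048"]),
    ("Offline", ["--samples-to-aggregate-fix=2048", "--max-batchsize=2048",
                 "--samples-per-query-offline=1", "--accuracy"]),
    ("Server", ["--samples-to-aggregate-quantile-file=./tools/dist_quantile.txt",
                "--max-batchsize=2048"]),
    ("Server", ["--samples-to-aggregate-quantile-file=./tools/dist_quantile.txt",
                "--max-batchsize=2048", "--accuracy"]),
]

for scenario, flags in RUNS:
    # One performance run and one accuracy run per scenario.
    subprocess.run(BASE + ["--scenario", scenario] + flags, check=True)
```

The Terabyte variants follow the same pattern, adding `--max-ind-range` (and, for the 0.875 subset, `--data-sub-sample-rate`) plus the optional `--mlperf-bin-loader` switch.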
From 0cf2c57dce85b9d30765a1eac8e0204da08d71ce Mon Sep 17 00:00:00 2001 From: Zhihan Date: Thu, 6 Aug 2020 10:37:08 -0700 Subject: [PATCH 09/22] Revert rnnt script to use standard label conversion --- v0.7/speech_recognition/rnnt/accuracy_eval.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/v0.7/speech_recognition/rnnt/accuracy_eval.py b/v0.7/speech_recognition/rnnt/accuracy_eval.py index efb6a7927..ea8179285 100644 --- a/v0.7/speech_recognition/rnnt/accuracy_eval.py +++ b/v0.7/speech_recognition/rnnt/accuracy_eval.py @@ -41,11 +41,8 @@ def main(): hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) references.append(manifest[result["qsl_idx"]]["transcript"]) - # Convert ASCII output into string - for idx in range(len(hypotheses)): - hypotheses[idx] = ''.join([chr(c) for c in hypotheses[idx]]) - references = __gather_predictions([references], labels=labels) + hypotheses = __gather_predictions([hypotheses], labels=labels) d = dict(predictions=hypotheses, transcripts=references) From 7b5ea5e3b77213b9bdbada34d6e7e0f91da80ead Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Fri, 7 Aug 2020 09:43:08 -0700 Subject: [PATCH 10/22] move to top level --- .../submission/submission-checker.py | 0 v0.5/tools/submission/submission-to-csv.py | 180 ------------------ v0.5/tools/submission/system_desc_id.json | 37 ---- v0.5/tools/submission/system_desc_id_imp.json | 7 - 4 files changed, 224 deletions(-) rename {v0.5/tools => tools}/submission/submission-checker.py (100%) delete mode 100644 v0.5/tools/submission/submission-to-csv.py delete mode 100755 v0.5/tools/submission/system_desc_id.json delete mode 100755 v0.5/tools/submission/system_desc_id_imp.json diff --git a/v0.5/tools/submission/submission-checker.py b/tools/submission/submission-checker.py similarity index 100% rename from v0.5/tools/submission/submission-checker.py rename to tools/submission/submission-checker.py diff --git a/v0.5/tools/submission/submission-to-csv.py b/v0.5/tools/submission/submission-to-csv.py deleted file mode 100644 index 122cc2d85..000000000 --- a/v0.5/tools/submission/submission-to-csv.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Tool to create a csv file from a mlperf inference submission directory -""" - -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import collections -import json -import logging -import os -import re -import sys -import time - -# pylint: disable=missing-docstring - - -logging.basicConfig(level=logging.INFO) -log = logging.getLogger("main") - -VALID_MODELS = ["ssd-small", "ssd-large", "mobilenet", "resnet", "gnmt"] -VALID_DIVISIONS = ["open", "closed"] - - -def get_args(): - """Parse commandline.""" - parser = argparse.ArgumentParser() - parser.add_argument("--input", required=True, help="submission directory") - parser.add_argument("--output", help="output") - parser.add_argument("--submitter", help="filter to submitter") - args = parser.parse_args() - return args - - -def list_dir(*path): - path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))] - - -def list_files(*path): - path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] - - -def split_path(m): - return m.replace("\\", "/").split("/") - - -def model_map(model): - if model.startswith("mobilenet"): - model = "mobilenet" - elif model.startswith("rcnn"): - model = "ssd-small" - elif 
model.startswith("resnet50"): - model = "resnet" - elif model.startswith("ssdlite") or model.startswith("ssd-inception") or model.startswith("yolo") or \ - model.startswith("ssd-mobilenet") or model.startswith("ssd-resnet50"): - model = "ssd-small" - if model not in VALID_MODELS: - model = None - return model - - -def get_accuracy(model, dir): - is_valid = False - acc = 0 - # look for: accuracy=... or mAP=... - with open(os.path.join(dir, "accuracy.txt"), "r") as f: - for line in f: - m = re.match("^accuracy=([\d\.]+).*", line) - if m: - acc = m.group(1) - break - m = re.match("^mAP=([\d\.]+).*", line) - if m: - acc = m.group(1) - break - m = re.match("^BLEU\:\s*([\d\.]+).*", line) - if m: - acc = m.group(1) - break - return float(acc) - - -RESULT_VALUE = { - "Offline": "Samples per second", - "SingleStream": "90th percentile latency (ns)", - "MultiStream": "Samples per query", - "Server": "Scheduled samples per second" -} - -TOMS = 1000 * 1000 - - -def get_performance(model, scenario, dir, kv): - rt = {} - # look for: Result is: VALID - fname = os.path.join(dir, "mlperf_log_summary.txt") - with open(fname, "r") as f: - for line in f: - m = re.match("^\s*([\w\s.\(\)\/]+)\s*\:\s*([\w\+\.]+).*", line) - if m: - rt[m.group(1).strip()] = m.group(2).strip() - - if scenario == "singlestream": - scenario = "SingleStream" - if scenario == "server": - scenario = "Server" - if scenario == "offline": - scenario = "Offline" - if scenario == "multistream": - scenario = "MultiStream" - kv["scenario"] = scenario - res = float(rt[RESULT_VALUE[scenario]]) - if scenario in ["SingleStream"]: - res /= TOMS - kv["result"] = res - kv["p50"] = float(rt["50.00 percentile latency (ns)"]) / TOMS - kv["p90"] = float(rt["90.00 percentile latency (ns)"]) / TOMS - kv["p99"] = float(rt["99.00 percentile latency (ns)"]) / TOMS - - -def walk_results_dir(dir, filter_submitter, results): - for division in list_dir("."): - if division not in ["closed", "open"]: - continue - for submitter in list_dir(division): - if "example" in submitter: - continue - if filter_submitter and submitter != filter_submitter: - continue - results_path = os.path.join(division, submitter, "results") - if not os.path.exists(results_path): - log.warning("no submission in {}/{}".format(division, submitter)) - continue - for system_desc in list_dir(results_path): - # check if system_id is good. Report failure for each model/scenario. 
- for model in list_dir(results_path, system_desc): - try: - model_norm = model_map(model) - for scenario in list_dir(results_path, system_desc, model): - name = os.path.join(results_path, system_desc, model, scenario).replace("\\", "/") - nn = os.path.join(submitter, division, system_desc, model) - kv = {"name": nn, "model": model_norm, "system": system_desc, - "division": division, "submitter": submitter} - acc_path = os.path.join(name, "accuracy") - if not os.path.exists(os.path.join(acc_path, "accuracy.txt")): - log.error("{} has no accuracy.txt".format(acc_path)) - kv["acc"] = get_accuracy(model, acc_path) - n = ["1"] - for i in n: - perf_path = os.path.join(name, "performance", "run_" + str(i)) - get_performance(model_norm, scenario, perf_path, kv) - results.append(kv) - except Exception as ex: - log.error("{}, {}".format(name, ex)) - - -def main(): - args = get_args() - - os.chdir(args.input) - - results = [] - walk_results_dir(args.input, args.submitter, results) - columns = ['name', 'model', 'system', 'division', 'submitter', 'acc', 'scenario', 'result', - 'p50', 'p90', 'p99'] - if args.output: - with open(args.output, "w") as f: - f.write(",".join(columns) + "\n") - for r in results: - col = [str(r[c]) for c in columns] - f.write(",".join(col) + "\n") - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/v0.5/tools/submission/system_desc_id.json b/v0.5/tools/submission/system_desc_id.json deleted file mode 100755 index 9c792614c..000000000 --- a/v0.5/tools/submission/system_desc_id.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "division": "reqired", - "submitter": "required", - "status": "required", - "system_name": "required", - - "number_of_nodes": "required", - "host_processor_model_name": "required", - "host_processors_per_node": "required", - "host_processor_core_count": "required", - "host_processor_frequency": "", - "host_processor_caches": "", - "host_memory_configuration": "", - "host_memory_capacity": "required", - "host_storage_capacity": "required", - "host_storage_type": "required", - "host_processor_interconnect": "", - "host_networking": "", - "host_networking_topology": "", - - "accelerators_per_node": "required", - "accelerator_model_name": "required", - "accelerator_frequency": "", - "accelerator_host_interconnect": "", - "accelerator_interconnect": "", - "accelerator_interconnect_topology": "", - "accelerator_memory_capacity": "required", - "accelerator_memory_configuration": "", - "accelerator_on-chip_memories": "", - "cooling": "", - "hw_notes": "", - - "framework": "required", - "operating_system": "required", - "other_software_stack": "required", - "sw_notes": "" -} diff --git a/v0.5/tools/submission/system_desc_id_imp.json b/v0.5/tools/submission/system_desc_id_imp.json deleted file mode 100755 index c0734b177..000000000 --- a/v0.5/tools/submission/system_desc_id_imp.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "input_data_types": "required", - "retraining": "required", - "starting_weights_filename": "required", - "weight_data_types": "required", - "weight_transformations": "required" -} From 14049ecb852b3a013f75013cf88e848a8d4be4b1 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Fri, 7 Aug 2020 09:44:15 -0700 Subject: [PATCH 11/22] v0.7 submission checker --- tools/submission/submission-checker.py | 574 +++++++++++++++---------- 1 file changed, 351 insertions(+), 223 deletions(-) diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py index ef6e8f136..e59c95737 100755 --- 
a/tools/submission/submission-checker.py +++ b/tools/submission/submission-checker.py @@ -7,13 +7,11 @@ from __future__ import unicode_literals import argparse -import collections import json import logging import os import re import sys -import time # pylint: disable=missing-docstring @@ -21,69 +19,190 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("main") -VALID_MODELS = ["ssd-small", "ssd-large", "mobilenet", "resnet", "gnmt"] + +MODEL_CONFIG = { + "v0.5": { + "models": ["ssd-small", "ssd-large", "mobilenet", "resnet", "gnmt"], + "required-scenarios-datacenter": { + # anything goes + }, + "required-scenarios-edge": { + # anything goes + }, + "accuracy-target": { + "mobilenet": ("acc", 71.68 * 0.98), + "resnet": ("acc", 76.46 * 0.99), + "ssd-small": ("mAP", 22 * 0.99), + "ssd-large": ("mAP", 20 * 0.99), + "gnmt": ("bleu", 23.9 * 0.99), + }, + "performance-sample-count": { + "mobilenet": 1024, + "resnet": 1024, + "ssd-small": 256, + "ssd-large": 64, + "gnmt": 3903900, + }, + "seeds": { + "qsl_rng_seed": 3133965575612453542, + "sample_index_rng_seed": 665484352860916858, + "schedule_rng_seed": 3622009729038561421, + }, + }, + "v0.7": { + "models": ["ssd-large", "resnet", "rnnt", "3d-unet", "dlrm", "bert"], + "required-scenarios-datacenter": { + "resnet": ["Server", "Offline"], + "ssd-large": ["Server", "Offline"], + "rnnt": ["Server", "Offline"], + "bert": ["Server", "Offline"], + "dlrm": ["Server", "Offline"], + "3d-unet": ["Offline"], + }, + "required-scenarios-edge": { + "resnet": ["Server", "Offline"], + "ssd-large": ["Server", "Offline"], + "rnnt": ["Server", "Offline"], + "bert": ["Server", "Offline"], + "dlrm": ["Server", "Offline"], + "3d-unet": ["Offline"], + }, + "accuracy-target": { + "resnet": ("acc", 76.46 * 0.99), + "ssd-large": ("mAP", 20 * 0.99), + "rnnt": ("WER", 7.452 * 0.99), + "bert": ("F1", [90.874 * 0.99, 90.874 * 0.999]), + "dlrm": ("AUC", [76.46 * 0.99, 76.46 * 0.999]), + "3d-unet": ("mean", [0.853 * 0.99, 0.853 * 0.999]), + }, + "performance-sample-count": { + "ssd-large": 64, + "resnet": 1024, + "rnnt": 2513, + "bert": 3903900, + "dlrm": 204800, + "3d-unet": 16, + }, + "seeds": { + "qsl_rng_seed": 3133965575612453542, + "sample_index_rng_seed": 665484352860916858, + "schedule_rng_seed": 3622009729038561421, + }, + }, +} + VALID_DIVISIONS = ["open", "closed"] REQUIRED_PERF_FILES = ["mlperf_log_accuracy.json", "mlperf_log_summary.txt", "mlperf_log_detail.txt"] REQUIRED_ACC_FILES = REQUIRED_PERF_FILES + ["accuracy.txt"] REQUIRED_MEASURE_FILES = ["mlperf.conf", "user.conf", "README.md"] -TOMS = 1000 * 1000 - - -PERFORMANCE_SAMPLE_COUNT = { - "mobilenet": 1024, - "resnet50": 1024, - "resnet": 1024, - "ssd-mobilenet": 256, - "ssd-small": 256, - "ssd-resnet34": 64, - "ssd-large": 64, - "gnmt": 3903900, -} +TO_MS = 1000 * 1000 -ACCURAY_TARGET = { - "mobilenet": 71.68 * 0.98, - "resnet50": 76.46 * 0.99, - "resnet": 76.46 * 0.99, - "ssd-mobilenet": 22 * 0.99, - "ssd-small": 22 * 0.99, - "ssd-resnet34": 20 * 0.99, - "ssd-large": 20 * 0.99, - "gnmt": 23.9 * 0.99, +MODEL_MAPPING = { + "ssd-mobilenet": "ssd-small", + "ssd-resnet34": "ssd-large", + "resnet50": "resnet" } -SEEDS = { - "qsl_rng_seed": 3133965575612453542, - "sample_index_rng_seed": 665484352860916858, - "schedule_rng_seed": 3622009729038561421 -} - -RESULT_VALUE = { +RESULT_FIELD = { "Offline": "Samples per second", "Single": "90th percentile latency (ns)", "Multi": "Samples per query", "Server": "Scheduled samples per second" } +ACC_PATTERN = { + "acc": r"^accuracy=([\d\.]+).*", + "AUC": 
r"^AUC=([\d\.]+).*", + "mAP": r"^mAP=([\d\.]+).*", + "bleu": r"^BLEU\:\s*([\d\.]+).*", + "F1": r"^{\"exact_match\"\:\s*[\d\.]+,\s*\"f1\"\:\s*([\d\.]+)}", + "WER": r"Word Error Rate\:\s*([\d\.]+).*", + "mean": r"Accuracy\:\s*mean\s*=\s*([\d\.]+).*", +} + +SYSTEM_DESC_REQUIRED_FIELDS = [ + "division", "submitter", "status", "system_name", "number_of_nodes", "host_processor_model_name", + "host_processors_per_node", "host_processor_core_count", "host_memory_capacity", "host_storage_capacity", + "host_storage_type", "accelerators_per_node", "accelerator_model_name", "accelerator_memory_capacity", + "framework", "operating_system" +] + +SYSTEM_DESC_OPTIONAL_FIELDS = [ + "system_type", "other_software_stack", "host_processor_frequency", "host_processor_caches", + "host_memory_configuration", "host_processor_interconnect", "host_networking", "host_networking_topology", + "accelerator_frequency", "accelerator_host_interconnect", "accelerator_interconnect", + "accelerator_interconnect_topology", "accelerator_memory_configuration", + "accelerator_on-chip_memories", "cooling", "hw_notes", "sw_notes" +] + +SYSTEM_IMP_REQUIRED_FILES = [ + "input_data_types", "retraining", "starting_weights_filename", "weight_data_types", + "weight_transformations", +] + + +class Config(): + """Select config value by mlperf version and submission type.""" + def __init__(self, version): + self.base = MODEL_CONFIG.get(version) + self.version = version + self.models = self.base["models"] + self.seeds = self.base["seeds"] + self.accuracy_target = self.base["accuracy-target"] + self.performance_sample_count = self.base["performance-sample-count"] + + def set_type(self, submission_type): + if submission_type is None and self.version in ["v0.5"]: + return + elif submission_type == "datacenter": + self.required = self.base["required-scenarios-datacenter"] + elif submission_type == "edge": + self.required = self.base["required-scenarios-edge"] + else: + raise ValueError("innvalid system type") + + def get_required(self, model): + if self.version in ["v0.5"]: + return set() + if model not in self.required: + raise ValueError("model not known: " + model) + return set(self.required[model]) + + def get_accuracy_target(self, model): + if model not in self.accuracy_target: + raise ValueError("model not known: " + model) + return self.accuracy_target[model] + + def get_performance_sample_count(self, model): + if model not in self.performance_sample_count: + raise ValueError("model not known: " + model) + return self.performance_sample_count[model] + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() parser.add_argument("--input", required=True, help="submission directory") + parser.add_argument("--version", default="v0.7", choices=list(MODEL_CONFIG.keys()), help="mlperf version") parser.add_argument("--submitter", help="filter to submitter") + parser.add_argument("--csv", default="summary.csv", help="csv file with results") args = parser.parse_args() return args -def model_map(model): +def model_map(config, model): + """Map models names to the official mlperf name.""" + if model in config.models: + return model + if model in MODEL_MAPPING: + return MODEL_MAPPING[model] if model.startswith("mobilenet"): model = "mobilenet" elif model.startswith("rcnn"): model = "ssd-small" - elif model.startswith("ssdlite") or model.startswith("ssd-inception") or model.startswith("yolo") or \ + elif model.startswith("ssdlite") or model.startswith("ssd-inception") or model.startswith("yolo") or \ model.startswith("ssd-mobilenet") 
or model.startswith("ssd-resnet50"): model = "ssd-small" - if model not in PERFORMANCE_SAMPLE_COUNT: - model = None return model @@ -113,42 +232,36 @@ def ignore_errors(line): return False -def check_accuracy_dir(model, dir): +def check_accuracy_dir(config, model, dir): is_valid = False - acc = 0 - # look for: accuracy=... or mAP=... + acc = None + model_norm = model_map(config, model) + acc_type, acc_target = config.get_accuracy_target(model_norm) + if not isinstance(acc_target, list): + acc_target = [acc_target] + acc_target = list(sorted(acc_target, reverse=True)) + pattern = ACC_PATTERN[acc_type] with open(os.path.join(dir, "accuracy.txt"), "r") as f: for line in f: - m = re.match("^accuracy=([\d\.]+).*", line) - if m: - is_valid = True - acc = m.group(1) - break - m = re.match("^mAP=([\d\.]+).*", line) + m = re.match(pattern, line) if m: is_valid = True acc = m.group(1) break - m = re.match("^BLEU\:\s*([\d\.]+).*", line) - if m: + + if acc: + for a in acc_target: + if float(acc) >= a: is_valid = True - acc = m.group(1) break - - if is_valid: - model_norm = model_map(model) - if model_norm: - target_acc = ACCURAY_TARGET[model_norm] - if float(acc) < target_acc: - log.error("{} accuracy not met: {:.2f}/{}".format(dir, target_acc, acc)) - is_valid = False - else: - log.error("{} unknown model, can't find target accuracy".format(dir)) + if not is_valid: + log.error("%s accuracy not met: expected=%f, found=%f", dir, acc_target, acc) # check if there are any errors in the detailed log fname = os.path.join(dir, "mlperf_log_detail.txt") if not os.path.exists(fname): - log.warning("{} missing".format(fname)) + log.error("%s is missing", fname) + is_valid = False else: with open(fname, "r") as f: for line in f: @@ -157,31 +270,30 @@ def check_accuracy_dir(model, dir): if ignore_errors(line): continue # TODO: should this be a failed run? - log.warning("{} contains error: {}".format(fname, line)) - return is_valid + log.error("%s contains error: %s", fname, line) + is_valid = False + return is_valid, acc -def check_performance_dir(model, dir): +def check_performance_dir(config, model, dir): is_valid = False rt = {} # look for: Result is: VALID fname = os.path.join(dir, "mlperf_log_summary.txt") with open(fname, "r") as f: for line in f: - m = re.match("^Result\s+is\s*\:\s+VALID", line) + m = re.match(r"^Result\s+is\s*\:\s+VALID", line) if m: is_valid = True - m = re.match("^\s*([\w\s.\(\)\/]+)\s*\:\s*([\w\+\.]+).*", line) + m = re.match(r"^\s*([\w\s.\(\)\/]+)\s*\:\s*([\w\+\.]+).*", line) if m: rt[m.group(1).strip()] = m.group(2).strip() - model = model_map(model) - if model in PERFORMANCE_SAMPLE_COUNT: - if int(rt['performance_sample_count']) < PERFORMANCE_SAMPLE_COUNT[model]: - log.error("{} performance_sample_count should be {}".format(fname, PERFORMANCE_SAMPLE_COUNT[model])) - is_valid = False - else: - log.error("{} performance_sample_count not checked, bad model name {}".format(fname, model)) + model = model_map(config, model) + performance_sample_count = config.get_performance_sample_count(model) + if int(rt['performance_sample_count']) < performance_sample_count: + log.error("%s performance_sample_count should be %d", fname, performance_sample_count) + is_valid = False # check if there are any errors in the detailed log fname = os.path.join(dir, "mlperf_log_detail.txt") @@ -191,17 +303,17 @@ def check_performance_dir(model, dir): if "ERROR" in line: if ignore_errors(line): continue - # TODO: does this make the run fail? 
- log.warning("{} contains error: {}".format(fname, line)) + log.error("%s contains error: %s", fname, line) + is_valid = False for seed in ["qsl_rng_seed", "sample_index_rng_seed", "schedule_rng_seed"]: - if int(rt[seed]) != SEEDS[seed]: - log.error("{} {} wrong, {}/{}".format(fname, seed, rt[seed], SEEDS[seed])) + if int(rt[seed]) != config.seeds[seed]: + log.error("%s %s is wrong, expected=%s, found=%s", fname, seed, config.seeds[seed], rt[seed]) scenario = rt["Scenario"] - res = float(rt[RESULT_VALUE[scenario]]) + res = float(rt[RESULT_FIELD[scenario]]) if scenario in ["Single Stream"]: - res /= TOMS + res /= TO_MS return is_valid, res @@ -221,200 +333,216 @@ def files_diff(list1, list2): return [] -def check_results_dir(dir, filter_submitter): - good_submissions = [] - bad_submissions = {} +def check_results_dir(config, dir, filter_submitter, csv): + head = ["Organization", "Availability", "Division", "Platform", "Model", "Scenario", "Result", "Accuracy", "Location"] + fmt = ",".join(["{}"] * len(head)) + "\n" + csv.write(",".join(head) + "\n") results = {} for division in list_dir("."): - if division not in ["closed", "open"]: + if division not in VALID_DIVISIONS: + log.error("invalid division in input dir %s", division) continue + is_closed = division == "closed" for submitter in list_dir(division): if filter_submitter and submitter != filter_submitter: continue results_path = os.path.join(division, submitter, "results") if not os.path.exists(results_path): - log.warning("no submission in {}/{}".format(division, submitter)) + log.error("no submission in %s", results_path) + results[results_path] = None continue + for system_desc in list_dir(results_path): - # check if system_id is good. Report failure for each model/scenario. + + # + # check if system_id is good. + # system_id_json = os.path.join(division, submitter, "systems", system_desc + ".json") - device_bad = not os.path.exists(system_id_json) + if not os.path.exists(system_id_json): + log.error("no system_desc for %s/%s/%s", division, submitter, system_desc) + results[os.path.join(results_path, system_desc)] = None + continue + + name = os.path.join(results_path, system_desc) + with open(system_id_json) as f: + system_json = json.load(f) + system_type = system_json.get("system_type") + available = system_json.get("status") + if config.version == "v0.7" and system_type not in ["datacenter", "edge"]: + log.error("%s has invalid system type (%s)", system_id_json, system_type) + results[name] = None + continue + config.set_type(system_type) + if not check_system_desc_id(name, system_json, submitter, division): + results[name] = None + + # + # Look at each model + # for model in list_dir(results_path, system_desc): - if division in "closed" and model not in VALID_MODELS: - bad_submissions[os.path.join(system_desc, model)] = \ - "{} has an invalid model name {}".format(os.path.join(results_path, system_desc), model) + if is_closed and model not in config.models: + log.error("%s has a invalid model (%s) for closed division", name, model) + results[name] = None + continue + # + # Look at each scenario + # + required_scenarios = config.get_required(MODEL_MAPPING.get(model, model)) for scenario in list_dir(results_path, system_desc, model): name = os.path.join(results_path, system_desc, model, scenario) - results[name] = "NoResults" + results[name] = None + + # check if measurement_dir is good. 
+ measurement_dir = os.path.join(division, submitter, "measurements", + system_desc, model, scenario) + if not os.path.exists(measurement_dir): + log.error("no measurement_dir for %s", name) + results[measurement_dir] = None + else: + if not check_measurement_dir(measurement_dir, name, system_desc, + os.path.join(division, submitter), model, scenario): + log.error("measurement_dir %s has issues", measurement_dir) + results[measurement_dir] = None + + # check accuracy + accuracy_is_valid = False acc_path = os.path.join(name, "accuracy") if not os.path.exists(os.path.join(acc_path, "accuracy.txt")): log.error( - "{} has no accuracy.txt. Generate it with accuracy-imagenet.py or accuracy-coco.py or " - "process_accuracy.py".format(acc_path)) - bad_submissions[name] = "{} has no accuracy.txt".format(acc_path) + "%s has no accuracy.txt. Generate it with accuracy-imagenet.py or accuracy-coco.py or " + "process_accuracy.py", acc_path) else: diff = files_diff(list_files(acc_path), REQUIRED_ACC_FILES) if diff: - bad_submissions[name] = "{} has file list mismatch ({})".format(acc_path, diff) - if not check_accuracy_dir(model, acc_path): - bad_submissions[name] = "{} has issues".format(acc_path) - n = ["run_1"] + log.error("%s has file list mismatch (%s)", acc_path, diff) + accuracy_is_valid, acc = check_accuracy_dir(config, model, acc_path) + if accuracy_is_valid: + log.info("%s, accuracy is %s", acc_path, acc) + else: + log.error("%s, accuracy not valid", acc_path) + if scenario in ["Server"]: n = ["run_1", "run_2", "run_3", "run_4", "run_5"] - if not os.path.exists(os.path.join(name, "performance", n[0])): - n = ["run1"] - if not os.path.exists(os.path.join(name, "performance", n[0])): - n = ["."] - else: - if scenario in ["Server"]: - n = ["run1", "run2", "run3", "run4", "run5"] + else: + n = ["run_1"] for i in n: perf_path = os.path.join(name, "performance", i) if not os.path.exists(perf_path): - bad_submissions[name] = "{} missing".format(perf_path) + log.error("%s is missing", perf_path) continue diff = files_diff(list_files(perf_path), REQUIRED_PERF_FILES) if diff: - bad_submissions[name] = "{} has file list mismatch ({})".format(perf_path, diff) + log.error("%s has file list mismatch (%s)", perf_path, diff) try: - is_valid, results[name] = check_performance_dir(model, perf_path) - except Exception as ex: - is_valid, results[name] = False, "NoResults" - if not is_valid: - bad_submissions[name] = "{} has issues".format(perf_path) - if device_bad: - bad_submissions[name] = "{}: no such system id {}".format(name, system_desc) - else: - good_submissions.append(name) + is_valid, r = check_performance_dir(config, model, perf_path) + except: + is_valid, r = False, None + if is_valid: + results[name] = r + required_scenarios.discard(scenario) + else: + log.error("%s has issues", perf_path) - return good_submissions, bad_submissions, results + if results.get(name): + if accuracy_is_valid: + log.info("%s is OK", name) + csv.write(fmt.format(submitter, available, division, system_desc, model, scenario, + r, acc, name)) + else: + results[name] = None + log.error("%s is OK but accuracy has issues", name) + if required_scenarios: + name = os.path.join(results_path, system_desc, model) + results[name] = None + log.error("%s does not have all required scenarios, missing %s", name, required_scenarios) -def compare_json(fname, template, errors): - error_count = len(errors) - try: - with open(fname, "r") as f: - j = json.load(f) - # make sure all required sections/fields are there - for k, v in template.items(): 
- sz = j.get(k) - if sz is None and v == "required": - errors.append("{} field {} missing".format(fname, k)) - - # make sure no undefined sections/fields are in the meta data - for k, v in j.items(): - z = template.get(k) - if z is None: - errors.append("{} has unknwon field {}".format(fname, k)) - except Exception as ex: - errors.append("{} unexpected error {}".format(fname, ex)) - return error_count == len(errors) - - -def check_system_desc_id(good_submissions, systems_json): - errors = [] - checked = set() - for submission in good_submissions: - parts = split_path(submission) - system_desc = parts[3] - submitter = parts[1] - division = parts[0] - if division not in VALID_DIVISIONS: - errors.append(("{} has invalid division {}".format(submission, j["submitter"], division))) - continue - fname = os.path.join(parts[0], parts[1], "systems", system_desc + ".json") - if fname not in checked: - checked.add(fname) - if not compare_json(fname, systems_json, errors): - continue - with open(fname, "r") as f: - j = json.load(f) - if j["submitter"] != submitter: - errors.append(("{} has submitter {}, directory has {}".format(fname, j["submitter"], submitter))) - continue - if j["division"] != division: - errors.append(("{} has division {}, division has {}".format(fname, j["division"], division))) - continue - if errors: - for i in errors: - log.error(i) - return errors - - -def check_measurement_dir(good_submissions, systems_imp_json): - errors = [] - for submission in good_submissions: - parts = split_path(submission) - system_desc = parts[3] - measurement_dir = os.path.join(parts[0], parts[1], "measurements", system_desc) - if not os.path.exists(measurement_dir): - errors.append("{} directory missing".format(measurement_dir)) - continue - model = parts[4] - scenario = parts[5] - fname = os.path.join(measurement_dir, model, scenario) - files = list_files(fname) - system_file = None - for i in REQUIRED_MEASURE_FILES: - if i not in files: - errors.append("{} is missing {}".format(fname, i)) - for i in files: - if i.startswith(system_desc) and i.endswith("_" + scenario + ".json"): - system_file = i - end = len("_" + scenario + ".json") - break - elif i.startswith(system_desc) and i.endswith(".json"): - system_file = i - end = len(".json") - break - if system_file: - compare_json(os.path.join(fname, system_file), systems_imp_json, errors) - impl = system_file[len(system_desc) + 1:-end] - code_dir = os.path.join(parts[0], parts[1], "code", model, impl) - if not os.path.exists(code_dir): - errors.append("{} is missing".format(code_dir)) - else: - errors.append("{} is missing {}*.json".format(fname, system_desc)) + return results - if errors: - for i in errors: - log.error(i) - return errors +def check_system_desc_id(fname, systems_json, submitter, division): + is_valid = True + # check all required fields + for k in SYSTEM_DESC_REQUIRED_FIELDS: + if k not in systems_json: + is_valid = False + log.error("%s, field %s is missing", fname, k) + + all_fields = SYSTEM_DESC_REQUIRED_FIELDS + SYSTEM_DESC_OPTIONAL_FIELDS + for k in systems_json.keys(): + if k not in all_fields: + log.warning("%s, field %s is unknwon", fname, k) + + if systems_json.get("submitter") != submitter: + log.error("%s has submitter %s, directory has %s", fname, systems_json.get("submitter"), submitter) + is_valid = False + if systems_json.get("division") != division: + log.error("%s has division %s, division has %s", fname, systems_json.get("division"), division) + is_valid = False + return is_valid -def main(): - args = get_args() - 
script_path = os.path.dirname(sys.argv[0]) - with open(os.path.join(script_path, "system_desc_id.json"), "r") as f: - systems_json = json.load(f) - with open(os.path.join(script_path, "system_desc_id_imp.json"), "r") as f: - systems_imp_json = json.load(f) +def check_measurement_dir(measurement_dir, fname, system_desc, root, model, scenario): + files = list_files(measurement_dir) + system_file = None + is_valid = True + for i in REQUIRED_MEASURE_FILES: + if i not in files: + log.error("%s is missing %s", measurement_dir, i) + is_valid = False + for i in files: + if i.startswith(system_desc) and i.endswith("_" + scenario + ".json"): + system_file = i + end = len("_" + scenario + ".json") + break + elif i.startswith(system_desc) and i.endswith(".json"): + system_file = i + end = len(".json") + break + if system_file: + with open(os.path.join(measurement_dir, system_file), "r") as f: + j = json.load(f) + for k in SYSTEM_IMP_REQUIRED_FILES: + if k not in j: + is_valid = False + log.error("%s, field %s is missing", fname, k) + + impl = system_file[len(system_desc) + 1:-end] + code_dir = os.path.join(root, "code", model, impl) + if not os.path.exists(code_dir): + log.error("%s is missing %s*.json", fname, system_desc) + else: + log.error("%s is missing %s*.json", fname, system_desc) + + return is_valid + - os.chdir(args.input) +def main(): + args = get_args() - # 1. check results directory - good_submissions, bad_submissions, results = check_results_dir(args.input, args.submitter) + config = Config(args.version) - # 2. check the meta data under systems - meta_errors = check_system_desc_id(good_submissions, systems_json) + with open(args.csv, "w") as csv: + os.chdir(args.input) + # check results directory + results = check_results_dir(config, args.input, args.submitter, csv) - # 3. 
check measurement and code dir - measurement_errors = check_measurement_dir(good_submissions, systems_imp_json) + # log results with_results = 0 for k, v in results.items(): - if v == "NoResults": - log.error("NoResults {}".format(k)) + if v is None: + log.error("NoResults %s", k) else: - log.info("Results {} {}".format(k, v)) - with_results +=1 + log.info("Results %s %s", k, v) + with_results += 1 - log.info("Results={}, NoResults={}".format(with_results, len(results)-with_results)) - if bad_submissions or meta_errors or measurement_errors: + # print summary + log.info("Results=%d, NoResults=%d", with_results, len(results) - with_results) + if len(results) != with_results: # bad_submissions or meta_errors or measurement_errors: log.error("SUMMARY: submission has errors") return 1 else: From 4b3e320245b6db329f4b88b775c11e17bc8c7ed3 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Fri, 7 Aug 2020 11:08:19 -0700 Subject: [PATCH 12/22] take results with accuracy miss in open division --- tools/submission/submission-checker.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py index e59c95737..8c42399a7 100755 --- a/tools/submission/submission-checker.py +++ b/tools/submission/submission-checker.py @@ -245,7 +245,6 @@ def check_accuracy_dir(config, model, dir): for line in f: m = re.match(pattern, line) if m: - is_valid = True acc = m.group(1) break @@ -255,7 +254,7 @@ def check_accuracy_dir(config, model, dir): is_valid = True break if not is_valid: - log.error("%s accuracy not met: expected=%f, found=%f", dir, acc_target, acc) + log.error("%s accuracy not met: expected=%s, found=%s", dir, acc_target, acc) # check if there are any errors in the detailed log fname = os.path.join(dir, "mlperf_log_detail.txt") @@ -418,6 +417,11 @@ def check_results_dir(config, dir, filter_submitter, csv): if diff: log.error("%s has file list mismatch (%s)", acc_path, diff) accuracy_is_valid, acc = check_accuracy_dir(config, model, acc_path) + if not accuracy_is_valid and not is_closed: + log.warning("%s, accuracy not valid but taken for open", acc_path) + # TODO: is this correct? 
+ accuracy_is_valid = True + if accuracy_is_valid: log.info("%s, accuracy is %s", acc_path, acc) else: From 935b3d900158fdecfe4ffb3ddda66648d32317c5 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Fri, 7 Aug 2020 12:45:14 -0700 Subject: [PATCH 13/22] support for multiple accuracy targets per model --- tools/submission/submission-checker.py | 47 ++++++++++++++++++-------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py index 8c42399a7..f5e72806c 100755 --- a/tools/submission/submission-checker.py +++ b/tools/submission/submission-checker.py @@ -50,30 +50,41 @@ }, }, "v0.7": { - "models": ["ssd-large", "resnet", "rnnt", "3d-unet", "dlrm", "bert"], + "models": [ + "ssd-large", "resnet", "rnnt", + "bert", "bert-99", "bert-99.9", + "dlrm", "dlrm-99", "dlrm-99.9" + "3dunet", "3d-unet-99", "3d-unet-99.9" + ], "required-scenarios-datacenter": { "resnet": ["Server", "Offline"], "ssd-large": ["Server", "Offline"], "rnnt": ["Server", "Offline"], "bert": ["Server", "Offline"], "dlrm": ["Server", "Offline"], - "3d-unet": ["Offline"], + "3dunet": ["Offline"], }, "required-scenarios-edge": { - "resnet": ["Server", "Offline"], - "ssd-large": ["Server", "Offline"], - "rnnt": ["Server", "Offline"], - "bert": ["Server", "Offline"], - "dlrm": ["Server", "Offline"], - "3d-unet": ["Offline"], + "resnet": ["SingleStream", "Offline"], + "ssd-large": ["SingleStream", "Offline"], + "rnnt": ["SingleStream", "Offline"], + "bert": ["SingleStream", "Offline"], + "dlrm": ["SingleStream", "Offline"], + "3dunet": ["SingleStream", "Offline"], }, "accuracy-target": { "resnet": ("acc", 76.46 * 0.99), "ssd-large": ("mAP", 20 * 0.99), "rnnt": ("WER", 7.452 * 0.99), "bert": ("F1", [90.874 * 0.99, 90.874 * 0.999]), + "bert-99": ("F1", 90.874 * 0.99), + "bert-99.9": ("F1", 90.874 * 0.999), "dlrm": ("AUC", [76.46 * 0.99, 76.46 * 0.999]), - "3d-unet": ("mean", [0.853 * 0.99, 0.853 * 0.999]), + "dlrm-99": ("AUC", 76.46 * 0.99), + "dlrm-99.9": ("AUC", 76.46 * 0.999), + "3dunet": ("DICE", [0.853 * 0.99, 0.853 * 0.999]), + "3dunet-99": ("DICE", 0.853 * 0.99), + "3dunet-99.9": ("DICE", 0.853 * 0.999), }, "performance-sample-count": { "ssd-large": 64, @@ -81,7 +92,7 @@ "rnnt": 2513, "bert": 3903900, "dlrm": 204800, - "3d-unet": 16, + "3dunet": 16, }, "seeds": { "qsl_rng_seed": 3133965575612453542, @@ -100,7 +111,13 @@ MODEL_MAPPING = { "ssd-mobilenet": "ssd-small", "ssd-resnet34": "ssd-large", - "resnet50": "resnet" + "resnet50": "resnet", + "bert-99": "bert", + "bert-99.9": "bert", + "dlrm-99": "dlrm", + "dlrm-99.9": "dlrm", + "3dunet-99": "3dunet", + "3dunet-99.9": "3dunet", } RESULT_FIELD = { @@ -117,7 +134,7 @@ "bleu": r"^BLEU\:\s*([\d\.]+).*", "F1": r"^{\"exact_match\"\:\s*[\d\.]+,\s*\"f1\"\:\s*([\d\.]+)}", "WER": r"Word Error Rate\:\s*([\d\.]+).*", - "mean": r"Accuracy\:\s*mean\s*=\s*([\d\.]+).*", + "DICE": r"Accuracy\:\s*mean\s*=\s*([\d\.]+).*", } SYSTEM_DESC_REQUIRED_FIELDS = [ @@ -164,6 +181,7 @@ def set_type(self, submission_type): def get_required(self, model): if self.version in ["v0.5"]: return set() + model = MODEL_MAPPING.get(model, model) if model not in self.required: raise ValueError("model not known: " + model) return set(self.required[model]) @@ -174,6 +192,7 @@ def get_accuracy_target(self, model): return self.accuracy_target[model] def get_performance_sample_count(self, model): + model = MODEL_MAPPING.get(model, model) if model not in self.performance_sample_count: raise ValueError("model not known: " + model) 
return self.performance_sample_count[model] @@ -333,7 +352,7 @@ def files_diff(list1, list2): def check_results_dir(config, dir, filter_submitter, csv): - head = ["Organization", "Availability", "Division", "Platform", "Model", "Scenario", "Result", "Accuracy", "Location"] + head = ["Organization", "Availability", "Division", "SystemType", "Platform", "Model", "Scenario", "Result", "Accuracy", "Location"] fmt = ",".join(["{}"] * len(head)) + "\n" csv.write(",".join(head) + "\n") results = {} @@ -453,7 +472,7 @@ def check_results_dir(config, dir, filter_submitter, csv): if results.get(name): if accuracy_is_valid: log.info("%s is OK", name) - csv.write(fmt.format(submitter, available, division, system_desc, model, scenario, + csv.write(fmt.format(submitter, available, division, system_type, system_desc, model, scenario, r, acc, name)) else: results[name] = None From 973454e0ad3e4acda14d4db0d6655eb36bc21e97 Mon Sep 17 00:00:00 2001 From: guschmue Date: Fri, 7 Aug 2020 17:15:46 -0700 Subject: [PATCH 14/22] address review feedback --- tools/submission/submission-checker.py | 132 ++++++++++++++++++------- 1 file changed, 97 insertions(+), 35 deletions(-) diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py index f5e72806c..414fad0ea 100755 --- a/tools/submission/submission-checker.py +++ b/tools/submission/submission-checker.py @@ -26,9 +26,15 @@ "required-scenarios-datacenter": { # anything goes }, + "optional-scenarios-datacenter": { + # anything goes + }, "required-scenarios-edge": { # anything goes }, + "optional-scenarios-edge": { + # anything goes + }, "accuracy-target": { "mobilenet": ("acc", 71.68 * 0.98), "resnet": ("acc", 76.46 * 0.99), @@ -51,7 +57,7 @@ }, "v0.7": { "models": [ - "ssd-large", "resnet", "rnnt", + "ssd-small", "ssd-large", "resnet", "rnnt", "bert", "bert-99", "bert-99.9", "dlrm", "dlrm-99", "dlrm-99.9" "3dunet", "3d-unet-99", "3d-unet-99.9" @@ -64,29 +70,39 @@ "dlrm": ["Server", "Offline"], "3dunet": ["Offline"], }, + "optional-scenarios-datacenter": { + }, "required-scenarios-edge": { "resnet": ["SingleStream", "Offline"], + "ssd-small": ["SingleStream", "Offline"], "ssd-large": ["SingleStream", "Offline"], "rnnt": ["SingleStream", "Offline"], "bert": ["SingleStream", "Offline"], "dlrm": ["SingleStream", "Offline"], "3dunet": ["SingleStream", "Offline"], }, + "optional-scenarios-edge": { + "resnet": ["MultiStream"], + "ssd-small": ["MultiStream"], + "ssd-large": ["MultiStream"], + }, "accuracy-target": { "resnet": ("acc", 76.46 * 0.99), + "ssd-small": ("mAP", 22 * 0.99), "ssd-large": ("mAP", 20 * 0.99), - "rnnt": ("WER", 7.452 * 0.99), - "bert": ("F1", [90.874 * 0.99, 90.874 * 0.999]), + "rnnt": ("WER", (100 - 7.452) * 0.99), + "bert": ("F1", 90.874 * 0.99), "bert-99": ("F1", 90.874 * 0.99), "bert-99.9": ("F1", 90.874 * 0.999), - "dlrm": ("AUC", [76.46 * 0.99, 76.46 * 0.999]), - "dlrm-99": ("AUC", 76.46 * 0.99), - "dlrm-99.9": ("AUC", 76.46 * 0.999), - "3dunet": ("DICE", [0.853 * 0.99, 0.853 * 0.999]), + "dlrm": ("AUC", 80.25 * 0.99), + "dlrm-99": ("AUC", 80.25 * 0.99), + "dlrm-99.9": ("AUC", 80.25 * 0.999), + "3dunet": ("DICE", 0.853 * 0.99), "3dunet-99": ("DICE", 0.853 * 0.99), "3dunet-99.9": ("DICE", 0.853 * 0.999), }, "performance-sample-count": { + "ssd-small": 256, "ssd-large": 64, "resnet": 1024, "rnnt": 2513, @@ -167,14 +183,18 @@ def __init__(self, version): self.seeds = self.base["seeds"] self.accuracy_target = self.base["accuracy-target"] self.performance_sample_count = self.base["performance-sample-count"] + 
self.required = None + self.optional = None def set_type(self, submission_type): if submission_type is None and self.version in ["v0.5"]: return elif submission_type == "datacenter": self.required = self.base["required-scenarios-datacenter"] + self.optional = self.base["optional-scenarios-datacenter"] elif submission_type == "edge": self.required = self.base["required-scenarios-edge"] + self.optional = self.base["optional-scenarios-edge"] else: raise ValueError("innvalid system type") @@ -186,6 +206,14 @@ def get_required(self, model): raise ValueError("model not known: " + model) return set(self.required[model]) + def get_optional(self, model): + if self.version in ["v0.5"]: + return set(["SingleStream", "MultiStream", "Server", "Offline"]) + model = MODEL_MAPPING.get(model, model) + if model not in self.optional: + return set() + return set(self.optional[model]) + def get_accuracy_target(self, model): if model not in self.accuracy_target: raise ValueError("model not known: " + model) @@ -239,44 +267,38 @@ def split_path(m): return m.replace("\\", "/").split("/") -def ignore_errors(line): +def ignore_errors_for_v0_5(line): if "check for ERROR in detailed" in line: return True if "Loadgen built with uncommitted changes" in line: return True if "Ran out of generated queries to issue before the minimum query count and test duration were reached" in line: return True - if "CAS failed": + if "CAS failed" in line: return True return False -def check_accuracy_dir(config, model, dir): +def check_accuracy_dir(config, model, path): is_valid = False acc = None model_norm = model_map(config, model) acc_type, acc_target = config.get_accuracy_target(model_norm) - if not isinstance(acc_target, list): - acc_target = [acc_target] - acc_target = list(sorted(acc_target, reverse=True)) pattern = ACC_PATTERN[acc_type] - with open(os.path.join(dir, "accuracy.txt"), "r") as f: + with open(os.path.join(path, "accuracy.txt"), "r") as f: for line in f: m = re.match(pattern, line) if m: acc = m.group(1) break - if acc: - for a in acc_target: - if float(acc) >= a: - is_valid = True - break - if not is_valid: - log.error("%s accuracy not met: expected=%s, found=%s", dir, acc_target, acc) + if acc and float(acc) >= acc_target: + is_valid = True + else: + log.error("%s accuracy not met: expected=%f, found=%s", path, acc_target, acc) # check if there are any errors in the detailed log - fname = os.path.join(dir, "mlperf_log_detail.txt") + fname = os.path.join(path, "mlperf_log_detail.txt") if not os.path.exists(fname): log.error("%s is missing", fname) is_valid = False @@ -285,7 +307,7 @@ def check_accuracy_dir(config, model, dir): for line in f: # look for: ERROR if "ERROR" in line: - if ignore_errors(line): + if config.version in ["v0.5"] and ignore_errors_for_v0_5(line): continue # TODO: should this be a failed run? 
log.error("%s contains error: %s", fname, line) @@ -293,11 +315,11 @@ def check_accuracy_dir(config, model, dir): return is_valid, acc -def check_performance_dir(config, model, dir): +def check_performance_dir(config, model, path): is_valid = False rt = {} # look for: Result is: VALID - fname = os.path.join(dir, "mlperf_log_summary.txt") + fname = os.path.join(path, "mlperf_log_summary.txt") with open(fname, "r") as f: for line in f: m = re.match(r"^Result\s+is\s*\:\s+VALID", line) @@ -310,16 +332,17 @@ def check_performance_dir(config, model, dir): model = model_map(config, model) performance_sample_count = config.get_performance_sample_count(model) if int(rt['performance_sample_count']) < performance_sample_count: - log.error("%s performance_sample_count should be %d", fname, performance_sample_count) + log.error("%s performance_sample_count, found %s, needs to be > %d", + fname, performance_sample_count, rt['performance_sample_count']) is_valid = False # check if there are any errors in the detailed log - fname = os.path.join(dir, "mlperf_log_detail.txt") + fname = os.path.join(path, "mlperf_log_detail.txt") with open(fname, "r") as f: for line in f: # look for: ERROR if "ERROR" in line: - if ignore_errors(line): + if config.version in ["v0.5"] and ignore_errors_for_v0_5(line): continue log.error("%s contains error: %s", fname, line) is_valid = False @@ -351,18 +374,48 @@ def files_diff(list1, list2): return [] -def check_results_dir(config, dir, filter_submitter, csv): - head = ["Organization", "Availability", "Division", "SystemType", "Platform", "Model", "Scenario", "Result", "Accuracy", "Location"] +def check_results_dir(config, filter_submitter, csv): + """ + Walk the results directory and do the checking. + + We are called with the cdw at the root of the submission directory. + level1 division - closed|open + level2 submitter - for example mlperf_org + level3 - results, systems, measurements, code + + For results the structure from here is: + results/$system_desc/$benchmark_model/$scenario/performance/run_n + and + results/$system_desc/$benchmark_model/$scenario/accuracy + + We first walk into results/$system_desc + make sure there is a system_desc.json and its good + Next we walk into the model + make sure the model is good, make sure all required scenarios are there. + Next we walk into each scenario + check the performance directory + check the accuracy directory + if all was good, add the result to the results directory + if there are errors write a None as result so we can report later what failed + """ + head = [ + "Organization", "Availability", "Division", "SystemType", "Platform", "Model", + "Scenario", "Result", "Accuracy", "Location", + ] fmt = ",".join(["{}"] * len(head)) + "\n" csv.write(",".join(head) + "\n") results = {} + # we are at the top of the submission directory for division in list_dir("."): + # we are looking at ./$division, ie ./closed if division not in VALID_DIVISIONS: log.error("invalid division in input dir %s", division) continue is_closed = division == "closed" + for submitter in list_dir(division): + # we are looking at ./$division/$submitter, ie ./closed/mlperf_org if filter_submitter and submitter != filter_submitter: continue results_path = os.path.join(division, submitter, "results") @@ -372,8 +425,9 @@ def check_results_dir(config, dir, filter_submitter, csv): continue for system_desc in list_dir(results_path): + # we are looking at ./$division/$submitter/$system_desc, ie ./closed/mlperf_org/t4-ort - # + # # check if system_id is good. 
# system_id_json = os.path.join(division, submitter, "systems", system_desc + ".json") @@ -399,18 +453,26 @@ def check_results_dir(config, dir, filter_submitter, csv): # Look at each model # for model in list_dir(results_path, system_desc): + # we are looking at ./$division/$submitter/$system_desc/$model, + # ie ./closed/mlperf_org/t4-ort/bert if is_closed and model not in config.models: log.error("%s has a invalid model (%s) for closed division", name, model) results[name] = None continue - # + # # Look at each scenario # required_scenarios = config.get_required(MODEL_MAPPING.get(model, model)) + all_scenarios = set(list(required_scenarios) + list(config.get_optional(MODEL_MAPPING.get(model, model)))) for scenario in list_dir(results_path, system_desc, model): + # we are looking at ./$division/$submitter/$system_desc/$model/$scenario, + # ie ./closed/mlperf_org/t4-ort/bert/Offline name = os.path.join(results_path, system_desc, model, scenario) results[name] = None + if scenario not in all_scenarios: + log.warning("%s ignoring scenario %s (neither required nor optional)", name, scenario) + continue # check if measurement_dir is good. measurement_dir = os.path.join(division, submitter, "measurements", @@ -472,8 +534,8 @@ def check_results_dir(config, dir, filter_submitter, csv): if results.get(name): if accuracy_is_valid: log.info("%s is OK", name) - csv.write(fmt.format(submitter, available, division, system_type, system_desc, model, scenario, - r, acc, name)) + csv.write(fmt.format(submitter, available, division, system_type, system_desc, model, + scenario, r, acc, name)) else: results[name] = None log.error("%s is OK but accuracy has issues", name) @@ -552,7 +614,7 @@ def main(): with open(args.csv, "w") as csv: os.chdir(args.input) # check results directory - results = check_results_dir(config, args.input, args.submitter, csv) + results = check_results_dir(config, args.submitter, csv) # log results with_results = 0 From 2a262588d7fc5ecc9a74e971a8249b131e994be4 Mon Sep 17 00:00:00 2001 From: Po-Han Huang Date: Mon, 10 Aug 2020 08:36:46 -0700 Subject: [PATCH 15/22] Fix LoadGen bug: scheduled_delta is from start, not from previous query --- loadgen/issue_query_controller.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/loadgen/issue_query_controller.cc b/loadgen/issue_query_controller.cc index 576866712..b8b30ad41 100644 --- a/loadgen/issue_query_controller.cc +++ b/loadgen/issue_query_controller.cc @@ -92,16 +92,14 @@ void QueryMetadata::CoalesceQueries(QueryMetadata* queries, size_t first, size_t last, size_t stride) { // Copy sample data over to current query, boldly assuming that each query // only has one sample. - auto prev_scheduled_time = scheduled_time; query_to_send.reserve((last - first) / stride + 2); // Extra one for the current query. 
for (size_t i = first; i <= last; i += stride) { auto& q = queries[i]; auto& s = q.samples_[0]; query_to_send.push_back({reinterpret_cast(&s), s.sample_index}); - q.scheduled_time = prev_scheduled_time + q.scheduled_delta; + q.scheduled_time = scheduled_time + q.scheduled_delta - scheduled_delta; q.issued_start_time = issued_start_time; - prev_scheduled_time = q.scheduled_time; } } @@ -442,18 +440,16 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, if (scenario == TestScenario::Server && settings.requested.server_coalesce_queries) { auto current_query_idx = queries_idx; - auto scheduled_time = query.scheduled_time; for (; queries_idx + query_stride < queries_count; queries_idx += query_stride) { auto next_scheduled_time = - scheduled_time + + start + queries[queries_idx + query_stride].scheduled_delta; // If current time hasn't reached the next query's scheduled time yet, // don't include next query. if (last_now < next_scheduled_time) { break; } - scheduled_time = next_scheduled_time; queries_issued_per_iter++; } if (queries_idx > current_query_idx) { From c6659f89b9449062daf5b155cf10856ac0e2e52d Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Mon, 10 Aug 2020 09:08:48 -0700 Subject: [PATCH 16/22] use 3dunet and not 3d-unet --- tools/submission/submission-checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py index 414fad0ea..b9d6d937e 100755 --- a/tools/submission/submission-checker.py +++ b/tools/submission/submission-checker.py @@ -60,7 +60,7 @@ "ssd-small", "ssd-large", "resnet", "rnnt", "bert", "bert-99", "bert-99.9", "dlrm", "dlrm-99", "dlrm-99.9" - "3dunet", "3d-unet-99", "3d-unet-99.9" + "3dunet", "3dunet-99", "3dunet-99.9" ], "required-scenarios-datacenter": { "resnet": ["Server", "Offline"], From c366de28bbb4517ce92e3abd690c8c966a55852a Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Mon, 10 Aug 2020 10:27:46 -0700 Subject: [PATCH 17/22] rename 3dunet to 3d-unet to match mlperf.conf --- tools/submission/submission-checker.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py index b9d6d937e..bb32e95fa 100755 --- a/tools/submission/submission-checker.py +++ b/tools/submission/submission-checker.py @@ -60,7 +60,7 @@ "ssd-small", "ssd-large", "resnet", "rnnt", "bert", "bert-99", "bert-99.9", "dlrm", "dlrm-99", "dlrm-99.9" - "3dunet", "3dunet-99", "3dunet-99.9" + "3d-unet", "3d-unet-99", "3d-unet-99.9" ], "required-scenarios-datacenter": { "resnet": ["Server", "Offline"], @@ -68,7 +68,7 @@ "rnnt": ["Server", "Offline"], "bert": ["Server", "Offline"], "dlrm": ["Server", "Offline"], - "3dunet": ["Offline"], + "3d-unet": ["Offline"], }, "optional-scenarios-datacenter": { }, @@ -79,7 +79,7 @@ "rnnt": ["SingleStream", "Offline"], "bert": ["SingleStream", "Offline"], "dlrm": ["SingleStream", "Offline"], - "3dunet": ["SingleStream", "Offline"], + "3d-unet": ["SingleStream", "Offline"], }, "optional-scenarios-edge": { "resnet": ["MultiStream"], @@ -97,9 +97,9 @@ "dlrm": ("AUC", 80.25 * 0.99), "dlrm-99": ("AUC", 80.25 * 0.99), "dlrm-99.9": ("AUC", 80.25 * 0.999), - "3dunet": ("DICE", 0.853 * 0.99), - "3dunet-99": ("DICE", 0.853 * 0.99), - "3dunet-99.9": ("DICE", 0.853 * 0.999), + "3d-unet": ("DICE", 0.853 * 0.99), + "3d-unet-99": ("DICE", 0.853 * 0.99), + "3d-unet-99.9": ("DICE", 0.853 * 0.999), }, "performance-sample-count": { 
"ssd-small": 256, @@ -108,7 +108,7 @@ "rnnt": 2513, "bert": 3903900, "dlrm": 204800, - "3dunet": 16, + "3d-unet": 16, }, "seeds": { "qsl_rng_seed": 3133965575612453542, @@ -132,8 +132,8 @@ "bert-99.9": "bert", "dlrm-99": "dlrm", "dlrm-99.9": "dlrm", - "3dunet-99": "3dunet", - "3dunet-99.9": "3dunet", + "3d-unet-99": "3d-unet", + "3d-unet-99.9": "3d-unet", } RESULT_FIELD = { From 729a7f2694707dafde84e0af5374dfa0c5ecbeed Mon Sep 17 00:00:00 2001 From: George Yuan Date: Mon, 10 Aug 2020 22:27:06 -0700 Subject: [PATCH 18/22] adding TEST05 and top-level readme --- v0.7/compliance/nvidia/README.md | 25 ++++ v0.7/compliance/nvidia/TEST05/README.md | 37 +++++ v0.7/compliance/nvidia/TEST05/audit.config | 10 ++ .../nvidia/TEST05/run_verification.py | 104 +++++++++++++++ .../nvidia/TEST05/verify_performance.py | 126 ++++++++++++++++++ 5 files changed, 302 insertions(+) create mode 100755 v0.7/compliance/nvidia/README.md create mode 100755 v0.7/compliance/nvidia/TEST05/README.md create mode 100644 v0.7/compliance/nvidia/TEST05/audit.config create mode 100644 v0.7/compliance/nvidia/TEST05/run_verification.py create mode 100644 v0.7/compliance/nvidia/TEST05/verify_performance.py diff --git a/v0.7/compliance/nvidia/README.md b/v0.7/compliance/nvidia/README.md new file mode 100755 index 000000000..da96bb3d5 --- /dev/null +++ b/v0.7/compliance/nvidia/README.md @@ -0,0 +1,25 @@ +# Compliance Testing +This repository provides the compliance tests that need to be run in order to demonstrate a valid submission. + +# Table of Contents +1. [Introduction](#introduction) +2. [Test Infrastructure](#Test-Infrastructure) +3. [Test Methodology](#Test-Methodology) + +## Introduction +A handful of compliance tests have been created to help ensure that submissions comply with a subset of the MLPerf rules. Each compliance test must be run once for each submission run and the logs from the compliance test run must be uploaded along with the rest of submission collateral. Scripts are provided in each of the test subdirectories to help with copying the compliance test logs into the correct directory structure for upload. + +## Test Infrastructure +The compliance tests exercise functionality in LoadGen, enabled through the use of a config file that overrides LoadGen functionality, enabling it to run in a variety of compliance testing modes. Upon invocation, LoadGen checks if a `audit.config` file exists in the current working directory. The configuration parameters in `audit.config` override any settings set by `mlperf.conf` or `user.conf`. +## Test Methodology +Running a compliance test entails typically three steps: +#### 1. Setup +Copy the provided `audit.config` file from the test repository into the current working directory from where the benchmark typically starts execution. +#### 2. Execution +Run the benchmark as one normally would for a submission run. LoadGen will read `audit.config` and execute the compliance test. +Note: remove `audit.config` file from the working directory afterwards to prevent unintentionally running in compliance testing mode in future runs. +#### 3. Verification +Run the provided python-based verification script to ensure that the compliance test has successfully completed and meets expectations in terms of performance and/or accuracy. The script will also copy the output compliance logs to a path specified by the user in the correct directory structure in preparation for upload to the MLPerf submission repository. 
+
+
+
diff --git a/v0.7/compliance/nvidia/TEST05/README.md b/v0.7/compliance/nvidia/TEST05/README.md
new file mode 100755
index 000000000..63c30fb54
--- /dev/null
+++ b/v0.7/compliance/nvidia/TEST05/README.md
@@ -0,0 +1,37 @@
+
+# Test 05 - Vary RNG seeds
+## Introduction
+The purpose of this test is to ensure that the SUT does not favor a particular set of LoadGen RNG seed values. The pass condition is that performance with non-default RNG seed values should be similar to the submitted performance.
+
+The seeds that are changed are listed below:
+ - qsl_rng_seed - determines order of samples in QSL
+ - sample_index_rng_seed - determines subset of samples in each loadable set
+ - schedule_rng_seed - determines scheduling of samples in server mode
+
+## Prerequisites
+This script works best with Python 3.3 or later.
+
+## Pass Criteria
+Performance must be within 5% of the submission performance. For example, a submitted Offline result of 1000 samples per second passes if this test measures between 950 and 1050 samples per second. In single stream mode, latencies can be very short for high-performance systems, and run-to-run variation due to external disturbances (e.g., the OS) can be significant. In such cases, when submission latencies are less than or equal to 0.2 ms, the pass threshold is relaxed to 20%.
+
+## Instructions
+
+### Part I
+Run the benchmark with the provided audit.config. Note that audit.config must be copied to the directory from which the benchmark is run. To verify that audit.config was read properly, check that LoadGen reports finding audit.config in mlperf_log_detail.txt.
+
+### Part II
+Run the verification script:
+ `python3 run_verification.py -r RESULTS_DIR -c COMPLIANCE_DIR -o OUTPUT_DIR [--dtype {byte,float32,int32,int64}]`
+
+RESULTS_DIR: Specifies the path to the corresponding results directory that contains the accuracy and performance subdirectories containing the submission logs, e.g. `inference_results_v0.7/closed/NVIDIA/results/GPU/resnet/Offline`
+COMPLIANCE_DIR: Specifies the path to the directory containing the logs from the compliance test run.
+OUTPUT_DIR: Specifies the path to the output directory from which the compliance logs will be uploaded, e.g. `inference_results_v0.7/closed/NVIDIA/compliance/GPU/resnet/Offline`
+
+Expected outcome:
+
+  Performance check pass: True
+  TEST05 verification complete
+
+
+
+
diff --git a/v0.7/compliance/nvidia/TEST05/audit.config b/v0.7/compliance/nvidia/TEST05/audit.config
new file mode 100644
index 000000000..6cc924912
--- /dev/null
+++ b/v0.7/compliance/nvidia/TEST05/audit.config
@@ -0,0 +1,10 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model may be '*' as a wildcard. In that case the value applies to all models.
+# All times are in milliseconds.
+
+# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
+*.*.mode = 2
+*.*.qsl_rng_seed = 313588358309856706
+*.*.sample_index_rng_seed = 471397156132239067
+*.*.schedule_rng_seed = 413914573387865862
diff --git a/v0.7/compliance/nvidia/TEST05/run_verification.py b/v0.7/compliance/nvidia/TEST05/run_verification.py
new file mode 100644
index 000000000..3b2c76d46
--- /dev/null
+++ b/v0.7/compliance/nvidia/TEST05/run_verification.py
@@ -0,0 +1,104 @@
+#! /usr/bin/env python3
+# Copyright 2018 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+import os
+import sys
+import shutil
+import subprocess
+import argparse
+import json
+
+import numpy as np
+
+sys.path.append(os.getcwd())
+
+dtype_map = {
+    "byte": np.byte,
+    "float32": np.float32,
+    "int32": np.int32,
+    "int64": np.int64
+}
+
+def main():
+
+
+    py3 = sys.version_info >= (3,0)
+    # Parse arguments to identify the path to the logs from the performance runs
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--results_dir", "-r",
+        help="Specifies the path to the corresponding results directory that contains the performance subdirectories containing the submission logs, i.e. inference_results_v0.7/closed/NVIDIA/results/T4x8/resnet/Offline.",
+        default=""
+    )
+    parser.add_argument(
+        "--compliance_dir", "-c",
+        help="Specifies the path to the directory containing the logs from the compliance test run.",
+        default=""
+    )
+    parser.add_argument(
+        "--output_dir", "-o",
+        help="Specifies the path to the output directory where compliance logs will be uploaded from, i.e. inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.",
+        default=""
+    )
+    parser.add_argument(
+        "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"], help="data type of the label (only needed in fastmode)")
+
+    args = parser.parse_args()
+
+    print("Parsing arguments.")
+    results_dir = args.results_dir
+    compliance_dir = args.compliance_dir
+    output_dir = os.path.join(args.output_dir, "TEST05")
+
+    dtype = args.dtype
+
+    # run verify performance
+    verify_performance_command = "python3 verify_performance.py -r " + results_dir + "/performance/run_1/mlperf_log_summary.txt" + " -t " + compliance_dir + "/mlperf_log_summary.txt | tee verify_performance.txt"
+    try:
+        os.system(verify_performance_command)
+    except:
+        print("Exception occurred trying to execute:\n " + verify_performance_command)
+
+    # check if verify performance script passes
+    performance_pass_command = "grep PASS verify_performance.txt"
+    performance_pass = "TEST PASS" in subprocess.check_output(performance_pass_command, shell=True).decode("utf-8")
+
+    # setup output compliance directory structure
+    output_performance_dir = os.path.join(output_dir, "performance", "run_1")
+    try:
+        if not os.path.isdir(output_performance_dir):
+            os.makedirs(output_performance_dir)
+    except:
+        print("Exception occurred trying to create " + output_performance_dir)
+
+    # copy compliance logs to output compliance directory
+    shutil.copy2("verify_performance.txt",output_dir)
+    summary_file = os.path.join(compliance_dir,"mlperf_log_summary.txt")
+    detail_file = os.path.join(compliance_dir,"mlperf_log_detail.txt")
+
+    try:
+        shutil.copy2(summary_file,output_performance_dir)
+    except:
+        print("Exception occurred trying to copy " + summary_file + " to " + output_performance_dir)
+    try:
+        shutil.copy2(detail_file,output_performance_dir)
+    except:
+        print("Exception occurred trying to copy " + detail_file + " to " + output_performance_dir)
+
+    print("Performance check pass: {:}".format(performance_pass))
+    print("TEST05 verification complete")
+
+if
__name__ == '__main__': + main() diff --git a/v0.7/compliance/nvidia/TEST05/verify_performance.py b/v0.7/compliance/nvidia/TEST05/verify_performance.py new file mode 100644 index 000000000..4c44f7dfd --- /dev/null +++ b/v0.7/compliance/nvidia/TEST05/verify_performance.py @@ -0,0 +1,126 @@ +#! /usr/bin/env python3 +import os +import sys +import re +sys.path.append(os.getcwd()) + +import argparse +import json + +def main(): + # Parse arguments to identify the path to the accuracy logs from + # the accuracy and performance runs + parser = argparse.ArgumentParser() + parser.add_argument( + "--reference_summary", "-r", + help="Specifies the path to the summary log for TEST00.", + default="" + ) + parser.add_argument( + "--test_summary", "-t", + help="Specifies the path to the summary log for this test.", + default="" + ) + args = parser.parse_args() + + print("Verifying performance.") + ref_file = open(args.reference_summary, "r") + test_file = open(args.test_summary, "r") + ref_score = 0 + test_score = 0 + ref_mode = '' + test_mode = '' + + for line in ref_file: + if re.match("Scenario", line): + ref_mode = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Single Stream": + if re.match("90th percentile latency", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Multi Stream": + if re.match("Samples per query", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Server": + if re.match("Scheduled samples per second", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Offline": + if re.match("Samples per second", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if re.match("Result is", line): + valid = line.split(": ",1)[1].strip() + if valid == 'INVALID': + sys.exit("TEST FAIL: Reference results are invalid") + + if re.match("\d+ ERROR", line): + error = line.split(" ",1)[0].strip() + print("WARNING: " + error + " ERROR reported in reference results") + + + for line in test_file: + if re.match("Scenario", line): + test_mode = line.split(": ",1)[1].strip() + continue + + if test_mode == "Single Stream": + if re.match("90th percentile latency", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Multi Stream": + if re.match("Samples per query", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Server": + if re.match("Scheduled samples per second", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Offline": + if re.match("Samples per second", line): + test_score = line.split(": ",1)[1].strip() + continue + + if re.match("Result is", line): + valid = line.split(": ",1)[1].strip() + if valid == 'INVALID': + sys.exit("TEST FAIL: Test results are invalid") + + if re.match("\d+ ERROR", line): + error = line.split(" ",1)[0].strip() + print("WARNING: " + error + " ERROR reported in test results") + + if test_mode != ref_mode: + sys.exit("Test and reference scenarios do not match!") + + print("reference score = {}".format(ref_score)) + print("test score = {}".format(test_score)) + + + threshold = 0.05 + + # In single stream mode, latencies can be very short for high performance systems + # and run-to-run variation due to external disturbances (OS) can be significant. 
+    # In this case we relax the pass threshold to 20%.
+
+    if ref_mode == "Single Stream" and float(ref_score) <= 200000:
+        threshold = 0.20
+
+    if float(test_score) < float(ref_score) * (1 + threshold) and float(test_score) > float(ref_score) * (1 - threshold):
+        print("TEST PASS")
+    else:
+        print("TEST FAIL: Test score invalid")
+
+if __name__ == '__main__':
+    main()
+

From 405de9d9273593d228958ce3ed68631c40e62c2b Mon Sep 17 00:00:00 2001
From: Guenther Schmuelling
Date: Tue, 11 Aug 2020 07:22:43 -0700
Subject: [PATCH 19/22] review feedback - dlrm is not in edge suite

---
 tools/submission/submission-checker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py
index bb32e95fa..8110cf87e 100755
--- a/tools/submission/submission-checker.py
+++ b/tools/submission/submission-checker.py
@@ -78,7 +78,6 @@
         "ssd-large": ["SingleStream", "Offline"],
         "rnnt": ["SingleStream", "Offline"],
         "bert": ["SingleStream", "Offline"],
-        "dlrm": ["SingleStream", "Offline"],
         "3d-unet": ["SingleStream", "Offline"],
     },
     "optional-scenarios-edge": {

From e9e4888116bd75a0d9ec28d1720fec984963b230 Mon Sep 17 00:00:00 2001
From: Guenther Schmuelling
Date: Tue, 11 Aug 2020 07:48:33 -0700
Subject: [PATCH 20/22] handle bad scenario casing in v0.5 submission

---
 tools/submission/submission-checker.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py
index 8110cf87e..57165e223 100755
--- a/tools/submission/submission-checker.py
+++ b/tools/submission/submission-checker.py
@@ -123,6 +123,13 @@
 REQUIRED_MEASURE_FILES = ["mlperf.conf", "user.conf", "README.md"]
 TO_MS = 1000 * 1000

+SCENARIO_MAPPING = {
+    "singlestream": "SingleStream",
+    "multistream": "MultiStream",
+    "server": "Server",
+    "offline": "Offline",
+}
+
 MODEL_MAPPING = {
     "ssd-mobilenet": "ssd-small",
     "ssd-resnet34": "ssd-large",
@@ -465,11 +472,14 @@ def check_results_dir(config, filter_submitter, csv):
             required_scenarios = config.get_required(MODEL_MAPPING.get(model, model))
             all_scenarios = set(list(required_scenarios) + list(config.get_optional(MODEL_MAPPING.get(model, model))))
             for scenario in list_dir(results_path, system_desc, model):
+                # some submissions in v0.5 use lower case scenarios - map them for now
+                scenario_fixed = SCENARIO_MAPPING.get(scenario, scenario)
+
                 # we are looking at ./$division/$submitter/$system_desc/$model/$scenario,
                 # ie ./closed/mlperf_org/t4-ort/bert/Offline
                 name = os.path.join(results_path, system_desc, model, scenario)
                 results[name] = None
-                if scenario not in all_scenarios:
+                if scenario_fixed not in all_scenarios:
                     log.warning("%s ignoring scenario %s (neither required nor optional)", name, scenario)
                     continue

@@ -534,7 +544,7 @@
                 if accuracy_is_valid:
                     log.info("%s is OK", name)
                     csv.write(fmt.format(submitter, available, division, system_type, system_desc, model,
-                                         scenario, r, acc, name))
+                                         scenario_fixed, r, acc, name))
                 else:
                     results[name] = None
                     log.error("%s is OK but accuracy has issues", name)

From 239005fc90c8604360645f748321e6c00b9c572e Mon Sep 17 00:00:00 2001
From: mnaumovfb
Date: Wed, 12 Aug 2020 16:20:22 -0700
Subject: [PATCH 21/22] Adjusting README commands based on MLPerf inference meeting discussion.
--- v0.5/recommendation/README.md | 18 +++++++++--------- v0.5/recommendation/python/main.py | 1 + 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/v0.5/recommendation/README.md b/v0.5/recommendation/README.md index 615962f81..917e85b3f 100755 --- a/v0.5/recommendation/README.md +++ b/v0.5/recommendation/README.md @@ -180,7 +180,7 @@ options are extra arguments that are passed along For example, to run on CPU you may choose to use: -1. Criteo Kaggle DAC +1. Criteo Kaggle DAC (debugging) Offline scenario perf and accuracy modes ``` @@ -189,16 +189,16 @@ Offline scenario perf and accuracy modes ``` Server scenario perf and accuracy modes ``` -./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 -./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --accuracy +./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-fix=2048 --max-batchsize=2048 +./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy ``` -2. Criteo Terabyte (0.875) +2. Criteo Terabyte with 0.875 sub-sampling (debugging) Offline scenario perf and accuracy modes ``` -./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-fix=2048 --max-batchsize=2048 [--mlperf-bin-loader] -./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --samples-per-query-offline=1 --accuracy [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --samples-per-query-offline=1 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --samples-per-query-offline=1 --accuracy [--mlperf-bin-loader] ``` Server scenario perf and accuracy modes ``` @@ -206,12 +206,12 @@ Server scenario perf and accuracy modes ./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --accuracy [--mlperf-bin-loader] ``` -3. Criteo Terabyte +3. 
Criteo Terabyte (official) Offline scenario perf and accuracy modes ``` -./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-fix=2048 --max-batchsize=2048 [--mlperf-bin-loader] -./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --samples-per-query-offline=1 --accuracy [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --samples-per-query-offline=204800 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --samples-per-query-offline=204800 --accuracy [--mlperf-bin-loader] ``` Server scenario perf and accuracy modes ``` diff --git a/v0.5/recommendation/python/main.py b/v0.5/recommendation/python/main.py index 429d0a796..8ed44847e 100755 --- a/v0.5/recommendation/python/main.py +++ b/v0.5/recommendation/python/main.py @@ -124,6 +124,7 @@ def get_args(): parser.add_argument("--count-samples", type=int, help="dataset items to use") parser.add_argument("--count-queries", type=int, help="number of queries to use") parser.add_argument("--samples-per-query-multistream", type=int, help="query length for multi-stream scenario (in terms of aggregated samples)") + # --samples-per-query-offline is equivalent to perf_sample_count parser.add_argument("--samples-per-query-offline", type=int, default=2048, help="query length for offline scenario (in terms of aggregated samples)") parser.add_argument("--samples-to-aggregate-fix", type=int, help="number of samples to be treated as one") parser.add_argument("--samples-to-aggregate-min", type=int, help="min number of samples to be treated as one in random query size") From 5691281b680dd22adcc37511529956ddb2e4b821 Mon Sep 17 00:00:00 2001 From: mnaumovfb Date: Wed, 12 Aug 2020 16:31:20 -0700 Subject: [PATCH 22/22] Minor adjustment --- v0.5/recommendation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v0.5/recommendation/README.md b/v0.5/recommendation/README.md index 917e85b3f..035dcf6cd 100755 --- a/v0.5/recommendation/README.md +++ b/v0.5/recommendation/README.md @@ -197,7 +197,7 @@ Server scenario perf and accuracy modes Offline scenario perf and accuracy modes ``` -./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --samples-per-query-offline=1 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 [--mlperf-bin-loader] ./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --samples-per-query-offline=1 --accuracy [--mlperf-bin-loader] ``` Server scenario perf and accuracy modes