Merge pull request mlcommons#673 from georgelyuan/master
adding preliminary loadgen changes for compliance testing as well as …
christ1ne authored Aug 11, 2020
2 parents d1ede1b + 729a7f2 commit b0f6aab
Showing 20 changed files with 890 additions and 0 deletions.
10 changes: 10 additions & 0 deletions loadgen/loadgen.cc
@@ -358,6 +358,16 @@ PerformanceResult IssueQueries(SystemUnderTest* sut,
auto sequence_id_start = sequence_gen->CurrentSampleId();
std::vector<QueryMetadata> queries = GenerateQueries<scenario, mode>(
settings, loaded_sample_set, sequence_gen, &response_logger);

// Calculate the expected number of queries
uint64_t expected_queries = settings.target_qps * settings.min_duration.count() / 1000;
if (scenario != TestScenario::Offline) {
expected_queries *= settings.samples_per_query;
}

if (settings.accuracy_log_sampling_target > 0) {
response_logger.accuracy_log_prob = (double) settings.accuracy_log_sampling_target / expected_queries;
}
auto sequence_id_end = sequence_gen->CurrentSampleId();
size_t max_latencies_to_record = sequence_id_end - sequence_id_start;

4 changes: 4 additions & 0 deletions loadgen/test_settings.h
@@ -262,6 +262,10 @@ struct TestSettings {
/// accuracy log in performance mode
double accuracy_log_probability = 0.0;

/// \brief Target number of samples that will have their results printed to
/// accuracy log in performance mode for compliance testing
uint64_t accuracy_log_sampling_target = 0;

/// \brief Load mlperf parameter config from file.
int FromConfig(const std::string &path, const std::string &model,
const std::string &scenario);
6 changes: 6 additions & 0 deletions loadgen/test_settings_internal.cc
@@ -42,6 +42,7 @@ TestSettingsInternal::TestSettingsInternal(
schedule_rng_seed(requested.schedule_rng_seed),
accuracy_log_rng_seed(requested.accuracy_log_rng_seed),
accuracy_log_probability(requested.accuracy_log_probability),
accuracy_log_sampling_target(requested.accuracy_log_sampling_target),
print_timestamps(requested.print_timestamps),
performance_issue_unique(requested.performance_issue_unique),
performance_issue_same(requested.performance_issue_same),
@@ -256,6 +257,7 @@ void LogRequestedTestSettings(const TestSettings &s) {
detail("schedule_rng_seed : ", s.schedule_rng_seed);
detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
detail("accuracy_log_probability : ", s.accuracy_log_probability);
detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
detail("print_timestamps : ", s.print_timestamps);
detail("performance_issue_unique : ", s.performance_issue_unique);
detail("performance_issue_same : ", s.performance_issue_same);
@@ -290,6 +292,7 @@ void TestSettingsInternal::LogEffectiveSettings() const {
detail("schedule_rng_seed : ", s.schedule_rng_seed);
detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
detail("accuracy_log_probability : ", s.accuracy_log_probability);
detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
detail("print_timestamps : ", s.print_timestamps);
detail("performance_issue_unique : ", s.performance_issue_unique);
detail("performance_issue_same : ", s.performance_issue_same);
@@ -317,6 +320,7 @@ void TestSettingsInternal::LogSummary(AsyncSummary &summary) const {
summary("schedule_rng_seed : ", schedule_rng_seed);
summary("accuracy_log_rng_seed : ", accuracy_log_rng_seed);
summary("accuracy_log_probability : ", accuracy_log_probability);
summary("accuracy_log_sampling_target : ", accuracy_log_sampling_target);
summary("print_timestamps : ", print_timestamps);
summary("performance_issue_unique : ", performance_issue_unique);
summary("performance_issue_same : ", performance_issue_same);
@@ -462,6 +466,8 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
nullptr);
lookupkv(model, scenario, "accuracy_log_probability", nullptr,
&accuracy_log_probability, 0.01);
lookupkv(model, scenario, "accuracy_log_sampling_target",
&accuracy_log_sampling_target, nullptr);
if (lookupkv(model, scenario, "print_timestamps", &val, nullptr))
print_timestamps = (val == 0) ? false : true;
if (lookupkv(model, scenario, "performance_issue_unique", &val, nullptr))
1 change: 1 addition & 0 deletions loadgen/test_settings_internal.h
@@ -74,6 +74,7 @@ struct TestSettingsInternal {
uint64_t schedule_rng_seed;
uint64_t accuracy_log_rng_seed;
double accuracy_log_probability;
uint64_t accuracy_log_sampling_target;
bool print_timestamps;
bool performance_issue_unique;
bool performance_issue_same;
25 changes: 25 additions & 0 deletions v0.7/compliance/nvidia/README.md
@@ -0,0 +1,25 @@
# Compliance Testing
This repository provides the compliance tests that need to be run in order to demonstrate a valid submission.

# Table of Contents
1. [Introduction](#introduction)
2. [Test Infrastructure](#Test-Infrastructure)
3. [Test Methodology](#Test-Methodology)

## Introduction
A handful of compliance tests have been created to help ensure that submissions comply with a subset of the MLPerf rules. Each compliance test must be run once for each submission run, and the logs from the compliance test run must be uploaded along with the rest of the submission collateral. Scripts are provided in each of the test subdirectories to help with copying the compliance test logs into the correct directory structure for upload.

## Test Infrastructure
The compliance tests exercise LoadGen functionality through a config file that overrides normal settings, allowing LoadGen to run in a variety of compliance testing modes. Upon invocation, LoadGen checks whether an `audit.config` file exists in the current working directory. The configuration parameters in `audit.config` override any settings from `mlperf.conf` or `user.conf`.
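
For illustration, the override behavior can be pictured as a simple last-writer-wins merge, sketched below in Python. This is a schematic only, not LoadGen's actual config parser; the helper name and the example keys and values are made up.

```python
# Schematic sketch of settings precedence (not LoadGen's real parser):
# later sources override earlier ones, so audit.config wins.
def effective_settings(mlperf_conf, user_conf, audit_conf):
    settings = {}
    for source in (mlperf_conf, user_conf, audit_conf):  # increasing priority
        settings.update(source)
    return settings

# Example with made-up values: audit.config forces performance mode and a
# sampling target regardless of what user.conf requested.
print(effective_settings(
    {"*.*.mode": 0},
    {"*.*.mode": 1, "*.*.target_qps": 100},
    {"*.*.mode": 2, "*.*.accuracy_log_sampling_target": 4096},
))
```
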
## Test Methodology
Running a compliance test typically entails three steps:
#### 1. Setup
Copy the provided `audit.config` file from the test directory into the working directory from which the benchmark is normally run.
#### 2. Execution
Run the benchmark as one normally would for a submission run. LoadGen will read `audit.config` and execute the compliance test.
Note: remove the `audit.config` file from the working directory afterwards to avoid unintentionally running future runs in compliance testing mode.
#### 3. Verification
Run the provided Python-based verification script to ensure that the compliance test completed successfully and meets performance and/or accuracy expectations. The script also copies the output compliance logs, in the correct directory structure, to a user-specified path in preparation for upload to the MLPerf submission repository.



9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/3d-unet/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 64
44 changes: 44 additions & 0 deletions v0.7/compliance/nvidia/TEST01/README.md
@@ -0,0 +1,44 @@

# Test 01 - Verify accuracy in performance mode
## Introduction
The purpose of this test is to ensure that valid inferences are being performed in performance mode. By default, the inference result that is returned from the SUT to LoadGen is not written to the accuracy JSON file and thus not checked for accuracy. In this test, the inference results for a subset of the total samples issued by LoadGen are written to the accuracy JSON. In order to pass this test, two criteria must be satisfied:

1. The inference results in the accuracy JSON file must match the inference results in the accuracy JSON generated in accuracy mode in the submission run.
2. The performance while running this test must match the performance of the submission within 10%.

## Performance considerations
The subset of sample results to be written to the accuracy JSON is chosen randomly, with a probability equal to the `accuracy_log_sampling_target` specified in the audit.config file divided by the total expected number of completed samples in the test run. This expected number of completed samples is computed from `min_duration`, `samples_per_query`, and `target_qps`. The goal is to ensure that a reasonable number of sample results is written to the accuracy JSON regardless of the throughput of the system under test. Because the number of actually completed samples may not match the expected number, the number of inference results written to the accuracy JSON may not exactly match `accuracy_log_sampling_target`.
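
As a concrete illustration, the sketch below mirrors the arithmetic added to loadgen.cc in this commit; the scenario and settings values are invented purely for the example.

```python
# Illustrative values only; real settings come from mlperf.conf / user.conf.
target_qps = 2000.0                  # queries per second
min_duration_ms = 60000              # minimum test duration in milliseconds
samples_per_query = 1                # multiplied in for non-Offline scenarios
accuracy_log_sampling_target = 4096
scenario_is_offline = False

# Expected number of completed samples, following the loadgen.cc change above.
expected_samples = target_qps * min_duration_ms / 1000
if not scenario_is_offline:
    expected_samples *= samples_per_query

sampling_probability = accuracy_log_sampling_target / expected_samples
print(sampling_probability)          # 4096 / 120000, roughly 0.034
```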

There is an audit.config file for each individual benchmark, located in the benchmark subdirectories in this test directory. The `accuracy_log_sampling_target` value for each benchmark is chosen taking into consideration the performance sample count and size of the inference result. If performance with sampling enabled cannot meet the pass threshold set in verify_performance.py, `accuracy_log_sampling_target` may be reduced to check that performance approaches the submission score.

## Log size
3d-unet is unique in that its per-sample inference result is drastically larger than that of the other benchmarks. For all other benchmarks, the accuracy JSON results can be checked with Python's JSON library by passing `--fastmode` to the run_verification.py script. For 3d-unet, fastmode causes the accuracy verification to run out of memory, so the alternative method based on UNIX commandline utilities must be used by omitting the `--fastmode` switch.
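
The memory difference can be sketched roughly as follows. This is a schematic comparison only, not the actual verify_accuracy.py implementation; the function names are hypothetical and the streaming variant is merely in the spirit of the commandline approach.

```python
import hashlib
import json

# Fast mode in spirit: parse the whole accuracy JSON at once. Memory use grows
# with the size of the log, which is what fails for 3d-unet's huge results.
def load_all(path):
    with open(path) as f:
        return json.load(f)

# Streaming alternative in spirit (wc/sed/md5sum style): read one line at a
# time and keep only a small digest per line, so memory stays roughly flat.
def stream_digests(path):
    digests = set()
    with open(path) as f:
        for line in f:
            digests.add(hashlib.md5(line.encode()).hexdigest())
    return digests
```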

## Prerequisites
This script works best with Python 3.3 or later. For 3d-unet, the accuracy verification script requires the `wc`, `sed`, `awk`, `head`, `tail`, `grep`, and `md5sum` UNIX commandline utilities.

## Non-determinism
Note that under the MLPerf inference rules, certain forms of non-determinism are acceptable, which can cause inference results to differ across runs. It is therefore foreseeable that the results obtained during the accuracy run differ from those obtained during the performance run, which will cause the accuracy checking script to report failure. A test failure will automatically result in an objection, but the objection can be overturned by comparing the quality of the results generated in performance mode to that obtained in accuracy mode. This can be done by using the accuracy measurement scripts provided as part of the repo to ensure that the accuracy score meets the target. An example is provided for GNMT in the gnmt folder.

## Instructions

### Part I
Run the test with the provided audit.config in the corresponding benchmark subdirectory. Note that audit.config must be copied into the directory from which the benchmark is run. To verify that audit.config was read properly, check that LoadGen reports finding audit.config in mlperf_log_detail.txt.
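
One way to spot-check this is sketched below; the exact wording of the LoadGen log message is an assumption here, so adjust the search string to match your log.

```python
# Hypothetical helper: report whether mlperf_log_detail.txt mentions audit.config.
# The precise log text varies between LoadGen versions; a loose, case-insensitive
# match on "audit" is used as a stand-in.
with open("mlperf_log_detail.txt") as f:
    found = any("audit" in line.lower() for line in f)
print("audit.config detected by LoadGen:", found)
```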

### Part II
Run the verification script:
`python3 run_verification.py -r RESULTS_DIR -c COMPLIANCE_DIR -o OUTPUT_DIR [--dtype {byte,float32,int32,int64}] [--fastmode]`

- RESULTS_DIR: path to the corresponding results directory that contains the accuracy and performance subdirectories with the submission logs, e.g. `inference_results_v0.7/closed/NVIDIA/results/GPU/resnet/Offline`
- COMPLIANCE_DIR: path to the directory containing the logs from the compliance test run.
- OUTPUT_DIR: path to the output directory where compliance logs will be uploaded from, e.g. `inference_results_v0.7/closed/NVIDIA/compliance/GPU/resnet/Offline`

Expected outcome:

    Accuracy check pass: True
    Performance check pass: True
    TEST01 verification complete



9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/bert/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 4096
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/dlrm/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 4096
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/resnet/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 4096
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/rnnt/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 4096
143 changes: 143 additions & 0 deletions v0.7/compliance/nvidia/TEST01/run_verification.py
@@ -0,0 +1,143 @@
#! /usr/bin/env python3
# Copyright 2018 The MLPerf Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
import os
import sys
import shutil
import subprocess
import argparse
import json

import numpy as np

sys.path.append(os.getcwd())

dtype_map = {
    "byte": np.byte,
    "float32": np.float32,
    "int32": np.int32,
    "int64": np.int64
}

def main():

    py3 = sys.version_info >= (3, 0)
    # Parse arguments to identify the path to the accuracy logs from
    # the accuracy and performance runs
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--results_dir", "-r",
        help="Specifies the path to the corresponding results directory that contains the accuracy and performance subdirectories containing the submission logs, e.g. inference_results_v0.7/closed/NVIDIA/results/T4x8/resnet/Offline.",
        default=""
    )
    parser.add_argument(
        "--compliance_dir", "-c",
        help="Specifies the path to the directory containing the logs from the compliance test run.",
        default=""
    )
    parser.add_argument(
        "--output_dir", "-o",
        help="Specifies the path to the output directory where compliance logs will be uploaded from, e.g. inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.",
        default=""
    )
    parser.add_argument(
        "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"],
        help="data type of the label (only needed in fastmode)")
    parser.add_argument(
        "--fastmode", action="store_true",
        help="Use legacy method using python JSON library instead of unix commandline utilities (uses more memory but is much faster).")

    args = parser.parse_args()

    print("Parsing arguments.")
    results_dir = args.results_dir
    compliance_dir = args.compliance_dir
    output_dir = os.path.join(args.output_dir, "TEST01")
    fastmode = ""
    if args.fastmode:
        fastmode = " --fastmode"
    else:
        # Without fastmode, the accuracy check shells out to UNIX utilities,
        # so make sure all of them are available before continuing.
        missing_binary = False
        for binary in ["wc", "md5sum", "grep", "awk", "sed", "head", "tail"]:
            if shutil.which(binary) is None:
                print("Error: This script requires the {:} commandline utility".format(binary))
                missing_binary = True
        if missing_binary:
            exit()

    dtype = args.dtype

    # run verify accuracy
    verify_accuracy_command = "python3 verify_accuracy.py --dtype " + args.dtype + fastmode + " -r " + results_dir + "/accuracy/mlperf_log_accuracy.json" + " -t " + compliance_dir + "/mlperf_log_accuracy.json | tee verify_accuracy.txt"
    try:
        os.system(verify_accuracy_command)
    except Exception:
        print("Exception occurred trying to execute:\n " + verify_accuracy_command)

    # check if verify accuracy script passes
    accuracy_pass_command = "grep PASS verify_accuracy.txt"
    accuracy_pass = "TEST PASS" in subprocess.check_output(accuracy_pass_command, shell=True).decode("utf-8")

    # run verify performance
    verify_performance_command = "python3 verify_performance.py -r " + results_dir + "/performance/run_1/mlperf_log_summary.txt" + " -t " + compliance_dir + "/mlperf_log_summary.txt | tee verify_performance.txt"
    try:
        os.system(verify_performance_command)
    except Exception:
        print("Exception occurred trying to execute:\n " + verify_performance_command)

    # check if verify performance script passes
    performance_pass_command = "grep PASS verify_performance.txt"
    performance_pass = "TEST PASS" in subprocess.check_output(performance_pass_command, shell=True).decode("utf-8")

    # set up output compliance directory structure
    output_accuracy_dir = os.path.join(output_dir, "accuracy")
    output_performance_dir = os.path.join(output_dir, "performance", "run_1")
    try:
        if not os.path.isdir(output_accuracy_dir):
            os.makedirs(output_accuracy_dir)
    except Exception:
        print("Exception occurred trying to create " + output_accuracy_dir)
    try:
        if not os.path.isdir(output_performance_dir):
            os.makedirs(output_performance_dir)
    except Exception:
        print("Exception occurred trying to create " + output_performance_dir)

    # copy compliance logs to output compliance directory
    shutil.copy2("verify_accuracy.txt", output_dir)
    shutil.copy2("verify_performance.txt", output_dir)
    accuracy_file = os.path.join(compliance_dir, "mlperf_log_accuracy.json")
    summary_file = os.path.join(compliance_dir, "mlperf_log_summary.txt")
    detail_file = os.path.join(compliance_dir, "mlperf_log_detail.txt")

    try:
        shutil.copy2(accuracy_file, output_accuracy_dir)
    except Exception:
        print("Exception occurred trying to copy " + accuracy_file + " to " + output_accuracy_dir)
    try:
        shutil.copy2(summary_file, output_performance_dir)
    except Exception:
        print("Exception occurred trying to copy " + summary_file + " to " + output_performance_dir)
    try:
        shutil.copy2(detail_file, output_performance_dir)
    except Exception:
        print("Exception occurred trying to copy " + detail_file + " to " + output_performance_dir)

    print("Accuracy check pass: {:}".format(accuracy_pass))
    print("Performance check pass: {:}".format(performance_pass))
    print("TEST01 verification complete")

if __name__ == '__main__':
    main()
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/ssd-large/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 256
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/ssd-small/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 1024