Merge pull request mlcommons#673 from georgelyuan/master
adding preliminary loadgen changes for compliance testing as well as …
christ1ne authored Aug 11, 2020
2 parents d1ede1b + 729a7f2 commit b0f6aab
Showing 20 changed files with 890 additions and 0 deletions.
10 changes: 10 additions & 0 deletions loadgen/loadgen.cc
@@ -358,6 +358,16 @@ PerformanceResult IssueQueries(SystemUnderTest* sut,
auto sequence_id_start = sequence_gen->CurrentSampleId();
std::vector<QueryMetadata> queries = GenerateQueries<scenario, mode>(
settings, loaded_sample_set, sequence_gen, &response_logger);

// Calculate the expected number of queries
uint64_t expected_queries = settings.target_qps * settings.min_duration.count() / 1000;
if (scenario != TestScenario::Offline) {
expected_queries *= settings.samples_per_query;
}

if (settings.accuracy_log_sampling_target > 0) {
response_logger.accuracy_log_prob = (double) settings.accuracy_log_sampling_target / expected_queries;
}
auto sequence_id_end = sequence_gen->CurrentSampleId();
size_t max_latencies_to_record = sequence_id_end - sequence_id_start;

4 changes: 4 additions & 0 deletions loadgen/test_settings.h
@@ -262,6 +262,10 @@ struct TestSettings {
/// accuracy log in performance mode
double accuracy_log_probability = 0.0;

/// \brief Target number of samples that will have their results printed to
/// accuracy log in performance mode for compliance testing
uint64_t accuracy_log_sampling_target = 0;

/// \brief Load mlperf parameter config from file.
int FromConfig(const std::string &path, const std::string &model,
const std::string &scenario);
6 changes: 6 additions & 0 deletions loadgen/test_settings_internal.cc
@@ -42,6 +42,7 @@ TestSettingsInternal::TestSettingsInternal(
schedule_rng_seed(requested.schedule_rng_seed),
accuracy_log_rng_seed(requested.accuracy_log_rng_seed),
accuracy_log_probability(requested.accuracy_log_probability),
accuracy_log_sampling_target(requested.accuracy_log_sampling_target),
print_timestamps(requested.print_timestamps),
performance_issue_unique(requested.performance_issue_unique),
performance_issue_same(requested.performance_issue_same),
@@ -256,6 +257,7 @@ void LogRequestedTestSettings(const TestSettings &s) {
detail("schedule_rng_seed : ", s.schedule_rng_seed);
detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
detail("accuracy_log_probability : ", s.accuracy_log_probability);
detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
detail("print_timestamps : ", s.print_timestamps);
detail("performance_issue_unique : ", s.performance_issue_unique);
detail("performance_issue_same : ", s.performance_issue_same);
@@ -290,6 +292,7 @@ void TestSettingsInternal::LogEffectiveSettings() const {
detail("schedule_rng_seed : ", s.schedule_rng_seed);
detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
detail("accuracy_log_probability : ", s.accuracy_log_probability);
detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
detail("print_timestamps : ", s.print_timestamps);
detail("performance_issue_unique : ", s.performance_issue_unique);
detail("performance_issue_same : ", s.performance_issue_same);
@@ -317,6 +320,7 @@ void TestSettingsInternal::LogSummary(AsyncSummary &summary) const {
summary("schedule_rng_seed : ", schedule_rng_seed);
summary("accuracy_log_rng_seed : ", accuracy_log_rng_seed);
summary("accuracy_log_probability : ", accuracy_log_probability);
summary("accuracy_log_sampling_target : ", accuracy_log_sampling_target);
summary("print_timestamps : ", print_timestamps);
summary("performance_issue_unique : ", performance_issue_unique);
summary("performance_issue_same : ", performance_issue_same);
@@ -462,6 +466,8 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
nullptr);
lookupkv(model, scenario, "accuracy_log_probability", nullptr,
&accuracy_log_probability, 0.01);
lookupkv(model, scenario, "accuracy_log_sampling_target",
&accuracy_log_sampling_target, nullptr);
if (lookupkv(model, scenario, "print_timestamps", &val, nullptr))
print_timestamps = (val == 0) ? false : true;
if (lookupkv(model, scenario, "performance_issue_unique", &val, nullptr))
1 change: 1 addition & 0 deletions loadgen/test_settings_internal.h
@@ -74,6 +74,7 @@ struct TestSettingsInternal {
uint64_t schedule_rng_seed;
uint64_t accuracy_log_rng_seed;
double accuracy_log_probability;
uint64_t accuracy_log_sampling_target;
bool print_timestamps;
bool performance_issue_unique;
bool performance_issue_same;
25 changes: 25 additions & 0 deletions v0.7/compliance/nvidia/README.md
@@ -0,0 +1,25 @@
# Compliance Testing
This repository provides the compliance tests that need to be run in order to demonstrate a valid submission.

# Table of Contents
1. [Introduction](#introduction)
2. [Test Infrastructure](#Test-Infrastructure)
3. [Test Methodology](#Test-Methodology)

## Introduction
A handful of compliance tests have been created to help ensure that submissions comply with a subset of the MLPerf rules. Each compliance test must be run once for each submission run, and the logs from the compliance test run must be uploaded along with the rest of the submission collateral. Scripts are provided in each of the test subdirectories to help with copying the compliance test logs into the correct directory structure for upload.

## Test Infrastructure
The compliance tests exercise LoadGen functionality through a config file that overrides normal settings, allowing LoadGen to run in a variety of compliance testing modes. Upon invocation, LoadGen checks whether an `audit.config` file exists in the current working directory. The configuration parameters in `audit.config` override any settings from `mlperf.conf` or `user.conf`.
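
For illustration, the override behavior can be pictured as a simple last-writer-wins merge, sketched below in Python. This is a schematic only, not LoadGen's actual config parser; the helper name and the example keys and values are made up.

```python
# Schematic sketch of settings precedence (not LoadGen's real parser):
# later sources override earlier ones, so audit.config wins.
def effective_settings(mlperf_conf, user_conf, audit_conf):
    settings = {}
    for source in (mlperf_conf, user_conf, audit_conf):  # increasing priority
        settings.update(source)
    return settings

# Example with made-up values: audit.config forces performance mode and a
# sampling target regardless of what user.conf requested.
print(effective_settings(
    {"*.*.mode": 0},
    {"*.*.mode": 1, "*.*.target_qps": 100},
    {"*.*.mode": 2, "*.*.accuracy_log_sampling_target": 4096},
))
```
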
## Test Methodology
Running a compliance test typically entails three steps:
#### 1. Setup
Copy the provided `audit.config` file from the test directory into the working directory from which the benchmark is normally run.
#### 2. Execution
Run the benchmark as one normally would for a submission run. LoadGen will read `audit.config` and execute the compliance test.
Note: remove the `audit.config` file from the working directory afterwards to avoid unintentionally running future runs in compliance testing mode.
#### 3. Verification
Run the provided Python-based verification script to ensure that the compliance test completed successfully and meets performance and/or accuracy expectations. The script also copies the output compliance logs, in the correct directory structure, to a user-specified path in preparation for upload to the MLPerf submission repository.



9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/3d-unet/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 64
44 changes: 44 additions & 0 deletions v0.7/compliance/nvidia/TEST01/README.md
@@ -0,0 +1,44 @@

# Test 01 - Verify accuracy in performance mode
## Introduction
The purpose of this test is to ensure that valid inferences are being performed in performance mode. By default, the inference result that is returned from the SUT to LoadGen is not written to the accuracy JSON file and thus not checked for accuracy. In this test, the inference results for a subset of the total samples issued by LoadGen are written to the accuracy JSON. In order to pass this test, two criteria must be satisfied:

1. The inference results in the accuracy JSON file must match the inference results in the accuracy JSON generated in accuracy mode in the submission run.
2. The performance while running this test must match the performance of the submission within 10%.

## Performance considerations
The subset of sample results to be written to the accuracy JSON is chosen randomly, with a probability equal to the `accuracy_log_sampling_target` specified in the audit.config file divided by the total expected number of completed samples in the test run. This expected number of completed samples is computed from `min_duration`, `samples_per_query`, and `target_qps`. The goal is to ensure that a reasonable number of sample results is written to the accuracy JSON regardless of the throughput of the system under test. Because the number of actually completed samples may not match the expected number, the number of inference results written to the accuracy JSON may not exactly match `accuracy_log_sampling_target`.
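
As a concrete illustration, the sketch below mirrors the arithmetic added to loadgen.cc in this commit; the scenario and settings values are invented purely for the example.

```python
# Illustrative values only; real settings come from mlperf.conf / user.conf.
target_qps = 2000.0                  # queries per second
min_duration_ms = 60000              # minimum test duration in milliseconds
samples_per_query = 1                # multiplied in for non-Offline scenarios
accuracy_log_sampling_target = 4096
scenario_is_offline = False

# Expected number of completed samples, following the loadgen.cc change above.
expected_samples = target_qps * min_duration_ms / 1000
if not scenario_is_offline:
    expected_samples *= samples_per_query

sampling_probability = accuracy_log_sampling_target / expected_samples
print(sampling_probability)          # 4096 / 120000, roughly 0.034
```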

There is an audit.config file for each individual benchmark, located in the benchmark subdirectories in this test directory. The `accuracy_log_sampling_target` value for each benchmark is chosen taking into consideration the performance sample count and size of the inference result. If performance with sampling enabled cannot meet the pass threshold set in verify_performance.py, `accuracy_log_sampling_target` may be reduced to check that performance approaches the submission score.

## Log size
3d-unet is unique in that its per-sample inference result is drastically larger than that of the other benchmarks. For all other benchmarks, the accuracy JSON results can be checked with Python's JSON library by passing `--fastmode` to the run_verification.py script. For 3d-unet, fastmode causes the accuracy verification to run out of memory, so the alternative method based on UNIX commandline utilities must be used by omitting the `--fastmode` switch.
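
The memory difference can be sketched roughly as follows. This is a schematic comparison only, not the actual verify_accuracy.py implementation; the function names are hypothetical and the streaming variant is merely in the spirit of the commandline approach.

```python
import hashlib
import json

# Fast mode in spirit: parse the whole accuracy JSON at once. Memory use grows
# with the size of the log, which is what fails for 3d-unet's huge results.
def load_all(path):
    with open(path) as f:
        return json.load(f)

# Streaming alternative in spirit (wc/sed/md5sum style): read one line at a
# time and keep only a small digest per line, so memory stays roughly flat.
def stream_digests(path):
    digests = set()
    with open(path) as f:
        for line in f:
            digests.add(hashlib.md5(line.encode()).hexdigest())
    return digests
```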

## Prerequisites
This script works best with Python 3.3 or later. For 3d-unet, the accuracy verification script requires the `wc`, `sed`, `awk`, `head`, `tail`, `grep`, and `md5sum` UNIX commandline utilities.

## Non-determinism
Note that under the MLPerf inference rules, certain forms of non-determinism are acceptable, which can cause inference results to differ across runs. It is therefore foreseeable that the results obtained during the accuracy run differ from those obtained during the performance run, which will cause the accuracy checking script to report failure. A test failure will automatically result in an objection, but the objection can be overturned by comparing the quality of the results generated in performance mode to that obtained in accuracy mode. This can be done by using the accuracy measurement scripts provided as part of the repo to ensure that the accuracy score meets the target. An example is provided for GNMT in the gnmt folder.

## Instructions

### Part I
Run the test with the provided audit.config in the corresponding benchmark subdirectory. Note that audit.config must be copied into the directory from which the benchmark is run. To verify that audit.config was read properly, check that LoadGen reports finding audit.config in mlperf_log_detail.txt.
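
One way to spot-check this is sketched below; the exact wording of the LoadGen log message is an assumption here, so adjust the search string to match your log.

```python
# Hypothetical helper: report whether mlperf_log_detail.txt mentions audit.config.
# The precise log text varies between LoadGen versions; a loose, case-insensitive
# match on "audit" is used as a stand-in.
with open("mlperf_log_detail.txt") as f:
    found = any("audit" in line.lower() for line in f)
print("audit.config detected by LoadGen:", found)
```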

### Part II
Run the verification script:
`python3 run_verification.py -r RESULTS_DIR -c COMPLIANCE_DIR -o OUTPUT_DIR [--dtype {byte,float32,int32,int64}] [--fastmode]`

- RESULTS_DIR: path to the corresponding results directory that contains the accuracy and performance subdirectories with the submission logs, e.g. `inference_results_v0.7/closed/NVIDIA/results/GPU/resnet/Offline`
- COMPLIANCE_DIR: path to the directory containing the logs from the compliance test run.
- OUTPUT_DIR: path to the output directory where compliance logs will be uploaded from, e.g. `inference_results_v0.7/closed/NVIDIA/compliance/GPU/resnet/Offline`

Expected outcome:

    Accuracy check pass: True
    Performance check pass: True
    TEST01 verification complete



9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/bert/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 4096
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/dlrm/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 4096
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/resnet/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 4096
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/rnnt/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 4096
143 changes: 143 additions & 0 deletions v0.7/compliance/nvidia/TEST01/run_verification.py
@@ -0,0 +1,143 @@
#! /usr/bin/env python3
# Copyright 2018 The MLPerf Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
import os
import sys
import shutil
import subprocess
import argparse
import json

import numpy as np

sys.path.append(os.getcwd())

dtype_map = {
    "byte": np.byte,
    "float32": np.float32,
    "int32": np.int32,
    "int64": np.int64
}

def main():

    py3 = sys.version_info >= (3, 0)
    # Parse arguments to identify the path to the accuracy logs from
    # the accuracy and performance runs
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--results_dir", "-r",
        help="Specifies the path to the corresponding results directory that contains the accuracy and performance subdirectories containing the submission logs, e.g. inference_results_v0.7/closed/NVIDIA/results/T4x8/resnet/Offline.",
        default=""
    )
    parser.add_argument(
        "--compliance_dir", "-c",
        help="Specifies the path to the directory containing the logs from the compliance test run.",
        default=""
    )
    parser.add_argument(
        "--output_dir", "-o",
        help="Specifies the path to the output directory where compliance logs will be uploaded from, e.g. inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.",
        default=""
    )
    parser.add_argument(
        "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"],
        help="data type of the label (only needed in fastmode)")
    parser.add_argument(
        "--fastmode", action="store_true",
        help="Use legacy method using python JSON library instead of unix commandline utilities (uses more memory but is much faster).")

    args = parser.parse_args()

    print("Parsing arguments.")
    results_dir = args.results_dir
    compliance_dir = args.compliance_dir
    output_dir = os.path.join(args.output_dir, "TEST01")
    fastmode = ""
    if args.fastmode:
        fastmode = " --fastmode"
    else:
        # Without fastmode, the accuracy check shells out to UNIX utilities,
        # so make sure all of them are available before continuing.
        missing_binary = False
        for binary in ["wc", "md5sum", "grep", "awk", "sed", "head", "tail"]:
            if shutil.which(binary) is None:
                print("Error: This script requires the {:} commandline utility".format(binary))
                missing_binary = True
        if missing_binary:
            exit()

    dtype = args.dtype

    # run verify accuracy
    verify_accuracy_command = "python3 verify_accuracy.py --dtype " + args.dtype + fastmode + " -r " + results_dir + "/accuracy/mlperf_log_accuracy.json" + " -t " + compliance_dir + "/mlperf_log_accuracy.json | tee verify_accuracy.txt"
    try:
        os.system(verify_accuracy_command)
    except Exception:
        print("Exception occurred trying to execute:\n " + verify_accuracy_command)

    # check if verify accuracy script passes
    accuracy_pass_command = "grep PASS verify_accuracy.txt"
    accuracy_pass = "TEST PASS" in subprocess.check_output(accuracy_pass_command, shell=True).decode("utf-8")

    # run verify performance
    verify_performance_command = "python3 verify_performance.py -r " + results_dir + "/performance/run_1/mlperf_log_summary.txt" + " -t " + compliance_dir + "/mlperf_log_summary.txt | tee verify_performance.txt"
    try:
        os.system(verify_performance_command)
    except Exception:
        print("Exception occurred trying to execute:\n " + verify_performance_command)

    # check if verify performance script passes
    performance_pass_command = "grep PASS verify_performance.txt"
    performance_pass = "TEST PASS" in subprocess.check_output(performance_pass_command, shell=True).decode("utf-8")

    # set up output compliance directory structure
    output_accuracy_dir = os.path.join(output_dir, "accuracy")
    output_performance_dir = os.path.join(output_dir, "performance", "run_1")
    try:
        if not os.path.isdir(output_accuracy_dir):
            os.makedirs(output_accuracy_dir)
    except Exception:
        print("Exception occurred trying to create " + output_accuracy_dir)
    try:
        if not os.path.isdir(output_performance_dir):
            os.makedirs(output_performance_dir)
    except Exception:
        print("Exception occurred trying to create " + output_performance_dir)

    # copy compliance logs to output compliance directory
    shutil.copy2("verify_accuracy.txt", output_dir)
    shutil.copy2("verify_performance.txt", output_dir)
    accuracy_file = os.path.join(compliance_dir, "mlperf_log_accuracy.json")
    summary_file = os.path.join(compliance_dir, "mlperf_log_summary.txt")
    detail_file = os.path.join(compliance_dir, "mlperf_log_detail.txt")

    try:
        shutil.copy2(accuracy_file, output_accuracy_dir)
    except Exception:
        print("Exception occurred trying to copy " + accuracy_file + " to " + output_accuracy_dir)
    try:
        shutil.copy2(summary_file, output_performance_dir)
    except Exception:
        print("Exception occurred trying to copy " + summary_file + " to " + output_performance_dir)
    try:
        shutil.copy2(detail_file, output_performance_dir)
    except Exception:
        print("Exception occurred trying to copy " + detail_file + " to " + output_performance_dir)

    print("Accuracy check pass: {:}".format(accuracy_pass))
    print("Performance check pass: {:}".format(performance_pass))
    print("TEST01 verification complete")

if __name__ == '__main__':
    main()
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/ssd-large/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 256
9 changes: 9 additions & 0 deletions v0.7/compliance/nvidia/TEST01/ssd-small/audit.config
@@ -0,0 +1,9 @@
# The format of this config file is 'key = value'.
# The key has the format 'model.scenario.key'. Value is mostly int64_t.
# Model may be '*' as a wildcard. In that case the value applies to all models.
# All times are in milliseconds

# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
*.*.mode = 2
*.*.accuracy_log_rng_seed = 720381539243781796
*.*.accuracy_log_sampling_target = 1024