diff --git a/language/bert/accuracy-squad.py b/language/bert/accuracy-squad.py index f1365f4e6..113e1c8d8 100644 --- a/language/bert/accuracy-squad.py +++ b/language/bert/accuracy-squad.py @@ -45,6 +45,16 @@ RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) +dtype_map = { + "int8": np.int8, + "int16": np.int16, + "int32": np.int32, + "int64": np.int64, + "float16": np.float16, + "float32": np.float32, + "float64": np.float64 +} + def get_final_text(pred_text, orig_text, do_lower_case): """Project the tokenized prediction back to the original text.""" @@ -302,7 +312,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, with open(output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") -def load_loadgen_log(log_path, eval_features, output_transposed=False): +def load_loadgen_log(log_path, eval_features, dtype=np.float32, output_transposed=False): with open(log_path) as f: predictions = json.load(f) @@ -310,10 +320,10 @@ def load_loadgen_log(log_path, eval_features, output_transposed=False): for prediction in predictions: qsl_idx = prediction["qsl_idx"] if output_transposed: - logits = np.frombuffer(bytes.fromhex(prediction["data"]), np.float32).reshape(2, -1) + logits = np.frombuffer(bytes.fromhex(prediction["data"]), dtype).reshape(2, -1) logits = np.transpose(logits) else: - logits = np.frombuffer(bytes.fromhex(prediction["data"]), np.float32).reshape(-1, 2) + logits = np.frombuffer(bytes.fromhex(prediction["data"]), dtype).reshape(-1, 2) # Pad logits to max_seq_length seq_length = logits.shape[0] start_logits = np.ones(max_seq_length) * -10000.0 @@ -336,8 +346,11 @@ def main(): parser.add_argument("--out_file", default="build/result/predictions.json", help="Path to output predictions file") parser.add_argument("--features_cache_file", default="eval_features.pickle", help="Path to features' cache file") parser.add_argument("--output_transposed", action="store_true", help="Transpose the output") + parser.add_argument("--output_dtype", default="float32", choices=dtype_map.keys(), help="Output data type") args = parser.parse_args() + output_dtype = dtype_map[args.output_dtype] + print("Reading examples...") eval_examples = read_squad_examples(input_file=args.val_data, is_training=False, version_2_with_negative=False) @@ -374,13 +387,14 @@ def append_feature(feature): pickle.dump(eval_features, cache_file) print("Loading LoadGen logs...") - results = load_loadgen_log(args.log_file, eval_features, args.output_transposed) + results = load_loadgen_log(args.log_file, eval_features, output_dtype, args.output_transposed) print("Post-processing predictions...") write_predictions(eval_examples, eval_features, results, 20, 30, True, args.out_file) print("Evaluating predictions...") - cmd = "python3 build/data/evaluate-v1.1.py build/data/dev-v1.1.json build/result/predictions.json" + cmd = "python3 {:}/evaluate-v1.1.py {:} {:}".format(os.path.dirname(__file__), + args.val_data, args.out_file) subprocess.check_call(cmd, shell=True) if __name__ == "__main__": diff --git a/loadgen/issue_query_controller.cc b/loadgen/issue_query_controller.cc index 576866712..b8b30ad41 100644 --- a/loadgen/issue_query_controller.cc +++ b/loadgen/issue_query_controller.cc @@ -92,16 +92,14 @@ void QueryMetadata::CoalesceQueries(QueryMetadata* queries, size_t first, size_t last, size_t stride) { // Copy sample data over to current query, boldly assuming that each query // only has one sample. 
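(Note on the `accuracy-squad.py` change above.) The new `--output_dtype` flag only affects how the hex-encoded `data` field of a LoadGen accuracy-log entry is reinterpreted before the logits are padded and post-processed. A minimal sketch of that decoding step, with illustrative names (`entry`, `decode_logits`) that are not part of the patch:

```python
import numpy as np

# Subset of the dtype_map added by the patch; extend as needed.
dtype_map = {"float16": np.float16, "float32": np.float32, "float64": np.float64}

def decode_logits(entry, dtype_name="float32", output_transposed=False):
    """Decode one accuracy-log entry into an (seq_len, 2) array of start/end logits."""
    raw = bytes.fromhex(entry["data"])
    if output_transposed:
        # Data was written as (2, seq_len); transpose back to (seq_len, 2).
        logits = np.frombuffer(raw, dtype_map[dtype_name]).reshape(2, -1).T
    else:
        logits = np.frombuffer(raw, dtype_map[dtype_name]).reshape(-1, 2)
    return logits  # column 0: start logits, column 1: end logits
```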
- auto prev_scheduled_time = scheduled_time; query_to_send.reserve((last - first) / stride + 2); // Extra one for the current query. for (size_t i = first; i <= last; i += stride) { auto& q = queries[i]; auto& s = q.samples_[0]; query_to_send.push_back({reinterpret_cast(&s), s.sample_index}); - q.scheduled_time = prev_scheduled_time + q.scheduled_delta; + q.scheduled_time = scheduled_time + q.scheduled_delta - scheduled_delta; q.issued_start_time = issued_start_time; - prev_scheduled_time = q.scheduled_time; } } @@ -442,18 +440,16 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, if (scenario == TestScenario::Server && settings.requested.server_coalesce_queries) { auto current_query_idx = queries_idx; - auto scheduled_time = query.scheduled_time; for (; queries_idx + query_stride < queries_count; queries_idx += query_stride) { auto next_scheduled_time = - scheduled_time + + start + queries[queries_idx + query_stride].scheduled_delta; // If current time hasn't reached the next query's scheduled time yet, // don't include next query. if (last_now < next_scheduled_time) { break; } - scheduled_time = next_scheduled_time; queries_issued_per_iter++; } if (queries_idx > current_query_idx) { diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index e524b95fa..07ded1b7d 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -358,6 +358,16 @@ PerformanceResult IssueQueries(SystemUnderTest* sut, auto sequence_id_start = sequence_gen->CurrentSampleId(); std::vector queries = GenerateQueries( settings, loaded_sample_set, sequence_gen, &response_logger); + + // Calculated expected number of queries + uint64_t expected_queries = settings.target_qps * settings.min_duration.count() / 1000; + if (scenario != TestScenario::Offline) { + expected_queries *= settings.samples_per_query; + } + + if (settings.accuracy_log_sampling_target > 0) { + response_logger.accuracy_log_prob = (double) settings.accuracy_log_sampling_target / expected_queries; + } auto sequence_id_end = sequence_gen->CurrentSampleId(); size_t max_latencies_to_record = sequence_id_end - sequence_id_start; diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 90a41ad31..d656d1ae1 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -262,6 +262,10 @@ struct TestSettings { /// accuracy log in performance mode double accuracy_log_probability = 0.0; + /// \brief Target number of samples that will have their results printed to + /// accuracy log in performance mode for compliance testing + uint64_t accuracy_log_sampling_target = 0; + /// \brief Load mlperf parameter config from file. 
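(Note on the `loadgen.cc` change above.) When `accuracy_log_sampling_target` is set, the logging probability is derived from the expected number of queries for the run rather than taken directly from `accuracy_log_probability`. The same arithmetic, mirrored in Python purely for illustration; the settings values in the usage line are made up:

```python
def accuracy_log_probability(target_qps, min_duration_ms, samples_per_query,
                             sampling_target, offline=False):
    """Sketch of the probability derived from a sampling target (mirrors the C++ above)."""
    expected_queries = target_qps * min_duration_ms // 1000
    if not offline:
        expected_queries *= samples_per_query
    if sampling_target > 0:
        return sampling_target / expected_queries
    return 0.0

# e.g. 100 QPS for 60 s with a target of 4096 logged samples
print(accuracy_log_probability(100, 60000, 1, 4096))  # ~0.68
```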
int FromConfig(const std::string &path, const std::string &model, const std::string &scenario); diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 5f9094a32..3b19214d3 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -42,6 +42,7 @@ TestSettingsInternal::TestSettingsInternal( schedule_rng_seed(requested.schedule_rng_seed), accuracy_log_rng_seed(requested.accuracy_log_rng_seed), accuracy_log_probability(requested.accuracy_log_probability), + accuracy_log_sampling_target(requested.accuracy_log_sampling_target), print_timestamps(requested.print_timestamps), performance_issue_unique(requested.performance_issue_unique), performance_issue_same(requested.performance_issue_same), @@ -256,6 +257,7 @@ void LogRequestedTestSettings(const TestSettings &s) { detail("schedule_rng_seed : ", s.schedule_rng_seed); detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed); detail("accuracy_log_probability : ", s.accuracy_log_probability); + detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target); detail("print_timestamps : ", s.print_timestamps); detail("performance_issue_unique : ", s.performance_issue_unique); detail("performance_issue_same : ", s.performance_issue_same); @@ -290,6 +292,7 @@ void TestSettingsInternal::LogEffectiveSettings() const { detail("schedule_rng_seed : ", s.schedule_rng_seed); detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed); detail("accuracy_log_probability : ", s.accuracy_log_probability); + detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target); detail("print_timestamps : ", s.print_timestamps); detail("performance_issue_unique : ", s.performance_issue_unique); detail("performance_issue_same : ", s.performance_issue_same); @@ -317,6 +320,7 @@ void TestSettingsInternal::LogSummary(AsyncSummary &summary) const { summary("schedule_rng_seed : ", schedule_rng_seed); summary("accuracy_log_rng_seed : ", accuracy_log_rng_seed); summary("accuracy_log_probability : ", accuracy_log_probability); + summary("accuracy_log_sampling_target : ", accuracy_log_sampling_target); summary("print_timestamps : ", print_timestamps); summary("performance_issue_unique : ", performance_issue_unique); summary("performance_issue_same : ", performance_issue_same); @@ -462,6 +466,8 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, nullptr); lookupkv(model, scenario, "accuracy_log_probability", nullptr, &accuracy_log_probability, 0.01); + lookupkv(model, scenario, "accuracy_log_sampling_target", + &accuracy_log_sampling_target, nullptr); if (lookupkv(model, scenario, "print_timestamps", &val, nullptr)) print_timestamps = (val == 0) ? 
false : true; if (lookupkv(model, scenario, "performance_issue_unique", &val, nullptr)) diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index 676b2fefe..df903dd91 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -74,6 +74,7 @@ struct TestSettingsInternal { uint64_t schedule_rng_seed; uint64_t accuracy_log_rng_seed; double accuracy_log_probability; + uint64_t accuracy_log_sampling_target; bool print_timestamps; bool performance_issue_unique; bool performance_issue_same; diff --git a/recommendation/dlrm/pytorch/README.md b/recommendation/dlrm/pytorch/README.md index b16d2f991..035dcf6cd 100755 --- a/recommendation/dlrm/pytorch/README.md +++ b/recommendation/dlrm/pytorch/README.md @@ -6,7 +6,7 @@ This is the reference implementation for MLPerf Inference benchmarks. | name | framework | acc. | AUC | dataset | weights | size | prec. | notes | | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| dlrm (debugging) | PyTorch | 78.9% | N/A | [Criteo KaggleDAC](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/) | N/A | ~1GB | fp32 | | +| dlrm (debugging) | PyTorch | 78.82% | N/A | [Criteo KaggleDAC](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/) | N/A | ~1GB | fp32 | | | dlrm (debugging) | PyTorch | 81.07% | N/A | [Criteo Terabyte](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) | [pytorch](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt), [onnx](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.onnx.tar) | ~10GB | fp32 | --max-ind-range=10000000 --data-sub-sample-rate=0.875 | | dlrm (official) | PyTorch | N/A | 80.25% | [Criteo Terabyte](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) | [pytorch](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt), [onnx](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.onnx.tar) | ~100GB | fp32 | --max-ind-range=40000000 | @@ -180,22 +180,45 @@ options are extra arguments that are passed along For example, to run on CPU you may choose to use: -1. Criteo Kaggle DAC +1. Criteo Kaggle DAC (debugging) + +Offline scenario perf and accuracy modes +``` +./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-to-aggregate-fix=2048 --max-batchsize=2048 +./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-to-aggregate-fix=2048 --max-batchsize=2048 --samples-per-query-offline=1 --accuracy +``` +Server scenario perf and accuracy modes ``` -./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy -./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 +./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-fix=2048 --max-batchsize=2048 +./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy ``` -2. Criteo Terabyte (0.875) +2. 
Criteo Terabyte with 0.875 sub-sampling (debugging) + +Offline scenario perf and accuracy modes ``` -./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy [--mlperf-bin-loader] -./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --samples-per-query-offline=1 --accuracy [--mlperf-bin-loader] ``` -3. Criteo Terabyte +Server scenario perf and accuracy modes ``` -./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy [--mlperf-bin-loader] -./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --accuracy [--mlperf-bin-loader] ``` + +3. Criteo Terabyte (official) + +Offline scenario perf and accuracy modes +``` +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --samples-per-query-offline=204800 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --samples-per-query-offline=204800 --accuracy [--mlperf-bin-loader] +``` +Server scenario perf and accuracy modes +``` +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 --accuracy [--mlperf-bin-loader] +``` + Note that the code support (i) original and (ii) mlperf binary loader, that have slightly different performance characteristics. The latter loader can be enabled by adding `--mlperf-bin-loader` to the command line. Note that this script will pre-process the data during the first run and reuse it over sub-sequent runs. The pre-processing of data can take a significant amount of time during the first run. 
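The `criteo.py` and `main.py` changes in the diff below thread per-sample index offsets through `get_samples` so that each LoadGen query id gets back only its own slice of the aggregated batch results. A minimal sketch of that bookkeeping, with illustrative names and assuming the per-sample target counts are already known:

```python
import numpy as np

def build_offsets(targets_per_sample):
    """targets_per_sample[i] = number of individual targets in aggregated sample i."""
    idx_offsets = [0]
    for n in targets_per_sample:
        idx_offsets.append(idx_offsets[-1] + n)
    return idx_offsets

def slice_results(processed_results, idx_offsets, idx):
    """Return only the results belonging to the idx-th sample of the batch."""
    s_idx, e_idx = idx_offsets[idx], idx_offsets[idx + 1]
    return np.array(processed_results[s_idx:e_idx], np.float32)

# e.g. three aggregated samples with 2048, 1024 and 512 targets
offsets = build_offsets([2048, 1024, 512])   # [0, 2048, 3072, 3584]
```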
diff --git a/recommendation/dlrm/pytorch/python/criteo.py b/recommendation/dlrm/pytorch/python/criteo.py index 16af23148..de08b82f1 100755 --- a/recommendation/dlrm/pytorch/python/criteo.py +++ b/recommendation/dlrm/pytorch/python/criteo.py @@ -274,9 +274,16 @@ def load_query_samples(self, sample_list): def get_samples(self, id_list): # build list tuples as need by the batch conversion routine + # index i from id_list corresponds to a particular query_id + idx_offsets = [0] ls = [] for i in id_list: + (_, _, _, T) = self.items_in_memory[i] + idx_offsets.append(idx_offsets[-1] + T.numel()) + ls.append(self.items_in_memory[i]) + # debug prints + # print(idx_offsets) # approach 1: collate a mini-batch of single samples ''' @@ -304,9 +311,8 @@ def get_samples(self, id_list): lS_i = torch.cat(ls_t[2], dim=1) T = torch.cat(ls_t[3]) # debug prints - # print('get_samples', (X, lS_o, lS_i, T)) - # print('get_samples', X.shape) - return (X, lS_o, lS_i, T) + # print('get_samples', (X, lS_o, lS_i, T, idx_offsets)) + return (X, lS_o, lS_i, T, idx_offsets) # Pre processing diff --git a/recommendation/dlrm/pytorch/python/main.py b/recommendation/dlrm/pytorch/python/main.py index d66d4f957..8ed44847e 100755 --- a/recommendation/dlrm/pytorch/python/main.py +++ b/recommendation/dlrm/pytorch/python/main.py @@ -124,6 +124,7 @@ def get_args(): parser.add_argument("--count-samples", type=int, help="dataset items to use") parser.add_argument("--count-queries", type=int, help="number of queries to use") parser.add_argument("--samples-per-query-multistream", type=int, help="query length for multi-stream scenario (in terms of aggregated samples)") + # --samples-per-query-offline is equivalent to perf_sample_count parser.add_argument("--samples-per-query-offline", type=int, default=2048, help="query length for offline scenario (in terms of aggregated samples)") parser.add_argument("--samples-to-aggregate-fix", type=int, help="number of samples to be treated as one") parser.add_argument("--samples-to-aggregate-min", type=int, help="min number of samples to be treated as one in random query size") @@ -203,13 +204,14 @@ def get_backend(backend, dataset, max_ind_range, data_sub_sample_rate, use_gpu): class Item: """An item that we queue for processing by the thread pool.""" - def __init__(self, query_id, content_id, batch_dense_X, batch_lS_o, batch_lS_i, batch_T=None): + def __init__(self, query_id, content_id, batch_dense_X, batch_lS_o, batch_lS_i, batch_T=None, idx_offsets=None): self.query_id = query_id self.content_id = content_id self.batch_dense_X = batch_dense_X self.batch_lS_o = batch_lS_o self.batch_lS_i = batch_lS_i self.batch_T = batch_T + self.idx_offsets = idx_offsets self.start = time.time() class RunnerBase: @@ -252,7 +254,11 @@ def run_one_item(self, qitem): # result = processed_results[idx][0] and target = processed_results[idx][1] # also each idx might be a query of samples, rather than a single sample # depending on the --samples-to-aggregate* arguments. 
- response_array = array.array("B", np.array(processed_results, np.float32).tobytes()) + s_idx = qitem.idx_offsets[idx] + e_idx = qitem.idx_offsets[idx + 1] + # debug prints + # print("s,e:",s_idx,e_idx, len(processed_results)) + response_array = array.array("B", np.array(processed_results[s_idx:e_idx], np.float32).tobytes()) response_array_refs.append(response_array) bi = response_array.buffer_info() response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) @@ -264,14 +270,14 @@ def enqueue(self, query_samples): query_len = len(query_samples) if query_len < self.max_batchsize: - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = self.ds.get_samples(idx) - self.run_one_item(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx) + self.run_one_item(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) else: bs = self.max_batchsize for i in range(0, query_len, bs): ie = min(i + bs, query_len) - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = self.ds.get_samples(idx[i:ie]) - self.run_one_item(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx[i:ie]) + self.run_one_item(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) def finish(self): pass @@ -308,14 +314,14 @@ def enqueue(self, query_samples): query_len = len(query_samples) if query_len < self.max_batchsize: - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = self.ds.get_samples(idx) - self.tasks.put(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx) + self.tasks.put(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) else: bs = self.max_batchsize for i in range(0, query_len, bs): ie = min(i + bs, query_len) - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = self.ds.get_samples(idx[i:ie]) - self.tasks.put(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx[i:ie]) + self.tasks.put(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) def finish(self): # exit all threads @@ -416,7 +422,7 @@ def main(): ds.load_query_samples([0]) for _ in range(5): - batch_dense_X, batch_lS_o, batch_lS_i, batch_T = ds.get_samples([0]) + batch_dense_X, batch_lS_o, batch_lS_i, _, _ = ds.get_samples([0]) _ = backend.predict(batch_dense_X, batch_lS_o, batch_lS_i) ds.unload_query_samples(None) diff --git a/speech_recognition/rnnt/accuracy_eval.py b/speech_recognition/rnnt/accuracy_eval.py index a1a12a7ad..ea8179285 100644 --- a/speech_recognition/rnnt/accuracy_eval.py +++ b/speech_recognition/rnnt/accuracy_eval.py @@ -6,18 +6,25 @@ import sys import os -from QSL import AudioQSL +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pytorch")) -sys.path.insert(0, os.path.join(os.getcwd(), "pytorch")) +from QSL import AudioQSL from helpers import process_evaluation_epoch, __gather_predictions from parts.manifest import Manifest +dtype_map = { + "int8": 'b', + "int16": 'h', + "int32": 'l', + "int64": 'q', +} def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", required=True) parser.add_argument("--dataset_dir", required=True) 
parser.add_argument("--manifest", required=True) + parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type") args = parser.parse_args() return args @@ -31,13 +38,16 @@ def main(): hypotheses = [] references = [] for result in results: - hypotheses.append(array.array('q', bytes.fromhex(result["data"])).tolist()) + hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) references.append(manifest[result["qsl_idx"]]["transcript"]) - hypotheses = __gather_predictions([hypotheses], labels=labels) + references = __gather_predictions([references], labels=labels) + hypotheses = __gather_predictions([hypotheses], labels=labels) + d = dict(predictions=hypotheses, transcripts=references) - print("Word Error Rate:", process_evaluation_epoch(d)) + wer = process_evaluation_epoch(d) + print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100)) if __name__ == '__main__': main() diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py new file mode 100755 index 000000000..57165e223 --- /dev/null +++ b/tools/submission/submission-checker.py @@ -0,0 +1,648 @@ +""" +A checker for mlperf inference submissions +""" + +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import json +import logging +import os +import re +import sys + +# pylint: disable=missing-docstring + + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("main") + + +MODEL_CONFIG = { + "v0.5": { + "models": ["ssd-small", "ssd-large", "mobilenet", "resnet", "gnmt"], + "required-scenarios-datacenter": { + # anything goes + }, + "optional-scenarios-datacenter": { + # anything goes + }, + "required-scenarios-edge": { + # anything goes + }, + "optional-scenarios-edge": { + # anything goes + }, + "accuracy-target": { + "mobilenet": ("acc", 71.68 * 0.98), + "resnet": ("acc", 76.46 * 0.99), + "ssd-small": ("mAP", 22 * 0.99), + "ssd-large": ("mAP", 20 * 0.99), + "gnmt": ("bleu", 23.9 * 0.99), + }, + "performance-sample-count": { + "mobilenet": 1024, + "resnet": 1024, + "ssd-small": 256, + "ssd-large": 64, + "gnmt": 3903900, + }, + "seeds": { + "qsl_rng_seed": 3133965575612453542, + "sample_index_rng_seed": 665484352860916858, + "schedule_rng_seed": 3622009729038561421, + }, + }, + "v0.7": { + "models": [ + "ssd-small", "ssd-large", "resnet", "rnnt", + "bert", "bert-99", "bert-99.9", + "dlrm", "dlrm-99", "dlrm-99.9" + "3d-unet", "3d-unet-99", "3d-unet-99.9" + ], + "required-scenarios-datacenter": { + "resnet": ["Server", "Offline"], + "ssd-large": ["Server", "Offline"], + "rnnt": ["Server", "Offline"], + "bert": ["Server", "Offline"], + "dlrm": ["Server", "Offline"], + "3d-unet": ["Offline"], + }, + "optional-scenarios-datacenter": { + }, + "required-scenarios-edge": { + "resnet": ["SingleStream", "Offline"], + "ssd-small": ["SingleStream", "Offline"], + "ssd-large": ["SingleStream", "Offline"], + "rnnt": ["SingleStream", "Offline"], + "bert": ["SingleStream", "Offline"], + "3d-unet": ["SingleStream", "Offline"], + }, + "optional-scenarios-edge": { + "resnet": ["MultiStream"], + "ssd-small": ["MultiStream"], + "ssd-large": ["MultiStream"], + }, + "accuracy-target": { + "resnet": ("acc", 76.46 * 0.99), + "ssd-small": ("mAP", 22 * 0.99), + "ssd-large": ("mAP", 20 * 0.99), + "rnnt": ("WER", (100 - 7.452) * 0.99), + "bert": ("F1", 90.874 * 0.99), + "bert-99": ("F1", 90.874 * 0.99), + "bert-99.9": ("F1", 90.874 * 
0.999), + "dlrm": ("AUC", 80.25 * 0.99), + "dlrm-99": ("AUC", 80.25 * 0.99), + "dlrm-99.9": ("AUC", 80.25 * 0.999), + "3d-unet": ("DICE", 0.853 * 0.99), + "3d-unet-99": ("DICE", 0.853 * 0.99), + "3d-unet-99.9": ("DICE", 0.853 * 0.999), + }, + "performance-sample-count": { + "ssd-small": 256, + "ssd-large": 64, + "resnet": 1024, + "rnnt": 2513, + "bert": 3903900, + "dlrm": 204800, + "3d-unet": 16, + }, + "seeds": { + "qsl_rng_seed": 3133965575612453542, + "sample_index_rng_seed": 665484352860916858, + "schedule_rng_seed": 3622009729038561421, + }, + }, +} + +VALID_DIVISIONS = ["open", "closed"] +REQUIRED_PERF_FILES = ["mlperf_log_accuracy.json", "mlperf_log_summary.txt", "mlperf_log_detail.txt"] +REQUIRED_ACC_FILES = REQUIRED_PERF_FILES + ["accuracy.txt"] +REQUIRED_MEASURE_FILES = ["mlperf.conf", "user.conf", "README.md"] +TO_MS = 1000 * 1000 + +SCENARIO_MAPPING = { + "singlestream": "SingleStream", + "multistream": "MultiStream", + "server": "server", + "offline": "Offline", +} + +MODEL_MAPPING = { + "ssd-mobilenet": "ssd-small", + "ssd-resnet34": "ssd-large", + "resnet50": "resnet", + "bert-99": "bert", + "bert-99.9": "bert", + "dlrm-99": "dlrm", + "dlrm-99.9": "dlrm", + "3d-unet-99": "3d-unet", + "3d-unet-99.9": "3d-unet", +} + +RESULT_FIELD = { + "Offline": "Samples per second", + "Single": "90th percentile latency (ns)", + "Multi": "Samples per query", + "Server": "Scheduled samples per second" +} + +ACC_PATTERN = { + "acc": r"^accuracy=([\d\.]+).*", + "AUC": r"^AUC=([\d\.]+).*", + "mAP": r"^mAP=([\d\.]+).*", + "bleu": r"^BLEU\:\s*([\d\.]+).*", + "F1": r"^{\"exact_match\"\:\s*[\d\.]+,\s*\"f1\"\:\s*([\d\.]+)}", + "WER": r"Word Error Rate\:\s*([\d\.]+).*", + "DICE": r"Accuracy\:\s*mean\s*=\s*([\d\.]+).*", +} + +SYSTEM_DESC_REQUIRED_FIELDS = [ + "division", "submitter", "status", "system_name", "number_of_nodes", "host_processor_model_name", + "host_processors_per_node", "host_processor_core_count", "host_memory_capacity", "host_storage_capacity", + "host_storage_type", "accelerators_per_node", "accelerator_model_name", "accelerator_memory_capacity", + "framework", "operating_system" +] + +SYSTEM_DESC_OPTIONAL_FIELDS = [ + "system_type", "other_software_stack", "host_processor_frequency", "host_processor_caches", + "host_memory_configuration", "host_processor_interconnect", "host_networking", "host_networking_topology", + "accelerator_frequency", "accelerator_host_interconnect", "accelerator_interconnect", + "accelerator_interconnect_topology", "accelerator_memory_configuration", + "accelerator_on-chip_memories", "cooling", "hw_notes", "sw_notes" +] + +SYSTEM_IMP_REQUIRED_FILES = [ + "input_data_types", "retraining", "starting_weights_filename", "weight_data_types", + "weight_transformations", +] + + +class Config(): + """Select config value by mlperf version and submission type.""" + def __init__(self, version): + self.base = MODEL_CONFIG.get(version) + self.version = version + self.models = self.base["models"] + self.seeds = self.base["seeds"] + self.accuracy_target = self.base["accuracy-target"] + self.performance_sample_count = self.base["performance-sample-count"] + self.required = None + self.optional = None + + def set_type(self, submission_type): + if submission_type is None and self.version in ["v0.5"]: + return + elif submission_type == "datacenter": + self.required = self.base["required-scenarios-datacenter"] + self.optional = self.base["optional-scenarios-datacenter"] + elif submission_type == "edge": + self.required = self.base["required-scenarios-edge"] + self.optional = 
self.base["optional-scenarios-edge"] + else: + raise ValueError("innvalid system type") + + def get_required(self, model): + if self.version in ["v0.5"]: + return set() + model = MODEL_MAPPING.get(model, model) + if model not in self.required: + raise ValueError("model not known: " + model) + return set(self.required[model]) + + def get_optional(self, model): + if self.version in ["v0.5"]: + return set(["SingleStream", "MultiStream", "Server", "Offline"]) + model = MODEL_MAPPING.get(model, model) + if model not in self.optional: + return set() + return set(self.optional[model]) + + def get_accuracy_target(self, model): + if model not in self.accuracy_target: + raise ValueError("model not known: " + model) + return self.accuracy_target[model] + + def get_performance_sample_count(self, model): + model = MODEL_MAPPING.get(model, model) + if model not in self.performance_sample_count: + raise ValueError("model not known: " + model) + return self.performance_sample_count[model] + + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True, help="submission directory") + parser.add_argument("--version", default="v0.7", choices=list(MODEL_CONFIG.keys()), help="mlperf version") + parser.add_argument("--submitter", help="filter to submitter") + parser.add_argument("--csv", default="summary.csv", help="csv file with results") + args = parser.parse_args() + return args + + +def model_map(config, model): + """Map models names to the official mlperf name.""" + if model in config.models: + return model + if model in MODEL_MAPPING: + return MODEL_MAPPING[model] + if model.startswith("mobilenet"): + model = "mobilenet" + elif model.startswith("rcnn"): + model = "ssd-small" + elif model.startswith("ssdlite") or model.startswith("ssd-inception") or model.startswith("yolo") or \ + model.startswith("ssd-mobilenet") or model.startswith("ssd-resnet50"): + model = "ssd-small" + return model + + +def list_dir(*path): + path = os.path.join(*path) + return [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))] + + +def list_files(*path): + path = os.path.join(*path) + return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] + + +def split_path(m): + return m.replace("\\", "/").split("/") + + +def ignore_errors_for_v0_5(line): + if "check for ERROR in detailed" in line: + return True + if "Loadgen built with uncommitted changes" in line: + return True + if "Ran out of generated queries to issue before the minimum query count and test duration were reached" in line: + return True + if "CAS failed" in line: + return True + return False + + +def check_accuracy_dir(config, model, path): + is_valid = False + acc = None + model_norm = model_map(config, model) + acc_type, acc_target = config.get_accuracy_target(model_norm) + pattern = ACC_PATTERN[acc_type] + with open(os.path.join(path, "accuracy.txt"), "r") as f: + for line in f: + m = re.match(pattern, line) + if m: + acc = m.group(1) + break + + if acc and float(acc) >= acc_target: + is_valid = True + else: + log.error("%s accuracy not met: expected=%f, found=%s", path, acc_target, acc) + + # check if there are any errors in the detailed log + fname = os.path.join(path, "mlperf_log_detail.txt") + if not os.path.exists(fname): + log.error("%s is missing", fname) + is_valid = False + else: + with open(fname, "r") as f: + for line in f: + # look for: ERROR + if "ERROR" in line: + if config.version in ["v0.5"] and ignore_errors_for_v0_5(line): + continue + # TODO: 
should this be a failed run? + log.error("%s contains error: %s", fname, line) + is_valid = False + return is_valid, acc + + +def check_performance_dir(config, model, path): + is_valid = False + rt = {} + # look for: Result is: VALID + fname = os.path.join(path, "mlperf_log_summary.txt") + with open(fname, "r") as f: + for line in f: + m = re.match(r"^Result\s+is\s*\:\s+VALID", line) + if m: + is_valid = True + m = re.match(r"^\s*([\w\s.\(\)\/]+)\s*\:\s*([\w\+\.]+).*", line) + if m: + rt[m.group(1).strip()] = m.group(2).strip() + + model = model_map(config, model) + performance_sample_count = config.get_performance_sample_count(model) + if int(rt['performance_sample_count']) < performance_sample_count: + log.error("%s performance_sample_count, found %s, needs to be > %d", + fname, performance_sample_count, rt['performance_sample_count']) + is_valid = False + + # check if there are any errors in the detailed log + fname = os.path.join(path, "mlperf_log_detail.txt") + with open(fname, "r") as f: + for line in f: + # look for: ERROR + if "ERROR" in line: + if config.version in ["v0.5"] and ignore_errors_for_v0_5(line): + continue + log.error("%s contains error: %s", fname, line) + is_valid = False + + for seed in ["qsl_rng_seed", "sample_index_rng_seed", "schedule_rng_seed"]: + if int(rt[seed]) != config.seeds[seed]: + log.error("%s %s is wrong, expected=%s, found=%s", fname, seed, config.seeds[seed], rt[seed]) + + scenario = rt["Scenario"] + res = float(rt[RESULT_FIELD[scenario]]) + if scenario in ["Single Stream"]: + res /= TO_MS + + return is_valid, res + + +def files_diff(list1, list2): + """returns a list of files that are missing or added.""" + if list1 and list2: + for i in ["mlperf_log_trace.json", "results.json"]: + try: + list1.remove(i) + except: + pass + if len(list1) > len(list2): + return list(set(list1) - set(list2)) + else: + return list(set(list2) - set(list1)) + return [] + + +def check_results_dir(config, filter_submitter, csv): + """ + Walk the results directory and do the checking. + + We are called with the cdw at the root of the submission directory. + level1 division - closed|open + level2 submitter - for example mlperf_org + level3 - results, systems, measurements, code + + For results the structure from here is: + results/$system_desc/$benchmark_model/$scenario/performance/run_n + and + results/$system_desc/$benchmark_model/$scenario/accuracy + + We first walk into results/$system_desc + make sure there is a system_desc.json and its good + Next we walk into the model + make sure the model is good, make sure all required scenarios are there. 
+ Next we walk into each scenario + check the performance directory + check the accuracy directory + if all was good, add the result to the results directory + if there are errors write a None as result so we can report later what failed + """ + head = [ + "Organization", "Availability", "Division", "SystemType", "Platform", "Model", + "Scenario", "Result", "Accuracy", "Location", + ] + fmt = ",".join(["{}"] * len(head)) + "\n" + csv.write(",".join(head) + "\n") + results = {} + + # we are at the top of the submission directory + for division in list_dir("."): + # we are looking at ./$division, ie ./closed + if division not in VALID_DIVISIONS: + log.error("invalid division in input dir %s", division) + continue + is_closed = division == "closed" + + for submitter in list_dir(division): + # we are looking at ./$division/$submitter, ie ./closed/mlperf_org + if filter_submitter and submitter != filter_submitter: + continue + results_path = os.path.join(division, submitter, "results") + if not os.path.exists(results_path): + log.error("no submission in %s", results_path) + results[results_path] = None + continue + + for system_desc in list_dir(results_path): + # we are looking at ./$division/$submitter/$system_desc, ie ./closed/mlperf_org/t4-ort + + # + # check if system_id is good. + # + system_id_json = os.path.join(division, submitter, "systems", system_desc + ".json") + if not os.path.exists(system_id_json): + log.error("no system_desc for %s/%s/%s", division, submitter, system_desc) + results[os.path.join(results_path, system_desc)] = None + continue + + name = os.path.join(results_path, system_desc) + with open(system_id_json) as f: + system_json = json.load(f) + system_type = system_json.get("system_type") + available = system_json.get("status") + if config.version == "v0.7" and system_type not in ["datacenter", "edge"]: + log.error("%s has invalid system type (%s)", system_id_json, system_type) + results[name] = None + continue + config.set_type(system_type) + if not check_system_desc_id(name, system_json, submitter, division): + results[name] = None + + # + # Look at each model + # + for model in list_dir(results_path, system_desc): + # we are looking at ./$division/$submitter/$system_desc/$model, + # ie ./closed/mlperf_org/t4-ort/bert + if is_closed and model not in config.models: + log.error("%s has a invalid model (%s) for closed division", name, model) + results[name] = None + continue + + # + # Look at each scenario + # + required_scenarios = config.get_required(MODEL_MAPPING.get(model, model)) + all_scenarios = set(list(required_scenarios) + list(config.get_optional(MODEL_MAPPING.get(model, model)))) + for scenario in list_dir(results_path, system_desc, model): + # some submissions in v0.5 use lower case scenarios - map them for now + scenario_fixed = SCENARIO_MAPPING.get(scenario, scenario) + + # we are looking at ./$division/$submitter/$system_desc/$model/$scenario, + # ie ./closed/mlperf_org/t4-ort/bert/Offline + name = os.path.join(results_path, system_desc, model, scenario) + results[name] = None + if scenario_fixed not in all_scenarios: + log.warning("%s ignoring scenario %s (neither required nor optional)", name, scenario) + continue + + # check if measurement_dir is good. 
+ measurement_dir = os.path.join(division, submitter, "measurements", + system_desc, model, scenario) + if not os.path.exists(measurement_dir): + log.error("no measurement_dir for %s", name) + results[measurement_dir] = None + else: + if not check_measurement_dir(measurement_dir, name, system_desc, + os.path.join(division, submitter), model, scenario): + log.error("measurement_dir %s has issues", measurement_dir) + results[measurement_dir] = None + + # check accuracy + accuracy_is_valid = False + acc_path = os.path.join(name, "accuracy") + if not os.path.exists(os.path.join(acc_path, "accuracy.txt")): + log.error( + "%s has no accuracy.txt. Generate it with accuracy-imagenet.py or accuracy-coco.py or " + "process_accuracy.py", acc_path) + else: + diff = files_diff(list_files(acc_path), REQUIRED_ACC_FILES) + if diff: + log.error("%s has file list mismatch (%s)", acc_path, diff) + accuracy_is_valid, acc = check_accuracy_dir(config, model, acc_path) + if not accuracy_is_valid and not is_closed: + log.warning("%s, accuracy not valid but taken for open", acc_path) + # TODO: is this correct? + accuracy_is_valid = True + + if accuracy_is_valid: + log.info("%s, accuracy is %s", acc_path, acc) + else: + log.error("%s, accuracy not valid", acc_path) + + if scenario in ["Server"]: + n = ["run_1", "run_2", "run_3", "run_4", "run_5"] + else: + n = ["run_1"] + + for i in n: + perf_path = os.path.join(name, "performance", i) + if not os.path.exists(perf_path): + log.error("%s is missing", perf_path) + continue + diff = files_diff(list_files(perf_path), REQUIRED_PERF_FILES) + if diff: + log.error("%s has file list mismatch (%s)", perf_path, diff) + try: + is_valid, r = check_performance_dir(config, model, perf_path) + except: + is_valid, r = False, None + if is_valid: + results[name] = r + required_scenarios.discard(scenario) + else: + log.error("%s has issues", perf_path) + + if results.get(name): + if accuracy_is_valid: + log.info("%s is OK", name) + csv.write(fmt.format(submitter, available, division, system_type, system_desc, model, + scenario_fixed, r, acc, name)) + else: + results[name] = None + log.error("%s is OK but accuracy has issues", name) + + if required_scenarios: + name = os.path.join(results_path, system_desc, model) + results[name] = None + log.error("%s does not have all required scenarios, missing %s", name, required_scenarios) + + + return results + + +def check_system_desc_id(fname, systems_json, submitter, division): + is_valid = True + # check all required fields + for k in SYSTEM_DESC_REQUIRED_FIELDS: + if k not in systems_json: + is_valid = False + log.error("%s, field %s is missing", fname, k) + + all_fields = SYSTEM_DESC_REQUIRED_FIELDS + SYSTEM_DESC_OPTIONAL_FIELDS + for k in systems_json.keys(): + if k not in all_fields: + log.warning("%s, field %s is unknwon", fname, k) + + if systems_json.get("submitter") != submitter: + log.error("%s has submitter %s, directory has %s", fname, systems_json.get("submitter"), submitter) + is_valid = False + if systems_json.get("division") != division: + log.error("%s has division %s, division has %s", fname, systems_json.get("division"), division) + is_valid = False + return is_valid + + +def check_measurement_dir(measurement_dir, fname, system_desc, root, model, scenario): + files = list_files(measurement_dir) + system_file = None + is_valid = True + for i in REQUIRED_MEASURE_FILES: + if i not in files: + log.error("%s is missing %s", measurement_dir, i) + is_valid = False + for i in files: + if i.startswith(system_desc) and i.endswith("_" 
+ scenario + ".json"): + system_file = i + end = len("_" + scenario + ".json") + break + elif i.startswith(system_desc) and i.endswith(".json"): + system_file = i + end = len(".json") + break + if system_file: + with open(os.path.join(measurement_dir, system_file), "r") as f: + j = json.load(f) + for k in SYSTEM_IMP_REQUIRED_FILES: + if k not in j: + is_valid = False + log.error("%s, field %s is missing", fname, k) + + impl = system_file[len(system_desc) + 1:-end] + code_dir = os.path.join(root, "code", model, impl) + if not os.path.exists(code_dir): + log.error("%s is missing %s*.json", fname, system_desc) + else: + log.error("%s is missing %s*.json", fname, system_desc) + + return is_valid + + +def main(): + args = get_args() + + config = Config(args.version) + + with open(args.csv, "w") as csv: + os.chdir(args.input) + # check results directory + results = check_results_dir(config, args.submitter, csv) + + # log results + with_results = 0 + for k, v in results.items(): + if v is None: + log.error("NoResults %s", k) + else: + log.info("Results %s %s", k, v) + with_results += 1 + + # print summary + log.info("Results=%d, NoResults=%d", with_results, len(results) - with_results) + if len(results) != with_results: # bad_submissions or meta_errors or measurement_errors: + log.error("SUMMARY: submission has errors") + return 1 + else: + log.info("SUMMARY: submission looks OK") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/v0.5/tools/submission/submission-checker.py b/v0.5/tools/submission/submission-checker.py deleted file mode 100755 index ef6e8f136..000000000 --- a/v0.5/tools/submission/submission-checker.py +++ /dev/null @@ -1,426 +0,0 @@ -""" -A checker for mlperf inference submissions -""" - -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import collections -import json -import logging -import os -import re -import sys -import time - -# pylint: disable=missing-docstring - - -logging.basicConfig(level=logging.INFO) -log = logging.getLogger("main") - -VALID_MODELS = ["ssd-small", "ssd-large", "mobilenet", "resnet", "gnmt"] -VALID_DIVISIONS = ["open", "closed"] -REQUIRED_PERF_FILES = ["mlperf_log_accuracy.json", "mlperf_log_summary.txt", "mlperf_log_detail.txt"] -REQUIRED_ACC_FILES = REQUIRED_PERF_FILES + ["accuracy.txt"] -REQUIRED_MEASURE_FILES = ["mlperf.conf", "user.conf", "README.md"] -TOMS = 1000 * 1000 - - -PERFORMANCE_SAMPLE_COUNT = { - "mobilenet": 1024, - "resnet50": 1024, - "resnet": 1024, - "ssd-mobilenet": 256, - "ssd-small": 256, - "ssd-resnet34": 64, - "ssd-large": 64, - "gnmt": 3903900, -} - -ACCURAY_TARGET = { - "mobilenet": 71.68 * 0.98, - "resnet50": 76.46 * 0.99, - "resnet": 76.46 * 0.99, - "ssd-mobilenet": 22 * 0.99, - "ssd-small": 22 * 0.99, - "ssd-resnet34": 20 * 0.99, - "ssd-large": 20 * 0.99, - "gnmt": 23.9 * 0.99, -} - -SEEDS = { - "qsl_rng_seed": 3133965575612453542, - "sample_index_rng_seed": 665484352860916858, - "schedule_rng_seed": 3622009729038561421 -} - -RESULT_VALUE = { - "Offline": "Samples per second", - "Single": "90th percentile latency (ns)", - "Multi": "Samples per query", - "Server": "Scheduled samples per second" -} - - -def get_args(): - """Parse commandline.""" - parser = argparse.ArgumentParser() - parser.add_argument("--input", required=True, help="submission directory") - parser.add_argument("--submitter", help="filter to submitter") - args = parser.parse_args() - return args - - -def model_map(model): - if 
model.startswith("mobilenet"): - model = "mobilenet" - elif model.startswith("rcnn"): - model = "ssd-small" - elif model.startswith("ssdlite") or model.startswith("ssd-inception") or model.startswith("yolo") or \ - model.startswith("ssd-mobilenet") or model.startswith("ssd-resnet50"): - model = "ssd-small" - if model not in PERFORMANCE_SAMPLE_COUNT: - model = None - return model - - -def list_dir(*path): - path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))] - - -def list_files(*path): - path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] - - -def split_path(m): - return m.replace("\\", "/").split("/") - - -def ignore_errors(line): - if "check for ERROR in detailed" in line: - return True - if "Loadgen built with uncommitted changes" in line: - return True - if "Ran out of generated queries to issue before the minimum query count and test duration were reached" in line: - return True - if "CAS failed": - return True - return False - - -def check_accuracy_dir(model, dir): - is_valid = False - acc = 0 - # look for: accuracy=... or mAP=... - with open(os.path.join(dir, "accuracy.txt"), "r") as f: - for line in f: - m = re.match("^accuracy=([\d\.]+).*", line) - if m: - is_valid = True - acc = m.group(1) - break - m = re.match("^mAP=([\d\.]+).*", line) - if m: - is_valid = True - acc = m.group(1) - break - m = re.match("^BLEU\:\s*([\d\.]+).*", line) - if m: - is_valid = True - acc = m.group(1) - break - - if is_valid: - model_norm = model_map(model) - if model_norm: - target_acc = ACCURAY_TARGET[model_norm] - if float(acc) < target_acc: - log.error("{} accuracy not met: {:.2f}/{}".format(dir, target_acc, acc)) - is_valid = False - else: - log.error("{} unknown model, can't find target accuracy".format(dir)) - - # check if there are any errors in the detailed log - fname = os.path.join(dir, "mlperf_log_detail.txt") - if not os.path.exists(fname): - log.warning("{} missing".format(fname)) - else: - with open(fname, "r") as f: - for line in f: - # look for: ERROR - if "ERROR" in line: - if ignore_errors(line): - continue - # TODO: should this be a failed run? - log.warning("{} contains error: {}".format(fname, line)) - return is_valid - - -def check_performance_dir(model, dir): - is_valid = False - rt = {} - # look for: Result is: VALID - fname = os.path.join(dir, "mlperf_log_summary.txt") - with open(fname, "r") as f: - for line in f: - m = re.match("^Result\s+is\s*\:\s+VALID", line) - if m: - is_valid = True - m = re.match("^\s*([\w\s.\(\)\/]+)\s*\:\s*([\w\+\.]+).*", line) - if m: - rt[m.group(1).strip()] = m.group(2).strip() - - model = model_map(model) - if model in PERFORMANCE_SAMPLE_COUNT: - if int(rt['performance_sample_count']) < PERFORMANCE_SAMPLE_COUNT[model]: - log.error("{} performance_sample_count should be {}".format(fname, PERFORMANCE_SAMPLE_COUNT[model])) - is_valid = False - else: - log.error("{} performance_sample_count not checked, bad model name {}".format(fname, model)) - - # check if there are any errors in the detailed log - fname = os.path.join(dir, "mlperf_log_detail.txt") - with open(fname, "r") as f: - for line in f: - # look for: ERROR - if "ERROR" in line: - if ignore_errors(line): - continue - # TODO: does this make the run fail? 
- log.warning("{} contains error: {}".format(fname, line)) - - for seed in ["qsl_rng_seed", "sample_index_rng_seed", "schedule_rng_seed"]: - if int(rt[seed]) != SEEDS[seed]: - log.error("{} {} wrong, {}/{}".format(fname, seed, rt[seed], SEEDS[seed])) - - scenario = rt["Scenario"] - res = float(rt[RESULT_VALUE[scenario]]) - if scenario in ["Single Stream"]: - res /= TOMS - - return is_valid, res - - -def files_diff(list1, list2): - """returns a list of files that are missing or added.""" - if list1 and list2: - for i in ["mlperf_log_trace.json", "results.json"]: - try: - list1.remove(i) - except: - pass - if len(list1) > len(list2): - return list(set(list1) - set(list2)) - else: - return list(set(list2) - set(list1)) - return [] - - -def check_results_dir(dir, filter_submitter): - good_submissions = [] - bad_submissions = {} - results = {} - - for division in list_dir("."): - if division not in ["closed", "open"]: - continue - for submitter in list_dir(division): - if filter_submitter and submitter != filter_submitter: - continue - results_path = os.path.join(division, submitter, "results") - if not os.path.exists(results_path): - log.warning("no submission in {}/{}".format(division, submitter)) - continue - for system_desc in list_dir(results_path): - # check if system_id is good. Report failure for each model/scenario. - system_id_json = os.path.join(division, submitter, "systems", system_desc + ".json") - device_bad = not os.path.exists(system_id_json) - for model in list_dir(results_path, system_desc): - if division in "closed" and model not in VALID_MODELS: - bad_submissions[os.path.join(system_desc, model)] = \ - "{} has an invalid model name {}".format(os.path.join(results_path, system_desc), model) - - for scenario in list_dir(results_path, system_desc, model): - name = os.path.join(results_path, system_desc, model, scenario) - results[name] = "NoResults" - acc_path = os.path.join(name, "accuracy") - if not os.path.exists(os.path.join(acc_path, "accuracy.txt")): - log.error( - "{} has no accuracy.txt. 
Generate it with accuracy-imagenet.py or accuracy-coco.py or " - "process_accuracy.py".format(acc_path)) - bad_submissions[name] = "{} has no accuracy.txt".format(acc_path) - else: - diff = files_diff(list_files(acc_path), REQUIRED_ACC_FILES) - if diff: - bad_submissions[name] = "{} has file list mismatch ({})".format(acc_path, diff) - if not check_accuracy_dir(model, acc_path): - bad_submissions[name] = "{} has issues".format(acc_path) - n = ["run_1"] - if scenario in ["Server"]: - n = ["run_1", "run_2", "run_3", "run_4", "run_5"] - if not os.path.exists(os.path.join(name, "performance", n[0])): - n = ["run1"] - if not os.path.exists(os.path.join(name, "performance", n[0])): - n = ["."] - else: - if scenario in ["Server"]: - n = ["run1", "run2", "run3", "run4", "run5"] - - for i in n: - perf_path = os.path.join(name, "performance", i) - if not os.path.exists(perf_path): - bad_submissions[name] = "{} missing".format(perf_path) - continue - diff = files_diff(list_files(perf_path), REQUIRED_PERF_FILES) - if diff: - bad_submissions[name] = "{} has file list mismatch ({})".format(perf_path, diff) - try: - is_valid, results[name] = check_performance_dir(model, perf_path) - except Exception as ex: - is_valid, results[name] = False, "NoResults" - if not is_valid: - bad_submissions[name] = "{} has issues".format(perf_path) - if device_bad: - bad_submissions[name] = "{}: no such system id {}".format(name, system_desc) - else: - good_submissions.append(name) - - return good_submissions, bad_submissions, results - - -def compare_json(fname, template, errors): - error_count = len(errors) - try: - with open(fname, "r") as f: - j = json.load(f) - # make sure all required sections/fields are there - for k, v in template.items(): - sz = j.get(k) - if sz is None and v == "required": - errors.append("{} field {} missing".format(fname, k)) - - # make sure no undefined sections/fields are in the meta data - for k, v in j.items(): - z = template.get(k) - if z is None: - errors.append("{} has unknwon field {}".format(fname, k)) - except Exception as ex: - errors.append("{} unexpected error {}".format(fname, ex)) - return error_count == len(errors) - - -def check_system_desc_id(good_submissions, systems_json): - errors = [] - checked = set() - for submission in good_submissions: - parts = split_path(submission) - system_desc = parts[3] - submitter = parts[1] - division = parts[0] - if division not in VALID_DIVISIONS: - errors.append(("{} has invalid division {}".format(submission, j["submitter"], division))) - continue - - fname = os.path.join(parts[0], parts[1], "systems", system_desc + ".json") - if fname not in checked: - checked.add(fname) - if not compare_json(fname, systems_json, errors): - continue - with open(fname, "r") as f: - j = json.load(f) - if j["submitter"] != submitter: - errors.append(("{} has submitter {}, directory has {}".format(fname, j["submitter"], submitter))) - continue - if j["division"] != division: - errors.append(("{} has division {}, division has {}".format(fname, j["division"], division))) - continue - if errors: - for i in errors: - log.error(i) - return errors - - -def check_measurement_dir(good_submissions, systems_imp_json): - errors = [] - for submission in good_submissions: - parts = split_path(submission) - system_desc = parts[3] - measurement_dir = os.path.join(parts[0], parts[1], "measurements", system_desc) - if not os.path.exists(measurement_dir): - errors.append("{} directory missing".format(measurement_dir)) - continue - model = parts[4] - scenario = parts[5] - fname 
= os.path.join(measurement_dir, model, scenario) - files = list_files(fname) - system_file = None - for i in REQUIRED_MEASURE_FILES: - if i not in files: - errors.append("{} is missing {}".format(fname, i)) - for i in files: - if i.startswith(system_desc) and i.endswith("_" + scenario + ".json"): - system_file = i - end = len("_" + scenario + ".json") - break - elif i.startswith(system_desc) and i.endswith(".json"): - system_file = i - end = len(".json") - break - if system_file: - compare_json(os.path.join(fname, system_file), systems_imp_json, errors) - impl = system_file[len(system_desc) + 1:-end] - code_dir = os.path.join(parts[0], parts[1], "code", model, impl) - if not os.path.exists(code_dir): - errors.append("{} is missing".format(code_dir)) - else: - errors.append("{} is missing {}*.json".format(fname, system_desc)) - - if errors: - for i in errors: - log.error(i) - return errors - - -def main(): - args = get_args() - - script_path = os.path.dirname(sys.argv[0]) - with open(os.path.join(script_path, "system_desc_id.json"), "r") as f: - systems_json = json.load(f) - with open(os.path.join(script_path, "system_desc_id_imp.json"), "r") as f: - systems_imp_json = json.load(f) - - os.chdir(args.input) - - # 1. check results directory - good_submissions, bad_submissions, results = check_results_dir(args.input, args.submitter) - - # 2. check the meta data under systems - meta_errors = check_system_desc_id(good_submissions, systems_json) - - # 3. check measurement and code dir - measurement_errors = check_measurement_dir(good_submissions, systems_imp_json) - with_results = 0 - for k, v in results.items(): - if v == "NoResults": - log.error("NoResults {}".format(k)) - else: - log.info("Results {} {}".format(k, v)) - with_results +=1 - - log.info("Results={}, NoResults={}".format(with_results, len(results)-with_results)) - if bad_submissions or meta_errors or measurement_errors: - log.error("SUMMARY: submission has errors") - return 1 - else: - log.info("SUMMARY: submission looks OK") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/v0.5/tools/submission/submission-to-csv.py b/v0.5/tools/submission/submission-to-csv.py deleted file mode 100644 index 122cc2d85..000000000 --- a/v0.5/tools/submission/submission-to-csv.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Tool to create a csv file from a mlperf inference submission directory -""" - -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import collections -import json -import logging -import os -import re -import sys -import time - -# pylint: disable=missing-docstring - - -logging.basicConfig(level=logging.INFO) -log = logging.getLogger("main") - -VALID_MODELS = ["ssd-small", "ssd-large", "mobilenet", "resnet", "gnmt"] -VALID_DIVISIONS = ["open", "closed"] - - -def get_args(): - """Parse commandline.""" - parser = argparse.ArgumentParser() - parser.add_argument("--input", required=True, help="submission directory") - parser.add_argument("--output", help="output") - parser.add_argument("--submitter", help="filter to submitter") - args = parser.parse_args() - return args - - -def list_dir(*path): - path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))] - - -def list_files(*path): - path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] - - -def split_path(m): - return m.replace("\\", "/").split("/") - - -def model_map(model): - if 
model.startswith("mobilenet"): - model = "mobilenet" - elif model.startswith("rcnn"): - model = "ssd-small" - elif model.startswith("resnet50"): - model = "resnet" - elif model.startswith("ssdlite") or model.startswith("ssd-inception") or model.startswith("yolo") or \ - model.startswith("ssd-mobilenet") or model.startswith("ssd-resnet50"): - model = "ssd-small" - if model not in VALID_MODELS: - model = None - return model - - -def get_accuracy(model, dir): - is_valid = False - acc = 0 - # look for: accuracy=... or mAP=... - with open(os.path.join(dir, "accuracy.txt"), "r") as f: - for line in f: - m = re.match("^accuracy=([\d\.]+).*", line) - if m: - acc = m.group(1) - break - m = re.match("^mAP=([\d\.]+).*", line) - if m: - acc = m.group(1) - break - m = re.match("^BLEU\:\s*([\d\.]+).*", line) - if m: - acc = m.group(1) - break - return float(acc) - - -RESULT_VALUE = { - "Offline": "Samples per second", - "SingleStream": "90th percentile latency (ns)", - "MultiStream": "Samples per query", - "Server": "Scheduled samples per second" -} - -TOMS = 1000 * 1000 - - -def get_performance(model, scenario, dir, kv): - rt = {} - # look for: Result is: VALID - fname = os.path.join(dir, "mlperf_log_summary.txt") - with open(fname, "r") as f: - for line in f: - m = re.match("^\s*([\w\s.\(\)\/]+)\s*\:\s*([\w\+\.]+).*", line) - if m: - rt[m.group(1).strip()] = m.group(2).strip() - - if scenario == "singlestream": - scenario = "SingleStream" - if scenario == "server": - scenario = "Server" - if scenario == "offline": - scenario = "Offline" - if scenario == "multistream": - scenario = "MultiStream" - kv["scenario"] = scenario - res = float(rt[RESULT_VALUE[scenario]]) - if scenario in ["SingleStream"]: - res /= TOMS - kv["result"] = res - kv["p50"] = float(rt["50.00 percentile latency (ns)"]) / TOMS - kv["p90"] = float(rt["90.00 percentile latency (ns)"]) / TOMS - kv["p99"] = float(rt["99.00 percentile latency (ns)"]) / TOMS - - -def walk_results_dir(dir, filter_submitter, results): - for division in list_dir("."): - if division not in ["closed", "open"]: - continue - for submitter in list_dir(division): - if "example" in submitter: - continue - if filter_submitter and submitter != filter_submitter: - continue - results_path = os.path.join(division, submitter, "results") - if not os.path.exists(results_path): - log.warning("no submission in {}/{}".format(division, submitter)) - continue - for system_desc in list_dir(results_path): - # check if system_id is good. Report failure for each model/scenario. 
- for model in list_dir(results_path, system_desc): - try: - model_norm = model_map(model) - for scenario in list_dir(results_path, system_desc, model): - name = os.path.join(results_path, system_desc, model, scenario).replace("\\", "/") - nn = os.path.join(submitter, division, system_desc, model) - kv = {"name": nn, "model": model_norm, "system": system_desc, - "division": division, "submitter": submitter} - acc_path = os.path.join(name, "accuracy") - if not os.path.exists(os.path.join(acc_path, "accuracy.txt")): - log.error("{} has no accuracy.txt".format(acc_path)) - kv["acc"] = get_accuracy(model, acc_path) - n = ["1"] - for i in n: - perf_path = os.path.join(name, "performance", "run_" + str(i)) - get_performance(model_norm, scenario, perf_path, kv) - results.append(kv) - except Exception as ex: - log.error("{}, {}".format(name, ex)) - - -def main(): - args = get_args() - - os.chdir(args.input) - - results = [] - walk_results_dir(args.input, args.submitter, results) - columns = ['name', 'model', 'system', 'division', 'submitter', 'acc', 'scenario', 'result', - 'p50', 'p90', 'p99'] - if args.output: - with open(args.output, "w") as f: - f.write(",".join(columns) + "\n") - for r in results: - col = [str(r[c]) for c in columns] - f.write(",".join(col) + "\n") - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/v0.5/tools/submission/system_desc_id.json b/v0.5/tools/submission/system_desc_id.json deleted file mode 100755 index 9c792614c..000000000 --- a/v0.5/tools/submission/system_desc_id.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "division": "reqired", - "submitter": "required", - "status": "required", - "system_name": "required", - - "number_of_nodes": "required", - "host_processor_model_name": "required", - "host_processors_per_node": "required", - "host_processor_core_count": "required", - "host_processor_frequency": "", - "host_processor_caches": "", - "host_memory_configuration": "", - "host_memory_capacity": "required", - "host_storage_capacity": "required", - "host_storage_type": "required", - "host_processor_interconnect": "", - "host_networking": "", - "host_networking_topology": "", - - "accelerators_per_node": "required", - "accelerator_model_name": "required", - "accelerator_frequency": "", - "accelerator_host_interconnect": "", - "accelerator_interconnect": "", - "accelerator_interconnect_topology": "", - "accelerator_memory_capacity": "required", - "accelerator_memory_configuration": "", - "accelerator_on-chip_memories": "", - "cooling": "", - "hw_notes": "", - - "framework": "required", - "operating_system": "required", - "other_software_stack": "required", - "sw_notes": "" -} diff --git a/v0.5/tools/submission/system_desc_id_imp.json b/v0.5/tools/submission/system_desc_id_imp.json deleted file mode 100755 index c0734b177..000000000 --- a/v0.5/tools/submission/system_desc_id_imp.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "input_data_types": "required", - "retraining": "required", - "starting_weights_filename": "required", - "weight_data_types": "required", - "weight_transformations": "required" -} diff --git a/v0.7/compliance/nvidia/README.md b/v0.7/compliance/nvidia/README.md new file mode 100755 index 000000000..da96bb3d5 --- /dev/null +++ b/v0.7/compliance/nvidia/README.md @@ -0,0 +1,25 @@ +# Compliance Testing +This repository provides the compliance tests that need to be run in order to demonstrate a valid submission. + +# Table of Contents +1. [Introduction](#introduction) +2. [Test Infrastructure](#Test-Infrastructure) +3. 
[Test Methodology](#Test-Methodology)
+
+## Introduction
+A handful of compliance tests have been created to help ensure that submissions comply with a subset of the MLPerf rules. Each compliance test must be run once for each submission run, and the logs from the compliance test run must be uploaded along with the rest of the submission collateral. Scripts are provided in each of the test subdirectories to help with copying the compliance test logs into the correct directory structure for upload.
+
+## Test Infrastructure
+The compliance tests exercise functionality in LoadGen through a config file that overrides normal LoadGen settings, enabling LoadGen to run in a variety of compliance testing modes. Upon invocation, LoadGen checks whether an `audit.config` file exists in the current working directory. The configuration parameters in `audit.config` override any settings set by `mlperf.conf` or `user.conf`.
+## Test Methodology
+Running a compliance test typically entails three steps:
+#### 1. Setup
+Copy the provided `audit.config` file from the test repository into the current working directory from which the benchmark typically starts execution.
+#### 2. Execution
+Run the benchmark as one normally would for a submission run. LoadGen will read `audit.config` and execute the compliance test.
+Note: remove the `audit.config` file from the working directory afterwards to prevent unintentionally running in compliance testing mode in future runs.
+#### 3. Verification
+Run the provided Python-based verification script to ensure that the compliance test has successfully completed and meets expectations in terms of performance and/or accuracy. The script will also copy the output compliance logs to a path specified by the user, in the correct directory structure, in preparation for upload to the MLPerf submission repository.
+
+
+
diff --git a/v0.7/compliance/nvidia/TEST01/3d-unet/audit.config b/v0.7/compliance/nvidia/TEST01/3d-unet/audit.config
new file mode 100644
index 000000000..984895a24
--- /dev/null
+++ b/v0.7/compliance/nvidia/TEST01/3d-unet/audit.config
@@ -0,0 +1,9 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model maybe '*' as wildcard. In that case the value applies to all models.
+# All times are in milli seconds
+
+# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf)
+*.*.mode = 2
+*.*.accuracy_log_rng_seed = 720381539243781796
+*.*.accuracy_log_sampling_target = 64
diff --git a/v0.7/compliance/nvidia/TEST01/README.md b/v0.7/compliance/nvidia/TEST01/README.md
new file mode 100755
index 000000000..6e92f3277
--- /dev/null
+++ b/v0.7/compliance/nvidia/TEST01/README.md
@@ -0,0 +1,44 @@
+
+# Test 01 - Verify accuracy in performance mode
+## Introduction
+The purpose of this test is to ensure that valid inferences are being performed in performance mode. By default, the inference result that is returned from the SUT to LoadGen is not written to the accuracy JSON file and thus not checked for accuracy. In this test, the inference results of a subset of the total samples issued by LoadGen are written to the accuracy JSON. In order to pass this test, two criteria must be satisfied:
+
+ 1. The inference results in the accuracy JSON file must match the inference results in the accuracy JSON generated in accuracy mode in the submission run.
+ 2. The performance while running this test must match the performance of the submission within 10% (see the sketch below).
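+
+As an illustration only (verify_performance.py is the authoritative check), the pass window in criterion 2 can be sketched as follows, assuming `ref_score` and `test_score` are the scenario metrics parsed from the submission and compliance `mlperf_log_summary.txt` files:
+
+    def performance_within_window(ref_score, test_score, threshold=0.10):
+        # Passes when the compliance score lies strictly within +/- threshold of the
+        # submission score; verify_performance.py relaxes the threshold to 0.20 for
+        # very low-latency single-stream results.
+        return ref_score * (1 - threshold) < test_score < ref_score * (1 + threshold)
+
+    # e.g. a submission at 1000.0 samples/s and a compliance run at 950.0 samples/s pass
+    assert performance_within_window(1000.0, 950.0)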
+
+## Performance considerations
+The subset of sample results chosen to be written to the accuracy JSON is determined randomly, using a probability equal to the `accuracy_log_sampling_target` specified in the audit.config file divided by the total expected number of completed samples in the test run. This total expected number of completed samples is based on `min_duration`, `samples_per_query`, and `target_qps`. The goal is to ensure that a reasonable number of sample results get written to the accuracy JSON regardless of the throughput of the system-under-test. Given that the number of actual completed samples may not match the expected number, the number of inference results written to the accuracy JSON may not exactly match `accuracy_log_sampling_target`.
+
+There is an audit.config file for each individual benchmark, located in the benchmark subdirectories in this test directory. The `accuracy_log_sampling_target` value for each benchmark is chosen taking into consideration the performance sample count and the size of the inference result. If performance with sampling enabled cannot meet the pass threshold set in verify_performance.py, `accuracy_log_sampling_target` may be reduced to check that performance approaches the submission score.
+
+## Log size
+3d-unet is unique in that its per-sample inference result output is drastically larger than that of the other benchmarks. For all other benchmarks, the accuracy JSON results can be checked using Python JSON libraries, which can be enabled by providing `--fastmode` to the run_verification.py script. For 3d-unet, using fastmode will result in verify_accuracy.py running out of memory, so the alternative method based on UNIX command-line utilities must be used instead by not supplying the `--fastmode` switch.
+
+## Prerequisites
+This script works best with Python 3.3 or later. For 3d-unet, the accuracy verification script requires the `wc`, `sed`, `awk`, `head`, `tail`, `grep`, and `md5sum` UNIX command-line utilities.
+
+## Non-determinism
+Note that under MLPerf inference rules, certain forms of non-determinism are acceptable, which can cause inference results to differ across runs. It is foreseeable that the results obtained during the accuracy run differ from those obtained during the performance run, which will cause the accuracy checking script to report failure. Test failure will automatically result in an objection, but the objection can be overturned by comparing the quality of the results generated in performance mode to those obtained in accuracy mode. This can be done by using the accuracy measurement scripts provided as part of the repo to ensure that the accuracy score meets the target. An example is provided for GNMT in the gnmt folder.
+
+## Instructions
+
+### Part I
+Run the test with the provided audit.config in the corresponding benchmark subdirectory. Note that audit.config must be copied to the directory from which the benchmark is run. Verification that audit.config was properly read can be done by checking that LoadGen has found audit.config in mlperf_log_detail.txt.
+
+### Part II
+Run the verification script:
+ `python3 run_verification.py -r RESULTS_DIR -c COMPLIANCE_DIR -o OUTPUT_DIR [--dtype {byte,float32,int32,int64}] [--fastmode]`
+
+RESULTS_DIR: Specifies the path to the corresponding results directory that contains the accuracy and performance subdirectories containing the submission logs, i.e.
`inference_results_v0.7/closed/NVIDIA/results/GPU/resnet/Offline` +COMPLIANCE_DIR: Specifies the path to the directory containing the logs from the compliance test run. +OUTPUT_DIR: Specifies the path to the output directory where compliance logs will be uploaded from, i.e. `inference_results_v0.7/closed/NVIDIA/compliance/GPU/resnet/Offline` + +Expected outcome: + + Accuracy check pass: True + Performance check pass: True + TEST01 verification complete + + + + diff --git a/v0.7/compliance/nvidia/TEST01/bert/audit.config b/v0.7/compliance/nvidia/TEST01/bert/audit.config new file mode 100644 index 000000000..c861e2a3d --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/bert/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 4096 diff --git a/v0.7/compliance/nvidia/TEST01/dlrm/audit.config b/v0.7/compliance/nvidia/TEST01/dlrm/audit.config new file mode 100644 index 000000000..c861e2a3d --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/dlrm/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 4096 diff --git a/v0.7/compliance/nvidia/TEST01/resnet/audit.config b/v0.7/compliance/nvidia/TEST01/resnet/audit.config new file mode 100644 index 000000000..c861e2a3d --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/resnet/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 4096 diff --git a/v0.7/compliance/nvidia/TEST01/rnnt/audit.config b/v0.7/compliance/nvidia/TEST01/rnnt/audit.config new file mode 100644 index 000000000..c861e2a3d --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/rnnt/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 4096 diff --git a/v0.7/compliance/nvidia/TEST01/run_verification.py b/v0.7/compliance/nvidia/TEST01/run_verification.py new file mode 100644 index 000000000..8124ceb35 --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/run_verification.py @@ -0,0 +1,143 @@ +#! /usr/bin/env python3 +# Copyright 2018 The MLPerf Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +import os +import sys +import shutil +import subprocess +import argparse +import json + +import numpy as np + +sys.path.append(os.getcwd()) + +dtype_map = { + "byte": np.byte, + "float32": np.float32, + "int32": np.int32, + "int64": np.int64 +} + +def main(): + + + py3 = sys.version_info >= (3,0) + # Parse arguments to identify the path to the accuracy logs from + # the accuracy and performance runs + parser = argparse.ArgumentParser() + parser.add_argument( + "--results_dir", "-r", + help="Specifies the path to the corresponding results directory that contains the accuracy and performance subdirectories containing the submission logs, i.e. inference_results_v0.7/closed/NVIDIA/results/T4x8/resnet/Offline.", + default="" + ) + parser.add_argument( + "--compliance_dir", "-c", + help="Specifies the path to the directory containing the logs from the compliance test run.", + default="" + ) + parser.add_argument( + "--output_dir", "-o", + help="Specifies the path to the output directory where compliance logs will be uploaded from, i.e. inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.", + default="" + ) + parser.add_argument( + "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"], help="data type of the label (only needed in fastmode") + parser.add_argument( + "--fastmode", action="store_true", + help="Use legacy method using python JSON library instead of unix commandline utilities (uses more memory but much faster.") + + args = parser.parse_args() + + print("Parsing arguments.") + results_dir = args.results_dir + compliance_dir = args.compliance_dir + output_dir = os.path.join(args.output_dir, "TEST01") + fastmode = "" + if args.fastmode: + fastmode = " --fastmode" + else: + for binary in ["wc", "md5sum", "grep", "awk", "sed", "head", "tail"]: + missing_binary = False + if shutil.which(binary) == None: + print("Error: This script requires the {:} commandline utility".format(binary)) + missing_binary = True + if missing_binary: + exit() + + dtype = args.dtype + + # run verify accuracy + verify_accuracy_command = "python3 verify_accuracy.py --dtype " + args.dtype + fastmode + " -r " + results_dir + "/accuracy/mlperf_log_accuracy.json" + " -t " + compliance_dir + "/mlperf_log_accuracy.json | tee verify_accuracy.txt" + try: + os.system(verify_accuracy_command) + except: + print("Exception occurred trying to execute:\n " + verify_accuracy_command) + # check if verify accuracy script passes + + accuracy_pass_command = "grep PASS verify_accuracy.txt" + accuracy_pass = "TEST PASS" in subprocess.check_output(accuracy_pass_command, shell=True).decode("utf-8") + + # run verify performance + verify_performance_command = "python3 verify_performance.py -r " + results_dir + "/performance/run_1/mlperf_log_summary.txt" + " -t " + compliance_dir + "/mlperf_log_summary.txt | tee verify_performance.txt" + try: + os.system(verify_performance_command) + 
except: + print("Exception occurred trying to execute:\n " + verify_performance_command) + + # check if verify performance script passes + performance_pass_command = "grep PASS verify_performance.txt" + performance_pass = "TEST PASS" in subprocess.check_output(performance_pass_command, shell=True).decode("utf-8") + + # setup output compliance directory structure + output_accuracy_dir = os.path.join(output_dir, "accuracy") + output_performance_dir = os.path.join(output_dir, "performance", "run_1") + try: + if not os.path.isdir(output_accuracy_dir): + os.makedirs(output_accuracy_dir) + except: + print("Exception occurred trying to create " + output_accuracy_dir) + try: + if not os.path.isdir(output_performance_dir): + os.makedirs(output_performance_dir) + except: + print("Exception occurred trying to create " + output_performance_dir) + + # copy compliance logs to output compliance directory + shutil.copy2("verify_accuracy.txt",output_dir) + shutil.copy2("verify_performance.txt",output_dir) + accuracy_file = os.path.join(compliance_dir,"mlperf_log_accuracy.json") + summary_file = os.path.join(compliance_dir,"mlperf_log_summary.txt") + detail_file = os.path.join(compliance_dir,"mlperf_log_detail.txt") + + try: + shutil.copy2(accuracy_file,output_accuracy_dir) + except: + print("Exception occured trying to copy " + accuracy_file + " to " + output_accuracy_dir) + try: + shutil.copy2(summary_file,output_performance_dir) + except: + print("Exception occured trying to copy " + summary_file + " to " + output_performance_dir) + try: + shutil.copy2(detail_file,output_performance_dir) + except: + print("Exception occured trying to copy " + detail_file + " to " + output_performance_dir) + + print("Accuracy check pass: {:}".format(accuracy_pass)) + print("Performance check pass: {:}".format(performance_pass)) + print("TEST01 verification complete") + +if __name__ == '__main__': + main() diff --git a/v0.7/compliance/nvidia/TEST01/ssd-large/audit.config b/v0.7/compliance/nvidia/TEST01/ssd-large/audit.config new file mode 100644 index 000000000..03e70a4c7 --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/ssd-large/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 256 diff --git a/v0.7/compliance/nvidia/TEST01/ssd-small/audit.config b/v0.7/compliance/nvidia/TEST01/ssd-small/audit.config new file mode 100644 index 000000000..846c3d9da --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/ssd-small/audit.config @@ -0,0 +1,9 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.accuracy_log_rng_seed = 720381539243781796 +*.*.accuracy_log_sampling_target = 1024 diff --git a/v0.7/compliance/nvidia/TEST01/verify_accuracy.py b/v0.7/compliance/nvidia/TEST01/verify_accuracy.py new file mode 100644 index 000000000..b7859b78e --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/verify_accuracy.py @@ -0,0 +1,177 @@ +#! 
/usr/bin/env python3 +# Copyright 2018 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +import os +import subprocess +import sys +import shutil +sys.path.append(os.getcwd()) + +import argparse +import json + +import numpy as np + +dtype_map = { + "byte": np.byte, + "float32": np.float32, + "int32": np.int32, + "int64": np.int64 +} + +def main(): + + # Parse arguments to identify the path to the accuracy logs from + # the accuracy and performance runs + parser = argparse.ArgumentParser() + parser.add_argument( + "--reference_accuracy", "-r", + help="Specifies the path to the accuracy log from a submission/accuracy run.", + default="" + ) + parser.add_argument( + "--test_accuracy", "-t", + help="Specifies the path to the accuracy log from a performance run with accuracy log sampling enabled.", + default="" + ) + parser.add_argument( + "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"], help="data type of the label") + + parser.add_argument( + "--fastmode", action="store_true", + help="Use legacy method using python JSON library instead of unix commandline utilities (uses more memory but much faster.") + args = parser.parse_args() + + print("Verifying accuracy. 
This might take a while...") + acc_log = args.reference_accuracy + perf_log = args.test_accuracy + + if args.fastmode: + with open(acc_log, "r") as acc_json: + acc_data = json.load(acc_json) + + with open(perf_log, "r") as perf_json: + perf_data = json.load(perf_json) + + # read accuracy log json and create a dictionary of qsl_idx/data pairs + results_dict = {} + num_acc_log_duplicate_keys = 0 + num_acc_log_data_mismatch = 0 + num_perf_log_qsl_idx_match = 0 + num_perf_log_data_mismatch = 0 + num_missing_qsl_idxs = 0 + + print("Reading accuracy mode results...") + for sample in acc_data: + #print sample["qsl_idx"] + qsl_idx = sample["qsl_idx"] + data = sample["data"] + if data == '': + data = "" + if qsl_idx in results_dict.keys(): + num_acc_log_duplicate_keys += 1 + if results_dict[qsl_idx] != data: + num_acc_log_data_mismatch += 1 + else: + results_dict[qsl_idx] = data + + print("Reading performance mode results...") + for sample in perf_data: + qsl_idx = sample["qsl_idx"] + data = np.frombuffer(bytes.fromhex(sample['data']), dtype_map[args.dtype]) if py33 == True \ + else np.frombuffer(bytearray.fromhex(sample['data']), dtype_map[args.dtype]) + + if qsl_idx in results_dict.keys(): + num_perf_log_qsl_idx_match += 1 + data_perf = np.frombuffer(bytes.fromhex(results_dict[qsl_idx]), dtype_map[args.dtype]) \ + if py33 == True else np.frombuffer(bytearray.fromhex(results_dict[qsl_idx]), dtype_map[args.dtype]) + if data_perf.size == 0 or data.size == 0: + if data_perf.size != data.size: + num_perf_log_data_mismatch += 1 + elif data[0] != data_perf[0]: + num_perf_log_data_mismatch += 1 + else: + num_missing_qsl_idxs += 1 + + results_dict[sample["qsl_idx"]] = sample["data"] + + + print("num_acc_log_entries = {:}".format(len(acc_data))) + print("num_acc_log_duplicate_keys = {:}".format(num_acc_log_duplicate_keys)) + print("num_acc_log_data_mismatch = {:}".format(num_acc_log_data_mismatch)) + print("num_perf_log_entries = {:}".format(len(perf_data))) + print("num_perf_log_qsl_idx_match = {:}".format(num_perf_log_qsl_idx_match)) + print("num_perf_log_data_mismatch = {:}".format(num_perf_log_data_mismatch)) + print("num_missing_qsl_idxs = {:}".format(num_missing_qsl_idxs)) + if num_perf_log_data_mismatch > 0 : + print("TEST FAIL\n"); + else : + print("TEST PASS\n"); + exit() + + py33 = sys.version_info >= (3,3) + + if not py33: + print("Error: This script requires Python v3.3 or later") + exit() + + + get_perf_lines_cmd = "wc -l " + perf_log + "| awk '{print $1}'" + num_perf_lines = int(subprocess.check_output(get_perf_lines_cmd, shell=True).decode("utf-8")) + + get_acc_lines_cmd = "wc -l " + acc_log + "| awk '{print $1}'" + num_acc_lines = int(subprocess.check_output(get_acc_lines_cmd, shell=True).decode("utf-8")) + + num_acc_log_entries = num_acc_lines - 2 + num_perf_log_entries = num_perf_lines - 2 + #print(perf_qsl_idx) + #print(get_perf_lines_cmd) + #print(num_perf_lines) + + num_perf_log_data_mismatch = 0 + for perf_line in range(0, num_perf_lines): + if perf_line % int(num_perf_lines/100) == 0: + print(".", end = "", flush=True) + # first and last line are brackets + if perf_line == 0 or perf_line == int(num_perf_lines)-1: + continue + + # calculate md5sum of line in perf mode accuracy_log + perf_md5sum_cmd = "head -n " + str(perf_line + 1) + " " + perf_log + "| tail -n 1| sed -r 's/,//g' | sed -r 's/\"seq_id\" : \S+//g' | md5sum" + #print(perf_md5sum_cmd) + perf_md5sum = subprocess.check_output(perf_md5sum_cmd, shell=True).decode("utf-8") + + # get qsl idx + get_qsl_idx_cmd = "head -n " + 
str(perf_line + 1) + " " + perf_log + "| tail -n 1| awk -F\": |,\" '{print $4}'" + qsl_idx = subprocess.check_output(get_qsl_idx_cmd, shell=True).decode("utf-8").rstrip() + + # calculate md5sum of line in acc mode accuracy_log + acc_md5sum_cmd = "grep \"qsl_idx\\\" : " + qsl_idx + ",\" " + acc_log + "| sed -r 's/,//g' | sed -r 's/\"seq_id\" : \S+//g' | md5sum" + acc_md5sum = subprocess.check_output(acc_md5sum_cmd, shell=True).decode("utf-8") + + if perf_md5sum != acc_md5sum: + num_perf_log_data_mismatch += 1 + + print("") + print("num_acc_log_entries = {:}".format(num_acc_log_entries)) + print("num_perf_log_data_mismatch = {:}".format(num_perf_log_data_mismatch)) + print("num_perf_log_entries = {:}".format(num_perf_log_entries)) + if num_perf_log_data_mismatch > 0 : + print("TEST FAIL\n"); + else : + print("TEST PASS\n"); + +if __name__ == '__main__': + main() diff --git a/v0.7/compliance/nvidia/TEST01/verify_performance.py b/v0.7/compliance/nvidia/TEST01/verify_performance.py new file mode 100644 index 000000000..000141f31 --- /dev/null +++ b/v0.7/compliance/nvidia/TEST01/verify_performance.py @@ -0,0 +1,140 @@ +#! /usr/bin/env python3 +# Copyright 2018 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +import os +import sys +import re +sys.path.append(os.getcwd()) + +import argparse +import json + +def main(): + # Parse arguments to identify the path to the accuracy logs from + # the accuracy and performance runs + parser = argparse.ArgumentParser() + parser.add_argument( + "--reference_summary", "-r", + help="Specifies the path to the summary log for TEST00.", + default="" + ) + parser.add_argument( + "--test_summary", "-t", + help="Specifies the path to the summary log for this test.", + default="" + ) + args = parser.parse_args() + + print("Verifying performance.") + ref_file = open(args.reference_summary, "r") + test_file = open(args.test_summary, "r") + ref_score = 0 + test_score = 0 + ref_mode = '' + test_mode = '' + + for line in ref_file: + if re.match("Scenario", line): + ref_mode = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Single Stream": + if re.match("90th percentile latency", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Multi Stream": + if re.match("Samples per query", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Server": + if re.match("Scheduled samples per second", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Offline": + if re.match("Samples per second", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if re.match("Result is", line): + valid = line.split(": ",1)[1].strip() + if valid == 'INVALID': + sys.exit("TEST FAIL: Reference results are invalid") + + if re.match("\d+ ERROR", line): + error = line.split(" ",1)[0].strip() + print("WARNING: " + error + " ERROR reported in reference results") + + + for line in 
test_file: + if re.match("Scenario", line): + test_mode = line.split(": ",1)[1].strip() + continue + + if test_mode == "Single Stream": + if re.match("90th percentile latency", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Multi Stream": + if re.match("Samples per query", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Server": + if re.match("Scheduled samples per second", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Offline": + if re.match("Samples per second", line): + test_score = line.split(": ",1)[1].strip() + continue + + if re.match("Result is", line): + valid = line.split(": ",1)[1].strip() + if valid == 'INVALID': + sys.exit("TEST FAIL: Test results are invalid") + + if re.match("\d+ ERROR", line): + error = line.split(" ",1)[0].strip() + print("WARNING: " + error + " ERROR reported in test results") + + if test_mode != ref_mode: + sys.exit("Test and reference scenarios do not match!") + + print("reference score = {}".format(ref_score)) + print("test score = {}".format(test_score)) + + + threshold = 0.10 + + # In single stream mode, latencies can be very short for high performance systems + # and run-to-run variation due to external disturbances (OS) can be significant. + # In this case we relax pass threshold to 20% + + if ref_mode == "Single Stream" and float(ref_score) <= 200000: + threshold = 0.20 + + if float(test_score) < float(ref_score) * (1 + threshold) and float(test_score) > float(ref_score) * (1 - threshold): + print("TEST PASS") + else: + print("TEST FAIL: Test score invalid") + +if __name__ == '__main__': + main() + diff --git a/v0.7/compliance/nvidia/TEST05/README.md b/v0.7/compliance/nvidia/TEST05/README.md new file mode 100755 index 000000000..63c30fb54 --- /dev/null +++ b/v0.7/compliance/nvidia/TEST05/README.md @@ -0,0 +1,37 @@ + +# Test 05 - Vary RNG seeds +## Introduction +The purpose of this test is to ensure that the SUT does not favor a particular set of Loadgen RNG seed values. The pass condition is that performance with non-default RNG seed values should be similar to the submitted performance. + +The seeds that are changed are listed below: + - qsl_rng_seed - determines order of samples in QSL + - sample_index_rng_seed - determines subset of samples in each loadable set + - schedule_rng_seed - determines scheduling of samples in server mode + +## Prerequisites +This script works best with Python 3.3 or later. + +## Pass Criteria +Performance must be within 5% of the submission performance. In single stream mode, latencies can be very short for high performance systems and run-to-run variation due to external disturbances (OS) can be significant. In such cases and when submission latencies are less or equal to 0.2ms, the pass threshold is relaxed to 20%. + +## Instructions + +### Part I +Run the benchmark with the provided audit.config in the corresponding benchmark subdirectory. Note that audit.config must be copied to the directory where the benchmark is being run from. Verification that audit.config was properly read can be done by checking that loadgen has found audit.config in mlperf_log_detail.txt + +### Part II +Run the verification script: + `python3 run_verification.py -r RESULTS_DIR -c COMPLIANCE_DIR -o OUTPUT_DIR [--dtype {byte,float32,int32,int64}]` + +RESULTS_DIR: Specifies the path to the corresponding results directory that contains the accuracy and performance subdirectories containing the submission logs, i.e. 
`inference_results_v0.7/closed/NVIDIA/results/GPU/resnet/Offline` +COMPLIANCE_DIR: Specifies the path to the directory containing the logs from the compliance test run. +OUTPUT_DIR: Specifies the path to the output directory where compliance logs will be uploaded from, i.e. `inference_results_v0.7/closed/NVIDIA/compliance/GPU/resnet/Offline` + +Expected outcome: + + Performance check pass: True + TEST05 verification complete + + + + diff --git a/v0.7/compliance/nvidia/TEST05/audit.config b/v0.7/compliance/nvidia/TEST05/audit.config new file mode 100644 index 000000000..6cc924912 --- /dev/null +++ b/v0.7/compliance/nvidia/TEST05/audit.config @@ -0,0 +1,10 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# mode dictionary (0 = submission, 1 = accuracy, 2 = performance, 3 = find peak perf) +*.*.mode = 2 +*.*.qsl_rng_seed = 313588358309856706 +*.*.sample_index_rng_seed = 471397156132239067 +*.*.schedule_rng_seed = 413914573387865862 diff --git a/v0.7/compliance/nvidia/TEST05/run_verification.py b/v0.7/compliance/nvidia/TEST05/run_verification.py new file mode 100644 index 000000000..3b2c76d46 --- /dev/null +++ b/v0.7/compliance/nvidia/TEST05/run_verification.py @@ -0,0 +1,104 @@ +#! /usr/bin/env python3 +# Copyright 2018 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +import os +import sys +import shutil +import subprocess +import argparse +import json + +import numpy as np + +sys.path.append(os.getcwd()) + +dtype_map = { + "byte": np.byte, + "float32": np.float32, + "int32": np.int32, + "int64": np.int64 +} + +def main(): + + + py3 = sys.version_info >= (3,0) + # Parse arguments to identify the path to the logs from the performance runs + parser = argparse.ArgumentParser() + parser.add_argument( + "--results_dir", "-r", + help="Specifies the path to the corresponding results directory that contains the performance subdirectories containing the submission logs, i.e. inference_results_v0.7/closed/NVIDIA/results/T4x8/resnet/Offline.", + default="" + ) + parser.add_argument( + "--compliance_dir", "-c", + help="Specifies the path to the directory containing the logs from the compliance test run.", + default="" + ) + parser.add_argument( + "--output_dir", "-o", + help="Specifies the path to the output directory where compliance logs will be uploaded from, i.e. 
inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.", + default="" + ) + parser.add_argument( + "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"], help="data type of the label (only needed in fastmode") + + args = parser.parse_args() + + print("Parsing arguments.") + results_dir = args.results_dir + compliance_dir = args.compliance_dir + output_dir = os.path.join(args.output_dir, "TEST05") + + dtype = args.dtype + + # run verify performance + verify_performance_command = "python3 verify_performance.py -r " + results_dir + "/performance/run_1/mlperf_log_summary.txt" + " -t " + compliance_dir + "/mlperf_log_summary.txt | tee verify_performance.txt" + try: + os.system(verify_performance_command) + except: + print("Exception occurred trying to execute:\n " + verify_performance_command) + + # check if verify performance script passes + performance_pass_command = "grep PASS verify_performance.txt" + performance_pass = "TEST PASS" in subprocess.check_output(performance_pass_command, shell=True).decode("utf-8") + + # setup output compliance directory structure + output_performance_dir = os.path.join(output_dir, "performance", "run_1") + try: + if not os.path.isdir(output_performance_dir): + os.makedirs(output_performance_dir) + except: + print("Exception occurred trying to create " + output_performance_dir) + + # copy compliance logs to output compliance directory + shutil.copy2("verify_performance.txt",output_dir) + summary_file = os.path.join(compliance_dir,"mlperf_log_summary.txt") + detail_file = os.path.join(compliance_dir,"mlperf_log_detail.txt") + + try: + shutil.copy2(summary_file,output_performance_dir) + except: + print("Exception occured trying to copy " + summary_file + " to " + output_performance_dir) + try: + shutil.copy2(detail_file,output_performance_dir) + except: + print("Exception occured trying to copy " + detail_file + " to " + output_performance_dir) + + print("Performance check pass: {:}".format(performance_pass)) + print("TEST05 verification complete") + +if __name__ == '__main__': + main() diff --git a/v0.7/compliance/nvidia/TEST05/verify_performance.py b/v0.7/compliance/nvidia/TEST05/verify_performance.py new file mode 100644 index 000000000..4c44f7dfd --- /dev/null +++ b/v0.7/compliance/nvidia/TEST05/verify_performance.py @@ -0,0 +1,126 @@ +#! 
/usr/bin/env python3 +import os +import sys +import re +sys.path.append(os.getcwd()) + +import argparse +import json + +def main(): + # Parse arguments to identify the path to the accuracy logs from + # the accuracy and performance runs + parser = argparse.ArgumentParser() + parser.add_argument( + "--reference_summary", "-r", + help="Specifies the path to the summary log for TEST00.", + default="" + ) + parser.add_argument( + "--test_summary", "-t", + help="Specifies the path to the summary log for this test.", + default="" + ) + args = parser.parse_args() + + print("Verifying performance.") + ref_file = open(args.reference_summary, "r") + test_file = open(args.test_summary, "r") + ref_score = 0 + test_score = 0 + ref_mode = '' + test_mode = '' + + for line in ref_file: + if re.match("Scenario", line): + ref_mode = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Single Stream": + if re.match("90th percentile latency", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Multi Stream": + if re.match("Samples per query", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Server": + if re.match("Scheduled samples per second", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if ref_mode == "Offline": + if re.match("Samples per second", line): + ref_score = line.split(": ",1)[1].strip() + continue + + if re.match("Result is", line): + valid = line.split(": ",1)[1].strip() + if valid == 'INVALID': + sys.exit("TEST FAIL: Reference results are invalid") + + if re.match("\d+ ERROR", line): + error = line.split(" ",1)[0].strip() + print("WARNING: " + error + " ERROR reported in reference results") + + + for line in test_file: + if re.match("Scenario", line): + test_mode = line.split(": ",1)[1].strip() + continue + + if test_mode == "Single Stream": + if re.match("90th percentile latency", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Multi Stream": + if re.match("Samples per query", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Server": + if re.match("Scheduled samples per second", line): + test_score = line.split(": ",1)[1].strip() + continue + + if test_mode == "Offline": + if re.match("Samples per second", line): + test_score = line.split(": ",1)[1].strip() + continue + + if re.match("Result is", line): + valid = line.split(": ",1)[1].strip() + if valid == 'INVALID': + sys.exit("TEST FAIL: Test results are invalid") + + if re.match("\d+ ERROR", line): + error = line.split(" ",1)[0].strip() + print("WARNING: " + error + " ERROR reported in test results") + + if test_mode != ref_mode: + sys.exit("Test and reference scenarios do not match!") + + print("reference score = {}".format(ref_score)) + print("test score = {}".format(test_score)) + + + threshold = 0.05 + + # In single stream mode, latencies can be very short for high performance systems + # and run-to-run variation due to external disturbances (OS) can be significant. 
+ # In this case we relax pass threshold to 20% + + if ref_mode == "Single Stream" and float(ref_score) <= 200000: + threshold = 0.20 + + if float(test_score) < float(ref_score) * (1 + threshold) and float(test_score) > float(ref_score) * (1 - threshold): + print("TEST PASS") + else: + print("TEST FAIL: Test score invalid") + +if __name__ == '__main__': + main() + diff --git a/v0.7/language/bert/evaluate-v1.1.py b/v0.7/language/bert/evaluate-v1.1.py new file mode 100644 index 000000000..c582e6877 --- /dev/null +++ b/v0.7/language/bert/evaluate-v1.1.py @@ -0,0 +1,108 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Source: https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py + +""" Official evaluation script for v1.1 of the SQuAD dataset. """ +from __future__ import print_function +from collections import Counter +import string +import re +import argparse +import json +import sys + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return (normalize_answer(prediction) == normalize_answer(ground_truth)) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article['paragraphs']: + for qa in paragraph['qas']: + total += 1 + if qa['id'] not in predictions: + message = 'Unanswered question ' + qa['id'] + \ + ' will receive score 0.' 
+ print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x['text'], qa['answers'])) + prediction = predictions[qa['id']] + exact_match += metric_max_over_ground_truths( + exact_match_score, prediction, ground_truths) + f1 += metric_max_over_ground_truths( + f1_score, prediction, ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + + return {'exact_match': exact_match, 'f1': f1} + + +if __name__ == '__main__': + expected_version = '1.1' + parser = argparse.ArgumentParser( + description='Evaluation for SQuAD ' + expected_version) + parser.add_argument('dataset_file', help='Dataset file') + parser.add_argument('prediction_file', help='Prediction File') + args = parser.parse_args() + with open(args.dataset_file) as dataset_file: + dataset_json = json.load(dataset_file) + if (dataset_json['version'] != expected_version): + print('Evaluation expects v-' + expected_version + + ', but got dataset with v-' + dataset_json['version'], + file=sys.stderr) + dataset = dataset_json['data'] + with open(args.prediction_file) as prediction_file: + predictions = json.load(prediction_file) + print(json.dumps(evaluate(dataset, predictions))) diff --git a/vision/medical_imaging/3d-unet/accuracy-brats.py b/vision/medical_imaging/3d-unet/accuracy-brats.py index 6fece895c..82c9ce6c7 100644 --- a/vision/medical_imaging/3d-unet/accuracy-brats.py +++ b/vision/medical_imaging/3d-unet/accuracy-brats.py @@ -21,7 +21,7 @@ import pickle import sys -sys.path.insert(0, os.path.join(os.getcwd(), "nnUnet")) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "nnUnet")) from multiprocessing import Pool from nnunet.evaluation.region_based_evaluation import evaluate_regions, get_brats_regions
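The final hunk above switches the nnUnet import path in accuracy-brats.py from the current working directory to the script's own directory. A minimal sketch of the difference, assuming accuracy-brats.py lives in vision/medical_imaging/3d-unet:

    import os

    # Old behaviour: resolves relative to wherever the user launches the script,
    # so the import breaks unless invoked from vision/medical_imaging/3d-unet.
    nnunet_path_old = os.path.join(os.getcwd(), "nnUnet")

    # New behaviour: resolves relative to accuracy-brats.py itself, so the
    # nnUnet checkout is found regardless of the working directory.
    nnunet_path_new = os.path.join(os.path.dirname(__file__), "nnUnet")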