From 72bbf5edae733f669db892ba47ac3941f6a3e1ac Mon Sep 17 00:00:00 2001 From: mnaumovfb Date: Sat, 1 Aug 2020 13:05:03 -0700 Subject: [PATCH] Adding flexible way of sizing queries in offline mode. Also, adding instructions to the readme on (i) how to run in offline mode, with accuracy (ii) server mode with custom distribution, and (iii) md5 hashes --- v0.5/recommendation/README.md | 47 ++++++++++++++++++++---------- v0.5/recommendation/python/main.py | 17 +++++------ 2 files changed, 40 insertions(+), 24 deletions(-) diff --git a/v0.5/recommendation/README.md b/v0.5/recommendation/README.md index b50c85f80..d45796050 100755 --- a/v0.5/recommendation/README.md +++ b/v0.5/recommendation/README.md @@ -68,6 +68,13 @@ cd $HOME/mlperf/inference/loadgen CFLAGS="-std=c++14" python setup.py develop --user ``` +### More information about the model weights + +File name | framework | Size in bytes (`du *`) | MD5 hash (`md5sum *`) +-|-|-|- +[tb0875_10M.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt) | pytorch | 12GB | b7cacffcf75f767faa9cb2af397723aa +[tb00_40M.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt) | pytorch | 90GB | 2d49a5288cddb37c3c64860a06d79bb9 + ### More information about the datasets | dataset | download link | | ---- | ---- | @@ -147,7 +154,6 @@ File name | Size in bytes (`du *`) | MD5 hash (`md5sum *`) mv ./fake_criteo .. && cd .. export DATA_DIR=./fake_criteo ``` - ### Calibration set For MLPerf Inference, we use the first 128000 rows (user-item pairs) of the second half of `day_23` as the calibration set. Specifically, `day_23` contains 178274637 rows in total, so we use the rows **from the 89137319-th row to the 89265318-th row (both inclusive) in `day_23`** as the calibration set (assuming 0-based indexing). @@ -176,16 +182,19 @@ For example, to run on CPU you may choose to use: 1. 
Criteo Kaggle DAC ``` -./run_local.sh pytorch dlrm kaggle cpu --accuracy --scenario Offline +./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy +./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 ``` 2. Criteo Terabyte (0.875) ``` -./run_local.sh pytorch dlrm terabyte cpu --accuracy --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 ``` 3. Criteo Terabyte ``` -./run_local.sh pytorch dlrm terabyte cpu --accuracy --scenario Offline --max-ind-range=40000000 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 ``` Note that the code support (i) original and (ii) mlperf binary loader, that have slightly different performance characteristics. The latter loader can be enabled by adding `--mlperf-bin-loader` to the command line. @@ -254,19 +263,31 @@ During development running the full benchmark is unpractical. 
Here are some opti So if you want to tune for example Server scenario, try: ``` -./run_local.sh pytorch dlrm terabyte cpu --count-samples 100 --duration 60000 --scenario Server --target-qps 100 --max-latency 0.1 +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --count-samples 1024 --max-ind-range=10000000 --data-sub-sample-rate=0.875 --duration 60000 --target-qps 100 --max-latency 0.1 ``` If you want run with accuracy pass, try: ``` -./run_local.sh pytorch dlrm terabyte cpu --accuracy --duration 60000 --scenario Server --target-qps 100 --max-latency 0.2 +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --count-samples 1024 --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-per-query-offline=1 --samples-to-aggregate-fix=128 --accuracy [--mlperf-bin-loader] ``` ### Verifying aggregation trace In the reference implementation, each sample is mapped to 100-700 user-item pairs following the distribution specified by [tools/dist_quantile.txt](tools/dist_quantile.txt). To verify that your sample aggregation trace matches the reference, please follow the steps in [tools/dist_trace_verification.txt](tools/dist_trace_verification.txt). Or simply download the reference [dlrm_trace_of_aggregated_samples.txt from Zenodo](https://zenodo.org/record/3941795/files/dlrm_trace_of_aggregated_samples.txt?download=1) (MD5:3db90209564316f2506c99cc994ad0b2). +### Run accuracy script + +To get the accuracy from a LoadGen accuracy json log file, run the following commands: + +`python tools/accuracy-dlrm.py --mlperf-accuracy-file LOADGEN_ACCURACY_JSON`: if your SUT outputs the predictions and the ground truth labels in a packed format like the reference implementation. +`python tools/accuracy-dlrm.py --mlperf-accuracy-file LOADGEN_ACCURACY_JSON --day-23-file PATH_TO_DAY_23 --aggregation-trace-file PATH_TO_AGGREGATION_TRACE`: if your SUT outputs only the predictions. In this case, you need to make sure that the data in day_23 are not shuffled. 
+ +For instance, you can run the following command +``` +python ./tools/accuracy-dlrm.py --mlperf-accuracy-file=./output/pytorch-cpu/dlrm/mlperf_log_accuracy.json +``` + ### Usage ``` usage: main.py [-h] @@ -281,7 +302,8 @@ usage: main.py [-h] [--backend BACKEND] [--use-gpu] [--threads THREADS] [--duration TIME_IN_MS] [--count-samples COUNT] [--count-queries COUNT] [--target-qps QPS] [--max-latency MAX_LATENCY] [--cache CACHE] - [--samples-per-query NUM_SAMPLES] + [--samples-per-query-multistream NUM_SAMPLES] + [--samples-per-query-offline NUM_SAMPLES] [--samples-to-aggregate-fix NUM_FIXED_SAMPLES] [--samples-to-aggregate-min MIN_NUM_VARIABLE_SAMPLES] [--samples-to-aggregate-max MAX_NUM_VARIABLE_SAMPLES] @@ -333,7 +355,9 @@ usage: main.py [-h] `--max-latency MAX_LATENCY` comma separated list of which latencies (in seconds) we try to reach in the 99 percentile (default: 0.01,0.05,0.100). -`--samples-per-query` number of samples per query in MultiStream scenario. +`--samples-per-query-multistream` maximum number of (aggregated) samples per query in MultiStream scenario. + +`--samples-per-query-offline` maximum number of (aggregated) samples per query in Offline scenario. `--samples-to-aggregate-fix` number of samples to aggregate and treat as a single sample. This number will stay fixed during runs. @@ -349,13 +373,6 @@ usage: main.py [-h] `--find-peak-performance` determine the maximum QPS for the Server and samples per query for the MultiStream, while not applicable to other scenarios. -### Run accuracy script - -To get the accuracy from a LoadGen accuracy json log file, run the following commands: - -- `python3 tools/accuracy-dlrm.py --mlperf-accuracy-file `: if your SUT outputs the predictions and the ground truth labels in a packed format like the reference implementation. -- `python3 tools/accuracy-dlrm.py --mlperf-accuracy-file --day-23-file --aggregation-trace-file `: if your SUT outputs only the predictions. 
In this case, you need to make sure that the data in day_23 are not shuffled. - ## License [Apache License 2.0](LICENSE) diff --git a/v0.5/recommendation/python/main.py b/v0.5/recommendation/python/main.py index e43b9d6de..d66d4f957 100755 --- a/v0.5/recommendation/python/main.py +++ b/v0.5/recommendation/python/main.py @@ -36,7 +36,6 @@ NANO_SEC = 1e9 MILLI_SEC = 1000 -QUERY_LEN_CAP = 2048 # pylint: disable=missing-docstring @@ -121,10 +120,11 @@ def get_args(): # below will override mlperf rules compliant settings - don't use for official submission parser.add_argument("--duration", type=int, help="duration in milliseconds (ms)") parser.add_argument("--target-qps", type=int, help="target/expected qps") + parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") parser.add_argument("--count-samples", type=int, help="dataset items to use") parser.add_argument("--count-queries", type=int, help="number of queries to use") - parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") - parser.add_argument("--samples-per-query", type=int, help="mlperf multi-stream sample per query") + parser.add_argument("--samples-per-query-multistream", type=int, help="query length for multi-stream scenario (in terms of aggregated samples)") + parser.add_argument("--samples-per-query-offline", type=int, default=2048, help="query length for offline scenario (in terms of aggregated samples)") parser.add_argument("--samples-to-aggregate-fix", type=int, help="number of samples to be treated as one") parser.add_argument("--samples-to-aggregate-min", type=int, help="min number of samples to be treated as one in random query size") parser.add_argument("--samples-to-aggregate-max", type=int, help="max number of samples to be treated as one in random query size") @@ -280,7 +280,7 @@ def finish(self): class QueueRunner(RunnerBase): def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): super().__init__(model, ds, 
threads, post_proc, max_batchsize) - queue_size_multiplier = 4 #(QUERY_LEN_CAP + max_batchsize - 1) // max_batchsize) + queue_size_multiplier = 4 #(args.samples_per_query_offline + max_batchsize - 1) // max_batchsize) self.tasks = Queue(maxsize=threads * queue_size_multiplier) self.workers = [] self.result_dict = {} @@ -373,8 +373,7 @@ def main(): # dataset to use wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] - # --count-samples applies to accuracy mode only and can be used to limit the number - # of samples used for testing. For perf model we always cap count to QUERY_LEN_CAP. + # --count-samples can be used to limit the number of samples used for testing ds = wanted_dataset(data_path=args.dataset_path, name=args.dataset, pre_process=pre_proc, # currently an identity function @@ -466,15 +465,15 @@ def process_latencies(latencies_ns): settings.min_query_count = args.count_queries settings.max_query_count = args.count_queries - if args.samples_per_query: - settings.multi_stream_samples_per_query = args.samples_per_query + if args.samples_per_query_multistream: + settings.multi_stream_samples_per_query = args.samples_per_query_multistream if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC) sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies) - qsl = lg.ConstructQSL(count, min(count, QUERY_LEN_CAP), ds.load_query_samples, ds.unload_query_samples) + qsl = lg.ConstructQSL(count, min(count, args.samples_per_query_offline), ds.load_query_samples, ds.unload_query_samples) log.info("starting {}".format(scenario)) result_dict = {"good": 0, "total": 0, "roc_auc": 0, "scenario": str(scenario)}