diff --git a/v0.5/recommendation/README.md b/v0.5/recommendation/README.md index b50c85f80..b16d2f991 100755 --- a/v0.5/recommendation/README.md +++ b/v0.5/recommendation/README.md @@ -68,6 +68,13 @@ cd $HOME/mlperf/inference/loadgen CFLAGS="-std=c++14" python setup.py develop --user ``` +### More information about the model weights + +File name | framework | Size in bytes (`du *`) | MD5 hash (`md5sum *`) +-|-|-|- +[tb0875_10M.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt) | pytorch | 12GB | b7cacffcf75f767faa9cb2af397723aa +[tb00_40M.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt) | pytorch | 90GB | 2d49a5288cddb37c3c64860a06d79bb9 + ### More information about the datasets | dataset | download link | | ---- | ---- | @@ -147,7 +154,6 @@ File name | Size in bytes (`du *`) | MD5 hash (`md5sum *`) mv ./fake_criteo .. && cd .. export DATA_DIR=./fake_criteo ``` - ### Calibration set For MLPerf Inference, we use the first 128000 rows (user-item pairs) of the second half of `day_23` as the calibration set. Specifically, `day_23` contains 178274637 rows in total, so we use the rows **from the 89137319-th row to the 89265318-th row (both inclusive) in `day_23`** as the calibration set (assuming 0-based indexing). @@ -176,16 +182,19 @@ For example, to run on CPU you may choose to use: 1. Criteo Kaggle DAC ``` -./run_local.sh pytorch dlrm kaggle cpu --accuracy --scenario Offline +./run_local.sh pytorch dlrm kaggle cpu --scenario Offline --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy +./run_local.sh pytorch dlrm kaggle cpu --scenario Server --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 ``` 2. 
Criteo Terabyte (0.875) ``` -./run_local.sh pytorch dlrm terabyte cpu --accuracy --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 ``` 3. Criteo Terabyte ``` -./run_local.sh pytorch dlrm terabyte cpu --accuracy --scenario Offline --max-ind-range=40000000 [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --max-ind-range=40000000 --samples-per-query-offline=1 --samples-to-aggregate-fix=2048 --max-batchsize=2048 --accuracy [--mlperf-bin-loader] +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --max-ind-range=40000000 --samples-to-aggregate-quantile-file=./tools/dist_quantile.txt --max-batchsize=2048 ``` Note that the code supports (i) the original and (ii) the mlperf binary loader, which have slightly different performance characteristics. The latter loader can be enabled by adding `--mlperf-bin-loader` to the command line. @@ -254,19 +263,37 @@ During development running the full benchmark is unpractical. 
Here are some opti So if you want to tune, for example, the Server scenario, try: ``` -./run_local.sh pytorch dlrm terabyte cpu --count-samples 100 --duration 60000 --scenario Server --target-qps 100 --max-latency 0.1 +./run_local.sh pytorch dlrm terabyte cpu --scenario Server --count-samples 1024 --max-ind-range=10000000 --data-sub-sample-rate=0.875 --duration 60000 --target-qps 100 --max-latency 0.1 ``` If you want to run with an accuracy pass, try: ``` -./run_local.sh pytorch dlrm terabyte cpu --accuracy --duration 60000 --scenario Server --target-qps 100 --max-latency 0.2 +./run_local.sh pytorch dlrm terabyte cpu --scenario Offline --count-samples 1024 --max-ind-range=10000000 --data-sub-sample-rate=0.875 --samples-per-query-offline=1 --samples-to-aggregate-fix=128 --accuracy [--mlperf-bin-loader] ``` ### Verifying aggregation trace In the reference implementation, each sample is mapped to 100-700 user-item pairs following the distribution specified by [tools/dist_quantile.txt](tools/dist_quantile.txt). To verify that your sample aggregation trace matches the reference, please follow the steps in [tools/dist_trace_verification.txt](tools/dist_trace_verification.txt). Or simply download the reference [dlrm_trace_of_aggregated_samples.txt from Zenodo](https://zenodo.org/record/3941795/files/dlrm_trace_of_aggregated_samples.txt?download=1) (MD5:3db90209564316f2506c99cc994ad0b2). +### Running accuracy script + +To get the accuracy from a LoadGen accuracy json log file, + +1. If your SUT outputs the predictions and the ground truth labels in a packed format like the reference implementation then run +``` +python tools/accuracy-dlrm.py --mlperf-accuracy-file <LOADGEN_ACCURACY_JSON> +``` +For instance, if the output is given in a standard directory then you can run +``` +python ./tools/accuracy-dlrm.py --mlperf-accuracy-file=./output/pytorch-cpu/dlrm/mlperf_log_accuracy.json +``` + +2. 
If your SUT outputs only the predictions, then you need to make sure that the data in day_23 are not shuffled, and run +``` +python tools/accuracy-dlrm.py --mlperf-accuracy-file <LOADGEN_ACCURACY_JSON> --day-23-file <path/to/day_23> --aggregation-trace-file <path/to/dlrm_trace_of_aggregated_samples.txt> +``` + ### Usage ``` usage: main.py [-h] @@ -281,7 +308,8 @@ usage: main.py [-h] [--backend BACKEND] [--use-gpu] [--threads THREADS] [--duration TIME_IN_MS] [--count-samples COUNT] [--count-queries COUNT] [--target-qps QPS] [--max-latency MAX_LATENCY] [--cache CACHE] - [--samples-per-query NUM_SAMPLES] + [--samples-per-query-multistream NUM_SAMPLES] + [--samples-per-query-offline NUM_SAMPLES] [--samples-to-aggregate-fix NUM_FIXED_SAMPLES] [--samples-to-aggregate-min MIN_NUM_VARIABLE_SAMPLES] [--samples-to-aggregate-max MAX_NUM_VARIABLE_SAMPLES] @@ -333,7 +361,9 @@ usage: main.py [-h] `--max-latency MAX_LATENCY` comma separated list of which latencies (in seconds) we try to reach in the 99 percentile (default: 0.01,0.05,0.100). -`--samples-per-query` number of samples per query in MultiStream scenario. +`--samples-per-query-multistream` maximum number of (aggregated) samples per query in MultiStream scenario. + +`--samples-per-query-offline` maximum number of (aggregated) samples per query in Offline scenario. `--samples-to-aggregate-fix` number of samples to aggregate and treat as a single sample. This number will stay fixed during runs. @@ -349,13 +379,6 @@ usage: main.py [-h] `--find-peak-performance` determine the maximum QPS for the Server and samples per query for the MultiStream, while not applicable to other scenarios. -### Run accuracy script - -To get the accuracy from a LoadGen accuracy json log file, run the following commands: - -- `python3 tools/accuracy-dlrm.py --mlperf-accuracy-file `: if your SUT outputs the predictions and the ground truth labels in a packed format like the reference implementation. 
-- `python3 tools/accuracy-dlrm.py --mlperf-accuracy-file --day-23-file --aggregation-trace-file `: if your SUT outputs only the predictions. In this case, you need to make sure that the data in day_23 are not shuffled. - ## License [Apache License 2.0](LICENSE) diff --git a/v0.5/recommendation/python/main.py b/v0.5/recommendation/python/main.py index e43b9d6de..d66d4f957 100755 --- a/v0.5/recommendation/python/main.py +++ b/v0.5/recommendation/python/main.py @@ -36,7 +36,6 @@ NANO_SEC = 1e9 MILLI_SEC = 1000 -QUERY_LEN_CAP = 2048 # pylint: disable=missing-docstring @@ -121,10 +120,11 @@ def get_args(): # below will override mlperf rules compliant settings - don't use for official submission parser.add_argument("--duration", type=int, help="duration in milliseconds (ms)") parser.add_argument("--target-qps", type=int, help="target/expected qps") + parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") parser.add_argument("--count-samples", type=int, help="dataset items to use") parser.add_argument("--count-queries", type=int, help="number of queries to use") - parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") - parser.add_argument("--samples-per-query", type=int, help="mlperf multi-stream sample per query") + parser.add_argument("--samples-per-query-multistream", type=int, help="query length for multi-stream scenario (in terms of aggregated samples)") + parser.add_argument("--samples-per-query-offline", type=int, default=2048, help="query length for offline scenario (in terms of aggregated samples)") parser.add_argument("--samples-to-aggregate-fix", type=int, help="number of samples to be treated as one") parser.add_argument("--samples-to-aggregate-min", type=int, help="min number of samples to be treated as one in random query size") parser.add_argument("--samples-to-aggregate-max", type=int, help="max number of samples to be treated as one in random query size") @@ -280,7 +280,7 @@ def 
finish(self): class QueueRunner(RunnerBase): def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): super().__init__(model, ds, threads, post_proc, max_batchsize) - queue_size_multiplier = 4 #(QUERY_LEN_CAP + max_batchsize - 1) // max_batchsize) + queue_size_multiplier = 4 #(args.samples_per_query_offline + max_batchsize - 1) // max_batchsize) self.tasks = Queue(maxsize=threads * queue_size_multiplier) self.workers = [] self.result_dict = {} @@ -373,8 +373,7 @@ def main(): # dataset to use wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] - # --count-samples applies to accuracy mode only and can be used to limit the number - # of samples used for testing. For perf model we always cap count to QUERY_LEN_CAP. + # --count-samples can be used to limit the number of samples used for testing ds = wanted_dataset(data_path=args.dataset_path, name=args.dataset, pre_process=pre_proc, # currently an identity function @@ -466,15 +465,15 @@ def process_latencies(latencies_ns): settings.min_query_count = args.count_queries settings.max_query_count = args.count_queries - if args.samples_per_query: - settings.multi_stream_samples_per_query = args.samples_per_query + if args.samples_per_query_multistream: + settings.multi_stream_samples_per_query = args.samples_per_query_multistream if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC) sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies) - qsl = lg.ConstructQSL(count, min(count, QUERY_LEN_CAP), ds.load_query_samples, ds.unload_query_samples) + qsl = lg.ConstructQSL(count, min(count, args.samples_per_query_offline), ds.load_query_samples, ds.unload_query_samples) log.info("starting {}".format(scenario)) result_dict = {"good": 0, "total": 0, "roc_auc": 0, "scenario": str(scenario)}