From 32db94068fb7f57179de2019b3f584fa21d33ff4 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 09:45:35 +0000 Subject: [PATCH 01/42] Added README files for stable-diffusion and llama2-70b --- docs/mlperf/inference/llama2-70b/README.md | 22 ++++++++++ .../inference/llama2-70b/README_reference.md | 43 +++++++++++++++++++ .../inference/stable-diffusion-xl/README.md | 21 +++++++++ .../stable-diffusion-xl/README_reference.md | 43 +++++++++++++++++++ 4 files changed, 129 insertions(+) create mode 100644 docs/mlperf/inference/llama2-70b/README.md create mode 100644 docs/mlperf/inference/llama2-70b/README_reference.md create mode 100644 docs/mlperf/inference/stable-diffusion-xl/README.md create mode 100644 docs/mlperf/inference/stable-diffusion-xl/README_reference.md diff --git a/docs/mlperf/inference/llama2-70b/README.md b/docs/mlperf/inference/llama2-70b/README.md new file mode 100644 index 0000000000..2d0d23e182 --- /dev/null +++ b/docs/mlperf/inference/llama2-70b/README.md @@ -0,0 +1,22 @@ +[ [Back to MLPerf inference benchmarks index](../README.md) ] + +## MLPerf inference: Text summarization with Llama2-70b + +### Notes + +Llama2-70b has two variants - `llama2-70b-99` and `llama2-70b-99.9` where the `99` and `99.9` specify the required accuracy constraint +with respect to the reference fp32 model. Llama2-70b applies only to datacenter category and includes both Offline and Server scenarios. + +Please check [MLPerf inference GitHub](https://github.com/mlcommons/inference) for more details. + +### Install CM + +Please follow this [guide](../README.md#install-cm-automation-language) +to install the [MLCommons CM automation language](https://doi.org/10.5281/zenodo.8105339), +pull the repository with the CM automation recipes for MLPerf and set up virtual environment to run MLPerf benchmarks. 
+ +### Run MLPerf via CM + +The following guides explain how to run different implementations of this benchmark via CM: + +* [MLCommons Reference implementation in Python](README_reference.md) diff --git a/docs/mlperf/inference/llama2-70b/README_reference.md b/docs/mlperf/inference/llama2-70b/README_reference.md new file mode 100644 index 0000000000..1dbfa2e8fd --- /dev/null +++ b/docs/mlperf/inference/llama2-70b/README_reference.md @@ -0,0 +1,43 @@ +[ [Back to the common setup](README.md) ] + + +## Run this benchmark via CM + + +### Do a test run to detect and record the system performance + +``` +cm run script --tags=generate-run-cmds,inference,_find-performance,_all-scenarios \ +--model=llama2-70b-99 --implementation=reference --device=cuda --backend=pytorch \ +--category=datacenter --division=open --precision=bfloat16 --quiet +``` +* Use `--division=closed` to run all scenarios for the closed division (compliance tests are skipped for `_find-performance` mode) +* Use `--precision=float16` or `--precision=float32` to change the model precision +* Use `--model=llama2-70b-99.9` to run the high accuracy constraint llama2-70b-99.9 model. + + + +### Do full accuracy and performance runs for all the scenarios + +``` +cm run script --tags=generate-run-cmds,inference,_submission,_all-scenarios --model=llama2-70b-99 \ +--device=cuda --implementation=reference --backend=pytorch --precision=bfloat16 \ +--execution-mode=valid --category=datacenter --division=open --quiet +``` + +* Use `--power=yes` for measuring power. It is ignored for accuracy and compliance runs +* Use `--division=closed` to run all scenarios for the closed division including the compliance tests +* `--offline_target_qps` and `--server_target_qps` can be used to override the determined performance numbers + +### Generate and upload MLPerf submission + +Follow [this guide](../Submission.md) to generate the submission tree and upload your results. 
+ + +### Run individual scenarios for testing and optimization + +TBD + +### Questions? Suggestions? + +Don't hesitate to get in touch via [public Discord server](https://discord.gg/JjWNWXKxwT). diff --git a/docs/mlperf/inference/stable-diffusion-xl/README.md b/docs/mlperf/inference/stable-diffusion-xl/README.md new file mode 100644 index 0000000000..982431313d --- /dev/null +++ b/docs/mlperf/inference/stable-diffusion-xl/README.md @@ -0,0 +1,21 @@ +[ [Back to MLPerf inference benchmarks index](../README.md) ] + +## MLPerf inference: Text to image with Stable-Diffusion-xl + +### Notes + +Stable-diffusion-xl has SingleStream and Offline scenarios in the edge category and Offline and Server scenarios in the datacenter category. + +Please check [MLPerf inference GitHub](https://github.com/mlcommons/inference) for more details. + +### Install CM + +Please follow this [guide](../README.md#install-cm-automation-language) +to install the [MLCommons CM automation language](https://doi.org/10.5281/zenodo.8105339), +pull the repository with the CM automation recipes for MLPerf and set up virtual environment to run MLPerf benchmarks. 
+ +### Run MLPerf via CM + +The following guides explain how to run different implementations of this benchmark via CM: + +* [MLCommons Reference implementation in Python](README_reference.md) diff --git a/docs/mlperf/inference/stable-diffusion-xl/README_reference.md b/docs/mlperf/inference/stable-diffusion-xl/README_reference.md new file mode 100644 index 0000000000..af94c954c6 --- /dev/null +++ b/docs/mlperf/inference/stable-diffusion-xl/README_reference.md @@ -0,0 +1,43 @@ +[ [Back to the common setup](README.md) ] + + +## Run this benchmark via CM + + +### Do a test run to detect and record the system performance + +``` +cm run script --tags=generate-run-cmds,inference,_find-performance,_all-scenarios \ +--model=sdxl --implementation=reference --device=cuda --backend=pytorch \ +--category=edge --division=open --quiet +``` +* Use `--division=closed` to run all scenarios for the closed division (compliance tests are skipped for `_find-performance` mode) +* Use `--category=datacenter` to run datacenter scenarios + + + +### Do full accuracy and performance runs for all the scenarios + +``` +cm run script --tags=generate-run-cmds,inference,_submission,_all-scenarios --model=sdxl \ +--device=cuda --implementation=reference --backend=pytorch \ +--execution-mode=valid --category=datacenter --division=open --quiet +``` + +* Use `--power=yes` for measuring power. It is ignored for accuracy and compliance runs +* Use `--division=closed` to run all scenarios for the closed division including the compliance tests +* `--offline_target_qps`, `--server_target_qps`, and `--singlestream_target_latency` can be used to override the determined performance numbers +* Use `--category=datacenter` to run datacenter scenarios + +### Generate and upload MLPerf submission + +Follow [this guide](../Submission.md) to generate the submission tree and upload your results. + + +### Run individual scenarios for testing and optimization + +TBD + +### Questions? Suggestions? 
+ +Don't hesitate to get in touch via [public Discord server](https://discord.gg/JjWNWXKxwT). From a020b43923f2230961e43a8c6a18ba498bc068a5 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 15 Feb 2024 12:00:33 +0100 Subject: [PATCH 02/42] productivity tools --- .../README-extra.md | 7 ++-- .../script/app-loadgen-generic-python/run.sh | 1 + cm-mlops/script/copy-to-clipboard/_cm.yaml | 32 +++++++++++++++++++ cm-mlops/script/copy-to-clipboard/code.py | 11 +++++++ cm-mlops/script/copy-to-clipboard/run.bat | 4 +++ cm-mlops/script/copy-to-clipboard/run.sh | 4 +++ 6 files changed, 54 insertions(+), 5 deletions(-) create mode 100644 cm-mlops/script/copy-to-clipboard/_cm.yaml create mode 100644 cm-mlops/script/copy-to-clipboard/code.py create mode 100644 cm-mlops/script/copy-to-clipboard/run.bat create mode 100644 cm-mlops/script/copy-to-clipboard/run.sh diff --git a/cm-mlops/script/app-loadgen-generic-python/README-extra.md b/cm-mlops/script/app-loadgen-generic-python/README-extra.md index 30d0ade166..f15dcc14d6 100644 --- a/cm-mlops/script/app-loadgen-generic-python/README-extra.md +++ b/cm-mlops/script/app-loadgen-generic-python/README-extra.md @@ -221,12 +221,9 @@ cmr "python app loadgen-generic _onnxruntime _cuda _custom _huggingface _model-s cmr "python app loadgen-generic _onnxruntime _cuda _custom _huggingface _model-stub.Intel/gpt-j-6B-int8-static" --adr.hf-downloader.model_filename=model.onnx --adr.hf-downloader.full_subfolder=. 
--samples=2 ``` - -cmr "python app loadgen-generic _onnxruntime _custom _huggingface _model-stub.runwayml/stable-diffusion-v1-5" --adr.hf-downloader.model_filename=onnx/unet/model.onnx,onnx/unet/weights.pb --samples=2 - - -TBD: some cases that are not yet fully supported (data types, etc): +TBD: some cases that are not yet fully supported (data types, input mismatch, etc): ```bash +cmr "python app loadgen-generic _onnxruntime _custom _huggingface _model-stub.runwayml/stable-diffusion-v1-5" --adr.hf-downloader.revision=onnx --adr.hf-downloader.model_filename=unet/model.onnx,unet/weights.pb --samples=2 cmr "python app loadgen-generic _onnxruntime _cuda _custom _huggingface _model-stub.microsoft/Mistral-7B-v0.1-onnx" --adr.hf-downloader.model_filename=Mistral-7B-v0.1.onnx,Mistral-7B-v0.1.onnx.data --samples=2 cmr "python app loadgen-generic _onnxruntime _cuda _custom _huggingface _model-stub.alpindale/Llama-2-7b-ONNX" --adr.hf-downloader.model_filename=FP16/LlamaV2_7B_float16.onnx --adr.hf-downloader.full_subfolder=FP16 --samples=2 ``` diff --git a/cm-mlops/script/app-loadgen-generic-python/run.sh b/cm-mlops/script/app-loadgen-generic-python/run.sh index 61a67a89a7..e007a371f3 100644 --- a/cm-mlops/script/app-loadgen-generic-python/run.sh +++ b/cm-mlops/script/app-loadgen-generic-python/run.sh @@ -1,3 +1,4 @@ #!/bin/bash + ${CM_PYTHON_BIN_WITH_PATH} ${CM_TMP_CURRENT_SCRIPT_PATH}/src/main.py ${CM_RUN_OPTS} ${CM_ML_MODEL_FILE_WITH_PATH} test $? 
-eq 0 || exit 1 diff --git a/cm-mlops/script/copy-to-clipboard/_cm.yaml b/cm-mlops/script/copy-to-clipboard/_cm.yaml new file mode 100644 index 0000000000..de631040b2 --- /dev/null +++ b/cm-mlops/script/copy-to-clipboard/_cm.yaml @@ -0,0 +1,32 @@ +alias: copy-to-clipboard +uid: 8b3aaa97ce58474d + +automation_alias: script +automation_uid: 5b4e0237da074764 + +cache: false + +category: "DevOps automation" + +tags: +- copy +- to +- clipboard +- copy-to-clipboard + +deps: + + # Get Python + - tags: get,python3 + names: + - python + - python3 + + # Extra package + - tags: get,generic-python-lib,_package.pyperclip + +input_mapping: + text: CM_COPY_TO_CLIPBOARD_TEXT + t: CM_COPY_TO_CLIPBOARD_TEXT + add_quotes: CM_COPY_TO_CLIPBOARD_TEXT_ADD_QUOTES + q: CM_COPY_TO_CLIPBOARD_TEXT_ADD_QUOTES diff --git a/cm-mlops/script/copy-to-clipboard/code.py b/cm-mlops/script/copy-to-clipboard/code.py new file mode 100644 index 0000000000..082813e9a0 --- /dev/null +++ b/cm-mlops/script/copy-to-clipboard/code.py @@ -0,0 +1,11 @@ +import os +import pyperclip as pc + +text = os.environ.get('CM_COPY_TO_CLIPBOARD_TEXT', '') + +add_quotes = os.environ.get('CM_COPY_TO_CLIPBOARD_TEXT_ADD_QUOTES', '') in [True,'True','yes'] + +if add_quotes: + text = '"' + text + '"' + +pc.copy(text) diff --git a/cm-mlops/script/copy-to-clipboard/run.bat b/cm-mlops/script/copy-to-clipboard/run.bat new file mode 100644 index 0000000000..545178f203 --- /dev/null +++ b/cm-mlops/script/copy-to-clipboard/run.bat @@ -0,0 +1,4 @@ +rem native script + +%CM_PYTHON_BIN_WITH_PATH% %CM_TMP_CURRENT_SCRIPT_PATH%\code.py +IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% diff --git a/cm-mlops/script/copy-to-clipboard/run.sh b/cm-mlops/script/copy-to-clipboard/run.sh new file mode 100644 index 0000000000..fa6f579f76 --- /dev/null +++ b/cm-mlops/script/copy-to-clipboard/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +${CM_PYTHON_BIN_WITH_PATH} ${CM_TMP_CURRENT_SCRIPT_PATH}/code.py +test $? 
-eq 0 || exit 1 From a59a986277e943a12839f9c07ebcb691539fbca4 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 11:30:26 +0000 Subject: [PATCH 03/42] Update README_nvidia.md --- docs/mlperf/inference/bert/README_nvidia.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/mlperf/inference/bert/README_nvidia.md b/docs/mlperf/inference/bert/README_nvidia.md index 2554fff84e..e8fcc82a7b 100644 --- a/docs/mlperf/inference/bert/README_nvidia.md +++ b/docs/mlperf/inference/bert/README_nvidia.md @@ -1,8 +1,9 @@ [ [Back to the common setup](README.md) ] -## Prepare Nvidia software +## Build Nvidia Docker Container (from 3.1 Inference round) -You need to install TensorRT and set up the configuration files as detailed [here](https://github.com/mlcommons/ck/blob/master/cm-mlops/script/reproduce-mlperf-inference-nvidia/README-about.md). +```cm docker script --tags=build,nvidia,inference,server +``` ## Run this benchmark via CM From a82d4fcc5958c6524e3194e0c7140184170cd70c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 11:30:43 +0000 Subject: [PATCH 04/42] Update README_nvidia.md --- docs/mlperf/inference/bert/README_nvidia.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/mlperf/inference/bert/README_nvidia.md b/docs/mlperf/inference/bert/README_nvidia.md index e8fcc82a7b..207b6066da 100644 --- a/docs/mlperf/inference/bert/README_nvidia.md +++ b/docs/mlperf/inference/bert/README_nvidia.md @@ -2,7 +2,8 @@ ## Build Nvidia Docker Container (from 3.1 Inference round) -```cm docker script --tags=build,nvidia,inference,server +``` +cm docker script --tags=build,nvidia,inference,server ``` ## Run this benchmark via CM From 2ae017f0ba170c422287b6c57392e31be6f4cffd Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 11:35:37 +0000 Subject: [PATCH 05/42] Added bert mlperf inference readme for qaic --- docs/mlperf/inference/bert/README_nvidia.md | 8 ---- 
docs/mlperf/inference/bert/README_qualcomm.md | 46 +++++++++++++++++++ 2 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 docs/mlperf/inference/bert/README_qualcomm.md diff --git a/docs/mlperf/inference/bert/README_nvidia.md b/docs/mlperf/inference/bert/README_nvidia.md index 207b6066da..27542358e6 100644 --- a/docs/mlperf/inference/bert/README_nvidia.md +++ b/docs/mlperf/inference/bert/README_nvidia.md @@ -33,14 +33,6 @@ cmr "generate-run-cmds inference _submission _all-scenarios" --model=bert-99 \ * Use `--division=closed` to run all scenarios for the closed division including the compliance tests * `--offline_target_qps`, `--server_target_qps`, and `--singlestream_target_latency` can be used to override the determined performance numbers -### Populate the README files describing your submission - -``` -cmr "generate-run-cmds inference _populate-readme _all-scenarios" \ ---model=bert-99 --device=cuda --implementation=nvidia-original --backend=tensorrt \ ---execution-mode=valid --results_dir=$HOME/results_dir \ ---category=edge --division=open --quiet -``` ### Generate and upload MLPerf submission diff --git a/docs/mlperf/inference/bert/README_qualcomm.md b/docs/mlperf/inference/bert/README_qualcomm.md new file mode 100644 index 0000000000..ba3c8d3742 --- /dev/null +++ b/docs/mlperf/inference/bert/README_qualcomm.md @@ -0,0 +1,46 @@ +[ [Back to the common setup](README.md) ] + + +## Run this benchmark via CM + +### Do a test run to detect and record the system performance + +``` +cmr "generate-run-cmds inference _find-performance _all-scenarios" \ +--model=bert-99 --implementation=qualcomm --device=qaic --backend=glow \ +--category=edge --division=open --quiet +``` +* Use `--division=closed` to run all scenarios for the closed division (compliance tests are skipped for `_find-performance` mode) +* Use `--category=datacenter` to run datacenter scenarios (only for bert-99.9) +* Use `--model=bert-99.9` to run the high-accuracy model (only for 
datacenter) +* Use `--rerun` to force a rerun even when result files (from a previous run) exist + +### Do full accuracy and performance runs for all the scenarios + +``` +cmr "generate-run-cmds inference _submission _all-scenarios" --model=bert-99 \ +--device=qaic --implementation=qualcomm --backend=qaic \ +--execution-mode=valid --category=edge --division=open --quiet +``` + +* Use `--power=yes` for measuring power. It is ignored for accuracy and compliance runs +* Use `--division=closed` to run all scenarios for the closed division including the compliance tests +* `--offline_target_qps`, `--server_target_qps`, and `--singlestream_target_latency` can be used to override the determined performance numbers + +### Generate and upload MLPerf submission + +Follow [this guide](../Submission.md) to generate the submission tree and upload your results. + +### Run individual scenarios for testing and optimization + +TBD + +### Questions? Suggestions? + +Don't hesitate to get in touch via [public Discord server](https://discord.gg/JjWNWXKxwT). + +### Acknowledgments + +* CM automation for Nvidia's MLPerf inference implementation was developed by Arjun Suresh and Grigori Fursin. +* Nvidia's MLPerf inference implementation was developed by Zhihan Jiang, Ethan Cheng, Yiheng Zhang and Jinho Suh. 
+ From f19c0ec7935b86815f6f0708134cc2484c3c53e8 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 15 Feb 2024 12:35:38 +0100 Subject: [PATCH 06/42] fixed MLPerf accuracy loading on Windows --- .../process-mlperf-accuracy/customize.py | 28 +++++++++++++++---- .../script/process-mlperf-accuracy/run.bat | 8 ++++++ 2 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 cm-mlops/script/process-mlperf-accuracy/run.bat diff --git a/cm-mlops/script/process-mlperf-accuracy/customize.py b/cm-mlops/script/process-mlperf-accuracy/customize.py index 1e50354076..9f6a2f93a2 100644 --- a/cm-mlops/script/process-mlperf-accuracy/customize.py +++ b/cm-mlops/script/process-mlperf-accuracy/customize.py @@ -5,12 +5,17 @@ def preprocess(i): os_info = i['os_info'] + + xsep = ';' if os_info['platform'] == 'windows' else ':' + env = i['env'] results_dir = env.get("CM_MLPERF_ACCURACY_RESULTS_DIR", "") if results_dir == "": print("Please set CM_MLPERF_ACCURACY_RESULTS_DIR") return {'return':-1} + + # In fact, we expect only 1 command line here run_cmds = [] if env.get('CM_MAX_EXAMPLES', '') != '' and env.get('CM_MLPERF_RUN_STYLE', '') != 'valid': @@ -18,15 +23,17 @@ def preprocess(i): else: max_examples_string = "" - results_dir_split = results_dir.split(":") + results_dir_split = results_dir.split(xsep) dataset = env['CM_DATASET'] regenerate_accuracy_file = env.get('CM_MLPERF_REGENERATE_ACCURACY_FILE', False) for result_dir in results_dir_split: out_file = os.path.join(result_dir, 'accuracy.txt') + if os.path.exists(out_file) and os.stat(out_file).st_size != 0 and not regenerate_accuracy_file: continue + if dataset == "openimages": if env.get('CM_DATASET_PATH_ROOT', '') != '': dataset_dir = env['CM_DATASET_PATH_ROOT'] @@ -35,9 +42,9 @@ def preprocess(i): else: env['DATASET_ANNOTATIONS_FILE_PATH'] = env['CM_DATASET_ANNOTATIONS_FILE_PATH'] dataset_dir = os.getcwd() # not used, just to keep the script happy - CMD = env['CM_PYTHON_BIN_WITH_PATH'] + " '" + 
os.path.join(env['CM_MLPERF_INFERENCE_CLASSIFICATION_AND_DETECTION_PATH'], "tools", \ - "accuracy-openimages.py") + "' --mlperf-accuracy-file '" + os.path.join(result_dir, \ - "mlperf_log_accuracy.json") + "' --openimages-dir '" + dataset_dir + "' --verbose > '" + \ + CMD = env['CM_PYTHON_BIN_WITH_PATH'] + " "+"'" + os.path.join(env['CM_MLPERF_INFERENCE_CLASSIFICATION_AND_DETECTION_PATH'], "tools", \ + "accuracy-openimages.py") + "'"+" --mlperf-accuracy-file "+"'" + os.path.join(result_dir, \ + "mlperf_log_accuracy.json") + "'"+" --openimages-dir "+"'" + dataset_dir + "'"+" --verbose > "+"'" + \ out_file + "'" elif dataset == "imagenet": @@ -97,7 +104,12 @@ def preprocess(i): if not os.path.exists(outfile) or os.stat(outfile).st_size == 0 or env.get("CM_REGENERATE_MEASURE_FILES", False): run_cmds.append(CMD) - env['CM_RUN_CMDS'] = "??".join(run_cmds) + + if os_info['platform'] == 'windows': + env['CM_RUN_CMDS'] = ('\n'.join(run_cmds)).replace("'", '"').replace('>','^>') + else: + env['CM_RUN_CMDS'] = "??".join(run_cmds) + return {'return':0} def postprocess(i): @@ -106,8 +118,11 @@ def postprocess(i): env = i['env'] state = i['state'] + xsep = ';' if os_info['platform'] == 'windows' else ':' + results_dir = env.get("CM_MLPERF_ACCURACY_RESULTS_DIR", "") - results_dir_split = results_dir.split(":") + + results_dir_split = results_dir.split(xsep) for result_dir in results_dir_split: accuracy_file = os.path.join(result_dir, "accuracy.txt") @@ -116,6 +131,7 @@ def postprocess(i): print ('') print ('Accuracy file: {}'.format(accuracy_file)) print ('') + x = '' with open(accuracy_file, "r") as fp: x=fp.read() diff --git a/cm-mlops/script/process-mlperf-accuracy/run.bat b/cm-mlops/script/process-mlperf-accuracy/run.bat new file mode 100644 index 0000000000..82705126d1 --- /dev/null +++ b/cm-mlops/script/process-mlperf-accuracy/run.bat @@ -0,0 +1,8 @@ +echo Running command: +echo. +echo %CM_RUN_CMDS% +echo. 
+ +%CM_RUN_CMDS% + +IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% From d4f30d2354c1b52196cdb8af40327a8f2f38b6a9 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 11:52:44 +0000 Subject: [PATCH 07/42] Also save raw pip_freeze for mlperf inference results --- cm-mlops/script/app-mlperf-inference/customize.py | 1 + cm-mlops/script/dump-pip-freeze/customize.py | 12 ++++++++---- cm-mlops/script/dump-pip-freeze/dump.py | 8 +++++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cm-mlops/script/app-mlperf-inference/customize.py b/cm-mlops/script/app-mlperf-inference/customize.py index 0a19fc399b..f9c714bf1a 100644 --- a/cm-mlops/script/app-mlperf-inference/customize.py +++ b/cm-mlops/script/app-mlperf-inference/customize.py @@ -459,6 +459,7 @@ def postprocess(i): if env.get('CM_DUMP_SYSTEM_INFO', True): dump_script_output("detect,os", env, state, 'new_env', os.path.join(output_dir, "os_info.json")) dump_script_output("detect,cpu", env, state, 'new_env', os.path.join(output_dir, "cpu_info.json")) + env['CM_DUMP_RAW_PIP_FREEZE_FILE_PATH'] = os.path.join(env['CM_MLPERF_OUTPUT_DIR'], "pip_freeze.raw") dump_script_output("dump,pip,freeze", env, state, 'new_state', os.path.join(output_dir, "pip_freeze.json")) return {'return':0} diff --git a/cm-mlops/script/dump-pip-freeze/customize.py b/cm-mlops/script/dump-pip-freeze/customize.py index aef0a981df..eb5eeab8af 100644 --- a/cm-mlops/script/dump-pip-freeze/customize.py +++ b/cm-mlops/script/dump-pip-freeze/customize.py @@ -11,6 +11,9 @@ def preprocess(i): automation = i['automation'] + if env.get('CM_DUMP_RAW_PIP_FREEZE_FILE_PATH', '') == '': + env['CM_DUMP_RAW_PIP_FREEZE_FILE_PATH'] = os.path.join(os.getcwd(), "tmp-pip-freeze") + quiet = (env.get('CM_QUIET', False) == 'yes') return {'return':0} @@ -25,18 +28,19 @@ def postprocess(i): automation = i['automation'] pip_freeze = {} - if not os.path.isfile('tmp-pip-freeze'): + pip_freeze_file = env['CM_DUMP_RAW_PIP_FREEZE_FILE_PATH'] + if not 
os.path.isfile(pip_freeze_file): # If was not created, sometimes issues on Windows # There is another workaround if os_info['platform'] == 'windows': r = automation.cmind.access({'action':'system', 'automation':'utils', 'cmd':'py -m pip freeze', - 'stdout':'tmp-pip-freeze'}) + 'stdout':pip_freeze_file}) # skip output - if os.path.isfile('tmp-pip-freeze'): - with open("tmp-pip-freeze", "r") as f: + if os.path.isfile(pip_freeze_file): + with open(pip_freeze_file, "r") as f: for line in f.readlines(): if "==" in line: split = line.split("==") diff --git a/cm-mlops/script/dump-pip-freeze/dump.py b/cm-mlops/script/dump-pip-freeze/dump.py index d74507ccf6..1d7f7ab853 100644 --- a/cm-mlops/script/dump-pip-freeze/dump.py +++ b/cm-mlops/script/dump-pip-freeze/dump.py @@ -1,8 +1,10 @@ import os from pip._internal.operations import freeze -if os.path.isfile('tmp-pip-freeze'): - os.remove('tmp-pip-freeze') +pip_freeze_out = os.environ.get('CM_DUMP_RAW_PIP_FREEZE_FILE_PATH', 'tmp-pip-freeze') + +if os.path.isfile(pip_freeze_out): + os.remove(pip_freeze_out) pkgs = freeze.freeze() @@ -15,5 +17,5 @@ pass if len(x)>0: - with open('tmp-pip-freeze', "w") as f: + with open(pip_freeze_out, "w") as f: f.write(x) From af65bfb810cb398dc98881568a73c897e3bfd2d7 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 12:04:36 +0000 Subject: [PATCH 08/42] Use CM cache for mlperf inference submission generation --- .../_cm.json | 18 ++++++++++++++++++ .../_cm.yaml | 3 ++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/cm-mlops/script/generate-mlperf-inference-submission/_cm.json b/cm-mlops/script/generate-mlperf-inference-submission/_cm.json index 4e732fb30f..0d15cb6272 100644 --- a/cm-mlops/script/generate-mlperf-inference-submission/_cm.json +++ b/cm-mlops/script/generate-mlperf-inference-submission/_cm.json @@ -26,6 +26,24 @@ }, { "tags": "get,mlperf,inference,utils" + }, + { + "tags": "get,mlperf,results,dir", + "names": [ + "get-mlperf-results-dir" + ], + 
"skip_if_env": { + "CM_MLPER_RESULTS_DIR": [ "on" ] + } + }, + { + "tags": "get,mlperf,submission,dir", + "names": [ + "get-mlperf-submission-dir" + ], + "skip_if_env": { + "CM_MLPERF_SUBMISSION_DIR": [ "on" ] + } } ], "input_mapping": { diff --git a/cm-mlops/script/generate-mlperf-inference-user-conf/_cm.yaml b/cm-mlops/script/generate-mlperf-inference-user-conf/_cm.yaml index 9d19ad8fc9..36f591df4a 100644 --- a/cm-mlops/script/generate-mlperf-inference-user-conf/_cm.yaml +++ b/cm-mlops/script/generate-mlperf-inference-user-conf/_cm.yaml @@ -77,7 +77,8 @@ deps: names: - get-mlperf-results-dir skip_if_env: - OUTPUT_BASE_DIR: [ on ] + OUTPUT_BASE_DIR: + - "on" ######################################################################## # Install MLPerf inference dependencies From 41007247cfd75880144c732cbf08b16451c2df8d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 12:21:44 +0000 Subject: [PATCH 09/42] Update README_aws_dl2q.24xlarge.md --- .../README_aws_dl2q.24xlarge.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md b/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md index e904bacb35..e60669e8ae 100644 --- a/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md +++ b/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md @@ -2,7 +2,8 @@ `dl2q.24xlarge` instance is available in `us-west-2d` and it has 96 vCPUs and 768 GB of memory. -[Deep Learning Base Qualcomm AMI (Amazon Linux 2) 20231213, ami-08cae482e3b14c9b8](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#LaunchInstances:ami=ami-08cae482e3b14c9b8) image from the Community AMIs is the recommended OS image as it comes with the QIAC SDKs (both Apps and Platform) preinstalled. 
+[Deep Learning Base Qualcomm AMI (Amazon Linux 2) 20240110, ami-08cae482e3b14c9b8](https://us-west-2.console.aws.amazon.com/ec2/home?region=us-west-2#LaunchInstances:ami=ami-08cae482e3b14c9b8) +image from the Community AMIs is the recommended OS image as it comes with the QIAC SDKs (both Apps and Platform) preinstalled. * Recommended to take 300 GB root disk From 4170a9268f28bf435d847bf2fc91b2f7da8f9d1c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 12:23:07 +0000 Subject: [PATCH 10/42] Update README_aws_dl2q.24xlarge.md --- .../README_aws_dl2q.24xlarge.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md b/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md index e60669e8ae..a423945bdf 100644 --- a/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md +++ b/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md @@ -2,7 +2,7 @@ `dl2q.24xlarge` instance is available in `us-west-2d` and it has 96 vCPUs and 768 GB of memory. -[Deep Learning Base Qualcomm AMI (Amazon Linux 2) 20240110, ami-08cae482e3b14c9b8](https://us-west-2.console.aws.amazon.com/ec2/home?region=us-west-2#LaunchInstances:ami=ami-08cae482e3b14c9b8) +[Deep Learning Base Qualcomm AMI (Amazon Linux 2) 20240110, ami-0799a42a111b1b87a](https://us-west-2.console.aws.amazon.com/ec2/home?region=us-west-2#LaunchInstances:ami=ami-0799a42a111b1b87a) image from the Community AMIs is the recommended OS image as it comes with the QIAC SDKs (both Apps and Platform) preinstalled. 
* Recommended to take 300 GB root disk From 042d9f860a55eb163d6ec429f30e64b2deef14a3 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 13:20:31 +0000 Subject: [PATCH 11/42] Update Submission.md --- docs/mlperf/inference/Submission.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/mlperf/inference/Submission.md b/docs/mlperf/inference/Submission.md index b874d19523..c71f91195b 100644 --- a/docs/mlperf/inference/Submission.md +++ b/docs/mlperf/inference/Submission.md @@ -6,11 +6,9 @@ You should use the master branch of MLCommons inference repo for the submission ```bash cmr "generate inference submission" \ ---results_dir=$HOME/results_dir/valid_results \ ---submission_dir=$HOME/inference_submission_tree \ --clean \ --preprocess_submission=yes \ ---adr.compiler.tags=gcc --adr.inference-src.version=master \ +--adr.compiler.tags=gcc \ --run-checker \ --submitter=CTuning \ --tar=yes \ From aeaa4f1a5cf73688f45aece4ae4f0e63792e1430 Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Feb 2024 05:23:30 -0800 Subject: [PATCH 12/42] Fixes to submission generation --- .../generate-mlperf-inference-submission/_cm.json | 4 ++-- .../generate-mlperf-inference-submission/customize.py | 10 +++++----- cm-mlops/script/install-nccl-libs/run-ubuntu.sh | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cm-mlops/script/generate-mlperf-inference-submission/_cm.json b/cm-mlops/script/generate-mlperf-inference-submission/_cm.json index 0d15cb6272..0a404f097c 100644 --- a/cm-mlops/script/generate-mlperf-inference-submission/_cm.json +++ b/cm-mlops/script/generate-mlperf-inference-submission/_cm.json @@ -48,11 +48,11 @@ ], "input_mapping": { "device": "CM_MLPERF_DEVICE", - "results_dir": "CM_MLPERF_RESULTS_DIR", + "results_dir": "CM_MLPERF_INFERENCE_RESULTS_DIR", "run_checker": "CM_RUN_SUBMISSION_CHECKER", "run_style": "CM_MLPERF_RUN_STYLE", "skip_truncation": "CM_SKIP_TRUNCATE_ACCURACY", - "submission_dir": 
"CM_MLPERF_SUBMISSION_DIR", + "submission_dir": "CM_MLPERF_INFERENCE_SUBMISSION_DIR", "clean": "CM_MLPERF_CLEAN_SUBMISSION_DIR", "hw_name": "CM_HW_NAME", "sw_notes_extra": "CM_MLPERF_SUT_SW_NOTES_EXTRA", diff --git a/cm-mlops/script/generate-mlperf-inference-submission/customize.py b/cm-mlops/script/generate-mlperf-inference-submission/customize.py index fb60c68c9c..084429ff91 100644 --- a/cm-mlops/script/generate-mlperf-inference-submission/customize.py +++ b/cm-mlops/script/generate-mlperf-inference-submission/customize.py @@ -18,19 +18,19 @@ def generate_submission(i): state = i['state'] inp=i['input'] - if env.get('CM_MLPERF_RESULTS_DIR', '') == '': + if env.get('CM_MLPERF_INFERENCE_RESULTS_DIR', '') == '': return {"return": 1, "error": "Please set --results_dir to the folder containing MLPerf inference results"} mlperf_path = env['CM_MLPERF_INFERENCE_SOURCE'] submission_checker_dir = os.path.join(mlperf_path, "tools", "submission") sys.path.append(submission_checker_dir) - results_dir = env['CM_MLPERF_RESULTS_DIR'] + results_dir = env['CM_MLPERF_INFERENCE_RESULTS_DIR'] - if env.get('CM_MLPERF_SUBMISSION_DIR', '') == '': + if env.get('CM_MLPERF_INFERENCE_SUBMISSION_DIR', '') == '': from pathlib import Path user_home = str(Path.home()) - env['CM_MLPERF_SUBMISSION_DIR'] = os.path.join(user_home, "mlperf_submission") + env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] = os.path.join(user_home, "mlperf_submission") if env.get('CM_MLPERF_CLEAN_SUBMISSION_DIR','')!='': print ('=================================================') @@ -39,7 +39,7 @@ def generate_submission(i): shutil.rmtree(env['CM_MLPERF_SUBMISSION_DIR']) print ('=================================================') - submission_dir = env['CM_MLPERF_SUBMISSION_DIR'] + submission_dir = env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] if not os.path.isdir(submission_dir): os.makedirs(submission_dir) diff --git a/cm-mlops/script/install-nccl-libs/run-ubuntu.sh b/cm-mlops/script/install-nccl-libs/run-ubuntu.sh index 
2b380e0223..e56074a517 100644 --- a/cm-mlops/script/install-nccl-libs/run-ubuntu.sh +++ b/cm-mlops/script/install-nccl-libs/run-ubuntu.sh @@ -1,2 +1,2 @@ CM_SUDO=${CM_SUDO:-sudo} -${CM_SUDO} apt install -y libnccl2=2.18.3-1+cuda${CM_CUDA_VERSION} libnccl-dev=2.18.3-1+cuda${CM_CUDA_VERSION} +${CM_SUDO} apt install -y --allow-downgrades libnccl2=2.18.3-1+cuda${CM_CUDA_VERSION} libnccl-dev=2.18.3-1+cuda${CM_CUDA_VERSION} From f3b8ba197ea58144c4b858d5f337c5c39ce0e47c Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Feb 2024 05:48:24 -0800 Subject: [PATCH 13/42] Fixes for submission generation --- .../_cm.json | 2 +- .../customize.py | 16 ++++++++++------ .../customize.py | 4 +++- .../_cm.json | 2 +- .../customize.py | 8 ++++---- .../_cm.json | 4 ++-- .../customize.py | 6 +++--- .../_cm.json | 4 ++-- .../customize.py | 6 +++--- 9 files changed, 29 insertions(+), 23 deletions(-) diff --git a/cm-mlops/script/generate-mlperf-inference-submission/_cm.json b/cm-mlops/script/generate-mlperf-inference-submission/_cm.json index 0a404f097c..2f02ab3ec6 100644 --- a/cm-mlops/script/generate-mlperf-inference-submission/_cm.json +++ b/cm-mlops/script/generate-mlperf-inference-submission/_cm.json @@ -48,7 +48,7 @@ ], "input_mapping": { "device": "CM_MLPERF_DEVICE", - "results_dir": "CM_MLPERF_INFERENCE_RESULTS_DIR", + "results_dir": "CM_MLPERF_INFERENCE_RESULTS_DIR_", "run_checker": "CM_RUN_SUBMISSION_CHECKER", "run_style": "CM_MLPERF_RUN_STYLE", "skip_truncation": "CM_SKIP_TRUNCATE_ACCURACY", diff --git a/cm-mlops/script/generate-mlperf-inference-submission/customize.py b/cm-mlops/script/generate-mlperf-inference-submission/customize.py index 084429ff91..f71d07648d 100644 --- a/cm-mlops/script/generate-mlperf-inference-submission/customize.py +++ b/cm-mlops/script/generate-mlperf-inference-submission/customize.py @@ -18,8 +18,8 @@ def generate_submission(i): state = i['state'] inp=i['input'] - if env.get('CM_MLPERF_INFERENCE_RESULTS_DIR', '') == '': - return {"return": 1, "error": 
"Please set --results_dir to the folder containing MLPerf inference results"} + if env.get('CM_MLPERF_INFERENCE_RESULTS_DIR_', '') == '': + env['CM_MLPERF_INFERENCE_RESULTS_DIR'] = os.path.join(env['CM_MLPERF_INFERENCE_RESULTS_DIR'], "valid_results") mlperf_path = env['CM_MLPERF_INFERENCE_SOURCE'] submission_checker_dir = os.path.join(mlperf_path, "tools", "submission") @@ -32,14 +32,15 @@ def generate_submission(i): user_home = str(Path.home()) env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] = os.path.join(user_home, "mlperf_submission") + submission_dir = env.get('CM_MLPERF_INFERENCE_SUBMISSION_DIR', '') + if env.get('CM_MLPERF_CLEAN_SUBMISSION_DIR','')!='': print ('=================================================') - print ('Cleaning {} ...'.format(env['CM_MLPERF_SUBMISSION_DIR'])) - if os.path.exists(env['CM_MLPERF_SUBMISSION_DIR']): - shutil.rmtree(env['CM_MLPERF_SUBMISSION_DIR']) + print ('Cleaning {} ...'.format(env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'])) + if os.path.exists(env['CM_MLPERF_INFERENCE_SUBMISSION_DIR']): + shutil.rmtree(env['CM_MLPERF_INFERENCE_SUBMISSION_DIR']) print ('=================================================') - submission_dir = env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] if not os.path.isdir(submission_dir): os.makedirs(submission_dir) @@ -127,6 +128,9 @@ def generate_submission(i): # Override framework and framework versions from the folder name system_meta_default['framework'] = framework + " " + framework_version + else: + print(parts) + return {'return': 1} result_path = os.path.join(results_dir, res) platform_prefix = inp.get('platform_prefix', '') if platform_prefix: diff --git a/cm-mlops/script/get-mlperf-inference-submission-dir/customize.py b/cm-mlops/script/get-mlperf-inference-submission-dir/customize.py index a7f885d518..8f94abb151 100644 --- a/cm-mlops/script/get-mlperf-inference-submission-dir/customize.py +++ b/cm-mlops/script/get-mlperf-inference-submission-dir/customize.py @@ -14,7 +14,9 @@ def preprocess(i): quiet = 
(env.get('CM_QUIET', False) == 'yes') if env.get('CM_MLPERF_INFERENCE_SUBMISSION_DIR','') == '': - env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] = os.getcwd() + if not os.path.exists("mlperf-inference-submission"): + os.makedir("mlperf-inference-submission") + env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] = os.path.join(os.getcwd(), "mlperf-inference-submission") return {'return':0} diff --git a/cm-mlops/script/preprocess-mlperf-inference-submission/_cm.json b/cm-mlops/script/preprocess-mlperf-inference-submission/_cm.json index 3cbe9244d8..80f74c6cf5 100644 --- a/cm-mlops/script/preprocess-mlperf-inference-submission/_cm.json +++ b/cm-mlops/script/preprocess-mlperf-inference-submission/_cm.json @@ -22,7 +22,7 @@ } ], "input_mapping": { - "submission_dir": "CM_MLPERF_SUBMISSION_DIR", + "submission_dir": "CM_MLPERF_INFERENCE_SUBMISSION_DIR", "submitter": "CM_MLPERF_SUBMITTER" }, "tags": [ diff --git a/cm-mlops/script/preprocess-mlperf-inference-submission/customize.py b/cm-mlops/script/preprocess-mlperf-inference-submission/customize.py index 08ee3d0723..03bca7cd9b 100644 --- a/cm-mlops/script/preprocess-mlperf-inference-submission/customize.py +++ b/cm-mlops/script/preprocess-mlperf-inference-submission/customize.py @@ -8,11 +8,11 @@ def preprocess(i): os_info = i['os_info'] env = i['env'] - submission_dir = env.get("CM_MLPERF_SUBMISSION_DIR", "") + submission_dir = env.get("CM_MLPERF_INFERENCE_SUBMISSION_DIR", "") if submission_dir == "": - print("Please set CM_MLPERF_SUBMISSION_DIR") - return {'return': 1, 'error':'CM_MLPERF_SUBMISSION_DIR is not specified'} + print("Please set CM_MLPERF_INFERENCE_SUBMISSION_DIR") + return {'return': 1, 'error':'CM_MLPERF_INFERENCE_SUBMISSION_DIR is not specified'} submitter = env.get("CM_MLPERF_SUBMITTER", "cTuning") submission_processed = submission_dir + "_processed" @@ -31,7 +31,7 @@ def preprocess(i): def postprocess(i): env = i['env'] - submission_dir = env["CM_MLPERF_SUBMISSION_DIR"] + submission_dir = 
env["CM_MLPERF_INFERENCE_SUBMISSION_DIR"] import datetime submission_backup = submission_dir+"_backup_"+'{date:%Y-%m-%d_%H:%M:%S}'.format( date=datetime.datetime.now() ) diff --git a/cm-mlops/script/run-mlperf-inference-submission-checker/_cm.json b/cm-mlops/script/run-mlperf-inference-submission-checker/_cm.json index ffc0d6cd90..4a9331b410 100644 --- a/cm-mlops/script/run-mlperf-inference-submission-checker/_cm.json +++ b/cm-mlops/script/run-mlperf-inference-submission-checker/_cm.json @@ -64,8 +64,8 @@ ], "input_mapping": { "skip_compliance": "CM_MLPERF_SKIP_COMPLIANCE", - "submission_dir": "CM_MLPERF_SUBMISSION_DIR", - "input": "CM_MLPERF_SUBMISSION_DIR", + "submission_dir": "CM_MLPERF_INFERENCE_SUBMISSION_DIR", + "input": "CM_MLPERF_INFERENCE_SUBMISSION_DIR", "submitter": "CM_MLPERF_SUBMITTER", "src_version": "CM_MLPERF_SUBMISSION_CHECKER_VERSION", "push_to_github": "CM_MLPERF_RESULT_PUSH_TO_GITHUB", diff --git a/cm-mlops/script/run-mlperf-inference-submission-checker/customize.py b/cm-mlops/script/run-mlperf-inference-submission-checker/customize.py index 29b74695bc..71ded640f6 100644 --- a/cm-mlops/script/run-mlperf-inference-submission-checker/customize.py +++ b/cm-mlops/script/run-mlperf-inference-submission-checker/customize.py @@ -7,7 +7,7 @@ def preprocess(i): os_info = i['os_info'] env = i['env'] - submission_dir = env.get("CM_MLPERF_SUBMISSION_DIR", "") + submission_dir = env.get("CM_MLPERF_INFERENCE_SUBMISSION_DIR", "") version = env.get('CM_MLPERF_SUBMISSION_CHECKER_VERSION','') @@ -68,8 +68,8 @@ def preprocess(i): def postprocess(i): env = i['env'] - if env.get('CM_TAR_SUBMISSION_DIR'): - env['CM_TAR_INPUT_DIR'] = env.get('CM_MLPERF_SUBMISSION_DIR', '$HOME') + if env.get('CM_TAR_SUBMISSION_DIR',''): + env['CM_TAR_INPUT_DIR'] = env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] x=env.get('MLPERF_INFERENCE_SUBMISSION_TAR_FILE','') if x!='': diff --git a/cm-mlops/script/truncate-mlperf-inference-accuracy-log/_cm.json 
b/cm-mlops/script/truncate-mlperf-inference-accuracy-log/_cm.json index 20c019fa57..f54e6417d4 100644 --- a/cm-mlops/script/truncate-mlperf-inference-accuracy-log/_cm.json +++ b/cm-mlops/script/truncate-mlperf-inference-accuracy-log/_cm.json @@ -21,8 +21,8 @@ } ], "input_mapping": { - "input": "CM_MLPERF_SUBMISSION_DIR", - "submission_dir": "CM_MLPERF_SUBMISSION_DIR", + "input": "CM_MLPERF_INFERENCE_SUBMISSION_DIR", + "submission_dir": "CM_MLPERF_INFERENCE_SUBMISSION_DIR", "submitter": "CM_MLPERF_SUBMITTER" }, "tags": [ diff --git a/cm-mlops/script/truncate-mlperf-inference-accuracy-log/customize.py b/cm-mlops/script/truncate-mlperf-inference-accuracy-log/customize.py index 2a59b45932..5f03ccb086 100644 --- a/cm-mlops/script/truncate-mlperf-inference-accuracy-log/customize.py +++ b/cm-mlops/script/truncate-mlperf-inference-accuracy-log/customize.py @@ -8,11 +8,11 @@ def preprocess(i): os_info = i['os_info'] env = i['env'] - submission_dir = env.get("CM_MLPERF_SUBMISSION_DIR", "") + submission_dir = env.get("CM_MLPERF_INFERENCE_SUBMISSION_DIR", "") if submission_dir == "": - print("Please set CM_MLPERF_SUBMISSION_DIR") - return {'return': 1, 'error':'CM_MLPERF_SUBMISSION_DIR is not specified in env in run-mlperf-accuracy-log-truncator'} + print("Please set CM_MLPERF_INFERENCE_SUBMISSION_DIR") + return {'return': 1, 'error':'CM_MLPERF_INFERENCE_SUBMISSION_DIR is not specified in env in run-mlperf-accuracy-log-truncator'} submitter = env.get("CM_MLPERF_SUBMITTER", "cTuning") From a02057247897ba098170ba03fceb60ca303fd617 Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Feb 2024 05:53:08 -0800 Subject: [PATCH 14/42] Remove --results_dir from run-template --- .../run-template.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/run-template.sh b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/run-template.sh index bfedac7923..1557c58d09 100644 --- 
a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/run-template.sh +++ b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/run-template.sh @@ -47,38 +47,38 @@ find_performance_cmd='cm run script --tags=generate-run-cmds,inference,_find-per submission_cmd='cm run script --tags=generate-run-cmds,inference,_submission,_all-scenarios \ --model=$model --implementation=$implementation --device=$device --backend=$backend \ ---category=$category --division=$division --quiet --results_dir=$results_dir \ +--category=$category --division=$division --quiet \ --skip_submission_generation=yes --execution-mode=valid ${POWER_STRING} ${EXTRA_ARGS}' submission_cmd_scenario='cm run script --tags=generate-run-cmds,inference,_submission --scenario=$scenario \ --model=$model --implementation=$implementation --device=$device --backend=$backend \ ---category=$category --division=$division --quiet --results_dir=$results_dir \ +--category=$category --division=$division --quiet \ --skip_submission_generation=yes --execution-mode=valid ${POWER_STRING} ${EXTRA_ARGS}' readme_cmd_single='cm run script --tags=generate-run-cmds,inference,_populate-readme --scenario=$scenario \ --model=$model --implementation=$implementation --device=$device --backend=$backend \ ---category=$category --division=$division --quiet --results_dir=$results_dir \ +--category=$category --division=$division --quiet \ --skip_submission_generation=yes --execution-mode=valid ${POWER_STRING} ${EXTRA_ARGS}' readme_cmd='cm run script --tags=generate-run-cmds,inference,_populate-readme,_all-scenarios \ --model=$model --implementation=$implementation --device=$device --backend=$backend \ ---category=$category --division=$division --quiet --results_dir=$results_dir \ +--category=$category --division=$division --quiet \ --skip_submission_generation=yes --execution-mode=valid ${POWER_STRING} ${EXTRA_ARGS}' tflite_accuracy_cmd='cm run script --tags=run,mobilenet-models,_tflite,_accuracy-only$extra_tags \ 
--adr.compiler.tags=gcc \ ${extra_option} \ ---results_dir=$results_dir ${EXTRA_ARGS}' + ${EXTRA_ARGS}' tflite_performance_cmd='cm run script --tags=run,mobilenet-models,_tflite,_performance-only$extra_tags \ ${POWER_STRING} \ --adr.compiler.tags=gcc \ ${extra_option} \ ---results_dir=$results_dir ${EXTRA_ARGS}' + ${EXTRA_ARGS}' tflite_readme_cmd='cm run script --tags=run,mobilenet-models,_tflite,_populate-readme$extra_tags \ ${POWER_STRING} \ --adr.compiler.tags=gcc \ ${extra_option} \ ---results_dir=$results_dir ${EXTRA_ARGS}' + ${EXTRA_ARGS}' From 7c786f08aa9fa069f01cf772116ff5fac3e19c8e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 14:49:24 +0000 Subject: [PATCH 15/42] Added new SUTs --- .../_cm.yaml | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml index 12b390efac..e1cbc57f70 100644 --- a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml +++ b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml @@ -117,11 +117,38 @@ variations: POWER_SERVER: 192.168.0.15 POWER_SERVER_PORT: 4950 - + phoenix,power: + default_env: + POWER_SERVER: 192.168.0.15 + POWER_SERVER_PORT: 4950 + rb6,power: + default_env: + POWER_SERVER: 192.168.0.15 + POWER_SERVER_PORT: 4950 + orin,power: + default_env: + POWER_SERVER: 192.168.0.15 + POWER_SERVER_PORT: 4950 + rpi4,power: + default_env: + POWER_SERVER: 192.168.0.15 + POWER_SERVER_PORT: 4950 + mini,power: + default_env: + POWER_SERVER: 192.168.0.15 + POWER_SERVER_PORT: 4950 + rb6: + group: sut + orin: + group: sut + rpi4: + group: sut + mini: + group: sut phoenix: group: sut env: - CATEGORY: edge,datacenter + CATEGORY: edge DIVISION: closed state: resnet50: From 36134fee9b681d243f51e5bd4622641f36fe0d0e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 14:54:22 +0000 Subject: [PATCH 
16/42] Update the power server IP for the suts --- .../_cm.yaml | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml index e1cbc57f70..2d67773b95 100644 --- a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml +++ b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml @@ -114,29 +114,24 @@ variations: env: POWER: yes default_env: - POWER_SERVER: 192.168.0.15 + POWER_SERVER: 192.168.1.166 POWER_SERVER_PORT: 4950 phoenix,power: default_env: - POWER_SERVER: 192.168.0.15 - POWER_SERVER_PORT: 4950 + POWER_SERVER: 192.168.1.79 rb6,power: default_env: - POWER_SERVER: 192.168.0.15 - POWER_SERVER_PORT: 4950 + POWER_SERVER: 192.168.1.166 orin,power: default_env: - POWER_SERVER: 192.168.0.15 - POWER_SERVER_PORT: 4950 + POWER_SERVER: 192.168.1.166 rpi4,power: default_env: - POWER_SERVER: 192.168.0.15 - POWER_SERVER_PORT: 4950 + POWER_SERVER: 192.168.1.166 mini,power: default_env: - POWER_SERVER: 192.168.0.15 - POWER_SERVER_PORT: 4950 + POWER_SERVER: 192.168.1.166 rb6: group: sut orin: From 14430a6808e1df421b76aa970d329df823c303f5 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 15 Feb 2024 17:02:27 +0100 Subject: [PATCH 17/42] * support loadgen C++ building on Windows * support LLVM 17+ --- .../script/app-mlperf-inference-cpp/customize.py | 11 +++++++++-- .../get-mlperf-inference-loadgen/customize.py | 12 ++++++------ .../script/get-mlperf-inference-loadgen/run.bat | 15 +++++++++++++++ .../script/install-llvm-prebuilt/customize.py | 14 ++++++++++++++ 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/cm-mlops/script/app-mlperf-inference-cpp/customize.py b/cm-mlops/script/app-mlperf-inference-cpp/customize.py index 5c7c232351..1f8c3b0e11 100644 --- a/cm-mlops/script/app-mlperf-inference-cpp/customize.py +++ 
b/cm-mlops/script/app-mlperf-inference-cpp/customize.py @@ -6,9 +6,16 @@ def preprocess(i): os_info = i['os_info'] -# if os_info['platform'] == 'windows': -# return {'return':1, 'error': 'Windows is not supported in this script yet'} + automation = i['automation'] + + meta = i['meta'] +# if os_info['platform'] == 'windows': +# # Currently support only LLVM on Windows +# print ('# Forcing LLVM on Windows') +# r = automation.update_deps({'deps':meta['post_deps'], 'update_deps':{'compile-program': {'adr':{'compiler':{'tags':'llvm'}}}}}) +# if r['return']>0: return r + env = i['env'] if env.get('CM_MLPERF_SKIP_RUN', '') == "yes": diff --git a/cm-mlops/script/get-mlperf-inference-loadgen/customize.py b/cm-mlops/script/get-mlperf-inference-loadgen/customize.py index 23391b0346..0e4768366d 100644 --- a/cm-mlops/script/get-mlperf-inference-loadgen/customize.py +++ b/cm-mlops/script/get-mlperf-inference-loadgen/customize.py @@ -21,11 +21,11 @@ def postprocess(i): env[key] = [] # On Windows installs directly into Python distro for simplicity - if os_info['platform'] != 'windows': - env['+C_INCLUDE_PATH'].append(os.path.join(os.getcwd(), 'install', 'include')) - env['+CPLUS_INCLUDE_PATH'].append(os.path.join(os.getcwd(), 'install', 'include')) - env['+LD_LIBRARY_PATH'].append(os.path.join(os.getcwd(), 'install', 'lib')) - env['+DYLD_FALLBACK_LIBRARY_PATH'].append(os.path.join(os.getcwd(), 'install', 'lib')) - env['+PYTHONPATH'].append(os.path.join(os.getcwd(), 'install', 'python')) +# if os_info['platform'] != 'windows': + env['+C_INCLUDE_PATH'].append(os.path.join(os.getcwd(), 'install', 'include')) + env['+CPLUS_INCLUDE_PATH'].append(os.path.join(os.getcwd(), 'install', 'include')) + env['+LD_LIBRARY_PATH'].append(os.path.join(os.getcwd(), 'install', 'lib')) + env['+DYLD_FALLBACK_LIBRARY_PATH'].append(os.path.join(os.getcwd(), 'install', 'lib')) + env['+PYTHONPATH'].append(os.path.join(os.getcwd(), 'install', 'python')) return {'return':0} diff --git 
a/cm-mlops/script/get-mlperf-inference-loadgen/run.bat b/cm-mlops/script/get-mlperf-inference-loadgen/run.bat index dc1d75acc0..6d97f12b4e 100644 --- a/cm-mlops/script/get-mlperf-inference-loadgen/run.bat +++ b/cm-mlops/script/get-mlperf-inference-loadgen/run.bat @@ -9,6 +9,8 @@ if "%CM_MLPERF_INFERENCE_LOADGEN_DOWNLOAD%" == "YES" ( set CM_MLPERF_INFERENCE_SOURCE=%CM_EXTRACT_EXTRACTED_PATH% ) +set INSTALL_DIR=%CUR_DIR%\install + echo. echo Switching to %CM_MLPERF_INFERENCE_SOURCE%\loadgen @@ -22,3 +24,16 @@ echo Running %CM_PYTHON_BIN% setup.py develop IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% echo ======================================================= +cmake ^ + -DCMAKE_INSTALL_PREFIX=%INSTALL_DIR% ^ + %CM_MLPERF_INFERENCE_SOURCE%\loadgen ^ + -DPYTHON_EXECUTABLE:FILEPATH=%CM_PYTHON_BIN_WITH_PATH% +IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% + +echo ======================================================= +cmake --build . --target install +IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% + +del /Q /S build + +echo ======================================================= diff --git a/cm-mlops/script/install-llvm-prebuilt/customize.py b/cm-mlops/script/install-llvm-prebuilt/customize.py index 236518ca0e..c64f6b5923 100644 --- a/cm-mlops/script/install-llvm-prebuilt/customize.py +++ b/cm-mlops/script/install-llvm-prebuilt/customize.py @@ -119,6 +119,20 @@ def preprocess(i): elif need_version == '16.0.4': default_os = '22.04' + elif need_version == '17.0.2': + default_os = '22.04' + + elif need_version == '17.0.2': + default_os = '22.04' + + elif need_version == '17.0.4': + default_os = '22.04' + + elif need_version == '17.0.5': + default_os = '22.04' + + elif need_version == '17.0.6': + default_os = '22.04' package_name = 'clang+llvm-' + need_version + '-x86_64-linux-gnu-ubuntu-' + default_os + '.tar.xz' From 5ee841f05c9513ab81d007bf00728f4d1d679a1c Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 15 Feb 2024 17:12:27 +0100 Subject: [PATCH 18/42] fixing benchmark program on 
Windows --- cm-mlops/script/app-image-corner-detection/_cm.json | 4 ++++ cm-mlops/script/benchmark-program/customize.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cm-mlops/script/app-image-corner-detection/_cm.json b/cm-mlops/script/app-image-corner-detection/_cm.json index 4462c5736d..405654f5ee 100644 --- a/cm-mlops/script/app-image-corner-detection/_cm.json +++ b/cm-mlops/script/app-image-corner-detection/_cm.json @@ -3,6 +3,10 @@ "automation_alias": "script", "automation_uid": "5b4e0237da074764", "category": "Modular application pipeline", + "deps": [ + {"tags":"detect,os"}, + {"tags":"detect,cpu"} + ], "posthook_deps": [ { "skip_if_env": { diff --git a/cm-mlops/script/benchmark-program/customize.py b/cm-mlops/script/benchmark-program/customize.py index b4bb9e3f40..0ca21ce255 100644 --- a/cm-mlops/script/benchmark-program/customize.py +++ b/cm-mlops/script/benchmark-program/customize.py @@ -41,7 +41,7 @@ def preprocess(i): if x!='': env['CM_RUN_CMD'] = x + ' ' + env.get('CM_RUN_CMD','') - if env.get('CM_HOST_OS_TYPE', '') != 'windows' and str(env.get('CM_SAVE_CONSOLE_LOG', True)).lower() not in [ "no", "false", "0"]: + if os_info['platform'] == 'windows' and str(env.get('CM_SAVE_CONSOLE_LOG', True)).lower() not in [ "no", "false", "0"]: logs_dir = env.get('CM_LOGS_DIR', env['CM_RUN_DIR']) env['CM_RUN_CMD'] += " 2>&1 | tee " + os.path.join(logs_dir, "console.out") From 3f43fad405bd375bd08d569c9e905a370df5c034 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 15 Feb 2024 17:32:17 +0100 Subject: [PATCH 19/42] clean up --- cm-mlops/script/app-mlperf-inference-cpp/tests/win.bat | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 cm-mlops/script/app-mlperf-inference-cpp/tests/win.bat diff --git a/cm-mlops/script/app-mlperf-inference-cpp/tests/win.bat b/cm-mlops/script/app-mlperf-inference-cpp/tests/win.bat new file mode 100644 index 0000000000..08dc944a4b --- /dev/null +++ 
b/cm-mlops/script/app-mlperf-inference-cpp/tests/win.bat @@ -0,0 +1,8 @@ +rem TBD: current not compiling - need to check ... + +cmr "install llvm prebuilt" --version=16.0.4 +cmr "install llvm prebuilt" --version=17.0.6 + +cmr "get lib onnxruntime lang-cpp _cpu" --version=1.11.1 +cmr "get lib onnxruntime lang-cpp _cpu" --version=1.13.1 +cmr "get lib onnxruntime lang-cpp _cpu" --version=1.15.1 From 08d4a59a5397446e0300467d50f606a253f1eafc Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Feb 2024 09:16:29 -0800 Subject: [PATCH 20/42] Fix wrong state export in nvidia implementation --- .../customize.py | 17 ++++++++++------- .../reproduce-mlperf-inference-nvidia/_cm.yaml | 6 +++--- .../run-mlperf-inference-app/customize.py | 16 +++------------- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/cm-mlops/script/generate-mlperf-inference-user-conf/customize.py b/cm-mlops/script/generate-mlperf-inference-user-conf/customize.py index b67dcf20af..f8c404e143 100644 --- a/cm-mlops/script/generate-mlperf-inference-user-conf/customize.py +++ b/cm-mlops/script/generate-mlperf-inference-user-conf/customize.py @@ -109,13 +109,16 @@ def preprocess(i): value = env.get('CM_MLPERF_LOADGEN_TARGET_QPS') elif scenario in [ 'SingleStream', 'MultiStream' ]: metric = "target_latency" - tolerance = 0.4 #much lower because we have max_duration value = env.get('CM_MLPERF_LOADGEN_TARGET_LATENCY') if value: if scenario == "SingleStream" and (1000/float(value) * 660 < 100): env['CM_MLPERF_USE_MAX_DURATION'] = 'no' elif scenario == "MultiStream" and (1000/float(value) * 660 < 662): env['CM_MLPERF_USE_MAX_DURATION'] = 'no' + if env.get('CM_MLPERF_MODEL_EQUAL_ISSUE_MODE', 'no').lower() not in [ "yes", "1", "true" ] and env.get('CM_MLPERF_USE_MAX_DURATION', "yes").lower() not in [ "no", "false", "0"]: + tolerance = 0.4 #much lower because we have max_duration + else: + tolerance = 0.9 else: return {'return': 1, 'error': 'Invalid scenario: {}'.format(scenario)} @@ -346,18 +349,18 @@ def 
run_files_exist(mode, OUTPUT_DIR, run_files, env): required_files = run_files[file_loc[mode]] if mode == "performance_power": - for file in run_files[2]: - file_path = os.path.join(os.path.dirname(OUTPUT_DIR), "power", file) + for file_ in run_files[2]: + file_path = os.path.join(os.path.dirname(OUTPUT_DIR), "power", file_) if (not os.path.exists(file_path) or os.stat(file_path).st_size == 0): return False required_files += run_files[1] #We need performance files too in the run directory - for file in required_files: - file_path = os.path.join(OUTPUT_DIR, file) - if (not os.path.exists(file_path) or os.stat(file_path).st_size == 0) and file != "accuracy.txt": + for file_ in required_files: + file_path = os.path.join(OUTPUT_DIR, file_) + if (not os.path.exists(file_path) or os.stat(file_path).st_size == 0) and file_ != "accuracy.txt": return False - if file == "mlperf_log_detail.txt" and "performance" in mode: + if file_ == "mlperf_log_detail.txt" and "performance" in mode: mlperf_log = MLPerfLog(file_path) if ( "result_validity" not in mlperf_log.get_keys() diff --git a/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml b/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml index e698ac91da..89fdbb485b 100644 --- a/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml +++ b/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml @@ -89,9 +89,6 @@ input_mapping: skip_postprocess: CM_MLPERF_NVIDIA_HARNESS_SKIP_POSTPROCESS embedding_weights_on_gpu_part: CM_MLPERF_NVIDIA_HARNESS_EMBEDDING_WEIGHTS_ON_GPU_PART -new_state_keys: - - mlperf-inference-implementation - - CM_SUT_* # Dependencies on other CM scripts @@ -775,6 +772,9 @@ variations: - CM_ML_MODEL_* - CM_HW_NAME - CM_MAX_EXAMPLES + new_state_keys: + - mlperf-inference-implementation + - CM_SUT_* build_engine_options.#: group: build-engine-options diff --git a/cm-mlops/script/run-mlperf-inference-app/customize.py b/cm-mlops/script/run-mlperf-inference-app/customize.py index 
2647e7922e..d415582a1b 100644 --- a/cm-mlops/script/run-mlperf-inference-app/customize.py +++ b/cm-mlops/script/run-mlperf-inference-app/customize.py @@ -148,7 +148,7 @@ def preprocess(i): print ('=========================================================') - local_keys = [ 'CM_MLPERF_SKIP_RUN', 'CM_MLPERF_LOADGEN_QUERY_COUNT' ] + #local_keys = [ 'CM_MLPERF_SKIP_RUN', 'CM_MLPERF_LOADGEN_QUERY_COUNT', 'CM_MLPERF_LOADGEN_TARGET_QPS', 'CM_MLPERF_LOADGEN_TARGET_LATENCY' ] for scenario in env['CM_MLPERF_LOADGEN_SCENARIOS']: scenario_tags = tags + ",_"+scenario.lower() @@ -172,32 +172,22 @@ def preprocess(i): print(f"\nRunning loadgen scenario: {scenario} and mode: {mode}") ii = {'action':action, 'automation':'script', 'tags': scenario_tags, 'quiet': 'true', - 'env': env, 'input': inp, 'state': state, 'add_deps': add_deps, 'add_deps_recursive': + 'env': copy.deepcopy(env), 'input': inp, 'state': copy.deepcopy(state), 'add_deps': copy.deepcopy(add_deps), 'add_deps_recursive': copy.deepcopy(add_deps_recursive), 'ad': ad, 'adr': copy.deepcopy(adr), 'v': verbose, 'print_env': print_env, 'print_deps': print_deps, 'dump_version_info': dump_version_info} r = cm.access(ii) if r['return'] > 0: return r - if env.get('CM_MLPERF_SKIP_RUN', '') != '': - del(env['CM_MLPERF_SKIP_RUN']) - - for key in local_keys: - if env.get(key, '') != '': - del(env[key]) if env.get("CM_MLPERF_LOADGEN_COMPLIANCE", "") == "yes": for test in test_list: env['CM_MLPERF_LOADGEN_COMPLIANCE_TEST'] = test env['CM_MLPERF_LOADGEN_MODE'] = "compliance" r = cm.access({'action':action, 'automation':'script', 'tags': scenario_tags, 'quiet': 'true', - 'env': env, 'input': inp, 'state': state, 'add_deps': add_deps, 'add_deps_recursive': + 'env': copy.deepcopy(env), 'input': inp, 'state': copy.deepcopy(state), 'add_deps': copy.deepcopy(add_deps), 'add_deps_recursive': copy.deepcopy(add_deps_recursive), 'adr': copy.deepcopy(adr), 'ad': ad, 'v': verbose, 'print_env': print_env, 'print_deps': print_deps, 
'dump_version_info': dump_version_info}) if r['return'] > 0: return r - for key in local_keys: - if env.get(key, '') != '': - del(env[key]) - if state.get("cm-mlperf-inference-results"): #print(state["cm-mlperf-inference-results"]) for sut in state["cm-mlperf-inference-results"]:#only one sut will be there From 2539269a64c013c99f7bd93f83e6abeed3592369 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 19:43:41 +0000 Subject: [PATCH 21/42] Mark invalid results in results table --- .../script/app-mlperf-inference/customize.py | 5 +- .../mlperf_utils.py | 52 +++++++++++++++---- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/cm-mlops/script/app-mlperf-inference/customize.py b/cm-mlops/script/app-mlperf-inference/customize.py index f9c714bf1a..cb52c14acb 100644 --- a/cm-mlops/script/app-mlperf-inference/customize.py +++ b/cm-mlops/script/app-mlperf-inference/customize.py @@ -203,7 +203,7 @@ def postprocess(i): if os.path.exists(env['CM_MLPERF_USER_CONF']): shutil.copy(env['CM_MLPERF_USER_CONF'], 'user.conf') - result = mlperf_utils.get_result_from_log(env['CM_MLPERF_LAST_RELEASE'], model, scenario, output_dir, mode) + result, valid = mlperf_utils.get_result_from_log(env['CM_MLPERF_LAST_RELEASE'], model, scenario, output_dir, mode) power = None power_efficiency = None if mode == "performance": @@ -221,8 +221,10 @@ def postprocess(i): if not state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model].get(scenario): state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario] = {} state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario][mode] = result + state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario][mode]['valid'] = valid[mode] if power: state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario]['power'] = power + state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario]['power']['valid'] = 
valid['power'] if power_efficiency: state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario]['power_efficiency'] = power_efficiency @@ -281,7 +283,6 @@ def postprocess(i): for xd in xdirs: xpath = os.path.join(cache.path, xd) - print (xpath) if os.path.isdir(xpath): r = cm.access({'action':'system', 'automation':'utils', 'path':xpath, 'cmd':'git rev-parse HEAD'}) if r['return'] == 0 and r['ret'] == 0: diff --git a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py index d35f0f2450..5bbfcdf637 100644 --- a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py +++ b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py @@ -16,9 +16,19 @@ def get_result_from_log(version, model, scenario, result_path, mode): #scenario = checker.SCENARIO_MAPPING[scenario] result = '' + valid = {} if mode == "performance": has_power = os.path.exists(os.path.join(result_path, "power")) result_ = checker.get_performance_metric(config, mlperf_model, result_path, scenario, None, None, has_power) + mlperf_log = MLPerfLog(result_path) + if ( + "result_validity" not in mlperf_log.get_keys() + or mlperf_log["result_validity"] != "VALID" + ): + valid['performance'] = False + else: + valid['performance'] = True + if "stream" in scenario.lower(): result = result_ / 1000000 #convert to milliseconds else: @@ -26,13 +36,15 @@ def get_result_from_log(version, model, scenario, result_path, mode): result = str(round(result, 3)) if has_power: - is_valid, power_metric, scenario, avg_power_efficiency = checker.get_power_metric(config, scenario, result_path, True, result_) + power_valid, power_metric, scenario, avg_power_efficiency = checker.get_power_metric(config, scenario, result_path, True, result_) result += f",{power_metric},{avg_power_efficiency}" + valid['power'] = power_valid elif mode == "accuracy" and os.path.exists(os.path.join(result_path, 'accuracy.txt')): - acc_results, acc_targets, 
acc_limits = get_accuracy_metric(config, mlperf_model, result_path) + acc_valid, acc_results, acc_targets, acc_limits = get_accuracy_metric(config, mlperf_model, result_path) + valid['accuracy'] = acc_valid if len(acc_results) == 1: for acc in acc_results: @@ -44,7 +56,7 @@ def get_result_from_log(version, model, scenario, result_path, mode): result_list.append(str(round(float(acc_results[acc]), 5))) result += ", ".join(result_list) + ")" - return result + return result, valid def get_accuracy_metric(config, model, path): @@ -110,7 +122,7 @@ def get_accuracy_metric(config, model, path): is_valid &= acc_limit_check - return acc_results, acc_targets, acc_limits + return is_valid, acc_results, acc_targets, acc_limits def get_result_string(version, model, scenario, result_path, has_power, sub_res): @@ -155,7 +167,7 @@ def get_result_string(version, model, scenario, result_path, has_power, sub_res) result['power'] = power_result result['power_efficiency'] = power_efficiency_result - acc_results, acc_targets, acc_limits = get_accuracy_metric(config, mlperf_model, accuracy_path) + acc_valid, acc_results, acc_targets, acc_limits = get_accuracy_metric(config, mlperf_model, accuracy_path) result_field = checker.RESULT_FIELD[effective_scenario] @@ -201,7 +213,10 @@ def get_result_table(results): row.append(model) row.append(scenario) if results[model][scenario].get('accuracy'): - row.append(results[model][scenario]['accuracy']) + val = str(results[model][scenario]['accuracy']) + if not results[model][scenario]['accuracy']['valid']: + val = "X "+val + row.append(val) else: row.append("-") @@ -211,18 +226,33 @@ def get_result_table(results): if float(results[model][scenario]['performance']) == 0: row.append("-") elif scenario.lower() == "singlestream": - row.append(str(round(1000/float(results[model][scenario]['performance']), 3))) + val_qps = str(round(1000/float(results[model][scenario]['performance']), 3)) + if not results[model][scenario]['performance']['valid']: + val_qps 
= "X "+val_qps + row.appenx(val_qps) elif scenario.lower() == "multistream": - row.append(str(round(8000/float(results[model][scenario]['performance']), 3))) - row.append(results[model][scenario]['performance']) + val_qps = str(round(8000/float(results[model][scenario]['performance']), 3)) + if not results[model][scenario]['performance']['valid']: + val_qps = "X "+val_qps + row.appenx(val_qps) + val = str(results[model][scenario]['performance']) + if not results[model][scenario]['performance']['valid']: + val = "X "+val + row.append(val) else: - row.append(results[model][scenario]['performance']) + val = str(results[model][scenario]['performance']) + if not results[model][scenario]['performance']['valid']: + val = "X "+val + row.append(val) row.append("-") #if results[model][scenario].get('power','') != '': # row.append(results[model][scenario]['power']) if results[model][scenario].get('power_efficiency','') != '': - row.append(results[model][scenario]['power_efficiency']) + val = str(results[model][scenario]['power_efficiency']) + if not results[model][scenario]['power']['valid']: + val = "X "+val + row.append(val) table.append(row) return table, headers From d0953cbb021069e8832b7d63331d5a39bd1beaae Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 19:53:02 +0000 Subject: [PATCH 22/42] Fix mlperf log path --- cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py index 5bbfcdf637..53d5a092c5 100644 --- a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py +++ b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py @@ -20,7 +20,7 @@ def get_result_from_log(version, model, scenario, result_path, mode): if mode == "performance": has_power = os.path.exists(os.path.join(result_path, "power")) result_ = checker.get_performance_metric(config, mlperf_model, 
result_path, scenario, None, None, has_power) - mlperf_log = MLPerfLog(result_path) + mlperf_log = MLPerfLog(os.path.join(result_path, mlperf_log_detail.txt)) if ( "result_validity" not in mlperf_log.get_keys() or mlperf_log["result_validity"] != "VALID" From cb685bceefc33a68fe68ea3eeeeb6c8c948fa5b4 Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Feb 2024 12:00:55 -0800 Subject: [PATCH 23/42] Fix SPR nvidia configs --- .../_cm.yaml | 14 ++++++++++---- .../customize.py | 8 +++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml index 2d67773b95..4259ef207e 100644 --- a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml +++ b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml @@ -178,7 +178,7 @@ variations: 3d-unet-99.9: cuda: tensorrt: - offline_target_qps: 1 + offline_target_qps: 4 rnnt: cuda: tensorrt: @@ -188,7 +188,7 @@ variations: tensorrt: offline_target_qps: 4.5 - spr,nvidia: + sapphire-rapids.24c,nvidia: default_env: EXTRA_ARGS: " --gpu_name=rtx_4090" state: @@ -211,7 +211,13 @@ variations: 3d-unet-99.9: cuda: tensorrt: - offline_target_qps: 2 + offline_target_qps: 8 + singlestream_target_latency: 400 + 3d-unet-99.9: + cuda: + tensorrt: + offline_target_qps: 8 + singlestream_target_latency: 400 rnnt: cuda: tensorrt: @@ -230,7 +236,7 @@ variations: sapphire-rapids.24c: group: sut env: - CATEGORY: edge,datacenter + CATEGORY: edge DIVISION: closed macbookpro-m1: diff --git a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py index 174bf4c5eb..b21aa4ee54 100644 --- a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py +++ b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py @@ -100,7 +100,7 @@ def preprocess(i): for 
device in devices: offline_target_qps = (((state.get(model, {})).get(device, {})).get(backend, {})).get('offline_target_qps') if offline_target_qps: - pass + env['EXTRA_ARGS'] += f" --offline_target_qps={offline_target_qps}" else: #try to do a test run with reasonable number of samples to get and record the actual system performance if device == "cpu": if model == "resnet50": @@ -118,6 +118,12 @@ def preprocess(i): cmd = f'run_test "{model}" "{backend}" "100" "{implementation}" "{device}" "$submission_cmd"' cmds.append(cmd) + singlestream_target_latency = (((state.get(model, {})).get(device, {})).get(backend, {})).get('singlestream_target_latency') + if singlestream_target_latency: + env['EXTRA_ARGS'] += f" --singlestream_target_latency={singlestream_target_latency}" + + print(f"EXTRA_ARGS={env['EXTRA_ARGS']}") + run_script_content += "\n\n" +"\n\n".join(cmds) with open(os.path.join(script_path, run_file_name+".sh"), 'w') as f: f.write(run_script_content) From ebf3223a0f1541a3f4f29408df4778fedb24731f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 20:02:02 +0000 Subject: [PATCH 24/42] Fix mlperf log path --- cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py index 53d5a092c5..911b9400fd 100644 --- a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py +++ b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py @@ -20,7 +20,7 @@ def get_result_from_log(version, model, scenario, result_path, mode): if mode == "performance": has_power = os.path.exists(os.path.join(result_path, "power")) result_ = checker.get_performance_metric(config, mlperf_model, result_path, scenario, None, None, has_power) - mlperf_log = MLPerfLog(os.path.join(result_path, mlperf_log_detail.txt)) + mlperf_log = MLPerfLog(os.path.join(result_path, "mlperf_log_detail.txt")) if ( 
"result_validity" not in mlperf_log.get_keys() or mlperf_log["result_validity"] != "VALID" From 1f7bfbecbd66d59eaf9ac6b85f6378a2c26e30b6 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 20:19:44 +0000 Subject: [PATCH 25/42] Fix for invalid results in results table --- cm-mlops/script/app-mlperf-inference/customize.py | 4 ++-- .../get-mlperf-inference-utils/mlperf_utils.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cm-mlops/script/app-mlperf-inference/customize.py b/cm-mlops/script/app-mlperf-inference/customize.py index cb52c14acb..1b1215d111 100644 --- a/cm-mlops/script/app-mlperf-inference/customize.py +++ b/cm-mlops/script/app-mlperf-inference/customize.py @@ -221,10 +221,10 @@ def postprocess(i): if not state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model].get(scenario): state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario] = {} state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario][mode] = result - state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario][mode]['valid'] = valid[mode] + state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario][mode+'_valid'] = valid[mode] if power: state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario]['power'] = power - state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario]['power']['valid'] = valid['power'] + state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario]['power_valid'] = valid['power'] if power_efficiency: state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario]['power_efficiency'] = power_efficiency diff --git a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py index 911b9400fd..9628434c08 100644 --- a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py 
+++ b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py @@ -214,7 +214,7 @@ def get_result_table(results): row.append(scenario) if results[model][scenario].get('accuracy'): val = str(results[model][scenario]['accuracy']) - if not results[model][scenario]['accuracy']['valid']: + if not results[model][scenario]['accuracy_valid']: val = "X "+val row.append(val) else: @@ -227,21 +227,21 @@ def get_result_table(results): row.append("-") elif scenario.lower() == "singlestream": val_qps = str(round(1000/float(results[model][scenario]['performance']), 3)) - if not results[model][scenario]['performance']['valid']: + if not results[model][scenario]['performance_valid']: val_qps = "X "+val_qps row.appenx(val_qps) elif scenario.lower() == "multistream": val_qps = str(round(8000/float(results[model][scenario]['performance']), 3)) - if not results[model][scenario]['performance']['valid']: + if not results[model][scenario]['performance_valid']: val_qps = "X "+val_qps row.appenx(val_qps) val = str(results[model][scenario]['performance']) - if not results[model][scenario]['performance']['valid']: + if not results[model][scenario]['performance_valid']: val = "X "+val row.append(val) else: val = str(results[model][scenario]['performance']) - if not results[model][scenario]['performance']['valid']: + if not results[model][scenario]['performance_valid']: val = "X "+val row.append(val) row.append("-") @@ -250,7 +250,7 @@ def get_result_table(results): # row.append(results[model][scenario]['power']) if results[model][scenario].get('power_efficiency','') != '': val = str(results[model][scenario]['power_efficiency']) - if not results[model][scenario]['power']['valid']: + if not results[model][scenario]['power_valid']: val = "X "+val row.append(val) table.append(row) From 36f4eb2e088943ffab116af768d577939cd5c762 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 20:38:25 +0000 Subject: [PATCH 26/42] fix typo --- 
.../script/get-mlperf-inference-submission-dir/customize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/get-mlperf-inference-submission-dir/customize.py b/cm-mlops/script/get-mlperf-inference-submission-dir/customize.py index 8f94abb151..92fb3735ce 100644 --- a/cm-mlops/script/get-mlperf-inference-submission-dir/customize.py +++ b/cm-mlops/script/get-mlperf-inference-submission-dir/customize.py @@ -15,7 +15,7 @@ def preprocess(i): if env.get('CM_MLPERF_INFERENCE_SUBMISSION_DIR','') == '': if not os.path.exists("mlperf-inference-submission"): - os.makedir("mlperf-inference-submission") + os.mkdir("mlperf-inference-submission") env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] = os.path.join(os.getcwd(), "mlperf-inference-submission") return {'return':0} From 4f6bfb4a7c10904708fc75b03b69d798a046a9e0 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 20:46:58 +0000 Subject: [PATCH 27/42] Fix results_dir for submission generation --- .../script/generate-mlperf-inference-submission/customize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/generate-mlperf-inference-submission/customize.py b/cm-mlops/script/generate-mlperf-inference-submission/customize.py index f71d07648d..34d63dd9f4 100644 --- a/cm-mlops/script/generate-mlperf-inference-submission/customize.py +++ b/cm-mlops/script/generate-mlperf-inference-submission/customize.py @@ -19,7 +19,7 @@ def generate_submission(i): inp=i['input'] if env.get('CM_MLPERF_INFERENCE_RESULTS_DIR_', '') == '': - env['CM_MLPERF_INFERENCE_RESULTS_DIR'] = os.path.join(env['CM_MLPERF_INFERENCE_RESULTS_DIR'], "valid_results") + env['CM_MLPERF_INFERENCE_RESULTS_DIR'] = os.path.join(env['CM_MLPERF_INFERENCE_RESULTS_DIR'], f"{env['CM_MLPERF_RUN_STYLE']}_results") mlperf_path = env['CM_MLPERF_INFERENCE_SOURCE'] submission_checker_dir = os.path.join(mlperf_path, "tools", "submission") From a660db20f5eade9ee9bd4edd9ffca31287dfff5e Mon Sep 17 00:00:00 2001 From: 
Arjun Suresh Date: Thu, 15 Feb 2024 21:04:17 +0000 Subject: [PATCH 28/42] Fix results table for short run --- cm-mlops/script/app-mlperf-inference/customize.py | 1 + .../get-mlperf-inference-utils/mlperf_utils.py | 14 +++++++------- .../script/run-mlperf-inference-app/customize.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cm-mlops/script/app-mlperf-inference/customize.py b/cm-mlops/script/app-mlperf-inference/customize.py index 1b1215d111..39f6dca9ee 100644 --- a/cm-mlops/script/app-mlperf-inference/customize.py +++ b/cm-mlops/script/app-mlperf-inference/customize.py @@ -222,6 +222,7 @@ def postprocess(i): state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario] = {} state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario][mode] = result state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario][mode+'_valid'] = valid[mode] + if power: state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario]['power'] = power state['cm-mlperf-inference-results'][state['CM_SUT_CONFIG_NAME']][model][scenario]['power_valid'] = valid['power'] diff --git a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py index 9628434c08..1eb4cf9d9f 100644 --- a/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py +++ b/cm-mlops/script/get-mlperf-inference-utils/mlperf_utils.py @@ -214,9 +214,9 @@ def get_result_table(results): row.append(scenario) if results[model][scenario].get('accuracy'): val = str(results[model][scenario]['accuracy']) - if not results[model][scenario]['accuracy_valid']: + if not results[model][scenario].get('accuracy_valid', True): val = "X "+val - row.append(val) + row.append(val) else: row.append("-") @@ -227,21 +227,21 @@ def get_result_table(results): row.append("-") elif scenario.lower() == "singlestream": val_qps = 
str(round(1000/float(results[model][scenario]['performance']), 3)) - if not results[model][scenario]['performance_valid']: + if not results[model][scenario].get('performance_valid', True): # we explicitly mark invalid results val_qps = "X "+val_qps row.appenx(val_qps) elif scenario.lower() == "multistream": val_qps = str(round(8000/float(results[model][scenario]['performance']), 3)) - if not results[model][scenario]['performance_valid']: + if not results[model][scenario].get('performance_valid', True): val_qps = "X "+val_qps row.appenx(val_qps) val = str(results[model][scenario]['performance']) - if not results[model][scenario]['performance_valid']: + if not results[model][scenario].get('performance_valid', True): val = "X "+val row.append(val) else: val = str(results[model][scenario]['performance']) - if not results[model][scenario]['performance_valid']: + if not results[model][scenario].get('performance_valid', True): val = "X "+val row.append(val) row.append("-") @@ -250,7 +250,7 @@ def get_result_table(results): # row.append(results[model][scenario]['power']) if results[model][scenario].get('power_efficiency','') != '': val = str(results[model][scenario]['power_efficiency']) - if not results[model][scenario]['power_valid']: + if not results[model][scenario].get('power_valid', True): val = "X "+val row.append(val) table.append(row) diff --git a/cm-mlops/script/run-mlperf-inference-app/customize.py b/cm-mlops/script/run-mlperf-inference-app/customize.py index d415582a1b..70c48bd857 100644 --- a/cm-mlops/script/run-mlperf-inference-app/customize.py +++ b/cm-mlops/script/run-mlperf-inference-app/customize.py @@ -189,7 +189,7 @@ def preprocess(i): return r if state.get("cm-mlperf-inference-results"): - #print(state["cm-mlperf-inference-results"]) + print(state["cm-mlperf-inference-results"]) for sut in state["cm-mlperf-inference-results"]:#only one sut will be there print(sut) result_table, headers = 
mlperf_utils.get_result_table(state["cm-mlperf-inference-results"][sut]) From 2d07731562f8d60a73bd380b7ec0b979df21a1c7 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 21:04:51 +0000 Subject: [PATCH 29/42] Fix results table for short run --- cm-mlops/script/run-mlperf-inference-app/customize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/run-mlperf-inference-app/customize.py b/cm-mlops/script/run-mlperf-inference-app/customize.py index 70c48bd857..d415582a1b 100644 --- a/cm-mlops/script/run-mlperf-inference-app/customize.py +++ b/cm-mlops/script/run-mlperf-inference-app/customize.py @@ -189,7 +189,7 @@ def preprocess(i): return r if state.get("cm-mlperf-inference-results"): - print(state["cm-mlperf-inference-results"]) + #print(state["cm-mlperf-inference-results"]) for sut in state["cm-mlperf-inference-results"]:#only one sut will be there print(sut) result_table, headers = mlperf_utils.get_result_table(state["cm-mlperf-inference-results"][sut]) From 58251a66603a2045750eba2c624f60a4185d211e Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 15 Feb 2024 22:38:37 +0100 Subject: [PATCH 30/42] demos --- cm-mlops/automation/script/module.py | 12 +- .../README-extra.md | 30 +++++ .../app-stable-diffusion-onnx-py/_cm.yaml | 105 ++++++++++++++++++ .../app-stable-diffusion-onnx-py/process.py | 32 ++++++ .../app-stable-diffusion-onnx-py/run.bat | 2 + .../app-stable-diffusion-onnx-py/run.sh | 4 + .../download_model.py | 37 ++++-- 7 files changed, 210 insertions(+), 12 deletions(-) create mode 100644 cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md create mode 100644 cm-mlops/script/app-stable-diffusion-onnx-py/_cm.yaml create mode 100644 cm-mlops/script/app-stable-diffusion-onnx-py/process.py create mode 100644 cm-mlops/script/app-stable-diffusion-onnx-py/run.bat create mode 100644 cm-mlops/script/app-stable-diffusion-onnx-py/run.sh diff --git a/cm-mlops/automation/script/module.py 
b/cm-mlops/automation/script/module.py index 0a51cff7eb..a40299c735 100644 --- a/cm-mlops/automation/script/module.py +++ b/cm-mlops/automation/script/module.py @@ -4133,12 +4133,14 @@ def prepare_and_run_script_with_postprocessing(i, postprocess="postprocess"): print (r['string']) print ("") - note = '''^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + note = ''' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Note that it may be a portability issue of a third-party tool or a native script -wrapped and unified by this portable CM script. In such case, please report this issue -with a full log at "https://github.com/mlcommons/ck". The CM concept is to collaboratively -fix such issues inside portable CM scripts to make existing tools and native scripts -more portable, interoperable and deterministic. Thank you''' +wrapped and unified by this automation recipe (CM script). In such case, +please report this issue with a full log at "https://github.com/mlcommons/ck". +The CM concept is to collaboratively fix such issues inside portable CM scripts +to make existing tools and native scripts more portable, interoperable +and deterministic. 
Thank you!''' return {'return':2, 'error':'Portable CM script failed (name = {}, return code = {})\n\n{}'.format(meta['alias'], rc, note)} diff --git a/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md b/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md new file mode 100644 index 0000000000..1da13b460d --- /dev/null +++ b/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md @@ -0,0 +1,30 @@ +# Examples + +CM interface for https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/onnx + +```bash +cm run script "install python-venv" --name=sd-test +cm run script "get generic-python-lib _package.optimum[onnxruntime]" --adr.python.name=sd-test +cm run script "activate python-venv" --name=sd-test + +cm run script "python app stable-diffusion onnx" --adr.python.name=sd-test --text="crazy programmer" + +cm rm cache -f +cm run script "python app stable-diffusion onnx _cuda" --adr.python.name=sd-test --text="crazy programmer" + +cm docker script "python app stable-diffusion onnx" --text="crazy programmer" --output=. 
--env.CM_DOCKER_ADD_FLAG_TO_CM_MLOPS_REPO=xyz + +``` + + + +# Resources + +* https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 +* https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main +* https://huggingface.co/CompVis/stable-diffusion-v1-4/tree/main +* https://huggingface.co/runwayml/stable-diffusion-v1-5 +* https://huggingface.co/bes-dev/stable-diffusion-v1-4-onnx +* https://onnxruntime.ai/docs/tutorials/csharp/stable-diffusion-csharp.html +* https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main +* https://huggingface.co/docs/optimum/onnxruntime/usage_guides/models diff --git a/cm-mlops/script/app-stable-diffusion-onnx-py/_cm.yaml b/cm-mlops/script/app-stable-diffusion-onnx-py/_cm.yaml new file mode 100644 index 0000000000..bbdb1cc594 --- /dev/null +++ b/cm-mlops/script/app-stable-diffusion-onnx-py/_cm.yaml @@ -0,0 +1,105 @@ +alias: app-stable-diffusion-onnx-py +uid: 4d33981ac3534b3b + +automation_alias: script +automation_uid: 5b4e0237da074764 + +category: "Modular AI/ML application pipeline" + +tags: +- app +- stable +- diffusion +- stable-diffusion +- onnx +- python + + +deps: +- tags: detect,os +- tags: get,sys-utils-cm +- names: + - python + - python3 + tags: get,python3 + +- tags: get,cuda + names: + - cuda + enable_if_env: + USE_CUDA: + - yes +- tags: get,cudnn + names: + - cudnn + enable_if_env: + USE_CUDA: + - yes + + + + + + + +- tags: get,generic-python-lib,_package.optimum[onnxruntime] + names: + - optimum + skip_if_env: + USE_CUDA: + - yes + +- tags: get,generic-python-lib,_package.optimum[onnxruntime-gpu] + names: + - optimum + enable_if_env: + USE_CUDA: + - yes + +- tags: get,generic-python-lib,_package.diffusers + names: + - diffusers + + +- tags: get,ml-model,huggingface,zoo,_model-stub.runwayml/stable-diffusion-v1-5 + revision: onnx + model_filename: model_index.json + full_subfolder: . 
+ + +variations: + cuda: + group: target + env: + USE_CUDA: yes + CM_DEVICE: cuda:0 + + cpu: + group: target + default: yes + env: + USE_CPU: yes + CM_DEVICE: cpu + +input_mapping: + text: CM_APP_STABLE_DIFFUSION_ONNX_PY_TEXT + output: CM_APP_STABLE_DIFFUSION_ONNX_PY_OUTPUT + + +input_description: + text: + desc: "Text to generate image" + output: + desc: "Output directory" + + +docker: + skip_run_cmd: 'no' + all_gpus: 'yes' + input_paths: + - text + - output + skip_input_for_fake_run: + - text + - output + - env.CM_DOCKER_ADD_FLAG_TO_CM_MLOPS_REPO diff --git a/cm-mlops/script/app-stable-diffusion-onnx-py/process.py b/cm-mlops/script/app-stable-diffusion-onnx-py/process.py new file mode 100644 index 0000000000..e32b2b1fa3 --- /dev/null +++ b/cm-mlops/script/app-stable-diffusion-onnx-py/process.py @@ -0,0 +1,32 @@ +# https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/onnx + +import os + +from optimum.onnxruntime import ORTStableDiffusionPipeline + +output = os.environ.get('CM_APP_STABLE_DIFFUSION_ONNX_PY_OUTPUT','') + +f = os.path.join(output, 'output.png') + +if os.path.isfile(f): + os.remove(f) + +cm_model_path = os.environ.get('CM_ML_MODEL_PATH','') +if cm_model_path == '': + print ('Error: CM_ML_MODEL_PATH env is not defined') + exit(1) + +device = os.environ.get('CM_DEVICE','') + +pipeline = ORTStableDiffusionPipeline.from_pretrained(cm_model_path, local_files_only=True).to(device) + +text = os.environ.get('CM_APP_STABLE_DIFFUSION_ONNX_PY_TEXT','') +if text == '': text = "a photo of an astronaut riding a horse on mars" + + +print ('') +print ('Generating imaged based on "{}"'.format(text)) + +image = pipeline(text).images[0] + +image.save(f) diff --git a/cm-mlops/script/app-stable-diffusion-onnx-py/run.bat b/cm-mlops/script/app-stable-diffusion-onnx-py/run.bat new file mode 100644 index 0000000000..fbcf3a07ef --- /dev/null +++ b/cm-mlops/script/app-stable-diffusion-onnx-py/run.bat @@ -0,0 +1,2 @@ +%CM_PYTHON_BIN_WITH_PATH% 
%CM_TMP_CURRENT_SCRIPT_PATH%\process.py +IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% diff --git a/cm-mlops/script/app-stable-diffusion-onnx-py/run.sh b/cm-mlops/script/app-stable-diffusion-onnx-py/run.sh new file mode 100644 index 0000000000..efffec67f0 --- /dev/null +++ b/cm-mlops/script/app-stable-diffusion-onnx-py/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +${CM_PYTHON_BIN} ${CM_TMP_CURRENT_SCRIPT_PATH}/process.py +test $? -eq 0 || exit 1 diff --git a/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py b/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py index 5494d6f31e..0cc2006f10 100644 --- a/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py +++ b/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py @@ -19,18 +19,18 @@ f.write(f"CM_ML_MODEL_FILE_WITH_PATH={os.path.join(os.getcwd(),'')}") else: + subfolder = os.environ.get('CM_HF_SUBFOLDER', '') + full_subfolder = os.environ.get('CM_HF_FULL_SUBFOLDER', '') + model_filename = os.environ.get('CM_MODEL_ZOO_FILENAME', '') - if model_filename == '': + if model_filename == '': model_filename = 'model.onnx' - subfolder = os.environ.get('CM_HF_SUBFOLDER', '') - model_filenames = model_filename.split(',') if ',' in model_filename else [model_filename] # First must be model base_model_filename = model_filenames[0] - full_subfolder = os.environ.get('CM_HF_FULL_SUBFOLDER', '') files = [] if full_subfolder!='': @@ -43,19 +43,42 @@ print ('') print ('Listing files in {} ...'.format(path)) - files=fs.ls(path, detail=False) + def list_hf_files(path): + all_files = [] + + files=fs.ls(path, revision=revision) #, detail=False) + + for f in files: + fname = f['name'] + fdir = f['type'] == 'directory' + + if fdir: + all_files += list_hf_files(fname) + else: + all_files.append(fname) + + return all_files + + + files=list_hf_files(path) print ('') print ('Found {} files'.format(len(files))) for f in files: - ff = f[len(model_stub)+1:] + print (f) + + remove = len(model_stub)+1 + + if 
revision!='': + remove+=len(revision)+1 + + ff = f[remove:] if ff not in model_filenames: model_filenames.append(ff) - print ('') for model_filename in model_filenames: From c86e87e697aa1d8c62a28d689d06ecbc416270ee Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 21:46:27 +0000 Subject: [PATCH 31/42] Update README_aws_dl2q.24xlarge.md --- .../README_aws_dl2q.24xlarge.md | 44 +++---------------- 1 file changed, 7 insertions(+), 37 deletions(-) diff --git a/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md b/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md index a423945bdf..8561a38790 100644 --- a/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md +++ b/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md @@ -29,25 +29,15 @@ cm run script --tags=generate-run-cmds,inference,_performance-only --device=qaic ### Full valid run ``` -cm run script --tags=generate-run-cmds,inference,_performance-only --device=qaic \ +cm run script --tags=generate-run-cmds,inference,_submission --device=qaic \ --backend=glow --scenario=Offline --implementation=kilt --model=bert-99 --precision=uint8 \ --adr.mlperf-inference-implementation.tags=_loadgen-batch-size.4096,_dl2q.24xlarge \ --rerun --quiet --execution-mode=valid ``` The expected performance is ~5700 QPS -* Use `--scenario=Server --server_target_qps=5200` to run the server scenario - -### Accuracy run -``` -cm run script --tags=generate-run-cmds,inference,_accuracy-only --device=qaic \ ---backend=glow --scenario=Offline --implementation=kilt --model=bert-99 --precision=uint8 \ ---adr.mlperf-inference-implementation.tags=_loadgen-batch-size.4096,_dl2q.24xlarge \ ---rerun --quiet --execution-mode=valid -``` - The expected accuracy is ~90 - +* Use `--scenario=Server --server_target_qps=5200` to run the server scenario ## ResNet50 @@ -70,26 +60,15 @@ cm run script 
--tags=generate-run-cmds,inference,_performance-only --device=qaic ### Full valid run ``` -cm run script --tags=generate-run-cmds,inference,_performance-only --device=qaic --backend=glow \ +cm run script --tags=generate-run-cmds,inference,_submission --device=qaic --backend=glow \ --scenario=Offline --implementation=kilt --model=resnet50 \ --precision=uint8 --rerun --adr.compiler.tags=gcc \ --adr.mlperf-inference-implementation.tags=_bs.8,_dl2q.24xlarge --execution-mode=valid --quiet ``` Expected performance is ~157500 -* Use `--scenario=Server --server_target_qps=152000` to run the server scenario - -### Accuracy run - -``` -cm run script --tags=generate-run-cmds,inference,_accuracy-only --device=qaic --backend=glow \ ---scenario=Offline --implementation=kilt --model=resnet50 \ ---precision=uint8 --rerun --adr.compiler.tags=gcc \ ---adr.mlperf-inference-implementation.tags=_bs.8,_dl2q.24xlarge --execution-mode=valid --quiet -``` - Expected accuracy is 75.936% - +* Use `--scenario=Server --server_target_qps=152000` to run the server scenario ## RetinaNet @@ -106,22 +85,13 @@ cm run script --tags=generate-run-cmds,inference,_performance-only --device=qaic ### Full valid run ``` -cm run script --tags=generate-run-cmds,inference,_performance-only --device=qaic --backend=glow \ +cm run script --tags=generate-run-cmds,inference,_submission --device=qaic --backend=glow \ --scenario=Offline --implementation=kilt --model=retinanet \ --precision=uint8 --rerun --adr.compiler.tags=gcc --adr.dataset-preprocessed.tags=_custom-annotations\ --adr.mlperf-inference-implementation.tags=_bs.1,_dl2q.24xlarge --execution-mode=valid --quiet ``` Expected performance is ~2200 -* Use `--scenario=Server --server_target_qps=2050` to run the server scenario - -### Accuracy run - -``` -cm run script --tags=generate-run-cmds,inference,_accuracy-only --device=qaic --backend=glow \ ---scenario=Offline --implementation=kilt --model=retinanet \ ---precision=uint8 --rerun --adr.compiler.tags=gcc 
--adr.dataset-preprocessed.tags=_custom-annotations \ ---adr.mlperf-inference-implementation.tags=_bs.1,_dl2q.24xlarge --execution-mode=valid --quiet -``` - The expected accuracy is 37.234 +* Use `--scenario=Server --server_target_qps=2050` to run the server scenario + From a3f87c20745db3617f2d2297e3b28c63af04a397 Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Feb 2024 22:04:37 +0000 Subject: [PATCH 32/42] Fixes for dl2q qaic run --- .../script/calibrate-model-for.qaic/_cm.json | 6 ++--- .../_cm.json | 5 +--- .../_cm.json | 26 ++++++++++++++++++- cm-mlops/script/get-sys-utils-cm/customize.py | 4 +++ cm-mlops/script/get-sys-utils-cm/run-rhel.sh | 4 +++ .../script/run-mlperf-inference-app/_cm.yaml | 4 +-- .../run-mlperf-inference-app/customize.py | 2 +- 7 files changed, 40 insertions(+), 11 deletions(-) diff --git a/cm-mlops/script/calibrate-model-for.qaic/_cm.json b/cm-mlops/script/calibrate-model-for.qaic/_cm.json index 0de40a5991..17c43c685f 100644 --- a/cm-mlops/script/calibrate-model-for.qaic/_cm.json +++ b/cm-mlops/script/calibrate-model-for.qaic/_cm.json @@ -39,8 +39,7 @@ }, "names": [ "imagenet-cal", - "preprocessed-dataset", - "dataset-preprocessed" + "preprocessed-calibration-dataset" ], "tags": "get,dataset,imagenet,preprocessed,_calibration,_for.resnet50,_float32,_rgb32" }, @@ -170,7 +169,8 @@ "seq-length": "seq.384" }, "env": { - "CM_QAIC_MODEL_NAME": "bert-large" + "CM_QAIC_MODEL_NAME": "bert-large", + "CM_CREATE_INPUT_BATCH": "no" }, "adr": { "model-src": { diff --git a/cm-mlops/script/get-preprocessed-dataset-imagenet/_cm.json b/cm-mlops/script/get-preprocessed-dataset-imagenet/_cm.json index 1c7f7a625f..99aad32e8a 100644 --- a/cm-mlops/script/get-preprocessed-dataset-imagenet/_cm.json +++ b/cm-mlops/script/get-preprocessed-dataset-imagenet/_cm.json @@ -35,11 +35,8 @@ }, { "tags": "get,dataset,imagenet,calibration", - "skip_if_env": { - "CM_IMAGENET_PREPROCESSED_PATH": [ "on" ] - }, "enable_if_env": { - "CM_DATASET_TYPE": "calibration" + 
"CM_DATASET_TYPE": [ "calibration" ] } }, { diff --git a/cm-mlops/script/get-preprocessed-dataset-openimages/_cm.json b/cm-mlops/script/get-preprocessed-dataset-openimages/_cm.json index 766f9ddde5..0f796a7c6d 100644 --- a/cm-mlops/script/get-preprocessed-dataset-openimages/_cm.json +++ b/cm-mlops/script/get-preprocessed-dataset-openimages/_cm.json @@ -39,6 +39,10 @@ { "tags": "get,generic-python-lib,_package.ujson" }, + { + "names": [ "numpy" ], + "tags": "get,generic-python-lib,_numpy" + }, { "names": [ "numpy" ], "tags": "get,generic-python-lib,_numpy" @@ -72,6 +76,20 @@ "variations": { "generic-preprocessor": { "group": "preprocessing-source", + "deps": [ + { + "tags": "get,generic-python-lib,_torch", + "names": [ + "torch", "pytorch" + ] + }, + { + "tags": "get,generic-python-lib,_torchvision", + "names": [ + "torchvision" + ] + } + ], "prehook_deps": [ { "tags": "get,generic,image-preprocessor" @@ -286,10 +304,16 @@ "CM_DATASET_PREPROCESSED_EXTENSION": "npy" } }, + "raw": { + "group": "extension", + "env": { + "CM_DATASET_PREPROCESSED_EXTENSION": "raw" + } + }, "fp32": { "group": "dataset-precision", "default_variations": { - "extension": "npy" + "extension": "raw" }, "default": true, "env": { diff --git a/cm-mlops/script/get-sys-utils-cm/customize.py b/cm-mlops/script/get-sys-utils-cm/customize.py index d407a63611..e9a1890852 100644 --- a/cm-mlops/script/get-sys-utils-cm/customize.py +++ b/cm-mlops/script/get-sys-utils-cm/customize.py @@ -10,6 +10,10 @@ def preprocess(i): automation = i['automation'] cm = automation.cmind + if env.get('CM_HOST_OS_FLAVOR', '') == 'amzn': + env['CM_PACKAGE_TOOL'] = "yum" + i['run_script_input']['script_name'] = "run-rhel" + # Test (not needed - will be removed) if env.get('CM_SKIP_SYS_UTILS','').lower() in [True, 'yes', 'on']: return {'return':0, 'skip':True} diff --git a/cm-mlops/script/get-sys-utils-cm/run-rhel.sh b/cm-mlops/script/get-sys-utils-cm/run-rhel.sh index 9e3959d36d..f247f807e2 100644 --- 
a/cm-mlops/script/get-sys-utils-cm/run-rhel.sh +++ b/cm-mlops/script/get-sys-utils-cm/run-rhel.sh @@ -11,6 +11,10 @@ if [[ "$CM_QUIET" != "yes" ]]; then if [[ "$DUMMY" == "skip" ]]; then exit 0; fi fi +if [[ "$CM_HOST_OS_FLAVOR" == "amzn" ]]; then + ${CM_SUDO} yum groupinstall "Development Tools" +fi + CM_PACKAGE_TOOL=${CM_PACKAGE_TOOL:-dnf} ${CM_SUDO} ${CM_PACKAGE_TOOL} update && \ diff --git a/cm-mlops/script/run-mlperf-inference-app/_cm.yaml b/cm-mlops/script/run-mlperf-inference-app/_cm.yaml index d130a44650..0368c82e43 100644 --- a/cm-mlops/script/run-mlperf-inference-app/_cm.yaml +++ b/cm-mlops/script/run-mlperf-inference-app/_cm.yaml @@ -47,7 +47,7 @@ input_mapping: docker: CM_MLPERF_USE_DOCKER dump_version_info: CM_DUMP_VERSION_INFO save_console_log: CM_SAVE_CONSOLE_LOG - execution_mode: CM_MLPERF_EXECUTION_MODE + execution_mode: CM_MLPERF_RUN_STYLE find_performance: CM_MLPERF_FIND_PERFORMANCE_MODE gpu_name: CM_NVIDIA_GPU_NAME hw_name: CM_HW_NAME @@ -74,7 +74,7 @@ input_mapping: results_dir: OUTPUT_BASE_DIR results_git_url: CM_MLPERF_RESULTS_GIT_REPO_URL run_checker: CM_RUN_SUBMISSION_CHECKER - run_style: CM_MLPERF_EXECUTION_MODE + run_style: CM_MLPERF_RUN_STYLE scenario: CM_MLPERF_LOADGEN_SCENARIO server_target_qps: CM_MLPERF_LOADGEN_SERVER_TARGET_QPS singlestream_target_latency: CM_MLPERF_LOADGEN_SINGLESTREAM_TARGET_LATENCY diff --git a/cm-mlops/script/run-mlperf-inference-app/customize.py b/cm-mlops/script/run-mlperf-inference-app/customize.py index d415582a1b..b421200692 100644 --- a/cm-mlops/script/run-mlperf-inference-app/customize.py +++ b/cm-mlops/script/run-mlperf-inference-app/customize.py @@ -113,7 +113,7 @@ def preprocess(i): variation_model= ",_" + env["CM_MLPERF_MODEL"] variation_backend= ",_" + env["CM_MLPERF_BACKEND"] if env.get("CM_MLPERF_BACKEND","") != "" else "" variation_device= ",_" + env["CM_MLPERF_DEVICE"] if env.get("CM_MLPERF_DEVICE","") != "" else "" - variation_run_style= ",_" + env.get("CM_MLPERF_EXECUTION_MODE", "test") + 
variation_run_style= ",_" + env.get("CM_MLPERF_RUN_STYLE", "test") variation_reproducibility= ",_" + env["CM_RUN_MLPERF_INFERENCE_APP_DEFAULTS"] if env.get("CM_MLPERF_MODEL_PRECISION", '') != '': From 1256385ee4395b125e9d9576569cda7afea8bf5c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 15 Feb 2024 22:06:24 +0000 Subject: [PATCH 33/42] Update README_aws_dl2q.24xlarge.md --- .../README_aws_dl2q.24xlarge.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md b/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md index 8561a38790..7dde066479 100644 --- a/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md +++ b/cm-mlops/script/reproduce-mlperf-inference-qualcomm/README_aws_dl2q.24xlarge.md @@ -87,7 +87,7 @@ cm run script --tags=generate-run-cmds,inference,_performance-only --device=qaic ``` cm run script --tags=generate-run-cmds,inference,_submission --device=qaic --backend=glow \ --scenario=Offline --implementation=kilt --model=retinanet \ ---precision=uint8 --rerun --adr.compiler.tags=gcc --adr.dataset-preprocessed.tags=_custom-annotations\ +--precision=uint8 --rerun --adr.compiler.tags=gcc --adr.dataset-preprocessed.tags=_custom-annotations \ --adr.mlperf-inference-implementation.tags=_bs.1,_dl2q.24xlarge --execution-mode=valid --quiet ``` Expected performance is ~2200 From ff31843e8a954f9229da09d214acbf599d52932a Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 15 Feb 2024 23:25:18 +0100 Subject: [PATCH 34/42] docker clean up --- cm-mlops/automation/script/module_misc.py | 17 ++++++++++++----- .../README-extra.md | 2 +- .../app-stable-diffusion-onnx-py/_cm.yaml | 3 ++- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/cm-mlops/automation/script/module_misc.py b/cm-mlops/automation/script/module_misc.py index 51ed8d9eff..9a6948eca3 100644 --- a/cm-mlops/automation/script/module_misc.py +++ 
b/cm-mlops/automation/script/module_misc.py @@ -1187,9 +1187,10 @@ def regenerate_script_cmd(i): skip_input_for_fake_run = docker_settings.get('skip_input_for_fake_run', []) + add_quotes_to_keys = docker_settings.get('add_quotes_to_keys', []) - def rebuild_flags(i_run_cmd, fake_run, skip_input_for_fake_run, key_prefix): + def rebuild_flags(i_run_cmd, fake_run, skip_input_for_fake_run, add_quotes_to_keys, key_prefix): run_cmd = '' @@ -1212,16 +1213,22 @@ def rebuild_flags(i_run_cmd, fake_run, skip_input_for_fake_run, key_prefix): v = i_run_cmd[k] + q = '"' if long_key in add_quotes_to_keys else '' + if type(v)==dict: - run_cmd += rebuild_flags(v, fake_run, skip_input_for_fake_run, long_key) + run_cmd += rebuild_flags(v, fake_run, skip_input_for_fake_run, add_quotes_to_keys, long_key) elif type(v)==list: - run_cmd+=' --'+long_key+',='+','.join(v) + x = '' + for vv in v: + if x != '': x+=',' + x+=q+str(vv)+q + run_cmd+=' --'+long_key+',=' + x else: - run_cmd+=' --'+long_key+'='+str(v) + run_cmd+=' --'+long_key+'='+q+str(v)+q return run_cmd - run_cmd += rebuild_flags(i_run_cmd, fake_run, skip_input_for_fake_run, '') + run_cmd += rebuild_flags(i_run_cmd, fake_run, skip_input_for_fake_run, add_quotes_to_keys, '') run_cmd = docker_run_cmd_prefix + ' && ' + run_cmd if docker_run_cmd_prefix!='' else run_cmd diff --git a/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md b/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md index 1da13b460d..f4f00adaf5 100644 --- a/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md +++ b/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md @@ -12,7 +12,7 @@ cm run script "python app stable-diffusion onnx" --adr.python.name=sd-test --tex cm rm cache -f cm run script "python app stable-diffusion onnx _cuda" --adr.python.name=sd-test --text="crazy programmer" -cm docker script "python app stable-diffusion onnx" --text="crazy programmer" --output=. 
--env.CM_DOCKER_ADD_FLAG_TO_CM_MLOPS_REPO=xyz +cm docker script "python app stable-diffusion onnx" --text="crazy programmer" --output=. --docker_cm_repo=ctuning@mlcommons-ck --env.CM_DOCKER_ADD_FLAG_TO_CM_MLOPS_REPO=xyz2 ``` diff --git a/cm-mlops/script/app-stable-diffusion-onnx-py/_cm.yaml b/cm-mlops/script/app-stable-diffusion-onnx-py/_cm.yaml index bbdb1cc594..c611691f23 100644 --- a/cm-mlops/script/app-stable-diffusion-onnx-py/_cm.yaml +++ b/cm-mlops/script/app-stable-diffusion-onnx-py/_cm.yaml @@ -97,8 +97,9 @@ docker: skip_run_cmd: 'no' all_gpus: 'yes' input_paths: - - text - output + add_quotes_to_keys: + - text skip_input_for_fake_run: - text - output From 8095a6250b2d6cd946a1efe4b5035213583938c0 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 15 Feb 2024 23:52:52 +0100 Subject: [PATCH 35/42] minor clean up --- cm-mlops/script/app-stable-diffusion-onnx-py/process.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cm-mlops/script/app-stable-diffusion-onnx-py/process.py b/cm-mlops/script/app-stable-diffusion-onnx-py/process.py index e32b2b1fa3..0f759089b7 100644 --- a/cm-mlops/script/app-stable-diffusion-onnx-py/process.py +++ b/cm-mlops/script/app-stable-diffusion-onnx-py/process.py @@ -30,3 +30,5 @@ image = pipeline(text).images[0] image.save(f) + +print ('Image recorded to "{}"'.format(f)) From 38482e94a76d1a39bf696130f89b68e9e8e6432f Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 15 Feb 2024 23:56:34 +0100 Subject: [PATCH 36/42] clean up --- cm-mlops/script/app-loadgen-generic-python/README-extra.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/app-loadgen-generic-python/README-extra.md b/cm-mlops/script/app-loadgen-generic-python/README-extra.md index f15dcc14d6..db38e6d871 100644 --- a/cm-mlops/script/app-loadgen-generic-python/README-extra.md +++ b/cm-mlops/script/app-loadgen-generic-python/README-extra.md @@ -266,7 +266,7 @@ Available flags mapped to environment variables: ## Running this app 
via Docker ```bash -cm docker script "python app loadgen-generic _onnxruntime _cuda _custom _huggingface _model-stub.steerapi/Llama-2-7b-chat-hf-onnx-awq-w8" --adr.hf-downloader.model_filename=onnx/decoder_model_merged_quantized.onnx,onnx/decoder_model_merged_quantized.onnx_data --samples=2 --output_dir=. --docker_cm_repo=ctuning@mlcommons-ck +cm docker script "python app loadgen-generic _onnxruntime _custom _huggingface _model-stub.ctuning/mlperf-inference-bert-onnx-fp32-squad-v1.1" --adr.hf-downloader.model_filename=model.onnx --samples=2 --output_dir=. --docker_cm_repo=ctuning@mlcommons-ck ``` ## Tuning CPU performance via CM experiment From d7ad2031d0b853fe59e787328b641bc68cb159ed Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Feb 2024 14:58:06 -0800 Subject: [PATCH 37/42] Fix model starting weights --- cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml b/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml index e698ac91da..0c27e40fdf 100644 --- a/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml +++ b/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml @@ -438,6 +438,8 @@ variations: deps: - tags: get,generic-python-lib,_package.datasets - tags: get,generic-python-lib,_package.simplejson + env: + CM_ML_MODEL_STARTING_WEIGHTS_FILENAME: "https://cloud.mlcommons.org/index.php/s/QAZ2oM94MkFtbQx/download" gptj_,build: deps: From 5a775570e5b3a771c0467c9d56e2e62227287d4f Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Feb 2024 13:51:38 -0800 Subject: [PATCH 38/42] fix retinanet model soft link for nvidia-harness --- cm-mlops/script/reproduce-mlperf-inference-nvidia/customize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/reproduce-mlperf-inference-nvidia/customize.py b/cm-mlops/script/reproduce-mlperf-inference-nvidia/customize.py index c1f078665b..4983e45fe3 100644 --- 
a/cm-mlops/script/reproduce-mlperf-inference-nvidia/customize.py +++ b/cm-mlops/script/reproduce-mlperf-inference-nvidia/customize.py @@ -143,7 +143,7 @@ def preprocess(i): model_path = os.path.join(target_model_path_dir, 'retinanet-fpn-torch2.1-postprocessed.onnx') alt_model_path = os.path.join(target_model_path_dir, 'retinanet-fpn-torch2.2-postprocessed.onnx') if not os.path.exists(model_path) and os.path.exists(alt_model_path): - cmds.append(f"ln -sf {model_path} {alt_model_path}") + cmds.append(f"ln -s {alt_model_path} {model_path}") model_name = "retinanet" From 7421d85639b35e4ee00f01ab4c3ee3c24502f61c Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Feb 2024 15:13:12 -0800 Subject: [PATCH 39/42] Fix SS command for nvidia-harness --- .../customize.py | 5 ++--- .../run-template.sh | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py index b21aa4ee54..96d4973ee0 100644 --- a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py +++ b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py @@ -116,13 +116,12 @@ def preprocess(i): cmds.append(cmd) #second argument is unused for submission_cmd cmd = f'run_test "{model}" "{backend}" "100" "{implementation}" "{device}" "$submission_cmd"' - cmds.append(cmd) singlestream_target_latency = (((state.get(model, {})).get(device, {})).get(backend, {})).get('singlestream_target_latency') if singlestream_target_latency: - env['EXTRA_ARGS'] += f" --singlestream_target_latency={singlestream_target_latency}" + cmd += f" --singlestream_target_latency={singlestream_target_latency}" - print(f"EXTRA_ARGS={env['EXTRA_ARGS']}") + cmds.append(cmd) run_script_content += "\n\n" +"\n\n".join(cmds) with open(os.path.join(script_path, run_file_name+".sh"), 'w') as f: diff --git 
a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/run-template.sh b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/run-template.sh index 1557c58d09..47dd7836a7 100644 --- a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/run-template.sh +++ b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/run-template.sh @@ -45,6 +45,10 @@ find_performance_cmd='cm run script --tags=generate-run-cmds,inference,_find-per --model=$model --implementation=$implementation --device=$device --backend=$backend \ --category=edge --division=open --scenario=Offline --quiet --test_query_count=$test_query_count $rerun ${EXTRA_ARGS}' +find_ss_performance_cmd='cm run script --tags=generate-run-cmds,inference,_find-performance \ +--model=$model --implementation=$implementation --device=$device --backend=$backend \ +--category=edge --division=open --scenario=SingleStream --quiet --test_query_count=$test_query_count $rerun ${EXTRA_ARGS}' + submission_cmd='cm run script --tags=generate-run-cmds,inference,_submission,_all-scenarios \ --model=$model --implementation=$implementation --device=$device --backend=$backend \ --category=$category --division=$division --quiet \ From c1ec39a421a6c0bae5e0f1f6ed81780141ce51ba Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Fri, 16 Feb 2024 00:53:29 +0100 Subject: [PATCH 40/42] improving docker automation --- cm-mlops/automation/script/module_misc.py | 2 +- cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cm-mlops/automation/script/module_misc.py b/cm-mlops/automation/script/module_misc.py index 9a6948eca3..9a0085b9fe 100644 --- a/cm-mlops/automation/script/module_misc.py +++ b/cm-mlops/automation/script/module_misc.py @@ -1213,7 +1213,7 @@ def rebuild_flags(i_run_cmd, fake_run, skip_input_for_fake_run, add_quotes_to_ke v = i_run_cmd[k] - q = '"' if long_key in add_quotes_to_keys else '' + q = '\\"' if long_key in 
add_quotes_to_keys else '' if type(v)==dict: run_cmd += rebuild_flags(v, fake_run, skip_input_for_fake_run, add_quotes_to_keys, long_key) diff --git a/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md b/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md index f4f00adaf5..ecab8070eb 100644 --- a/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md +++ b/cm-mlops/script/app-stable-diffusion-onnx-py/README-extra.md @@ -12,7 +12,7 @@ cm run script "python app stable-diffusion onnx" --adr.python.name=sd-test --tex cm rm cache -f cm run script "python app stable-diffusion onnx _cuda" --adr.python.name=sd-test --text="crazy programmer" -cm docker script "python app stable-diffusion onnx" --text="crazy programmer" --output=. --docker_cm_repo=ctuning@mlcommons-ck --env.CM_DOCKER_ADD_FLAG_TO_CM_MLOPS_REPO=xyz2 +cm docker script "python app stable-diffusion onnx" --text="crazy programmer" --output=. --docker_cm_repo=ctuning@mlcommons-ck --env.CM_DOCKER_ADD_FLAG_TO_CM_MLOPS_REPO=xyz4 ``` From 966df6219a82f57aadc0af8d37743345579eab7b Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Fri, 16 Feb 2024 02:17:31 +0100 Subject: [PATCH 41/42] fixing hugging face download, adding latest cmake prebuilt --- .../README-extra.md | 4 +++ .../download_model.py | 27 +++++++++---------- .../script/install-cmake-prebuilt/_cm.json | 2 +- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/cm-mlops/script/get-ml-model-huggingface-zoo/README-extra.md b/cm-mlops/script/get-ml-model-huggingface-zoo/README-extra.md index 7a2eebf295..b7ec3407b2 100644 --- a/cm-mlops/script/get-ml-model-huggingface-zoo/README-extra.md +++ b/cm-mlops/script/get-ml-model-huggingface-zoo/README-extra.md @@ -15,3 +15,7 @@ cmr "get ml-model huggingface zoo _model-stub.Intel/gpt-j-6B-int8-static" --mode ```bash cmr "get ml-model huggingface zoo _model-stub.runwayml/stable-diffusion-v1-5" --revision=onnx --model_filename=unet/model.onnx,unet/weights.pb ``` + +```bash +cmr "get 
ml-model huggingface zoo _model-stub.ctuning/mlperf-inference-bert-onnx-fp32-squad-v1.1" --model_filename=model.onnx +``` diff --git a/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py b/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py index 0cc2006f10..4e6e9c86e8 100644 --- a/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py +++ b/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py @@ -46,7 +46,8 @@ def list_hf_files(path): all_files = [] - files=fs.ls(path, revision=revision) #, detail=False) + xrevision = None if revision == '' else revision + files=fs.ls(path, revision=xrevision) #, detail=False) for f in files: fname = f['name'] @@ -66,7 +67,6 @@ def list_hf_files(path): print ('Found {} files'.format(len(files))) for f in files: - print (f) remove = len(model_stub)+1 @@ -89,19 +89,16 @@ def list_hf_files(path): if extra_dir!='' and not os.path.exists(extra_dir): os.makedirs(extra_dir) - if subfolder == '': - hf_hub_download(repo_id=model_stub, - filename=model_filename, - force_filename=model_filename, - revision=revision, - cache_dir=os.getcwd()) - else: - hf_hub_download(repo_id=model_stub, - subfolder=subfolder, - filename=model_filename, - force_filename=model_filename, - revision=revision, - cache_dir=os.getcwd()) + + xrevision = None if revision == '' else revision + xsubfolder = None if subfolder == '' else subfolder + + hf_hub_download(repo_id=model_stub, + subfolder=xsubfolder, + filename=model_filename, + force_filename=model_filename, + revision=xrevision, + cache_dir=os.getcwd()) print ('') diff --git a/cm-mlops/script/install-cmake-prebuilt/_cm.json b/cm-mlops/script/install-cmake-prebuilt/_cm.json index 66d317fa8e..5eacad9f0b 100644 --- a/cm-mlops/script/install-cmake-prebuilt/_cm.json +++ b/cm-mlops/script/install-cmake-prebuilt/_cm.json @@ -4,7 +4,7 @@ "automation_alias": "script", "automation_uid": "5b4e0237da074764", "cache": true, - "default_version": "3.21.1", + "default_version": 
"3.28.3", "new_env_keys": [ "CM_CMAKE_*", "CM_GET_DEPENDENT_CACHED_PATH", From 09c941efc557ff4f643bf4e65e0e59f5ece1d1c8 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Fri, 16 Feb 2024 02:27:53 +0100 Subject: [PATCH 42/42] typo --- cm-mlops/script/app-loadgen-generic-python/README-extra.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/app-loadgen-generic-python/README-extra.md b/cm-mlops/script/app-loadgen-generic-python/README-extra.md index db38e6d871..1bcdabddfb 100644 --- a/cm-mlops/script/app-loadgen-generic-python/README-extra.md +++ b/cm-mlops/script/app-loadgen-generic-python/README-extra.md @@ -266,7 +266,7 @@ Available flags mapped to environment variables: ## Running this app via Docker ```bash -cm docker script "python app loadgen-generic _onnxruntime _custom _huggingface _model-stub.ctuning/mlperf-inference-bert-onnx-fp32-squad-v1.1" --adr.hf-downloader.model_filename=model.onnx --samples=2 --output_dir=. --docker_cm_repo=ctuning@mlcommons-ck +cm docker script "python app loadgen-generic _onnxruntime _custom _huggingface _model-stub.ctuning/mlperf-inference-bert-onnx-fp32-squad-v1.1" --adr.hf-downloader.model_filename=model.onnx --samples=2 --output_dir=new_results --docker_cm_repo=ctuning@mlcommons-ck ``` ## Tuning CPU performance via CM experiment