Merge pull request #21 from arjunsuresh/mlperf-inference
arjunsuresh authored May 22, 2024
2 parents 38941fc + 839d06f commit c35530e
Showing 12 changed files with 192 additions and 6 deletions.
33 changes: 30 additions & 3 deletions script/app-mlperf-inference-nvidia/_cm.yaml
@@ -311,6 +311,23 @@ post_deps:

# Variations to customize dependencies
variations:
# MLPerf inference version
v4.0:
group: version
default: true
env:
CM_MLPERF_INFERENCE_VERSION: "v4.0"
CM_MLPERF_GPTJ_MODEL_FP8_PATH_SUFFIX: GPTJ-FP8-quantized
adr:
pytorch:
tags: _for-nvidia-mlperf-inference-v4.0
v3.1:
env:
CM_MLPERF_INFERENCE_VERSION: "v3.1"
CM_MLPERF_GPTJ_MODEL_FP8_PATH_SUFFIX: GPTJ-07142023.pth
adr:
pytorch:
tags: _for-nvidia-mlperf-inference-v3.1
# Target devices
cpu:
group: device
@@ -369,6 +386,7 @@ variations:
- tags: get,generic-python-lib,_transformers
- tags: get,generic-python-lib,_safetensors
- tags: get,generic-python-lib,_onnx
- tags: get,generic-python-lib,_onnx-graphsurgeon

bert-99:
group: model
@@ -479,18 +497,25 @@ variations:
deps:
- tags: get,generic-python-lib,_package.datasets
- tags: get,generic-python-lib,_package.simplejson
- tags: get,generic-python-lib,_onnx
- tags: get,generic-python-lib,_transformers
- tags: get,generic-python-lib,_onnx-graphsurgeon
env:
CM_ML_MODEL_STARTING_WEIGHTS_FILENAME: "https://cloud.mlcommons.org/index.php/s/QAZ2oM94MkFtbQx/download"

gptj_,build:
deps:
- tags: install,pytorch,from.src,_for-nvidia-mlperf-inference-v3.1
- tags: install,pytorch,from.src
names:
- pytorch
- tags: get,cmake
version_min: "3.25.0"

gptj_,build_engine:
deps:
- tags: install,pytorch,from.src,_for-nvidia-mlperf-inference-v3.1
- tags: install,pytorch,from.src
names:
- pytorch
- tags: get,cmake
version_min: "3.25.0"

@@ -880,7 +905,9 @@ variations:

gptj_,run_harness:
deps:
- tags: install,pytorch,from.src,_for-nvidia-mlperf-inference-v3.1
- tags: install,pytorch,from.src
names:
- pytorch
- tags: get,cmake
version_min: "3.25.0"
env:
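For context: as with other CM variations, the new version group is selected by appending its tag to the script's tag list. A minimal sketch, assuming typical base tags for this script (the exact base tags are not part of this diff):

```bash
# Sketch: pick the MLPerf inference version via the new variation group.
# _v4.0 is the default; _v3.1 keeps the old GPTJ-07142023.pth FP8 checkpoint.
cm run script --tags=reproduce,mlperf,inference,nvidia,_gptj-99,_v4.0 --quiet
cm run script --tags=reproduce,mlperf,inference,nvidia,_gptj-99,_v3.1 --quiet
```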
2 changes: 1 addition & 1 deletion script/app-mlperf-inference-nvidia/customize.py
@@ -156,7 +156,7 @@ def preprocess(i):
cmds.append("make download_data BENCHMARKS='gptj'")

fp32_model_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'models', 'GPTJ-6B', 'checkpoint-final')
fp8_model_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'models', 'GPTJ-6B', 'fp8-quantized-ammo', 'GPTJ-07142023.pth')
fp8_model_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'models', 'GPTJ-6B', 'fp8-quantized-ammo', env['CM_MLPERF_GPTJ_MODEL_FP8_PATH_SUFFIX'])
vocab_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'models', 'bert', 'vocab.txt')

if not os.path.exists(os.path.dirname(fp32_model_path)):
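Together with the CM_MLPERF_GPTJ_MODEL_FP8_PATH_SUFFIX values defined in _cm.yaml above, the FP8 checkpoint path now resolves per version:

```bash
# FP8 model paths after this change (suffixes from the _cm.yaml hunk above):
#   v3.1: ${MLPERF_SCRATCH_PATH}/models/GPTJ-6B/fp8-quantized-ammo/GPTJ-07142023.pth
#   v4.0: ${MLPERF_SCRATCH_PATH}/models/GPTJ-6B/fp8-quantized-ammo/GPTJ-FP8-quantized
ls "${MLPERF_SCRATCH_PATH}/models/GPTJ-6B/fp8-quantized-ammo"
```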
16 changes: 16 additions & 0 deletions script/authenticate-github-cli/_cm.yaml
@@ -0,0 +1,16 @@
alias: authenticate-github-cli
automation_alias: script
automation_uid: 5b4e0237da074764
cache: true
input_mapping:
with_token: CM_GH_AUTH_TOKEN
with-token: CM_GH_AUTH_TOKEN
tags:
- auth
- authenticate
- github
- gh
- cli
uid: 7b57673ac14a4337
deps:
- tags: get,gh,cli
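A plausible invocation of the new script (a sketch; per CM's input_mapping semantics, --with_token fills CM_GH_AUTH_TOKEN):

```bash
# Sketch: authenticate the GitHub CLI non-interactively through CM.
# GH_TOKEN is a placeholder for a real personal access token.
cm run script --tags=auth,github,cli --with_token="${GH_TOKEN}"
```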
27 changes: 27 additions & 0 deletions script/authenticate-github-cli/customize.py
@@ -0,0 +1,27 @@
from cmind import utils
import os

def preprocess(i):

os_info = i['os_info']

env = i['env']

meta = i['meta']

automation = i['automation']

cmd = "gh auth login"
if env.get('CM_GH_AUTH_TOKEN', '') != '':
cmd = f" echo {env['CM_GH_AUTH_TOKEN']} | {cmd} --with-token"

env['CM_RUN_CMD'] = cmd
quiet = (env.get('CM_QUIET', False) == 'yes')

return {'return':0}

def postprocess(i):

env = i['env']

return {'return':0}
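When CM_GH_AUTH_TOKEN is set, the CM_RUN_CMD assembled above reduces to the standard non-interactive gh login, which reads the token from stdin:

```bash
# Equivalent of the command built in preprocess() when a token is provided:
echo "${CM_GH_AUTH_TOKEN}" | gh auth login --with-token
```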
1 change: 1 addition & 0 deletions script/authenticate-github-cli/run.bat
@@ -0,0 +1 @@
rem native script
18 changes: 18 additions & 0 deletions script/authenticate-github-cli/run.sh
@@ -0,0 +1,18 @@
#!/bin/bash

#CM Script location: ${CM_TMP_CURRENT_SCRIPT_PATH}

#To export any variable
#echo "VARIABLE_NAME=VARIABLE_VALUE" >>tmp-run-env.out

#${CM_PYTHON_BIN_WITH_PATH} contains the path to python binary if "get,python" is added as a dependency

echo "Running: "
echo "${CM_RUN_CMD}"
echo ""

if [[ ${CM_FAKE_RUN} != "yes" ]]; then
eval "${CM_RUN_CMD}"
test $? -eq 0 || exit 1
fi

2 changes: 2 additions & 0 deletions script/build-mlperf-inference-server-nvidia/_cm.yaml
@@ -240,6 +240,8 @@ versions:
version: r4.0
nvidia-scratch-space:
tags: _version.4_1
env:
BUILD_TRTLLM: 1
deps:
- tags: install,pytorch,from.src,_for-nvidia-mlperf-inference-v4.0
names:
63 changes: 63 additions & 0 deletions script/get-ml-model-gptj/_cm.json
@@ -171,6 +171,13 @@
}
]
},
"fp8": {
"env": {
"CM_ML_MODEL_INPUT_DATA_TYPES": "fp8",
"CM_ML_MODEL_WEIGHT_DATA_TYPES": "fp8"
},
"group": "precision"
},
"int4": {
"env": {
"CM_ML_MODEL_INPUT_DATA_TYPES": "int4",
@@ -193,6 +200,62 @@
"group": "model-provider",
"default": true
},
"nvidia": {
"default_variations": {
"framework": "pytorch"
},
"group": "model-provider",
"env": {
"CM_TMP_ML_MODEL_PROVIDER": "nvidia"
}
},
"pytorch,nvidia": {
"default_variations": {
"precision": "fp8"
},
"deps": [
{
"tags": "get,git,repo,_repo.https://github.com/NVIDIA/TensorRT-LLM.git,_sha.0ab9d17a59c284d2de36889832fe9fc7c8697604",
"extra_cache_tags": "tensorrt-llm",
"env": {
"CM_GIT_CHECKOUT_PATH_ENV_NAME": "CM_TENSORRT_LLM_CHECKOUT_PATH"
}
},
{
"tags": "get,cuda",
"names": [
"cuda"
]
},
{
"tags": "get,nvidia,scratch,space"
},
{
"tags": "get,cuda-devices"
},
{
"tags": "get,ml-model,gpt-j,_fp32,_pytorch",
"env": {
},
"force_new_env_keys": [
"GPTJ_CHECKPOINT_PATH"
]
},
{
"tags": "get,nvidia,inference,common-code",
"names": [
"nvidia-inference-common-code"
]
},
{
"tags": "get,python3",
"names": [
"python",
"python3"
]
}
]
},
"intel": {
"default_variations": {
"framework": "pytorch"
6 changes: 6 additions & 0 deletions script/get-ml-model-gptj/customize.py
@@ -21,6 +21,12 @@ def preprocess(i):
env['INT8_MODEL_DIR'] = os.getcwd()
else:
env['INT4_MODEL_DIR'] = os.getcwd()
elif env.get('CM_TMP_ML_MODEL_PROVIDER', '') == 'nvidia':
i['run_script_input']['script_name'] = 'run-nvidia'
gpu_arch = int(float(env['CM_CUDA_DEVICE_PROP_GPU_COMPUTE_CAPABILITY']) * 10)
env['CM_GPU_ARCH'] = gpu_arch
env['CM_TMP_REQUIRE_DOWNLOAD'] = 'no'

else:
is_saxml = env.get('CM_TMP_MODEL_SAXML','')
if is_saxml == "fp32":
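The CM_GPU_ARCH value passed to the TensorRT-LLM build in run-nvidia.sh below is simply the CUDA compute capability with the decimal point dropped:

```bash
# Same int(float(x) * 10) mapping as in customize.py: 8.0 -> 80, 8.9 -> 89, 9.0 -> 90.
python3 -c 'print(int(float("8.9") * 10))'   # prints 89
```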
22 changes: 22 additions & 0 deletions script/get-ml-model-gptj/run-nvidia.sh
@@ -0,0 +1,22 @@
#!/bin/bash

if [[ ! -e ${CM_NVIDIA_MLPERF_SCRATCH_PATH}/models/GPTJ-6B/checkpoint-final ]]; then
cp -r ${GPTJ_CHECKPOINT_PATH} ${CM_NVIDIA_MLPERF_SCRATCH_PATH}/models/GPTJ-6B/
test $? -eq 0 || exit $?
fi

echo "cd ${CM_TENSORRT_LLM_CHECKOUT_PATH}"
cd ${CM_TENSORRT_LLM_CHECKOUT_PATH}

make -C docker build
test $? -eq 0 || exit $?

RUN_CMD="bash -c '${CM_PYTHON_BIN_WITH_PATH} scripts/build_wheel.py -a=${CM_GPU_ARCH} --clean --install --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/GPTJ-6B/fp8-quantized-ammo/GPTJ-FP8-quantized --model_dir=/mnt/models/GPTJ-6B/checkpoint-final --qformat=fp8 --kv_cache_dtype=fp8 '"
DOCKER_RUN_ARGS=" -v ${CM_NVIDIA_MLPERF_SCRATCH_PATH}:/mnt"
export DOCKER_RUN_ARGS="$DOCKER_RUN_ARGS"
export RUN_CMD="$RUN_CMD"
make -C docker run LOCAL_USER=1
test $? -eq 0 || exit $?

${CM_PYTHON_BIN_WITH_PATH} ${CM_MLPERF_INFERENCE_NVIDIA_CODE_PATH}/code/gptj/tensorrt/onnx_tune.py --fp8-scalers-path=${CM_NVIDIA_MLPERF_SCRATCH_PATH}/models/GPTJ-6B/fp8-quantized-ammo/GPTJ-FP8-quantized/rank0.safetensors --scaler 1.005 --index 15
test $? -eq 0 || exit $?
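After a successful run, the scratch space should hold both the fp32 checkpoint and the FP8 artifacts the harness consumes (layout inferred from the paths used in this script):

```bash
# Inferred layout (paths taken from run-nvidia.sh above):
#   models/GPTJ-6B/checkpoint-final/                            fp32 checkpoint
#   models/GPTJ-6B/fp8-quantized-ammo/GPTJ-FP8-quantized/rank0.safetensors
ls -R "${CM_NVIDIA_MLPERF_SCRATCH_PATH}/models/GPTJ-6B"
```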
4 changes: 3 additions & 1 deletion script/install-llvm-src/_cm.json
@@ -299,7 +299,9 @@
"tags": "get,generic-python-lib,_custom-python,_package.setuptools",
"env": {
"CM_PYTHON_BIN_WITH_PATH": "<<<CM_CONDA_BIN_PATH>>>/python3"
}
},
"version_max": "69.9.999",
"version_max_usable": "58.2.0"
},
{
"tags": "get,generic-python-lib,_custom-python,_package.neural-compressor,_url.git+https://github.com/intel/neural-compressor.git@a2931eaa4052eec195be3c79a13f7bfa23e54473",
4 changes: 3 additions & 1 deletion script/install-pytorch-from-src/_cm.json
@@ -202,7 +202,9 @@
"conda-package",
"setuptools"
],
"tags": "get,generic,conda-package,_package.setuptools,_source.conda-forge"
"tags": "get,generic,conda-package,_package.setuptools,_source.conda-forge",
"version_max": "69.9.999",
"version_max_usable": "58.2.0"
},
{
"names": [
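Both pins look like a workaround for setuptools releases newer than 69.x breaking these source builds; my reading of CM's version resolution (an assumption, not stated in this diff) is that version_max_usable is what gets installed when the cap cannot otherwise be satisfied:

```bash
# Assumed effect when CM must install setuptools itself: versions above
# 69.9.999 are rejected and the usable fallback 58.2.0 is installed instead.
python3 -m pip install "setuptools==58.2.0"
```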
