onnx · AlexandreEichenberger · Jan 8, 2025 · Dec 19, 2024 · Dec 19, 2024 · Dec 19, 2024
diff --git a/docker/Dockerfile.onnx-mlir b/docker/Dockerfile.onnx-mlir
@@ -26,7 +26,7 @@ RUN ONNX_ROOT=${WORK_DIR}/onnx-mlir/third_party/onnx \
 ARG NPROC=4
 ARG ACCEL=NNPA
 ARG TEST_NOFLOAT16
-ARG TEST_MCPU
+ARG TEST_MARCH
 ARG KEEPSRC
 
 RUN LLVM_PROJECT_ROOT=${WORK_DIR}/llvm-project \
@@ -53,21 +53,21 @@ RUN LLVM_PROJECT_ROOT=${WORK_DIR}/llvm-project \
                                          ([ "$(uname -m)" = "x86_64" ] &&  echo true || \
                                          ([ "$(uname -m)" = "ppc64le" ] && echo || echo)))} \
 # User image is built with SIMD (currently on s390x only)
-    && TEST_MCPU=${TEST_MCPU:-$([ "$(uname -m)" = "s390x" ] && echo z16 || \
+    && TEST_MARCH=${TEST_MARCH:-$([ "$(uname -m)" = "s390x" ] && echo z16 || \
                                ([ "$(uname -m)" = "x86_64" ] &&  echo || \
                                ([ "$(uname -m)" = "ppc64le" ] && echo || echo)))} \
-    && TEST_ARGS="-mcpu=${TEST_MCPU}" \
+    && TEST_ARGS="-march=${TEST_MARCH}" \
     && make check-docs \
     && make check-unittest \
     && make check-multiple-models \
     && make NPROC=${NPROC} \
             CTEST_PARALLEL_LEVEL=${NPROC} \
             TEST_NOFLOAT16=${TEST_NOFLOAT16} \
-            TEST_MCPU=${TEST_MCPU} \
+            TEST_MARCH=${TEST_MARCH} \
             TEST_ARGS="${TEST_ARGS}" \
             -j${NPROC} \
             check-onnx-backend-numerical \
-    && if [ "${TEST_MCPU}" = "z16" ]; then \
+    && if [ "${TEST_MARCH}" = "z16" ]; then \
           make NPROC=${NPROC} \
                CTEST_PARALLEL_LEVEL=${NPROC} \
                -j${NPROC} \

diff --git a/docker/Dockerfile.onnx-mlir-dev b/docker/Dockerfile.onnx-mlir-dev
@@ -20,7 +20,7 @@ RUN ONNX_ROOT=${WORK_DIR}/onnx-mlir/third_party/onnx \
 ARG NPROC=4
 ARG ACCEL=NNPA
 ARG TEST_NOFLOAT16
-ARG TEST_MCPU
+ARG TEST_MARCH
 
 RUN LLVM_PROJECT_ROOT=${WORK_DIR}/llvm-project \
     && ONNX_MLIR_ROOT=${WORK_DIR}/onnx-mlir \
@@ -51,18 +51,18 @@ RUN LLVM_PROJECT_ROOT=${WORK_DIR}/llvm-project \
                                          ([ "$(uname -m)" = "x86_64" ] &&  echo true || \
                                          ([ "$(uname -m)" = "ppc64le" ] && echo || echo)))} \
 # Dev image is built without SIMD, placeholder for easy SIMD enablement
-    && TEST_MCPU=$([ "$(uname -m)" = "s390x" ] && echo || \
+    && TEST_MARCH=$([ "$(uname -m)" = "s390x" ] && echo || \
                   ([ "$(uname -m)" = "x86_64" ] &&  echo || \
                   ([ "$(uname -m)" = "ppc64le" ] && echo || echo))) \
-    && TEST_ARGS="-mcpu=${TEST_MCPU}" \
+    && TEST_ARGS="-march=${TEST_MARCH}" \
     && TEST_OPTLEVEL=0 \
     && make check-docs \
     && make check-unittest \
     && make check-multiple-models \
     && make NPROC=${NPROC} \
             CTEST_PARALLEL_LEVEL=${NPROC} \
             TEST_NOFLOAT16=${TEST_NOFLOAT16} \
-            TEST_MCPU=${TEST_MCPU} \
+            TEST_MARCH=${TEST_MARCH} \
             TEST_ARGS="${TEST_ARGS}" \
             TEST_OPTLEVEL=${TEST_OPTLEVEL} \
             -j${NPROC} \

diff --git a/docs/DebuggingNumericalError.md b/docs/DebuggingNumericalError.md
@@ -65,7 +65,7 @@ optional arguments:
 ## Helper script to compare a model under two distinct compile option.
 
 Based on the above `utils/runONNXModel.py`, the `utils/checkONNXModel.py` allows a user to run a given model twice, under two distinct compile options, and compare its results.
-This let a user simply test a new option, comparing the safe version of the compiler (e.g. `-O0` or `-O3`) with a more advanced version (e.g. `-O3` or `-O3 -march=x86-64`). Simply specify the compile options using the `--ref-compile-args` and `--test-compile-args` flags, a model using the `--model` flag, and possibly a `--shape-info` in presence of dynamic shape inputs.
+This lets a user simply test a new option, comparing the safe version of the compiler (e.g. `-O0` or `-O3`) with a more advanced version (e.g. `-O3` or `-O3 --march=x86-64`). Simply specify the compile options using the `--ref-compile-args` and `--test-compile-args` flags, a model using the `--model` flag, and possibly a `--shape-info` in the presence of dynamic shape inputs.
 Full options are listed under the `--help` flag.
 
 ## Debugging the Code Generated for an Operator.

diff --git a/docs/Instrumentation.md b/docs/Instrumentation.md
@@ -61,11 +61,11 @@ The output for the memory measurement is explained here.
 
 Other example for NNPA
 - Performance profiling for onnx ops before lowering to zhigh ops:
-  `onnx-mlir --mcpu=z16 --maccel=NNPA --instrument-stage=Onnx --instrument-ops=onnx.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
+  `onnx-mlir --march=z16 --maccel=NNPA --instrument-stage=Onnx --instrument-ops=onnx.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
 - Performance profiling for onnx and zhigh ops:
-  `onnx-mlir --mcpu=z16 --maccel=NNPA --instrument-stage=ZHigh --instrument-ops=onnx.*,zhigh.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
+  `onnx-mlir --march=z16 --maccel=NNPA --instrument-stage=ZHigh --instrument-ops=onnx.*,zhigh.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
 - Performance profiling for zlow ops:
-  `onnx-mlir --mcpu=z16 --maccel=NNPA --instrument-stage=ZLow --instrument-ops=zlow.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
+  `onnx-mlir --march=z16 --maccel=NNPA --instrument-stage=ZLow --instrument-ops=zlow.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
 
 ## Control instrument at runtime
 By providing certain env variable at runtime, you can disable reports from  instrument library.

diff --git a/docs/SupportedONNXOps-NNPA.md b/docs/SupportedONNXOps-NNPA.md
@@ -8,38 +8,38 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 21. Limitatio
 * Operations are defined by the [ONNX Standard](https://github.com/onnx/onnx/blob/main/docs/Operators.md).
 * **Supported Opsets** indicates the lowest and highest opset a model may have for onnx-mlir to support compiling a model with the operator.
    * A * indicates onnx-mlir is compatible with the latest version of that operator available as of opset 21.
+   * A ^ indicates onnx-mlir is compatible with the latest level of the NNPA Architecture which is z16.
 
 
-NNPA has hardware limitations in dimension index size and tensor size, which are described in [NNPALimit.hpp](../src/Accelerators/NNPA/Support/NNPALimit.hpp). They are large enough for normal use cases, but if your model exceeds the limitations, CPU is used instead of NNPA. NNPA currently only support DLFLOAT16 as its data type. Common data formats like FP32, FP16, BFLOAT need to undergo data conversions to the NNPA internal format DLFLOAT16. Hence ONNX ops which updated their tensors to BFLOAT16 will not be natively supported on NNPA.
+NNPA has hardware limitations in dimension index size and tensor size, which are described in [NNPALimit.hpp](../src/Accelerators/NNPA/Support/NNPALimit.hpp). They are large enough for normal use cases, but if your model exceeds the limitations, CPU is used instead of NNPA. NNPA currently only supports DLFLOAT16 as its data type. Common data formats like FP32, FP16, BFLOAT need to undergo data conversions to the NNPA internal format DLFLOAT16. Hence ONNX ops which updated their tensors to BFLOAT16 will not be natively supported on NNPA. Onnx-mlir with NNPA utilizes hardware when possible. To accomplish this, the compiler converts ONNX ops to [ZHigh](Dialects/zhigh.md) ops and [ZLow](Dialects/zlow.md) ops, which are then processed by the [IBM Z Deep Neural Network Library (zDNN)](https://github.com/IBM/zDNN).
 
 
-| Op |Supported Opsets (inclusive) |Limitations |Notes |
-| --- |--- |--- |--- |
-| **Add** |6 - * |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **AveragePool** |6 - * |- `auto_pad` must be `NOTSET`, `VALID`, and `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding valid type or same upper.<br>- `ceil_mode` must be default value(0) <br>- Input and output tensors must be 4D tensors (N x C x H x W).<br>- `kernel_shape` must be static.<br>- `count_include_pad` must be default value(0).<br>- `ceil_mode` must be default value(0). | |
-| **BatchNormalization** |6 - * |Input and output tensor must be 4D(N x C x H x W). | |
-| **Conv** |6 - * |- `auto_pad` must be `NOTSET`, `VALID`, and `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding valid type or same upper.<br>- Dimension in Height and weight must be static.<br>- `group` must be default value(1).<br>- `dilations` must be default value(1).<br>- Input and output tensors must have 4D (N x C x H x W).<br>- `kernel_shape` must be static. | |
-| **ConvTranspose** |6 - * |- 1D and 3D not supported because Conv1D and Conv3D not supported in zDNN. non-default `dilations` not supported because dilated convolution not supported in zDNN. | |
-| **Div** |6 - * |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **Exp** |6 - * |Input tensor must have 4 dimensions. | |
-| **GRU** |7 - * |- `direction` and `hidden_size` in `W` must have static dimensions.<br>- `R` must have static dimensions.<br>- If `B` and `initial_h` are given, they must have static dimensions.<br>- `sequence_lens` is not supported for bidirectional GRU.<br>- `activations` must be `["Sigmoid", "Tanh", "Tanh"]`.<br>- `clip` is not supported.<br>- `linear_before_reset` must be 1.<br>- `layout` is not supported. | |
-| **Gemm** |6 - * |- `alpha` and `beta` must be default value(1).<br>- Rank of `C` must be 1 or 2. If the rank is 1, the dimension of `C` must be the same with the seconde dimension of `B`. | |
-| **GlobalAveragePool** |6 - * |- Input shape must be 4D tensor(NCHW).<br>- Dimensions in `H` and `W` must be static. | |
-| **LSTM** |7 - * |- `direction` and `hidden_size` in `W` must have static dimensions.<br>- `R` must have static dimensions.<br>- `B` and `initial_h` have static dimensions if given. `B`'s direction dim must be 1 or 2.<br>- `P`(peepholes), `activation_alpha`, and `activation_beta` are not supported.<br>- `activations` must be `["Sigmoid", "Tanh", "Tanh"]`.<br>- `clip` is not supported.<br>- `input_forget` must be default value(0).<br>- `layout` is not supported. | |
-| **LeakyRelu** |6 - * |The operations immediately before and after the LeakyRelu operation must be executed on the NNPA. Otherwise, LeakyRelu is executed on the CPU. This limitation is set to avoid performance degradation. | |
-| **Log** |6 - * |Input tensor must have 4 dimensions. | |
-| **LogSoftmax** |6 - * | | |
-| **MatMul** |6 - * |Ranks of input tensors must be (Rank of A, Rank of B) = (M, N), where M >= 2 and N >= 2. | |
-| **Max** |6 - * |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **MaxPool** |6 - * |- `auto_pad` must be `NOTSET`, `VALID`, and `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding valid type or same upper.<br>- `ceil_mode` must be default value(0) <br>- Input and output tensors must be 4D tensors(N x C x H x W).<br>- `kernel_shape` must be static.<br>- `ceil_mode` must be default value(0).<br>- `dilations` must be default value(1). | |
-| **Min** |6 - * |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **Mul** |6 - * |- Shape of input tensors should be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **Pow** |7 - * |- Exponent should be a scalar integer and less or equal to 64. | |
-| **ReduceMean** |6 - * |- `keepdims` must be 1.<br>- Input tensor must be 4D tensors and `axis` must be [2, 3]. | |
-| **Relu** |6 - * |Input tensor must be less than or equal to 4 dimensions. | |
-| **Sigmoid** |6 - * |Input tensor must be less than or equal to 4 dimensions. | |
-| **Softmax** |6 - * |- `axis` must be the last dimension, i.e. `rank - 1` or -1. | |
-| **Softplus** |6 - * |The operations immediately before and after the Softplus operation must be executed on the NNPA. Otherwise, Softplus is executed on the CPU. This limitation is set to avoid performance degradation. | |
-| **Sub** |6 - * |- Shape of input tensors should be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **Sum** |6 - * |- All inputs must have the same static shape (Broadcasting not supported.)<br>- Single input not supported. | |
-| **Tanh** |6 - * |Input tensor must be less than or equal to 4 dimensions. | |
+| Op |Supported Opsets (inclusive) |Minimum NNPA Level (inclusive) |Limitations |Notes |
+| --- |--- |--- |--- |--- |
+| **Add** |6 - * |z16 |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **AveragePool** |6 - * |z16 |- `auto_pad` must be `NOTSET`, `VALID`, or `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding is of the valid type or the same-upper type.<br>- `ceil_mode` must be default value(0).<br>- Input and output tensors must be 4D tensors (N x C x H x W).<br>- `kernel_shape` must be static.<br>- `count_include_pad` must be default value(0). | |
+| **BatchNormalization** |6 - * |z16 |Input and output tensor must be 4D(N x C x H x W). | |
+| **Conv** |6 - * |z16 |- `auto_pad` must be `NOTSET`, `VALID`, or `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding is of the valid type or the same-upper type.<br>- Dimensions in height and width must be static.<br>- `group` must be default value(1).<br>- `dilations` must be default value(1).<br>- Input and output tensors must be 4D (N x C x H x W).<br>- `kernel_shape` must be static. | |
+| **ConvTranspose** |6 - * |z16 |- 1D and 3D not supported because Conv1D and Conv3D not supported in zDNN. non-default `dilations` not supported because dilated convolution not supported in zDNN. | |
+| **Div** |6 - * |z16 |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **Exp** |6 - * |z16 |Input tensor must have 4 dimensions. | |
+| **GRU** |7 - * |z16 |- `direction` and `hidden_size` in `W` must have static dimensions.<br>- `R` must have static dimensions.<br>- If `B` and `initial_h` are given, they must have static dimensions.<br>- `sequence_lens` is not supported for bidirectional GRU.<br>- `activations` must be `["Sigmoid", "Tanh", "Tanh"]`.<br>- `clip` is not supported.<br>- `linear_before_reset` must be 1.<br>- `layout` is not supported. | |
+| **Gemm** |6 - * |z16 |- `alpha` and `beta` must be default value(1).<br>- Rank of `C` must be 1 or 2. If the rank is 1, the dimension of `C` must be the same as the second dimension of `B`. | |
+| **GlobalAveragePool** |6 - * |z16 |- Input shape must be 4D tensor(NCHW).<br>- Dimensions in `H` and `W` must be static. | |
+| **LSTM** |7 - * |z16 |- `direction` and `hidden_size` in `W` must have static dimensions.<br>- `R` must have static dimensions.<br>- `B` and `initial_h` have static dimensions if given. `B`'s direction dim must be 1 or 2.<br>- `P`(peepholes), `activation_alpha`, and `activation_beta` are not supported.<br>- `activations` must be `["Sigmoid", "Tanh", "Tanh"]`.<br>- `clip` is not supported.<br>- `input_forget` must be default value(0).<br>- `layout` is not supported. | |
+| **Log** |6 - * |z16 |Input tensor must have 4 dimensions. | |
+| **LogSoftmax** |6 - * |z16 | | |
+| **MatMul** |6 - * |z16 |Ranks of input tensors must be (Rank of A, Rank of B) = (M, N), where M >= 2 and N >= 2. | |
+| **Max** |6 - * |z16 |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **MaxPool** |6 - * |z16 |- `auto_pad` must be `NOTSET`, `VALID`, or `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding is of the valid type or the same-upper type.<br>- `ceil_mode` must be default value(0).<br>- Input and output tensors must be 4D tensors (N x C x H x W).<br>- `kernel_shape` must be static.<br>- `dilations` must be default value(1). | |
+| **Min** |6 - * |z16 |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **Mul** |6 - * |z16 |- Shape of input tensors should be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **Pow** |7 - * |z16 |- Exponent should be a scalar integer and less or equal to 64. | |
+| **ReduceMean** |6 - * |z16 |- `keepdims` must be 1.<br>- Input tensor must be 4D tensors and `axis` must be [2, 3]. | |
+| **Relu** |6 - * |z16 |Input tensor must be less than or equal to 4 dimensions. | |
+| **Sigmoid** |6 - * |z16 |Input tensor must be less than or equal to 4 dimensions. | |
+| **Softmax** |6 - * |z16 |- `axis` must be the last dimension, i.e. `rank - 1` or -1. | |
+| **Softplus** |6 - * |z16 |The operations immediately before and after the Softplus operation must be executed on the NNPA. Otherwise, Softplus is executed on the CPU. This limitation is set to avoid performance degradation. | |
+| **Sub** |6 - * |z16 |- Shape of input tensors should be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **Sum** |6 - * |z16 |- All inputs must have the same static shape (Broadcasting not supported.)<br>- Single input not supported. | |
+| **Tanh** |6 - * |z16 |Input tensor must be less than or equal to 4 dimensions. | |
diff --git a/docs/Testing.md b/docs/Testing.md
@@ -122,9 +122,9 @@ cmake --build . --config Release --target check-onnx-backend-signature
 
 ### Enable SIMD instructions
 
-On supported platforms, currently s390x only, backend tests can generate SIMD instructions for the compiled models. To enable SIMD, set the TEST_MCPU environment variable, e.g.,
+On supported platforms (currently s390x z14 and up, x86, and arm), backend tests can generate SIMD instructions for the compiled models. To enable SIMD, set the `TEST_MARCH` environment variable, e.g.,
 ```
-TEST_MCPU=z14 cmake --build . --config Release --target check-onnx-backend[-jni]
+TEST_MARCH=z16 cmake --build . --config Release --target check-onnx-backend[-jni]
 ```
 
 ### Execution of backend tests
@@ -294,9 +294,9 @@ If you need to change ATOL and RTOL for accuracy checks, set the environment var
 
 ### Enable SIMD instructions
 
-On supported platforms, currently s390x only, numerical tests can generate SIMD instructions for the compiled models. To enable SIMD, set the `TEST_ARGS` environment variable, e.g.,
+On supported platforms (currently s390x z14 and up, x86, and arm), numerical tests can generate SIMD instructions for the compiled models. To enable SIMD, set the `TEST_ARGS` environment variable, e.g.,
 ```
-TEST_ARGS="-mcpu=z14" CTEST_PARALLEL_LEVEL=$(nproc) cmake --build . --config Release --target check-onnx-numerical
+TEST_ARGS="-march=z16" CTEST_PARALLEL_LEVEL=$(nproc) cmake --build . --config Release --target check-onnx-numerical
 ```
 
 ### Testing of specific accelerators
@@ -395,7 +395,7 @@ Without specifying a model using `-m`, the script will check all models in the O
 
 If you want to gather performance info about a model zoo (or any models, for that matter), simplest is to request the desired statistic at compile time (using `-profile-ir` flag), divert the output statistic to a file, and then analyze it using `make-report.py`. For example:
 ```
-> ONNX_MLIR_INSTRUMENT_FILE=run.log RunONNXModelZoo.py -c "-O3 -march=arm64 --profile-ir=Onnx" -m bertsquad-10
+> ONNX_MLIR_INSTRUMENT_FILE=run.log RunONNXModelZoo.py -c "-O3 --march=arm64 --profile-ir=Onnx" -m bertsquad-10
 ...
 > make-report.py -r run.log
 ...
@@ -408,7 +408,7 @@ Statistics start (all ops).
 
 The runtime profiling info can be combined with specific compile-time statistics as well. Let's say that we are interested in SIMD statistics. We inform the compiler of the compile-time statistic to emit using `-opt-report` option, and inform `RunONNXModelZoo.py` that we want to preserve the compiler output using the `--log-to-file` option. For example
 ```
-> ONNX_MLIR_INSTRUMENT_FILE=run.log RunONNXModelZoo.py -c "-O3 -march=arm64 -opt-report=Simd --profile-ir=Onnx" -m bertsquad-10 --log-to-file compile.log
+> ONNX_MLIR_INSTRUMENT_FILE=run.log RunONNXModelZoo.py -c "-O3 --march=arm64 -opt-report=Simd --profile-ir=Onnx" -m bertsquad-10 --log-to-file compile.log
 ...
 > make-report.py -c compile.log -r run.log
 ...