From b661e8b692a6591137daa56d38066b0db7afdb4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tarek=20Ziad=C3=A9?=
Date: Thu, 12 Dec 2024 11:42:18 +0100
Subject: [PATCH] Use the new int8Multiply API

---
 .../quantization/firefox_matmul_integer.cc    | 24 +-
 .../cpu/quantization/firefox_matmul_integer.h | 269 +-----------------
 2 files changed, 20 insertions(+), 273 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.cc b/onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.cc
index 94e2eb374c9ed..810d3bbba1879 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.cc
@@ -114,24 +114,22 @@ Status FirefoxMatMulInteger8::Compute(OpKernelContext* ctx) const {
   std::vector<float> float_output(helper.M() * helper.N(), 0.0f);

   // Call the function
-  // matix A (M x K) * matrix B (K x N)
+  // matrix A (M x K) * matrix B (K x N)
   // matrix C (M x N)
   size_t rows_a = static_cast<size_t>(helper.M());
   size_t cols_b = static_cast<size_t>(helper.N());
   size_t width = static_cast<size_t>(helper.K());
-  int8MultiplyAndAddBias(reinterpret_cast<const int8_t*>(a_data),
-                         1.0f,  // scale factor for A
-                         a_offset,
-                         reinterpret_cast<const int8_t*>(b_data),
-                         1.0f,  // scale factor for B
-                         0,     // b_zero_point
-                         0,     // we don't have any bias
-                         1.0f,  // quantization multiplier
-                         rows_a,  // rows A
-                         width,   // width
-                         cols_b,  // col B
-                         reinterpret_cast<float*>(y_data));
+  // gemmology only supports A unsigned x B signed
+  int8Multiply(reinterpret_cast<const int8_t*>(a_data),
+               a_offset,
+               reinterpret_cast<const int8_t*>(b_data),
+               0,  // b_zero_point
+               rows_a,  // rows A
+               width,   // width
+               cols_b,  // col B
+               reinterpret_cast<float*>(y_data));

   // Print the output
 #if 0
diff --git a/onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.h b/onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.h
index d20a1aa123397..3b8066a405f7a 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.h
+++ b/onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.h
@@ -36,271 +36,20 @@ class FirefoxMatMulInteger8 final : public MatMulIntegerBase {

 #include

-/** Main interface for integer matrix multiplication followed by addition of bias for wasm.
- *
- * C = A * B + Bias
- *
- * Input matrix A:
- * - is a 2-D matrix that typically represents activations as floating point values
- * - no. of rows should be a multiple of 1 (i.e. no restriction)
- * - no. of columns should be a multiple of 64
- * - is represented as array (contiguous memory locations) in row-major format
- *
- * Input matrix B:
- * - is a 2-D matrix that typically represents fixed model parameters as floating point values
- * - no. of rows should be:
- *   -- equal to no. of columns of Input matrix A
- *   -- a multiple of 64
- * - no. of columns should be a multiple of 8
- * - is represented as array (contiguous memory locations) in row-major format
- *
- * Please note that it is also possible to pass Input matrix B in 2 more forms:
- * - One that is already a quantized and transposed version of Input matrix B
- * - Other that is already a transposed version of Input matrix B
- *
- * Input Bias:
- * - is an array (contiguous memory locations) that represents bias
- * - size of the array should be equal to the no. of columns of Input matrix B
- *
- * Output matrix C:
- * - is a 2-D matrix that represents the result (= A * B + Bias)
- * - no. of rows will be equal to no. of rows of Input matrix A
- * - no. of columns will be equal to no. of columns of Input matrix B (in untransposed form)
- * - is represented as array (contiguous memory locations) in row-major format
- *
- * Please note that most of the functions in this interface might have architecture specific
- * implementations.
- *
- * Conventions followed throughout this file:
- * - Unless explicitly mentioned, Input matrix B always means an unquantized (i.e. float values)
- *   and non-transposed version
- * - no. of rows of Input matrix A = `rows_A`
- * - no. of columns of Input matrix A = no. of rows of Input matrix B = `width`
- * - no. of columns of Input matrix B = `cols_B`
- */

 #include

 using Index = uint32_t;
-
-/**
- * Prepare B for the Matrix Multiply function from Input matrix B.
- *
- * Quantization is performed on the input.
- * The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
- * function (`int8MultiplyAndAddBias`).
- *
- * Please note that this interface might have architecture specific implementation.
- *
- * @param[in]  input_B     An array representing the Input matrix B in row-major format.
- *                         Size of the array = `width` * `cols_B`.
- *                         Shape of the matrix: (`width`, `cols_B`)
- * @param[in]  scale       The scaling factor (for quantization)
- * @param[in]  zero_point  The zero point (for quantization)
- * @param[in]  width       No. of rows of Input matrix B. It should be a multiple of 64.
- * @param[in]  cols_B      No. of columns of Input matrix B. It should be a multiple of 8.
- * @param[out] output      An array representing the prepared B matrix.
- *                         Size of the array = `width` * `cols_B`.
- */
-extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_b")))
-int8PrepareB(const float* input_B,
-             float scale,
-             float zero_point,
-             Index width,
-             Index cols_B,
-             int8_t* output);
-
-/**
- * Prepare B for the Matrix Multiply function from transposed version of Input matrix B.
- *
- * Quantization is performed on floating values of input.
- * The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
- * function (`int8MultiplyAndAddBias`).
- *
- * Please note that this interface might have architecture specific implementation.
- *
- * @param[in]  input_B_transposed  An array representing transposed version of Input matrix B.
- *                                 It is in column-major format.
- *                                 Size of the array = `width` * `cols_B`.
- *                                 Shape of the matrix: (`cols_B`, `width`)
- * @param[in]  scale               The scaling factor (for quantization)
- * @param[in]  zero_point          The zero point (for quantization)
- * @param[in]  width               No. of rows of Input matrix B. It should be a multiple of 64.
- * @param[in]  cols_B              No. of columns of Input matrix B. Should be a multiple of 8.
- * @param[out] output              An array representing the prepared B matrix.
- *                                 Size of the array = `width` * `cols_B`.
- */
-extern "C" void
-    __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_b_from_transposed")))
-    int8PrepareBFromTransposed(const float* input_B_transposed,
-                               float scale,
-                               float zero_point,
-                               Index width,
-                               Index cols_B,
-                               int8_t* output);
-
-/**
- * Prepare B for the Matrix Multiply function from a quantized and transposed version of Input
- * matrix B which is also in a CPU-independent format.
- *
- * The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
- * function (`int8MultiplyAndAddBias`).
- *
- * This function is useful while using the quantized models that are stored in a CPU-independent
- * format on the disk.
- *
- * @param[in]  input_B_quant_transposed  An array representing the quantized and transposed
- *                                       version of Input matrix B. It is in column-major format.
- *                                       Size of the array = `width` * `cols_B`.
- *                                       Shape of the matrix: (`cols_B`, `width`)
- * @param[in]  width                     No. of rows of Input matrix B. Should be multiple of 64
- * @param[in]  cols_B                    No. of columns of Input matrix B. Should be multiple of 8
- * @param[out] output                    An array representing the prepared B matrix.
- *                                       Size of the array = `width` * `cols_B`.
- */
-extern "C" void __attribute__((import_module("wasm_gemm"),
-                               import_name("int8_prepare_b_from_quantized_transposed")))
-int8PrepareBFromQuantizedTransposed(const int8_t* input_B_quant_transposed,
-                                    Index width,
-                                    Index cols_B,
-                                    int8_t* output);
-
-/**
- * Prepare A for the Matrix Multiply function from Input matrix A.
- *
- * It performs quantization on floating values of input.
- * The final prepared A might be architecture dependent. e.g. On some architectures like x86, it
- * might be unsigned (achieved by adding 127 to quantized values) while on others like Arm, it might
- * be signed.
- * The final prepared A can be used as an input to matrix multiply function
- * (`int8MultiplyAndAddBias`).
- *
- * Please note that this interface might have architecture specific implementation.
- *
- * @param[in]  input_A     An array representing the Input matrix A in row-major format.
- *                         Size of the array = `rows_A` * `width`.
- *                         Shape of the matrix: (`rows_A`, `width`)
- * @param[in]  scale       The scaling factor (for quantization)
- * @param[in]  zero_point  The zero point (for quantization)
- * @param[in]  rows_A      No. of rows of Input matrix A. No restriction on its size.
- * @param[in]  width       No. of columns of Input matrix A. It should be a multiple of 64.
- * @param[out] output      An array representing the prepared A matrix.
- *                         Size of the array = `rows_A` * `width`.
- */
-extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_a")))
-int8PrepareA(const float* input_A,
-             float scale,
-             float zero_point,
-             Index rows_A,
-             Index width,
-             int8_t* output);
-
-/**
- * Prepares bias for the Matrix Multiply function.
- *
- * It uses the prepared B (which must be obtained by using any of the int8PrepareB* functions) and
- * a bias input to prepare the final bias.
- *
- * The final bias can be used as an input to matrix multiply function (`int8MultiplyAndAddBias`).
- *
- * @param[in]  input_B_prepared  An array representing the prepared B matrix.
- *                               Size of the array = `width` * `cols_B`.
- * @param[in]  scale_A           The scaling factor (for quantization) of A
- * @param[in]  zero_point_A      The zero point (for quantization) of A
- * @param[in]  scale_B           The scaling factor (for quantization) of B
- * @param[in]  zero_point_B      The zero point (for quantization) of B
- *                               factor that is prepared from `scale_A` and `scale_B`.
- * @param[in]  width             No. of rows of Input matrix B (unquantized & non-transposed).
- *                               It should be a multiple of 64.
- * @param[in]  cols_B            No. of columns of Input matrix B (unquantized & non-transposed)
- *                               It should be a multiple of 8.
- * @param[in]  input_bias        An array representing the input bias. Size of array = `cols_B`
- * @param[out] output            An array representing the final prepared bias.
- *                               Size of the array = `cols_B`
- */
-extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_bias")))
-int8PrepareBias(const int8_t* input_B_prepared,
-                float scale_A,
-                float zero_point_A,
-                float scale_B,
-                float zero_point_B,
-                Index width,
-                Index cols_B,
-                const float* input_bias,
-                float* output);
-
-/**
- * Perform multiplication of 2 matrices followed by adding a bias.
- *
- * i.e Output = A_prepared * B_prepared + Bias_prepared
- *
- * The inputs A_prepared, B_prepared and Bias_prepared of this function must be
- * obtained by using `int8PrepareA`, one of the `int8PrepareB*` and `int8PrepareBias`
- * functions respectively.
- *
- * Please note that this interface might have architecture specific implementation.
- *
- * @param[in]  input_A_prepared     An array representing the prepared A matrix.
- *                                  This must be obtained by using `int8PrepareA` function.
- *                                  Size of the array = `rows_A` * `width`.
- * @param[in]  scale_A              The scaling factor (for quantization) of A
- * @param[in]  zero_point_A         The zero point (for quantization) of A
- * @param[in]  input_B_prepared     An array representing the prepared B matrix.
- *                                  This must be obtained by using one of `int8PrepareB*`
- *                                  functions. Size of the array = `width` * `cols_B`.
- * @param[in]  scale_B              The scaling factor (for quantization) of B
- * @param[in]  zero_point_B         The zero point (for quantization) of B
- * @param[in]  input_bias_prepared  An array representing the prepared bias.
- *                                  This must be obtained by using `int8PrepareBias` function.
- *                                  Size of the array = `cols_B`
- * @param[in]  unquant_multiplier   A value that will be multiplied to the final unquantization
- *                                  factor that is prepared from `scale_A` and `scale_B`.
- * @param[in]  rows_A               No. of rows of Input matrix A. No restriction on its size.
- * @param[in]  width                No. of columns of Input matrix A (same as no. of columns of
- *                                  Input matrix B). It should be a multiple of 64.
- * @param[in]  cols_B               No. of columns of Input matrix B. Should be a multiple of 8.
- * @param[out] output               An array representing the result matrix in row-major format.
- *                                  Size of the array = `rows_A` * `cols_B`.
- */
 extern "C" void
-    __attribute__((import_module("wasm_gemm"), import_name("int8_multiply_and_add_bias")))
-    int8MultiplyAndAddBias(const int8_t* input_A_prepared,
-                           float scale_A,
-                           float zero_point_A,
-                           const int8_t* input_B_prepared,
-                           float scale_B,
-                           float zero_point_B,
-                           const float* input_bias_prepared,
-                           float unquant_multiplier,
-                           Index rows_A,
-                           Index width,
-                           Index cols_B,
-                           float* output);
-
-/**
- * Select a subset of columns of prepared B.
- *
- * Indices of the columns to be selected are specified by an array.
- *
- * @param[in]  input_B_prepared  An array representing the prepared B matrix.
- *                               This must be obtained by using one of the `int8PrepareB*`
- *                               functions Size of the array = `width` * `cols_B`.
- * @param[in]  width             No. of rows of Input matrix B. It should be a multiple of 64.
- * @param[in]  cols_B            No. of columns of Input matrix B. It should be a multiple of 8.
- * @param[in]  cols              An array of column indices to be selected from prepared B.
- *                               All indices of the array should be valid. i.e.
- *                               0 <= cols[N] < cols_B where N = 0, 1, 2 .... (`num_cols`-1)
- * @param[in]  num_cols          Size of the `cols` array. It should be a multiple of 8.
- * @param[out] output            An array representing the selected columns of prepared B.
- *                               Size of the array = `width` * `num_cols`.
- */
-extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_select_columns_of_b")))
-int8SelectColumnsOfB(const int8_t* input_B_prepared,
-                     Index width,
-                     Index cols_B,
-                     const Index* cols,
-                     const Index num_cols,
-                     int8_t* output);
+    __attribute__((import_module("wasm_gemm"), import_name("int8_multiply")))
+    int8Multiply(const int8_t* input_A,
+                 float zero_point_A,
+                 const int8_t* input_B,
+                 float zero_point_B,
+                 Index rows_A,
+                 Index width,
+                 Index cols_B,
+                 float* output);

 #endif
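
Notes (not part of the applied diff):

A minimal scalar sketch of what the Compute() hunk above now expects the
"wasm_gemm"/"int8_multiply" import to produce, assuming conventional
zero-point semantics: A is read as unsigned bytes, B as signed bytes, each
term is corrected by its zero point, accumulation happens in int32, and the
result is stored as float. The name int8MultiplyReference and the scalar
loop are illustrative only; gemmology's actual kernels are vectorized and
may differ in rounding and saturation behavior.

#include <cstdint>

// Hypothetical reference model of the int8_multiply import.
void int8MultiplyReference(const int8_t* input_A, float zero_point_A,
                           const int8_t* input_B, float zero_point_B,
                           uint32_t rows_A, uint32_t width, uint32_t cols_B,
                           float* output) {
  // A unsigned x B signed, per the comment in the .cc hunk.
  const uint8_t* A = reinterpret_cast<const uint8_t*>(input_A);
  for (uint32_t m = 0; m < rows_A; ++m) {
    for (uint32_t n = 0; n < cols_B; ++n) {
      int32_t acc = 0;
      for (uint32_t k = 0; k < width; ++k) {
        acc += (static_cast<int32_t>(A[m * width + k]) - static_cast<int32_t>(zero_point_A)) *
               (static_cast<int32_t>(input_B[k * cols_B + n]) - static_cast<int32_t>(zero_point_B));
      }
      output[m * cols_B + n] = static_cast<float>(acc);
    }
  }
}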
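For contrast, the retired interface documented in the deleted header comments
required three preparation calls before the multiply. A sketch of that call
order, reconstructed from the removed doc comments and assuming those
declarations (including `using Index = uint32_t;`) are still in scope; the
shapes honor the documented constraints (width a multiple of 64, cols_B a
multiple of 8), and the scale, zero-point, and multiplier values are
illustrative placeholders.

#include <cstdint>
#include <vector>

void retiredPipelineSketch(const float* A_float, const float* B_float,
                           const float* bias) {
  const Index rows_A = 1, width = 64, cols_B = 8;  // minimal legal shapes
  const float scale = 1.0f, zero_point = 0.0f;     // illustrative values
  const float unquant_multiplier = 1.0f;           // illustrative value

  std::vector<int8_t> A_prep(rows_A * width), B_prep(width * cols_B);
  std::vector<float> bias_prep(cols_B), C(rows_A * cols_B);

  int8PrepareA(A_float, scale, zero_point, rows_A, width, A_prep.data());
  int8PrepareB(B_float, scale, zero_point, width, cols_B, B_prep.data());
  int8PrepareBias(B_prep.data(), scale, zero_point, scale, zero_point,
                  width, cols_B, bias, bias_prep.data());
  int8MultiplyAndAddBias(A_prep.data(), scale, zero_point,
                         B_prep.data(), scale, zero_point,
                         bias_prep.data(), unquant_multiplier,
                         rows_A, width, cols_B, C.data());  // C = A * B + Bias
}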
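The "gemmology only supports A unsigned x B signed" comment implies that
signed int8 activations cannot be fed in directly as A. One hypothetical way
to adapt them (not something this patch does) is to rebias the buffer into
unsigned range and shift the zero point by the same constant, which leaves
every (a - zero_point) term unchanged; the removed int8PrepareA doc noted
that x86 backends used a similar +127 shift internally.

#include <cstdint>
#include <cstddef>
#include <vector>

// Hypothetical adapter: map signed int8 activations onto the unsigned range
// expected for A. Shifting values and zero point by the same constant keeps
// (a - zero_point) invariant, so the matmul result is unaffected.
std::vector<uint8_t> toUnsignedA(const int8_t* a, size_t len, float& zero_point) {
  std::vector<uint8_t> out(len);
  for (size_t i = 0; i < len; ++i) {
    out[i] = static_cast<uint8_t>(static_cast<int32_t>(a[i]) + 128);  // [-128,127] -> [0,255]
  }
  zero_point += 128.0f;  // compensate so (a - zero_point) is unchanged
  return out;
}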