[CoreML] ML Program more ops
 cast argmax gelu cast ln gn in
wejoncy committed Oct 17, 2024 · 1 parent ac98bca · commit 80119af
Showing 18 changed files with 903 additions and 159 deletions.
@@ -31,10 +31,10 @@ enum COREMLFlags {
// Create an MLProgram. By default it will create a NeuralNetwork model. Requires Core ML 5 or later.
COREML_FLAG_CREATE_MLPROGRAM = 0x010,

// Exclude the ANE, as sometimes this decreases performance.
// https://developer.apple.com/documentation/coreml/mlcomputeunits?language=objc
// There are four compute units:
// MLComputeUnitsCPUAndNeuralEngine|MLComputeUnitsCPUAndGPU|MLComputeUnitsCPUOnly|MLComputeUnitsAll
// Different compute units have different performance and power consumption.
COREML_FLAG_USE_CPU_AND_GPU = 0x020,
// Keep COREML_FLAG_LAST at the end of the enum definition
// And assign the last COREMLFlag to it
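
For context, a minimal sketch of how these flags can be combined when appending the CoreML EP through the C API. This is an illustration only: "model.onnx" is a placeholder path, and the append call is the one declared alongside this enum in coreml_provider_factory.h.

#include <onnxruntime_cxx_api.h>
#include <coreml_provider_factory.h>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  // Request an ML Program model and keep execution off the ANE (CPU+GPU only).
  uint32_t coreml_flags = COREML_FLAG_CREATE_MLPROGRAM | COREML_FLAG_USE_CPU_AND_GPU;
  Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(so, coreml_flags));
  Ort::Session session(env, "model.onnx", so);  // placeholder model path
  return 0;
}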
@@ -84,6 +84,7 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.activation
std::string_view coreml_op_type;
bool add_alpha = false;
bool add_gelu_mode = false;
if (op_type == "Sigmoid") {
coreml_op_type = "sigmoid";
} else if (op_type == "Tanh") {
@@ -93,6 +94,9 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
} else if (op_type == "LeakyRelu") {
coreml_op_type = "leaky_relu";
add_alpha = true;
} else if (op_type == "Gelu") {
coreml_op_type = "gelu";
add_gelu_mode = true;
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type);
@@ -112,6 +116,16 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationInput(*op, "alpha", model_builder.AddScalarConstant(op->type(), "alpha", MLFloat16(alpha)));
}
}
if (add_gelu_mode) {
NodeAttrHelper helper(node);
std::string approximate = helper.Get("approximate", std::string("EXACT"));
if (approximate == "tanh") {
approximate = "TANH_APPROXIMATION";
} else if (approximate == "none") {
approximate = "EXACT";
}
AddOperationInput(*op, "mode", model_builder.AddScalarConstant(op->type(), "mode", std::string(approximate)));
}

AddOperationOutput(*op, *node.OutputDefs()[0]);
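
For reference, the two emitted modes correspond to the standard GELU definitions: EXACT uses the Gaussian CDF, and TANH_APPROXIMATION the usual tanh fit:

\mathrm{GELU}_{\mathrm{exact}}(x) = x\,\Phi(x) = \tfrac{x}{2}\bigl(1 + \operatorname{erf}(x/\sqrt{2})\bigr)

\mathrm{GELU}_{\mathrm{tanh}}(x) \approx \tfrac{x}{2}\Bigl(1 + \tanh\bigl(\sqrt{2/\pi}\,(x + 0.044715\,x^{3})\bigr)\Bigr)

(Per the coremltools docs linked above, gelu also defines a SIGMOID_APPROXIMATION mode; this mapping never emits it.)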

@@ -212,6 +226,15 @@ bool IsPReluOpSupported(const Node& node, const OpBuilderInputParams& input_para
bool ActivationOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
const logging::Logger& logger) const {
const auto& op_type = node.OpType();
#if !defined(COREML_ENABLE_MLPROGRAM)
if (op_type == "Gelu") {
return false;
}
#endif

if (op_type == "Gelu" && !input_params.create_mlprogram) {
return false;
}

#if defined(COREML_ENABLE_MLPROGRAM)
if (input_params.create_mlprogram) {
@@ -245,6 +268,7 @@ void CreateActivationOpBuilder(const std::string& op_type, OpBuilderRegistration
"Relu",
"PRelu",
"LeakyRelu",
"Gelu",
};

op_registrations.builders.push_back(std::make_unique<ActivationOpBuilder>());
107 changes: 80 additions & 27 deletions onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
@@ -3,6 +3,7 @@

#include "core/providers/coreml/builders/impl/base_op_builder.h"
#include "core/providers/coreml/builders/model_builder.h"
#include "core/providers/coreml/builders/impl/builder_utils.h"
#include "core/providers/coreml/builders/op_builder_factory.h"
#include "core/providers/shared/utils/utils.h"

@@ -15,6 +16,9 @@ class ArgMaxOpBuilder : public BaseOpBuilder {

bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
const logging::Logger& logger) const override;

public:
bool SupportsMLProgram() const override { return true; }
};

Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
@@ -24,41 +28,69 @@ Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const auto& graph_viewer = model_builder.GetGraphViewer();

NodeAttrHelper helper(node);
const int64_t axis = helper.Get("axis", 0);
const int64_t keepdims = helper.Get("keepdims", 1);
const bool removedim = keepdims != 1;

#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;

⚠ cpplint (argmax_op_builder.cc:37): Do not use namespace using-directives. Use using-declarations instead. [build/namespaces] [5]
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.reduction

std::unique_ptr<Operation> op = model_builder.CreateOperation(node, "reduce_argmax");

⚠ cpplint (argmax_op_builder.cc:40): Add #include <memory> for unique_ptr<>. [build/include_what_you_use] [4]
AddOperationInput(*op, "x", node.InputDefs()[0]->Name());
AddOperationInput(*op, "axis", model_builder.AddScalarConstant(op->type(), "axis", axis));
AddOperationInput(*op, "keep_dims", model_builder.AddScalarConstant(op->type(), "keep_dims", bool(keepdims)));

⚠ cpplint (argmax_op_builder.cc:43): Using deprecated casting style. Use static_cast<bool>(...) instead. [readability/casting] [4]
if (node.GetOutputEdgesCount() == 1) {
auto it = node.OutputEdgesBegin();
const auto* next_node_in_partition = &(it->GetNode());
// If ArgMax's successor node is a Cast from int64 to int32, we fuse the two
if (next_node_in_partition != nullptr && next_node_in_partition->OpType() == "Cast") {
// Skip the cast's input/argmax's output
AddOperationOutput(*op, *next_node_in_partition->OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
return Status::OK();
}
}
// Shall we add a cast here?
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary.cast
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
} else

⚠ cpplint (argmax_op_builder.cc:59): If an else has a brace on one side, it should have it on both. [readability/braces] [5]
⚠ cpplint (argmax_op_builder.cc:59): If/else bodies with multiple statements require braces. [readability/braces] [4]
#endif // (COREML_ENABLE_MLPROGRAM)
{
auto* coreml_argmax = layer->mutable_argmax();
coreml_argmax->set_axis(axis);
coreml_argmax->set_removedim(removedim);

// There are two cases here:
// 1. Special Case (ArgMax-Cast(from int64 to int32)), we fuse the Argmax's output/Cast's input
// (We still have this special case here because CoreML model does not have Cast)
// 2. Otherwise, we add Argmax layer normally
if (node.GetOutputEdgesCount() == 1) {
auto it = node.OutputEdgesBegin();
const auto* next_node_in_partition = graph_viewer.GetNode(it->GetNode().Index());
// If ArgMax's successor node is a Cast from int64 to int32 output
// The 'cast to' type is checked when determining operator support (see CastOpBuilder::IsOpSupportedImpl())
// so we omit the check here
if (next_node_in_partition != nullptr && next_node_in_partition->OpType() == "Cast") {
// Skip the cast's input/argmax's output
*layer->mutable_input()->Add() = node.InputDefs()[0]->Name();
*layer->mutable_output()->Add() = next_node_in_partition->OutputDefs()[0]->Name();
model_builder.AddLayer(std::move(layer));
return Status::OK();
}
}

*layer->mutable_input()->Add() = node.InputDefs()[0]->Name();
*layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();

model_builder.AddLayer(std::move(layer));

⚠ cpplint (argmax_op_builder.cc:88): Add #include <utility> for move. [build/include_what_you_use] [4]
}
return Status::OK();
}
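
An aside on the `} else` / `#endif` / `{` pattern above (the one cpplint flags): when COREML_ENABLE_MLPROGRAM is not defined, the `if`/`else` is compiled out entirely and the brace block that follows runs unconditionally as the NeuralNetwork fallback. A reduced, self-contained illustration of the same idiom, with hypothetical names:

#include <iostream>

#define HAS_FAST_PATH  // remove this define and the fallback block runs unconditionally

void Run(bool use_fast) {
#if defined(HAS_FAST_PATH)
  if (use_fast) {
    std::cout << "fast path\n";
  } else
#endif
  {  // else-body when HAS_FAST_PATH is defined; a plain block otherwise
    std::cout << "fallback path\n";
  }
}

int main() {
  Run(true);   // "fast path" (or "fallback path" if HAS_FAST_PATH is undefined)
  Run(false);  // "fallback path"
  return 0;
}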

bool ArgMaxOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
const logging::Logger& logger) const {
// Attribute `select_last_index` of ArgMax op is not supported
NodeAttrHelper helper(node);
@@ -86,6 +118,27 @@ bool ArgMaxOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa
}
}

#if defined(COREML_ENABLE_MLPROGRAM)
if (input_params.create_mlprogram) {
if (node.GetOutputEdgesCount() == 1) {
auto it = node.OutputEdgesBegin();
const auto& op_type = it->GetNode().OpType();
if (op_type == "Cast") {
// Check if the output type of cast node is int32
NodeAttrHelper output_helper(it->GetNode());
const auto cast_to_type = output_helper.Get("to", ONNX_NAMESPACE::TensorProto::UNDEFINED);
if (cast_to_type == ONNX_NAMESPACE::TensorProto::INT32) {
return true;
} else {
return false;
}
}
} else {
return false;
}
}
#endif

return true;
}
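
To make the fused pattern concrete, a small standalone sketch (illustrative values only) of what ArgMax(axis=1, keepdims=0) followed by Cast(to=int32) computes for a (2, 3) input; the CoreML reduce_argmax op produces this in one step, with the Cast folded into the output:

#include <array>
#include <cstdint>
#include <iostream>

int main() {
  const std::array<std::array<float, 3>, 2> x{{{1.f, 5.f, 2.f}, {7.f, 3.f, 4.f}}};
  for (const auto& row : x) {
    int32_t best = 0;  // int32 result, as after the folded Cast
    for (int32_t j = 1; j < 3; ++j) {
      if (row[j] > row[best]) best = j;
    }
    std::cout << best << '\n';  // prints 1, then 0
  }
  return 0;
}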

@@ -18,8 +18,9 @@ namespace coreml {
static std::set<std::string> Float16Ops = {
"Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal",
"Sigmoid", "Tanh", "Relu", "LeakyRelu", "Concat", "GridSample", "GlobalAveragePool",
"Clip", "DepthToSpace", "Resize", "Slice", "Conv",
"ConvTranspose", "GlobalMaxPool", "Gemm", "MatMul",
"Clip", "DepthToSpace", "Resize", "Slice", "Conv", "Cast", "BatchNormalization",
"ConvTranspose", "GlobalMaxPool", "Gemm", "MatMul", "ArgMax", "Gelu",
"LayerNormalization", "InstanceNormalization", "GroupNormalization",
"AveragePool", "MaxPool", "Reshape", "Split", "Transpose"};

namespace {
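
For context, the set above gates which ops may receive float16 inputs when building an ML Program. A minimal sketch of the kind of membership check it supports; the helper below is hypothetical, not the EP's actual API:

#include <set>
#include <string>

// Hypothetical helper: true if op_type may take float16 inputs in an ML Program.
bool SupportsFloat16Input(const std::set<std::string>& float16_ops,
                          const std::string& op_type) {
  return float16_ops.count(op_type) > 0;
}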
@@ -24,6 +24,9 @@ class BatchNormalizationOpBuilder : public BaseOpBuilder {

// BatchNormalization opset 6- has unsupported attributes
int GetMinSupportedOpSet(const Node& /* node */) const override { return 7; }

public:
bool SupportsMLProgram() const override { return true; }
};

void BatchNormalizationOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const {
@@ -50,21 +53,46 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu
const auto eps = helper.Get("epsilon", 1e-5f);
const auto channels = scale_tensor.dims()[0];

#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;

⚠ cpplint (batch_norm_op_builder.cc:58): Do not use namespace using-directives. Use using-declarations instead. [build/namespaces] [5]
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.normalization.batch_norm

std::unique_ptr<Operation> op = model_builder.CreateOperation(node, "batch_norm");
AddOperationInput(*op, "x", input_defs[0]->Name());
AddOperationInput(*op, "mean", model_builder.AddConstant(op->type(), input_defs[3]->Name() + "mean", mean_tensor));
AddOperationInput(*op, "variance", model_builder.AddConstant(op->type(), input_defs[4]->Name() + "variance", var_tensor));

⚠ cpplint (batch_norm_op_builder.cc:64): Lines should be <= 120 characters long. [whitespace/line_length] [2]
AddOperationInput(*op, "gamma", model_builder.AddConstant(op->type(), input_defs[1]->Name(), scale_tensor));
AddOperationInput(*op, "beta", model_builder.AddConstant(op->type(), input_defs[2]->Name(), bias_tensor));
auto input_dtype = input_defs[0]->TypeAsProto()->tensor_type().elem_type();
if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
MLFloat16 epsilon_fp16(eps);
AddOperationInput(*op, "epsilon", model_builder.AddScalarConstant(op->type(), "epsilon", epsilon_fp16));
} else {
AddOperationInput(*op, "epsilon", model_builder.AddScalarConstant(op->type(), "epsilon", eps));
}

AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
} else

⚠ cpplint (batch_norm_op_builder.cc:77): If an else has a brace on one side, it should have it on both. [readability/braces] [5]
⚠ cpplint (batch_norm_op_builder.cc:77): If/else bodies with multiple statements require braces. [readability/braces] [4]
#endif // (COREML_ENABLE_MLPROGRAM)
{
auto* coreml_batch_norm = layer->mutable_batchnorm();
coreml_batch_norm->set_channels(channels);
coreml_batch_norm->set_epsilon(eps);
coreml_batch_norm->set_computemeanvar(false);
coreml_batch_norm->set_instancenormalization(false);

ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_batch_norm->mutable_gamma(), scale_tensor)); // scale
ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_batch_norm->mutable_beta(), bias_tensor)); // B
ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_batch_norm->mutable_mean(), mean_tensor)); // mean
ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_batch_norm->mutable_variance(), var_tensor)); // var

*layer->mutable_input()->Add() = input_defs[0]->Name();
*layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();

model_builder.AddLayer(std::move(layer));
}
return Status::OK();
}
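
For reference, batch_norm (like ONNX BatchNormalization in inference mode) normalizes each channel c with the stored statistics and learned parameters wired up above:

y_c = \gamma_c \,\frac{x_c - \mu_c}{\sqrt{\sigma_c^{2} + \varepsilon}} + \beta_c

where \mu and \sigma^{2} are the mean/variance initializers, \gamma/\beta the scale/bias, and \varepsilon the epsilon scalar whose dtype the code above matches to the input (fp16 vs fp32).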
