Commit 9ba3637

update

1 parent 435f1d3 commit 9ba3637
8 files changed, +129 -117 lines changed


onnxruntime/contrib_ops/cuda/collective/custom_reduce_impl.cu (+46 -1)

@@ -16,6 +16,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/providers/cuda/cu_inc/common.cuh"
 #include "core/providers/shared_library/provider_api.h"
 #include "custom_reduce_impl.h"
 #include <algorithm>
@@ -27,6 +28,9 @@ namespace ort_trtllm {
 
 #if defined(USE_MPI) || defined(USE_NCCL)
 
+using namespace onnxruntime;
+using namespace onnxruntime::cuda;
+
 // Calculates ceil(a / b). User must be careful to ensure that there
 // is no overflow or underflow in the calculation.
 template <typename T>
@@ -559,13 +563,54 @@ size_t GetMaxRequiredWorkspaceSize(int world_size) {
   return 8 * 1000 * 1000;
 }
 
-AllReduceStrategyType SelectImplementation(size_t message_size, int world_size, onnxruntime::MLDataType type) {
+Status SetPeerAccess(int rank, int world_size, bool enable, int& can_access_peer) {
+  const int src_node = rank;
+
+  for (int dst_node = 0; dst_node < world_size; dst_node++) {
+    if (dst_node == src_node) {
+      continue;
+    }
+
+    CUDA_RETURN_IF_ERROR(cudaDeviceCanAccessPeer(&can_access_peer, src_node, dst_node));
+
+    if (!can_access_peer) {
+      return Status::OK();
+    }
+
+    if (enable) {
+      cudaDeviceEnablePeerAccess(dst_node, 0);
+    } else {
+      cudaDeviceDisablePeerAccess(dst_node);
+    }
+
+    auto const error = cudaGetLastError();
+    if (error != cudaErrorPeerAccessAlreadyEnabled && error != cudaErrorPeerAccessNotEnabled) {
+      CUDA_RETURN_IF_ERROR(error);
+    }
+  }
+
+  return Status::OK();
+}
+
+AllReduceStrategyType SelectImplementation(size_t message_size, int rank, int world_size,
+                                           onnxruntime::MLDataType type) {
   AllReduceStrategyType strategy = AllReduceStrategyType::NCCL;
   if (type != onnxruntime::DataTypeImpl::GetType<float>() &&
       type != onnxruntime::DataTypeImpl::GetType<onnxruntime::MLFloat16>()) {
     return strategy;
   }
 
+  if (world_size != 2 && world_size != 4 && world_size != 6 && world_size != 8) {
+    return strategy;
+  }
+
+  int can_access_peer = 0;
+  ORT_ENFORCE(SetPeerAccess(rank, world_size, true, can_access_peer) == Status::OK());
+  // If P2P is not enabled, we cannot use the custom allreduce, so default to NCCL.
+  if (!can_access_peer) {
+    return strategy;
+  }
+
   const size_t maxWorkspaceSize = GetMaxRequiredWorkspaceSize(world_size);
   const size_t message_size_bytes = message_size * type->Size();

onnxruntime/contrib_ops/cuda/collective/custom_reduce_impl.h (+3 -1)

@@ -68,7 +68,9 @@ void CustomAllReduce(AllReduceParams& params, onnxruntime::MLDataType data_type,
 
 size_t GetMaxRequiredWorkspaceSize(int world_size);
 
-AllReduceStrategyType SelectImplementation(size_t message_size, int world_size, onnxruntime::MLDataType type);
+Status SetPeerAccess(int rank, int world_size, bool enable, int& can_access_peer);
+
+AllReduceStrategyType SelectImplementation(size_t message_size, int rank, int world_size, onnxruntime::MLDataType type);
 
 #endif
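Taken together, the .cu and .h changes fold the peer-access probe into strategy selection: SetPeerAccess now reports reachability through the can_access_peer out-parameter rather than assuming P2P works, and SelectImplementation consults it (along with the supported world sizes) before opting into the custom allreduce. A minimal sketch of the resulting call path, assuming call-site variables (input_count, rank, world_size, data_type) like those in FuncCustomAllReduce in nccl_kernels.cc below:

  // Sketch only, not part of this commit: selecting a strategy after the change.
  ort_trtllm::AllReduceStrategyType strategy =
      ort_trtllm::SelectImplementation(input_count, rank, world_size, data_type);
  if (strategy == ort_trtllm::AllReduceStrategyType::NCCL) {
    // Fallback cases: unsupported dtype, world size not in {2, 4, 6, 8},
    // or peer-to-peer access unavailable between some pair of devices.
  }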

onnxruntime/contrib_ops/cuda/collective/ipc_utils.cc (-28)

@@ -25,31 +25,6 @@ namespace ort_trtllm {
 
 using namespace onnxruntime;
 
-Status SetPeerAccess(int rank, int world_size, bool enable) {
-  const int src_node = rank;
-
-  for (int dst_node = 0; dst_node < world_size; dst_node++) {
-    if (dst_node == src_node) {
-      continue;
-    }
-
-    int can_access_peer;
-    CUDA_RETURN_IF_ERROR(cudaDeviceCanAccessPeer(&can_access_peer, src_node, dst_node));
-
-    if (enable) {
-      cudaDeviceEnablePeerAccess(dst_node, 0);
-    } else {
-      cudaDeviceDisablePeerAccess(dst_node);
-    }
-    auto const error = cudaGetLastError();
-    if (error != cudaErrorPeerAccessAlreadyEnabled && error != cudaErrorPeerAccessNotEnabled) {
-      CUDA_RETURN_IF_ERROR(error);
-    }
-  }
-
-  return Status::OK();
-}
-
 IpcMemory::IpcMemory(int rank, int world_size, std::size_t buffer_size)
     : rank_(rank), world_size_(world_size), m_comm_ptrs_(world_size), mbuffer_size_(buffer_size) {
   ORT_ENFORCE(AllocateIpcMemory() == Status::OK());
@@ -113,9 +88,6 @@ Status GetCustomAllReduceWorkspace(int rank, int world_size, size_t input_size,
     return Status::OK();
   }
 
-  ORT_ENFORCE(SetPeerAccess(rank, world_size, true) == Status::OK());
-  CUDA_RETURN_IF_ERROR(cudaGetLastError());
-
   const std::size_t buffer_size = world_size * input_size;
 
   std::vector<std::shared_ptr<IpcMemory>>& m_ipc_memory_handles = ipc_mem_res_pack.m_ipc_momery_handles;

onnxruntime/contrib_ops/cuda/collective/ipc_utils.h (-2)

@@ -25,8 +25,6 @@ namespace ort_trtllm {
 
 #if defined(USE_MPI) || defined(USE_NCCL)
 
-Status SetPeerAccess(int rank, int world_size, bool enable = true);
-
 class IpcMemory {
  public:
  size_t static constexpr FLAGS_SIZE = (MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t);

onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc (+1 -1)

@@ -441,7 +441,7 @@ Status FuncCustomAllReduce(
   int world_size = nccl->Size();
 
   ort_trtllm::AllReduceStrategyType runtime_strategy =
-      ort_trtllm::SelectImplementation(input_count, world_size, data_type);
+      ort_trtllm::SelectImplementation(input_count, rank, world_size, data_type);
 
   if (runtime_strategy == ort_trtllm::AllReduceStrategyType::NCCL) {
     ncclDataType_t dtype = GetNcclDataType(data_type);

onnxruntime/core/providers/js/operators/conv.h (+46 -59)

@@ -17,78 +17,65 @@ class ConvBase : public JsKernel {
   ConvBase(const OpKernelInfo& info, bool is_channels_last, bool is_fused_conv) : JsKernel(info),
                                                                                   conv_attrs_(info),
                                                                                   w_is_const_(false) {
-    TensorShapeVector kernel_shape;
     const size_t pads_vec_size = conv_attrs_.pads.size() == 0 ? 4 : conv_attrs_.pads.size();
     std::vector<int32_t> local_pads(pads_vec_size, 0);
     for (size_t i = 0; i < conv_attrs_.pads.size() && i < pads_vec_size; ++i) {
       local_pads[i] = gsl::narrow_cast<int32_t>(conv_attrs_.pads[i]);
     }
 
+    TensorShapeVector kernel_shape;
     if (conv_attrs_.kernel_shape_specified) {
       ORT_ENFORCE(info.GetAttrs("kernel_shape", kernel_shape).IsOK());
     }
+    std::vector<int32_t> kernel_shapes(kernel_shape.size(), 0);
+    if (conv_attrs_.kernel_shape_specified) {
+      for (size_t i = 0; i < kernel_shape.size(); ++i) {
+        kernel_shapes[i] = gsl::narrow_cast<int32_t>(kernel_shape[i]);
+      }
+    }
+
+    std::vector<int32_t> strides(conv_attrs_.strides.size(), 0);
+    for (size_t i = 0; i < conv_attrs_.strides.size(); ++i) {
+      strides[i] = gsl::narrow_cast<int32_t>(conv_attrs_.strides[i]);
+    }
+
+    std::vector<int32_t> dilations(conv_attrs_.dilations.size(), 0);
+    for (size_t i = 0; i < conv_attrs_.dilations.size(); ++i) {
+      dilations[i] = gsl::narrow_cast<int32_t>(conv_attrs_.dilations[i]);
+    }
+
     conv_attrs_.activation = info.GetAttrOrDefault<std::string>("activation", "");
     std::vector<float> activation_params = info.GetAttrsOrDefault<float>("activation_params");
     int64_t channels_last = is_channels_last ? 1 : info.GetAttrOrDefault<int64_t>("channels_last", 0);
-    auto kernel_shape_0 = conv_attrs_.kernel_shape_specified && kernel_shape.size() > 0 ? kernel_shape[0] : 0;
-    auto kernel_shape_1 = conv_attrs_.kernel_shape_specified && kernel_shape.size() > 1 ? kernel_shape[1] : 0;
+
     // currently only support Conv 1D/2D. TODO: support Conv3D and other
-    if (conv_attrs_.dilations.size() == 1 ||
-        (conv_attrs_.kernel_shape_specified && kernel_shape.size() == 1) ||
-        conv_attrs_.strides.size() == 1) {
-      JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({
-                                   "format" : $8 ? "NHWC" : "NCHW",
-                                   "auto_pad" : $1,
-                                   "dilations" : [$2],
-                                   "group" : $3,
-                                   "kernel_shape" : [$4],
-                                   "pads" : $5 ? Array.from(HEAP32.subarray($5, $6)) : [],
-                                   "strides" : [$7],
-                                   "w_is_const" : () JS_ARROW(!!HEAP8[$9]),
-                                   "activation" : UTF8ToString($10),
-                                   "activation_params" : $11 ? Array.from(HEAPF32.subarray($11, $12)) : []
-                                 }),
-                                 static_cast<int32_t>(conv_attrs_.auto_pad),
-                                 static_cast<int32_t>(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0),
-                                 static_cast<int32_t>(conv_attrs_.group),
-                                 static_cast<int32_t>(kernel_shape_0),
-                                 JSEP_HEAP32_INDEX_START(local_pads),
-                                 JSEP_HEAP32_INDEX_END(local_pads),
-                                 static_cast<int32_t>(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0),
-                                 static_cast<int32_t>(channels_last),
-                                 JSEP_HEAP8_INDEX(&w_is_const_),
-                                 conv_attrs_.activation.c_str(),
-                                 JSEP_HEAP32_INDEX_START(activation_params),
-                                 JSEP_HEAP32_INDEX_END(activation_params));
-    } else {
-      JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({
-                                   "format" : $11 ? "NHWC" : "NCHW",
-                                   "auto_pad" : $1,
-                                   "dilations" : [ $2, $3 ],
-                                   "group" : $4,
-                                   "kernel_shape" : [ $5, $6 ],
-                                   "pads" : $7 ? Array.from(HEAP32.subarray($7, $8)) : [],
-                                   "strides" : [ $9, $10 ],
-                                   "w_is_const" : () JS_ARROW(!!HEAP8[$12]),
-                                   "activation" : UTF8ToString($13),
-                                   "activation_params" : $14 ? Array.from(HEAPF32.subarray($14, $15)) : []
-                                 }),
-                                 static_cast<int32_t>(conv_attrs_.auto_pad),
-                                 static_cast<int32_t>(conv_attrs_.dilations.size() > 0 ? conv_attrs_.dilations[0] : 0),
-                                 static_cast<int32_t>(conv_attrs_.dilations.size() > 1 ? conv_attrs_.dilations[1] : 0),
-                                 static_cast<int32_t>(conv_attrs_.group),
-                                 static_cast<int32_t>(kernel_shape_0),
-                                 static_cast<int32_t>(kernel_shape_1),
-                                 JSEP_HEAP32_INDEX_START(local_pads),
-                                 JSEP_HEAP32_INDEX_END(local_pads),
-                                 static_cast<int32_t>(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0),
-                                 static_cast<int32_t>(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0),
-                                 static_cast<int32_t>(channels_last),
-                                 JSEP_HEAP8_INDEX(&w_is_const_),
-                                 conv_attrs_.activation.c_str(),
-                                 JSEP_HEAP32_INDEX_START(activation_params),
-                                 JSEP_HEAP32_INDEX_END(activation_params));
-    }
+    JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({
+                                 "format" : $11 ? "NHWC" : "NCHW",
+                                 "auto_pad" : $1,
+                                 "dilations" : $2 ? Array.from(HEAP32.subarray($2, $3)) : [],
+                                 "group" : $4,
+                                 "kernel_shape" : $5 ? Array.from(HEAP32.subarray($5, $6)) : [],
+                                 "pads" : $7 ? Array.from(HEAP32.subarray($7, $8)) : [],
+                                 "strides" : $9 ? Array.from(HEAP32.subarray($9, $10)) : [],
+                                 "w_is_const" : () JS_ARROW(!!HEAP8[$12]),
+                                 "activation" : UTF8ToString($13),
+                                 "activation_params" : $14 ? Array.from(HEAPF32.subarray($14, $15)) : []
+                               }),
+                               static_cast<int32_t>(conv_attrs_.auto_pad),
+                               JSEP_HEAP32_INDEX_START(dilations),
+                               JSEP_HEAP32_INDEX_END(dilations),
+                               static_cast<int32_t>(conv_attrs_.group),
+                               JSEP_HEAP32_INDEX_START(kernel_shapes),
+                               JSEP_HEAP32_INDEX_END(kernel_shapes),
+                               JSEP_HEAP32_INDEX_START(local_pads),
+                               JSEP_HEAP32_INDEX_END(local_pads),
+                               JSEP_HEAP32_INDEX_START(strides),
+                               JSEP_HEAP32_INDEX_END(strides),
+                               static_cast<int32_t>(channels_last),
+                               JSEP_HEAP8_INDEX(&w_is_const_),
+                               conv_attrs_.activation.c_str(),
+                               JSEP_HEAP32_INDEX_START(activation_params),
+                               JSEP_HEAP32_INDEX_END(activation_params));
   }
 
   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
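The two JSEP_INIT_KERNEL_ATTRIBUTE branches for 1-D and 2-D convolutions could be collapsed because every rank-dependent attribute (dilations, kernel_shape, pads, strides) is now staged in an int32 vector and handed to the embedded JS as a [start, end) index pair into the WASM heap, which the JS side expands with Array.from(HEAP32.subarray(start, end)). A minimal sketch of that staging pattern outside the macro; attr_i64 and attr_i32 are illustrative names, not from this commit:

  // Copy an int64 attribute vector into int32 storage so the JS side can view
  // it through HEAP32; gsl::narrow_cast traps values that do not fit.
  std::vector<int64_t> attr_i64 = {3, 3};  // e.g. a 2-D kernel_shape
  std::vector<int32_t> attr_i32(attr_i64.size(), 0);
  for (size_t i = 0; i < attr_i64.size(); ++i) {
    attr_i32[i] = gsl::narrow_cast<int32_t>(attr_i64[i]);
  }
  // JSEP_HEAP32_INDEX_START/END(attr_i32) then supply the [start, end) indices
  // consumed by Array.from(HEAP32.subarray(start, end)), so a single code path
  // serves both ranks (and can extend to Conv3D later, per the TODO above).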

orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc (+31 -23)

@@ -359,6 +359,30 @@ void IterateSubgraphFromNode(Graph& graph,
 }
 }  // namespace
 
+void RemovePrintDensityFlag(Graph& graph,
+                            const std::vector<NodeIndex>& node_topology_list,
+                            bool& modified,
+                            const logging::Logger& logger) {
+  for (auto node_index : node_topology_list) {
+    Node* node = graph.GetNode(node_index);
+    if (node == nullptr) {
+      continue;
+    }
+    if (graph_utils::IsSupportedOptypeVersionAndDomain(*node, "PythonOp", {1}, kMSDomain) &&
+        static_cast<std::string>(node->GetAttributes().at("func_name").s()) == kFlagAndPrintDensityFuncName) {
+      if (graph_utils::CanRemoveNode(graph, *node, logger)) {
+        if (graph_utils::RemoveNode(graph, *node)) {
+          modified = true;
+        } else {
+          LOG_DEBUG_INFO(logger, "Failed to remove node " + node->Name() + "(" + node->OpType() + ")");
+        }
+      } else {
+        LOG_DEBUG_INFO(logger, "Can not remove node " + node->Name() + "(" + node->OpType() + ")");
+      }
+    }
+  }
+}
+
 Status PaddingElimination::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
   LOG_DEBUG_INFO(logger, "Enter PaddingElimination");
 
@@ -392,10 +416,6 @@ Status PaddingElimination::ApplyImpl(Graph& graph, bool& modified, int graph_lev
         node.InputDefs()[1]->Exists() &&
         node.InputDefs()[1]->Shape() &&
         node.InputDefs()[1]->Shape()->dim_size() >= 2) {
-      const auto outputNodeCount = std::distance(node.OutputEdgesBegin(), node.OutputEdgesEnd());
-      if (outputNodeCount != 1) {
-        continue;
-      }
       Node* embedding_input_node = graph.GetMutableProducerNode(node.MutableInputDefs()[1]->Name());
       if (embedding_input_node == nullptr ||
           !graph_utils::IsSupportedOptypeVersionAndDomain(*embedding_input_node, "PythonOp", {1}, kMSDomain) ||
@@ -404,21 +424,6 @@ Status PaddingElimination::ApplyImpl(Graph& graph, bool& modified, int graph_lev
         LOG_DEBUG_INFO(logger, "not find PythonOp of flagPaddingElimination after embedding node");
         continue;
       }
-      if (!print_density_) {
-        if (graph_utils::CanRemoveNode(graph, *embedding_input_node, logger)) {
-          if (graph_utils::RemoveNode(graph, *embedding_input_node)) {
-            modified = true;
-          } else {
-            LOG_DEBUG_INFO(logger, "Failed to remove node " + embedding_input_node->Name() +
-                                       "(" + embedding_input_node->OpType() + ")");
-            continue;
-          }
-        } else {
-          LOG_DEBUG_INFO(logger, "Can not remove node " + embedding_input_node->Name() +
-                                     "(" + embedding_input_node->OpType() + ")");
-          continue;
-        }
-      }
       const ONNX_NAMESPACE::TensorProto* padding_initializer =
           graph_utils::GetConstantInitializer(graph, node.InputDefs()[2]->Name());
       if (padding_initializer != nullptr &&
@@ -430,19 +435,22 @@ Status PaddingElimination::ApplyImpl(Graph& graph, bool& modified, int graph_lev
          continue;
        }
        embedding_node = &node;
-        input_ids_arg = embedding_node->MutableInputDefs()[1];
-        for (auto output_defs : embedding_node->MutableOutputDefs()) {
-          subgraph.insert(output_defs);
-        }
        break;
      }
    }
  }
 
+  if (!print_density_) {
+    RemovePrintDensityFlag(graph, node_topology_list, modified, logger);
+  }
   if (!embedding_node) {
     LOG_DEBUG_INFO(logger, "Exit PaddingElimination optimization for not finding any valid embedding node.");
     return Status::OK();
   }
+  input_ids_arg = embedding_node->MutableInputDefs()[1];
+  for (auto output_defs : embedding_node->MutableOutputDefs()) {
+    subgraph.insert(output_defs);
+  }
 
   if (!input_ids_arg->Shape()) {
     LOG_DEBUG_INFO(logger, "Exit PaddingElimination optimization for not finding shape of input_ids.");
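Net effect of this restructuring: the density-flag PythonOp removal no longer happens inside the embedding-node search loop (where it was gated on the node having exactly one output edge), but in a single pass over the whole topological order, and the input_ids_arg/subgraph bookkeeping is deferred until an embedding node has actually been found. A condensed sketch of the new ApplyImpl flow, using only names that appear in the diff, with surrounding code elided:

  // 1. Scan node_topology_list for a qualifying embedding node.
  // 2. Strip every kFlagAndPrintDensityFuncName PythonOp in one pass:
  if (!print_density_) {
    RemovePrintDensityFlag(graph, node_topology_list, modified, logger);
  }
  // 3. Bail out if no embedding node was found; otherwise record its input
  //    ids and seed the candidate subgraph from its outputs:
  input_ids_arg = embedding_node->MutableInputDefs()[1];
  for (auto output_defs : embedding_node->MutableOutputDefs()) {
    subgraph.insert(output_defs);
  }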

tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml (+2 -2)

@@ -122,8 +122,8 @@ stages:
             --volume $(Build.BinariesDirectory):/build \
             --volume /mnist:/mnist \
             onnxruntime_ortmodule_distributed_tests_image \
-            bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install mpi4py onnxscript && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && mpirun -n 4 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_collective.py && mpirun -n 2 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_distributed.py" \
-          displayName: 'Run onnxruntime_test_collective.py'
+            bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install mpi4py onnxscript && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && mpirun -n 4 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_collective.py && mpirun -n 2 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_distributed.py && mpirun -n 2 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py" \
+          displayName: 'Run onnxruntime_test_collective.py, onnxruntime_test_distributed.py and test_sharded_moe.py'
   condition: succeededOrFailed()
   timeoutInMinutes: 30
129129
