
Commit 58f5263

update

1 parent 7a9c595 commit 58f5263

File tree: 4 files changed, +103 −29 lines changed

src/xccl/ProcessGroupXCCL.cpp

Lines changed: 8 additions & 11 deletions

@@ -267,11 +267,9 @@ ProcessGroupXCCL::WorkXCCL::WorkXCCL(
           : nullptr;
       xcclEndEvent_ = XPUEventCache::get(device.index())->create(enableTiming);
     } else {
-      xcclStartEvent_ = enableTiming
-          ? std::make_shared<at::xpu::XPUEvent>(xpuEventDefault)
-          : nullptr;
-      xcclEndEvent_ = std::make_shared<at::xpu::XPUEvent>(
-          enableTiming ? xpuEventDefault : xpuEventDisableTiming);
+      xcclStartEvent_ =
+          enableTiming ? std::make_shared<at::xpu::XPUEvent>(1) : nullptr;
+      xcclEndEvent_ = std::make_shared<at::xpu::XPUEvent>(enableTiming ? 1 : 0);
     }
     stashed_for_allocator_safety_ = std::make_shared<TensorShelf>();
   }

@@ -902,7 +900,7 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::pointToPoint(
   auto cclstream = xcclStreamsMap_.at(key).second;
   syncStream(device, xcclEventsMap_[key], stream);

-  c10::intrusive_ptr<ProcessGroupNCCL::WorkXCCL> work;
+  c10::intrusive_ptr<ProcessGroupXCCL::WorkXCCL> work;
   if (!coalescing_state_) {
     work = initWork(device, rank_, opType, true, profilingTitle, {tensor}, {});
     work->outputs_ = std::make_shared<std::vector<at::Tensor>>();

@@ -944,9 +942,8 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::pointToPoint(
     checkForNan(tensor, stream);
   }
   if (!coalescing_state_) {
-    // Start event should only be recorded before the ncclGroupStart()
    if (work->timingEnabled_) {
-      work->ncclStartEvent_->record(stream);
+      work->xcclStartEvent_->record(stream);
    }

    pre(stream, work);

@@ -956,10 +953,10 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::pointToPoint(
  c10::xpu::XPUCachingAllocator::recordStream(
      tensor.storage().data_ptr(), stream);

-  xcclGroupStart();
+  ccl::group_start();
  fn(tensor, *comm, stream, cclstream, p2pTargetRank);
-  xcclGroupEnd();
-
+  ccl::group_end();
+
  if (!coalescing_state_) {
    post(stream);

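Two notable fixes in this file: the stray ProcessGroupNCCL/ncclStartEvent identifiers left over from the NCCL backend are renamed to their XCCL counterparts, and the xcclGroupStart()/xcclGroupEnd() wrappers are replaced with direct calls to oneCCL's ccl::group_start()/ccl::group_end(). Since the group calls must stay paired on every exit path, one common way to make that exception-safe is an RAII guard; below is a minimal sketch under that idea (GroupGuard is an illustrative name, not part of this commit, and the oneCCL header path is assumed):

#include <oneapi/ccl.hpp>

// Hypothetical RAII guard pairing ccl::group_start() with ccl::group_end(),
// so the group is closed even if the enclosed call throws.
class GroupGuard {
 public:
  GroupGuard() { ccl::group_start(); }  // open the batching window
  ~GroupGuard() { ccl::group_end(); }   // close it on scope exit
  GroupGuard(const GroupGuard&) = delete;
  GroupGuard& operator=(const GroupGuard&) = delete;
};

// Usage mirroring the pointToPoint hot path above:
//   {
//     GroupGuard guard;
//     fn(tensor, *comm, stream, cclstream, p2pTargetRank);
//   }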
src/xccl/ProcessGroupXCCL.hpp

Lines changed: 23 additions & 18 deletions

@@ -22,6 +22,7 @@
 #include <torch/csrc/distributed/c10d/TraceUtils.h>
 #include <torch/csrc/distributed/c10d/logger.hpp>
 #include <xccl/ProcessGroupXCCLMonitor.hpp>
+#include <xccl/XPUEventCache.hpp>
 namespace c10d {

 static std::vector<std::string> TORCH_XCCL_HIGH_PRIORITY = {

@@ -73,7 +74,9 @@ class TORCH_API ProcessGroupXCCL : public Backend {
       uint64_t seq,
       bool isP2P,
       const char* profilingTitle = nullptr,
-      const std::optional<std::vector<at::Tensor>>& inputs = std::nullopt);
+      const std::optional<std::vector<at::Tensor>>& inputs = std::nullopt,
+      bool enableTiming = false,
+      bool xpuEventCacheEnabled = false);
   WorkXCCL(const WorkXCCL& w);
   ~WorkXCCL() override;

@@ -87,6 +90,8 @@ class TORCH_API ProcessGroupXCCL : public Backend {

   void synchronizeStream();

+  float getDuration() const override;
+
   bool wait(std::chrono::milliseconds timeout = kNoTimeout) override;

   c10::intrusive_ptr<c10::ivalue::Future> getFuture() override {

@@ -308,23 +313,23 @@ class TORCH_API ProcessGroupXCCL : public Backend {
         /*nanCheck =*/false);
   }

-  template <typename Fn>
-  c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
-      at::Tensor& tensor,
-      Fn fn,
-      int peer,
-      OpType opType,
-      const char* profilingTitle) {
-    return pointToPoint(
-        tensor,
-        fn,
-        peer,
-        opType,
-        [](at::xpu::XPUStream&,
-           c10::intrusive_ptr<ProcessGroupXCCL::WorkXCCL>& work) {},
-        [](at::xpu::XPUStream&) {},
-        profilingTitle);
-  }
+  template <typename Fn>
+  c10::intrusive_ptr<Work> pointToPoint(
+      at::Tensor& tensor,
+      Fn fn,
+      int peer,
+      OpType opType,
+      const char* profilingTitle) {
+    return pointToPoint(
+        tensor,
+        fn,
+        peer,
+        opType,
+        [](at::xpu::XPUStream&,
+           c10::intrusive_ptr<ProcessGroupXCCL::WorkXCCL>& work) {},
+        [](at::xpu::XPUStream&) {},
+        profilingTitle);
+  }

   template <typename Fn, typename PreProcess, typename PostProcess>
   c10::intrusive_ptr<Work> pointToPoint(

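The deleted overload was declared with a ProcessGroupNCCL:: qualifier inside the class body, which is ill-formed C++ (and names the wrong class); the replacement is a plain inline member that forwards to the full pointToPoint overload with no-op pre/post lambdas. A self-contained sketch of that delegation pattern, with all names illustrative:

#include <functional>
#include <iostream>

struct Worker {
  using Hook = std::function<void(int /*stream*/)>;

  // Full version: the caller supplies pre/post hooks around the work.
  int run(int value, const Hook& pre, const Hook& post) {
    pre(0);
    int result = value * 2;  // the actual work
    post(0);
    return result;
  }

  // Convenience overload: same entry point, no-op hooks.
  int run(int value) {
    return run(value, [](int) {}, [](int) {});
  }
};

int main() {
  Worker w;
  std::cout << w.run(21) << '\n';  // prints 42
}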
src/xccl/XPUEventCache.cpp

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+#include <c10/xpu/XPUStream.h>
+#include <xccl/XPUEventCache.hpp>
+#include <map>
+
+namespace c10d {
+
+XPUEventCache::XPUEventCache() = default;
+
+std::shared_ptr<at::xpu::XPUEvent> XPUEventCache::create(bool timing) {
+  auto deleter = [cache = shared_from_this(),
+                  timing](at::xpu::XPUEvent* event) {
+    std::lock_guard<std::mutex> lock(cache->cacheMutex_);
+
+    cache->eventsArray_[timing ? 1 : 0].push_back(event);
+  };
+  at::xpu::XPUEvent* event = nullptr;
+  {
+    std::lock_guard<std::mutex> lock(cacheMutex_);
+    auto& events = eventsArray_[timing ? 1 : 0];
+    // If we still have events in the cache, we reuse one. Otherwise, we
+    // create a new one.
+    if (!events.empty()) {
+      event = events.front();
+      events.pop_front();
+    } else {
+      event = new at::xpu::XPUEvent(timing ? 1 : 0);
+    }
+  }
+  return std::shared_ptr<at::xpu::XPUEvent>(event, std::move(deleter));
+}
+
+std::shared_ptr<XPUEventCache> XPUEventCache::get(at::DeviceIndex device) {
+  static thread_local std::map<at::DeviceIndex, std::shared_ptr<XPUEventCache>>
+      cacheDeviceMap;
+  // Check whether the device is already in the map; if not, add a new entry.
+  auto it = cacheDeviceMap.find(device);
+  if (it == cacheDeviceMap.end()) {
+    cacheDeviceMap.emplace(device, std::make_shared<XPUEventCache>());
+  }
+  return cacheDeviceMap[device];
+}
+
+} // namespace c10d
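The interesting part of create() is the custom deleter: the returned shared_ptr never destroys the event when its last reference drops; instead the deleter pushes the raw pointer back onto the per-timing-mode deque, and capturing shared_from_this() keeps the cache alive as long as any handed-out event. A generic, self-contained sketch of this recycle-on-release pattern (all names illustrative):

#include <deque>
#include <memory>
#include <mutex>

template <typename T>
class Pool : public std::enable_shared_from_this<Pool<T>> {
 public:
  std::shared_ptr<T> acquire() {
    T* obj = nullptr;
    {
      std::lock_guard<std::mutex> lock(mutex_);
      if (!free_.empty()) {  // reuse a cached object when possible
        obj = free_.front();
        free_.pop_front();
      }
    }
    if (obj == nullptr) {
      obj = new T();  // cache miss: allocate a fresh one
    }
    // The deleter recycles instead of deleting; capturing shared_from_this()
    // keeps the pool alive while objects are outstanding.
    return std::shared_ptr<T>(obj, [pool = this->shared_from_this()](T* p) {
      std::lock_guard<std::mutex> lock(pool->mutex_);
      pool->free_.push_back(p);
    });
  }

 private:
  std::mutex mutex_;
  std::deque<T*> free_;
};

// The pool itself must be owned by a shared_ptr for shared_from_this():
//   auto pool = std::make_shared<Pool<int>>();
//   auto x = pool->acquire();  // on release, x is recycled, not deleted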

src/xccl/XPUEventCache.hpp

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+#pragma once
+
+#include <array>
+#include <deque>
+#include <memory>
+#include <mutex>
+
+#include <ATen/xpu/XPUEvent.h>
+#include <c10/macros/Export.h>
+
+namespace c10d {
+
+class TORCH_API XPUEventCache
+    : public std::enable_shared_from_this<XPUEventCache> {
+ public:
+  XPUEventCache();
+  std::shared_ptr<at::xpu::XPUEvent> create(bool timing);
+  static std::shared_ptr<XPUEventCache> get(at::DeviceIndex device);
+
+ private:
+  std::mutex cacheMutex_;
+  // NOTE: We intentionally store raw pointers so that we do not attempt to
+  // destroy the event objects on process exit, because the XPU runtime may
+  // already be gone by then.
+  std::array<std::deque<at::xpu::XPUEvent*>, 2>
+      eventsArray_; // 0 for timing=false, 1 for timing=true
+};
+
+} // namespace c10d
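Note that get() hands out caches from a thread_local per-device map, so the map itself needs no locking; the deques are still mutex-protected because the shared_ptr deleters can run on whichever thread drops the last reference. A short usage sketch matching the WorkXCCL constructor in the first file (device, stream, and enableTiming stand in for the caller's values):

auto cache = c10d::XPUEventCache::get(device.index());
std::shared_ptr<at::xpu::XPUEvent> endEvent = cache->create(enableTiming);
endEvent->record(stream);  // record on the communication stream
// When the last reference to endEvent drops, the deleter returns the raw
// event to the per-device cache for reuse instead of destroying it.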
