Skip to content

Commit 0c69f08

Browse files
committed
update
1 parent 7a9c595 commit 0c69f08

File tree

4 files changed

+93
-18
lines changed

4 files changed

+93
-18
lines changed

src/xccl/ProcessGroupXCCL.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -959,7 +959,7 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::pointToPoint(
959959
xcclGroupStart();
960960
fn(tensor, *comm, stream, cclstream, p2pTargetRank);
961961
xcclGroupEnd();
962-
962+
963963
if (!coalescing_state_) {
964964
post(stream);
965965

src/xccl/ProcessGroupXCCL.hpp

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <torch/csrc/distributed/c10d/TraceUtils.h>
2323
#include <torch/csrc/distributed/c10d/logger.hpp>
2424
#include <xccl/ProcessGroupXCCLMonitor.hpp>
25+
#include <xccl/XPUEventCache.hpp>
2526
namespace c10d {
2627

2728
static std::vector<std::string> TORCH_XCCL_HIGH_PRIORITY = {
@@ -308,23 +309,23 @@ class TORCH_API ProcessGroupXCCL : public Backend {
308309
/*nanCheck =*/false);
309310
}
310311

311-
template <typename Fn>
312-
c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
313-
at::Tensor& tensor,
314-
Fn fn,
315-
int peer,
316-
OpType opType,
317-
const char* profilingTitle) {
318-
return pointToPoint(
319-
tensor,
320-
fn,
321-
peer,
322-
opType,
323-
[](at::xpu::XPUStream&,
324-
c10::intrusive_ptr<ProcessGroupXCCL::WorkXCCL>& work) {},
325-
[](at::xpu::XPUStream&) {},
326-
profilingTitle);
327-
}
312+
template <typename Fn>
313+
c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
314+
at::Tensor& tensor,
315+
Fn fn,
316+
int peer,
317+
OpType opType,
318+
const char* profilingTitle) {
319+
return pointToPoint(
320+
tensor,
321+
fn,
322+
peer,
323+
opType,
324+
[](at::xpu::XPUStream&,
325+
c10::intrusive_ptr<ProcessGroupXCCL::WorkXCCL>& work) {},
326+
[](at::xpu::XPUStream&) {},
327+
profilingTitle);
328+
}
328329

329330
template <typename Fn, typename PreProcess, typename PostProcess>
330331
c10::intrusive_ptr<Work> pointToPoint(
@@ -441,6 +442,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
441442

442443
uint64_t getSequenceNumberForGroup() override;
443444

445+
float getDuration() const override;
446+
444447
std::string createLogPrefix() const;
445448

446449
const std::string& logPrefix() const;

src/xccl/XPUEventCache.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#include <c10/xpu/XPUStream.h>
2+
#include <xccl/XPUEventCache.hpp>
3+
#include <map>
4+
5+
namespace c10d {
6+
7+
XPUEventCache::XPUEventCache() = default;
8+
9+
std::shared_ptr<at::xpu::XPUEvent> XPUEventCache::create(bool timing) {
10+
auto deleter = [cache = shared_from_this(),
11+
timing](at::xpu::XPUEvent* event) {
12+
std::lock_guard<std::mutex> lock(cache->cacheMutex_);
13+
14+
cache->eventsArray_[timing ? 1 : 0].push_back(event);
15+
};
16+
at::xpu::XPUEvent* event = nullptr;
17+
{
18+
std::lock_guard<std::mutex> lock(cacheMutex_);
19+
auto& events = eventsArray_[timing ? 1 : 0];
20+
// If we still have events in the cache, we reuse it. Otherwise, we create a
21+
// new one.
22+
if (!events.empty()) {
23+
event = events.front();
24+
events.pop_front();
25+
} else {
26+
event = new at::xpu::XPUEvent(timing ? 1 : 0);
27+
}
28+
}
29+
return std::shared_ptr<at::xpu::XPUEvent>(event, std::move(deleter));
30+
}
31+
32+
// Returns this thread's event cache for `device`, creating it on first
// use. The map is thread_local, so each thread keeps its own per-device
// caches and no locking is needed here.
std::shared_ptr<XPUEventCache> XPUEventCache::get(at::DeviceIndex device) {
  static thread_local std::map<at::DeviceIndex, std::shared_ptr<XPUEventCache>>
      cacheDeviceMap;
  // try_emplace does a single tree traversal and only constructs the
  // mapped value when the key is absent; the original performed up to
  // three lookups (find + emplace + operator[]).
  auto [it, inserted] = cacheDeviceMap.try_emplace(device);
  if (inserted) {
    it->second = std::make_shared<XPUEventCache>();
  }
  return it->second;
}
42+
43+
} // namespace c10d

src/xccl/XPUEventCache.hpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#pragma once
2+
3+
#include <array>
4+
#include <deque>
5+
#include <memory>
6+
#include <mutex>
7+
8+
#include <ATen/xpu/XPUEvent.h>
9+
#include <c10/macros/Export.h>
10+
11+
namespace c10d {
12+
13+
// Pool of reusable XPU events, keyed by timing mode. Obtain an instance
// via the thread-local, per-device accessor get(); enable_shared_from_this
// allows create()'s deleter to hold the cache alive while handed-out
// events are still outstanding.
class TORCH_API XPUEventCache
    : public std::enable_shared_from_this<XPUEventCache> {
 public:
  XPUEventCache();
  // Returns a pooled (or newly allocated) event; the shared_ptr's
  // deleter returns the event to the cache rather than destroying it.
  std::shared_ptr<at::xpu::XPUEvent> create(bool timing);
  // Thread-local, per-device singleton accessor.
  static std::shared_ptr<XPUEventCache> get(at::DeviceIndex device);

 private:
  // Guards eventsArray_; create() and its deleter may race.
  std::mutex cacheMutex_;
  // NOTE: We intentionally store raw pointers so that
  // we do not attempt to destroy the event objects on process exit,
  // because the XPU runtime may already be torn down by then ("cuda"
  // in the original comment was a leftover from the NCCL/CUDA version
  // this was adapted from).
  std::array<std::deque<at::xpu::XPUEvent*>, 2>
      eventsArray_; // 0 for timing=false, 1 for timing=true
};
28+
29+
} // namespace c10d

0 commit comments

Comments
 (0)