From 1b031a51e23552b29452cdfdf857656e3388cc84 Mon Sep 17 00:00:00 2001
From: Jingyue Wu <wujingyue@gmail.com>
Date: Sat, 3 Jan 2026 10:36:47 -0800
Subject: [PATCH 1/3] Create MultiDeviceFixture

---
 tests/cpp/multidevice.cpp | 15 ++++++++-------
 tests/cpp/multidevice.h   | 23 ++++++++++++++++-------
 2 files changed, 24 insertions(+), 14 deletions(-)
diff --git a/tests/cpp/multidevice.cpp b/tests/cpp/multidevice.cpp
index 4d962ffa922..838c71c06ab 100644
--- a/tests/cpp/multidevice.cpp
+++ b/tests/cpp/multidevice.cpp
@@ -7,7 +7,6 @@
 // clang-format on
 #include <sys/types.h>
 #include <unistd.h>
-#include <mutex>
 
 #ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/debug.h>
@@ -33,7 +32,7 @@ void MultiDeviceTestEnvironment::TearDown() {
   Communicator::getInstance().cleanup();
 }
 
-MultiDeviceTest::MultiDeviceTest() {
+MultiDeviceFixture::MultiDeviceFixture() {
   // Enable logging in c10d so debug messages can be printed out via
   // `TORCH_DISTRIBUTED_DEBUG`.
   c10d::setDebugLevelFromEnvironment();
@@ -42,10 +41,9 @@ MultiDeviceTest::MultiDeviceTest() {
   tensor_options_ =
       at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
   debug_print = getNvFuserEnv("MULTIDEVICE_DEBUG_PRINT") != nullptr;
-  disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr;
 }
 
-MultiDeviceTest::~MultiDeviceTest() {
+MultiDeviceFixture::~MultiDeviceFixture() {
   // Force all processes to synchronize at a barrier between tests. It slightly
   // slows the tests down, but makes it much easier to isolate a failing test.
   // Without this, if a test fails such that a subset of processes fail, then
@@ -55,8 +53,11 @@ MultiDeviceTest::~MultiDeviceTest() {
   }
 }
 
+MultiDeviceTest::MultiDeviceTest() {
+  disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr;
+}
+
 void MultiDeviceTest::SetUp() {
-  // Set the same random seed for all processes.
   NVFuserTest::SetUp();
 
   if (!disable_skip && !communicator_->is_available()) {
@@ -64,7 +65,7 @@ void MultiDeviceTest::SetUp() {
   }
 }
 
-at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) {
+at::Tensor MultiDeviceFixture::shardTensor(at::Tensor tensor, TensorView* tv) {
   if (!isSharded(tv)) {
     return tensor;
   }
@@ -75,7 +76,7 @@ at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) {
       tv->getDeviceMesh());
 }
 
-at::Tensor MultiDeviceTest::shardTensor(
+at::Tensor MultiDeviceFixture::shardTensor(
     at::Tensor tensor,
     const int64_t axis,
     const DeviceMesh& mesh) {
diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h
index fa043ef3f6d..c4f60f940ae 100644
--- a/tests/cpp/multidevice.h
+++ b/tests/cpp/multidevice.h
@@ -22,11 +22,12 @@ class MultiDeviceTestEnvironment : public testing::Environment {
   void TearDown() override;
 };
 
-class MultiDeviceTest : public NVFuserTest {
+// Fixture class containing the logic for multi-device testing.
+// Does not inherit from NVFuserTest or testing::Test.
+class MultiDeviceFixture {
  protected:
-  MultiDeviceTest();
-  ~MultiDeviceTest();
-  void SetUp() override;
+  MultiDeviceFixture();
+  ~MultiDeviceFixture();
 
   // Returns a shard of the tensor according to the sharding annotation in tv
   // for the deviceId. If tensor is not sharded returns the original tensor.
@@ -40,15 +41,23 @@ class MultiDeviceTest : public NVFuserTest {
       int64_t axis,
       const DeviceMesh& mesh);
 
+  Communicator* communicator_;
+  c10::TensorOptions tensor_options_;
+  bool debug_print;
+};
+
+// Test class that inherits from NVFuserTest and uses MultiDeviceFixture.
+class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture {
+ protected:
+  MultiDeviceTest();
+  void SetUp() override;
+
   // Validate the outputs of a fusion against expected outputs.
   static void validate(
       const std::vector<at::Tensor>& expected_outputs,
       const KernelArgumentHolder& outputs,
       const std::vector<double>& atols);
 
-  Communicator* communicator_;
-  c10::TensorOptions tensor_options_;
-  bool debug_print;
   bool disable_skip;
 };
 

From 36804d964fb42103e3b2ebf1a5da90fbbafcd3b9 Mon Sep 17 00:00:00 2001
From: Jingyue Wu <wujingyue@gmail.com>
Date: Sat, 3 Jan 2026 10:50:43 -0800
Subject: [PATCH 2/3] Create MultiDeviceBenchmark

---
 CMakeLists.txt          | 2 ++
 tests/cpp/multidevice.h | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ccda3d89fb5..b4a6136ae70 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1111,6 +1111,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
   target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST)
   target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}")
   target_include_directories(${TEST_NAME} SYSTEM PRIVATE
+    ${NVFUSER_ROOT}/third_party/benchmark/include
     ${NVFUSER_ROOT}/third_party/googletest/googletest/include
     ${NVFUSER_ROOT}/third_party/googletest/googlemock/include
   )
@@ -1123,6 +1124,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
     dynamic_type
     GTest::gtest
     GTest::gmock
+    benchmark::benchmark
     flatbuffers
     ${TORCH_LIBRARIES}
   )
diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h
index c4f60f940ae..b3ae4dd9a28 100644
--- a/tests/cpp/multidevice.h
+++ b/tests/cpp/multidevice.h
@@ -7,6 +7,9 @@
 // clang-format on
 #pragma once
 
+#include <benchmark/benchmark.h>
+#include <gtest/gtest.h>
+
 #include <multidevice/communication.h>
 #include <multidevice/communicator.h>
 #include <multidevice/execution_utils.h>
@@ -61,6 +64,9 @@ class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture {
   bool disable_skip;
 };
 
+class MultiDeviceBenchmark : public benchmark::Fixture,
+                             public MultiDeviceFixture {};
+
 // This macro is supposed to be used in a test case of a MultiDeviceTest or its
 // `SetUp` method, which have access to GTEST_SKIP and communicator_. It's not
 // made a function because that function wouldn't be able to skip the test by

From 5da9fb05f82aaea894d88f66667633f2e3096286 Mon Sep 17 00:00:00 2001
From: Jingyue Wu <wujingyue@gmail.com>
Date: Sat, 3 Jan 2026 13:06:35 -0800
Subject: [PATCH 3/3] Add a sample multi-GPU benchmark

```
$ mpirun -np 2 -output-filename /tmp/test_multidevice bin/test_multidevice --benchmarks=all

$ cat /tmp/test_multidevice/1/rank.0/stdout
-----------------------------------------------------------------------------------------
Benchmark                                               Time             CPU   Iterations
-----------------------------------------------------------------------------------------
MultiDeviceBenchmark/Reduction/4/iterations:10   20128420 ns     16788148 ns           10
MultiDeviceBenchmark/Reduction/8/iterations:10     100694 ns       100708 ns           10
```
---
 tests/cpp/multidevice.cpp               | 42 +++++++++++++++++++++++--
 tests/cpp/multidevice.h                 |  7 +++--
 tests/cpp/test_multidevice_sharding.cpp | 36 +++++++++++++++++++++
 3 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/tests/cpp/multidevice.cpp b/tests/cpp/multidevice.cpp
index 838c71c06ab..52f286a5e56 100644
--- a/tests/cpp/multidevice.cpp
+++ b/tests/cpp/multidevice.cpp
@@ -8,6 +8,14 @@
 #include <sys/types.h>
 #include <unistd.h>
 
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include <gtest/gtest.h>
+
 #ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/debug.h>
 #else
@@ -43,7 +51,11 @@ MultiDeviceFixture::MultiDeviceFixture() {
   debug_print = getNvFuserEnv("MULTIDEVICE_DEBUG_PRINT") != nullptr;
 }
 
-MultiDeviceFixture::~MultiDeviceFixture() {
+MultiDeviceTest::MultiDeviceTest() {
+  disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr;
+}
+
+MultiDeviceTest::~MultiDeviceTest() {
   // Force all processes to synchronize at a barrier between tests. It slightly
   // slows the tests down, but makes it much easier to isolate a failing test.
   // Without this, if a test fails such that a subset of processes fail, then
@@ -53,8 +65,13 @@ MultiDeviceFixture::~MultiDeviceFixture() {
   }
 }
 
-MultiDeviceTest::MultiDeviceTest() {
-  disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr;
+void MultiDeviceBenchmark::TearDown(benchmark::State& state) {
+  // Unlike testing::Test, a benchmark::Fixture is destructed after `main`
+  // exits, not after each benchmark. Therefore, the barrier has to go in
+  // TearDown instead of the destructor.
+  if (communicator_->is_available()) {
+    communicator_->barrier();
+  }
 }
 
 void MultiDeviceTest::SetUp() {
@@ -163,8 +180,34 @@ void MultiDeviceTest::validate(
 
 } // namespace nvfuser
 
+namespace {
+// Returns true if any command-line argument starts with "--benchmark", in
+// which case main() runs the registered benchmarks instead of the tests.
+bool wantsBenchmarks(int argc, char** argv) {
+  for (int i = 1; i < argc; ++i) {
+    std::string_view arg(argv[i]);
+    if (arg.starts_with("--benchmark")) {
+      return true;
+    }
+  }
+  return false;
+}
+} // namespace
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   testing::AddGlobalTestEnvironment(new nvfuser::MultiDeviceTestEnvironment());
+
+  if (wantsBenchmarks(argc, argv)) {
+    benchmark::Initialize(&argc, argv);
+    benchmark::RunSpecifiedBenchmarks();
+    benchmark::Shutdown();
+    // RUN_ALL_TESTS() is skipped on this path, so
+    // MultiDeviceTestEnvironment::TearDown never runs. Clean up the
+    // communicator explicitly instead.
+    nvfuser::Communicator::getInstance().cleanup();
+    return 0;
+  }
+
   return RUN_ALL_TESTS();
 }
diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h
index b3ae4dd9a28..3702c35ff16 100644
--- a/tests/cpp/multidevice.h
+++ b/tests/cpp/multidevice.h
@@ -30,7 +30,6 @@ class MultiDeviceTestEnvironment : public testing::Environment {
 class MultiDeviceFixture {
  protected:
   MultiDeviceFixture();
-  ~MultiDeviceFixture();
 
   // Returns a shard of the tensor according to the sharding annotation in tv
   // for the deviceId. If tensor is not sharded returns the original tensor.
@@ -53,6 +52,7 @@ class MultiDeviceFixture {
 class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture {
  protected:
   MultiDeviceTest();
+  ~MultiDeviceTest();
   void SetUp() override;
 
   // Validate the outputs of a fusion against expected outputs.
@@ -65,7 +65,10 @@ class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture {
 };
 
 class MultiDeviceBenchmark : public benchmark::Fixture,
-                             public MultiDeviceFixture {};
+                             public MultiDeviceFixture {
+ protected:
+  void TearDown(benchmark::State& state) override;
+};
 
 // This macro is supposed to be used in a test case of a MultiDeviceTest or its
 // `SetUp` method, which have access to GTEST_SKIP and communicator_. It's not
diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp
index ff4c98936df..0e37c254090 100644
--- a/tests/cpp/test_multidevice_sharding.cpp
+++ b/tests/cpp/test_multidevice_sharding.cpp
@@ -5,6 +5,7 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
+#include <benchmark/benchmark.h>
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
@@ -1283,4 +1284,39 @@ TEST_F(MultiDeviceTest, MultipleIncompatibleReshapes) {
     EXPECT_FALSE(runtime->isSegmented());
   }
 }
+
+BENCHMARK_DEFINE_F(MultiDeviceBenchmark, Reduction)(benchmark::State& state) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+  auto mesh = DeviceMesh::createForNumDevices(communicator_->size());
+
+  TensorView* in = makeContigTensor(2);
+  TensorView* out = sum(in, {0});
+
+  fusion->addInput(in);
+  fusion->addOutput(out);
+
+  in->setDeviceMesh(mesh);
+  in->axis(0)->parallelize(ParallelType::DIDx);
+
+  auto unsharded_in_tensor =
+      at::randn({mesh.size(), state.range(0)}, tensor_options_);
+  auto in_tensor = shardTensor(unsharded_in_tensor, in);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  for (auto _ : state) {
+    executor_cache.runFusionWithInputs({in_tensor});
+  }
+}
+
+// `Iterations` ensures that all processes run the benchmark for the same number
+// of iterations. Without it, Google Benchmark adaptively determines the
+// iteration count per process, which can differ across processes and cause
+// collective operations (like allreduce) to hang indefinitely.
+BENCHMARK_REGISTER_F(MultiDeviceBenchmark, Reduction)
+    ->Arg(4)
+    ->Arg(8)
+    ->Iterations(10);
+
 } // namespace nvfuser