From 1b031a51e23552b29452cdfdf857656e3388cc84 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Sat, 3 Jan 2026 10:36:47 -0800 Subject: [PATCH 1/3] Create MultiDeviceFixture --- tests/cpp/multidevice.cpp | 15 ++++++++------- tests/cpp/multidevice.h | 23 ++++++++++++++++------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/cpp/multidevice.cpp b/tests/cpp/multidevice.cpp index 4d962ffa922..838c71c06ab 100644 --- a/tests/cpp/multidevice.cpp +++ b/tests/cpp/multidevice.cpp @@ -7,7 +7,6 @@ // clang-format on #include #include -#include #ifdef NVFUSER_DISTRIBUTED #include @@ -33,7 +32,7 @@ void MultiDeviceTestEnvironment::TearDown() { Communicator::getInstance().cleanup(); } -MultiDeviceTest::MultiDeviceTest() { +MultiDeviceFixture::MultiDeviceFixture() { // Enable logging in c10d so debug messages can be printed out via // `TORCH_DISTRIBUTED_DEBUG`. c10d::setDebugLevelFromEnvironment(); @@ -42,10 +41,9 @@ MultiDeviceTest::MultiDeviceTest() { tensor_options_ = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); debug_print = getNvFuserEnv("MULTIDEVICE_DEBUG_PRINT") != nullptr; - disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr; } -MultiDeviceTest::~MultiDeviceTest() { +MultiDeviceFixture::~MultiDeviceFixture() { // Force all processes to synchronize at a barrier between tests. It slightly // slows the tests down, but makes it much easier to isolate a failing test. // Without this, if a test fails such that a subset of processes fail, then @@ -55,8 +53,11 @@ MultiDeviceTest::~MultiDeviceTest() { } } +MultiDeviceTest::MultiDeviceTest() { + disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr; +} + void MultiDeviceTest::SetUp() { - // Set the same random seed for all processes. 
NVFuserTest::SetUp(); if (!disable_skip && !communicator_->is_available()) { @@ -64,7 +65,7 @@ void MultiDeviceTest::SetUp() { } } -at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) { +at::Tensor MultiDeviceFixture::shardTensor(at::Tensor tensor, TensorView* tv) { if (!isSharded(tv)) { return tensor; } @@ -75,7 +76,7 @@ at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) { tv->getDeviceMesh()); } -at::Tensor MultiDeviceTest::shardTensor( +at::Tensor MultiDeviceFixture::shardTensor( at::Tensor tensor, const int64_t axis, const DeviceMesh& mesh) { diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h index fa043ef3f6d..c4f60f940ae 100644 --- a/tests/cpp/multidevice.h +++ b/tests/cpp/multidevice.h @@ -22,11 +22,12 @@ class MultiDeviceTestEnvironment : public testing::Environment { void TearDown() override; }; -class MultiDeviceTest : public NVFuserTest { +// Fixture class containing the logic for multi-device testing. +// Does not inherit from NVFuserTest or testing::Test. +class MultiDeviceFixture { protected: - MultiDeviceTest(); - ~MultiDeviceTest(); - void SetUp() override; + MultiDeviceFixture(); + ~MultiDeviceFixture(); // Returns a shard of the tensor according to the sharding annotation in tv // for the deviceId. If tensor is not sharded returns the original tensor. @@ -40,15 +41,23 @@ class MultiDeviceTest : public NVFuserTest { int64_t axis, const DeviceMesh& mesh); + Communicator* communicator_; + c10::TensorOptions tensor_options_; + bool debug_print; +}; + +// Test class that inherits from NVFuserTest and uses MultiDeviceFixture. +class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture { + protected: + MultiDeviceTest(); + void SetUp() override; + // Validate the outputs of a fusion against expected outputs. 
static void validate( const std::vector& expected_outputs, const KernelArgumentHolder& outputs, const std::vector& atols); - Communicator* communicator_; - c10::TensorOptions tensor_options_; - bool debug_print; bool disable_skip; }; From 36804d964fb42103e3b2ebf1a5da90fbbafcd3b9 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Sat, 3 Jan 2026 10:50:43 -0800 Subject: [PATCH 2/3] Create MultiDeviceBenchmark --- CMakeLists.txt | 2 ++ tests/cpp/multidevice.h | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ccda3d89fb5..b4a6136ae70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1111,6 +1111,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK) target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST) target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}") target_include_directories(${TEST_NAME} SYSTEM PRIVATE + ${NVFUSER_ROOT}/third_party/benchmark/include ${NVFUSER_ROOT}/third_party/googletest/googletest/include ${NVFUSER_ROOT}/third_party/googletest/googlemock/include ) @@ -1123,6 +1124,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK) dynamic_type GTest::gtest GTest::gmock + benchmark::benchmark flatbuffers ${TORCH_LIBRARIES} ) diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h index c4f60f940ae..b3ae4dd9a28 100644 --- a/tests/cpp/multidevice.h +++ b/tests/cpp/multidevice.h @@ -7,6 +7,9 @@ // clang-format on #pragma once +#include +#include + #include #include #include @@ -61,6 +64,9 @@ class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture { bool disable_skip; }; +class MultiDeviceBenchmark : public benchmark::Fixture, + public MultiDeviceFixture {}; + // This macro is supposed to be used in a test case of a MultiDeviceTest or its // `SetUp` method, which have access to GTEST_SKIP and communicator_. 
It's not // made a function because that function wouldn't be able to skip the test by From 5da9fb05f82aaea894d88f66667633f2e3096286 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Sat, 3 Jan 2026 13:06:35 -0800 Subject: [PATCH 3/3] Add a sample multi-GPU benchmark ``` $ mpirun -np 2 -output-filename /tmp/test_multidevice bin/test_multidevice --benchmarks=all $ cat /tmp/test_multidevice/1/rank.0/stdout ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- MultiDeviceBenchmark/Reduction/4/iterations:10 20128420 ns 16788148 ns 10 MultiDeviceBenchmark/Reduction/8/iterations:10 100694 ns 100708 ns 10 ``` --- tests/cpp/multidevice.cpp | 42 +++++++++++++++++++++++-- tests/cpp/multidevice.h | 7 +++-- tests/cpp/test_multidevice_sharding.cpp | 36 +++++++++++++++++++++ 3 files changed, 80 insertions(+), 5 deletions(-) diff --git a/tests/cpp/multidevice.cpp b/tests/cpp/multidevice.cpp index 838c71c06ab..52f286a5e56 100644 --- a/tests/cpp/multidevice.cpp +++ b/tests/cpp/multidevice.cpp @@ -8,6 +8,14 @@ #include #include +#include +#include +#include +#include + +#include +#include + #ifdef NVFUSER_DISTRIBUTED #include #else @@ -43,7 +51,11 @@ MultiDeviceFixture::MultiDeviceFixture() { debug_print = getNvFuserEnv("MULTIDEVICE_DEBUG_PRINT") != nullptr; } -MultiDeviceFixture::~MultiDeviceFixture() { +MultiDeviceTest::MultiDeviceTest() { + disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr; +} + +MultiDeviceTest::~MultiDeviceTest() { // Force all processes to synchronize at a barrier between tests. It slightly // slows the tests down, but makes it much easier to isolate a failing test. 
// Without this, if a test fails such that a subset of processes fail, then @@ -53,8 +65,13 @@ MultiDeviceFixture::~MultiDeviceFixture() { } } -MultiDeviceTest::MultiDeviceTest() { - disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr; +void MultiDeviceBenchmark::TearDown(benchmark::State& state) { + // Unlike testing::Test, a benchmark::Fixture is destructed after `main` + // exits, not after each benchmark. Therefore, we have to put the barrier in + // TearDown instead of the destructor. + if (communicator_->is_available()) { + communicator_->barrier(); + } } void MultiDeviceTest::SetUp() { @@ -163,8 +180,27 @@ void MultiDeviceTest::validate( } // namespace nvfuser +namespace { +bool wantsBenchmarks(int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + std::string_view a(argv[i]); + if (a.starts_with("--benchmark")) + return true; + } + return false; +} +} // namespace + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); testing::AddGlobalTestEnvironment(new nvfuser::MultiDeviceTestEnvironment()); + + if (wantsBenchmarks(argc, argv)) { + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; + } + return RUN_ALL_TESTS(); } diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h index b3ae4dd9a28..3702c35ff16 100644 --- a/tests/cpp/multidevice.h +++ b/tests/cpp/multidevice.h @@ -30,7 +30,6 @@ class MultiDeviceTestEnvironment : public testing::Environment { class MultiDeviceFixture { protected: MultiDeviceFixture(); - ~MultiDeviceFixture(); // Returns a shard of the tensor according to the sharding annotation in tv // for the deviceId. If tensor is not sharded returns the original tensor. @@ -53,6 +52,7 @@ class MultiDeviceFixture { class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture { protected: MultiDeviceTest(); + ~MultiDeviceTest(); void SetUp() override; // Validate the outputs of a fusion against expected outputs. 
@@ -65,7 +65,10 @@ class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture { }; class MultiDeviceBenchmark : public benchmark::Fixture, - public MultiDeviceFixture {}; + public MultiDeviceFixture { + protected: + void TearDown(benchmark::State& state) override; +}; // This macro is supposed to be used in a test case of a MultiDeviceTest or its // `SetUp` method, which have access to GTEST_SKIP and communicator_. It's not diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index ff4c98936df..0e37c254090 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -5,6 +5,7 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on +#include #include #include @@ -1283,4 +1284,39 @@ TEST_F(MultiDeviceTest, MultipleIncompatibleReshapes) { EXPECT_FALSE(runtime->isSegmented()); } } + +BENCHMARK_DEFINE_F(MultiDeviceBenchmark, Reduction)(benchmark::State& state) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + auto mesh = DeviceMesh::createForNumDevices(communicator_->size()); + + TensorView* in = makeContigTensor(2); + TensorView* out = sum(in, {0}); + + fusion->addInput(in); + fusion->addOutput(out); + + in->setDeviceMesh(mesh); + in->axis(0)->parallelize(ParallelType::DIDx); + + auto unsharded_in_tensor = + at::randn({mesh.size(), state.range(0)}, tensor_options_); + auto in_tensor = shardTensor(unsharded_in_tensor, in); + + FusionExecutorCache executor_cache(std::move(fusion)); + + for (auto _ : state) { + executor_cache.runFusionWithInputs({in_tensor}); + } +} + +// `Iterations` ensures that all processes run the benchmark for the same number +// of iterations. Without it, Google Benchmark adaptively determines the +// iteration count per process, which can differ across processes and cause +// collective operations (like allreduce) to hang indefinitely. 
+BENCHMARK_REGISTER_F(MultiDeviceBenchmark, Reduction) + ->Arg(4) + ->Arg(8) + ->Iterations(10); + } // namespace nvfuser