2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -1111,6 +1111,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST)
target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}")
target_include_directories(${TEST_NAME} SYSTEM PRIVATE
${NVFUSER_ROOT}/third_party/benchmark/include
${NVFUSER_ROOT}/third_party/googletest/googletest/include
${NVFUSER_ROOT}/third_party/googletest/googlemock/include
)
@@ -1123,6 +1124,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
dynamic_type
GTest::gtest
GTest::gmock
benchmark::benchmark
flatbuffers
${TORCH_LIBRARIES}
)
47 changes: 42 additions & 5 deletions tests/cpp/multidevice.cpp
@@ -7,7 +7,14 @@
// clang-format on
#include <sys/types.h>
#include <unistd.h>
#include <mutex>

#include <sstream>
#include <string>
#include <string_view>
#include <vector>

#include <benchmark/benchmark.h>
#include <gtest/gtest.h>

#ifdef NVFUSER_DISTRIBUTED
#include <torch/csrc/distributed/c10d/debug.h>
@@ -33,7 +40,7 @@ void MultiDeviceTestEnvironment::TearDown() {
Communicator::getInstance().cleanup();
}

MultiDeviceTest::MultiDeviceTest() {
MultiDeviceFixture::MultiDeviceFixture() {
// Enable logging in c10d so debug messages can be printed out via
// `TORCH_DISTRIBUTED_DEBUG`.
c10d::setDebugLevelFromEnvironment();
@@ -42,6 +49,9 @@ MultiDeviceTest::MultiDeviceTest() {
tensor_options_ =
at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
debug_print = getNvFuserEnv("MULTIDEVICE_DEBUG_PRINT") != nullptr;
}

MultiDeviceTest::MultiDeviceTest() {
disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr;
}

@@ -55,16 +65,24 @@ MultiDeviceTest::~MultiDeviceTest() {
}
}

void MultiDeviceBenchmark::TearDown(benchmark::State& state) {
// Unlike testing::Test, a benchmark::Fixture is destructed after `main`
// exits, not after each benchmark. Therefore, we have to put barrier in
// TearDown instead of the destructor.
if (communicator_->is_available()) {
communicator_->barrier();
}
}

void MultiDeviceTest::SetUp() {
// Set the same random seed for all processes.
NVFuserTest::SetUp();

if (!disable_skip && !communicator_->is_available()) {
GTEST_SKIP() << "This test needs an available communicator.";
}
}

at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) {
at::Tensor MultiDeviceFixture::shardTensor(at::Tensor tensor, TensorView* tv) {
if (!isSharded(tv)) {
return tensor;
}
@@ -75,7 +93,7 @@ at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) {
tv->getDeviceMesh());
}

at::Tensor MultiDeviceTest::shardTensor(
at::Tensor MultiDeviceFixture::shardTensor(
at::Tensor tensor,
const int64_t axis,
const DeviceMesh& mesh) {
@@ -162,8 +180,27 @@ void MultiDeviceTest::validate(

} // namespace nvfuser

namespace {
bool wantsBenchmarks(int argc, char** argv) {
for (int i = 1; i < argc; ++i) {
std::string_view a(argv[i]);
if (a.starts_with("--benchmark"))
return true;
}
return false;
}
} // namespace

int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
testing::AddGlobalTestEnvironment(new nvfuser::MultiDeviceTestEnvironment());

if (wantsBenchmarks(argc, argv)) {

Collaborator Author:
Benchmarks tend to run longer and don't need to run as frequently as tests, so it's worth separating benchmarks from (correctness) tests.

The question though is how.

  1. In this version, the benchmarks are defined in the same set of files as tests, and I'm reusing the same main function which detects flags like --benchmarks.
  2. Alternatively, I could write two main functions (one for tests and the other for benchmarks) and link them to different binaries (test_multidevice vs benchmark_multidevice); a rough sketch of what that could look like follows this list.
  3. Furthermore, I could even split the test files and the benchmark files. It's harder to reuse code this way. For example, a FusionDefinition needs to be DRY'ed in order to be both tested and benchmarked.
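
For illustration, here is a minimal sketch of the dedicated entry point that option (2) would require. It is not part of the PR, and details such as whether this binary would also register MultiDeviceTestEnvironment are left open.

// Hypothetical main for a separate benchmark_multidevice binary (option 2).
// Only the Google Benchmark entry points are shown; this is a sketch, not
// code from the PR.
#include <benchmark/benchmark.h>

int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}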

Collaborator:
Option (1) might be simplest to use in the short term. Instead of 2 different commands, only an additional flag is needed. The downside is that tests and benchmarks do not have a clear distinction.

Option (2) strikes a good balance: it reuses code while maintaining separate binaries, but it requires different commands for the validation and benchmarking parts.

For option (3), we could define common fusions in a path outside tests/benchmarks; however, the setup will still likely be repeated. Another downside I see is that there are multiple locations that need to be kept in sync.

Yet another option is to have these in the benchmark file with validation and allow arguments to disable either. The GitHub CI could then run only validation, whereas the nightly CI runs everything.

For now, what you have in the PR looks like a good starting point to at least unify how we create benchmarks. I am assuming you intend to modify

to use Google Benchmark as well?

Collaborator Author:
> I am assuming you intend to modify

Yes, that'll likely be the first target.

Collaborator:
Does this mean that we run only one of validation or benchmarking?

Collaborator Author:
Yes, that has been a Google-internal convention: when the user specifies --benchmarks=all, the default main function runs just the benchmarks. But I'm open to other contracts. Multi-GPU tests come with a customized main function, so we can do whichever we prefer.

Collaborator:
has both validation and benchmarking. It would be preferable to allow both to be done together in a single run.

Collaborator Author:
> has both validation and benchmarking

I suspect we are talking about different things.

Nothing prevents a BENCHMARK_DEFINE_F from using comparison macros like EXPECT_EQ. That'll make a BENCHMARK_DEFINE_F on par with the runBenchmark function you pointed to.

I'm asking whether a benchmark binary (e.g. multidevice_benchmark) or a combined binary running in benchmark mode (e.g. test_multidevice --benchmarks=all) should also run TEST_Fs (in addition to BENCHMARK_DEFINE_Fs). Wdyt?
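
For concreteness, below is a rough sketch of such a validating benchmark, modeled on the Reduction benchmark added to test_multidevice_sharding.cpp in this PR. It is not part of the PR; it assumes the sharded reduction lowers to an allreduce (so every rank holds the full sum) and that the output can be read back with KernelArgumentHolder's as<at::Tensor>() accessor.

// Hypothetical validating benchmark (a sketch, not code from this PR).
BENCHMARK_DEFINE_F(MultiDeviceBenchmark, ValidatedReduction)(benchmark::State& state) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  auto mesh = DeviceMesh::createForNumDevices(communicator_->size());

  TensorView* in = makeContigTensor(2);
  TensorView* out = sum(in, {0});
  fusion->addInput(in);
  fusion->addOutput(out);

  in->setDeviceMesh(mesh);
  in->axis(0)->parallelize(ParallelType::DIDx);

  auto unsharded_in_tensor =
      at::randn({mesh.size(), state.range(0)}, tensor_options_);
  auto in_tensor = shardTensor(unsharded_in_tensor, in);

  FusionExecutorCache executor_cache(std::move(fusion));

  // Validate once, outside the timed loop. EXPECT_* only records a failure,
  // so the benchmark keeps running. Assumes every rank ends up with the full
  // (allreduced) sum to compare against the unsharded reference.
  auto outputs = executor_cache.runFusionWithInputs({in_tensor});
  EXPECT_TRUE(at::allclose(
      outputs[0].as<at::Tensor>(),
      unsharded_in_tensor.sum(0),
      /*rtol=*/1e-4,
      /*atol=*/1e-4));

  for (auto _ : state) {
    executor_cache.runFusionWithInputs({in_tensor});
  }
}

// Registered like the Reduction benchmark below, with a fixed iteration count
// so all ranks issue the same number of collectives.
BENCHMARK_REGISTER_F(MultiDeviceBenchmark, ValidatedReduction)
    ->Arg(4)
    ->Iterations(10);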

Collaborator (@Priya2698, Jan 8, 2026):
Got it.

> I'm asking whether a benchmark binary (e.g. multidevice_benchmark) or a combined binary running in benchmark mode (e.g. test_multidevice --benchmarks=all) should also run TEST_Fs (in addition to BENCHMARK_DEFINE_Fs). Wdyt?

I think we should run either tests or benchmarks, not both. Benchmarks can additionally validate the results, as you mentioned. In this case, my preference would be to link them to different binaries: test binaries run only tests, and benchmark binaries run only benchmarks. That behavior sounds the most predictable to me.

benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
return 0;
}

return RUN_ALL_TESTS();
}
32 changes: 25 additions & 7 deletions tests/cpp/multidevice.h
@@ -7,6 +7,9 @@
// clang-format on
#pragma once

#include <benchmark/benchmark.h>
#include <gtest/gtest.h>

#include <multidevice/communication.h>
#include <multidevice/communicator.h>
#include <multidevice/execution_utils.h>
@@ -22,11 +25,11 @@ class MultiDeviceTestEnvironment : public testing::Environment {
void TearDown() override;
};

class MultiDeviceTest : public NVFuserTest {
// Fixture class containing the logic for multi-device testing.
// Does not inherit from NVFuserTest or testing::Test.
class MultiDeviceFixture {
protected:
MultiDeviceTest();
~MultiDeviceTest();
void SetUp() override;
MultiDeviceFixture();

// Returns a shard of the tensor according to the sharding annotation in tv
// for the deviceId. If tensor is not sharded returns the original tensor.
@@ -40,18 +43,33 @@
int64_t axis,
const DeviceMesh& mesh);

Communicator* communicator_;
c10::TensorOptions tensor_options_;
bool debug_print;
};

// Test class that inherits from NVFuserTest and uses MultiDeviceFixture.
class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture {
protected:
MultiDeviceTest();
~MultiDeviceTest();
void SetUp() override;

// Validate the outputs of a fusion against expected outputs.
static void validate(
const std::vector<at::Tensor>& expected_outputs,
const KernelArgumentHolder& outputs,
const std::vector<double>& atols);

Communicator* communicator_;
c10::TensorOptions tensor_options_;
bool debug_print;
bool disable_skip;
};

class MultiDeviceBenchmark : public benchmark::Fixture,
public MultiDeviceFixture {
protected:
void TearDown(benchmark::State& state) override;
};

// This macro is supposed to be used in a test case of a MultiDeviceTest or its
// `SetUp` method, which have access to GTEST_SKIP and communicator_. It's not
// made a function because that function wouldn't be able to skip the test by
36 changes: 36 additions & 0 deletions tests/cpp/test_multidevice_sharding.cpp
@@ -5,6 +5,7 @@
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#include <benchmark/benchmark.h>
#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

@@ -1283,4 +1284,39 @@ TEST_F(MultiDeviceTest, MultipleIncompatibleReshapes) {
EXPECT_FALSE(runtime->isSegmented());
}
}

BENCHMARK_DEFINE_F(MultiDeviceBenchmark, Reduction)(benchmark::State& state) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
auto mesh = DeviceMesh::createForNumDevices(communicator_->size());

TensorView* in = makeContigTensor(2);
TensorView* out = sum(in, {0});

fusion->addInput(in);
fusion->addOutput(out);

in->setDeviceMesh(mesh);
in->axis(0)->parallelize(ParallelType::DIDx);

auto unsharded_in_tensor =
at::randn({mesh.size(), state.range(0)}, tensor_options_);
auto in_tensor = shardTensor(unsharded_in_tensor, in);

FusionExecutorCache executor_cache(std::move(fusion));

for (auto _ : state) {
executor_cache.runFusionWithInputs({in_tensor});
}
}

// `Iterations` ensures that all processes run the benchmark for the same number
// of iterations. Without it, Google Benchmark adaptively determines the
// iteration count per process, which can differ across processes and cause
// collective operations (like allreduce) to hang indefinitely.
BENCHMARK_REGISTER_F(MultiDeviceBenchmark, Reduction)
->Arg(4)
->Arg(8)
->Iterations(10);

} // namespace nvfuser