Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes cudaErrorInvalidValue when running on nvbench-created cuda stream #113

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
31 changes: 26 additions & 5 deletions nvbench/cuda_stream.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@
#pragma once

#include <nvbench/cuda_call.cuh>
#include <nvbench/detail/device_scope.cuh>
#include <nvbench/device_info.cuh>

#include <cuda_runtime_api.h>

#include <memory>
#include <optional>

namespace nvbench
{
Expand All @@ -39,18 +42,36 @@ namespace nvbench
struct cuda_stream
{
/**
* Constructs a cuda_stream that owns a new stream, created with
* `cudaStreamCreate`.
* Constructs a cuda_stream that owns a new stream, created with `cudaStreamCreate`.
*
* @param device The device that this stream should be associated with. If no device is provided,
* the stream will be associated with the device that is active at the call time.
*/
cuda_stream()
: m_stream{[]() {
explicit cuda_stream(std::optional<nvbench::device_info> device)
: m_stream{[device]() {
cudaStream_t s;
NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
if (device.has_value())
{
nvbench::detail::device_scope scope_guard{device.value().get_id()};
NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
}
else
{
NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
}
return s;
}(),
stream_deleter{true}}
{}

/**
 * @brief Constructs a new cuda_stream that is associated with the device that is active at the
 * call time.
 */
cuda_stream()
: cuda_stream(std::nullopt)
{}

/**
* Constructs a `cuda_stream` from an explicit cudaStream_t.
*
Expand Down
8 changes: 7 additions & 1 deletion nvbench/detail/measure_cold.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,13 @@ namespace nvbench::detail

measure_cold_base::measure_cold_base(state &exec_state)
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_launch{nvbench::launch([this]() -> decltype(auto) {
if (!m_state.get_cuda_stream().has_value())
{
m_state.set_cuda_stream(nvbench::cuda_stream{m_state.get_device()});
}
return m_state.get_cuda_stream().value();
}())}
, m_run_once{exec_state.get_run_once()}
, m_no_block{exec_state.get_disable_blocking_kernel()}
, m_min_samples{exec_state.get_min_samples()}
Expand Down
8 changes: 7 additions & 1 deletion nvbench/detail/measure_cupti.cu
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,13 @@ measure_cupti_base::measure_cupti_base(state &exec_state)
// (formatter doesn't handle `try :` very well...)
try
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_launch{[this]() -> decltype(auto) {
if (!m_state.get_cuda_stream().has_value())
{
m_state.set_cuda_stream(nvbench::cuda_stream{m_state.get_device()});
}
return m_state.get_cuda_stream().value();
}()}
, m_cupti{*m_state.get_device(), add_metrics(m_state)}
{}
// clang-format on
Expand Down
8 changes: 7 additions & 1 deletion nvbench/detail/measure_hot.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@ namespace nvbench::detail

measure_hot_base::measure_hot_base(state &exec_state)
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_launch{nvbench::launch([this]() -> decltype(auto) {
if (!m_state.get_cuda_stream().has_value())
{
m_state.set_cuda_stream(nvbench::cuda_stream{m_state.get_device()});
}
return m_state.get_cuda_stream().value();
Comment on lines +41 to +45
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels weird to have the initialization of the optional external to state.

How about putting this logic inside state::get_cuda_stream instead and don't expose the optional externally.

Copy link
Author

@elstehle elstehle Feb 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about putting this logic inside state::get_cuda_stream instead and don't expose the optional externally.

@allisonvacanti and I have discussed that option too but agreed to prefer explicitly setting the stream over implicitly initializing it as a byproduct, if it didn't exist. Considering the user interfacing with the API, I feel that, for multi-GPU systems, it's safer to make it explicit when resources are created and what device they are associated with. Especially, when the current device may influence what device a resource is associated with.

That said, I'm fine to have it any way we decide makes more sense. @allisonvacanti what do you think?

}())}
, m_min_samples{exec_state.get_min_samples()}
, m_min_time{exec_state.get_min_time()}
, m_skip_time{exec_state.get_skip_time()}
Expand Down
8 changes: 6 additions & 2 deletions nvbench/state.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ struct state
state &operator=(const state &) = delete;
state &operator=(state &&) = default;

/**
 * @return The optional CUDA stream associated with this state. Holds `std::nullopt` until a
 * stream is set via `set_cuda_stream`.
 */
[[nodiscard]] const std::optional<nvbench::cuda_stream> &get_cuda_stream() const
{
  return m_cuda_stream;
}
/// Associates \p stream with this state, taking ownership of it.
void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }

/// The CUDA device associated with with this benchmark state. May be
Expand Down Expand Up @@ -261,7 +264,6 @@ private:
std::optional<nvbench::device_info> device,
std::size_t type_config_index);

nvbench::cuda_stream m_cuda_stream;
std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
nvbench::named_values m_axis_values;
std::optional<nvbench::device_info> m_device;
Expand All @@ -277,6 +279,8 @@ private:
nvbench::float64_t m_skip_time;
nvbench::float64_t m_timeout;

std::optional<nvbench::cuda_stream> m_cuda_stream;

// Deadlock protection. See blocking_kernel's class doc for details.
nvbench::float64_t m_blocking_kernel_timeout{30.0};

Expand Down
2 changes: 2 additions & 0 deletions nvbench/state.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ state::state(const benchmark_base &bench)
, m_max_noise{bench.get_max_noise()}
, m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()}
, m_cuda_stream{std::nullopt}
{}

state::state(const benchmark_base &bench,
Expand All @@ -58,6 +59,7 @@ state::state(const benchmark_base &bench,
, m_max_noise{bench.get_max_noise()}
, m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()}
, m_cuda_stream{std::nullopt}
{}

nvbench::int64_t state::get_int64(const std::string &axis_name) const
Expand Down
1 change: 1 addition & 0 deletions testing/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ set(test_srcs
benchmark.cu
create.cu
cuda_timer.cu
cuda_stream.cu
cpu_timer.cu
enum_type_list.cu
float64_axis.cu
Expand Down
77 changes: 77 additions & 0 deletions testing/cuda_stream.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright 2023 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <nvbench/config.cuh>
#include <nvbench/cuda_stream.cuh>
#include <nvbench/device_manager.cuh>
#include <nvbench/types.cuh>

#include "test_asserts.cuh"

#include <fmt/format.h>

namespace
{
#ifdef NVBENCH_HAS_CUPTI
/**
* @brief Queries and returns the device id that the given \p cuda_stream is associated with
*
* @param cuda_stream The stream to get the device id for
* @return The device id that \p cuda_stream is associated with
*/
int get_device_of_stream(cudaStream_t cuda_stream)
{
  // Look up the context the stream was created in, then query that context's
  // device by briefly pushing it onto the current thread's context stack.
  CUcontext ctx;
  NVBENCH_DRIVER_API_CALL(cuStreamGetCtx(CUstream{cuda_stream}, &ctx));
  NVBENCH_DRIVER_API_CALL(cuCtxPushCurrent(ctx));
  CUdevice device_id{};
  NVBENCH_DRIVER_API_CALL(cuCtxGetDevice(&device_id));
  NVBENCH_DRIVER_API_CALL(cuCtxPopCurrent(&ctx));
  return static_cast<int>(device_id);
}
#endif
} // namespace

void test_basic()
{
#ifdef NVBENCH_HAS_CUPTI
  // Exercise both cuda_stream construction paths on every visible device:
  for (const auto &dev : nvbench::device_manager::get().get_devices())
  {
    // An explicit device argument must determine the stream's device, even
    // though `dev` is not the active device yet:
    nvbench::cuda_stream explicit_stream(dev);
    ASSERT(get_device_of_stream(explicit_stream.get_stream()) == dev.get_id());

    // Without a device argument, the stream belongs to the active device:
    dev.set_active();
    nvbench::cuda_stream implicit_stream{};
    ASSERT(get_device_of_stream(implicit_stream.get_stream()) == dev.get_id());
  }
#endif
}

// Entry point: run all cuda_stream tests.
int main()
{
  test_basic();
}
6 changes: 4 additions & 2 deletions testing/state.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ struct state_tester : public nvbench::state
void set_param(std::string name, T &&value)
{
this->state::m_axis_values.set_value(std::move(name),
nvbench::named_values::value_type{
std::forward<T>(value)});
nvbench::named_values::value_type{std::forward<T>(value)});
}
};
} // namespace nvbench::detail
Expand All @@ -57,6 +56,9 @@ void test_streams()

state_tester state{bench};

// Confirm that the stream hasn't been initialized yet
ASSERT(!state.get_cuda_stream().has_value());

// Test non-owning stream
cudaStream_t default_stream = 0;
state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});
Expand Down