Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Cuda in Graphics Implementation for TensorRT backend #100

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
13 changes: 13 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat
option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON)
option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON)
option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF)
option(TRITON_ENABLE_CUDA_CTX_SHARING "Enable Cuda context sharing support in backend." OFF)

set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.")
set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. Multiple paths may be specified by separating them with a semicolon.")

Expand Down Expand Up @@ -271,6 +273,17 @@ target_link_libraries(
CUDA::cudart
)

# Optional CUDA context sharing: the backend calls the CUDA driver API
# (cuCtxPushCurrent / cuCtxPopCurrent), so it must link the driver library
# and compile with the feature macro defined.
if(${TRITON_ENABLE_CUDA_CTX_SHARING})
  target_link_libraries(triton-tensorrt-backend PRIVATE CUDA::cuda_driver)
  target_compile_definitions(
    triton-tensorrt-backend PRIVATE TRITON_ENABLE_CUDA_CTX_SHARING
  )
endif()

#
# Install
Expand Down
25 changes: 16 additions & 9 deletions src/instance_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,9 @@ ModelInstanceState::ModelInstanceState(

ModelInstanceState::~ModelInstanceState()
{
cudaSetDevice(DeviceId());
if (!model_state_->isCudaContextSharingEnabled()) {
cudaSetDevice(DeviceId());
}
for (auto& io_binding_infos : io_binding_infos_) {
for (auto& io_binding_info : io_binding_infos) {
if (!io_binding_info.IsDynamicShapeOutput() &&
Expand Down Expand Up @@ -424,7 +426,9 @@ ModelInstanceState::Run(
payload_.reset(new Payload(next_set_, requests, request_count));
SET_TIMESTAMP(payload_->compute_start_ns_);

cudaSetDevice(DeviceId());
if (!model_state_->isCudaContextSharingEnabled()) {
cudaSetDevice(DeviceId());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you mind sharing the reasoning for avoiding the set-device calls? Wouldn't that cause the model to not be placed / executed on the selected device (based on the model config)?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Cuda context sharing is intended only for single-GPU (RTX end-user) systems. I wanted to avoid complications with this use case.
  2. When we call cudaSetDevice(), the cuda runtime resets the thread to using the default cuda context.

}
#ifdef TRITON_ENABLE_STATS
{
SET_TIMESTAMP(payload_->compute_start_ns_);
Expand Down Expand Up @@ -1551,13 +1555,16 @@ ModelInstanceState::EvaluateTensorRTContext(
TRITONSERVER_Error*
ModelInstanceState::InitStreamsAndEvents()
{
// Set the device before preparing the context.
auto cuerr = cudaSetDevice(DeviceId());
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") +
Name() + ": " + cudaGetErrorString(cuerr))
.c_str());
if (!model_state_->isCudaContextSharingEnabled()) {
// Set the device before preparing the context.
auto cuerr = cudaSetDevice(DeviceId());
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set device for ") + Name() + ": " +
cudaGetErrorString(cuerr))
.c_str());
}
}

// Create CUDA streams associated with the instance
Expand Down
53 changes: 30 additions & 23 deletions src/model_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,9 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
ModelState::~ModelState()
{
for (auto& device_engine : device_engines_) {
cudaSetDevice(device_engine.first.first);
if (!isCudaContextSharingEnabled()) {
cudaSetDevice(device_engine.first.first);
}
auto& runtime = device_engine.second.first;
auto& engine = device_engine.second.second;
// Need to reset explicitly to ensure proper destruction order
Expand Down Expand Up @@ -209,15 +211,16 @@ ModelState::CreateEngine(
// We share the engine (for models that don't have dynamic shapes) and
// runtime across instances that have access to the same GPU/NVDLA.
if (eit->second.second == nullptr) {
auto cuerr = cudaSetDevice(gpu_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set device for ") + Name() + ": " +
cudaGetErrorString(cuerr))
.c_str());
if (!isCudaContextSharingEnabled()) {
auto cuerr = cudaSetDevice(gpu_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set device for ") + Name() + ": " +
cudaGetErrorString(cuerr))
.c_str());
}
}

const bool new_runtime = (eit->second.first == nullptr);
RETURN_IF_ERROR(LoadPlan(
model_path, dla_core_id, &eit->second.first, &eit->second.second,
Expand Down Expand Up @@ -321,13 +324,15 @@ ModelState::AutoCompleteConfig()
" to auto-complete config for " + Name())
.c_str()));

cuerr = cudaSetDevice(device_id);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set CUDA device to GPU ") +
std::to_string(device_id) + " : " + cudaGetErrorString(cuerr))
.c_str());
if (!isCudaContextSharingEnabled()) {
cuerr = cudaSetDevice(device_id);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set CUDA device to GPU ") +
std::to_string(device_id) + " : " + cudaGetErrorString(cuerr))
.c_str());
}
}

std::string artifact_name;
Expand Down Expand Up @@ -373,13 +378,15 @@ ModelState::AutoCompleteConfig()

RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path));

cuerr = cudaSetDevice(current_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to revert CUDA device to GPU ") +
std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
.c_str());
if (!isCudaContextSharingEnabled()) {
cuerr = cudaSetDevice(current_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to revert CUDA device to GPU ") +
std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
.c_str());
}
}

if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
Expand Down
8 changes: 8 additions & 0 deletions src/tensorrt.cc
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get());
}

ScopedRuntimeCudaContext cig_scope(model_state);

// With each instance we create a ModelInstanceState object and
// associate it with the TRITONBACKEND_ModelInstance.
Expand Down Expand Up @@ -353,6 +354,11 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
"TRITONBACKEND_ModelInstanceFinalize: delete instance state");
if (!instance_state) {
return nullptr;
}

ScopedRuntimeCudaContext cig_scope(instance_state->StateForModel());

delete instance_state;

Expand All @@ -377,6 +383,8 @@ TRITONBACKEND_ModelInstanceExecute(
instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state = instance_state->StateForModel();

ScopedRuntimeCudaContext cig_scope(model_state);

// For TensorRT backend, the executing instance may not closely tie to
// TRITONBACKEND_ModelInstance, the instance will be assigned based on
// execution policy.
Expand Down
32 changes: 32 additions & 0 deletions src/tensorrt_model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,14 @@ TensorRTModel::ParseModelConfig()
}
}

#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
std::string ptr_str = "";
RETURN_IF_ERROR(GetParameter("CUDA_CONTEXT_PTR", ptr_str));
cuda_ctx = static_cast<CUcontext>(StringToPointer(ptr_str));
// cuda_ctx = static_cast<CUcontext>(reinterpret_cast<void*>(ptr_str));
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set");
#endif // TRITON_ENABLE_CUDA_CTX_SHARING

return nullptr; // Success
}

Expand Down Expand Up @@ -120,4 +128,28 @@ TensorRTModel::GetCudaStreamPriority()
return cuda_stream_priority;
}

// Looks up the string-valued entry `name` in the "parameters" section of the
// model config and stores its "string_value" member in `str_value`.
//
// \param name Key of the parameter inside the "parameters" object.
// \param str_value Receives the parameter's "string_value" on success.
// \return nullptr on success; otherwise the error reported by the JSON
// lookup (missing "parameters" section, missing parameter, or a missing /
// non-string "string_value" member).
template <>
TRITONSERVER_Error*
TensorRTModel::GetParameter<std::string>(
    std::string const& name, std::string& str_value)
{
  triton::common::TritonJson::Value parameters;
  RETURN_IF_ERROR(model_config_.MemberAsObject("parameters", &parameters));

  triton::common::TritonJson::Value value;
  RETURN_IF_ERROR(parameters.MemberAsObject(name.c_str(), &value));

  // Propagate the lookup status instead of discarding it, so that a
  // malformed parameter is not silently treated as an empty string.
  return value.MemberAsString("string_value", &str_value);
}

}}} // namespace triton::backend::tensorrt
96 changes: 96 additions & 0 deletions src/tensorrt_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
#include <cuda.h>
#endif // TRITON_ENABLE_CUDA_CTX_SHARING
#include <sstream>

#include "triton/backend/backend_model.h"

namespace triton { namespace backend { namespace tensorrt {
Expand All @@ -34,6 +39,14 @@ class TensorRTModel : public BackendModel {
TensorRTModel(TRITONBACKEND_Model* triton_model);
virtual ~TensorRTModel() = default;

//! Fallback used when GetParameter is requested for a value type that has
//! no dedicated specialization. It never reads the model config and leaves
//! \p value untouched.
//!
//! \param name Name of the requested parameter (used in the error message).
//! \param value Unused by the fallback.
//! \return Always a TRITONSERVER_ERROR_UNSUPPORTED error; ownership of the
//! error object passes to the caller.
template <typename T>
TRITONSERVER_Error* GetParameter(std::string const& name, T& value)
{
  (void)value;  // intentionally not modified on the unsupported path
  // Return a real error object: the function's return type is an error
  // pointer, not T, and an assert alone vanishes in NDEBUG builds.
  return TRITONSERVER_ErrorNew(
      TRITONSERVER_ERROR_UNSUPPORTED,
      (std::string("unsupported value type requested for parameter '") +
       name + "'")
          .c_str());
}

TRITONSERVER_Error* SetTensorRTModelConfig();

TRITONSERVER_Error* ParseModelConfig();
Expand All @@ -53,6 +66,65 @@ class TensorRTModel : public BackendModel {
bool EagerBatching() { return eager_batching_; }
bool BusyWaitEvents() { return busy_wait_events_; }

// Declaration of the std::string specialization of GetParameter; it takes
// the parameter name and a string that receives the value, and returns a
// TRITONSERVER_Error* status.
// NOTE(review): this explicit specialization is declared at class scope,
// which some compilers may reject; declaring it at namespace scope after the
// class is the portable form -- confirm against the supported toolchains.
template <>
TRITONSERVER_Error* GetParameter<std::string>(
std::string const& name, std::string& str_value);

//! Parses the textual form of a pointer (as produced by streaming a void*,
//! e.g. "0x7f00dead") back into a raw pointer value.
//!
//! \param str Text to parse.
//! \return The parsed pointer, or nullptr when \p str is empty or does not
//! start with a parsable pointer value.
void* StringToPointer(const std::string& str)
{
  std::istringstream ss(str);

  // Start from nullptr: when extraction cannot even begin (empty or
  // whitespace-only input) the stream leaves the target untouched, which
  // previously returned an indeterminate pointer.
  void* ctx_ptr = nullptr;
  if (!(ss >> ctx_ptr)) {
    return nullptr;
  }
  return ctx_ptr;
}

//! Following functions are related to Cuda (Cuda in Graphics) context sharing
ashishk98 marked this conversation as resolved.
Show resolved Hide resolved
//! for the gaming use case. Creating a shared context reduces context-switching
//! overhead and leads to better performance of model execution alongside
//! Graphics workloads.

//! Reports whether a shared Cuda context has been configured for this model.
//! Always false when the backend is built without
//! TRITON_ENABLE_CUDA_CTX_SHARING.
bool isCudaContextSharingEnabled()
{
#ifndef TRITON_ENABLE_CUDA_CTX_SHARING
  return false;
#else
  const bool has_shared_ctx = (nullptr != cuda_ctx);
  return has_shared_ctx;
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
}

//! Pushes the stored Cuda context onto the calling thread's context stack
//! via cuCtxPushCurrent. Does nothing and reports success when the backend
//! is built without TRITON_ENABLE_CUDA_CTX_SHARING.
//!
//! \return nullptr on success, otherwise a TRITONSERVER_ERROR_INTERNAL
//! error naming this model.
inline TRITONSERVER_Error* PushCudaContext()
{
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
  const auto push_status = cuCtxPushCurrent(cuda_ctx);
  if (push_status != CUDA_SUCCESS) {
    const std::string msg =
        std::string("unable to push Cuda context for ") + Name();
    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, msg.c_str());
  }
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
  return nullptr;
}

//! Pops the current Cuda context from the calling thread's context stack
//! via cuCtxPopCurrent and checks that it is the context stored on this
//! model. Does nothing and reports success when the backend is built
//! without TRITON_ENABLE_CUDA_CTX_SHARING.
//!
//! \return nullptr on success, otherwise a TRITONSERVER_ERROR_INTERNAL
//! error (pop failed, or a different context was popped).
inline TRITONSERVER_Error* PopCudaContext()
{
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
  CUcontext popped_ctx{};
  const bool pop_ok = (cuCtxPopCurrent(&popped_ctx) == CUDA_SUCCESS);
  if (!pop_ok) {
    const std::string msg =
        std::string("unable to pop Cuda context for ") + Name();
    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, msg.c_str());
  }

  const bool is_expected_ctx = (popped_ctx == cuda_ctx);
  if (!is_expected_ctx) {
    const std::string msg =
        std::string("popping the wrong Cuda context for ") + Name();
    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, msg.c_str());
  }
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
  return nullptr;
}

protected:
common::TritonJson::Value graph_specs_;
Priority priority_;
Expand All @@ -61,6 +133,30 @@ class TensorRTModel : public BackendModel {
bool separate_output_stream_;
bool eager_batching_;
bool busy_wait_events_;
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
CUcontext cuda_ctx = nullptr;
#endif // TRITON_ENABLE_CUDA_CTX_SHARING
};

//! Scope guard for Cuda context sharing: when sharing is enabled on the
//! model, the constructor pushes the model's Cuda context onto the calling
//! thread and the destructor pops it again. When the backend is built
//! without TRITON_ENABLE_CUDA_CTX_SHARING, or sharing is not enabled for
//! the model, both are no-ops.
struct ScopedRuntimeCudaContext {
  //! \param model_state Model whose Cuda context is pushed; it is
  //! dereferenced in both the constructor and the destructor.
  //! A failed push is raised through THROW_IF_BACKEND_MODEL_ERROR.
  explicit ScopedRuntimeCudaContext(TensorRTModel* model_state)
      : model_state_(model_state)
  {
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
    if (model_state_->isCudaContextSharingEnabled()) {
      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext());
    }
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
  }

  ~ScopedRuntimeCudaContext()
  {
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
    if (model_state_->isCudaContextSharingEnabled()) {
      // Destructors are implicitly noexcept, so throwing here would call
      // std::terminate. Log the failure (and release the error) instead.
      LOG_IF_ERROR(
          model_state_->PopCudaContext(),
          "failed to pop shared Cuda context");
    }
#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
  }

  // A copy would pop the context a second time; forbid copying.
  ScopedRuntimeCudaContext(const ScopedRuntimeCudaContext&) = delete;
  ScopedRuntimeCudaContext& operator=(const ScopedRuntimeCudaContext&) =
      delete;

  TensorRTModel* model_state_;
};

}}} // namespace triton::backend::tensorrt