-
Notifications
You must be signed in to change notification settings - Fork 30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enable Cuda in Graphics Implementation for TensorRT backend #100
base: main
Are you sure you want to change the base?
Changes from 4 commits
134bf33
7353671
ed1296d
05e3786
9ae5a09
89ab580
b624b98
1f1ae7e
b42a8d6
d9aff26
80ba393
2ca19ac
20283c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -269,6 +269,7 @@ target_link_libraries( | |
triton-tensorrt-backend | ||
PRIVATE | ||
CUDA::cudart | ||
CUDA::cuda_driver | ||
) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -257,7 +257,10 @@ ModelInstanceState::ModelInstanceState( | |
|
||
ModelInstanceState::~ModelInstanceState() | ||
{ | ||
cudaSetDevice(DeviceId()); | ||
// Set device if CiG is disabled | ||
if (!model_state_->isCiGEnabled()) { | ||
cudaSetDevice(DeviceId()); | ||
} | ||
for (auto& io_binding_infos : io_binding_infos_) { | ||
for (auto& io_binding_info : io_binding_infos) { | ||
if (!io_binding_info.IsDynamicShapeOutput() && | ||
|
@@ -424,7 +427,10 @@ ModelInstanceState::Run( | |
payload_.reset(new Payload(next_set_, requests, request_count)); | ||
SET_TIMESTAMP(payload_->compute_start_ns_); | ||
|
||
cudaSetDevice(DeviceId()); | ||
// Set device if CiG is disabled | ||
if (!model_state_->isCiGEnabled()) { | ||
cudaSetDevice(DeviceId()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mind to share the reasoning of avoiding the set device calls? Wouldn't that cause the issue of model not being placed / executed on selected device (based on model config)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
} | ||
#ifdef TRITON_ENABLE_STATS | ||
{ | ||
SET_TIMESTAMP(payload_->compute_start_ns_); | ||
|
@@ -1551,13 +1557,17 @@ ModelInstanceState::EvaluateTensorRTContext( | |
TRITONSERVER_Error* | ||
ModelInstanceState::InitStreamsAndEvents() | ||
{ | ||
// Set the device before preparing the context. | ||
auto cuerr = cudaSetDevice(DeviceId()); | ||
if (cuerr != cudaSuccess) { | ||
return TRITONSERVER_ErrorNew( | ||
TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") + | ||
Name() + ": " + cudaGetErrorString(cuerr)) | ||
.c_str()); | ||
// Set device if CiG is disabled | ||
if (!model_state_->isCiGEnabled()) { | ||
// Set the device before preparing the context. | ||
auto cuerr = cudaSetDevice(DeviceId()); | ||
if (cuerr != cudaSuccess) { | ||
return TRITONSERVER_ErrorNew( | ||
TRITONSERVER_ERROR_INTERNAL, | ||
(std::string("unable to set device for ") + Name() + ": " + | ||
cudaGetErrorString(cuerr)) | ||
.c_str()); | ||
} | ||
} | ||
|
||
// Create CUDA streams associated with the instance | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ | |
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
#include "tensorrt_model.h" | ||
#include <sstream> | ||
|
||
namespace triton { namespace backend { namespace tensorrt { | ||
|
||
|
@@ -53,7 +54,7 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) | |
: BackendModel(triton_model), priority_(Priority::DEFAULT), | ||
use_cuda_graphs_(false), gather_kernel_buffer_threshold_(0), | ||
separate_output_stream_(false), eager_batching_(false), | ||
busy_wait_events_(false) | ||
busy_wait_events_(false), cig_ctx_(nullptr) | ||
{ | ||
ParseModelConfig(); | ||
} | ||
|
@@ -89,7 +90,20 @@ TensorRTModel::ParseModelConfig() | |
cuda.MemberAsBool("output_copy_stream", &separate_output_stream_)); | ||
} | ||
} | ||
|
||
triton::common::TritonJson::Value parameters; | ||
if (model_config_.Find("parameters", ¶meters)) { | ||
triton::common::TritonJson::Value value; | ||
std::string ptr_value; | ||
if (parameters.Find("CIG_CONTEXT_PTR", &value)) { | ||
RETURN_IF_ERROR(value.MemberAsString("string_value", &ptr_value)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ashishk98 instead of directly converting here as a special case, I would prefer to use something similar to what is done in the trt-llm backend: In this case there is a template method to convert from a parameter to a value - I think the code will be a little clearer to follow. Also - can we convert to and from a 64 bit integer? so something like:
Also it strikes me that although we use we could also use So two things - 1) add a templated GetParameter() method and 2) we can use MemberAsUint for the uint64 template. 3) officially transfer uint64 values and convert them to and from context. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have added a GetParameter call for std::string instead of UINT64. This is because when we add the parameter to model config it is directly converted into a hex string instead of a numeric string. Hence while parsing the pointer, MemberAsUint fails because it gets a hex string to parse. |
||
std::stringstream ss; | ||
ss << ptr_value; | ||
void* ctx_ptr; | ||
ss >> ctx_ptr; | ||
cig_ctx_ = static_cast<CUcontext>(ctx_ptr); | ||
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "CiG Context pointer is set"); | ||
} | ||
} | ||
return nullptr; // Success | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ | |
#pragma once | ||
|
||
#include "triton/backend/backend_model.h" | ||
#include <cuda.h> | ||
|
||
namespace triton { namespace backend { namespace tensorrt { | ||
|
||
|
@@ -53,6 +54,39 @@ class TensorRTModel : public BackendModel { | |
bool EagerBatching() { return eager_batching_; } | ||
bool BusyWaitEvents() { return busy_wait_events_; } | ||
|
||
|
||
//! Following functions are related to CiG (Cuda in Graphics) context sharing for | ||
//! gaming use case. Creating a shared contexts reduces context switching overhead | ||
//! and leads to better performance of model execution along side Graphics workload. | ||
CUcontext GetCiGContext() { return cig_ctx_; } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ashishk98 question: is this specific to CIG - or could be applied to any application provided cuda context? |
||
bool isCiGEnabled() { return cig_ctx_ != nullptr; } | ||
|
||
inline TRITONSERVER_Error* PushCiGContext() | ||
{ | ||
if (CUDA_SUCCESS != cuCtxPushCurrent(cig_ctx_)) { | ||
return TRITONSERVER_ErrorNew( | ||
TRITONSERVER_ERROR_INTERNAL, | ||
(std::string("unable to push CiG context for ") + Name()).c_str()); | ||
} | ||
return nullptr; | ||
} | ||
|
||
inline TRITONSERVER_Error* PopCiGContext() | ||
{ | ||
CUcontext oldCtx{}; | ||
if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) { | ||
return TRITONSERVER_ErrorNew( | ||
TRITONSERVER_ERROR_INTERNAL, | ||
(std::string("unable to [pop CiG context for ") + Name()).c_str()); | ||
} | ||
if (oldCtx != cig_ctx_) { | ||
return TRITONSERVER_ErrorNew( | ||
TRITONSERVER_ERROR_INTERNAL, | ||
(std::string("popping the wrong CiG context for ") + Name()).c_str()); | ||
} | ||
return nullptr; | ||
} | ||
|
||
protected: | ||
common::TritonJson::Value graph_specs_; | ||
Priority priority_; | ||
|
@@ -61,6 +95,24 @@ class TensorRTModel : public BackendModel { | |
bool separate_output_stream_; | ||
bool eager_batching_; | ||
bool busy_wait_events_; | ||
CUcontext cig_ctx_; | ||
}; | ||
|
||
struct ScopedRuntimeCiGContext { | ||
ScopedRuntimeCiGContext(TensorRTModel* model_state) | ||
: model_state_(model_state) | ||
{ | ||
if (model_state_->isCiGEnabled()) { | ||
THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCiGContext()); | ||
} | ||
} | ||
~ScopedRuntimeCiGContext() | ||
{ | ||
if (model_state_->isCiGEnabled()) { | ||
THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCiGContext()); | ||
} | ||
} | ||
TensorRTModel* model_state_; | ||
}; | ||
|
||
}}} // namespace triton::backend::tensorrt |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@nv-kmcgill53 , @mc-nv, @tanmayv25 - any issues with this dependency
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the context behind adding this dependency?
From documentation is see:
Aren't this dependency is requisite of TensorRT itself?
Thought by default our product expect driver to be installed and if GPU capability given then available for usage including driver targets and binaries.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding this dependency should be fine. Ashish is linking correctly according to the cuda documentation. As it states
So they will need to link against the driver instead of just linking against the cuda runtime.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not agree with this statement, current linkage doesn't explain why user want to add it explicitly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The cmake documentation isn't exhaustive when it mentions
cuMemAlloc
andcuMemFree
. The user in this case is using the Driver API to set/pass the cuda context around in the backend, rather than letting thecore
take care of this. This is the reason for adding theCUDA::cuda_driver
lib to the linking path. This PR necessarily makes use of functions in the driver where the trt_backend didn't before.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Triton TensorRT Backend is unable to work without CUDA, Triton Inference Server and TensorRT installation.
Current change, per my understanding, uses only
cudaSetDevice
(CUDA::cudart) andcudaGetErrorString
(CUDA runtime API) and those dependencies are satisfied. There why I don't see any reason to link againstCUDA::cuda_driver
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The additional dependency is required for cuCtxPushCurrent() / cuCtxPopCurrent() @ https://github.com/triton-inference-server/tensorrt_backend/pull/100/files#diff-3137e95db7c97b9ddd71fa1b600e3dd646025c5a872ad94d4d09f04313fe3fcdR66
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The reason we need this dependency is because we are using a special context call Cuda in Graphics (CiG) context which has to work with the cuda driver dll for its operations.