feat: Add parameters support to InferResponse #394

Merged: 5 commits, Jan 25, 2025
21 changes: 20 additions & 1 deletion README.md
@@ -1,5 +1,5 @@
<!--
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -54,6 +54,7 @@ any C++ code.
- [`finalize`](#finalize)
- [Model Config File](#model-config-file)
- [Inference Request Parameters](#inference-request-parameters)
- [Inference Response Parameters](#inference-response-parameters)
- [Managing Python Runtime and Libraries](#managing-python-runtime-and-libraries)
- [Building Custom Python Backend Stub](#building-custom-python-backend-stub)
- [Creating Custom Execution Environments](#creating-custom-execution-environments)
@@ -787,6 +788,24 @@ You can read more about the inference request parameters in the [parameters
extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_parameters.md)
documentation.

## Inference Response Parameters

Inference response parameters may optionally be set when constructing an
inference response object. The parameters must be a dictionary of key-value
pairs, where keys are `str` and values are `bool`, `int` or `str`. For example,
```python
response = pb_utils.InferenceResponse(
output_tensors, parameters={"key": "value"}
)
```

You can read more about the inference response parameters in the [parameters
extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_parameters.md)
documentation.
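
To put the constructor in context, here is a minimal `execute` sketch that
attaches parameters to every response; the tensor names and parameter values
are illustrative only:
```python
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            # "INPUT0"/"OUTPUT0" are illustrative tensor names.
            input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            output0 = pb_utils.Tensor("OUTPUT0", input0.as_numpy())
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[output0],
                    parameters={"cached": False, "attempts": 1},
                )
            )
        return responses
```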
Inference response parameters are currently not supported on inference
responses received by BLS models.

Reviewer comment (Contributor): not sure this is clear ....

## Managing Python Runtime and Libraries

Python backend shipped in the [NVIDIA GPU Cloud](https://ngc.nvidia.com/)
78 changes: 67 additions & 11 deletions src/infer_response.cc
@@ -1,4 +1,4 @@
// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -39,8 +39,10 @@ namespace triton { namespace backend { namespace python {

InferResponse::InferResponse(
const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
std::shared_ptr<PbError> error, const bool is_last_response, void* id)
: error_(error), is_last_response_(is_last_response), id_(id)
std::shared_ptr<PbError> error, std::string parameters,
const bool is_last_response, void* id)
: error_(error), is_last_response_(is_last_response), id_(id),
parameters_(std::move(parameters))
{
for (auto& output : output_tensors) {
if (!output) {
@@ -58,6 +60,12 @@ InferResponse::OutputTensors()
return output_tensors_;
}

const std::string&
InferResponse::Parameters() const
{
return parameters_;
}

bool
InferResponse::HasError()
{
@@ -106,6 +114,9 @@ InferResponse::SaveToSharedMemory(
j++;
}
response_shm_ptr->id = id_;

parameters_shm_ = PbString::Create(shm_pool, parameters_);
response_shm_ptr->parameters = parameters_shm_->ShmHandle();
}
}

@@ -143,6 +154,8 @@ InferResponse::LoadFromSharedMemory(

std::shared_ptr<PbError> pb_error;
std::vector<std::shared_ptr<PbTensor>> output_tensors;
std::shared_ptr<PbString> parameters_shm;
std::string parameters;

// If the error field is set, do not load output tensors from shared memory.
if (response_shm_ptr->has_error && response_shm_ptr->is_error_set) {
@@ -154,33 +167,44 @@ InferResponse::LoadFromSharedMemory(
bi::managed_external_buffer::handle_t* tensor_handle_shm =
reinterpret_cast<bi::managed_external_buffer::handle_t*>(
response_shm.data_.get() + sizeof(ResponseShm));
{
#ifdef TRITON_PB_STUB
// Need to acquire the GIL to avoid hangs.
py::gil_scoped_acquire acquire;
// PbTensor::LoadFromSharedMemory() will construct Python objects if
// called from pb_stub, which requires holding the GIL.
py::gil_scoped_acquire acquire;
#endif
for (size_t idx = 0; idx < requested_output_count; ++idx) {
std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
shm_pool, tensor_handle_shm[idx], open_cuda_handle);
output_tensors.emplace_back(std::move(pb_tensor));
for (size_t idx = 0; idx < requested_output_count; ++idx) {
std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
shm_pool, tensor_handle_shm[idx], open_cuda_handle);
output_tensors.emplace_back(std::move(pb_tensor));
}
}

parameters_shm = std::move(
PbString::LoadFromSharedMemory(shm_pool, response_shm_ptr->parameters));
parameters = parameters_shm->String();
}

return std::unique_ptr<InferResponse>(new InferResponse(
response_shm, output_tensors, pb_error,
response_shm_ptr->is_last_response, response_shm_ptr->id));
response_shm_ptr->is_last_response, response_shm_ptr->id, parameters_shm,
parameters));
}

InferResponse::InferResponse(
AllocatedSharedMemory<char>& response_shm,
std::vector<std::shared_ptr<PbTensor>>& output_tensors,
std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id)
std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
std::shared_ptr<PbString>& parameters_shm, std::string& parameters)
{
response_shm_ = std::move(response_shm);
output_tensors_ = std::move(output_tensors);
error_ = std::move(pb_error);
shm_handle_ = response_shm_.handle_;
id_ = id;
is_last_response_ = is_last_response;
parameters_shm_ = std::move(parameters_shm);
parameters_ = std::move(parameters);
}

std::shared_ptr<PbError>&
@@ -387,6 +411,38 @@ InferResponse::Send(
cuda_copy |= cuda_used;
}

if (!parameters_.empty()) {
triton::common::TritonJson::Value param;
THROW_IF_TRITON_ERROR(
param.Parse(parameters_.c_str(), parameters_.length()));
std::vector<std::string> param_keys;
THROW_IF_TRITON_ERROR(param.Members(&param_keys));
for (const auto& key : param_keys) {
triton::common::TritonJson::Value value;
if (!param.Find(key.c_str(), &value)) {
throw PythonBackendException("Unexpected missing key on parameters");
}
if (value.IsString()) {
std::string string_value;
THROW_IF_TRITON_ERROR(value.AsString(&string_value));
THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetStringParameter(
response, key.c_str(), string_value.c_str()));
} else if (value.IsInt()) {
int64_t int_value = 0;
THROW_IF_TRITON_ERROR(value.AsInt(&int_value));
THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetIntParameter(
response, key.c_str(), int_value));
} else if (value.IsBool()) {
bool bool_value = false;
THROW_IF_TRITON_ERROR(value.AsBool(&bool_value));
THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetBoolParameter(
response, key.c_str(), bool_value));
} else {
throw PythonBackendException("Unsupported value type on parameters");
}
}
}

#ifdef TRITON_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream));
13 changes: 9 additions & 4 deletions src/infer_response.h
@@ -1,4 +1,4 @@
// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -38,6 +38,7 @@ namespace triton { namespace backend { namespace python {

struct ResponseShm {
uint32_t outputs_size;
bi::managed_external_buffer::handle_t parameters;
bi::managed_external_buffer::handle_t error;
bool has_error;
// Indicates whether this error has a message or not.
@@ -72,9 +73,10 @@ class InferResponse {
public:
InferResponse(
const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
std::shared_ptr<PbError> error = nullptr,
std::shared_ptr<PbError> error = nullptr, std::string parameters = "",
const bool is_last_response = true, void* id = nullptr);
std::vector<std::shared_ptr<PbTensor>>& OutputTensors();
const std::string& Parameters() const; // JSON serializable unless empty
void SaveToSharedMemory(
std::unique_ptr<SharedMemoryManager>& shm_pool, bool copy_gpu = true);
static std::unique_ptr<InferResponse> LoadFromSharedMemory(
@@ -116,8 +118,8 @@ class InferResponse {
InferResponse(
AllocatedSharedMemory<char>& response_shm,
std::vector<std::shared_ptr<PbTensor>>& output_tensors,
std::shared_ptr<PbError>& pb_error, const bool is_last_response,
void* id);
std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
std::shared_ptr<PbString>& parameters_shm, std::string& parameters);
std::vector<std::shared_ptr<PbTensor>> output_tensors_;

std::shared_ptr<PbError> error_;
@@ -128,6 +130,9 @@
bool is_last_response_;
// Representing the request id that the response was created from.
void* id_;

std::shared_ptr<PbString> parameters_shm_;
std::string parameters_;
};

}}} // namespace triton::backend::python
131 changes: 72 additions & 59 deletions src/pb_stub.cc
@@ -1,4 +1,4 @@
// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -104,6 +104,28 @@ PyDefaultArgumentToMutableType(const py::object& argument)
std::string(py::str(argument.get_type())));
}

std::string
PyParametersToJSON(const py::dict& parameters)
{
for (const auto& pair : parameters) {
if (!py::isinstance<py::str>(pair.first)) {
throw PythonBackendException(
"Expect parameters keys to have type str, found type " +
std::string(py::str(pair.first.get_type())));
}
if (!py::isinstance<py::bool_>(pair.second) &&
!py::isinstance<py::int_>(pair.second) &&
!py::isinstance<py::str>(pair.second)) {
throw PythonBackendException(
"Expect parameters values to have type bool/int/str, found type " +
std::string(py::str(pair.second.get_type())));
}
}
py::module_ py_json = py::module_::import("json");
std::string parameters_str = py::str(py_json.attr("dumps")(parameters));
return parameters_str;
}

void
AsyncEventFutureDoneCallback(const py::object& py_future)
{
@@ -1714,59 +1736,41 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
py::class_<InferRequest, std::shared_ptr<InferRequest>>(
module, "InferenceRequest")
.def(
py::init([](const std::string& request_id,
const py::object& correlation_id,
const std::vector<std::shared_ptr<PbTensor>>& inputs,
const std::vector<std::string>& requested_output_names,
const std::string& model_name,
const int64_t model_version, const uint32_t flags,
const uint64_t timeout,
const PreferredMemory& preferred_memory,
const InferenceTrace& trace,
const py::object& parameters_) {
py::dict parameters =
PyDefaultArgumentToMutableType<py::dict>(parameters_);
std::set<std::string> requested_outputs;
for (auto& requested_output_name : requested_output_names) {
requested_outputs.emplace(requested_output_name);
}
for (const auto& pair : parameters) {
if (!py::isinstance<py::str>(pair.first)) {
throw PythonBackendException(
"Expect parameters keys to have type str, found type " +
std::string(py::str(pair.first.get_type())));
}
if (!py::isinstance<py::bool_>(pair.second) &&
!py::isinstance<py::int_>(pair.second) &&
!py::isinstance<py::str>(pair.second)) {
throw PythonBackendException(
"Expect parameters values to have type bool/int/str, found "
"type " +
std::string(py::str(pair.second.get_type())));
}
}
py::module_ py_json = py::module_::import("json");
std::string parameters_str =
py::str(py_json.attr("dumps")(parameters));

CorrelationId correlation_id_obj;
if (py::isinstance<py::int_>(correlation_id)) {
correlation_id_obj =
CorrelationId(py::cast<uint64_t>(correlation_id));
} else if (py::isinstance<py::str>(correlation_id)) {
correlation_id_obj =
CorrelationId(py::cast<std::string>(correlation_id));
} else {
throw PythonBackendException(
"Correlation ID must be integer or string");
}

return std::make_shared<InferRequest>(
request_id, correlation_id_obj, inputs, requested_outputs,
model_name, model_version, parameters_str, flags, timeout,
0 /*response_factory_address*/, 0 /*request_address*/,
preferred_memory, trace);
}),
py::init(
[](const std::string& request_id,
const py::object& correlation_id,
const std::vector<std::shared_ptr<PbTensor>>& inputs,
const std::vector<std::string>& requested_output_names,
const std::string& model_name, const int64_t model_version,
const uint32_t flags, const uint64_t timeout,
const PreferredMemory& preferred_memory,
const InferenceTrace& trace, const py::object& parameters_) {
py::dict parameters =
PyDefaultArgumentToMutableType<py::dict>(parameters_);
std::set<std::string> requested_outputs;
for (auto& requested_output_name : requested_output_names) {
requested_outputs.emplace(requested_output_name);
}
std::string parameters_str = PyParametersToJSON(parameters);

CorrelationId correlation_id_obj;
if (py::isinstance<py::int_>(correlation_id)) {
correlation_id_obj =
CorrelationId(py::cast<uint64_t>(correlation_id));
} else if (py::isinstance<py::str>(correlation_id)) {
correlation_id_obj =
CorrelationId(py::cast<std::string>(correlation_id));
} else {
throw PythonBackendException(
"Correlation ID must be integer or string");
}

return std::make_shared<InferRequest>(
request_id, correlation_id_obj, inputs, requested_outputs,
model_name, model_version, parameters_str, flags, timeout,
0 /*response_factory_address*/, 0 /*request_address*/,
preferred_memory, trace);
}),
py::arg("request_id").none(false) = "",
py::arg("correlation_id").none(false) = 0,
py::arg("inputs").none(false),
@@ -1869,16 +1873,25 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
py::class_<InferResponse, std::shared_ptr<InferResponse>>(
module, "InferenceResponse")
.def(
py::init<
const std::vector<std::shared_ptr<PbTensor>>&,
std::shared_ptr<PbError>>(),
py::init(
[](const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
const std::shared_ptr<PbError>& error,
const py::object& parameters_) {
py::dict parameters =
PyDefaultArgumentToMutableType<py::dict>(parameters_);
std::string parameters_str = PyParametersToJSON(parameters);
return std::make_shared<InferResponse>(
output_tensors, error, parameters_str /* parameters */);
}),
py::arg("output_tensors") = py::list(),
py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr))
py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr),
py::arg("parameters") = py::none())
.def(
"output_tensors", &InferResponse::OutputTensors,
py::return_value_policy::reference)
.def("has_error", &InferResponse::HasError)
.def("error", &InferResponse::Error);
.def("error", &InferResponse::Error)
.def("parameters", &InferResponse::Parameters);

py::class_<ResponseSender, std::shared_ptr<ResponseSender>>(
module, "InferenceResponseSender")
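To illustrate the binding changes above from the model side, the following
sketch exercises the new `parameters` keyword argument and the `parameters()`
accessor inside a `model.py`; the parameter values are illustrative, and the
exact exception class raised on an invalid value type is an assumption:
```python
import json

import triton_python_backend_utils as pb_utils

# Keys must be str; values must be bool, int or str (see PyParametersToJSON).
response = pb_utils.InferenceResponse(
    output_tensors=[],
    parameters={"cached": True, "attempts": 2, "model": "resnet"},
)

# The parameters are stored JSON-serialized; parameters() returns that string
# (an empty string when no parameters were given).
assert json.loads(response.parameters()) == {
    "cached": True,
    "attempts": 2,
    "model": "resnet",
}

# Any other value type (e.g. float) is rejected by the stub's type check;
# the exact exception class raised here is an assumption.
try:
    pb_utils.InferenceResponse(output_tensors=[], parameters={"bad": 1.5})
except Exception as error:
    print(error)
```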