4 changes: 3 additions & 1 deletion lib/llm/build.rs
@@ -44,7 +44,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
}

fn build_protos() -> Result<(), Box<dyn std::error::Error>> {
tonic_build::compile_protos("src/grpc/protos/kserve.proto")?;
tonic_build::configure()
.type_attribute(".", "#[derive(serde::Serialize,serde::Deserialize)]")
.compile_protos(&["kserve.proto"], &["src/grpc/protos"])?;
Ok(())
}

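With the blanket type_attribute(".", ...) above, every message generated from kserve.proto also derives serde's Serialize/Deserialize. A minimal sketch of what that enables, assuming serde_json is available as a dependency and referring to the generated inference module used later in this diff:

    // Sketch: serialize a generated KServe message to JSON for logging/debugging.
    // This only compiles because of the type_attribute line added in build.rs;
    // serde_json as a dependency is an assumption here.
    fn debug_dump(req: &inference::ModelMetadataRequest) -> serde_json::Result<String> {
        serde_json::to_string_pretty(req)
    }
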
203 changes: 137 additions & 66 deletions lib/llm/src/grpc/service/kserve.rs
@@ -11,6 +11,8 @@ use crate::http::service::Metrics;
use crate::http::service::metrics;

use crate::discovery::ModelManager;
use crate::local_model::runtime_config::ModelRuntimeConfig;
use crate::protocols::tensor::TensorModelConfig;
use crate::protocols::tensor::{NvCreateTensorRequest, NvCreateTensorResponse};
use crate::request_template::RequestTemplate;
use anyhow::Result;
@@ -39,6 +41,8 @@ use inference::{
ModelMetadataRequest, ModelMetadataResponse, ModelStreamInferResponse,
};

use prost::Message;

/// [gluo TODO] 'metrics' are for HTTP service and there is HTTP endpoint
/// for it as part of HTTP service. Should we always start HTTP service up
/// for non-inference?
@@ -183,6 +187,27 @@ impl KserveServiceConfigBuilder {
}
}

#[allow(clippy::large_enum_variant)]
enum Config {
Dynamo(TensorModelConfig),
Triton(ModelConfig),
}

impl Config {
fn from_runtime_config(runtime_config: &ModelRuntimeConfig) -> Result<Config, anyhow::Error> {
if let Some(tensor_model_config) = runtime_config.tensor_model_config.as_ref() {
if let Some(triton_model_config) = tensor_model_config.triton_model_config.as_ref() {
let model_config = ModelConfig::decode(triton_model_config.as_slice())?;
Ok(Config::Triton(model_config))
} else {
Ok(Config::Dynamo(tensor_model_config.clone()))
}
} else {
Err(anyhow::anyhow!("no model config is provided"))
}
}
}
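
A minimal test sketch of the dispatch above. It assumes ModelRuntimeConfig implements Default, which is not shown in this diff; only its tensor_model_config field is known from the code here. TensorModelConfig gains a Default derive in this PR (see lib/llm/src/protocols/tensor.rs).

    #[cfg(test)]
    mod config_dispatch_tests {
        use super::*;

        #[test]
        fn falls_back_to_dynamo_without_triton_blob() {
            // ModelRuntimeConfig::default() is assumed; only tensor_model_config
            // is known from this diff.
            let runtime_config = ModelRuntimeConfig {
                tensor_model_config: Some(TensorModelConfig {
                    name: "echo".to_string(),
                    ..Default::default()
                }),
                ..Default::default()
            };
            match Config::from_runtime_config(&runtime_config).expect("valid config") {
                Config::Dynamo(cfg) => assert_eq!(cfg.name, "echo"),
                Config::Triton(_) => panic!("expected the Dynamo variant"),
            }
        }
    }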

#[tonic::async_trait]
impl GrpcInferenceService for KserveService {
async fn model_infer(
@@ -416,38 +441,76 @@ impl GrpcInferenceService for KserveService {
.find(|card| request_model_name == &card.display_name)
{
if card.model_type.supports_tensor() {
if let Some(tensor_model_config) = card.runtime_config.tensor_model_config.as_ref()
{
return Ok(Response::new(ModelMetadataResponse {
name: tensor_model_config.name.clone(),
versions: vec!["1".to_string()],
platform: "dynamo".to_string(),
inputs: tensor_model_config
.inputs
.iter()
.map(|input| inference::model_metadata_response::TensorMetadata {
name: input.name.clone(),
datatype: input.data_type.to_string(),
shape: input.shape.clone(),
})
.collect(),
outputs: tensor_model_config
.outputs
.iter()
.map(
|output| inference::model_metadata_response::TensorMetadata {
name: output.name.clone(),
datatype: output.data_type.to_string(),
shape: output.shape.clone(),
},
)
.collect(),
}));
let config = Config::from_runtime_config(&card.runtime_config).map_err(|e| {
Status::invalid_argument(format!(
"Model '{}' has type Tensor but: {}",
request_model_name, e
))
})?;
match config {
Config::Triton(model_config) => {
return Ok(Response::new(ModelMetadataResponse {
name: model_config.name,
versions: vec!["1".to_string()],
platform: model_config.platform,
inputs: model_config
.input
.iter()
.map(|input| inference::model_metadata_response::TensorMetadata {
name: input.name.clone(),
datatype: match inference::DataType::try_from(input.data_type) {
Ok(dt) => dt.as_str_name().to_string(),
Err(_) => "TYPE_INVALID".to_string(),
},
shape: input.dims.clone(),
})
.collect(),
outputs: model_config
.output
.iter()
.map(
|output| inference::model_metadata_response::TensorMetadata {
name: output.name.clone(),
datatype: match inference::DataType::try_from(
output.data_type,
) {
Ok(dt) => dt.as_str_name().to_string(),
Err(_) => "TYPE_INVALID".to_string(),
},
shape: output.dims.clone(),
},
)
.collect(),
}));
}
Config::Dynamo(model_config) => {
return Ok(Response::new(ModelMetadataResponse {
name: model_config.name.clone(),
versions: vec!["1".to_string()],
platform: "dynamo".to_string(),
inputs: model_config
.inputs
.iter()
.map(|input| inference::model_metadata_response::TensorMetadata {
name: input.name.clone(),
datatype: input.data_type.to_string(),
shape: input.shape.clone(),
})
.collect(),
outputs: model_config
.outputs
.iter()
.map(
|output| inference::model_metadata_response::TensorMetadata {
name: output.name.clone(),
datatype: output.data_type.to_string(),
shape: output.shape.clone(),
},
)
.collect(),
}));
}
}
Err(Status::invalid_argument(format!(
"Model '{}' has type Tensor but no model config is provided",
request_model_name
)))?
} else if card.model_type.supports_completions() {
return Ok(Response::new(ModelMetadataResponse {
name: card.display_name,
@@ -497,42 +560,50 @@ impl GrpcInferenceService for KserveService {
.find(|card| request_model_name == &card.display_name)
{
if card.model_type.supports_tensor() {
if let Some(tensor_model_config) = card.runtime_config.tensor_model_config.as_ref()
{
let model_config = ModelConfig {
name: tensor_model_config.name.clone(),
platform: "dynamo".to_string(),
backend: "dynamo".to_string(),
input: tensor_model_config
.inputs
.iter()
.map(|input| ModelInput {
name: input.name.clone(),
data_type: input.data_type.to_kserve(),
dims: input.shape.clone(),
..Default::default()
})
.collect(),
output: tensor_model_config
.outputs
.iter()
.map(|output| ModelOutput {
name: output.name.clone(),
data_type: output.data_type.to_kserve(),
dims: output.shape.clone(),
..Default::default()
})
.collect(),
..Default::default()
};
return Ok(Response::new(ModelConfigResponse {
config: Some(model_config.clone()),
}));
let config = Config::from_runtime_config(&card.runtime_config).map_err(|e| {
Status::invalid_argument(format!(
"Model '{}' has type Tensor but: {}",
request_model_name, e
))
})?;
match config {
Config::Triton(model_config) => {
return Ok(Response::new(ModelConfigResponse {
config: Some(model_config),
}));
}
Config::Dynamo(tensor_model_config) => {
let model_config = ModelConfig {
name: tensor_model_config.name.clone(),
platform: "dynamo".to_string(),
@rmccorm4 (Contributor) commented on Oct 30, 2025:
Can you write up some documentation on the kserve frontend before you go on PTO? A separate PR is OK.

High-level things I'd want to see in something like docs/frontend/kserve.md:

  1. ModelInput.Tensor behavior / motivation
  2. ModelInput.text/tokens behavior and how we support conversion
  3. Triton Model Config compatibility/interaction
  4. Reference to the example tensor echo worker
  5. Reference to the Python bindings doc: https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/kserve_grpc_service/README.md
  6. Limitations
    • implications (if any) of the hard-coded "dynamo" platform/backend I see here
    • implications (if any) of the hard-coded "version=1" I see here
    • ...

backend: "dynamo".to_string(),
input: tensor_model_config
.inputs
.iter()
.map(|input| ModelInput {
name: input.name.clone(),
data_type: input.data_type.to_kserve(),
dims: input.shape.clone(),
..Default::default()
})
.collect(),
output: tensor_model_config
.outputs
.iter()
.map(|output| ModelOutput {
name: output.name.clone(),
data_type: output.data_type.to_kserve(),
dims: output.shape.clone(),
..Default::default()
})
.collect(),
..Default::default()
};
return Ok(Response::new(ModelConfigResponse {
config: Some(model_config.clone()),
}));
}
}
Err(Status::invalid_argument(format!(
"Model '{}' has type Tensor but no model config is provided",
request_model_name
)))?
} else if card.model_type.supports_completions() {
let config = ModelConfig {
name: card.display_name,
6 changes: 5 additions & 1 deletion lib/llm/src/protocols/tensor.rs
@@ -124,11 +124,15 @@ pub struct TensorMetadata {
pub parameters: Parameters,
}

#[derive(Serialize, Deserialize, Validate, Debug, Clone, PartialEq)]
#[derive(Serialize, Deserialize, Validate, Debug, Clone, PartialEq, Default)]
pub struct TensorModelConfig {
pub name: String,
pub inputs: Vec<TensorMetadata>,
pub outputs: Vec<TensorMetadata>,
// Optional Triton model config as a serialized protobuf message;
// if provided, it supersedes the basic model config defined above.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub triton_model_config: Option<Vec<u8>>,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
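For illustration, a hedged sketch of how a worker might populate the new field. It assumes the prost-generated Triton ModelConfig type (the one decoded in kserve.rs) is in scope here, and the helper name is made up; encode_to_vec comes from prost::Message.

    use prost::Message;

    // Hypothetical helper: advertise a full Triton model config instead of the
    // basic inputs/outputs description. The serialized bytes are what
    // Config::from_runtime_config in kserve.rs later decodes.
    fn tensor_config_with_triton(name: &str, triton: &ModelConfig) -> TensorModelConfig {
        TensorModelConfig {
            name: name.to_string(),
            inputs: vec![],
            outputs: vec![],
            triton_model_config: Some(triton.encode_to_vec()),
        }
    }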