Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 53 additions & 7 deletions src/app/endpoints/rlsapi_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
handle_known_apistatus_errors,
)
from utils.responses import (
build_turn_summary,
extract_text_from_response_items,
extract_token_usage,
get_mcp_tools,
Expand Down Expand Up @@ -405,7 +406,7 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
return None


@router.post("/infer", responses=infer_responses)
@router.post("/infer", responses=infer_responses, response_model_exclude_none=True)
@authorize(Action.RLSAPI_V1_INFER)
async def infer_endpoint( # pylint: disable=R0914
infer_request: RlsapiV1InferRequest,
Expand Down Expand Up @@ -448,14 +449,38 @@ async def infer_endpoint( # pylint: disable=R0914
)

start_time = time.monotonic()

# Check if verbose metadata should be returned
verbose_enabled = (
configuration.customization is not None
and configuration.customization.allow_verbose_infer
and infer_request.include_metadata
)

try:
instructions = _build_instructions(infer_request.context.systeminfo)
response_text = await retrieve_simple_response(
input_source,
instructions,
tools=cast(list[Any], mcp_tools),
model_id=model_id,
)

# For verbose mode, retrieve the full response object instead of just text
if verbose_enabled:
client = AsyncLlamaStackClientHolder().get_client()
response = await client.responses.create(
input=input_source,
model=model_id,
instructions=instructions,
tools=mcp_tools or [],
stream=False,
store=False,
)
response = cast(OpenAIResponseObject, response)
response_text = extract_text_from_response_items(response.output)
else:
response = None
response_text = await retrieve_simple_response(
input_source,
instructions,
tools=cast(list[Any], mcp_tools),
model_id=model_id,
)
inference_time = time.monotonic() - start_time
except _INFER_HANDLED_EXCEPTIONS as error:
_record_inference_failure(
Expand Down Expand Up @@ -493,6 +518,27 @@ async def infer_endpoint( # pylint: disable=R0914

logger.info("Completed rlsapi v1 /infer request %s", request_id)

# Build response with optional extended metadata
if verbose_enabled and response is not None:
# Extract metadata from full response object
turn_summary = build_turn_summary(
response, model_id, vector_store_ids=None, rag_id_mapping=None
)

return RlsapiV1InferResponse(
data=RlsapiV1InferData(
text=response_text,
request_id=request_id,
tool_calls=turn_summary.tool_calls,
tool_results=turn_summary.tool_results,
rag_chunks=turn_summary.rag_chunks,
referenced_documents=turn_summary.referenced_documents,
input_tokens=turn_summary.token_usage.input_tokens,
output_tokens=turn_summary.token_usage.output_tokens,
)
)

# Standard minimal response
return RlsapiV1InferResponse(
data=RlsapiV1InferData(
text=response_text,
Expand Down
15 changes: 15 additions & 0 deletions src/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,21 @@ class Customization(ConfigurationBase):
agent_card_config: Optional[dict[str, Any]] = None
custom_profile: Optional[CustomProfile] = Field(default=None, init=False)

# Debugging: Allow /v1/infer to return extended metadata
# WARNING: This should NOT be enabled in production environments.
# Setting this to True allows clients to request extended response data
# (tool_calls, rag_chunks, token_usage, etc.) from the /v1/infer endpoint
# by including "include_metadata": true in the request body.
#
# If this feature is ever needed in production, implement RBAC-based access control instead:
# 1. Add Action.RLSAPI_V1_INFER_VERBOSE to models/config.py Action enum
# 2. Check authorization in infer_endpoint:
# if infer_request.include_metadata:
# if Action.RLSAPI_V1_INFER_VERBOSE not in request.state.authorized_actions:
# raise HTTPException(status_code=403, detail="Verbose infer not authorized")
# 3. Add the action to authorization rules for specific users/roles
allow_verbose_infer: bool = False

@model_validator(mode="after")
def check_customization_model(self) -> Self:
"""
Expand Down
7 changes: 7 additions & 0 deletions src/models/rlsapi/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ class RlsapiV1InferRequest(ConfigurationBase):
question: User question string.
context: Context with system info, terminal output, etc. (defaults provided).
skip_rag: Reserved for future use. RAG retrieval is not yet implemented.
include_metadata: Request extended response with debugging metadata (development/testing only).

Example:
```python
Expand Down Expand Up @@ -189,6 +190,12 @@ class RlsapiV1InferRequest(ConfigurationBase):
description="Reserved for future use. RAG retrieval is not yet implemented.",
examples=[False, True],
)
include_metadata: bool = Field(
default=False,
description="[Development/Testing Only] Return extended response with debugging metadata (tool_calls, rag_chunks, tokens). "
"Only honored when allow_verbose_infer is enabled in configuration. Not available in production.",
examples=[False, True],
)

@field_validator("question")
@classmethod
Expand Down
42 changes: 40 additions & 2 deletions src/models/rlsapi/responses.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
"""Models for rlsapi v1 REST API responses."""

from typing import Optional
from typing import Any, Optional
from pydantic import Field

from models.config import ConfigurationBase
from models.responses import AbstractSuccessfulResponse
from models.responses import (
AbstractSuccessfulResponse,
RAGChunk,
ReferencedDocument,
ToolCallSummary,
ToolResultSummary,
)


class RlsapiV1InferData(ConfigurationBase):
Expand All @@ -13,6 +19,12 @@ class RlsapiV1InferData(ConfigurationBase):
Attributes:
text: The generated response text.
request_id: Unique identifier for the request.
tool_calls: MCP tool calls made during inference (verbose mode only).
tool_results: Results from MCP tool calls (verbose mode only).
rag_chunks: RAG chunks retrieved from documentation (verbose mode only).
referenced_documents: Source documents referenced (verbose mode only).
input_tokens: Number of input tokens consumed (verbose mode only).
output_tokens: Number of output tokens generated (verbose mode only).
"""

text: str = Field(
Expand All @@ -26,6 +38,32 @@ class RlsapiV1InferData(ConfigurationBase):
examples=["01JDKR8N7QW9ZMXVGK3PB5TQWZ"],
)

# Extended metadata fields (only populated when include_metadata=true)
tool_calls: Optional[list[ToolCallSummary]] = Field(
None,
description="Tool calls made during inference (requires include_metadata=true)",
)
tool_results: Optional[list[ToolResultSummary]] = Field(
None,
description="Results from tool calls (requires include_metadata=true)",
)
rag_chunks: Optional[list[RAGChunk]] = Field(
None,
description="Retrieved RAG documentation chunks (requires include_metadata=true)",
)
referenced_documents: Optional[list[ReferencedDocument]] = Field(
None,
description="Source documents referenced in answer (requires include_metadata=true)",
)
input_tokens: Optional[int] = Field(
None,
description="Number of input tokens consumed (requires include_metadata=true)",
)
output_tokens: Optional[int] = Field(
None,
description="Number of output tokens generated (requires include_metadata=true)",
)


class RlsapiV1InferResponse(AbstractSuccessfulResponse):
"""RHEL Lightspeed rlsapi v1 /infer response.
Expand Down