Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 53 additions & 7 deletions src/app/endpoints/rlsapi_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
handle_known_apistatus_errors,
)
from utils.responses import (
build_turn_summary,
extract_text_from_response_items,
extract_token_usage,
get_mcp_tools,
Expand Down Expand Up @@ -405,7 +406,7 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
return None


@router.post("/infer", responses=infer_responses)
@router.post("/infer", responses=infer_responses, response_model_exclude_none=True)
@authorize(Action.RLSAPI_V1_INFER)
async def infer_endpoint( # pylint: disable=R0914
infer_request: RlsapiV1InferRequest,
Expand Down Expand Up @@ -448,14 +449,38 @@ async def infer_endpoint( # pylint: disable=R0914
)

start_time = time.monotonic()

# Check if verbose metadata should be returned
verbose_enabled = (
configuration.customization is not None
and configuration.customization.allow_verbose_infer
and infer_request.include_metadata
)

try:
instructions = _build_instructions(infer_request.context.systeminfo)
response_text = await retrieve_simple_response(
input_source,
instructions,
tools=cast(list[Any], mcp_tools),
model_id=model_id,
)

# For verbose mode, retrieve the full response object instead of just text
if verbose_enabled:
client = AsyncLlamaStackClientHolder().get_client()
response = await client.responses.create(
input=input_source,
model=model_id,
instructions=instructions,
tools=mcp_tools or [],
stream=False,
store=False,
)
response = cast(OpenAIResponseObject, response)
response_text = extract_text_from_response_items(response.output)
Comment on lines +462 to +475
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Verbose path may skip token usage metrics recording.

The non-verbose path calls retrieve_simple_response(), which invokes extract_token_usage() to record token usage metrics (via _increment_llm_call_metric()). The verbose path directly calls client.responses.create() but does not call extract_token_usage(), so token usage metrics may not be recorded for verbose requests.

Consider calling extract_token_usage(response.usage, model_id) in the verbose path to ensure consistent metrics tracking:

Proposed fix
             response = cast(OpenAIResponseObject, response)
             response_text = extract_text_from_response_items(response.output)
+            extract_token_usage(response.usage, model_id)
         else:
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/app/endpoints/rlsapi_v1.py` around lines 462 - 475, the verbose branch
calls AsyncLlamaStackClientHolder().get_client().responses.create(...) and then
extract_text_from_response_items(response.output), but it never records token
usage metrics. Update the verbose path to call
extract_token_usage(response.usage, model_id) right after casting the response
(next to the extract_text_from_response_items call). This mirrors what
retrieve_simple_response does, so that _increment_llm_call_metric runs and token
usage is tracked consistently for verbose requests.

else:
response = None
response_text = await retrieve_simple_response(
input_source,
instructions,
tools=cast(list[Any], mcp_tools),
model_id=model_id,
)
inference_time = time.monotonic() - start_time
except _INFER_HANDLED_EXCEPTIONS as error:
_record_inference_failure(
Expand Down Expand Up @@ -493,6 +518,27 @@ async def infer_endpoint( # pylint: disable=R0914

logger.info("Completed rlsapi v1 /infer request %s", request_id)

# Build response with optional extended metadata
if verbose_enabled and response is not None:
# Extract metadata from full response object
turn_summary = build_turn_summary(
response, model_id, vector_store_ids=None, rag_id_mapping=None
)

return RlsapiV1InferResponse(
data=RlsapiV1InferData(
text=response_text,
request_id=request_id,
tool_calls=turn_summary.tool_calls,
tool_results=turn_summary.tool_results,
rag_chunks=turn_summary.rag_chunks,
referenced_documents=turn_summary.referenced_documents,
input_tokens=turn_summary.token_usage.input_tokens,
output_tokens=turn_summary.token_usage.output_tokens,
)
)

# Standard minimal response
return RlsapiV1InferResponse(
data=RlsapiV1InferData(
text=response_text,
Expand Down
15 changes: 15 additions & 0 deletions src/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,21 @@ class Customization(ConfigurationBase):
agent_card_config: Optional[dict[str, Any]] = None
custom_profile: Optional[CustomProfile] = Field(default=None, init=False)

# Debugging: Allow /v1/infer to return extended metadata
# WARNING: This should NOT be enabled in production environments.
# Setting this to True allows clients to request extended response data
# (tool_calls, tool_results, rag_chunks, referenced_documents, and
# input/output token counts) from the /v1/infer endpoint by including
# "include_metadata": true in the request body.
#
# If this feature is ever needed in production, implement RBAC-based access control instead:
# 1. Add Action.RLSAPI_V1_INFER_VERBOSE to models/config.py Action enum
# 2. Check authorization in infer_endpoint:
# if infer_request.include_metadata:
# if Action.RLSAPI_V1_INFER_VERBOSE not in request.state.authorized_actions:
# raise HTTPException(status_code=403, detail="Verbose infer not authorized")
# 3. Add the action to authorization rules for specific users/roles
allow_verbose_infer: bool = False

@model_validator(mode="after")
def check_customization_model(self) -> Self:
"""
Expand Down
7 changes: 7 additions & 0 deletions src/models/rlsapi/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ class RlsapiV1InferRequest(ConfigurationBase):
question: User question string.
context: Context with system info, terminal output, etc. (defaults provided).
skip_rag: Reserved for future use. RAG retrieval is not yet implemented.
include_metadata: Request extended response with debugging metadata (development/testing only).

Example:
```python
Expand Down Expand Up @@ -189,6 +190,12 @@ class RlsapiV1InferRequest(ConfigurationBase):
description="Reserved for future use. RAG retrieval is not yet implemented.",
examples=[False, True],
)
include_metadata: bool = Field(
default=False,
description="[Development/Testing Only] Return extended response with debugging metadata (tool_calls, rag_chunks, tokens). "
"Only honored when allow_verbose_infer is enabled in configuration. Not available in production.",
examples=[False, True],
)

@field_validator("question")
@classmethod
Expand Down
42 changes: 40 additions & 2 deletions src/models/rlsapi/responses.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
"""Models for rlsapi v1 REST API responses."""

from typing import Optional
from typing import Any, Optional
from pydantic import Field

from models.config import ConfigurationBase
from models.responses import AbstractSuccessfulResponse
from models.responses import (
AbstractSuccessfulResponse,
RAGChunk,
ReferencedDocument,
ToolCallSummary,
ToolResultSummary,
)


class RlsapiV1InferData(ConfigurationBase):
Expand All @@ -13,6 +19,12 @@ class RlsapiV1InferData(ConfigurationBase):
Attributes:
text: The generated response text.
request_id: Unique identifier for the request.
tool_calls: MCP tool calls made during inference (verbose mode only).
tool_results: Results from MCP tool calls (verbose mode only).
rag_chunks: RAG chunks retrieved from documentation (verbose mode only).
referenced_documents: Source documents referenced (verbose mode only).
input_tokens: Number of input tokens consumed (verbose mode only).
output_tokens: Number of output tokens generated (verbose mode only).
"""

text: str = Field(
Expand All @@ -26,6 +38,32 @@ class RlsapiV1InferData(ConfigurationBase):
examples=["01JDKR8N7QW9ZMXVGK3PB5TQWZ"],
)

# Extended metadata fields (only populated when the request sets
# include_metadata=true AND allow_verbose_infer is enabled in configuration)
tool_calls: Optional[list[ToolCallSummary]] = Field(
None,
description="Tool calls made during inference (requires include_metadata=true)",
)
tool_results: Optional[list[ToolResultSummary]] = Field(
None,
description="Results from tool calls (requires include_metadata=true)",
)
rag_chunks: Optional[list[RAGChunk]] = Field(
None,
description="Retrieved RAG documentation chunks (requires include_metadata=true)",
)
referenced_documents: Optional[list[ReferencedDocument]] = Field(
None,
description="Source documents referenced in answer (requires include_metadata=true)",
)
input_tokens: Optional[int] = Field(
None,
description="Number of input tokens consumed (requires include_metadata=true)",
)
output_tokens: Optional[int] = Field(
None,
description="Number of output tokens generated (requires include_metadata=true)",
)


class RlsapiV1InferResponse(AbstractSuccessfulResponse):
"""RHEL Lightspeed rlsapi v1 /infer response.
Expand Down