Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 59 additions & 7 deletions src/app/endpoints/rlsapi_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
handle_known_apistatus_errors,
)
from utils.responses import (
build_turn_summary,
extract_text_from_response_items,
extract_token_usage,
get_mcp_tools,
Expand Down Expand Up @@ -405,7 +406,7 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
return None


@router.post("/infer", responses=infer_responses)
@router.post("/infer", responses=infer_responses, response_model_exclude_none=True)
@authorize(Action.RLSAPI_V1_INFER)
async def infer_endpoint( # pylint: disable=R0914
infer_request: RlsapiV1InferRequest,
Expand Down Expand Up @@ -448,14 +449,38 @@ async def infer_endpoint( # pylint: disable=R0914
)

start_time = time.monotonic()

# Check if verbose metadata should be returned
verbose_enabled = (
configuration.customization is not None
and configuration.customization.allow_verbose_infer
and infer_request.include_metadata
)

try:
instructions = _build_instructions(infer_request.context.systeminfo)
response_text = await retrieve_simple_response(
input_source,
instructions,
tools=cast(list[Any], mcp_tools),
model_id=model_id,
)

# For verbose mode, retrieve the full response object instead of just text
if verbose_enabled:
client = AsyncLlamaStackClientHolder().get_client()
response = await client.responses.create(
input=input_source,
model=model_id,
instructions=instructions,
tools=mcp_tools or [],
stream=False,
store=False,
)
response = cast(OpenAIResponseObject, response)
response_text = extract_text_from_response_items(response.output)
Comment on lines +462 to +475
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Verbose path may skip token usage metrics recording.

The non-verbose path calls retrieve_simple_response(), which invokes extract_token_usage() to record token usage metrics (via _increment_llm_call_metric()). The verbose path directly calls client.responses.create() but does not call extract_token_usage(), so token usage metrics may not be recorded for verbose requests.

Consider calling extract_token_usage(response.usage, model_id) in the verbose path to ensure consistent metrics tracking:

Proposed fix
             response = cast(OpenAIResponseObject, response)
             response_text = extract_text_from_response_items(response.output)
+            extract_token_usage(response.usage, model_id)
         else:
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/app/endpoints/rlsapi_v1.py` around lines 462 - 475, the verbose branch calls
AsyncLlamaStackClientHolder().get_client().responses.create(...) and uses
extract_text_from_response_items(response.output), but it does not record token
metrics. Update the verbose path to call extract_token_usage(response.usage,
model_id) right after casting the response (next to the
extract_text_from_response_items call) so that the metrics recorded via
_increment_llm_call_metric are also updated, mirroring what
retrieve_simple_response does and keeping token usage tracking consistent for
verbose requests.

else:
response = None
response_text = await retrieve_simple_response(
input_source,
instructions,
tools=cast(list[Any], mcp_tools),
model_id=model_id,
)
inference_time = time.monotonic() - start_time
except _INFER_HANDLED_EXCEPTIONS as error:
_record_inference_failure(
Expand Down Expand Up @@ -493,9 +518,36 @@ async def infer_endpoint( # pylint: disable=R0914

logger.info("Completed rlsapi v1 /infer request %s", request_id)

# Build response with optional extended metadata
if verbose_enabled and response is not None:
# Extract metadata from full response object
turn_summary = build_turn_summary(
response, model_id, vector_store_ids=None, rag_id_mapping=None
)

return RlsapiV1InferResponse(
data=RlsapiV1InferData(
text=response_text,
request_id=request_id,
tool_calls=turn_summary.tool_calls,
tool_results=turn_summary.tool_results,
rag_chunks=turn_summary.rag_chunks,
referenced_documents=turn_summary.referenced_documents,
input_tokens=turn_summary.token_usage.input_tokens,
output_tokens=turn_summary.token_usage.output_tokens,
)
)

# Standard minimal response
return RlsapiV1InferResponse(
data=RlsapiV1InferData(
text=response_text,
request_id=request_id,
tool_calls=None,
tool_results=None,
rag_chunks=None,
referenced_documents=None,
input_tokens=None,
output_tokens=None,
)
)
15 changes: 15 additions & 0 deletions src/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,21 @@ class Customization(ConfigurationBase):
agent_card_config: Optional[dict[str, Any]] = None
custom_profile: Optional[CustomProfile] = Field(default=None, init=False)

# Debugging: Allow /v1/infer to return extended metadata
# WARNING: This should NOT be enabled in production environments.
# Setting this to True allows clients to request extended response data
# (tool_calls, rag_chunks, token_usage, etc.) from the /v1/infer endpoint
# by including "include_metadata": true in the request body.
#
# If this feature is ever needed in production, use RBAC-based access control instead:
# 1. Add Action.RLSAPI_V1_INFER_VERBOSE to models/config.py Action enum
# 2. Check authorization in infer_endpoint:
# if infer_request.include_metadata:
# if Action.RLSAPI_V1_INFER_VERBOSE not in request.state.authorized_actions:
# raise HTTPException(status_code=403, detail="Verbose infer not authorized")
# 3. Add the action to authorization rules for specific users/roles
allow_verbose_infer: bool = False

@model_validator(mode="after")
def check_customization_model(self) -> Self:
"""
Expand Down
10 changes: 10 additions & 0 deletions src/models/rlsapi/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ class RlsapiV1InferRequest(ConfigurationBase):
question: User question string.
context: Context with system info, terminal output, etc. (defaults provided).
skip_rag: Reserved for future use. RAG retrieval is not yet implemented.
include_metadata: Request extended response with debugging metadata (dev/testing only).

Example:
```python
Expand Down Expand Up @@ -189,6 +190,15 @@ class RlsapiV1InferRequest(ConfigurationBase):
description="Reserved for future use. RAG retrieval is not yet implemented.",
examples=[False, True],
)
include_metadata: bool = Field(
default=False,
description=(
"[Development/Testing Only] Return extended response with debugging metadata "
"(tool_calls, rag_chunks, tokens). Only honored when allow_verbose_infer is enabled. "
"Not available in production."
),
examples=[False, True],
)

@field_validator("question")
@classmethod
Expand Down
40 changes: 39 additions & 1 deletion src/models/rlsapi/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@
from pydantic import Field

from models.config import ConfigurationBase
from models.responses import AbstractSuccessfulResponse
from models.responses import (
AbstractSuccessfulResponse,
RAGChunk,
ReferencedDocument,
ToolCallSummary,
ToolResultSummary,
)


class RlsapiV1InferData(ConfigurationBase):
Expand All @@ -13,6 +19,12 @@ class RlsapiV1InferData(ConfigurationBase):
Attributes:
text: The generated response text.
request_id: Unique identifier for the request.
tool_calls: MCP tool calls made during inference (verbose mode only).
tool_results: Results from MCP tool calls (verbose mode only).
rag_chunks: RAG chunks retrieved from documentation (verbose mode only).
referenced_documents: Source documents referenced (verbose mode only).
input_tokens: Number of input tokens consumed (verbose mode only).
output_tokens: Number of output tokens generated (verbose mode only).
"""

text: str = Field(
Expand All @@ -26,6 +38,32 @@ class RlsapiV1InferData(ConfigurationBase):
examples=["01JDKR8N7QW9ZMXVGK3PB5TQWZ"],
)

# Extended metadata fields (only populated when include_metadata=true)
tool_calls: Optional[list[ToolCallSummary]] = Field(
None,
description="Tool calls made during inference (requires include_metadata=true)",
)
tool_results: Optional[list[ToolResultSummary]] = Field(
None,
description="Results from tool calls (requires include_metadata=true)",
)
rag_chunks: Optional[list[RAGChunk]] = Field(
None,
description="Retrieved RAG documentation chunks (requires include_metadata=true)",
)
referenced_documents: Optional[list[ReferencedDocument]] = Field(
None,
description="Source documents referenced in answer (requires include_metadata=true)",
)
input_tokens: Optional[int] = Field(
None,
description="Number of input tokens consumed (requires include_metadata=true)",
)
output_tokens: Optional[int] = Field(
None,
description="Number of output tokens generated (requires include_metadata=true)",
)


class RlsapiV1InferResponse(AbstractSuccessfulResponse):
"""RHEL Lightspeed rlsapi v1 /infer response.
Expand Down
109 changes: 109 additions & 0 deletions tests/unit/app/endpoints/test_rlsapi_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,16 @@ async def test_infer_minimal_request(
assert response.data.text == "This is a test LLM response."
assert response.data.request_id is not None
assert check_suid(response.data.request_id)
# Standard response must not include verbose metadata (dual opt-in required)
assert response.data.tool_calls is None
assert response.data.tool_results is None
assert response.data.rag_chunks is None
assert response.data.referenced_documents is None
assert response.data.input_tokens is None
assert response.data.output_tokens is None
# Serialized payload must contain only text and request_id (response_model_exclude_none=True)
data_keys = set(response.model_dump(exclude_none=True)["data"].keys())
assert data_keys == {"text", "request_id"}, f"Expected only text and request_id, got {data_keys}"


async def test_infer_full_context_request(
Expand Down Expand Up @@ -640,6 +650,105 @@ async def test_infer_empty_llm_response_returns_fallback(
assert response.data.text == constants.UNABLE_TO_PROCESS_RESPONSE


async def test_infer_include_metadata_returns_verbose_response(
    mocker: MockerFixture,
    mock_configuration: AppConfig,
    mock_llm_response: None,
    mock_auth_resolvers: None,
) -> None:
    """Check that /infer returns extended metadata under the dual opt-in.

    The configuration stub sets ``allow_verbose_infer`` and the request sets
    ``include_metadata=True``; the response is then expected to carry the
    metadata fields and the token counts taken from the mocked usage object.
    """
    # Dual opt-in, config side. The original test notes that customization is
    # a read-only property on AppConfig, hence the module-level configuration
    # object is patched instead of mutating the fixture.
    customization_stub = mocker.Mock(
        allow_verbose_infer=True,
        system_prompt="You are a helpful assistant.",
    )
    patched_config = mocker.Mock(
        inference=mock_configuration.inference,
        customization=customization_stub,
    )
    mocker.patch("app.endpoints.rlsapi_v1.configuration", patched_config)

    # Full response object carrying usage, so token counts can be extracted
    # when the turn summary is built.
    llm_reply = mocker.Mock(
        output=[
            _create_mock_response_output(mocker, "Verbose metadata test response.")
        ],
        usage=mocker.Mock(input_tokens=42, output_tokens=18),
    )
    _setup_responses_mock(mocker, mocker.AsyncMock(return_value=llm_reply))

    # Dual opt-in, request side: include_metadata=True.
    response = await infer_endpoint(
        infer_request=RlsapiV1InferRequest(
            question="How do I list files?", include_metadata=True
        ),
        request=_create_mock_request(mocker),
        background_tasks=_create_mock_background_tasks(mocker),
        auth=MOCK_AUTH,
    )

    assert isinstance(response, RlsapiV1InferResponse)
    data = response.data
    assert data.text == "Verbose metadata test response."
    assert data.request_id is not None
    assert check_suid(data.request_id)

    # Every metadata collection must be populated in verbose mode.
    for field_name in (
        "tool_calls",
        "tool_results",
        "rag_chunks",
        "referenced_documents",
    ):
        assert getattr(data, field_name) is not None, field_name

    # Token counts must match the mocked usage values.
    assert (data.input_tokens, data.output_tokens) == (42, 18)


async def test_infer_include_metadata_ignored_when_verbose_infer_disabled(
    mocker: MockerFixture,
    mock_configuration: AppConfig,
    mock_auth_resolvers: None,
) -> None:
    """Check that metadata stays excluded when only the request opts in.

    The request sets ``include_metadata=True`` but the configuration stub has
    ``allow_verbose_infer`` set to False, so every extended metadata field on
    the response is expected to be None.
    """
    # Config side of the opt-in is explicitly disabled.
    customization_stub = mocker.Mock(
        allow_verbose_infer=False,
        system_prompt="You are a helpful assistant.",
    )
    patched_config = mocker.Mock(
        inference=mock_configuration.inference,
        customization=customization_stub,
    )
    mocker.patch("app.endpoints.rlsapi_v1.configuration", patched_config)

    # The mocked response still carries usage data; it must not surface in the
    # response because verbose mode is off.
    llm_reply = mocker.Mock(
        output=[
            _create_mock_response_output(mocker, "Response with metadata disabled.")
        ],
        usage=mocker.Mock(input_tokens=99, output_tokens=11),
    )
    _setup_responses_mock(mocker, mocker.AsyncMock(return_value=llm_reply))

    response = await infer_endpoint(
        infer_request=RlsapiV1InferRequest(
            question="How do I list files?", include_metadata=True
        ),
        request=_create_mock_request(mocker),
        background_tasks=_create_mock_background_tasks(mocker),
        auth=MOCK_AUTH,
    )

    # None of the extended metadata fields may be populated.
    data = response.data
    for field_name in (
        "tool_calls",
        "tool_results",
        "rag_chunks",
        "referenced_documents",
        "input_tokens",
        "output_tokens",
    ):
        assert getattr(data, field_name) is None, field_name


# --- Test Splunk integration ---


Expand Down
Loading