diff --git a/docs/openapi.json b/docs/openapi.json index aa53ed81b..512d6f7e7 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -3795,19 +3795,19 @@ } } }, - "/v1/infer": { + "/v1/responses": { "post": { "tags": [ - "rlsapi-v1" + "responses" ], - "summary": "Infer Endpoint", - "description": "Handle rlsapi v1 /infer requests for stateless inference.\n\nThis endpoint serves requests from the RHEL Lightspeed Command Line Assistant (CLA).\n\nAccepts a question with optional context (stdin, attachments, terminal output,\nsystem info) and returns an LLM-generated response.\n\nArgs:\n infer_request: The inference request containing question and context.\n request: The FastAPI request object for accessing headers and state.\n background_tasks: FastAPI background tasks for async Splunk event sending.\n auth: Authentication tuple from the configured auth provider.\n\nReturns:\n RlsapiV1InferResponse containing the generated response text and request ID.\n\nRaises:\n HTTPException: 503 if the LLM service is unavailable.", - "operationId": "infer_endpoint_v1_infer_post", + "summary": "Responses Endpoint Handler", + "description": "Handle request to the /responses endpoint using Responses API (LCORE specification).\n\nProcesses a POST request to the responses endpoint, forwarding the\nuser's request to a selected Llama Stack LLM and returning the generated response\nfollowing the LCORE OpenAPI specification.\n\nReturns:\n ResponsesResponse: Contains the response following LCORE specification (non-streaming).\n StreamingResponse: SSE-formatted streaming response with enriched events (streaming).\n - response.created event includes conversation attribute\n - response.completed event includes available_quotas attribute\n\nRaises:\n HTTPException:\n - 401: Unauthorized - Missing or invalid credentials\n - 403: Forbidden - Insufficient permissions or model override not allowed\n - 404: Not Found - Conversation, model, or provider not found\n - 413: Prompt too long - Prompt 
exceeded model's context window size\n - 422: Unprocessable Entity - Request validation failed\n - 429: Quota limit exceeded - The token quota for model or user has been exceeded\n - 500: Internal Server Error - Configuration not loaded or other server errors\n - 503: Service Unavailable - Unable to connect to Llama Stack backend", + "operationId": "responses_endpoint_handler_v1_responses_post", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RlsapiV1InferRequest" + "$ref": "#/components/schemas/ResponsesRequest" } } }, @@ -3819,12 +3819,39 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RlsapiV1InferResponse" + "$ref": "#/components/schemas/ResponsesResponse" }, "example": { - "data": { - "request_id": "01JDKR8N7QW9ZMXVGK3PB5TQWZ", - "text": "To list files in Linux, use the `ls` command." + "available_quotas": { + "daily": 1000, + "monthly": 50000 + }, + "completed_at": 1704067250, + "conversation": "conv_0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", + "created_at": 1704067200, + "id": "resp_abc123", + "instructions": "You are a helpful assistant", + "model": "openai/gpt-4-turbo", + "object": "response", + "output": [ + { + "content": [ + { + "text": "Kubernetes is an open-source container orchestration system...", + "type": "output_text" + } + ], + "role": "assistant", + "type": "message" + } + ], + "output_text": "Kubernetes is an open-source container orchestration system...", + "status": "completed", + "temperature": 0.7, + "usage": { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150 } } } @@ -3866,6 +3893,14 @@ "$ref": "#/components/schemas/ForbiddenResponse" }, "examples": { + "conversation read": { + "value": { + "detail": { + "cause": "User 6789 does not have permission to read conversation with ID 123e4567-e89b-12d3-a456-426614174000", + "response": "User does not have permission to perform this action" + } + } + }, "endpoint": { "value": { "detail": { 
@@ -3873,6 +3908,50 @@ "response": "User does not have permission to access this endpoint" } } + }, + "model override": { + "value": { + "detail": { + "cause": "User lacks model_override permission required to override model/provider.", + "response": "This instance does not permit overriding model/provider in the query request (missing permission: MODEL_OVERRIDE). Please remove the model and provider fields from your request." + } + } + } + } + } + } + }, + "404": { + "description": "Resource not found", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/NotFoundResponse" + }, + "examples": { + "conversation": { + "value": { + "detail": { + "cause": "Conversation with ID 123e4567-e89b-12d3-a456-426614174000 does not exist", + "response": "Conversation not found" + } + } + }, + "provider": { + "value": { + "detail": { + "cause": "Provider with ID openai does not exist", + "response": "Provider not found" + } + } + }, + "model": { + "value": { + "detail": { + "cause": "Model with ID gpt-4-turbo is not configured", + "response": "Model not found" + } + } } } } @@ -4008,6 +4087,16 @@ "application/json": { "schema": { "$ref": "#/components/schemas/InternalServerErrorResponse" + }, + "examples": { + "configuration": { + "value": { + "detail": { + "cause": "Lightspeed Stack configuration has not been initialized.", + "response": "Configuration is not loaded" + } + } + } } } } @@ -4035,26 +4124,37 @@ } } }, - "/readiness": { - "get": { + "/v1/infer": { + "post": { "tags": [ - "health" + "rlsapi-v1" ], - "summary": "Readiness Probe Get Method", - "description": "Handle the readiness probe endpoint, returning service readiness.\n\nIf any provider reports an error status, responds with HTTP 503\nand details of unhealthy providers; otherwise, indicates the\nservice is ready.\n\nReturns:\n ReadinessResponse: Object with `ready` indicating overall readiness,\n `reason` explaining the outcome, and `providers` containing the list of\n unhealthy 
ProviderHealthStatus entries (empty when ready).", - "operationId": "readiness_probe_get_method_readiness_get", + "summary": "Infer Endpoint", + "description": "Handle rlsapi v1 /infer requests for stateless inference.\n\nThis endpoint serves requests from the RHEL Lightspeed Command Line Assistant (CLA).\n\nAccepts a question with optional context (stdin, attachments, terminal output,\nsystem info) and returns an LLM-generated response.\n\nArgs:\n infer_request: The inference request containing question and context.\n request: The FastAPI request object for accessing headers and state.\n background_tasks: FastAPI background tasks for async Splunk event sending.\n auth: Authentication tuple from the configured auth provider.\n\nReturns:\n RlsapiV1InferResponse containing the generated response text and request ID.\n\nRaises:\n HTTPException: 503 if the LLM service is unavailable.", + "operationId": "infer_endpoint_v1_infer_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RlsapiV1InferRequest" + } + } + }, + "required": true + }, "responses": { "200": { "description": "Successful response", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ReadinessResponse" + "$ref": "#/components/schemas/RlsapiV1InferResponse" }, "example": { - "providers": [], - "ready": true, - "reason": "Service is ready" + "data": { + "request_id": "01JDKR8N7QW9ZMXVGK3PB5TQWZ", + "text": "To list files in Linux, use the `ls` command." 
+ } } } } @@ -4107,72 +4207,123 @@ } } }, - "503": { - "description": "Service unavailable", + "413": { + "description": "Prompt is too long", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ServiceUnavailableResponse" + "$ref": "#/components/schemas/PromptTooLongResponse" }, "examples": { - "llama stack": { + "prompt too long": { "value": { "detail": { - "cause": "Connection error while trying to reach backend service.", - "response": "Unable to connect to Llama Stack" + "cause": "The prompt exceeds the maximum allowed length.", + "response": "Prompt is too long" } } } } } } - } - } - } - }, - "/liveness": { - "get": { - "tags": [ - "health" - ], - "summary": "Liveness Probe Get Method", - "description": "Return the liveness status of the service.\n\nReturns:\n LivenessResponse: Indicates that the service is alive.", - "operationId": "liveness_probe_get_method_liveness_get", - "responses": { - "200": { - "description": "Successful response", + }, + "422": { + "description": "Request validation failed", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/LivenessResponse" + "$ref": "#/components/schemas/UnprocessableEntityResponse" }, - "example": { - "alive": true + "examples": { + "invalid format": { + "value": { + "detail": { + "cause": "Invalid request format. 
The request body could not be parsed.", + "response": "Invalid request format" + } + } + }, + "missing attributes": { + "value": { + "detail": { + "cause": "Missing required attributes: ['query', 'model', 'provider']", + "response": "Missing required attributes" + } + } + }, + "invalid value": { + "value": { + "detail": { + "cause": "Invalid attachment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']", + "response": "Invalid attribute value" + } + } + } } } } }, - "401": { - "description": "Unauthorized", + "429": { + "description": "Quota limit exceeded", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/UnauthorizedResponse" + "$ref": "#/components/schemas/QuotaExceededResponse" }, "examples": { - "missing header": { + "model": { "value": { "detail": { - "cause": "No Authorization header found", - "response": "Missing or invalid credentials provided by client" + "cause": "The token quota for model gpt-4-turbo has been exceeded.", + "response": "The model quota has been exceeded" } } }, - "missing token": { + "user none": { "value": { "detail": { - "cause": "No token found in Authorization header", - "response": "Missing or invalid credentials provided by client" + "cause": "User 123 has no available tokens.", + "response": "The quota has been exceeded" + } + } + }, + "cluster none": { + "value": { + "detail": { + "cause": "Cluster has no available tokens.", + "response": "The quota has been exceeded" + } + } + }, + "subject none": { + "value": { + "detail": { + "cause": "Unknown subject 999 has no available tokens.", + "response": "The quota has been exceeded" + } + } + }, + "user insufficient": { + "value": { + "detail": { + "cause": "User 123 has 5 tokens, but 10 tokens are needed.", + "response": "The quota has been exceeded" + } + } + }, + "cluster insufficient": { + "value": { + "detail": { + "cause": "Cluster has 500 tokens, but 900 tokens are needed.", + "response": "The quota has 
been exceeded" + } + } + }, + "subject insufficient": { + "value": { + "detail": { + "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.", + "response": "The quota has been exceeded" } } } @@ -4180,19 +4331,29 @@ } } }, - "403": { - "description": "Permission denied", + "500": { + "description": "Internal server error", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ForbiddenResponse" + "$ref": "#/components/schemas/InternalServerErrorResponse" + } + } + } + }, + "503": { + "description": "Service unavailable", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ServiceUnavailableResponse" }, "examples": { - "endpoint": { + "llama stack": { "value": { "detail": { - "cause": "User 6789 is not authorized to access this endpoint.", - "response": "User does not have permission to access this endpoint" + "cause": "Connection error while trying to reach backend service.", + "response": "Unable to connect to Llama Stack" } } } @@ -4203,10 +4364,178 @@ } } }, - "/authorized": { - "post": { + "/readiness": { + "get": { "tags": [ - "authorized" + "health" + ], + "summary": "Readiness Probe Get Method", + "description": "Handle the readiness probe endpoint, returning service readiness.\n\nIf any provider reports an error status, responds with HTTP 503\nand details of unhealthy providers; otherwise, indicates the\nservice is ready.\n\nReturns:\n ReadinessResponse: Object with `ready` indicating overall readiness,\n `reason` explaining the outcome, and `providers` containing the list of\n unhealthy ProviderHealthStatus entries (empty when ready).", + "operationId": "readiness_probe_get_method_readiness_get", + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ReadinessResponse" + }, + "example": { + "providers": [], + "ready": true, + "reason": "Service is ready" + } + } + } + }, + "401": { + 
"description": "Unauthorized", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UnauthorizedResponse" + }, + "examples": { + "missing header": { + "value": { + "detail": { + "cause": "No Authorization header found", + "response": "Missing or invalid credentials provided by client" + } + } + }, + "missing token": { + "value": { + "detail": { + "cause": "No token found in Authorization header", + "response": "Missing or invalid credentials provided by client" + } + } + } + } + } + } + }, + "403": { + "description": "Permission denied", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ForbiddenResponse" + }, + "examples": { + "endpoint": { + "value": { + "detail": { + "cause": "User 6789 is not authorized to access this endpoint.", + "response": "User does not have permission to access this endpoint" + } + } + } + } + } + } + }, + "503": { + "description": "Service unavailable", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ServiceUnavailableResponse" + }, + "examples": { + "llama stack": { + "value": { + "detail": { + "cause": "Connection error while trying to reach backend service.", + "response": "Unable to connect to Llama Stack" + } + } + } + } + } + } + } + } + } + }, + "/liveness": { + "get": { + "tags": [ + "health" + ], + "summary": "Liveness Probe Get Method", + "description": "Return the liveness status of the service.\n\nReturns:\n LivenessResponse: Indicates that the service is alive.", + "operationId": "liveness_probe_get_method_liveness_get", + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/LivenessResponse" + }, + "example": { + "alive": true + } + } + } + }, + "401": { + "description": "Unauthorized", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UnauthorizedResponse" + }, + "examples": { + "missing 
header": { + "value": { + "detail": { + "cause": "No Authorization header found", + "response": "Missing or invalid credentials provided by client" + } + } + }, + "missing token": { + "value": { + "detail": { + "cause": "No token found in Authorization header", + "response": "Missing or invalid credentials provided by client" + } + } + } + } + } + } + }, + "403": { + "description": "Permission denied", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ForbiddenResponse" + }, + "examples": { + "endpoint": { + "value": { + "detail": { + "cause": "User 6789 is not authorized to access this endpoint.", + "response": "User does not have permission to access this endpoint" + } + } + } + } + } + } + } + } + } + }, + "/authorized": { + "post": { + "tags": [ + "authorized" ], "summary": "Authorized Endpoint Handler", "description": "Handle request to the /authorized endpoint.\n\nProcess POST requests to the /authorized endpoint, returning\nthe authenticated user's ID and username.\n\nThe response intentionally omits any authentication token.\n\nReturns:\n AuthorizedResponse: Contains the user ID and username of the authenticated user.", @@ -5167,6 +5496,62 @@ "title": "AgentSkill", "description": "Represents a distinct capability or function that an agent can perform." 
}, + "AllowedToolsFilter": { + "properties": { + "tool_names": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tool Names" + } + }, + "type": "object", + "title": "AllowedToolsFilter", + "description": "Filter configuration for restricting which MCP tools can be used.\n\n:param tool_names: (Optional) List of specific tool names that are allowed" + }, + "ApprovalFilter": { + "properties": { + "always": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Always" + }, + "never": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Never" + } + }, + "type": "object", + "title": "ApprovalFilter", + "description": "Filter configuration for MCP tool approval requirements.\n\n:param always: (Optional) List of tool names that always require approval\n:param never: (Optional) List of tool names that never require approval" + }, "Attachment": { "properties": { "attachment_type": { @@ -7257,6 +7642,37 @@ } ] }, + "MCPListToolsTool": { + "properties": { + "input_schema": { + "additionalProperties": true, + "type": "object", + "title": "Input Schema" + }, + "name": { + "type": "string", + "title": "Name" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + } + }, + "type": "object", + "required": [ + "input_schema", + "name" + ], + "title": "MCPListToolsTool", + "description": "Tool definition returned by MCP list tools operation.\n\n:param input_schema: JSON schema defining the tool's input parameters\n:param name: Name of the tool\n:param description: (Optional) Description of what the tool does" + }, "MCPServerAuthInfo": { "properties": { "name": { @@ -7575,103 +7991,190 @@ "title": "OAuthFlows", "description": "Defines the configuration for the supported OAuth 2.0 flows." 
}, - "OpenIdConnectSecurityScheme": { + "OpenAIResponseAnnotationCitation": { "properties": { - "description": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Description" - }, - "openIdConnectUrl": { - "type": "string", - "title": "Openidconnecturl" - }, "type": { "type": "string", - "const": "openIdConnect", + "const": "url_citation", "title": "Type", - "default": "openIdConnect" + "default": "url_citation" + }, + "end_index": { + "type": "integer", + "title": "End Index" + }, + "start_index": { + "type": "integer", + "title": "Start Index" + }, + "title": { + "type": "string", + "title": "Title" + }, + "url": { + "type": "string", + "title": "Url" } }, "type": "object", "required": [ - "openIdConnectUrl" + "end_index", + "start_index", + "title", + "url" ], - "title": "OpenIdConnectSecurityScheme", - "description": "Defines a security scheme using OpenID Connect." + "title": "OpenAIResponseAnnotationCitation", + "description": "URL citation annotation for referencing external web resources.\n\n:param type: Annotation type identifier, always \"url_citation\"\n:param end_index: End position of the citation span in the content\n:param start_index: Start position of the citation span in the content\n:param title: Title of the referenced web resource\n:param url: URL of the referenced web resource" }, - "PasswordOAuthFlow": { + "OpenAIResponseAnnotationContainerFileCitation": { "properties": { - "refreshUrl": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Refreshurl" + "type": { + "type": "string", + "const": "container_file_citation", + "title": "Type", + "default": "container_file_citation" }, - "scopes": { - "additionalProperties": { - "type": "string" - }, - "type": "object", - "title": "Scopes" + "container_id": { + "type": "string", + "title": "Container Id" }, - "tokenUrl": { + "end_index": { + "type": "integer", + "title": "End Index" + }, + "file_id": { "type": "string", - "title": 
"Tokenurl" + "title": "File Id" + }, + "filename": { + "type": "string", + "title": "Filename" + }, + "start_index": { + "type": "integer", + "title": "Start Index" } }, "type": "object", "required": [ - "scopes", - "tokenUrl" + "container_id", + "end_index", + "file_id", + "filename", + "start_index" ], - "title": "PasswordOAuthFlow", - "description": "Defines configuration details for the OAuth 2.0 Resource Owner Password flow." + "title": "OpenAIResponseAnnotationContainerFileCitation" }, - "PostgreSQLDatabaseConfiguration": { + "OpenAIResponseAnnotationFileCitation": { "properties": { - "host": { + "type": { "type": "string", - "title": "Hostname", - "description": "Database server host or socket directory", - "default": "localhost" + "const": "file_citation", + "title": "Type", + "default": "file_citation" }, - "port": { + "file_id": { + "type": "string", + "title": "File Id" + }, + "filename": { + "type": "string", + "title": "Filename" + }, + "index": { "type": "integer", - "exclusiveMinimum": 0.0, - "title": "Port", - "description": "Database server port", - "default": 5432 + "title": "Index" + } + }, + "type": "object", + "required": [ + "file_id", + "filename", + "index" + ], + "title": "OpenAIResponseAnnotationFileCitation", + "description": "File citation annotation for referencing specific files in response content.\n\n:param type: Annotation type identifier, always \"file_citation\"\n:param file_id: Unique identifier of the referenced file\n:param filename: Name of the referenced file\n:param index: Position index of the citation within the content" + }, + "OpenAIResponseAnnotationFilePath": { + "properties": { + "type": { + "type": "string", + "const": "file_path", + "title": "Type", + "default": "file_path" }, - "db": { + "file_id": { "type": "string", - "title": "Database name", - "description": "Database name to connect to" + "title": "File Id" }, - "user": { + "index": { + "type": "integer", + "title": "Index" + } + }, + "type": "object", + 
"required": [ + "file_id", + "index" + ], + "title": "OpenAIResponseAnnotationFilePath" + }, + "OpenAIResponseContentPartRefusal": { + "properties": { + "type": { "type": "string", - "title": "User name", - "description": "Database user name used to authenticate" + "const": "refusal", + "title": "Type", + "default": "refusal" }, - "password": { + "refusal": { "type": "string", - "format": "password", - "title": "Password", - "description": "Password used to authenticate", - "writeOnly": true + "title": "Refusal" + } + }, + "type": "object", + "required": [ + "refusal" + ], + "title": "OpenAIResponseContentPartRefusal", + "description": "Refusal content within a streamed response part.\n\n:param type: Content part type identifier, always \"refusal\"\n:param refusal: Refusal text supplied by the model" + }, + "OpenAIResponseError": { + "properties": { + "code": { + "type": "string", + "title": "Code" }, - "namespace": { + "message": { + "type": "string", + "title": "Message" + } + }, + "type": "object", + "required": [ + "code", + "message" + ], + "title": "OpenAIResponseError", + "description": "Error details for failed OpenAI response requests.\n\n:param code: Error code identifying the type of failure\n:param message: Human-readable error message describing the failure" + }, + "OpenAIResponseInputFunctionToolCallOutput": { + "properties": { + "call_id": { + "type": "string", + "title": "Call Id" + }, + "output": { + "type": "string", + "title": "Output" + }, + "type": { + "type": "string", + "const": "function_call_output", + "title": "Type", + "default": "function_call_output" + }, + "id": { "anyOf": [ { "type": "string" @@ -7680,91 +8183,48 @@ "type": "null" } ], - "title": "Name space", - "description": "Database namespace", - "default": "public" - }, - "ssl_mode": { - "type": "string", - "title": "SSL mode", - "description": "SSL mode", - "default": "prefer" - }, - "gss_encmode": { - "type": "string", - "title": "GSS encmode", - "description": "This option 
determines whether or with what priority a secure GSS TCP/IP connection will be negotiated with the server.", - "default": "prefer" + "title": "Id" }, - "ca_cert_path": { + "status": { "anyOf": [ { - "type": "string", - "format": "file-path" + "type": "string" }, { "type": "null" } ], - "title": "CA certificate path", - "description": "Path to CA certificate" - } - }, - "additionalProperties": false, - "type": "object", - "required": [ - "db", - "user", - "password" - ], - "title": "PostgreSQLDatabaseConfiguration", - "description": "PostgreSQL database configuration.\n\nPostgreSQL database is used by Lightspeed Core Stack service for storing\ninformation about conversation IDs. It can also be leveraged to store\nconversation history and information about quota usage.\n\nUseful resources:\n\n- [Psycopg: connection classes](https://www.psycopg.org/psycopg3/docs/api/connections.html)\n- [PostgreSQL connection strings](https://www.connectionstrings.com/postgresql/)\n- [How to Use PostgreSQL in Python](https://www.freecodecamp.org/news/postgresql-in-python/)" - }, - "PromptTooLongResponse": { - "properties": { - "status_code": { - "type": "integer", - "title": "Status Code" - }, - "detail": { - "$ref": "#/components/schemas/DetailModel" + "title": "Status" } }, "type": "object", "required": [ - "status_code", - "detail" + "call_id", + "output" ], - "title": "PromptTooLongResponse", - "description": "413 Payload Too Large - Prompt is too long.", - "examples": [ - { - "detail": { - "cause": "The prompt exceeds the maximum allowed length.", - "response": "Prompt is too long" - }, - "label": "prompt too long" - } - ] + "title": "OpenAIResponseInputFunctionToolCallOutput", + "description": "This represents the output of a function call that gets passed back to the model." 
}, - "ProviderHealthStatus": { + "OpenAIResponseInputMessageContentFile": { "properties": { - "provider_id": { + "type": { "type": "string", - "title": "Provider Id", - "description": "The ID of the provider" + "const": "input_file", + "title": "Type", + "default": "input_file" }, - "status": { - "type": "string", - "title": "Status", - "description": "The health status", - "examples": [ - "ok", - "unhealthy", - "not_implemented" - ] + "file_data": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "File Data" }, - "message": { + "file_id": { "anyOf": [ { "type": "string" @@ -7773,172 +8233,209 @@ "type": "null" } ], - "title": "Message", - "description": "Optional message about the health status", - "examples": [ - "All systems operational", - "Llama Stack is unavailable" - ] + "title": "File Id" + }, + "file_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "File Url" + }, + "filename": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Filename" } }, "type": "object", - "required": [ - "provider_id", - "status" - ], - "title": "ProviderHealthStatus", - "description": "Model representing the health status of a provider.\n\nAttributes:\n provider_id: The ID of the provider.\n status: The health status ('ok', 'unhealthy', 'not_implemented').\n message: Optional message about the health status." + "title": "OpenAIResponseInputMessageContentFile", + "description": "File content for input messages in OpenAI response format.\n\n:param type: The type of the input item. Always `input_file`.\n:param file_data: The data of the file to be sent to the model.\n:param file_id: (Optional) The ID of the file to be sent to the model.\n:param file_url: The URL of the file to be sent to the model.\n:param filename: The name of the file to be sent to the model." 
}, - "ProviderResponse": { + "OpenAIResponseInputMessageContentImage": { "properties": { - "api": { + "detail": { + "anyOf": [ + { + "type": "string", + "const": "low" + }, + { + "type": "string", + "const": "high" + }, + { + "type": "string", + "const": "auto" + } + ], + "title": "Detail", + "default": "auto" + }, + "type": { "type": "string", - "title": "Api", - "description": "The API this provider implements" + "const": "input_image", + "title": "Type", + "default": "input_image" }, - "config": { - "additionalProperties": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "items": {}, - "type": "array" - }, - {}, - { - "type": "null" - } - ] - }, - "type": "object", - "title": "Config", - "description": "Provider configuration parameters" - }, - "health": { - "additionalProperties": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "items": {}, - "type": "array" - }, - {}, - { - "type": "null" - } - ] - }, - "type": "object", - "title": "Health", - "description": "Current health status of the provider" + "file_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "File Id" }, - "provider_id": { + "image_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Image Url" + } + }, + "type": "object", + "title": "OpenAIResponseInputMessageContentImage", + "description": "Image content for input messages in OpenAI response format.\n\n:param detail: Level of detail for image processing, can be \"low\", \"high\", or \"auto\"\n:param type: Content type identifier, always \"input_image\"\n:param file_id: (Optional) The ID of the file to be sent to the model.\n:param image_url: (Optional) URL of the image content" + }, + "OpenAIResponseInputMessageContentText": { + "properties": { + "text": { "type": "string", - "title": "Provider Id", - "description": "Unique provider identifier" + "title": 
"Text" }, - "provider_type": { + "type": { "type": "string", - "title": "Provider Type", - "description": "Provider implementation type" + "const": "input_text", + "title": "Type", + "default": "input_text" } }, "type": "object", "required": [ - "api", - "config", - "health", - "provider_id", - "provider_type" + "text" ], - "title": "ProviderResponse", - "description": "Model representing a response to get specific provider request.", - "examples": [ - { - "api": "inference", - "config": { - "api_key": "********" - }, - "health": { - "message": "Healthy", - "status": "OK" - }, - "provider_id": "openai", - "provider_type": "remote::openai" - } - ] + "title": "OpenAIResponseInputMessageContentText", + "description": "Text content for input messages in OpenAI response format.\n\n:param text: The text content of the input message\n:param type: Content type identifier, always \"input_text\"" }, - "ProvidersListResponse": { + "OpenAIResponseInputToolChoiceAllowedTools": { "properties": { - "providers": { - "additionalProperties": { - "items": { - "additionalProperties": true, - "type": "object" + "mode": { + "type": "string", + "enum": [ + "auto", + "required" + ], + "title": "Mode", + "default": "auto" + }, + "tools": { + "items": { + "additionalProperties": { + "type": "string" }, - "type": "array" + "type": "object" }, - "type": "object", - "title": "Providers", - "description": "List of available API types and their corresponding providers" + "type": "array", + "title": "Tools" + }, + "type": { + "type": "string", + "const": "allowed_tools", + "title": "Type", + "default": "allowed_tools" } }, "type": "object", "required": [ - "providers" + "tools" ], - "title": "ProvidersListResponse", - "description": "Model representing a response to providers request.", - "examples": [ - { - "providers": { - "agents": [ - { - "provider_id": "meta-reference", - "provider_type": "inline::meta-reference" - } - ], - "inference": [ - { - "provider_id": "sentence-transformers", - 
"provider_type": "inline::sentence-transformers" - }, - { - "provider_id": "openai", - "provider_type": "remote::openai" - } - ] - } + "title": "OpenAIResponseInputToolChoiceAllowedTools", + "description": "Constrains the tools available to the model to a pre-defined set.\n\n:param mode: Constrains the tools available to the model to a pre-defined set\n:param tools: A list of tool definitions that the model should be allowed to call\n:param type: Tool choice type identifier, always \"allowed_tools\"" + }, + "OpenAIResponseInputToolChoiceCustomTool": { + "properties": { + "type": { + "type": "string", + "const": "custom", + "title": "Type", + "default": "custom" + }, + "name": { + "type": "string", + "title": "Name" } - ] + }, + "type": "object", + "required": [ + "name" + ], + "title": "OpenAIResponseInputToolChoiceCustomTool", + "description": "Forces the model to call a custom tool.\n\n:param type: Tool choice type identifier, always \"custom\"\n:param name: The name of the custom tool to call." }, - "QueryRequest": { + "OpenAIResponseInputToolChoiceFileSearch": { "properties": { - "query": { + "type": { "type": "string", - "title": "Query", - "description": "The query string", - "examples": [ - "What is Kubernetes?" 
- ] + "const": "file_search", + "title": "Type", + "default": "file_search" + } + }, + "type": "object", + "title": "OpenAIResponseInputToolChoiceFileSearch", + "description": "Indicates that the model should use file search to generate a response.\n\n:param type: Tool choice type identifier, always \"file_search\"" + }, + "OpenAIResponseInputToolChoiceFunctionTool": { + "properties": { + "name": { + "type": "string", + "title": "Name" }, - "conversation_id": { + "type": { + "type": "string", + "const": "function", + "title": "Type", + "default": "function" + } + }, + "type": "object", + "required": [ + "name" + ], + "title": "OpenAIResponseInputToolChoiceFunctionTool", + "description": "Forces the model to call a specific function.\n\n:param name: The name of the function to call\n:param type: Tool choice type identifier, always \"function\"" + }, + "OpenAIResponseInputToolChoiceMCPTool": { + "properties": { + "server_label": { + "type": "string", + "title": "Server Label" + }, + "type": { + "type": "string", + "const": "mcp", + "title": "Type", + "default": "mcp" + }, + "name": { "anyOf": [ { "type": "string" @@ -7947,92 +8444,149 @@ "type": "null" } ], - "title": "Conversation Id", - "description": "The optional conversation ID (UUID)", - "examples": [ - "c5260aec-4d82-4370-9fdf-05cf908b3f16" - ] - }, - "provider": { + "title": "Name" + } + }, + "type": "object", + "required": [ + "server_label" + ], + "title": "OpenAIResponseInputToolChoiceMCPTool", + "description": "Forces the model to call a specific tool on a remote MCP server\n\n:param server_label: The label of the MCP server to use.\n:param type: Tool choice type identifier, always \"mcp\"\n:param name: (Optional) The name of the tool to call on the server." 
+ }, + "OpenAIResponseInputToolChoiceMode": { + "type": "string", + "enum": [ + "auto", + "required", + "none" + ], + "title": "OpenAIResponseInputToolChoiceMode" + }, + "OpenAIResponseInputToolChoiceWebSearch": { + "properties": { + "type": { "anyOf": [ { - "type": "string" + "type": "string", + "const": "web_search" }, { - "type": "null" + "type": "string", + "const": "web_search_preview" + }, + { + "type": "string", + "const": "web_search_preview_2025_03_11" + }, + { + "type": "string", + "const": "web_search_2025_08_26" } ], - "title": "Provider", - "description": "The optional provider", - "examples": [ - "openai", - "watsonx" - ] + "title": "Type", + "default": "web_search" + } + }, + "type": "object", + "title": "OpenAIResponseInputToolChoiceWebSearch", + "description": "Indicates that the model should use web search to generate a response\n\n:param type: Web search tool type variant to use" + }, + "OpenAIResponseInputToolFileSearch": { + "properties": { + "type": { + "type": "string", + "const": "file_search", + "title": "Type", + "default": "file_search" }, - "model": { + "vector_store_ids": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Vector Store Ids" + }, + "filters": { "anyOf": [ { - "type": "string" + "additionalProperties": true, + "type": "object" }, { "type": "null" } ], - "title": "Model", - "description": "The optional model", - "examples": [ - "gpt4mini" - ] + "title": "Filters" }, - "system_prompt": { + "max_num_results": { "anyOf": [ { - "type": "string" + "type": "integer", + "maximum": 50.0, + "minimum": 1.0 }, { "type": "null" } ], - "title": "System Prompt", - "description": "The optional system prompt.", - "examples": [ - "You are OpenShift assistant.", - "You are Ansible assistant." 
- ] + "title": "Max Num Results", + "default": 10 }, - "attachments": { + "ranking_options": { "anyOf": [ { - "items": { - "$ref": "#/components/schemas/Attachment" - }, - "type": "array" + "$ref": "#/components/schemas/SearchRankingOptions" }, { "type": "null" } - ], - "title": "Attachments", - "description": "The optional list of attachments.", - "examples": [ + ] + } + }, + "type": "object", + "required": [ + "vector_store_ids" + ], + "title": "OpenAIResponseInputToolFileSearch", + "description": "File search tool configuration for OpenAI response inputs.\n\n:param type: Tool type identifier, always \"file_search\"\n:param vector_store_ids: List of vector store identifiers to search within\n:param filters: (Optional) Additional filters to apply to the search\n:param max_num_results: (Optional) Maximum number of search results to return (1-50)\n:param ranking_options: (Optional) Options for ranking and scoring search results" + }, + "OpenAIResponseInputToolFunction": { + "properties": { + "type": { + "type": "string", + "const": "function", + "title": "Type", + "default": "function" + }, + "name": { + "type": "string", + "title": "Name" + }, + "description": { + "anyOf": [ { - "attachment_type": "log", - "content": "this is attachment", - "content_type": "text/plain" + "type": "string" }, { - "attachment_type": "configuration", - "content": "kind: Pod\n metadata:\n name: private-reg", - "content_type": "application/yaml" + "type": "null" + } + ], + "title": "Description" + }, + "parameters": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" }, { - "attachment_type": "configuration", - "content": "foo: bar", - "content_type": "application/yaml" + "type": "null" } - ] + ], + "title": "Parameters" }, - "no_tools": { + "strict": { "anyOf": [ { "type": "boolean" @@ -8041,32 +8595,46 @@ "type": "null" } ], - "title": "No Tools", - "description": "Whether to bypass all tools and MCP servers", - "default": false, - "examples": [ - true, - false - ] + 
"title": "Strict" + } + }, + "type": "object", + "required": [ + "name", + "parameters" + ], + "title": "OpenAIResponseInputToolFunction", + "description": "Function tool configuration for OpenAI response inputs.\n\n:param type: Tool type identifier, always \"function\"\n:param name: Name of the function that can be called\n:param description: (Optional) Description of what the function does\n:param parameters: (Optional) JSON schema defining the function's parameters\n:param strict: (Optional) Whether to enforce strict parameter validation" + }, + "OpenAIResponseInputToolMCP": { + "properties": { + "type": { + "type": "string", + "const": "mcp", + "title": "Type", + "default": "mcp" }, - "generate_topic_summary": { + "server_label": { + "type": "string", + "title": "Server Label" + }, + "server_url": { + "type": "string", + "title": "Server Url" + }, + "headers": { "anyOf": [ { - "type": "boolean" + "additionalProperties": true, + "type": "object" }, { "type": "null" } ], - "title": "Generate Topic Summary", - "description": "Whether to generate topic summary for new conversations", - "default": true, - "examples": [ - true, - false - ] + "title": "Headers" }, - "media_type": { + "authorization": { "anyOf": [ { "type": "string" @@ -8075,34 +8643,26 @@ "type": "null" } ], - "title": "Media Type", - "description": "Media type for the response format", - "examples": [ - "application/json", - "text/plain" - ] + "title": "Authorization" }, - "vector_store_ids": { + "require_approval": { "anyOf": [ { - "items": { - "type": "string" - }, - "type": "array" + "type": "string", + "const": "always" }, { - "type": "null" + "type": "string", + "const": "never" + }, + { + "$ref": "#/components/schemas/ApprovalFilter" } ], - "title": "Vector Store Ids", - "description": "Optional list of specific vector store IDs to query for RAG. 
If not provided, all available vector stores will be queried.", - "examples": [ - "ocp_docs", - "knowledge_base", - "vector_db_1" - ] + "title": "Require Approval", + "default": "never" }, - "shield_ids": { + "allowed_tools": { "anyOf": [ { "items": { @@ -8110,437 +8670,2777 @@ }, "type": "array" }, + { + "$ref": "#/components/schemas/AllowedToolsFilter" + }, { "type": "null" } ], - "title": "Shield Ids", - "description": "Optional list of safety shield IDs to apply. If None, all configured shields are used. If provided, must contain at least one valid shield ID (empty list raises 422 error).", - "examples": [ - "llama-guard", - "custom-shield" - ] - }, - "solr": { + "title": "Allowed Tools" + } + }, + "type": "object", + "required": [ + "server_label", + "server_url" + ], + "title": "OpenAIResponseInputToolMCP", + "description": "Model Context Protocol (MCP) tool configuration for OpenAI response inputs.\n\n:param type: Tool type identifier, always \"mcp\"\n:param server_label: Label to identify this MCP server\n:param server_url: URL endpoint of the MCP server\n:param headers: (Optional) HTTP headers to include when connecting to the server\n:param authorization: (Optional) OAuth access token for authenticating with the MCP server\n:param require_approval: Approval requirement for tool calls (\"always\", \"never\", or filter)\n:param allowed_tools: (Optional) Restriction on which tools can be used from this server" + }, + "OpenAIResponseInputToolWebSearch": { + "properties": { + "type": { "anyOf": [ { - "additionalProperties": true, - "type": "object" + "type": "string", + "const": "web_search" }, { - "type": "null" + "type": "string", + "const": "web_search_preview" + }, + { + "type": "string", + "const": "web_search_preview_2025_03_11" + }, + { + "type": "string", + "const": "web_search_2025_08_26" } ], - "title": "Solr", - "description": "Solr-specific query parameters including filter queries", - "examples": [ + "title": "Type", + "default": "web_search" + }, 
+ "search_context_size": { + "anyOf": [ { - "fq": [ - "product:*openshift*", - "product_version:*4.16*" - ] + "type": "string", + "pattern": "^(low|medium|high)$" + }, + { + "type": "null" + } - ] + ], + "title": "Search Context Size", + "default": "medium" + } + }, + "type": "object", + "title": "OpenAIResponseInputToolWebSearch", + "description": "Web search tool configuration for OpenAI response inputs.\n\n:param type: Web search tool type variant to use\n:param search_context_size: (Optional) Size of search context, must be \"low\", \"medium\", or \"high\"" + }, + "OpenAIResponseMCPApprovalRequest": { + "properties": { + "arguments": { + "type": "string", + "title": "Arguments" + }, + "id": { + "type": "string", + "title": "Id" + }, + "name": { + "type": "string", + "title": "Name" + }, + "server_label": { + "type": "string", + "title": "Server Label" + }, + "type": { + "type": "string", + "const": "mcp_approval_request", + "title": "Type", + "default": "mcp_approval_request" } }, - "additionalProperties": false, "type": "object", "required": [ - "query" + "arguments", + "id", + "name", + "server_label" ], - "title": "QueryRequest", - "description": "Model representing a request for the LLM (Language Model).\n\nAttributes:\n query: The query string.\n conversation_id: The optional conversation ID (UUID).\n provider: The optional provider.\n model: The optional model.\n system_prompt: The optional system prompt.\n attachments: The optional attachments.\n no_tools: Whether to bypass all tools and MCP servers (default: False).\n generate_topic_summary: Whether to generate topic summary for new conversations.\n media_type: The optional media type for response format (application/json or text/plain).\n vector_store_ids: The optional list of specific vector store IDs to query for RAG.\n shield_ids: The optional list of safety shield IDs to apply.\n\nExample:\n ```python\n query_request = QueryRequest(query=\"Tell me about Kubernetes\")\n ```", - "examples": [ - { - 
"attachments": [ + "title": "OpenAIResponseMCPApprovalRequest", + "description": "A request for human approval of a tool invocation." + }, + "OpenAIResponseMCPApprovalResponse": { + "properties": { + "approval_request_id": { + "type": "string", + "title": "Approval Request Id" + }, + "approve": { + "type": "boolean", + "title": "Approve" + }, + "type": { + "type": "string", + "const": "mcp_approval_response", + "title": "Type", + "default": "mcp_approval_response" + }, + "id": { + "anyOf": [ { - "attachment_type": "log", - "content": "this is attachment", - "content_type": "text/plain" + "type": "string" }, { - "attachment_type": "configuration", - "content": "kind: Pod\n metadata:\n name: private-reg", - "content_type": "application/yaml" + "type": "null" + } + ], + "title": "Id" + }, + "reason": { + "anyOf": [ + { + "type": "string" }, { - "attachment_type": "configuration", - "content": "foo: bar", - "content_type": "application/yaml" + "type": "null" } ], - "conversation_id": "123e4567-e89b-12d3-a456-426614174000", - "generate_topic_summary": true, - "model": "model-name", - "no_tools": false, - "provider": "openai", - "query": "write a deployment yaml for the mongodb image", - "system_prompt": "You are a helpful assistant", - "vector_store_ids": [ - "ocp_docs", - "knowledge_base" - ] + "title": "Reason" } - ] + }, + "type": "object", + "required": [ + "approval_request_id", + "approve" + ], + "title": "OpenAIResponseMCPApprovalResponse", + "description": "A response to an MCP approval request." 
}, - "QueryResponse": { + "OpenAIResponseMessage-Input": { "properties": { - "conversation_id": { + "content": { "anyOf": [ { "type": "string" }, { - "type": "null" - } - ], - "title": "Conversation Id", - "description": "The optional conversation ID (UUID)", - "examples": [ + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentText" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentFile" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "input_file": "#/components/schemas/OpenAIResponseInputMessageContentFile", + "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage", + "input_text": "#/components/schemas/OpenAIResponseInputMessageContentText" + } + } + }, + "type": "array" + }, + { + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageContentOutputText-Input" + }, + { + "$ref": "#/components/schemas/OpenAIResponseContentPartRefusal" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "output_text": "#/components/schemas/OpenAIResponseOutputMessageContentOutputText-Input", + "refusal": "#/components/schemas/OpenAIResponseContentPartRefusal" + } + } + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "anyOf": [ + { + "type": "string", + "const": "system" + }, + { + "type": "string", + "const": "developer" + }, + { + "type": "string", + "const": "user" + }, + { + "type": "string", + "const": "assistant" + } + ], + "title": "Role" + }, + "type": { + "type": "string", + "const": "message", + "title": "Type", + "default": "message" + }, + "id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Id" + }, + "status": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Status" + } + }, + "type": "object", + "required": [ + "content", + "role" + 
], + "title": "OpenAIResponseMessage", + "description": "Corresponds to the various Message types in the Responses API.\nThey are all under one type because the Responses API gives them all\nthe same \"type\" value, and there is no way to tell them apart in certain\nscenarios." + }, + "OpenAIResponseMessage-Output": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentText" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentFile" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "input_file": "#/components/schemas/OpenAIResponseInputMessageContentFile", + "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage", + "input_text": "#/components/schemas/OpenAIResponseInputMessageContentText" + } + } + }, + "type": "array" + }, + { + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageContentOutputText-Output" + }, + { + "$ref": "#/components/schemas/OpenAIResponseContentPartRefusal" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "output_text": "#/components/schemas/OpenAIResponseOutputMessageContentOutputText-Output", + "refusal": "#/components/schemas/OpenAIResponseContentPartRefusal" + } + } + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "anyOf": [ + { + "type": "string", + "const": "system" + }, + { + "type": "string", + "const": "developer" + }, + { + "type": "string", + "const": "user" + }, + { + "type": "string", + "const": "assistant" + } + ], + "title": "Role" + }, + "type": { + "type": "string", + "const": "message", + "title": "Type", + "default": "message" + }, + "id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Id" + }, + "status": { + "anyOf": [ + { + "type": "string" 
+ }, + { + "type": "null" + } + ], + "title": "Status" + } + }, + "type": "object", + "required": [ + "content", + "role" + ], + "title": "OpenAIResponseMessage", + "description": "Corresponds to the various Message types in the Responses API.\nThey are all under one type because the Responses API gives them all\nthe same \"type\" value, and there is no way to tell them apart in certain\nscenarios." + }, + "OpenAIResponseOutputMessageContentOutputText-Input": { + "properties": { + "text": { + "type": "string", + "title": "Text" + }, + "type": { + "type": "string", + "const": "output_text", + "title": "Type", + "default": "output_text" + }, + "annotations": { + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseAnnotationFileCitation" + }, + { + "$ref": "#/components/schemas/OpenAIResponseAnnotationCitation" + }, + { + "$ref": "#/components/schemas/OpenAIResponseAnnotationContainerFileCitation" + }, + { + "$ref": "#/components/schemas/OpenAIResponseAnnotationFilePath" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "container_file_citation": "#/components/schemas/OpenAIResponseAnnotationContainerFileCitation", + "file_citation": "#/components/schemas/OpenAIResponseAnnotationFileCitation", + "file_path": "#/components/schemas/OpenAIResponseAnnotationFilePath", + "url_citation": "#/components/schemas/OpenAIResponseAnnotationCitation" + } + } + }, + "type": "array", + "title": "Annotations" + }, + "logprobs": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/OpenAITokenLogProb" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Logprobs" + } + }, + "type": "object", + "required": [ + "text" + ], + "title": "OpenAIResponseOutputMessageContentOutputText" + }, + "OpenAIResponseOutputMessageContentOutputText-Output": { + "properties": { + "text": { + "type": "string", + "title": "Text" + }, + "type": { + "type": "string", + "const": "output_text", + "title": "Type", + "default": "output_text" 
+ }, + "annotations": { + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseAnnotationFileCitation" + }, + { + "$ref": "#/components/schemas/OpenAIResponseAnnotationCitation" + }, + { + "$ref": "#/components/schemas/OpenAIResponseAnnotationContainerFileCitation" + }, + { + "$ref": "#/components/schemas/OpenAIResponseAnnotationFilePath" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "container_file_citation": "#/components/schemas/OpenAIResponseAnnotationContainerFileCitation", + "file_citation": "#/components/schemas/OpenAIResponseAnnotationFileCitation", + "file_path": "#/components/schemas/OpenAIResponseAnnotationFilePath", + "url_citation": "#/components/schemas/OpenAIResponseAnnotationCitation" + } + } + }, + "type": "array", + "title": "Annotations" + }, + "logprobs": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/OpenAITokenLogProb" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Logprobs" + } + }, + "type": "object", + "required": [ + "text" + ], + "title": "OpenAIResponseOutputMessageContentOutputText" + }, + "OpenAIResponseOutputMessageFileSearchToolCall": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "queries": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Queries" + }, + "status": { + "type": "string", + "title": "Status" + }, + "type": { + "type": "string", + "const": "file_search_call", + "title": "Type", + "default": "file_search_call" + }, + "results": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCallResults" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Results" + } + }, + "type": "object", + "required": [ + "id", + "queries", + "status" + ], + "title": "OpenAIResponseOutputMessageFileSearchToolCall", + "description": "File search tool call output message for OpenAI responses.\n\n:param id: Unique identifier for this tool 
call\n:param queries: List of search queries executed\n:param status: Current status of the file search operation\n:param type: Tool call type identifier, always \"file_search_call\"\n:param results: (Optional) Search results returned by the file search operation" + }, + "OpenAIResponseOutputMessageFileSearchToolCallResults": { + "properties": { + "attributes": { + "additionalProperties": true, + "type": "object", + "title": "Attributes" + }, + "file_id": { + "type": "string", + "title": "File Id" + }, + "filename": { + "type": "string", + "title": "Filename" + }, + "score": { + "type": "number", + "title": "Score" + }, + "text": { + "type": "string", + "title": "Text" + } + }, + "type": "object", + "required": [ + "attributes", + "file_id", + "filename", + "score", + "text" + ], + "title": "OpenAIResponseOutputMessageFileSearchToolCallResults", + "description": "Search results returned by the file search operation.\n\n:param attributes: (Optional) Key-value attributes associated with the file\n:param file_id: Unique identifier of the file containing the result\n:param filename: Name of the file containing the result\n:param score: Relevance score for this search result (between 0 and 1)\n:param text: Text content of the search result" + }, + "OpenAIResponseOutputMessageFunctionToolCall": { + "properties": { + "call_id": { + "type": "string", + "title": "Call Id" + }, + "name": { + "type": "string", + "title": "Name" + }, + "arguments": { + "type": "string", + "title": "Arguments" + }, + "type": { + "type": "string", + "const": "function_call", + "title": "Type", + "default": "function_call" + }, + "id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Id" + }, + "status": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Status" + } + }, + "type": "object", + "required": [ + "call_id", + "name", + "arguments" + ], + "title": "OpenAIResponseOutputMessageFunctionToolCall", + "description": 
"Function tool call output message for OpenAI responses.\n\n:param call_id: Unique identifier for the function call\n:param name: Name of the function being called\n:param arguments: JSON string containing the function arguments\n:param type: Tool call type identifier, always \"function_call\"\n:param id: (Optional) Additional identifier for the tool call\n:param status: (Optional) Current status of the function call execution" + }, + "OpenAIResponseOutputMessageMCPCall": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "type": { + "type": "string", + "const": "mcp_call", + "title": "Type", + "default": "mcp_call" + }, + "arguments": { + "type": "string", + "title": "Arguments" + }, + "name": { + "type": "string", + "title": "Name" + }, + "server_label": { + "type": "string", + "title": "Server Label" + }, + "error": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Error" + }, + "output": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Output" + } + }, + "type": "object", + "required": [ + "id", + "arguments", + "name", + "server_label" + ], + "title": "OpenAIResponseOutputMessageMCPCall", + "description": "Model Context Protocol (MCP) call output message for OpenAI responses.\n\n:param id: Unique identifier for this MCP call\n:param type: Tool call type identifier, always \"mcp_call\"\n:param arguments: JSON string containing the MCP call arguments\n:param name: Name of the MCP method being called\n:param server_label: Label identifying the MCP server handling the call\n:param error: (Optional) Error message if the MCP call failed\n:param output: (Optional) Output result from the successful MCP call" + }, + "OpenAIResponseOutputMessageMCPListTools": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "type": { + "type": "string", + "const": "mcp_list_tools", + "title": "Type", + "default": "mcp_list_tools" + }, + "server_label": { + "type": 
"string", + "title": "Server Label" + }, + "tools": { + "items": { + "$ref": "#/components/schemas/MCPListToolsTool" + }, + "type": "array", + "title": "Tools" + } + }, + "type": "object", + "required": [ + "id", + "server_label", + "tools" + ], + "title": "OpenAIResponseOutputMessageMCPListTools", + "description": "MCP list tools output message containing available tools from an MCP server.\n\n:param id: Unique identifier for this MCP list tools operation\n:param type: Tool call type identifier, always \"mcp_list_tools\"\n:param server_label: Label identifying the MCP server providing the tools\n:param tools: List of available tools provided by the MCP server" + }, + "OpenAIResponseOutputMessageWebSearchToolCall": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "status": { + "type": "string", + "title": "Status" + }, + "type": { + "type": "string", + "const": "web_search_call", + "title": "Type", + "default": "web_search_call" + } + }, + "type": "object", + "required": [ + "id", + "status" + ], + "title": "OpenAIResponseOutputMessageWebSearchToolCall", + "description": "Web search tool call output message for OpenAI responses.\n\n:param id: Unique identifier for this tool call\n:param status: Current status of the web search operation\n:param type: Tool call type identifier, always \"web_search_call\"" + }, + "OpenAIResponsePrompt": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "variables": { + "anyOf": [ + { + "additionalProperties": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentText" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentFile" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "input_file": "#/components/schemas/OpenAIResponseInputMessageContentFile", + "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage", + 
"input_text": "#/components/schemas/OpenAIResponseInputMessageContentText" + } + } + }, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Variables" + }, + "version": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Version" + } + }, + "type": "object", + "required": [ + "id" + ], + "title": "OpenAIResponsePrompt", + "description": "OpenAI compatible Prompt object that is used in OpenAI responses.\n\n:param id: Unique identifier of the prompt template\n:param variables: Dictionary of variable names to OpenAIResponseInputMessageContent structure for template substitution. The substitution values can either be strings, or other Response input types\nlike images or files.\n:param version: Version number of the prompt to use (defaults to latest if not specified)" + }, + "OpenAIResponseText": { + "properties": { + "format": { + "anyOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseTextFormat" + }, + { + "type": "null" + } + ] + } + }, + "type": "object", + "title": "OpenAIResponseText", + "description": "Text response configuration for OpenAI responses.\n\n:param format: (Optional) Text format configuration specifying output format requirements" + }, + "OpenAIResponseTextFormat": { + "properties": { + "type": { + "anyOf": [ + { + "type": "string", + "const": "text" + }, + { + "type": "string", + "const": "json_schema" + }, + { + "type": "string", + "const": "json_object" + } + ], + "title": "Type" + }, + "name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Name" + }, + "schema": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Schema" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "strict": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Strict" + } + }, + "type": "object", + "title": 
"OpenAIResponseTextFormat", + "description": "Configuration for Responses API text format.\n\n:param type: Must be \"text\", \"json_schema\", or \"json_object\" to identify the format type\n:param name: The name of the response format. Only used for json_schema.\n:param schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model. Only used for json_schema.\n:param description: (Optional) A description of the response format. Only used for json_schema.\n:param strict: (Optional) Whether to strictly enforce the JSON schema. If true, the response must match the schema exactly. Only used for json_schema." + }, + "OpenAIResponseToolMCP": { + "properties": { + "type": { + "type": "string", + "const": "mcp", + "title": "Type", + "default": "mcp" + }, + "server_label": { + "type": "string", + "title": "Server Label" + }, + "allowed_tools": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "$ref": "#/components/schemas/AllowedToolsFilter" + }, + { + "type": "null" + } + ], + "title": "Allowed Tools" + } + }, + "type": "object", + "required": [ + "server_label" + ], + "title": "OpenAIResponseToolMCP", + "description": "Model Context Protocol (MCP) tool configuration for OpenAI response object.\n\n:param type: Tool type identifier, always \"mcp\"\n:param server_label: Label to identify this MCP server\n:param allowed_tools: (Optional) Restriction on which tools can be used from this server" + }, + "OpenAIResponseUsage": { + "properties": { + "input_tokens": { + "type": "integer", + "title": "Input Tokens" + }, + "output_tokens": { + "type": "integer", + "title": "Output Tokens" + }, + "total_tokens": { + "type": "integer", + "title": "Total Tokens" + }, + "input_tokens_details": { + "anyOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseUsageInputTokensDetails" + }, + { + "type": "null" + } + ] + }, + "output_tokens_details": { + "anyOf": [ + { + "$ref": 
"#/components/schemas/OpenAIResponseUsageOutputTokensDetails" + }, + { + "type": "null" + } + ] + } + }, + "type": "object", + "required": [ + "input_tokens", + "output_tokens", + "total_tokens" + ], + "title": "OpenAIResponseUsage", + "description": "Usage information for OpenAI response.\n\n:param input_tokens: Number of tokens in the input\n:param output_tokens: Number of tokens in the output\n:param total_tokens: Total tokens used (input + output)\n:param input_tokens_details: Detailed breakdown of input token usage\n:param output_tokens_details: Detailed breakdown of output token usage" + }, + "OpenAIResponseUsageInputTokensDetails": { + "properties": { + "cached_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Cached Tokens" + } + }, + "type": "object", + "title": "OpenAIResponseUsageInputTokensDetails", + "description": "Token details for input tokens in OpenAI response usage.\n\n:param cached_tokens: Number of tokens retrieved from cache" + }, + "OpenAIResponseUsageOutputTokensDetails": { + "properties": { + "reasoning_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Reasoning Tokens" + } + }, + "type": "object", + "title": "OpenAIResponseUsageOutputTokensDetails", + "description": "Token details for output tokens in OpenAI response usage.\n\n:param reasoning_tokens: Number of tokens used for reasoning (o1/o3 models)" + }, + "OpenAITokenLogProb": { + "properties": { + "token": { + "type": "string", + "title": "Token" + }, + "bytes": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Bytes" + }, + "logprob": { + "type": "number", + "title": "Logprob" + }, + "top_logprobs": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/OpenAITopLogProb" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Top Logprobs" + } + }, + "type": "object", + "required": [ + "token", + 
"logprob" + ], + "title": "OpenAITokenLogProb", + "description": "The log probability for a token from an OpenAI-compatible chat completion response.\n\n:token: The token\n:bytes: (Optional) The bytes for the token\n:logprob: The log probability of the token\n:top_logprobs: The top log probabilities for the token" + }, + "OpenAITopLogProb": { + "properties": { + "token": { + "type": "string", + "title": "Token" + }, + "bytes": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Bytes" + }, + "logprob": { + "type": "number", + "title": "Logprob" + } + }, + "type": "object", + "required": [ + "token", + "logprob" + ], + "title": "OpenAITopLogProb", + "description": "The top log probability for a token from an OpenAI-compatible chat completion response.\n\n:token: The token\n:bytes: (Optional) The bytes for the token\n:logprob: The log probability of the token" + }, + "OpenIdConnectSecurityScheme": { + "properties": { + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "openIdConnectUrl": { + "type": "string", + "title": "Openidconnecturl" + }, + "type": { + "type": "string", + "const": "openIdConnect", + "title": "Type", + "default": "openIdConnect" + } + }, + "type": "object", + "required": [ + "openIdConnectUrl" + ], + "title": "OpenIdConnectSecurityScheme", + "description": "Defines a security scheme using OpenID Connect." 
+ }, + "PasswordOAuthFlow": { + "properties": { + "refreshUrl": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Refreshurl" + }, + "scopes": { + "additionalProperties": { + "type": "string" + }, + "type": "object", + "title": "Scopes" + }, + "tokenUrl": { + "type": "string", + "title": "Tokenurl" + } + }, + "type": "object", + "required": [ + "scopes", + "tokenUrl" + ], + "title": "PasswordOAuthFlow", + "description": "Defines configuration details for the OAuth 2.0 Resource Owner Password flow." + }, + "PostgreSQLDatabaseConfiguration": { + "properties": { + "host": { + "type": "string", + "title": "Hostname", + "description": "Database server host or socket directory", + "default": "localhost" + }, + "port": { + "type": "integer", + "exclusiveMinimum": 0.0, + "title": "Port", + "description": "Database server port", + "default": 5432 + }, + "db": { + "type": "string", + "title": "Database name", + "description": "Database name to connect to" + }, + "user": { + "type": "string", + "title": "User name", + "description": "Database user name used to authenticate" + }, + "password": { + "type": "string", + "format": "password", + "title": "Password", + "description": "Password used to authenticate", + "writeOnly": true + }, + "namespace": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Name space", + "description": "Database namespace", + "default": "public" + }, + "ssl_mode": { + "type": "string", + "title": "SSL mode", + "description": "SSL mode", + "default": "prefer" + }, + "gss_encmode": { + "type": "string", + "title": "GSS encmode", + "description": "This option determines whether or with what priority a secure GSS TCP/IP connection will be negotiated with the server.", + "default": "prefer" + }, + "ca_cert_path": { + "anyOf": [ + { + "type": "string", + "format": "file-path" + }, + { + "type": "null" + } + ], + "title": "CA certificate path", + "description": "Path to CA certificate" + } 
+ }, + "additionalProperties": false, + "type": "object", + "required": [ + "db", + "user", + "password" + ], + "title": "PostgreSQLDatabaseConfiguration", + "description": "PostgreSQL database configuration.\n\nPostgreSQL database is used by Lightspeed Core Stack service for storing\ninformation about conversation IDs. It can also be leveraged to store\nconversation history and information about quota usage.\n\nUseful resources:\n\n- [Psycopg: connection classes](https://www.psycopg.org/psycopg3/docs/api/connections.html)\n- [PostgreSQL connection strings](https://www.connectionstrings.com/postgresql/)\n- [How to Use PostgreSQL in Python](https://www.freecodecamp.org/news/postgresql-in-python/)" + }, + "PromptTooLongResponse": { + "properties": { + "status_code": { + "type": "integer", + "title": "Status Code" + }, + "detail": { + "$ref": "#/components/schemas/DetailModel" + } + }, + "type": "object", + "required": [ + "status_code", + "detail" + ], + "title": "PromptTooLongResponse", + "description": "413 Payload Too Large - Prompt is too long.", + "examples": [ + { + "detail": { + "cause": "The prompt exceeds the maximum allowed length.", + "response": "Prompt is too long" + }, + "label": "prompt too long" + } + ] + }, + "ProviderHealthStatus": { + "properties": { + "provider_id": { + "type": "string", + "title": "Provider Id", + "description": "The ID of the provider" + }, + "status": { + "type": "string", + "title": "Status", + "description": "The health status", + "examples": [ + "ok", + "unhealthy", + "not_implemented" + ] + }, + "message": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Message", + "description": "Optional message about the health status", + "examples": [ + "All systems operational", + "Llama Stack is unavailable" + ] + } + }, + "type": "object", + "required": [ + "provider_id", + "status" + ], + "title": "ProviderHealthStatus", + "description": "Model representing the health status of a 
provider.\n\nAttributes:\n provider_id: The ID of the provider.\n status: The health status ('ok', 'unhealthy', 'not_implemented').\n message: Optional message about the health status." + }, + "ProviderResponse": { + "properties": { + "api": { + "type": "string", + "title": "Api", + "description": "The API this provider implements" + }, + "config": { + "additionalProperties": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "items": {}, + "type": "array" + }, + {}, + { + "type": "null" + } + ] + }, + "type": "object", + "title": "Config", + "description": "Provider configuration parameters" + }, + "health": { + "additionalProperties": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "items": {}, + "type": "array" + }, + {}, + { + "type": "null" + } + ] + }, + "type": "object", + "title": "Health", + "description": "Current health status of the provider" + }, + "provider_id": { + "type": "string", + "title": "Provider Id", + "description": "Unique provider identifier" + }, + "provider_type": { + "type": "string", + "title": "Provider Type", + "description": "Provider implementation type" + } + }, + "type": "object", + "required": [ + "api", + "config", + "health", + "provider_id", + "provider_type" + ], + "title": "ProviderResponse", + "description": "Model representing a response to get specific provider request.", + "examples": [ + { + "api": "inference", + "config": { + "api_key": "********" + }, + "health": { + "message": "Healthy", + "status": "OK" + }, + "provider_id": "openai", + "provider_type": "remote::openai" + } + ] + }, + "ProvidersListResponse": { + "properties": { + "providers": { + "additionalProperties": { + "items": { + "additionalProperties": true, + "type": "object" + }, + "type": "array" + }, + "type": "object", + "title": "Providers", + "description": "List of available API types and their corresponding providers" + } + }, + "type": 
"object", + "required": [ + "providers" + ], + "title": "ProvidersListResponse", + "description": "Model representing a response to providers request.", + "examples": [ + { + "providers": { + "agents": [ + { + "provider_id": "meta-reference", + "provider_type": "inline::meta-reference" + } + ], + "inference": [ + { + "provider_id": "sentence-transformers", + "provider_type": "inline::sentence-transformers" + }, + { + "provider_id": "openai", + "provider_type": "remote::openai" + } + ] + } + } + ] + }, + "QueryRequest": { + "properties": { + "query": { + "type": "string", + "title": "Query", + "description": "The query string", + "examples": [ + "What is Kubernetes?" + ] + }, + "conversation_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Conversation Id", + "description": "The optional conversation ID (UUID)", + "examples": [ + "c5260aec-4d82-4370-9fdf-05cf908b3f16" + ] + }, + "provider": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Provider", + "description": "The optional provider", + "examples": [ + "openai", + "watsonx" + ] + }, + "model": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Model", + "description": "The optional model", + "examples": [ + "gpt4mini" + ] + }, + "system_prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "System Prompt", + "description": "The optional system prompt.", + "examples": [ + "You are OpenShift assistant.", + "You are Ansible assistant." 
+ ] + }, + "attachments": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/Attachment" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Attachments", + "description": "The optional list of attachments.", + "examples": [ + { + "attachment_type": "log", + "content": "this is attachment", + "content_type": "text/plain" + }, + { + "attachment_type": "configuration", + "content": "kind: Pod\n metadata:\n name: private-reg", + "content_type": "application/yaml" + }, + { + "attachment_type": "configuration", + "content": "foo: bar", + "content_type": "application/yaml" + } + ] + }, + "no_tools": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "No Tools", + "description": "Whether to bypass all tools and MCP servers", + "default": false, + "examples": [ + true, + false + ] + }, + "generate_topic_summary": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Generate Topic Summary", + "description": "Whether to generate topic summary for new conversations", + "default": true, + "examples": [ + true, + false + ] + }, + "media_type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Media Type", + "description": "Media type for the response format", + "examples": [ + "application/json", + "text/plain" + ] + }, + "vector_store_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Vector Store Ids", + "description": "Optional list of specific vector store IDs to query for RAG. If not provided, all available vector stores will be queried.", + "examples": [ + "ocp_docs", + "knowledge_base", + "vector_db_1" + ] + }, + "shield_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Shield Ids", + "description": "Optional list of safety shield IDs to apply. If None, all configured shields are used. 
If provided, must contain at least one valid shield ID (empty list raises 422 error).", + "examples": [ + "llama-guard", + "custom-shield" + ] + }, + "solr": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Solr", + "description": "Solr-specific query parameters including filter queries", + "examples": [ + { + "fq": [ + "product:*openshift*", + "product_version:*4.16*" + ] + } + ] + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "query" + ], + "title": "QueryRequest", + "description": "Model representing a request for the LLM (Language Model).\n\nAttributes:\n query: The query string.\n conversation_id: The optional conversation ID (UUID).\n provider: The optional provider.\n model: The optional model.\n system_prompt: The optional system prompt.\n attachments: The optional attachments.\n no_tools: Whether to bypass all tools and MCP servers (default: False).\n generate_topic_summary: Whether to generate topic summary for new conversations.\n media_type: The optional media type for response format (application/json or text/plain).\n vector_store_ids: The optional list of specific vector store IDs to query for RAG.\n shield_ids: The optional list of safety shield IDs to apply.\n solr: The optional Solr-specific query parameters including filter queries.\n\nExample:\n ```python\n query_request = QueryRequest(query=\"Tell me about Kubernetes\")\n ```", + "examples": [ + { + "attachments": [ + { + "attachment_type": "log", + "content": "this is attachment", + "content_type": "text/plain" + }, + { + "attachment_type": "configuration", + "content": "kind: Pod\n metadata:\n name: private-reg", + "content_type": "application/yaml" + }, + { + "attachment_type": "configuration", + "content": "foo: bar", + "content_type": "application/yaml" + } + ], + "conversation_id": "123e4567-e89b-12d3-a456-426614174000", + "generate_topic_summary": true, + "model": "model-name", + "no_tools": false, + "provider": "openai", + "query": "write a deployment yaml for the 
mongodb image", + "system_prompt": "You are a helpful assistant", + "vector_store_ids": [ + "ocp_docs", + "knowledge_base" + ] + } + ] + }, + "QueryResponse": { + "properties": { + "conversation_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Conversation Id", + "description": "The optional conversation ID (UUID)", + "examples": [ "c5260aec-4d82-4370-9fdf-05cf908b3f16" ] }, - "response": { - "type": "string", - "title": "Response", - "description": "Response from LLM", + "response": { + "type": "string", + "title": "Response", + "description": "Response from LLM", + "examples": [ + "Kubernetes is an open-source container orchestration system for automating ..." + ] + }, + "rag_chunks": { + "items": { + "$ref": "#/components/schemas/RAGChunk" + }, + "type": "array", + "title": "Rag Chunks", + "description": "Deprecated: List of RAG chunks used to generate the response." + }, + "referenced_documents": { + "items": { + "$ref": "#/components/schemas/ReferencedDocument" + }, + "type": "array", + "title": "Referenced Documents", + "description": "List of documents referenced in generating the response", + "examples": [ + [ + { + "doc_title": "Operator Lifecycle Manager (OLM)", + "doc_url": "https://docs.openshift.com/container-platform/4.15/operators/olm/index.html" + } + ] + ] + }, + "truncated": { + "type": "boolean", + "title": "Truncated", + "description": "Deprecated: Whether conversation history was truncated", + "default": false, + "examples": [ + false, + true + ] + }, + "input_tokens": { + "type": "integer", + "title": "Input Tokens", + "description": "Number of tokens sent to LLM", + "default": 0, + "examples": [ + 150, + 250, + 500 + ] + }, + "output_tokens": { + "type": "integer", + "title": "Output Tokens", + "description": "Number of tokens received from LLM", + "default": 0, + "examples": [ + 50, + 100, + 200 + ] + }, + "available_quotas": { + "additionalProperties": { + "type": "integer" + }, + "type": "object", + 
"title": "Available Quotas", + "description": "Quota available as measured by all configured quota limiters", + "examples": [ + { + "daily": 1000, + "monthly": 50000 + } + ] + }, + "tool_calls": { + "items": { + "$ref": "#/components/schemas/ToolCallSummary" + }, + "type": "array", + "title": "Tool Calls", + "description": "List of tool calls made during response generation" + }, + "tool_results": { + "items": { + "$ref": "#/components/schemas/ToolResultSummary" + }, + "type": "array", + "title": "Tool Results", + "description": "List of tool results" + } + }, + "type": "object", + "required": [ + "response" + ], + "title": "QueryResponse", + "description": "Model representing LLM response to a query.\n\nAttributes:\n conversation_id: The optional conversation ID (UUID).\n response: The response.\n rag_chunks: Deprecated. List of RAG chunks used to generate the response.\n This information is now available in tool_results under file_search_call type.\n referenced_documents: The URLs and titles for the documents used to generate the response.\n tool_calls: List of tool calls made during response generation.\n tool_results: List of tool results.\n truncated: Whether conversation history was truncated.\n input_tokens: Number of tokens sent to LLM.\n output_tokens: Number of tokens received from LLM.\n available_quotas: Quota available as measured by all configured quota limiters.", + "examples": [ + { + "available_quotas": { + "ClusterQuotaLimiter": 998911, + "UserQuotaLimiter": 998911 + }, + "conversation_id": "123e4567-e89b-12d3-a456-426614174000", + "input_tokens": 123, + "output_tokens": 456, + "referenced_documents": [ + { + "doc_title": "Operator Lifecycle Manager concepts and resources", + "doc_url": "https://docs.openshift.com/container-platform/4.15/operators/understanding/olm/olm-understanding-olm.html" + } + ], + "response": "Operator Lifecycle Manager (OLM) helps users install...", + "tool_calls": [ + { + "args": {}, + "id": "1", + "name": "tool1", + 
"type": "tool_call" + } + ], + "tool_results": [ + { + "content": "bla", + "id": "1", + "round": 1, + "status": "success", + "type": "tool_result" + } + ], + "truncated": false + } + ] + }, + "QuotaExceededResponse": { + "properties": { + "status_code": { + "type": "integer", + "title": "Status Code" + }, + "detail": { + "$ref": "#/components/schemas/DetailModel" + } + }, + "type": "object", + "required": [ + "status_code", + "detail" + ], + "title": "QuotaExceededResponse", + "description": "429 Too Many Requests - Quota limit exceeded.", + "examples": [ + { + "detail": { + "cause": "The token quota for model gpt-4-turbo has been exceeded.", + "response": "The model quota has been exceeded" + }, + "label": "model" + }, + { + "detail": { + "cause": "User 123 has no available tokens.", + "response": "The quota has been exceeded" + }, + "label": "user none" + }, + { + "detail": { + "cause": "Cluster has no available tokens.", + "response": "The quota has been exceeded" + }, + "label": "cluster none" + }, + { + "detail": { + "cause": "Unknown subject 999 has no available tokens.", + "response": "The quota has been exceeded" + }, + "label": "subject none" + }, + { + "detail": { + "cause": "User 123 has 5 tokens, but 10 tokens are needed.", + "response": "The quota has been exceeded" + }, + "label": "user insufficient" + }, + { + "detail": { + "cause": "Cluster has 500 tokens, but 900 tokens are needed.", + "response": "The quota has been exceeded" + }, + "label": "cluster insufficient" + }, + { + "detail": { + "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.", + "response": "The quota has been exceeded" + }, + "label": "subject insufficient" + } + ] + }, + "QuotaHandlersConfiguration": { + "properties": { + "sqlite": { + "anyOf": [ + { + "$ref": "#/components/schemas/SQLiteDatabaseConfiguration" + }, + { + "type": "null" + } + ], + "title": "SQLite configuration", + "description": "SQLite database configuration" + }, + "postgres": { + "anyOf": [ + { 
+ "$ref": "#/components/schemas/PostgreSQLDatabaseConfiguration" + }, + { + "type": "null" + } + ], + "title": "PostgreSQL configuration", + "description": "PostgreSQL database configuration" + }, + "limiters": { + "items": { + "$ref": "#/components/schemas/QuotaLimiterConfiguration" + }, + "type": "array", + "title": "Quota limiters", + "description": "Quota limiters configuration" + }, + "scheduler": { + "$ref": "#/components/schemas/QuotaSchedulerConfiguration", + "title": "Quota scheduler", + "description": "Quota scheduler configuration" + }, + "enable_token_history": { + "type": "boolean", + "title": "Enable token history", + "description": "Enables storing information about token usage history", + "default": false + } + }, + "additionalProperties": false, + "type": "object", + "title": "QuotaHandlersConfiguration", + "description": "Quota limiter configuration.\n\nIt is possible to limit quota usage per user or per service or services\n(that typically run in one cluster). Each limit is configured as a separate\n_quota limiter_. It can be of type `user_limiter` or `cluster_limiter`\n(which is name that makes sense in OpenShift deployment)." 
+ }, + "QuotaLimiterConfiguration": { + "properties": { + "type": { + "type": "string", + "enum": [ + "user_limiter", + "cluster_limiter" + ], + "title": "Quota limiter type", + "description": "Quota limiter type, either user_limiter or cluster_limiter" + }, + "name": { + "type": "string", + "title": "Quota limiter name", + "description": "Human readable quota limiter name" + }, + "initial_quota": { + "type": "integer", + "minimum": 0.0, + "title": "Initial quota", + "description": "Quota set at beginning of the period" + }, + "quota_increase": { + "type": "integer", + "minimum": 0.0, + "title": "Quota increase", + "description": "Delta value used to increase quota when period is reached" + }, + "period": { + "type": "string", + "title": "Period", + "description": "Period specified in human readable form" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "type", + "name", + "initial_quota", + "quota_increase", + "period" + ], + "title": "QuotaLimiterConfiguration", + "description": "Configuration for one quota limiter.\n\nThere are three configuration options for each limiter:\n\n1. ``period`` is specified in a human-readable form, see\n https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-INTERVAL-INPUT\n for all possible options. When the end of the period is reached, the\n quota is reset or increased.\n2. ``initial_quota`` is the value set at the beginning of the period.\n3. ``quota_increase`` is the value (if specified) used to increase the\n quota when the period is reached.\n\nThere are two basic use cases:\n\n1. When the quota needs to be reset to a specific value periodically (for\n example on a weekly or monthly basis), set ``initial_quota`` to the\n required value.\n2. When the quota needs to be increased by a specific value periodically\n (for example on a daily basis), set ``quota_increase``." 
+ }, + "QuotaSchedulerConfiguration": { + "properties": { + "period": { + "type": "integer", + "exclusiveMinimum": 0.0, + "title": "Period", + "description": "Quota scheduler period specified in seconds", + "default": 1 + }, + "database_reconnection_count": { + "type": "integer", + "exclusiveMinimum": 0.0, + "title": "Database reconnection count on startup", + "description": "Database reconnection count on startup. When database for quota is not available on startup, the service tries to reconnect N times with specified delay.", + "default": 10 + }, + "database_reconnection_delay": { + "type": "integer", + "exclusiveMinimum": 0.0, + "title": "Database reconnection delay", + "description": "Database reconnection delay specified in seconds. When database for quota is not available on startup, the service tries to reconnect N times with specified delay.", + "default": 1 + } + }, + "additionalProperties": false, + "type": "object", + "title": "QuotaSchedulerConfiguration", + "description": "Quota scheduler configuration." + }, + "RAGChunk": { + "properties": { + "content": { + "type": "string", + "title": "Content", + "description": "The content of the chunk" + }, + "source": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source", + "description": "Index name identifying the knowledge source from configuration" + }, + "score": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Score", + "description": "Relevance score" + }, + "attributes": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Attributes", + "description": "Document metadata from the RAG provider (e.g., url, title, author)" + } + }, + "type": "object", + "required": [ + "content" + ], + "title": "RAGChunk", + "description": "Model representing a RAG chunk used in the response." 
+ }, + "RAGInfoResponse": { + "properties": { + "id": { + "type": "string", + "title": "Id", + "description": "Vector DB unique ID", + "examples": [ + "vs_00000000_0000_0000" + ] + }, + "name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Name", + "description": "Human readable vector DB name", + "examples": [ + "Faiss Store with Knowledge base" + ] + }, + "created_at": { + "type": "integer", + "title": "Created At", + "description": "When the vector store was created, represented as Unix time", + "examples": [ + 1763391371 + ] + }, + "last_active_at": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Last Active At", + "description": "When the vector store was last active, represented as Unix time", + "examples": [ + 1763391371 + ] + }, + "usage_bytes": { + "type": "integer", + "title": "Usage Bytes", + "description": "Storage byte(s) used by this vector DB", + "examples": [ + 0 + ] + }, + "expires_at": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Expires At", + "description": "When the vector store expires, represented as Unix time", "examples": [ - "Kubernetes is an open-source container orchestration system for automating ..." + 1763391371 ] }, - "rag_chunks": { - "items": { - "$ref": "#/components/schemas/RAGChunk" - }, - "type": "array", - "title": "Rag Chunks", - "description": "Deprecated: List of RAG chunks used to generate the response." 
+ "object": { + "type": "string", + "title": "Object", + "description": "Object type", + "examples": [ + "vector_store" + ] }, - "referenced_documents": { + "status": { + "type": "string", + "title": "Status", + "description": "Vector DB status", + "examples": [ + "completed" + ] + } + }, + "type": "object", + "required": [ + "id", + "created_at", + "usage_bytes", + "object", + "status" + ], + "title": "RAGInfoResponse", + "description": "Model representing a response with information about RAG DB.", + "examples": [ + { + "created_at": 1763391371, + "id": "vs_7b52a8cf-0fa3-489c-beab-27e061d102f3", + "last_active_at": 1763391371, + "name": "Faiss Store with Knowledge base", + "object": "vector_store", + "status": "completed", + "usage_bytes": 1024000 + } + ] + }, + "RAGListResponse": { + "properties": { + "rags": { "items": { - "$ref": "#/components/schemas/ReferencedDocument" + "type": "string" }, "type": "array", - "title": "Referenced Documents", - "description": "List of documents referenced in generating the response", + "title": "RAG list response", + "description": "List of RAG identifiers", "examples": [ - [ - { - "doc_title": "Operator Lifecycle Manager (OLM)", - "doc_url": "https://docs.openshift.com/container-platform/4.15/operators/olm/index.html" - } - ] + "vs_7b52a8cf-0fa3-489c-beab-27e061d102f3", + "vs_7b52a8cf-0fa3-489c-cafe-27e061d102f3" ] - }, - "truncated": { + } + }, + "type": "object", + "required": [ + "rags" + ], + "title": "RAGListResponse", + "description": "Model representing a response to list RAGs request.", + "examples": [ + { + "rags": [ + "vs_00000000-cafe-babe-0000-000000000000", + "vs_7b52a8cf-0fa3-489c-beab-27e061d102f3", + "vs_7b52a8cf-0fa3-489c-cafe-27e061d102f3" + ] + } + ] + }, + "RHIdentityConfiguration": { + "properties": { + "required_entitlements": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Required entitlements", + "description": "List of all 
required entitlements." + } + }, + "additionalProperties": false, + "type": "object", + "title": "RHIdentityConfiguration", + "description": "Red Hat Identity authentication configuration." + }, + "ReadinessResponse": { + "properties": { + "ready": { "type": "boolean", - "title": "Truncated", - "description": "Deprecated:Whether conversation history was truncated", - "default": false, + "title": "Ready", + "description": "Flag indicating if service is ready", "examples": [ - false, - true + true, + false + ] + }, + "reason": { + "type": "string", + "title": "Reason", + "description": "The reason for the readiness", + "examples": [ + "Service is ready" ] }, - "input_tokens": { - "type": "integer", - "title": "Input Tokens", - "description": "Number of tokens sent to LLM", - "default": 0, - "examples": [ - 150, - 250, - 500 - ] + "providers": { + "items": { + "$ref": "#/components/schemas/ProviderHealthStatus" + }, + "type": "array", + "title": "Providers", + "description": "List of unhealthy providers in case of readiness failure.", + "examples": [] + } + }, + "type": "object", + "required": [ + "ready", + "reason", + "providers" + ], + "title": "ReadinessResponse", + "description": "Model representing response to a readiness request.\n\nAttributes:\n ready: If service is ready.\n reason: The reason for the readiness.\n providers: List of unhealthy providers in case of readiness failure.\n\nExample:\n ```python\n readiness_response = ReadinessResponse(\n ready=False,\n reason=\"Service is not ready\",\n providers=[\n ProviderHealthStatus(\n provider_id=\"ollama\",\n status=\"unhealthy\",\n message=\"Server is unavailable\"\n )\n ]\n )\n ```", + "examples": [ + { + "providers": [], + "ready": true, + "reason": "Service is ready" + } + ] + }, + "ReferencedDocument": { + "properties": { + "doc_url": { + "anyOf": [ + { + "type": "string", + "minLength": 1, + "format": "uri" + }, + { + "type": "null" + } + ], + "title": "Doc Url", + "description": "URL of the referenced 
document" + }, + "doc_title": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Doc Title", + "description": "Title of the referenced document" + }, + "source": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source", + "description": "Index name identifying the knowledge source from configuration" + } + }, + "type": "object", + "title": "ReferencedDocument", + "description": "Model representing a document referenced in generating a response.\n\nAttributes:\n doc_url: URL of the referenced doc.\n doc_title: Title of the referenced doc.\n source: Index name identifying the knowledge source from configuration." + }, + "ResponsesRequest": { + "properties": { + "input": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseMessage-Input" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" + }, + { + "$ref": "#/components/schemas/OpenAIResponseMCPApprovalRequest" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse" + } + ] + }, + "type": "array" + } + ], + "title": "Input" + }, + "model": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Model" + }, + "conversation": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Conversation" }, - "output_tokens": { - "type": "integer", - "title": "Output Tokens", - "description": "Number of tokens received from LLM", - "default": 0, - "examples": [ - 50, - 100, - 200 - ] + "include": { + "anyOf": [ + { + "items": { + "type": 
"string", + "enum": [ + "web_search_call.action.sources", + "code_interpreter_call.outputs", + "computer_call_output.output.image_url", + "file_search_call.results", + "message.input_image.image_url", + "message.output_text.logprobs", + "reasoning.encrypted_content" + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Include" }, - "available_quotas": { - "additionalProperties": { - "type": "integer" - }, - "type": "object", - "title": "Available Quotas", - "description": "Quota available as measured by all configured quota limiters", - "examples": [ + "instructions": { + "anyOf": [ { - "daily": 1000, - "monthly": 50000 + "type": "string" + }, + { + "type": "null" } - ] + ], + "title": "Instructions" }, - "tool_calls": { - "items": { - "$ref": "#/components/schemas/ToolCallSummary" - }, - "type": "array", - "title": "Tool Calls", - "description": "List of tool calls made during response generation" + "max_infer_iters": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Max Infer Iters" }, - "tool_results": { - "items": { - "$ref": "#/components/schemas/ToolResultSummary" - }, - "type": "array", - "title": "Tool Results", - "description": "List of tool results" - } - }, - "type": "object", - "required": [ - "response" - ], - "title": "QueryResponse", - "description": "Model representing LLM response to a query.\n\nAttributes:\n conversation_id: The optional conversation ID (UUID).\n response: The response.\n rag_chunks: Deprecated. 
List of RAG chunks used to generate the response.\n This information is now available in tool_results under file_search_call type.\n referenced_documents: The URLs and titles for the documents used to generate the response.\n tool_calls: List of tool calls made during response generation.\n tool_results: List of tool results.\n truncated: Whether conversation history was truncated.\n input_tokens: Number of tokens sent to LLM.\n output_tokens: Number of tokens received from LLM.\n available_quotas: Quota available as measured by all configured quota limiters.", - "examples": [ - { - "available_quotas": { - "ClusterQuotaLimiter": 998911, - "UserQuotaLimiter": 998911 - }, - "conversation_id": "123e4567-e89b-12d3-a456-426614174000", - "input_tokens": 123, - "output_tokens": 456, - "referenced_documents": [ + "max_tool_calls": { + "anyOf": [ { - "doc_title": "Operator Lifecycle Manager concepts and resources", - "doc_url": "https://docs.openshift.com/container-platform/4.15/operators/understanding/olm/olm-understanding-olm.html" + "type": "integer" + }, + { + "type": "null" } ], - "response": "Operator Lifecycle Manager (OLM) helps users install...", - "tool_calls": [ + "title": "Max Tool Calls" + }, + "metadata": { + "anyOf": [ { - "args": {}, - "id": "1", - "name": "tool1", - "type": "tool_call" + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" } ], - "tool_results": [ + "title": "Metadata" + }, + "parallel_tool_calls": { + "anyOf": [ { - "content": "bla", - "id": "1", - "round": 1, - "status": "success", - "type": "tool_result" + "type": "boolean" + }, + { + "type": "null" } ], - "truncated": false - } - ] - }, - "QuotaExceededResponse": { - "properties": { - "status_code": { - "type": "integer", - "title": "Status Code" + "title": "Parallel Tool Calls" }, - "detail": { - "$ref": "#/components/schemas/DetailModel" - } - }, - "type": "object", - "required": [ - "status_code", - "detail" - ], - "title": 
"QuotaExceededResponse", - "description": "429 Too Many Requests - Quota limit exceeded.", - "examples": [ - { - "detail": { - "cause": "The token quota for model gpt-4-turbo has been exceeded.", - "response": "The model quota has been exceeded" - }, - "label": "model" + "previous_response_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Previous Response Id" }, - { - "detail": { - "cause": "User 123 has no available tokens.", - "response": "The quota has been exceeded" - }, - "label": "user none" + "prompt": { + "anyOf": [ + { + "$ref": "#/components/schemas/OpenAIResponsePrompt" + }, + { + "type": "null" + } + ] }, - { - "detail": { - "cause": "Cluster has no available tokens.", - "response": "The quota has been exceeded" - }, - "label": "cluster none" + "store": { + "type": "boolean", + "title": "Store", + "default": true }, - { - "detail": { - "cause": "Unknown subject 999 has no available tokens.", - "response": "The quota has been exceeded" - }, - "label": "subject none" + "stream": { + "type": "boolean", + "title": "Stream", + "default": false }, - { - "detail": { - "cause": "User 123 has 5 tokens, but 10 tokens are needed.", - "response": "The quota has been exceeded" - }, - "label": "user insufficient" + "temperature": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Temperature" }, - { - "detail": { - "cause": "Cluster has 500 tokens, but 900 tokens are needed.", - "response": "The quota has been exceeded" - }, - "label": "cluster insufficient" + "text": { + "anyOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseText" + }, + { + "type": "null" + } + ] }, - { - "detail": { - "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.", - "response": "The quota has been exceeded" - }, - "label": "subject insufficient" - } - ] - }, - "QuotaHandlersConfiguration": { - "properties": { - "sqlite": { + "tool_choice": { + "anyOf": [ + { + "$ref": 
"#/components/schemas/OpenAIResponseInputToolChoiceMode" + }, + { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceAllowedTools" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceFileSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceFunctionTool" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceMCPTool" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceCustomTool" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "allowed_tools": "#/components/schemas/OpenAIResponseInputToolChoiceAllowedTools", + "custom": "#/components/schemas/OpenAIResponseInputToolChoiceCustomTool", + "file_search": "#/components/schemas/OpenAIResponseInputToolChoiceFileSearch", + "function": "#/components/schemas/OpenAIResponseInputToolChoiceFunctionTool", + "mcp": "#/components/schemas/OpenAIResponseInputToolChoiceMCPTool", + "web_search": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch", + "web_search_2025_08_26": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch", + "web_search_preview": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch", + "web_search_preview_2025_03_11": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch" + } + } + }, + { + "type": "null" + } + ], + "title": "Tool Choice" + }, + "tools": { "anyOf": [ { - "$ref": "#/components/schemas/SQLiteDatabaseConfiguration" + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputToolWebSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolFileSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolFunction" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolMCP" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "file_search": 
"#/components/schemas/OpenAIResponseInputToolFileSearch", + "function": "#/components/schemas/OpenAIResponseInputToolFunction", + "mcp": "#/components/schemas/OpenAIResponseInputToolMCP", + "web_search": "#/components/schemas/OpenAIResponseInputToolWebSearch", + "web_search_2025_08_26": "#/components/schemas/OpenAIResponseInputToolWebSearch", + "web_search_preview": "#/components/schemas/OpenAIResponseInputToolWebSearch", + "web_search_preview_2025_03_11": "#/components/schemas/OpenAIResponseInputToolWebSearch" + } + } + }, + "type": "array" }, { "type": "null" } ], - "title": "SQLite configuration", - "description": "SQLite database configuration" + "title": "Tools" }, - "postgres": { + "generate_topic_summary": { "anyOf": [ { - "$ref": "#/components/schemas/PostgreSQLDatabaseConfiguration" + "type": "boolean" }, { "type": "null" } ], - "title": "PostgreSQL configuration", - "description": "PostgreSQL database configuration" - }, - "limiters": { - "items": { - "$ref": "#/components/schemas/QuotaLimiterConfiguration" - }, - "type": "array", - "title": "Quota limiters", - "description": "Quota limiters configuration" + "title": "Generate Topic Summary", + "default": true }, - "scheduler": { - "$ref": "#/components/schemas/QuotaSchedulerConfiguration", - "title": "Quota scheduler", - "description": "Quota scheduler configuration" + "shield_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Shield Ids" }, - "enable_token_history": { - "type": "boolean", - "title": "Enable token history", - "description": "Enables storing information about token usage history", - "default": false + "solr": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Solr" } }, - "additionalProperties": false, "type": "object", - "title": "QuotaHandlersConfiguration", - "description": "Quota limiter configuration.\n\nIt is possible to limit quota usage per 
user or per service or services\n(that typically run in one cluster). Each limit is configured as a separate\n_quota limiter_. It can be of type `user_limiter` or `cluster_limiter`\n(which is name that makes sense in OpenShift deployment)." + "required": [ + "input" + ], + "title": "ResponsesRequest", + "description": "Model representing a request for the Responses API following LCORE specification.\n\nAttributes:\n input: Input text or structured input items containing the query.\n model: Model identifier in format \"provider/model\". Auto-selected if not provided.\n conversation: Conversation ID linking to an existing conversation. Accepts both\n OpenAI and LCORE formats. Mutually exclusive with previous_response_id.\n include: Explicitly specify output item types that are excluded by default but\n should be included in the response.\n instructions: System instructions or guidelines provided to the model (acts as\n the system prompt).\n max_infer_iters: Maximum number of inference iterations the model can perform.\n max_tool_calls: Maximum number of tool calls allowed in a single response.\n metadata: Custom metadata dictionary with key-value pairs for tracking or logging.\n parallel_tool_calls: Whether the model can make multiple tool calls in parallel.\n previous_response_id: Identifier of the previous response in a multi-turn\n conversation. Mutually exclusive with conversation.\n prompt: Prompt object containing a template with variables for dynamic\n substitution.\n store: Whether to store the response in conversation history. Defaults to True.\n stream: Whether to stream the response as it is generated. 
Defaults to False.\n temperature: Sampling temperature controlling randomness (typically 0.0\u20132.0).\n text: Text response configuration specifying output format constraints (JSON\n schema, JSON object, or plain text).\n tool_choice: Tool selection strategy (\"auto\", \"required\", \"none\", or specific\n tool configuration).\n tools: List of tools available to the model (file search, web search, function\n calls, MCP tools). Defaults to all tools available to the model.\n generate_topic_summary: LCORE-specific flag indicating whether to generate a\n topic summary for new conversations. Defaults to True.\n shield_ids: LCORE-specific list of safety shield IDs to apply. If None, all\n configured shields are used. If provided, must contain at least one valid\n shield ID (empty list raises 422).\n solr: LCORE-specific Solr vector_io provider query parameters (e.g. filter\n queries). Optional.", + "examples": [ + { + "generate_topic_summary": true, + "input": "What is Kubernetes?", + "instructions": "You are a helpful assistant", + "model": "openai/gpt-4o-mini", + "store": true, + "stream": false + } + ] }, - "QuotaLimiterConfiguration": { + "ResponsesResponse": { "properties": { - "type": { + "id": { "type": "string", - "enum": [ - "user_limiter", - "cluster_limiter" - ], - "title": "Quota limiter type", - "description": "Quota limiter type, either user_limiter or cluster_limiter" + "title": "Id" }, - "name": { + "object": { "type": "string", - "title": "Quota limiter name", - "description": "Human readable quota limiter name" - }, - "initial_quota": { - "type": "integer", - "minimum": 0.0, - "title": "Initial quota", - "description": "Quota set at beginning of the period" + "const": "response", + "title": "Object", + "default": "response" }, - "quota_increase": { + "created_at": { "type": "integer", - "minimum": 0.0, - "title": "Quota increase", - "description": "Delta value used to increase quota when period is reached" + "title": "Created At" }, - "period": { + 
"status": { "type": "string", - "title": "Period", - "description": "Period specified in human readable form" - } - }, - "additionalProperties": false, - "type": "object", - "required": [ - "type", - "name", - "initial_quota", - "quota_increase", - "period" - ], - "title": "QuotaLimiterConfiguration", - "description": "Configuration for one quota limiter.\n\nThere are three configuration options for each limiter:\n\n1. ``period`` is specified in a human-readable form, see\n https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-INTERVAL-INPUT\n for all possible options. When the end of the period is reached, the\n quota is reset or increased.\n2. ``initial_quota`` is the value set at the beginning of the period.\n3. ``quota_increase`` is the value (if specified) used to increase the\n quota when the period is reached.\n\nThere are two basic use cases:\n\n1. When the quota needs to be reset to a specific value periodically (for\n example on a weekly or monthly basis), set ``initial_quota`` to the\n required value.\n2. When the quota needs to be increased by a specific value periodically\n (for example on a daily basis), set ``quota_increase``." - }, - "QuotaSchedulerConfiguration": { - "properties": { - "period": { - "type": "integer", - "exclusiveMinimum": 0.0, - "title": "Period", - "description": "Quota scheduler period specified in seconds", - "default": 1 + "title": "Status" }, - "database_reconnection_count": { - "type": "integer", - "exclusiveMinimum": 0.0, - "title": "Database reconnection count on startup", - "description": "Database reconnection count on startup. 
When database for quota is not available on startup, the service tries to reconnect N times with specified delay.", - "default": 10 + "completed_at": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Completed At" }, - "database_reconnection_delay": { - "type": "integer", - "exclusiveMinimum": 0.0, - "title": "Database reconnection delay", - "description": "Database reconnection delay specified in seconds. When database for quota is not available on startup, the service tries to reconnect N times with specified delay.", - "default": 1 - } - }, - "additionalProperties": false, - "type": "object", - "title": "QuotaSchedulerConfiguration", - "description": "Quota scheduler configuration." - }, - "RAGChunk": { - "properties": { - "content": { + "model": { "type": "string", - "title": "Content", - "description": "The content of the chunk" + "title": "Model" }, - "source": { + "output": { + "items": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseMessage-Output" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" + }, + { + "$ref": "#/components/schemas/OpenAIResponseMCPApprovalRequest" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall", + "function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall", + "mcp_approval_request": "#/components/schemas/OpenAIResponseMCPApprovalRequest", + "mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall", + "mcp_list_tools": 
"#/components/schemas/OpenAIResponseOutputMessageMCPListTools", + "message": "#/components/schemas/OpenAIResponseMessage-Output", + "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + } + } + }, + "type": "array", + "title": "Output" + }, + "error": { + "anyOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseError" + }, + { + "type": "null" + } + ] + }, + "instructions": { "anyOf": [ { "type": "string" @@ -8549,53 +11449,39 @@ "type": "null" } ], - "title": "Source", - "description": "Index name identifying the knowledge source from configuration" + "title": "Instructions" }, - "score": { + "max_tool_calls": { "anyOf": [ { - "type": "number" + "type": "integer" }, { "type": "null" } ], - "title": "Score", - "description": "Relevance score" + "title": "Max Tool Calls" }, - "attributes": { + "metadata": { "anyOf": [ { - "additionalProperties": true, + "additionalProperties": { + "type": "string" + }, "type": "object" }, { "type": "null" } ], - "title": "Attributes", - "description": "Document metadata from the RAG provider (e.g., url, title, author)" - } - }, - "type": "object", - "required": [ - "content" - ], - "title": "RAGChunk", - "description": "Model representing a RAG chunk used in the response." 
- }, - "RAGInfoResponse": { - "properties": { - "id": { - "type": "string", - "title": "Id", - "description": "Vector DB unique ID", - "examples": [ - "vs_00000000_0000_0000" - ] + "title": "Metadata" }, - "name": { + "parallel_tool_calls": { + "type": "boolean", + "title": "Parallel Tool Calls", + "default": true + }, + "previous_response_id": { "anyOf": [ { "type": "string" @@ -8604,135 +11490,116 @@ "type": "null" } ], - "title": "Name", - "description": "Human readable vector DB name", - "examples": [ - "Faiss Store with Knowledge base" - ] + "title": "Previous Response Id" }, - "created_at": { - "type": "integer", - "title": "Created At", - "description": "When the vector store was created, represented as Unix time", - "examples": [ - 1763391371 + "prompt": { + "anyOf": [ + { + "$ref": "#/components/schemas/OpenAIResponsePrompt" + }, + { + "type": "null" + } ] }, - "last_active_at": { + "temperature": { "anyOf": [ { - "type": "integer" + "type": "number" }, { "type": "null" } ], - "title": "Last Active At", - "description": "When the vector store was last active, represented as Unix time", - "examples": [ - 1763391371 - ] + "title": "Temperature" }, - "usage_bytes": { - "type": "integer", - "title": "Usage Bytes", - "description": "Storage byte(s) used by this vector DB", - "examples": [ - 0 + "text": { + "anyOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseText" + }, + { + "type": "null" + } ] }, - "expires_at": { + "tool_choice": { "anyOf": [ { - "type": "integer" + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceMode" + }, + { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceAllowedTools" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceFileSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceFunctionTool" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceMCPTool" + 
}, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolChoiceCustomTool" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "allowed_tools": "#/components/schemas/OpenAIResponseInputToolChoiceAllowedTools", + "custom": "#/components/schemas/OpenAIResponseInputToolChoiceCustomTool", + "file_search": "#/components/schemas/OpenAIResponseInputToolChoiceFileSearch", + "function": "#/components/schemas/OpenAIResponseInputToolChoiceFunctionTool", + "mcp": "#/components/schemas/OpenAIResponseInputToolChoiceMCPTool", + "web_search": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch", + "web_search_2025_08_26": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch", + "web_search_preview": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch", + "web_search_preview_2025_03_11": "#/components/schemas/OpenAIResponseInputToolChoiceWebSearch" + } + } }, { "type": "null" } ], - "title": "Expires At", - "description": "When the vector store expires, represented as Unix time", - "examples": [ - 1763391371 - ] - }, - "object": { - "type": "string", - "title": "Object", - "description": "Object type", - "examples": [ - "vector_store" - ] + "title": "Tool Choice" }, - "status": { - "type": "string", - "title": "Status", - "description": "Vector DB status", - "examples": [ - "completed" - ] - } - }, - "type": "object", - "required": [ - "id", - "created_at", - "usage_bytes", - "object", - "status" - ], - "title": "RAGInfoResponse", - "description": "Model representing a response with information about RAG DB.", - "examples": [ - { - "created_at": 1763391371, - "id": "vs_7b52a8cf-0fa3-489c-beab-27e061d102f3", - "last_active_at": 1763391371, - "name": "Faiss Store with Knowledge base", - "object": "vector_store", - "status": "completed", - "usage_bytes": 1024000 - } - ] - }, - "RAGListResponse": { - "properties": { - "rags": { - "items": { - "type": "string" - }, - "type": "array", - "title": "RAG list response", - "description": 
"List of RAG identifiers", - "examples": [ - "vs_7b52a8cf-0fa3-489c-beab-27e061d102f3", - "vs_7b52a8cf-0fa3-489c-cafe-27e061d102f3" - ] - } - }, - "type": "object", - "required": [ - "rags" - ], - "title": "RAGListResponse", - "description": "Model representing a response to list RAGs request.", - "examples": [ - { - "rags": [ - "vs_00000000-cafe-babe-0000-000000000000", - "vs_7b52a8cf-0fa3-489c-beab-27e061d102f3", - "vs_7b52a8cf-0fa3-489c-cafe-27e061d102f3" - ] - } - ] - }, - "RHIdentityConfiguration": { - "properties": { - "required_entitlements": { + "tools": { "anyOf": [ { "items": { - "type": "string" + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputToolWebSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolFileSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolFunction" + }, + { + "$ref": "#/components/schemas/OpenAIResponseToolMCP" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "file_search": "#/components/schemas/OpenAIResponseInputToolFileSearch", + "function": "#/components/schemas/OpenAIResponseInputToolFunction", + "mcp": "#/components/schemas/OpenAIResponseToolMCP", + "web_search": "#/components/schemas/OpenAIResponseInputToolWebSearch", + "web_search_2025_08_26": "#/components/schemas/OpenAIResponseInputToolWebSearch", + "web_search_preview": "#/components/schemas/OpenAIResponseInputToolWebSearch", + "web_search_preview_2025_03_11": "#/components/schemas/OpenAIResponseInputToolWebSearch" + } + } }, "type": "array" }, @@ -8740,77 +11607,20 @@ "type": "null" } ], - "title": "Required entitlements", - "description": "List of all required entitlements." - } - }, - "additionalProperties": false, - "type": "object", - "title": "RHIdentityConfiguration", - "description": "Red Hat Identity authentication configuration." 
- }, - "ReadinessResponse": { - "properties": { - "ready": { - "type": "boolean", - "title": "Ready", - "description": "Flag indicating if service is ready", - "examples": [ - true, - false - ] - }, - "reason": { - "type": "string", - "title": "Reason", - "description": "The reason for the readiness", - "examples": [ - "Service is ready" - ] + "title": "Tools" }, - "providers": { - "items": { - "$ref": "#/components/schemas/ProviderHealthStatus" - }, - "type": "array", - "title": "Providers", - "description": "List of unhealthy providers in case of readiness failure.", - "examples": [] - } - }, - "type": "object", - "required": [ - "ready", - "reason", - "providers" - ], - "title": "ReadinessResponse", - "description": "Model representing response to a readiness request.\n\nAttributes:\n ready: If service is ready.\n reason: The reason for the readiness.\n providers: List of unhealthy providers in case of readiness failure.\n\nExample:\n ```python\n readiness_response = ReadinessResponse(\n ready=False,\n reason=\"Service is not ready\",\n providers=[\n ProviderHealthStatus(\n provider_id=\"ollama\",\n status=\"unhealthy\",\n message=\"Server is unavailable\"\n )\n ]\n )\n ```", - "examples": [ - { - "providers": [], - "ready": true, - "reason": "Service is ready" - } - ] - }, - "ReferencedDocument": { - "properties": { - "doc_url": { + "top_p": { "anyOf": [ { - "type": "string", - "minLength": 1, - "format": "uri" + "type": "number" }, { "type": "null" } ], - "title": "Doc Url", - "description": "URL of the referenced document" + "title": "Top P" }, - "doc_title": { + "truncation": { "anyOf": [ { "type": "string" @@ -8819,25 +11629,82 @@ "type": "null" } ], - "title": "Doc Title", - "description": "Title of the referenced document" + "title": "Truncation" }, - "source": { + "usage": { "anyOf": [ { - "type": "string" + "$ref": "#/components/schemas/OpenAIResponseUsage" }, { "type": "null" } - ], - "title": "Source", - "description": "Index name identifying the 
knowledge source from configuration" + ] + }, + "conversation": { + "type": "string", + "title": "Conversation" + }, + "available_quotas": { + "additionalProperties": { + "type": "integer" + }, + "type": "object", + "title": "Available Quotas" + }, + "output_text": { + "type": "string", + "title": "Output Text" } }, "type": "object", - "title": "ReferencedDocument", - "description": "Model representing a document referenced in generating a response.\n\nAttributes:\n doc_url: Url to the referenced doc.\n doc_title: Title of the referenced doc." + "required": [ + "id", + "created_at", + "status", + "model", + "output", + "conversation", + "available_quotas", + "output_text" + ], + "title": "ResponsesResponse", + "description": "Model representing a response from the Responses API following LCORE specification.\n\nAttributes:\n id: Unique identifier for this response.\n object: Object type identifier, always \"response\".\n created_at: Unix timestamp when the response was created.\n status: Current status of the response (e.g., \"completed\", \"blocked\",\n \"in_progress\").\n completed_at: Unix timestamp when the response was completed, if applicable.\n model: Model identifier in \"provider/model\" format used for generation.\n output: List of structured output items containing messages, tool calls, and\n other content. 
This is the primary response content.\n error: Error details if the response failed or was blocked.\n instructions: System instructions or guidelines provided to the model.\n max_tool_calls: Maximum number of tool calls allowed in a single response.\n metadata: Additional metadata dictionary with custom key-value pairs.\n parallel_tool_calls: Whether the model can make multiple tool calls in parallel.\n previous_response_id: Identifier of the previous response in a multi-turn\n conversation.\n prompt: The input prompt object that was sent to the model.\n temperature: Temperature parameter used for generation (controls randomness).\n text: Text response configuration object used for OpenAI responses.\n tool_choice: Tool selection strategy used (e.g., \"auto\", \"required\", \"none\").\n tools: List of tools available to the model during generation.\n top_p: Top-p sampling parameter used for generation.\n truncation: Strategy used for handling content that exceeds context limits.\n usage: Token usage statistics including input_tokens, output_tokens, and\n total_tokens. 
None for intermediate responses.\n conversation: Conversation ID linking this response to a conversation thread\n (LCORE-specific).\n available_quotas: Remaining token quotas for the user (LCORE-specific).\n output_text: Aggregated text output from all output_text items in the\n output array.", + "examples": [ + { + "available_quotas": { + "daily": 1000, + "monthly": 50000 + }, + "completed_at": 1704067250, + "conversation": "conv_0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", + "created_at": 1704067200, + "id": "resp_abc123", + "instructions": "You are a helpful assistant", + "model": "openai/gpt-4-turbo", + "object": "response", + "output": [ + { + "content": [ + { + "text": "Kubernetes is an open-source container orchestration system...", + "type": "output_text" + } + ], + "role": "assistant", + "type": "message" + } + ], + "output_text": "Kubernetes is an open-source container orchestration system...", + "status": "completed", + "temperature": 0.7, + "usage": { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150 + } + } + ] }, "RlsapiV1Attachment": { "properties": { @@ -9103,6 +11970,36 @@ "title": "SQLiteDatabaseConfiguration", "description": "SQLite database configuration." 
}, + "SearchRankingOptions": { + "properties": { + "ranker": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Ranker" + }, + "score_threshold": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Score Threshold", + "default": 0.0 + } + }, + "type": "object", + "title": "SearchRankingOptions", + "description": "Options for ranking and filtering search results.\n\n:param ranker: (Optional) Name of the ranking algorithm to use\n:param score_threshold: (Optional) Minimum relevance score threshold for results" + }, "SecurityScheme": { "anyOf": [ { diff --git a/docs/responses.md b/docs/responses.md index 80131048b..0c8973166 100644 --- a/docs/responses.md +++ b/docs/responses.md @@ -102,6 +102,7 @@ The following fields are LCORE-specific request extensions and are not part of t | Field | Type | Description | Required | |-------|------|-------------|----------| | `generate_topic_summary` | boolean | Generate topic summary for new conversations | No | +| `shield_ids` | array[string] | Safety shield IDs to apply. If omitted, all configured shields are used; if provided, must contain at least one valid shield ID (empty list returns 422). 
| No | | `solr` | dictionary | Solr vector_io provider query parameters | No | @@ -119,6 +120,7 @@ The following table maps LCORE query request fields to the OpenResponses request | `no_tools` | `tool_choice` | `no_tools=true` mapped to `tool_choice="none"` | | `vector_store_ids` | `tools` + `tool_choice` | Vector stores can be explicitly specified and restricted by `file_search` tool type's `vector_store_ids` attribute | | `generate_topic_summary` | N/A | Exposed directly (LCORE-specific) | +| `shield_ids` | N/A | Exposed directly (LCORE-specific) | | `solr` | N/A | Exposed directly (LCORE-specific) | **Note:** The `media_type` attribute is not present in the LCORE specification, as downstream logic determines which format to process (structured `output` or textual `output_text` response attributes). @@ -491,6 +493,7 @@ Vector store IDs are configured within the `tools` array (e.g., as `file_search` The API introduces extensions that are not part of the OpenResponses specification: - `generate_topic_summary` (request) — When set to `true` and a new conversation is created, a topic summary is automatically generated and stored in conversation metadata. +- `shield_ids` (request) — Optional list of safety shield IDs to apply. If omitted, all configured shields are used; if provided, must contain at least one valid shield ID (empty list returns 422). - `solr` (request) — Solr vector_io provider query parameters (e.g. filter queries). - `available_quotas` (response) — Provides real-time quota information from all configured quota limiters. 
diff --git a/src/app/endpoints/responses.py b/src/app/endpoints/responses.py new file mode 100644 index 000000000..d9621dc3d --- /dev/null +++ b/src/app/endpoints/responses.py @@ -0,0 +1,756 @@ +# pylint: disable=too-many-locals,too-many-branches,too-many-nested-blocks, too-many-arguments,too-many-positional-arguments + +"""Handler for REST API call to provide answer using Responses API (LCORE specification).""" + +import json +from datetime import UTC, datetime +from typing import Annotated, Any, AsyncIterator, Optional, Union, cast + +from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi.responses import StreamingResponse +from llama_stack_api import OpenAIResponseObject, OpenAIResponseObjectStream +from llama_stack_api.openai_responses import ( + OpenAIResponseMessage, + OpenAIResponseObjectStreamResponseOutputItemAdded as OutputItemAddedChunk, + OpenAIResponseObjectStreamResponseOutputItemDone as OutputItemDoneChunk, + OpenAIResponseUsage as Usage, +) +from llama_stack_client import ( + APIConnectionError, + APIStatusError as LLSApiStatusError, + AsyncLlamaStackClient, +) +from openai._exceptions import ( + APIStatusError as OpenAIAPIStatusError, +) + +from authentication import get_auth_dependency +from authentication.interface import AuthTuple +from authorization.azure_token_manager import AzureEntraIDManager +from authorization.middleware import authorize +from client import AsyncLlamaStackClientHolder +from configuration import configuration +from log import get_logger +from models.config import Action +from models.database.conversations import UserConversation +from models.requests import ResponsesRequest +from models.responses import ( + ForbiddenResponse, + InternalServerErrorResponse, + NotFoundResponse, + PromptTooLongResponse, + QuotaExceededResponse, + ResponsesResponse, + ServiceUnavailableResponse, + UnauthorizedResponse, + UnprocessableEntityResponse, +) + +from utils.conversations import add_response_to_conversation +from 
utils.endpoints import ( + check_configuration_loaded, + retrieve_turn_by_response_id, + validate_and_retrieve_conversation, +) +from utils.mcp_headers import mcp_headers_dependency +from utils.query import ( + consume_query_tokens, + extract_provider_and_model_from_model_id, + handle_known_apistatus_errors, + store_query_results, + update_azure_token, + validate_model_provider_override, +) +from utils.quota import check_tokens_available, get_available_quotas +from utils.responses import ( + build_turn_summary, + check_model_configured, + create_new_conversation, + deduplicate_referenced_documents, + extract_text_from_response_item, + extract_text_from_response_items, + extract_token_usage, + extract_vector_store_ids_from_tools, + get_topic_summary, + prepare_tools, + select_model_for_responses, +) +from utils.conversations import append_turn_items_to_conversation +from utils.shields import run_shield_moderation +from utils.suid import ( + normalize_conversation_id, + to_llama_stack_conversation_id, +) +from utils.types import ( + ReferencedDocument, + ResponseInput, + ResponsesApiParams, + ShieldModerationResult, + TurnSummary, +) +from utils.vector_search import ( + build_message_from_static_rag, + format_rag_context_for_injection, + perform_vector_search, +) + +logger = get_logger(__name__) +router = APIRouter(tags=["responses"]) + +responses_response: dict[int | str, dict[str, Any]] = { + 200: ResponsesResponse.openapi_response(), + 401: UnauthorizedResponse.openapi_response( + examples=["missing header", "missing token"] + ), + 403: ForbiddenResponse.openapi_response( + examples=["endpoint", "conversation read", "model override"] + ), + 404: NotFoundResponse.openapi_response( + examples=["model", "conversation", "provider"] + ), + 413: PromptTooLongResponse.openapi_response(), + 422: UnprocessableEntityResponse.openapi_response(), + 429: QuotaExceededResponse.openapi_response(), + 500: InternalServerErrorResponse.openapi_response(examples=["configuration"]), + 
@router.post(
    "/responses",
    responses=responses_response,
    response_model=None,
    summary="Responses Endpoint Handler",
)
@authorize(Action.QUERY)
async def responses_endpoint_handler(
    request: Request,
    responses_request: ResponsesRequest,
    auth: Annotated[AuthTuple, Depends(get_auth_dependency())],
    _mcp_headers: dict[str, dict[str, str]] = Depends(mcp_headers_dependency),
) -> Union[ResponsesResponse, StreamingResponse]:
    """
    Handle request to the /responses endpoint using Responses API (LCORE specification).

    Resolves the conversation context (explicit conversation ID, previous
    response ID with optional fork, or a brand-new conversation), selects a
    model when none is given, injects static RAG context, runs shield
    moderation, and dispatches to the streaming or non-streaming handler.

    Returns:
        ResponsesResponse: Response following LCORE specification (non-streaming).
        StreamingResponse: SSE-formatted streaming response with enriched events.

    Raises:
        HTTPException:
            - 401: Missing or invalid credentials
            - 403: Insufficient permissions or model override not allowed
            - 404: Conversation, model, or provider not found
            - 413: Prompt exceeded the model's context window size
            - 422: Request validation failed
            - 429: Token quota for model or user has been exceeded
            - 500: Configuration not loaded or other server errors
            - 503: Unable to connect to Llama Stack backend
    """
    # Work on a private copy so request mutations never leak to the caller.
    responses_request = responses_request.model_copy(deep=True)
    check_configuration_loaded(configuration)
    started_at = datetime.now(UTC)
    user_id = auth[0]

    # Fail fast when the user's token quota is already exhausted.
    check_tokens_available(configuration.quota_limiters, user_id)

    # RBAC: a caller may only pin a model if overrides are authorized.
    if responses_request.model:
        validate_model_provider_override(
            responses_request.model,
            None,  # provider is encoded as the model-id prefix
            request.state.authorized_actions,
        )

    client = AsyncLlamaStackClientHolder().get_client()

    user_conversation: Optional[UserConversation] = None
    others_allowed = (
        Action.READ_OTHERS_CONVERSATIONS in request.state.authorized_actions
    )

    if responses_request.conversation:
        # Context supplied via an explicit conversation ID.
        logger.info(
            "Conversation ID specified in request: %s", responses_request.conversation
        )
        user_conversation = validate_and_retrieve_conversation(
            normalized_conv_id=normalize_conversation_id(
                responses_request.conversation
            ),
            user_id=user_id,
            others_allowed=others_allowed,
        )
        responses_request.conversation = to_llama_stack_conversation_id(
            user_conversation.id
        )
        # Existing conversations never get a fresh topic summary.
        responses_request.generate_topic_summary = False
    elif responses_request.previous_response_id:
        # Context supplied via a previous response ID: resolve its conversation.
        user_turn = retrieve_turn_by_response_id(responses_request.previous_response_id)
        user_conversation = validate_and_retrieve_conversation(
            normalized_conv_id=user_turn.conversation_id,
            user_id=user_id,
            others_allowed=others_allowed,
        )
        if user_conversation.last_response_id != responses_request.previous_response_id:
            # Referenced response is not the latest one: fork into a new
            # conversation and copy the referenced turn over.
            logger.info("Forking conversation")
            responses_request.conversation = await create_new_conversation(client)
            await add_response_to_conversation(
                client,
                responses_request.previous_response_id,
                responses_request.conversation,
            )
        else:
            # Latest response referenced: continue the existing conversation.
            logger.info("No fork, using existing conversation")
            responses_request.conversation = to_llama_stack_conversation_id(
                user_conversation.id
            )
            # NOTE(review): summary suppression placed in the no-fork branch
            # only (fork creates a new conversation) — confirm intended.
            responses_request.generate_topic_summary = False
    else:
        # No context at all: start a brand-new conversation.
        responses_request.conversation = await create_new_conversation(client)

    # LCORE-specific: automatically select a model when none was requested.
    # This extends the base LLS API, which requires a model to be specified.
    if not responses_request.model:
        responses_request.model = await select_model_for_responses(
            client, user_conversation
        )
    if not await check_model_configured(client, responses_request.model):
        _, model_id = extract_provider_and_model_from_model_id(responses_request.model)
        error_response = NotFoundResponse(resource="model", resource_id=model_id)
        raise HTTPException(**error_response.model_dump())

    # Azure Entra ID tokens are short-lived; refresh lazily when expired.
    azure_manager = AzureEntraIDManager()
    if (
        responses_request.model.startswith("azure")
        and azure_manager.is_entra_id_configured
        and azure_manager.is_token_expired
        and azure_manager.refresh_token()
    ):
        client = await update_azure_token(client)

    input_text = (
        responses_request.input
        if isinstance(responses_request.input, str)
        else extract_text_from_response_items(responses_request.input)
    )

    # Static RAG: run the vector search up-front and inject the results into
    # the request input before it reaches the model.
    _, _, doc_ids_from_chunks, pre_rag_chunks = await perform_vector_search(
        client, input_text, responses_request.solr
    )
    rag_context = format_rag_context_for_injection(pre_rag_chunks)
    if isinstance(responses_request.input, str):
        responses_request.input = responses_request.input + rag_context
    else:
        responses_request.input.append(build_message_from_static_rag(rag_context))

    # Shield moderation runs on the raw user text (without the RAG context).
    moderation_result = await run_shield_moderation(
        client, input_text, responses_request.shield_ids
    )

    # Default to every tool configured in LCORE when the request names none.
    if responses_request.tools is None:
        responses_request.tools = await prepare_tools(
            client,
            None,
            False,
            auth[1],
            _mcp_headers,
        )

    handler = (
        handle_streaming_response
        if responses_request.stream
        else handle_non_streaming_response
    )
    return await handler(
        client=client,
        request=responses_request,
        auth=auth,
        input_text=input_text,
        started_at=started_at,
        moderation_result=moderation_result,
        static_rag_docs=doc_ids_from_chunks,
    )
async def handle_streaming_response(
    client: AsyncLlamaStackClient,
    request: ResponsesRequest,
    auth: AuthTuple,
    input_text: str,
    started_at: datetime,
    moderation_result: ShieldModerationResult,
    static_rag_docs: list[ReferencedDocument],
) -> StreamingResponse:
    """Build the SSE streaming response for a /responses request.

    Chooses one of two event generators: a synthetic shield-violation
    sequence when moderation blocked the input, or the live Llama Stack
    response stream otherwise.  Either generator is wrapped by
    ``generate_response``, which persists the turn once streaming finishes.

    Args:
        client: The AsyncLlamaStackClient instance.
        request: ResponsesRequest (includes LCORE-specific fields such as
            generate_topic_summary).
        auth: Authentication tuple; auth[0] is the user ID.
        input_text: The extracted plain-text input.
        started_at: Timestamp when the conversation turn started.
        moderation_result: Result of the shield moderation check.
        static_rag_docs: Static RAG documents to attach to the response.

    Returns:
        StreamingResponse with SSE-formatted events (``text/event-stream``).

    Raises:
        HTTPException: 413 on context overflow, 503 when the backend is
            unreachable, or the mapped status for other API errors.
    """
    api_params = ResponsesApiParams.model_validate(request.model_dump())
    turn_summary = TurnSummary()

    if moderation_result.decision == "blocked":
        # Shield refused the input: stream a synthetic refusal instead of
        # contacting the model at all.
        turn_summary.id = moderation_result.moderation_id
        turn_summary.llm_response = moderation_result.message
        available_quotas = get_available_quotas(
            quota_limiters=configuration.quota_limiters, user_id=auth[0]
        )
        generator = shield_violation_generator(
            moderation_result.refusal_response,
            api_params.conversation,
            moderation_result.moderation_id,
            request.echoed_params(),
            started_at,
            available_quotas,
        )
        if api_params.store:
            await append_turn_items_to_conversation(
                client,
                api_params.conversation,
                request.input,
                llm_output=[moderation_result.refusal_response],
            )
    else:
        try:
            # The new conversation is intentionally not passed here so the
            # model receives the full context assembled upstream.
            response = await client.responses.create(**api_params.dump_for_create())
            generator = response_generator(
                cast(AsyncIterator[OpenAIResponseObjectStream], response),
                request.input,
                api_params,
                auth[0],
                turn_summary,
                vector_store_ids=extract_vector_store_ids_from_tools(api_params.tools),
                static_rag_docs=static_rag_docs,
            )
        except RuntimeError as e:
            # Library mode surfaces context-window overflows as RuntimeError.
            if "context_length" in str(e).lower():
                error_response = PromptTooLongResponse(model=api_params.model)
                raise HTTPException(**error_response.model_dump()) from e
            raise
        except APIConnectionError as e:
            error_response = ServiceUnavailableResponse(
                backend_name="Llama Stack",
                cause=str(e),
            )
            raise HTTPException(**error_response.model_dump()) from e
        except (LLSApiStatusError, OpenAIAPIStatusError) as e:
            error_response = handle_known_apistatus_errors(e, api_params.model)
            raise HTTPException(**error_response.model_dump()) from e

    return StreamingResponse(
        generate_response(
            generator,
            turn_summary,
            client=client,
            auth=auth,
            input_text=input_text,
            started_at=started_at,
            api_params=api_params,
            generate_topic_summary=request.generate_topic_summary or False,
        ),
        media_type="text/event-stream",
    )
async def shield_violation_generator(
    refusal_response: OpenAIResponseMessage,
    conversation_id: str,
    response_id: str,
    echoed_params: dict[str, Any],
    created_at: datetime,
    available_quotas: dict[str, int],
) -> AsyncIterator[str]:
    """Generate the SSE event sequence for a shield-blocked request.

    Follows the Open Responses streaming spec:
    - Content-Type: text/event-stream
    - Each event has an 'event:' field matching the 'type' in the event body
    - Data objects are JSON-encoded strings
    - Terminal event is the literal string [DONE]
    - Emits the full event sequence: response.created (in_progress),
      output_item.added, output_item.done, response.completed (completed)

    The single output item is the shield's synthetic refusal message; no
    model call is made and usage is reported as zero tokens.

    Args:
        refusal_response: The refusal response message object.
        conversation_id: The conversation ID to include in the response.
        response_id: Unique identifier for this response.
        echoed_params: Echoed parameters from the request.
        created_at: Datetime when processing started; serialized as an epoch
            integer in the ``created_at`` response field.
        available_quotas: Available quotas dictionary for the user.

    Yields:
        SSE-formatted strings for streaming events, ending with [DONE].
    """
    normalized_conv_id = normalize_conversation_id(conversation_id)

    # 1. Send response.created event with status "in_progress" and empty output
    created_response_object = ResponsesResponse.model_construct(
        id=response_id,
        object="response",
        created_at=int(created_at.timestamp()),
        status="in_progress",
        output=[],
        conversation=normalized_conv_id,
        available_quotas={},
        output_text="",
        **echoed_params,
    )
    created_event = {
        "type": "response.created",
        "sequence_number": 0,
        "response": created_response_object.model_dump(exclude_none=True),
    }
    yield f"event: response.created\ndata: {json.dumps(created_event)}\n\n"

    # 2. Send response.output_item.added event
    item_added_event = OutputItemAddedChunk(
        response_id=response_id,
        item=refusal_response,
        output_index=0,
        sequence_number=1,
    )
    data_json = json.dumps(item_added_event.model_dump(exclude_none=True))
    yield f"event: response.output_item.added\ndata: {data_json}\n\n"

    # 3. Send response.output_item.done event
    item_done_event = OutputItemDoneChunk(
        response_id=response_id,
        item=refusal_response,
        output_index=0,
        sequence_number=2,
    )
    data_json = json.dumps(item_done_event.model_dump(exclude_none=True))
    yield f"event: response.output_item.done\ndata: {data_json}\n\n"

    # 4. Send response.completed event with status "completed" and the
    #    refusal message as the only output item (zero token usage).
    completed_response_object = ResponsesResponse.model_construct(
        id=response_id,
        created_at=int(created_at.timestamp()),
        object="response",
        completed_at=int(datetime.now(UTC).timestamp()),
        status="completed",
        output=[refusal_response],
        usage=Usage(input_tokens=0, output_tokens=0, total_tokens=0),
        conversation=normalized_conv_id,
        available_quotas=available_quotas,
        output_text=extract_text_from_response_item(refusal_response),
        **echoed_params,
    )
    completed_event = {
        "type": "response.completed",
        "sequence_number": 3,
        "response": completed_response_object.model_dump(exclude_none=True),
        "available_quotas": available_quotas,
    }
    yield f"event: response.completed\ndata: {json.dumps(completed_event)}\n\n"

    yield "data: [DONE]\n\n"
async def response_generator(
    stream: AsyncIterator[OpenAIResponseObjectStream],
    user_input: ResponseInput,
    api_params: ResponsesApiParams,
    user_id: str,
    turn_summary: TurnSummary,
    static_rag_docs: list[ReferencedDocument],
    vector_store_ids: Optional[list[str]] = None,
) -> AsyncIterator[str]:
    """Generate SSE-formatted streaming response with LCORE-enriched events.

    Every chunk is given a locally assigned sequence number and, when it
    carries a response object, the normalized conversation ID.  Terminal
    events (completed/incomplete/failed) additionally trigger token
    accounting and carry the user's remaining quotas.

    Args:
        stream: The streaming response chunks from Llama Stack.
        user_input: User input to the response (persisted on explicit append).
        api_params: Resolved ResponsesApiParams (model, conversation, store).
        user_id: User ID for quota consumption and retrieval.
        turn_summary: TurnSummary to populate during streaming.
        static_rag_docs: Static RAG documents merged into referenced docs.
        vector_store_ids: Vector store IDs used in the query for source
            resolution.

    Yields:
        SSE-formatted strings for streaming events, ending with [DONE].
    """
    normalized_conv_id = normalize_conversation_id(api_params.conversation)

    logger.debug("Starting streaming response (Responses API) processing")

    latest_response_object: Optional[OpenAIResponseObject] = None
    sequence_number = 0

    async for chunk in stream:
        event_type = getattr(chunk, "type", None)
        logger.debug("Processing streaming chunk, type: %s", event_type)

        chunk_dict = chunk.model_dump(exclude_none=True)

        # Create our own sequence number for chunks to maintain order
        chunk_dict["sequence_number"] = sequence_number
        sequence_number += 1

        # Add conversation attribute to the response if chunk has it
        if "response" in chunk_dict:
            chunk_dict["response"]["conversation"] = normalized_conv_id

        # Intermediate response - no quota consumption yet
        if event_type == "response.in_progress":
            chunk_dict["response"]["available_quotas"] = {}

        # Handle completion, incomplete, and failed events - only quota
        # handling here
        if event_type in (
            "response.completed",
            "response.incomplete",
            "response.failed",
        ):
            latest_response_object = cast(
                OpenAIResponseObject, getattr(chunk, "response")
            )

            # Extract and consume tokens if any were used
            turn_summary.token_usage = extract_token_usage(
                latest_response_object.usage, api_params.model
            )
            consume_query_tokens(
                user_id=user_id,
                model_id=api_params.model,
                token_usage=turn_summary.token_usage,
            )

            # Get available quotas after token consumption
            available_quotas = get_available_quotas(
                quota_limiters=configuration.quota_limiters, user_id=user_id
            )
            chunk_dict["response"]["available_quotas"] = available_quotas

        data_json = json.dumps(chunk_dict)
        yield f"event: {event_type}\ndata: {data_json}\n\n"

    # Extract response metadata from the final response object.
    # NOTE(review): latest_response_object stays None if the stream ends
    # without a terminal event — confirm build_turn_summary tolerates None.
    t = build_turn_summary(
        latest_response_object,
        api_params.model,
        vector_store_ids,
        configuration.rag_id_mapping,
    )
    t.referenced_documents = deduplicate_referenced_documents(
        static_rag_docs + t.referenced_documents
    )

    # Copy turn summary fields to the caller-owned turn_summary
    for field, value in t.model_dump().items():
        setattr(turn_summary, field, value)

    client = AsyncLlamaStackClientHolder().get_client()
    # Explicitly append the turn to conversation if context passed by
    # previous response
    if api_params.store and api_params.previous_response_id and latest_response_object:
        await append_turn_items_to_conversation(
            client, api_params.conversation, user_input, latest_response_object.output
        )

    yield "data: [DONE]\n\n"
+ + Args: + generator: The SSE event generator + turn_summary: TurnSummary populated during streaming + moderation_result: Result of shield moderation check + client: The AsyncLlamaStackClient instance + user_id: The authenticated user ID + input_text: The extracted input text + started_at: Timestamp when the conversation started + api_params: ResponsesApiParams + generate_topic_summary: Whether to generate topic summary for new conversations + + Yields: + SSE-formatted strings from the generator + """ + user_id, _, skip_userid_check, _ = auth + async for event in generator: + yield event + + # Get topic summary for new conversation + topic_summary = None + if generate_topic_summary: + logger.debug("Generating topic summary for new conversation") + topic_summary = await get_topic_summary(input_text, client, api_params.model) + + completed_at = datetime.now(UTC) + if api_params.store: + store_query_results( + user_id=user_id, + conversation_id=normalize_conversation_id(api_params.conversation), + model=api_params.model, + started_at=started_at.strftime("%Y-%m-%dT%H:%M:%SZ"), + completed_at=completed_at.strftime("%Y-%m-%dT%H:%M:%SZ"), + summary=turn_summary, + query=input_text, + attachments=[], + skip_userid_check=skip_userid_check, + topic_summary=topic_summary, + ) + + +async def handle_non_streaming_response( + client: AsyncLlamaStackClient, + request: ResponsesRequest, + auth: AuthTuple, + input_text: str, + started_at: datetime, + moderation_result: ShieldModerationResult, + static_rag_docs: list[ReferencedDocument], +) -> ResponsesResponse: + """Handle non-streaming response from Responses API. 
async def handle_non_streaming_response(
    client: AsyncLlamaStackClient,
    request: ResponsesRequest,
    auth: AuthTuple,
    input_text: str,
    started_at: datetime,
    moderation_result: ShieldModerationResult,
    static_rag_docs: list[ReferencedDocument],
) -> ResponsesResponse:
    """Handle non-streaming response from Responses API.

    Either synthesizes a refusal response (shield blocked) or calls the
    backend, then consumes tokens, optionally generates a topic summary,
    persists the turn, and builds the LCORE-shaped response.

    Args:
        client: The AsyncLlamaStackClient instance.
        request: ResponsesRequest with resolved model/conversation.
        auth: Authentication tuple; provides the user ID and the
            skip-userid-check flag.
        input_text: The extracted input text.
        started_at: Timestamp when the conversation started.
        moderation_result: Result of shield moderation check.
        static_rag_docs: Static RAG documents to be used for the response.

    Returns:
        ResponsesResponse with the completed response.

    Raises:
        HTTPException: 413 on context overflow, 503 when the backend is
            unreachable, or the mapped status for other API errors.
    """
    user_id, _, skip_userid_check, _ = auth
    api_params = ResponsesApiParams.model_validate(request.model_dump())

    # Fork: Get response object (blocked vs normal)
    if moderation_result.decision == "blocked":
        # Shield refused the input: build a synthetic completed response with
        # the refusal message and zero token usage; no model call is made.
        response_id = moderation_result.moderation_id
        api_response = OpenAIResponseObject.model_construct(
            id=response_id,
            object="response",
            created_at=int(started_at.timestamp()),
            status="completed",
            output=[moderation_result.refusal_response],
            usage=Usage(input_tokens=0, output_tokens=0, total_tokens=0),
            **request.echoed_params(),
        )
        output_text = moderation_result.message
        if api_params.store:
            await append_turn_items_to_conversation(
                client,
                api_params.conversation,
                request.input,
                llm_output=[moderation_result.refusal_response],
            )
    else:
        try:
            api_response = cast(
                OpenAIResponseObject,
                await client.responses.create(**api_params.dump_for_create()),
            )
            token_usage = extract_token_usage(api_response.usage, api_params.model)
            logger.info("Consuming tokens")
            consume_query_tokens(
                user_id=user_id,
                model_id=api_params.model,
                token_usage=token_usage,
            )
            output_text = extract_text_from_response_items(api_response.output)
            # Explicitly append the turn to conversation if context passed by
            # previous response
            if api_params.store and api_params.previous_response_id:
                await append_turn_items_to_conversation(
                    client, api_params.conversation, request.input, api_response.output
                )

        except RuntimeError as e:
            # Library mode surfaces context-window overflows as RuntimeError.
            if "context_length" in str(e).lower():
                error_response = PromptTooLongResponse(model=api_params.model)
                raise HTTPException(**error_response.model_dump()) from e
            raise
        except APIConnectionError as e:
            error_response = ServiceUnavailableResponse(
                backend_name="Llama Stack",
                cause=str(e),
            )
            raise HTTPException(**error_response.model_dump()) from e
        except (LLSApiStatusError, OpenAIAPIStatusError) as e:
            error_response = handle_known_apistatus_errors(e, api_params.model)
            raise HTTPException(**error_response.model_dump()) from e

    # Get available quotas
    logger.info("Getting available quotas")
    available_quotas = get_available_quotas(
        quota_limiters=configuration.quota_limiters, user_id=user_id
    )
    # Get topic summary for new conversation
    topic_summary = None
    if request.generate_topic_summary:
        logger.debug("Generating topic summary for new conversation")
        topic_summary = await get_topic_summary(input_text, client, api_params.model)

    vector_store_ids = extract_vector_store_ids_from_tools(api_params.tools)
    turn_summary = build_turn_summary(
        api_response,
        api_params.model,
        vector_store_ids,
        configuration.rag_id_mapping,
    )
    turn_summary.referenced_documents = deduplicate_referenced_documents(
        static_rag_docs + turn_summary.referenced_documents
    )
    completed_at = datetime.now(UTC)
    if api_params.store:
        store_query_results(
            user_id=user_id,
            conversation_id=normalize_conversation_id(api_params.conversation),
            model=api_params.model,
            started_at=started_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
            completed_at=completed_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
            summary=turn_summary,
            query=input_text,
            attachments=[],
            skip_userid_check=skip_userid_check,
            topic_summary=topic_summary,
        )
    response = ResponsesResponse.model_validate(
        {
            **api_response.model_dump(exclude_none=True),
            "available_quotas": available_quotas,
            "conversation": normalize_conversation_id(api_params.conversation),
            "completed_at": int(completed_at.timestamp()),
            "output_text": output_text,
        }
    )
    return response
+ response_id: Mapped[str] = mapped_column(nullable=True, index=True) diff --git a/src/models/requests.py b/src/models/requests.py index fc408694b..8ce41aa4f 100644 --- a/src/models/requests.py +++ b/src/models/requests.py @@ -6,10 +6,10 @@ from llama_stack_api.openai_responses import ( OpenAIResponseInputToolChoice as ToolChoice, - OpenAIResponseInputToolChoiceMode as ToolChoiceMode, OpenAIResponseInputTool as InputTool, OpenAIResponsePrompt as Prompt, OpenAIResponseText as Text, + OpenAIResponseToolMCP as OutputToolMCP, ) from pydantic import BaseModel, Field, field_validator, model_validator @@ -20,6 +20,22 @@ logger = get_logger(__name__) +# Attribute names that are echoed back in the response. +_ECHOED_FIELDS = set( + { + "instructions", + "max_tool_calls", + "metadata", + "model", + "parallel_tool_calls", + "previous_response_id", + "prompt", + "temperature", + "text", + "tool_choice", + } +) + class Attachment(BaseModel): """Model representing an attachment that can be send from the UI as part of query. @@ -629,11 +645,14 @@ class ResponsesRequest(BaseModel): text: Text response configuration specifying output format constraints (JSON schema, JSON object, or plain text). tool_choice: Tool selection strategy ("auto", "required", "none", or specific - tool configuration). Defaults to "auto". + tool configuration). tools: List of tools available to the model (file search, web search, function calls, MCP tools). Defaults to all tools available to the model. generate_topic_summary: LCORE-specific flag indicating whether to generate a topic summary for new conversations. Defaults to True. + shield_ids: LCORE-specific list of safety shield IDs to apply. If None, all + configured shields are used. If provided, must contain at least one valid + shield ID (empty list raises 422). solr: LCORE-specific Solr vector_io provider query parameters (e.g. filter queries). Optional. 
""" @@ -653,50 +672,22 @@ class ResponsesRequest(BaseModel): stream: bool = False temperature: Optional[float] = None text: Optional[Text] = None - tool_choice: Optional[ToolChoice] = ToolChoiceMode.auto + tool_choice: Optional[ToolChoice] = None tools: Optional[list[InputTool]] = None generate_topic_summary: Optional[bool] = True + shield_ids: Optional[list[str]] = None solr: Optional[dict[str, Any]] = None model_config = { - "extra": "forbid", + "extra": "ignore", "json_schema_extra": { "examples": [ { "input": "What is Kubernetes?", "model": "openai/gpt-4o-mini", - "conversation": "conv_0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", "instructions": "You are a helpful assistant", - "include": ["message.output_text.logprobs"], - "max_tool_calls": 5, - "metadata": {"source": "api"}, - "parallel_tool_calls": True, - "prompt": { - "id": "prompt_123", - "variables": { - "topic": {"type": "input_text", "text": "Kubernetes"} - }, - "version": "1.0", - }, "store": True, "stream": False, - "temperature": 0.7, - "text": { - "format": { - "type": "json_schema", - "schema": { - "type": "object", - "properties": {"answer": {"type": "string"}}, - }, - } - }, - "tool_choice": "auto", - "tools": [ - { - "type": "file_search", - "vector_store_ids": ["vs_123"], - } - ], "generate_topic_summary": True, } ] @@ -731,3 +722,26 @@ def check_suid(cls, value: Optional[str]) -> Optional[str]: if value and not suid.check_suid(value): raise ValueError(f"Improper conversation ID '{value}'") return value + + def echoed_params(self) -> dict[str, Any]: + """Dump attributes that are echoed back in the response. + + The ``tools`` attribute is converted from list[InputTool] to list[OutputTool] + via model_validate so that the response-side type (OutputTool) is used; MCP + tools use a subset of attributes on the output side. + + Returns: + Dict of echoed attributes. 
+ """ + data = self.model_dump(include=_ECHOED_FIELDS) + if self.tools is not None: + data["tools"] = [ + ( + OutputToolMCP.model_validate(t.model_dump()).model_dump() + if t.type == "mcp" + else t.model_dump() + ) + for t in self.tools + ] + + return data diff --git a/src/models/responses.py b/src/models/responses.py index 946a71fbf..294971f8d 100644 --- a/src/models/responses.py +++ b/src/models/responses.py @@ -1431,7 +1431,7 @@ class ResponsesResponse(AbstractSuccessfulResponse): top_p: Top-p sampling parameter used for generation. truncation: Strategy used for handling content that exceeds context limits. usage: Token usage statistics including input_tokens, output_tokens, and - total_tokens. + total_tokens. None for intermediate responses. conversation: Conversation ID linking this response to a conversation thread (LCORE-specific). available_quotas: Remaining token quotas for the user (LCORE-specific). @@ -1459,8 +1459,8 @@ class ResponsesResponse(AbstractSuccessfulResponse): tools: Optional[list[OutputTool]] = None top_p: Optional[float] = None truncation: Optional[str] = None - usage: Usage - conversation: Optional[str] = None + usage: Optional[Usage] = None + conversation: str available_quotas: dict[str, int] output_text: str diff --git a/src/utils/conversations.py b/src/utils/conversations.py index 577c3fce7..77fe9051e 100644 --- a/src/utils/conversations.py +++ b/src/utils/conversations.py @@ -2,8 +2,14 @@ import json from datetime import UTC, datetime -from typing import Any, Optional, Union, cast +from typing import Any, Optional, Sequence, Union, cast +from fastapi import HTTPException +from llama_stack_api import ( + OpenAIResponseMessage, + OpenAIResponseObject, + OpenAIResponseOutput, +) from llama_stack_api.openai_responses import ( OpenAIResponseOutputMessageFileSearchToolCall as FileSearchCall, OpenAIResponseOutputMessageFunctionToolCall as FunctionCall, @@ -11,6 +17,8 @@ OpenAIResponseOutputMessageMCPListTools as MCPListTools, 
async def add_response_to_conversation(
    client: AsyncLlamaStackClient, response_id: str, conversation_id: str
) -> None:
    """Copy the output items of an existing response into a conversation.

    Retrieves the response by ID and appends its output items to the target
    conversation (used when forking a conversation from a prior response).

    Args:
        client: The Llama Stack client (AsyncLlamaStackClient).
        response_id: The ID of the response to add.
        conversation_id: The ID of the conversation to add the response to.

    Raises:
        HTTPException: 503 when the backend is unreachable, 500 for any
            other backend API error.
    """
    try:
        retrieved = cast(
            OpenAIResponseObject, await client.responses.retrieve(response_id)
        )
        payload = [cast(Any, item.model_dump()) for item in retrieved.output]
        await client.conversations.items.create(
            conversation_id=conversation_id,
            items=payload,  # type: ignore[arg-type]
        )

    except APIConnectionError as e:
        raise HTTPException(
            **ServiceUnavailableResponse(
                backend_name="Llama Stack", cause=str(e)
            ).model_dump()
        ) from e
    except APIStatusError as e:
        raise HTTPException(**InternalServerErrorResponse.generic().model_dump()) from e
+ llm_output: Output from the LLM: a list of OpenAIResponseOutput + """ + if isinstance(user_input, str): + user_message = OpenAIResponseMessage( + role="user", + content=user_input, + ) + user_items = [user_message.model_dump()] + else: + user_items = [item.model_dump() for item in user_input] + + output_items = [item.model_dump() for item in llm_output] + + items = user_items + output_items + try: + await client.conversations.items.create( + conversation_id, + items=cast(list[Item], items), + ) + except APIConnectionError as e: + error_response = ServiceUnavailableResponse( + backend_name="Llama Stack", + cause=str(e), + ) + raise HTTPException(**error_response.model_dump()) from e + except APIStatusError as e: + error_response = InternalServerErrorResponse.generic() + raise HTTPException(**error_response.model_dump()) from e diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py index 332002eeb..0cac88547 100644 --- a/src/utils/endpoints.py +++ b/src/utils/endpoints.py @@ -10,7 +10,7 @@ from app.database import get_session from configuration import AppConfig, LogicError from log import get_logger -from models.database.conversations import UserConversation +from models.database.conversations import UserConversation, UserTurn from models.responses import ( ForbiddenResponse, InternalServerErrorResponse, @@ -179,6 +179,39 @@ def validate_and_retrieve_conversation( return user_conversation +def retrieve_turn_by_response_id(response_id: str) -> UserTurn: + """Retrieve a response's turn from the database by response ID. + + Looks up the turn that has this response_id to get its conversation. + Used for fork/previous_response_id resolution. + + Args: + response_id: The ID of the response (stored on UserTurn.response_id). + + Returns: + The UserTurn row for that response (has conversation_id). + + Raises: + HTTPException: 404 if no turn has this response_id; 500 on database error. 
+ """ + try: + with get_session() as session: + turn = session.query(UserTurn).filter_by(response_id=response_id).first() + if turn is None: + logger.error("Response %s not found in database.", response_id) + response = NotFoundResponse( + resource="response", resource_id=response_id + ) + raise HTTPException(**response.model_dump()) + return turn + except SQLAlchemyError as e: + logger.exception( + "Database error while retrieving turn by response_id %s", response_id + ) + response = InternalServerErrorResponse.database_error() + raise HTTPException(**response.model_dump()) from e + + def check_configuration_loaded(config: AppConfig) -> None: """ Raise an error if the configuration is not loaded. diff --git a/src/utils/query.py b/src/utils/query.py index fef28eea6..17dff5206 100644 --- a/src/utils/query.py +++ b/src/utils/query.py @@ -282,6 +282,7 @@ def store_query_results( # pylint: disable=too-many-arguments model_id=model_id, provider_id=provider_id, topic_summary=topic_summary, + response_id=summary.id, ) except SQLAlchemyError as e: logger.exception("Error persisting conversation details.") @@ -369,6 +370,7 @@ def persist_user_conversation_details( model_id: str, provider_id: str, topic_summary: Optional[str], + response_id: str, ) -> None: """Associate conversation to user in the database. 
@@ -380,6 +382,7 @@ def persist_user_conversation_details( model_id: The model identifier provider_id: The provider identifier topic_summary: Optional topic summary for the conversation + response_id: Response ID for the conversation """ # Normalize the conversation ID (strip 'conv_' prefix if present) normalized_id = normalize_conversation_id(conversation_id) @@ -403,6 +406,7 @@ def persist_user_conversation_details( last_used_provider=provider_id, topic_summary=topic_summary or "", message_count=1, + last_response_id=response_id, ) session.add(conversation) logger.debug( @@ -419,6 +423,7 @@ def persist_user_conversation_details( user_id, existing_conversation.message_count, ) + existing_conversation.last_response_id = response_id max_turn_number = ( session.query(func.max(UserTurn.turn_number)) @@ -433,6 +438,7 @@ def persist_user_conversation_details( completed_at=datetime.fromisoformat(completed_at), provider=provider_id, model=model_id, + response_id=response_id, ) session.add(turn) logger.debug( diff --git a/src/utils/responses.py b/src/utils/responses.py index 71cdf2899..4b0674ee9 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -7,6 +7,7 @@ from typing import Any, Optional, cast from fastapi import HTTPException +from llama_stack_api import OpenAIResponseObject from llama_stack_api.openai_responses import ( OpenAIResponseContentPartRefusal as ContentPartRefusal, OpenAIResponseInputMessageContent as InputMessageContent, @@ -976,7 +977,7 @@ async def select_model_for_responses( def build_turn_summary( - response: Optional[ResponseObject], + response: Optional[OpenAIResponseObject], model: str, vector_store_ids: Optional[list[str]] = None, rag_id_mapping: Optional[dict[str, str]] = None, @@ -998,6 +999,7 @@ def build_turn_summary( if response is None or response.output is None: return summary + summary.id = response.id # Extract text from output items summary.llm_response = extract_text_from_response_items(response.output) @@ -1108,3 
+1110,35 @@ def deduplicate_referenced_documents( seen.add(key) out.append(d) return out + + +async def create_new_conversation( + client: AsyncLlamaStackClient, +) -> str: + """Create a new conversation via the Llama Stack Conversations API. + + Calls the client to create a conversation with empty metadata and returns + the new conversation's ID. + + Args: + client: The Llama Stack client used to create the conversation. + + Returns: + The new conversation's ID (string), as returned by the API. + + Raises: + HTTPException: 503 when the backend is unreachable (APIConnectionError); + 500 on other API errors (APIStatusError). + """ + try: + conversation = await client.conversations.create(metadata={}) + return conversation.id + except APIConnectionError as e: + error_response = ServiceUnavailableResponse( + backend_name="Llama Stack", + cause=str(e), + ) + raise HTTPException(**error_response.model_dump()) from e + except APIStatusError as e: + error_response = InternalServerErrorResponse.generic() + raise HTTPException(**error_response.model_dump()) from e diff --git a/src/utils/shields.py b/src/utils/shields.py index ff99fc3b0..0f8f891d1 100644 --- a/src/utils/shields.py +++ b/src/utils/shields.py @@ -3,7 +3,10 @@ from typing import Any, Optional from fastapi import HTTPException -from llama_stack_api import OpenAIResponseContentPartRefusal, OpenAIResponseMessage +from llama_stack_api import ( + OpenAIResponseContentPartRefusal, + OpenAIResponseMessage, +) from llama_stack_client import APIConnectionError, APIStatusError, AsyncLlamaStackClient import metrics @@ -266,7 +269,6 @@ def create_refusal_response(refusal_message: str) -> OpenAIResponseMessage: """ refusal_content = OpenAIResponseContentPartRefusal(refusal=refusal_message) return OpenAIResponseMessage( - type="message", role="assistant", content=[refusal_content], ) diff --git a/src/utils/types.py b/src/utils/types.py index 220a85239..654b78608 100644 --- a/src/utils/types.py +++ b/src/utils/types.py @@ 
-222,6 +222,21 @@ class ResponsesApiParams(BaseModel): description="Extra HTTP headers to send with the request (e.g. x-llamastack-provider-data)", ) + def dump_for_create(self) -> dict[str, Any]: + """Dump params for client.responses.create() with single-context semantics. + + When previous_response_id is set, conversation is omitted so that only + one context (previous_response_id) is passed to the Responses API. + Otherwise the full dump is returned. + + Returns: + Dictionary suitable for unpacking into responses.create(). + """ + data = self.model_dump(exclude_none=True) + if self.previous_response_id: + data.pop("conversation", None) + return data + class ToolCallSummary(BaseModel): """Model representing a tool call made during response generation (for tool_calls list).""" @@ -288,6 +303,7 @@ class ReferencedDocument(BaseModel): class TurnSummary(BaseModel): """Summary of a turn in llama stack.""" + id: str = Field(default="", description="ID of the response") llm_response: str = "" tool_calls: list[ToolCallSummary] = Field(default_factory=list) tool_results: list[ToolResultSummary] = Field(default_factory=list) diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py index e39e9ec04..6aaba0220 100644 --- a/src/utils/vector_search.py +++ b/src/utils/vector_search.py @@ -10,6 +10,7 @@ from llama_stack_client import AsyncLlamaStackClient from llama_stack_client.types.query_chunks_response import Chunk +from llama_stack_api.openai_responses import OpenAIResponseMessage as ResponseMessage from pydantic import AnyUrl import constants @@ -301,3 +302,15 @@ def format_rag_context_for_injection( logger.info("Injecting %d RAG chunks into user message", len(context_chunks)) return rag_context + + +def build_message_from_static_rag(rag_context: str) -> ResponseMessage: + """Build a user message from static RAG context. + + Args: + rag_context: The RAG context string to include in the message. 
+ + Returns: + ResponseMessage: A user-role message containing the RAG context. + """ + return ResponseMessage(content=rag_context, role="user") diff --git a/tests/e2e/features/responses.feature b/tests/e2e/features/responses.feature new file mode 100644 index 000000000..e1e0ccd61 --- /dev/null +++ b/tests/e2e/features/responses.feature @@ -0,0 +1,24 @@ +@Authorized +Feature: Responses endpoint API tests + + Background: + Given The service is started locally + And REST API service prefix is /v1 + + Scenario: Check if responses endpoint returns 200 for minimal request + Given The system is in default state + And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva + When I use "responses" to ask question with authorization header + """ + {"input": "Say hello", "model": "{PROVIDER}/{MODEL}", "stream": false} + """ + Then The status code of the response is 200 + + Scenario: Check if responses endpoint returns 200 for minimal streaming request + Given The system is in default state + And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva + When I use "responses" to ask question with authorization header + """ + {"input": "Say hello", "model": "{PROVIDER}/{MODEL}", "stream": true} + """ + Then The status code of the response is 200 \ No newline at end of file diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt index 84bf75af7..b3bbddf1c 100644 --- a/tests/e2e/test_list.txt +++ b/tests/e2e/test_list.txt @@ -9,6 +9,7 @@ features/conversation_cache_v2.feature features/feedback.feature features/health.feature features/info.feature +features/responses.feature features/query.feature features/streaming_query.feature features/rest_api.feature diff --git a/tests/integration/test_openapi_json.py b/tests/integration/test_openapi_json.py index 17ff8ac66..05ccc83f8 100644 --- a/tests/integration/test_openapi_json.py +++ 
b/tests/integration/test_openapi_json.py @@ -231,6 +231,11 @@ def test_servers_section_present_from_url(spec_from_url: dict[str, Any]) -> None "post", {"200", "401", "403", "404"}, ), + ( + "/v1/responses", + "post", + {"200", "401", "403", "404", "413", "422", "429", "500", "503"}, + ), ("/v1/config", "get", {"200", "401", "403", "500"}), ("/v1/feedback", "post", {"200", "401", "403", "404", "500"}), ("/v1/feedback/status", "get", {"200"}), @@ -318,6 +323,11 @@ def test_paths_and_responses_exist_from_file( "post", {"200", "401", "403", "404"}, ), + ( + "/v1/responses", + "post", + {"200", "401", "403", "404", "413", "422", "429", "500", "503"}, + ), ("/v1/config", "get", {"200", "401", "403", "500"}), ("/v1/feedback", "post", {"200", "401", "403", "404", "500"}), ("/v1/feedback/status", "get", {"200"}), diff --git a/tests/unit/app/endpoints/test_responses.py b/tests/unit/app/endpoints/test_responses.py new file mode 100644 index 000000000..fce172978 --- /dev/null +++ b/tests/unit/app/endpoints/test_responses.py @@ -0,0 +1,1014 @@ +# pylint: disable=redefined-outer-name, too-many-locals, too-many-lines +"""Unit tests for the /responses REST API endpoint (LCORE Responses API).""" + +from datetime import UTC, datetime +from typing import Any, cast + +import pytest +from fastapi import HTTPException, Request +from fastapi.responses import StreamingResponse +from llama_stack_api import OpenAIResponseObject +from llama_stack_api.openai_responses import OpenAIResponseMessage +from llama_stack_client import APIConnectionError, APIStatusError, AsyncLlamaStackClient +from pytest_mock import MockerFixture + +from app.endpoints.responses import ( + handle_non_streaming_response, + handle_streaming_response, + responses_endpoint_handler, +) +from configuration import AppConfig +from models.config import Action +from models.database.conversations import UserConversation +from models.requests import ResponsesRequest +from models.responses import ResponsesResponse +from 
utils.types import TurnSummary + +MOCK_AUTH = ( + "00000001-0001-0001-0001-000000000001", + "mock_username", + False, + "mock_token", +) +VALID_CONV_ID = "conv_e6afd7aaa97b49ce8f4f96a801b07893d9cb784d72e53e3c" +VALID_CONV_ID_NORMALIZED = "e6afd7aaa97b49ce8f4f96a801b07893d9cb784d72e53e3c" +MODULE = "app.endpoints.responses" + + +def _patch_base(mocker: MockerFixture, config: AppConfig) -> None: + """Patch configuration and mandatory checks for responses endpoint.""" + mocker.patch(f"{MODULE}.configuration", config) + mocker.patch(f"{MODULE}.check_configuration_loaded") + mocker.patch(f"{MODULE}.check_tokens_available") + mocker.patch(f"{MODULE}.validate_model_provider_override") + mocker.patch(f"{MODULE}.prepare_tools", new=mocker.AsyncMock(return_value=None)) + + +def _patch_client(mocker: MockerFixture) -> Any: + """Patch AsyncLlamaStackClientHolder; return (mock_client, mock_holder).""" + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_vector_stores = mocker.Mock() + mock_vector_stores.list = mocker.AsyncMock(return_value=mocker.Mock(data=[])) + mock_client.vector_stores = mock_vector_stores + mock_holder = mocker.Mock() + mock_holder.get_client.return_value = mock_client + mocker.patch(f"{MODULE}.AsyncLlamaStackClientHolder", return_value=mock_holder) + return mock_client, mock_holder + + +def _patch_rag( + mocker: MockerFixture, + *, + raw_chunks: list[Any] | None = None, + scores: list[float] | None = None, + doc_ids: list[Any] | None = None, + pre_rag_chunks: list[Any] | None = None, + rag_context: str = "", +) -> None: + """Patch vector search and RAG formatting for responses endpoint.""" + # pylint: disable=too-many-arguments + if raw_chunks is None: + raw_chunks = [] + if scores is None: + scores = [] + if doc_ids is None: + doc_ids = [] + if pre_rag_chunks is None: + pre_rag_chunks = [] + mocker.patch( + f"{MODULE}.perform_vector_search", + new=mocker.AsyncMock( + return_value=(raw_chunks, scores, doc_ids, pre_rag_chunks), + ), + ) + 
mocker.patch( + f"{MODULE}.format_rag_context_for_injection", + return_value=rag_context, + ) + mocker.patch(f"{MODULE}.build_message_from_static_rag") + + +def _patch_moderation(mocker: MockerFixture, decision: str = "passed") -> Any: + """Patch run_shield_moderation; return mock moderation result.""" + mock_moderation = mocker.Mock() + mock_moderation.decision = decision + mocker.patch( + f"{MODULE}.run_shield_moderation", + new=mocker.AsyncMock(return_value=mock_moderation), + ) + return mock_moderation + + +def _make_responses_response( + *, + output_text: str = "", + conversation: str = "", + model: str = "provider/model1", + **kwargs: Any, +) -> ResponsesResponse: + """Build a minimal valid ResponsesResponse for tests.""" + defaults = { + "id": "resp_1", + "object": "response", + "created_at": 0, + "status": "completed", + "model": model, + "output": [], + "conversation": conversation, + "completed_at": 0, + "output_text": output_text, + "available_quotas": {}, + } + defaults.update(kwargs) + return ResponsesResponse(**defaults) + + +def _patch_handle_non_streaming_common( + mocker: MockerFixture, config: AppConfig +) -> None: + """Patch deps used by handle_non_streaming_response (blocked and success).""" + mocker.patch(f"{MODULE}.configuration", config) + mocker.patch(f"{MODULE}.get_available_quotas", return_value={}) + mocker.patch( + f"{MODULE}.get_topic_summary", + new=mocker.AsyncMock(return_value=None), + ) + mocker.patch(f"{MODULE}.store_query_results") + + +@pytest.fixture(name="dummy_request") +def dummy_request_fixture() -> Request: + """Minimal FastAPI Request with authorized_actions for responses endpoint.""" + req = Request(scope={"type": "http", "headers": []}) + req.state.authorized_actions = {Action.QUERY, Action.READ_OTHERS_CONVERSATIONS} + return req + + +@pytest.fixture(name="minimal_config") +def minimal_config_fixture() -> AppConfig: + """Minimal AppConfig for responses endpoint tests.""" + cfg = AppConfig() + cfg.init_from_dict( + { + 
"name": "test", + "service": {"host": "localhost", "port": 8080}, + "llama_stack": { + "api_key": "test-key", + "url": "http://test.com:1234", + "use_as_library_client": False, + }, + "user_data_collection": {}, + "authentication": {"module": "noop"}, + "authorization": {"access_rules": []}, + } + ) + return cfg + + +def _request_with_model_and_conv( + input_text: str = "Hello", model: str = "provider/model1" +) -> ResponsesRequest: + """Build request with model and conversation set (as handler does).""" + return ResponsesRequest( + input=input_text, + model=model, + conversation=VALID_CONV_ID, + ) + + +class TestResponsesEndpointHandler: + """Unit tests for responses_endpoint_handler.""" + + @pytest.mark.asyncio + async def test_successful_responses_string_input_non_streaming( + self, + dummy_request: Request, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test successful responses request with string input returns ResponsesResponse.""" + responses_request = ResponsesRequest(input="What is Kubernetes?") + _patch_base(mocker, minimal_config) + _patch_client(mocker) + mocker.patch( + f"{MODULE}.create_new_conversation", + new=mocker.AsyncMock(return_value="conv_new_123"), + ) + mocker.patch( + f"{MODULE}.select_model_for_responses", + new=mocker.AsyncMock(return_value="provider/model1"), + ) + mocker.patch( + f"{MODULE}.check_model_configured", + new=mocker.AsyncMock(return_value=True), + ) + _patch_rag(mocker) + _patch_moderation(mocker, decision="passed") + + mock_response = _make_responses_response( + output_text="Kubernetes is a container orchestration platform.", + conversation="conv_new_123", + ) + mocker.patch( + f"{MODULE}.handle_non_streaming_response", + new=mocker.AsyncMock(return_value=mock_response), + ) + + response = await responses_endpoint_handler( + request=dummy_request, + responses_request=responses_request, + auth=MOCK_AUTH, + _mcp_headers={}, + ) + assert isinstance(response, ResponsesResponse) + assert ( + 
response.output_text == "Kubernetes is a container orchestration platform." + ) + assert response.conversation == "conv_new_123" + + @pytest.mark.asyncio + async def test_responses_with_conversation_validates_and_retrieves( + self, + dummy_request: Request, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that providing conversation ID calls validate_and_retrieve_conversation.""" + responses_request = ResponsesRequest( + input="Follow-up question", + conversation=VALID_CONV_ID, + ) + _patch_base(mocker, minimal_config) + mock_user_conv = mocker.Mock(spec=UserConversation) + mock_user_conv.id = VALID_CONV_ID_NORMALIZED + mock_validate = mocker.patch( + f"{MODULE}.validate_and_retrieve_conversation", + return_value=mock_user_conv, + ) + _patch_client(mocker) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + mocker.patch( + f"{MODULE}.to_llama_stack_conversation_id", + return_value=VALID_CONV_ID, + ) + mocker.patch( + f"{MODULE}.select_model_for_responses", + new=mocker.AsyncMock(return_value="provider/model1"), + ) + mocker.patch( + f"{MODULE}.check_model_configured", + new=mocker.AsyncMock(return_value=True), + ) + _patch_rag(mocker) + _patch_moderation(mocker, decision="passed") + mocker.patch( + f"{MODULE}.handle_non_streaming_response", + new=mocker.AsyncMock( + return_value=_make_responses_response( + output_text="Answer", + conversation=VALID_CONV_ID_NORMALIZED, + ) + ), + ) + + await responses_endpoint_handler( + request=dummy_request, + responses_request=responses_request, + auth=MOCK_AUTH, + _mcp_headers={}, + ) + + mock_validate.assert_called_once() + + @pytest.mark.asyncio + async def test_responses_model_not_configured_raises_404( + self, + dummy_request: Request, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that unconfigured model leads to 404 HTTPException.""" + responses_request = ResponsesRequest(input="Hello", model="provider/unknown") + 
_patch_base(mocker, minimal_config) + _patch_client(mocker) + mocker.patch( + f"{MODULE}.create_new_conversation", + new=mocker.AsyncMock(return_value="conv_new"), + ) + mocker.patch( + f"{MODULE}.select_model_for_responses", + new=mocker.AsyncMock(return_value="provider/unknown"), + ) + mocker.patch( + f"{MODULE}.check_model_configured", + new=mocker.AsyncMock(return_value=False), + ) + mocker.patch( + f"{MODULE}.extract_provider_and_model_from_model_id", + return_value=("provider", "unknown"), + ) + + with pytest.raises(HTTPException) as exc_info: + await responses_endpoint_handler( + request=dummy_request, + responses_request=responses_request, + auth=MOCK_AUTH, + _mcp_headers={}, + ) + assert exc_info.value.status_code == 404 + + @pytest.mark.asyncio + async def test_responses_streaming_returns_streaming_response( + self, + dummy_request: Request, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that stream=True delegates to handle_streaming_response.""" + responses_request = ResponsesRequest(input="Stream this", stream=True) + _patch_base(mocker, minimal_config) + _patch_client(mocker) + mocker.patch( + f"{MODULE}.create_new_conversation", + new=mocker.AsyncMock(return_value="conv_new"), + ) + mocker.patch( + f"{MODULE}.select_model_for_responses", + new=mocker.AsyncMock(return_value="provider/model1"), + ) + mocker.patch( + f"{MODULE}.check_model_configured", + new=mocker.AsyncMock(return_value=True), + ) + _patch_rag(mocker) + _patch_moderation(mocker, decision="passed") + mock_streaming = mocker.Mock(spec=StreamingResponse) + mocker.patch( + f"{MODULE}.handle_streaming_response", + new=mocker.AsyncMock(return_value=mock_streaming), + ) + + response = await responses_endpoint_handler( + request=dummy_request, + responses_request=responses_request, + auth=MOCK_AUTH, + _mcp_headers={}, + ) + assert response is mock_streaming + + @pytest.mark.asyncio + async def test_responses_azure_token_refresh( + self, + dummy_request: Request, + 
minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that Azure token refresh is called when model starts with azure.""" + responses_request = ResponsesRequest(input="Hi", model="azure/some-model") + _patch_base(mocker, minimal_config) + _patch_client(mocker) + mocker.patch( + f"{MODULE}.create_new_conversation", + new=mocker.AsyncMock(return_value="conv_new"), + ) + mocker.patch( + f"{MODULE}.select_model_for_responses", + new=mocker.AsyncMock(return_value="azure/some-model"), + ) + mocker.patch( + f"{MODULE}.check_model_configured", + new=mocker.AsyncMock(return_value=True), + ) + mock_azure = mocker.Mock() + mock_azure.is_entra_id_configured = True + mock_azure.is_token_expired = True + mock_azure.refresh_token.return_value = True + mocker.patch(f"{MODULE}.AzureEntraIDManager", return_value=mock_azure) + updated_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_update_token = mocker.patch( + f"{MODULE}.update_azure_token", + new=mocker.AsyncMock(return_value=updated_client), + ) + _patch_rag(mocker) + _patch_moderation(mocker, decision="passed") + mocker.patch( + f"{MODULE}.handle_non_streaming_response", + new=mocker.AsyncMock( + return_value=_make_responses_response( + output_text="Ok", + conversation="conv_new", + model="azure/some-model", + ) + ), + ) + + await responses_endpoint_handler( + request=dummy_request, + responses_request=responses_request, + auth=MOCK_AUTH, + _mcp_headers={}, + ) + mock_update_token.assert_called_once() + + @pytest.mark.asyncio + async def test_responses_structured_input_appends_rag_message( + self, + dummy_request: Request, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that non-string input uses extract_text and appends RAG message.""" + structured_input: list[Any] = [ + OpenAIResponseMessage(role="user", content="What is K8s?"), + ] + responses_request = ResponsesRequest( + input=cast(Any, structured_input), + ) + _patch_base(mocker, minimal_config) + 
_patch_client(mocker) + mocker.patch( + f"{MODULE}.create_new_conversation", + new=mocker.AsyncMock(return_value="conv_new"), + ) + mocker.patch( + f"{MODULE}.select_model_for_responses", + new=mocker.AsyncMock(return_value="provider/model1"), + ) + mocker.patch( + f"{MODULE}.check_model_configured", + new=mocker.AsyncMock(return_value=True), + ) + _patch_rag(mocker, rag_context="\n\nRelevant documentation:\nDoc1") + mock_build_message = mocker.patch(f"{MODULE}.build_message_from_static_rag") + _patch_moderation(mocker, decision="passed") + mocker.patch( + f"{MODULE}.handle_non_streaming_response", + new=mocker.AsyncMock( + return_value=_make_responses_response( + output_text="K8s is Kubernetes.", + conversation="conv_new", + ) + ), + ) + + await responses_endpoint_handler( + request=dummy_request, + responses_request=responses_request, + auth=MOCK_AUTH, + _mcp_headers={}, + ) + + mock_build_message.assert_called_once_with("\n\nRelevant documentation:\nDoc1") + + @pytest.mark.asyncio + async def test_responses_blocked_with_conversation_appends_refusal( + self, + dummy_request: Request, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Blocked moderation with conversation calls append_turn_items_to_conversation.""" + responses_request = ResponsesRequest( + input="Bad", + conversation=VALID_CONV_ID, + stream=False, + model="provider/model1", + ) + _patch_base(mocker, minimal_config) + mock_user_conv = mocker.Mock(spec=UserConversation) + mock_user_conv.id = VALID_CONV_ID_NORMALIZED + mocker.patch( + f"{MODULE}.validate_and_retrieve_conversation", + return_value=mock_user_conv, + ) + _patch_client(mocker) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + mocker.patch( + f"{MODULE}.to_llama_stack_conversation_id", + return_value=VALID_CONV_ID, + ) + mocker.patch( + f"{MODULE}.select_model_for_responses", + new=mocker.AsyncMock(return_value="provider/model1"), + ) + mocker.patch( + 
f"{MODULE}.check_model_configured", + new=mocker.AsyncMock(return_value=True), + ) + _patch_rag(mocker) + mock_moderation = _patch_moderation(mocker, decision="blocked") + mock_moderation.message = "Blocked" + mock_moderation.moderation_id = "resp_blocked_123" + mock_moderation.refusal_response = OpenAIResponseMessage( + type="message", role="assistant", content="Blocked" + ) + mock_append = mocker.patch( + f"{MODULE}.append_turn_items_to_conversation", + new=mocker.AsyncMock(), + ) + # Let real handle_non_streaming_response run (stream=False) so it calls + # append_turn_items_to_conversation + mocker.patch(f"{MODULE}.store_query_results") + # Response build may omit model from model_dump(); ensure it is present for validation + real_validate = ResponsesResponse.model_validate + + def _validate_with_model(data: dict) -> Any: + if "model" not in data: + data = {**data, "model": "provider/model1"} + return real_validate(data) + + mocker.patch.object( + ResponsesResponse, + "model_validate", + side_effect=_validate_with_model, + ) + + await responses_endpoint_handler( + request=dummy_request, + responses_request=responses_request, + auth=MOCK_AUTH, + _mcp_headers={}, + ) + + mock_append.assert_called_once() + + +class TestHandleNonStreamingResponse: + """Unit tests for handle_non_streaming_response.""" + + @pytest.mark.asyncio + async def test_handle_non_streaming_blocked_returns_refusal( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that blocked moderation returns response with refusal message.""" + request = _request_with_model_and_conv("Bad input") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_moderation = mocker.Mock() + mock_moderation.decision = "blocked" + mock_moderation.message = "Content blocked" + mock_refusal = mocker.Mock(spec=OpenAIResponseMessage) + mock_refusal.type = "message" + mock_refusal.role = "assistant" + mock_refusal.content = "Content blocked" + mock_moderation.refusal_response = 
mock_refusal + + _patch_handle_non_streaming_common(mocker, minimal_config) + mock_client.conversations.items.create = mocker.AsyncMock() + mock_api_response = mocker.Mock() + mock_api_response.output = [mock_refusal] + mock_api_response.model_dump.return_value = { + "id": "resp_blocked", + "object": "response", + "created_at": 0, + "status": "completed", + "model": "provider/model1", + "output": [mock_refusal], + "usage": {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}, + } + mocker.patch( + f"{MODULE}.OpenAIResponseObject.model_construct", + return_value=mock_api_response, + ) + + response = await handle_non_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Bad input", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + assert isinstance(response, ResponsesResponse) + assert response.output_text == "Content blocked" + mock_client.responses.create.assert_not_called() + + @pytest.mark.asyncio + async def test_handle_non_streaming_success_returns_response( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test successful handle_non_streaming_response returns ResponsesResponse.""" + request = _request_with_model_and_conv("Hello") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_moderation = mocker.Mock() + mock_moderation.decision = "passed" + + mock_api_response = mocker.Mock(spec=OpenAIResponseObject) + mock_api_response.output = [] + mock_api_response.usage = mocker.Mock( + input_tokens=1, output_tokens=2, total_tokens=3 + ) + mock_api_response.model_dump.return_value = { + "id": "resp_1", + "object": "response", + "created_at": 0, + "status": "completed", + "model": "provider/model1", + "output": [], + "usage": { + "input_tokens": 1, + "output_tokens": 2, + "total_tokens": 3, + }, + } + mock_client.responses.create = mocker.AsyncMock(return_value=mock_api_response) + + _patch_handle_non_streaming_common(mocker, 
minimal_config) + mocker.patch( + f"{MODULE}.extract_token_usage", + return_value=mocker.Mock(input_tokens=1, output_tokens=2), + ) + mocker.patch(f"{MODULE}.consume_query_tokens") + mocker.patch( + f"{MODULE}.build_turn_summary", + return_value=mocker.Mock(referenced_documents=[]), + ) + mocker.patch( + f"{MODULE}.extract_text_from_response_items", + return_value="Model reply", + ) + mocker.patch( + f"{MODULE}.extract_vector_store_ids_from_tools", + return_value=[], + ) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + + response = await handle_non_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Hello", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + + assert isinstance(response, ResponsesResponse) + assert response.output_text == "Model reply" + mock_client.responses.create.assert_called_once() + + @pytest.mark.asyncio + async def test_handle_non_streaming_context_length_raises_413( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that RuntimeError with context_length raises 413.""" + request = _request_with_model_and_conv("Long input") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_client.responses.create = mocker.AsyncMock( + side_effect=RuntimeError("context_length exceeded") + ) + mock_moderation = mocker.Mock() + mock_moderation.decision = "passed" + + _patch_handle_non_streaming_common(mocker, minimal_config) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + + with pytest.raises(HTTPException) as exc_info: + await handle_non_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Long input", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + + assert exc_info.value.status_code == 413 + + @pytest.mark.asyncio + async def 
test_handle_non_streaming_connection_error_raises_503( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that APIConnectionError raises 503.""" + request = _request_with_model_and_conv("Hi") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_client.responses.create = mocker.AsyncMock( + side_effect=APIConnectionError( + message="Connection failed", + request=mocker.Mock(), + ) + ) + mock_moderation = mocker.Mock() + mock_moderation.decision = "passed" + + _patch_handle_non_streaming_common(mocker, minimal_config) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + + with pytest.raises(HTTPException) as exc_info: + await handle_non_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Hi", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + + assert exc_info.value.status_code == 503 + + @pytest.mark.asyncio + async def test_handle_non_streaming_api_status_error_raises_http( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that APIStatusError is handled and re-raised as HTTPException.""" + request = _request_with_model_and_conv("Hi") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_client.responses.create = mocker.AsyncMock( + side_effect=APIStatusError( + message="API error", + response=mocker.Mock(request=None), + body=None, + ) + ) + mock_moderation = mocker.Mock() + mock_moderation.decision = "passed" + + _patch_handle_non_streaming_common(mocker, minimal_config) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + mocker.patch( + f"{MODULE}.handle_known_apistatus_errors", + return_value=mocker.Mock( + model_dump=lambda: { + "status_code": 500, + "detail": {"response": "Error", "cause": "API error"}, + } + ), + ) + + with pytest.raises(HTTPException) as exc_info: + await 
handle_non_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Hi", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + + assert exc_info.value.status_code == 500 + + @pytest.mark.asyncio + async def test_handle_non_streaming_runtime_error_without_context_reraises( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test that RuntimeError without context_length is re-raised.""" + request = _request_with_model_and_conv("Hi") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_client.responses.create = mocker.AsyncMock( + side_effect=RuntimeError("Some other error") + ) + mock_moderation = mocker.Mock() + mock_moderation.decision = "passed" + + _patch_handle_non_streaming_common(mocker, minimal_config) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + + with pytest.raises(RuntimeError, match="Some other error"): + await handle_non_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Hi", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + + +class TestHandleStreamingResponse: + """Unit tests for handle_streaming_response and streaming generators.""" + + @pytest.mark.asyncio + async def test_handle_streaming_blocked_returns_sse_consumes_shield_generator( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test streaming with blocked moderation yields SSE from shield_violation_generator.""" + request = _request_with_model_and_conv("Bad", model="provider/model1") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_moderation = mocker.Mock() + mock_moderation.decision = "blocked" + mock_moderation.message = "Blocked" + mock_moderation.moderation_id = "mod_123" + mock_refusal = OpenAIResponseMessage( + role="assistant", content="Blocked", type="message" + ) + 
mock_moderation.refusal_response = mock_refusal + + mocker.patch(f"{MODULE}.configuration", minimal_config) + mocker.patch(f"{MODULE}.get_available_quotas", return_value={}) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + mocker.patch( + f"{MODULE}.get_topic_summary", + new=mocker.AsyncMock(return_value=None), + ) + mocker.patch(f"{MODULE}.store_query_results") + + mock_client.conversations.items.create = mocker.AsyncMock() + response = await handle_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Bad", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + + assert isinstance(response, StreamingResponse) + assert response.media_type == "text/event-stream" + collected: list[str] = [] + async for part in response.body_iterator: + chunk_str = ( + part.decode("utf-8") + if isinstance(part, bytes) + else (part if isinstance(part, str) else bytes(part).decode("utf-8")) + ) + collected.append(chunk_str) + body = "".join(collected) + assert "event: response.created" in body + assert "event: response.output_item.added" in body + assert "event: response.output_item.done" in body + assert "event: response.completed" in body + assert "[DONE]" in body + mock_client.responses.create.assert_not_called() + + @pytest.mark.asyncio + async def test_handle_streaming_success_returns_sse_consumes_response_generator( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test streaming with passed moderation yields SSE from response_generator.""" + request = _request_with_model_and_conv("Hi", model="provider/model1") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_moderation = mocker.Mock() + mock_moderation.decision = "passed" + + mock_chunk = mocker.Mock() + mock_chunk.type = "response.completed" + mock_chunk.response = mocker.Mock() + mock_chunk.response.usage = mocker.Mock( + input_tokens=1, 
output_tokens=2, total_tokens=3 + ) + mock_chunk.model_dump.return_value = { + "type": "response.completed", + "response": {"id": "r1", "usage": {"input_tokens": 1}}, + } + + async def mock_stream() -> Any: + yield mock_chunk + + mock_client.responses.create = mocker.AsyncMock(return_value=mock_stream()) + + mocker.patch(f"{MODULE}.configuration", minimal_config) + mocker.patch(f"{MODULE}.get_available_quotas", return_value={}) + mocker.patch(f"{MODULE}.extract_token_usage", return_value=mocker.Mock()) + mocker.patch(f"{MODULE}.consume_query_tokens") + mocker.patch(f"{MODULE}.extract_vector_store_ids_from_tools", return_value=[]) + mocker.patch( + f"{MODULE}.build_turn_summary", + return_value=TurnSummary(referenced_documents=[]), + ) + mocker.patch( + f"{MODULE}.get_topic_summary", + new=mocker.AsyncMock(return_value=None), + ) + mocker.patch(f"{MODULE}.store_query_results") + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + mock_holder = mocker.Mock() + mock_holder.get_client.return_value = mock_client + mocker.patch(f"{MODULE}.AsyncLlamaStackClientHolder", return_value=mock_holder) + response = await handle_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Hi", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + assert isinstance(response, StreamingResponse) + collected = [] + async for part in response.body_iterator: + chunk_str = ( + part.decode("utf-8") + if isinstance(part, bytes) + else (part if isinstance(part, str) else bytes(part).decode("utf-8")) + ) + collected.append(chunk_str) + body = "".join(collected) + assert "response.completed" in body or "event:" in body + assert "[DONE]" in body + mock_client.responses.create.assert_called_once() + + @pytest.mark.asyncio + async def test_handle_streaming_context_length_raises_413( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test 
streaming raises 413 when create raises RuntimeError context_length.""" + request = _request_with_model_and_conv("Long", model="provider/model1") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_client.responses.create = mocker.AsyncMock( + side_effect=RuntimeError("context_length exceeded") + ) + mock_moderation = mocker.Mock() + mock_moderation.decision = "passed" + + mocker.patch(f"{MODULE}.configuration", minimal_config) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + with pytest.raises(HTTPException) as exc_info: + await handle_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Long", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + assert exc_info.value.status_code == 413 + + @pytest.mark.asyncio + async def test_handle_streaming_connection_error_raises_503( + self, + minimal_config: AppConfig, + mocker: MockerFixture, + ) -> None: + """Test streaming raises 503 when create raises APIConnectionError.""" + request = _request_with_model_and_conv("Hi", model="provider/model1") + mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) + mock_client.responses.create = mocker.AsyncMock( + side_effect=APIConnectionError( + message="Connection failed", + request=mocker.Mock(), + ) + ) + mock_moderation = mocker.Mock() + mock_moderation.decision = "passed" + + mocker.patch(f"{MODULE}.configuration", minimal_config) + mocker.patch( + f"{MODULE}.normalize_conversation_id", + return_value=VALID_CONV_ID_NORMALIZED, + ) + with pytest.raises(HTTPException) as exc_info: + await handle_streaming_response( + client=mock_client, + request=request, + auth=MOCK_AUTH, + input_text="Hi", + started_at=datetime.now(UTC), + moderation_result=mock_moderation, + static_rag_docs=[], + ) + + assert exc_info.value.status_code == 503 diff --git a/tests/unit/app/test_routers.py b/tests/unit/app/test_routers.py index 
d29f7d026..503e7616b 100644 --- a/tests/unit/app/test_routers.py +++ b/tests/unit/app/test_routers.py @@ -27,6 +27,7 @@ rlsapi_v1, a2a, query, + responses, ) # noqa:E402 @@ -52,7 +53,7 @@ def include_router( # pylint: disable=too-many-arguments prefix: str = "", tags: Optional[list] = None, dependencies: Optional[Sequence] = None, - responses: Optional[dict] = None, + responses: Optional[dict] = None, # pylint: disable=redefined-outer-name deprecated: Optional[bool] = None, include_in_schema: Optional[bool] = None, default_response_class: Optional[Any] = None, @@ -107,7 +108,7 @@ def test_include_routers() -> None: include_routers(app) # are all routers added? - assert len(app.routers) == 20 + assert len(app.routers) == 21 assert root.router in app.get_routers() assert info.router in app.get_routers() assert models.router in app.get_routers() @@ -128,6 +129,7 @@ def test_include_routers() -> None: assert rlsapi_v1.router in app.get_routers() assert a2a.router in app.get_routers() assert stream_interrupt.router in app.get_routers() + assert responses.router in app.get_routers() def test_check_prefixes() -> None: @@ -135,7 +137,7 @@ def test_check_prefixes() -> None: Verify that include_routers registers the expected routers with their configured URL prefixes. - Asserts that 16 routers are registered on a MockFastAPI instance and that + Asserts that 21 routers are registered on a MockFastAPI instance and that each router's prefix matches the expected value (e.g., root, health, authorized, metrics use an empty prefix; most API routers use "/v1"; conversations_v2 uses "/v2"). @@ -144,7 +146,7 @@ def test_check_prefixes() -> None: include_routers(app) # are all routers added? 
- assert len(app.routers) == 20 + assert len(app.routers) == 21 assert app.get_router_prefix(root.router) == "" assert app.get_router_prefix(info.router) == "/v1" assert app.get_router_prefix(models.router) == "/v1" @@ -166,3 +168,4 @@ def test_check_prefixes() -> None: assert app.get_router_prefix(rlsapi_v1.router) == "/v1" assert app.get_router_prefix(a2a.router) == "" assert app.get_router_prefix(stream_interrupt.router) == "/v1" + assert app.get_router_prefix(responses.router) == "/v1" diff --git a/tests/unit/utils/test_conversations.py b/tests/unit/utils/test_conversations.py index e4120f145..e731e0ac4 100644 --- a/tests/unit/utils/test_conversations.py +++ b/tests/unit/utils/test_conversations.py @@ -3,6 +3,7 @@ from datetime import datetime, UTC from typing import Any +from llama_stack_api import OpenAIResponseMessage import pytest from pytest_mock import MockerFixture @@ -11,6 +12,7 @@ from utils.conversations import ( _build_tool_call_summary_from_item, _extract_text_from_content, + append_turn_items_to_conversation, build_conversation_turns_from_items, ) from utils.types import ToolCallSummary @@ -720,3 +722,37 @@ def test_legacy_conversation_without_metadata(self, mocker: MockerFixture) -> No # Timestamps should match conversation start time assert turn.started_at == "2024-01-01T10:00:00Z" assert turn.completed_at == "2024-01-01T10:00:00Z" + + +class TestAppendTurnItemsToConversation: # pylint: disable=too-few-public-methods + """Tests for append_turn_items_to_conversation function.""" + + @pytest.mark.asyncio + async def test_appends_user_input_and_llm_output( + self, mocker: MockerFixture + ) -> None: + """Test that append_turn_items_to_conversation creates conversation items correctly.""" + mock_client = mocker.Mock() + mock_client.conversations.items.create = mocker.AsyncMock(return_value=None) + assistant_msg = OpenAIResponseMessage( + type="message", + role="assistant", + content="I cannot help with that", + ) + + await 
append_turn_items_to_conversation( + mock_client, + conversation_id="conv-123", + user_input="Hello", + llm_output=[assistant_msg], + ) + + mock_client.conversations.items.create.assert_called_once() + call_args = mock_client.conversations.items.create.call_args + assert call_args[0][0] == "conv-123" + items = call_args[1]["items"] + assert len(items) == 2 + assert items[0]["type"] == "message" and items[0]["role"] == "user" + assert items[0]["content"] == "Hello" + assert items[1]["type"] == "message" and items[1]["role"] == "assistant" + assert items[1]["content"] == "I cannot help with that" diff --git a/tests/unit/utils/test_query.py b/tests/unit/utils/test_query.py index 867a5c6c6..c2d3a29ce 100644 --- a/tests/unit/utils/test_query.py +++ b/tests/unit/utils/test_query.py @@ -407,6 +407,7 @@ def query_side_effect(*args: Any) -> Any: model_id="model1", provider_id="provider1", topic_summary="Topic", + response_id="resp_1", ) mock_session.add.assert_called() @@ -454,6 +455,7 @@ def query_side_effect(*args: Any) -> Any: model_id="new_model", provider_id="new_provider", topic_summary=None, + response_id="resp_1", ) assert existing_conv.last_used_model == "new_model" @@ -497,11 +499,12 @@ def query_side_effect(*args: Any) -> Any: model_id="model1", provider_id="provider1", topic_summary="Topic", + response_id="resp_1", ) # Verify that the turn number is incremented correctly add_calls = mock_session.add.call_args_list - assert len(add_calls) == 2 # Conversation and UserTurn + assert len(add_calls) == 2 # Conversation and UserTurn (response_id on turn) # Find the UserTurn object in the add calls turn_added = None diff --git a/tests/unit/utils/test_shields.py b/tests/unit/utils/test_shields.py index 55ee56886..b199300f4 100644 --- a/tests/unit/utils/test_shields.py +++ b/tests/unit/utils/test_shields.py @@ -1,12 +1,16 @@ """Unit tests for utils/shields.py functions.""" +from llama_stack_api import OpenAIResponseMessage +from llama_stack_client import 
APIConnectionError, APIStatusError import pytest from fastapi import HTTPException, status from pytest_mock import MockerFixture +from utils.conversations import append_turn_items_to_conversation from utils.shields import ( DEFAULT_VIOLATION_MESSAGE, append_turn_to_conversation, + create_refusal_response, detect_shield_violations, get_available_shields, run_shield_moderation, @@ -518,3 +522,131 @@ def test_raises_422_when_empty_list_shield_ids_and_override_disabled( validate_shield_ids_override(query_request, mock_config) assert exc_info.value.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + +class TestAppendTurnItemsRefusalCases: + """Tests for append_turn_items_to_conversation (refusal / single output item).""" + + @pytest.mark.asyncio + async def test_appends_string_input_and_refusal( + self, mocker: MockerFixture + ) -> None: + """When user_input is a string, wraps it in a user message and appends refusal.""" + mock_client = mocker.Mock() + mock_client.conversations.items.create = mocker.AsyncMock(return_value=None) + refusal = create_refusal_response("Blocked by policy") + + await append_turn_items_to_conversation( + mock_client, + conversation_id="conv-456", + user_input="user text", + llm_output=[refusal], + ) + + mock_client.conversations.items.create.assert_called_once() + call_args = mock_client.conversations.items.create.call_args + assert call_args[0][0] == "conv-456" + items = call_args[1]["items"] + assert len(items) == 2 + assert items[0]["type"] == "message" + assert items[0]["role"] == "user" + assert items[0]["content"] == "user text" + assert items[1]["role"] == "assistant" + assert items[1]["content"][0]["refusal"] == "Blocked by policy" + + @pytest.mark.asyncio + async def test_appends_list_input_and_refusal(self, mocker: MockerFixture) -> None: + """When user_input is a list of items, dumps each and appends refusal.""" + mock_client = mocker.Mock() + mock_client.conversations.items.create = mocker.AsyncMock(return_value=None) + 
user_item = OpenAIResponseMessage( + type="message", + role="user", + content="multi part", + ) + refusal = create_refusal_response("Refused") + + await append_turn_items_to_conversation( + mock_client, + conversation_id="conv-789", + user_input=[user_item], + llm_output=[refusal], + ) + + mock_client.conversations.items.create.assert_called_once() + call_args = mock_client.conversations.items.create.call_args + assert call_args[0][0] == "conv-789" + items = call_args[1]["items"] + assert len(items) == 2 + assert items[0]["role"] == "user" + assert items[0]["content"] == "multi part" + assert items[1]["role"] == "assistant" + + @pytest.mark.asyncio + async def test_appends_list_llm_output(self, mocker: MockerFixture) -> None: + """When llm_output is a list of items, all are appended after user input.""" + mock_client = mocker.Mock() + mock_client.conversations.items.create = mocker.AsyncMock(return_value=None) + msg1 = OpenAIResponseMessage(type="message", role="assistant", content="First") + msg2 = OpenAIResponseMessage(type="message", role="assistant", content="Second") + + await append_turn_items_to_conversation( + mock_client, + conversation_id="conv-list", + user_input="user", + llm_output=[msg1, msg2], + ) + + mock_client.conversations.items.create.assert_called_once() + items = mock_client.conversations.items.create.call_args[1]["items"] + assert len(items) == 3 + assert items[0]["role"] == "user" + assert items[0]["content"] == "user" + assert items[1]["content"] == "First" + assert items[2]["content"] == "Second" + + @pytest.mark.asyncio + async def test_raises_http_exception_on_api_connection_error( + self, mocker: MockerFixture + ) -> None: + """APIConnectionError is converted to HTTPException with 503.""" + mock_client = mocker.Mock() + mock_client.conversations.items.create = mocker.AsyncMock( + side_effect=APIConnectionError(request=mocker.Mock()) + ) + refusal = create_refusal_response("Blocked") + + with pytest.raises(HTTPException) as exc_info: + 
await append_turn_items_to_conversation( + mock_client, + conversation_id="conv-err", + user_input="input", + llm_output=[refusal], + ) + + assert exc_info.value.status_code == status.HTTP_503_SERVICE_UNAVAILABLE + + @pytest.mark.asyncio + async def test_raises_http_exception_on_api_status_error( + self, mocker: MockerFixture + ) -> None: + """APIStatusError is converted to HTTPException with 500.""" + mock_client = mocker.Mock() + mock_client.conversations.items.create = mocker.AsyncMock( + side_effect=APIStatusError( + message="server error", + response=mocker.Mock(request=None), + body=None, + ) + ) + refusal = create_refusal_response("Blocked") + + with pytest.raises(HTTPException) as exc_info: + await append_turn_items_to_conversation( + mock_client, + conversation_id="conv-err", + user_input="input", + llm_output=[refusal], + ) + + assert exc_info.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR