14
14
# limitations under the License.
15
15
16
16
17
+ import asyncio
18
+ import logging
17
19
import time
18
20
from typing import Annotated , Optional , Union
19
21
20
- from fastapi import Depends , FastAPI , HTTPException
22
+ from fastapi import Depends , FastAPI , HTTPException , Request , Response
21
23
22
24
from nemoguardrails .benchmark .mock_llm_server .config import AppModelConfig , get_config
23
25
from nemoguardrails .benchmark .mock_llm_server .models import (
35
37
from nemoguardrails .benchmark .mock_llm_server .response_data import (
36
38
calculate_tokens ,
37
39
generate_id ,
40
+ get_latency_seconds ,
38
41
get_response ,
39
42
)
40
43
44
# Module-level logger for this server, emitting to the console.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)  # TODO Control this from the CLI args

# Formatter defining the log message layout (timestamp, level, message).
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)

# Console handler to print logs to the console.
# INFO and higher will go to the console (handler level matches the logger).
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

# Attach the console handler, guarding against duplicate handlers (and thus
# duplicated log lines) if this module's setup code runs more than once.
if not log.handlers:
    log.addHandler(console_handler)
61
+
41
62
# FastAPI dependency alias: injects the application's model configuration
# (resolved via get_config) into route handlers that declare this type.
ModelConfigDep = Annotated[AppModelConfig, Depends(get_config)]
42
63
43
64
@@ -60,6 +81,24 @@ def _validate_request_model(
60
81
)
61
82
62
83
84
@app.middleware("http")
async def log_http_duration(request: Request, call_next):
    """
    Middleware to log incoming requests and their responses.

    Logs the response status code and the wall time taken to produce the
    response, at INFO level.
    """
    # Use a monotonic clock for the duration: time.time() follows the system
    # clock and can jump (NTP adjustments, DST), yielding wrong or negative
    # durations. perf_counter() is monotonic and high-resolution.
    start = time.perf_counter()
    response = await call_next(request)
    duration_seconds = time.perf_counter() - start

    log.info(
        "Request finished: %s, took %.3f seconds",
        response.status_code,
        duration_seconds,
    )
    return response
100
+
101
+
63
102
@app .get ("/" )
64
103
async def root (config : ModelConfigDep ):
65
104
"""Root endpoint with basic server information."""
@@ -75,22 +114,30 @@ async def root(config: ModelConfigDep):
75
114
@app.get("/v1/models", response_model=ModelsResponse)
async def list_models(config: ModelConfigDep):
    """List available models."""
    log.debug("/v1/models request")

    # The mock server exposes exactly one model: the configured one.
    available = [
        Model(
            id=config.model,
            object="model",
            created=int(time.time()),
            owned_by="system",
        )
    ]
    response = ModelsResponse(object="list", data=available)
    log.debug("/v1/models response: %s", response)
    return response
82
125
83
126
84
127
@app .post ("/v1/chat/completions" , response_model = ChatCompletionResponse )
85
128
async def chat_completions (
86
129
request : ChatCompletionRequest , config : ModelConfigDep
87
130
) -> ChatCompletionResponse :
88
131
"""Create a chat completion."""
132
+
133
+ log .debug ("/v1/chat/completions request: %s" , request )
134
+
89
135
# Validate model exists
90
136
_validate_request_model (config , request )
91
137
92
138
# Generate dummy response
93
139
response_content = get_response (config )
140
+ response_latency_seconds = get_latency_seconds (config , seed = 12345 )
94
141
95
142
# Calculate token usage
96
143
prompt_text = " " .join ([msg .content for msg in request .messages ])
@@ -122,7 +169,8 @@ async def chat_completions(
122
169
total_tokens = prompt_tokens + completion_tokens ,
123
170
),
124
171
)
125
-
172
+ await asyncio .sleep (response_latency_seconds )
173
+ log .debug ("/v1/chat/completions response: %s" , response )
126
174
return response
127
175
128
176
@@ -132,6 +180,8 @@ async def completions(
132
180
) -> CompletionResponse :
133
181
"""Create a text completion."""
134
182
183
+ log .debug ("/v1/completions request: %s" , request )
184
+
135
185
# Validate model exists
136
186
_validate_request_model (config , request )
137
187
@@ -143,6 +193,7 @@ async def completions(
143
193
144
194
# Generate dummy response
145
195
response_text = get_response (config )
196
+ response_latency_seconds = get_latency_seconds (config , seed = 12345 )
146
197
147
198
# Calculate token usage
148
199
prompt_tokens = calculate_tokens (prompt_text )
@@ -171,10 +222,16 @@ async def completions(
171
222
total_tokens = prompt_tokens + completion_tokens ,
172
223
),
173
224
)
225
+
226
+ await asyncio .sleep (response_latency_seconds )
227
+ log .debug ("/v1/completions response: %s" , response )
174
228
return response
175
229
176
230
177
231
@app.get("/health")
async def health_check():
    """Health check endpoint."""
    log.debug("/health request")
    # Report liveness along with the current Unix timestamp.
    payload = {"status": "healthy", "timestamp": int(time.time())}
    log.debug("/health response: %s", payload)
    return payload
0 commit comments