14
14
# limitations under the License.
15
15
16
16
17
+ import asyncio
18
+ import logging
17
19
import time
18
20
from typing import Annotated , Optional , Union
19
21
20
- from fastapi import Depends , FastAPI , HTTPException
22
+ from fastapi import Depends , FastAPI , HTTPException , Request , Response
21
23
22
24
from nemoguardrails .benchmark .mock_llm_server .config import AppModelConfig , get_config
23
25
from nemoguardrails .benchmark .mock_llm_server .models import (
35
37
from nemoguardrails .benchmark .mock_llm_server .response_data import (
36
38
calculate_tokens ,
37
39
generate_id ,
40
+ get_latency_seconds ,
38
41
get_response ,
39
42
)
40
43
44
# Module-level logger for this server, emitting to the console.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)  # TODO Control this from the CLI args

# Formatter defining the log message layout (timestamp, level, message).
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)

# Console handler to print logs to the console.
# INFO and higher will go to the console (handler level matches the logger).
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

# Attach the console handler, guarding against duplicate handlers (and thus
# duplicated log lines) if this module's setup code runs more than once.
if not log.handlers:
    log.addHandler(console_handler)
61
+
41
62
# FastAPI dependency alias: injects the application's model configuration
# (resolved via get_config) into route handlers that declare this type.
ModelConfigDep = Annotated[AppModelConfig, Depends(get_config)]
42
63
43
64
@@ -60,6 +81,24 @@ def _validate_request_model(
60
81
)
61
82
62
83
84
@app.middleware("http")
async def log_http_duration(request: Request, call_next):
    """
    Middleware to log incoming requests and their responses.

    Logs the response status code and the wall time taken to produce the
    response, at INFO level.
    """
    # Use a monotonic clock for the duration: time.time() follows the system
    # clock and can jump (NTP adjustments, DST), yielding wrong or negative
    # durations. perf_counter() is monotonic and high-resolution.
    start = time.perf_counter()
    response = await call_next(request)
    duration_seconds = time.perf_counter() - start

    log.info(
        "Request finished: %s, took %.3f seconds",
        response.status_code,
        duration_seconds,
    )
    return response
100
+
101
+
63
102
@app .get ("/" )
64
103
async def root (config : ModelConfigDep ):
65
104
"""Root endpoint with basic server information."""
@@ -75,22 +114,30 @@ async def root(config: ModelConfigDep):
75
114
@app.get("/v1/models", response_model=ModelsResponse)
async def list_models(config: ModelConfigDep):
    """List available models."""
    log.debug("/v1/models request")

    # The mock server exposes exactly one model: the configured one.
    available = [
        Model(
            id=config.model,
            object="model",
            created=int(time.time()),
            owned_by="system",
        )
    ]
    response = ModelsResponse(object="list", data=available)
    log.debug("/v1/models response: %s", response)
    return response
82
125
83
126
84
127
@app .post ("/v1/chat/completions" , response_model = ChatCompletionResponse )
85
128
async def chat_completions (
86
129
request : ChatCompletionRequest , config : ModelConfigDep
87
130
) -> ChatCompletionResponse :
88
131
"""Create a chat completion."""
132
+
133
+ log .debug ("/v1/chat/completions request: %s" , request )
134
+
89
135
# Validate model exists
90
136
_validate_request_model (config , request )
91
137
92
138
# Generate dummy response
93
139
response_content = get_response (config )
140
+ response_latency_seconds = get_latency_seconds (config , seed = 12345 )
94
141
95
142
# Calculate token usage
96
143
prompt_text = " " .join ([msg .content for msg in request .messages ])
@@ -122,7 +169,8 @@ async def chat_completions(
122
169
total_tokens = prompt_tokens + completion_tokens ,
123
170
),
124
171
)
125
-
172
+ await asyncio .sleep (response_latency_seconds )
173
+ log .debug ("/v1/chat/completions response: %s" , response )
126
174
return response
127
175
128
176
@@ -132,6 +180,8 @@ async def completions(
132
180
) -> CompletionResponse :
133
181
"""Create a text completion."""
134
182
183
+ log .debug ("/v1/completions request: %s" , request )
184
+
135
185
# Validate model exists
136
186
_validate_request_model (config , request )
137
187
@@ -143,6 +193,7 @@ async def completions(
143
193
144
194
# Generate dummy response
145
195
response_text = get_response (config )
196
+ response_latency_seconds = get_latency_seconds (config , seed = 12345 )
146
197
147
198
# Calculate token usage
148
199
prompt_tokens = calculate_tokens (prompt_text )
@@ -171,10 +222,16 @@ async def completions(
171
222
total_tokens = prompt_tokens + completion_tokens ,
172
223
),
173
224
)
225
+
226
+ await asyncio .sleep (response_latency_seconds )
227
+ log .debug ("/v1/completions response: %s" , response )
174
228
return response
175
229
176
230
177
231
@app.get("/health")
async def health_check():
    """Health check endpoint."""
    log.debug("/health request")
    # Report liveness along with the current Unix timestamp.
    payload = {"status": "healthy", "timestamp": int(time.time())}
    log.debug("/health response: %s", payload)
    return payload
0 commit comments