Commit 1908f7b

Merge pull request #52 from thushan/feature/better-stats
feat: better stats
2 parents e8a1368 + 982cdc6 commit 1908f7b

50 files changed (+2306 -453 lines)

config/profiles/lmstudio.yaml

Lines changed: 19 additions & 1 deletion
@@ -133,4 +133,22 @@ resources:
   # No need for load time buffer - models are preloaded
   timeout_scaling:
     base_timeout_seconds: 180 # 3 minutes
-    load_time_buffer: false
+    load_time_buffer: false
+
+# Metrics extraction for LM Studio responses
+metrics:
+  extraction:
+    enabled: true
+    source: response_body
+    format: json
+    # LM Studio uses OpenAI-compatible format
+    paths:
+      model: "$.model"
+      finish_reason: "$.choices[0].finish_reason" # String value (e.g., "stop", "length")
+      input_tokens: "$.usage.prompt_tokens"
+      output_tokens: "$.usage.completion_tokens"
+      total_tokens: "$.usage.total_tokens"
+    calculations:
+      # Derive IsComplete from finish_reason presence (LM Studio doesn't have a separate 'done' field)
+      is_complete: 'len(finish_reason) > 0'
+      # LM Studio doesn't provide timing data, so we can't calculate tokens/sec
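LM Studio returns the standard OpenAI-style `usage` block plus `choices[0].finish_reason`, and the profile derives completion state from the presence of `finish_reason`. Below is a minimal Go sketch of that same derivation against a sample response body; the sample values are invented for illustration and this is not the project's extractor code.

    // Sketch: parse an OpenAI-compatible LM Studio response and derive
    // is_complete the way the profile's calculation does (finish_reason non-empty).
    // Sample values are invented for illustration.
    package main

    import (
        "encoding/json"
        "fmt"
    )

    type chatResponse struct {
        Model   string `json:"model"`
        Choices []struct {
            FinishReason string `json:"finish_reason"`
        } `json:"choices"`
        Usage struct {
            PromptTokens     int `json:"prompt_tokens"`
            CompletionTokens int `json:"completion_tokens"`
            TotalTokens      int `json:"total_tokens"`
        } `json:"usage"`
    }

    func main() {
        body := []byte(`{"model":"example-model","choices":[{"finish_reason":"stop"}],"usage":{"prompt_tokens":42,"completion_tokens":128,"total_tokens":170}}`)

        var r chatResponse
        if err := json.Unmarshal(body, &r); err != nil {
            panic(err)
        }

        // Mirrors the profile calculation: is_complete: 'len(finish_reason) > 0'
        isComplete := len(r.Choices) > 0 && len(r.Choices[0].FinishReason) > 0

        fmt.Println(r.Model, r.Usage.PromptTokens, r.Usage.CompletionTokens, isComplete)
    }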

config/profiles/ollama.yaml

Lines changed: 30 additions & 1 deletion
@@ -171,4 +171,33 @@ resources:
   # Dynamic timeout scaling
   timeout_scaling:
     base_timeout_seconds: 30
-    load_time_buffer: true # adds estimated_load_time_ms to timeout
+    load_time_buffer: true # adds estimated_load_time_ms to timeout
+
+# Metrics extraction from Ollama responses
+metrics:
+  extraction:
+    enabled: true
+    source: "response_body"
+    format: "json"
+
+    # JSONPath expressions for extracting values from Ollama response
+    paths:
+      model: "$.model"
+      is_complete: "$.done" # Ollama provides 'done' as a boolean directly
+      finish_reason: "$.finish_reason" # Optional: Ollama may include this in some versions
+      # Token counts
+      input_tokens: "$.prompt_eval_count"
+      output_tokens: "$.eval_count"
+      # Timing data (in nanoseconds from Ollama)
+      total_duration_ns: "$.total_duration"
+      load_duration_ns: "$.load_duration"
+      prompt_duration_ns: "$.prompt_eval_duration"
+      eval_duration_ns: "$.eval_duration"
+
+    # Simple calculations to convert to useful metrics
+    calculations:
+      # Safe division: multiply first for precision, then divide with guard against zero
+      tokens_per_second: "eval_duration_ns > 0 ? (output_tokens * 1000000000.0) / eval_duration_ns : 0"
+      ttft_ms: "prompt_duration_ns / 1000000"
+      total_ms: "total_duration_ns / 1000000"
+      model_load_ms: "load_duration_ns / 1000000"
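Because Ollama reports durations in nanoseconds, the calculations above convert to milliseconds and guard the tokens-per-second division against a zero eval duration. A small worked sketch in Go with invented sample values (not the project's implementation):

    // Sketch of the timing maths in the calculations above, with made-up sample values.
    package main

    import "fmt"

    func main() {
        evalCount := 128                        // output tokens ($.eval_count)
        evalDurationNs := int64(2_500_000_000)  // 2.5 s generating ($.eval_duration)
        promptDurationNs := int64(350_000_000)  // 0.35 s prompt eval ($.prompt_eval_duration)
        totalDurationNs := int64(3_100_000_000) // 3.1 s end to end ($.total_duration)

        // tokens_per_second with the zero guard: 128 * 1e9 / 2.5e9 = 51.2
        tokensPerSecond := 0.0
        if evalDurationNs > 0 {
            tokensPerSecond = float64(evalCount) * 1e9 / float64(evalDurationNs)
        }

        // Nanoseconds to milliseconds for ttft_ms and total_ms
        fmt.Printf("tokens/s=%.1f ttft_ms=%d total_ms=%d\n",
            tokensPerSecond,
            promptDurationNs/1_000_000,
            totalDurationNs/1_000_000)
    }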

config/profiles/openai.yaml

Lines changed: 23 additions & 1 deletion
@@ -100,4 +100,26 @@ resources:
   # No load time buffer needed for cloud services
   timeout_scaling:
     base_timeout_seconds: 120 # 2 minutes
-    load_time_buffer: false
+    load_time_buffer: false
+
+# Metrics extraction for OpenAI-compatible responses
+metrics:
+  extraction:
+    enabled: true
+    source: response_body
+    format: json
+    # OpenAI standard format
+    paths:
+      model: "$.model"
+      finish_reason: "$.choices[0].finish_reason" # String value (e.g., "stop", "length", "function_call")
+      input_tokens: "$.usage.prompt_tokens"
+      output_tokens: "$.usage.completion_tokens"
+      total_tokens: "$.usage.total_tokens"
+      # Some providers include additional metrics
+      ttft_ms: "$.metrics.time_to_first_token"
+      total_ms: "$.metrics.total_time"
+    calculations:
+      # Derive IsComplete from finish_reason presence (OpenAI doesn't have a separate 'done' field)
+      is_complete: 'len(finish_reason) > 0'
+      # Safe division: multiply first for precision, then divide with guard against zero
+      tokens_per_second: "total_ms > 0 ? (output_tokens * 1000.0) / total_ms : 0"

config/profiles/vllm.yaml

Lines changed: 23 additions & 1 deletion
@@ -221,4 +221,26 @@ features:
   # Continuous batching
   continuous_batching:
     enabled: true
-    description: "Dynamic batching for optimal GPU utilisation"
+    description: "Dynamic batching for optimal GPU utilisation"
+
+# Metrics extraction for vLLM responses
+metrics:
+  extraction:
+    enabled: true
+    source: response_body
+    format: json
+    # vLLM uses OpenAI-compatible format for chat/completions endpoints
+    paths:
+      model: "$.model"
+      finish_reason: "$.choices[0].finish_reason" # String value (e.g., "stop", "length")
+      input_tokens: "$.usage.prompt_tokens"
+      output_tokens: "$.usage.completion_tokens"
+      total_tokens: "$.usage.total_tokens"
+      # vLLM may include additional performance metrics
+      ttft_ms: "$.metrics.time_to_first_token_ms"
+      generation_time_ms: "$.metrics.generation_time_ms"
+    calculations:
+      # Derive IsComplete from finish_reason presence (vLLM doesn't have a separate 'done' field)
+      is_complete: 'len(finish_reason) > 0'
+      # Safe division: multiply first for precision, then divide with guard against zero
+      tokens_per_second: "generation_time_ms > 0 ? (output_tokens * 1000.0) / generation_time_ms : 0"

docs/content/api-reference/overview.md

Lines changed: 17 additions & 0 deletions
@@ -88,6 +88,23 @@ All responses include:
 | `X-Olla-Model` | Model used (if applicable) |
 | `X-Olla-Backend-Type` | Provider type (ollama/lmstudio/openai/vllm) |
 | `X-Olla-Response-Time` | Total processing time |
+| `X-Olla-Routing-Strategy` | Routing strategy used (when model routing is active) |
+| `X-Olla-Routing-Decision` | Routing decision made (routed/fallback/rejected) |
+| `X-Olla-Routing-Reason` | Human-readable reason for routing decision |
+
+### Provider Metrics (Debug Logs)
+
+When available, provider-specific performance metrics are extracted from responses and included in debug logs:
+
+| Metric | Description | Providers |
+|--------|-------------|-----------|
+| `provider_total_ms` | Total processing time (ms) | Ollama, LM Studio |
+| `provider_prompt_tokens` | Tokens in prompt (count) | All |
+| `provider_completion_tokens` | Tokens generated (count) | All |
+| `provider_tokens_per_second` | Generation speed (tokens/s) | Ollama, LM Studio |
+| `provider_model` | Actual model used | All |
+
+See [Provider Metrics](../concepts/provider-metrics.md) for detailed information.
 
 ## Error Responses
 
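The X-Olla-* values documented above are ordinary HTTP response headers, so any client can read them alongside the proxied response. A hedged Go sketch follows; the endpoint URL and request payload are placeholders, not taken from the docs.

    // Sketch: call an Olla-proxied endpoint and print the X-Olla-* headers listed above.
    // The URL and payload are placeholders for illustration; substitute your own deployment.
    package main

    import (
        "bytes"
        "fmt"
        "net/http"
    )

    func main() {
        payload := bytes.NewBufferString(`{"model":"example-model","messages":[{"role":"user","content":"hi"}]}`)
        resp, err := http.Post("http://localhost:8080/your-olla-endpoint", "application/json", payload)
        if err != nil {
            panic(err)
        }
        defer resp.Body.Close()

        for _, h := range []string{
            "X-Olla-Model",
            "X-Olla-Backend-Type",
            "X-Olla-Response-Time",
            "X-Olla-Routing-Strategy",
            "X-Olla-Routing-Decision",
            "X-Olla-Routing-Reason",
        } {
            fmt.Printf("%s: %s\n", h, resp.Header.Get(h))
        }
    }
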
docs/content/concepts/overview.md

Lines changed: 10 additions & 0 deletions
@@ -72,6 +72,16 @@ Provider-specific configuration templates:
 
 The profile system ensures compatibility with various LLM providers.
 
+### [Provider Metrics](provider-metrics.md)
+Real-time performance metrics extraction:
+
+- Automatic extraction from provider responses
+- Token usage and generation speed tracking
+- Processing latency measurements
+- Best-effort extraction with zero performance impact
+
+Provider metrics give insights into model performance and resource usage.
+
 ## How Components Work Together
 
 ```mermaid
