Commit 7f91ce3

Merge pull request #56 from thushan/feature/backend/litellm
feat: backend/litellm
2 parents 5d2915b + b9b7a5d

18 files changed: +1288 additions, -58 deletions
CLAUDE.md

Lines changed: 3 additions & 2 deletions

@@ -1,7 +1,7 @@
 # CLAUDE.md

 ## Overview
-Olla is a high-performance proxy and load balancer for LLM infrastructure, written in Go. It intelligently routes requests across local and remote inference nodes (Ollama, LM Studio, OpenAI-compatible endpoints).
+Olla is a high-performance proxy and load balancer for LLM infrastructure, written in Go. It intelligently routes requests across local and remote inference nodes (Ollama, LM Studio, LiteLLM, vLLM, OpenAI-compatible endpoints).

 The project provides two proxy engines: Sherpa (simple, maintainable) and Olla (high-performance with advanced features).

@@ -22,6 +22,7 @@ olla/
 │   ├── profiles/           # Provider-specific profiles
 │   │   ├── ollama.yaml     # Ollama configuration
 │   │   ├── lmstudio.yaml   # LM Studio configuration
+│   │   ├── litellm.yaml    # LiteLLM gateway configuration
 │   │   ├── openai.yaml     # OpenAI-compatible configuration
 │   │   └── vllm.yaml       # vLLM configuration
 │   └── models.yaml         # Model configurations

@@ -87,7 +88,7 @@ olla/
 ## Response Headers
 - `X-Olla-Endpoint`: Backend name
 - `X-Olla-Model`: Model used
-- `X-Olla-Backend-Type`: ollama/openai/lmstudio/vllm
+- `X-Olla-Backend-Type`: ollama/openai/lmstudio/vllm/litellm
 - `X-Olla-Request-ID`: Request ID
 - `X-Olla-Response-Time`: Total processing time

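To make the header change above concrete: a response proxied by Olla to a LiteLLM backend would carry headers along these lines. This is only an illustrative sketch; the header names come from CLAUDE.md, while the endpoint name, model, request ID, and timing values are assumptions, not taken from this commit.

    X-Olla-Endpoint: local-litellm
    X-Olla-Model: openai/gpt-4o
    X-Olla-Backend-Type: litellm
    X-Olla-Request-ID: req-abc123
    X-Olla-Response-Time: 1.284s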
config/config.yaml

Lines changed: 2 additions & 2 deletions

@@ -78,15 +78,15 @@ discovery:
       health_check_url: "/"
       check_interval: 2s      # How often to check when healthy
       check_timeout: 1s
-    - url: "http://localhost:11234"
+    - url: "http://localhost:1234"
       name: "local-lm-studio"
       type: "lm-studio"
       priority: 100
       model_url: "/v1/models"
       health_check_url: "/"
       check_interval: 2s
       check_timeout: 1s
-    - url: "http://192.168.0.1:8000"
+    - url: "http://localhost:8000"
       name: "local-vllm"
       type: "vllm"
       priority: 100

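Given the new profile below, a LiteLLM backend would presumably be registered in the same discovery list using the same fields. The entry sketched here is an assumption based on the existing entries and the profile's defaults (port 4000, /v1/models, /health, priority 95); it is not part of this commit, and the name and type value are illustrative.

    - url: "http://localhost:4000"    # assumed: LiteLLM proxy default port (see profile below)
      name: "local-litellm"           # illustrative name
      type: "litellm"                 # assumed to match the profile name
      priority: 95                    # mirrors the profile's default_priority
      model_url: "/v1/models"
      health_check_url: "/health"
      check_interval: 2s
      check_timeout: 1s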
config/profiles/litellm.yaml

Lines changed: 291 additions & 0 deletions
# LiteLLM unified gateway profile
# Docs: https://litellm.ai/
# Revised 17-08-2025
# 17-08-2025 [TF]: Updated to match latest LiteLLM features and OpenAI compatibility
name: litellm
version: "1.0"
display_name: "LiteLLM"
description: "Unified gateway to 100+ LLM providers with automatic fallbacks and load balancing"

# Routing configuration
routing:
  prefixes:
    - litellm
    #- lite

# API compatibility
api:
  openai_compatible: true
  paths:
    # Core OpenAI-compatible endpoints (always available)
    - /                       # 0: root health check
    - /health                 # 1: health check
    - /v1/chat/completions    # 2: chat completions (primary)
    - /v1/completions         # 3: text completions
    - /v1/embeddings          # 4: embeddings
    - /v1/models              # 5: list models
    # Alternative paths (some deployments)
    - /chat/completions       # 6: chat completions (alt)
    - /completions            # 7: text completions (alt)
    - /embeddings             # 8: embeddings (alt)
    - /models                 # 9: list models (alt)
    # Health probes (Kubernetes deployments)
    # Unsure if these are needed in proxy mode, but included for completeness
    - /health/readiness       # 10: readiness probe
    - /health/liveness        # 11: liveness probe
    # Note: Management endpoints (/key/*, /user/*, /team/*, /spend/*)
    # require database backend and are not available in basic proxy mode

  model_discovery_path: /v1/models
  health_check_path: /health

# Platform characteristics
characteristics:
  timeout: 5m                     # Remote providers can be slow
  max_concurrent_requests: 100    # LiteLLM handles high concurrency well
  default_priority: 95            # High priority as a unified gateway
  streaming_support: true

# Detection hints for auto-discovery
detection:
  user_agent_patterns:
    - "litellm/"
  headers:
    - "X-LiteLLM-Version"
    - "X-LiteLLM-Provider"
  path_indicators:
    - "/v1/models"
    - "/health"
    - "/model/info"
    - "/key/generate"
  default_ports:
    - 4000    # LiteLLM proxy default
    - 8000    # Common alternative
    - 8080    # Another common port

# Model handling
models:
  name_format: "{{.Name}}"
  # LiteLLM uses provider-prefixed model names
  provider_prefixes:
    - "openai/"
    - "azure/"
    - "bedrock/"
    - "anthropic/"
    - "cohere/"
    - "together_ai/"
    - "replicate/"
    - "huggingface/"
    - "vertex_ai/"
    - "palm/"
    - "gemini/"
    - "groq/"
    - "mistral/"
    - "deepinfra/"
    - "perplexity/"
    - "anyscale/"
    - "cloudflare/"
    - "voyage/"
    - "databricks/"
    - "ai21/"
    - "nlp_cloud/"
    - "aleph_alpha/"
    - "baseten/"
    - "openrouter/"
    - "custom/"    # Custom endpoints

  capability_patterns:
    chat:
      # OpenAI models
      - "gpt-5*"
      - "gpt-4*"
      - "gpt-3.5*"
      - "chatgpt*"
      # Anthropic models
      - "claude-*"
      - "anthropic/*"
      # Google models
      - "gemini*"
      - "palm*"
      - "chat-bison*"
      # Open models
      - "llama*"
      - "mistral*"
      - "mixtral*"
      - "vicuna*"
      - "alpaca*"
      - "wizardlm*"
      - "mpt*"
      - "falcon*"
      - "starchat*"
      # Cohere models
      - "command*"
      # Provider-prefixed
      - "*/gpt-*"
      - "*/claude-*"
      - "*/llama*"
      - "*/mistral*"

    embeddings:
      - "*embedding*"
      - "voyage-*"
      - "embed-*"
      - "text-embedding-*"
      - "*/embedding*"
      - "cohere/embed-*"
      - "openai/text-embedding-*"
      - "bedrock/amazon.titan-embed*"

    vision:
      - "gpt-5-vision*"
      - "gpt-5-turbo*"
      - "gpt-4-vision*"
      - "gpt-4-turbo*"
      - "claude-3-*"
      - "claude-4-*"
      - "gemini-*vision*"
      - "gemini-*pro*"
      - "llava*"
      - "bakllava*"
      - "*/vision*"
      - "anthropic/claude-3-*"
      - "anthropic/claude-4-*"

    code:
      - "*code*"
      - "codellama*"
      - "deepseek-coder*"
      - "starcoder*"
      - "codegen*"
      - "replit*"
      - "wizardcoder*"
      - "phind*"
      - "*/code*"

    function_calling:
      - "gpt-5*"
      - "gpt-4*"
      - "gpt-3.5-turbo*"
      - "claude-3-*"
      - "claude-4-*"
      - "mistral-large*"
      - "mixtral*"
      - "gemini*"
      - "*/function*"

  # Context window detection patterns (LiteLLM handles many model variants)
  context_patterns:
    - pattern: "*-128k*"
      context: 131072
    - pattern: "*-100k*"
      context: 102400
    - pattern: "*-64k*"
      context: 65536
    - pattern: "*-32k*"
      context: 32768
    - pattern: "*-16k*"
      context: 16384
    - pattern: "*-8k*"
      context: 8192
    - pattern: "*-4k*"
      context: 4096
    - pattern: "gpt-4-turbo*"
      context: 128000
    - pattern: "gpt-4-32k*"
      context: 32768
    - pattern: "gpt-4*"
      context: 8192
    - pattern: "claude-3-opus*"
      context: 200000
    - pattern: "claude-3-sonnet*"
      context: 200000
    - pattern: "claude-3-haiku*"
      context: 200000
    - pattern: "claude-2*"
      context: 100000
    - pattern: "gemini-1.5-pro*"
      context: 1048576    # 1M context
    - pattern: "gemini-1.5-flash*"
      context: 1048576    # 1M context
    - pattern: "mistral-large*"
      context: 32768
    - pattern: "mixtral*"
      context: 32768

# Request/response handling
request:
  model_field_paths:
    - "model"
  response_format: "openai"    # LiteLLM uses OpenAI-compatible format
  parsing_rules:
    chat_completions_path: "/v1/chat/completions"
    completions_path: "/v1/completions"
    embeddings_path: "/v1/embeddings"
    model_field_name: "model"
    supports_streaming: true

# Path indices for specific functions
path_indices:
  health: 1
  chat_completions: 2
  completions: 3
  embeddings: 4
  models: 5

# Resource management
resources:
  # LiteLLM proxy itself is lightweight - actual models run elsewhere
  defaults:
    min_memory_gb: 0.5
    recommended_memory_gb: 1
    requires_gpu: false
    estimated_load_time_ms: 100

  # Concurrency is handled by remote providers
  concurrency_limits:
    - min_memory_gb: 0
      max_concurrent: 1000    # LiteLLM can handle many concurrent requests

  # Basic timeout configuration
  timeout_scaling:
    base_timeout_seconds: 60
    load_time_buffer: false    # No model loading for proxy

# Metrics extraction for LiteLLM responses
# 18-08-2025 [TF]: These are based on standard OpenAI response formats
metrics:
  extraction:
    enabled: true
    source: "response_body"
    format: "json"

    # LiteLLM returns standard OpenAI format JSON responses
    paths:
      # Basic response fields
      request_id: "$.id"
      model: "$.model"
      created: "$.created"
      object_type: "$.object"

      # Completion status - finish_reason can be: stop, length, function_call, content_filter, null (streaming)
      finish_reason: "$.choices[0].finish_reason"

      # Token usage (always present in non-streaming responses)
      input_tokens: "$.usage.prompt_tokens"
      output_tokens: "$.usage.completion_tokens"
      total_tokens: "$.usage.total_tokens"

      # Cache tokens (present when caching is enabled)
      cache_read_tokens: "$.usage.cache_read_input_tokens"
      cache_creation_tokens: "$.usage.cache_creation_input_tokens"

    calculations:
      # Response is complete when finish_reason is present and not null
      # Valid completion reasons: stop (normal), length (max tokens), function_call, content_filter
      is_complete: 'finish_reason != null && finish_reason != ""'

      # Check if response was from cache (when cache tokens are present and > 0)
      is_cached: 'cache_read_tokens != null && cache_read_tokens > 0'

      # Calculate actual new tokens (total minus cached)
      new_tokens: 'cache_read_tokens != null ? total_tokens - cache_read_tokens : total_tokens'

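For reference, the JSONPath expressions in the metrics section read a standard OpenAI-format completion body like the sketch below. The values are illustrative, and the cache token fields only appear when the upstream provider reports them; $.usage.prompt_tokens feeds input_tokens, $.usage.completion_tokens feeds output_tokens, and $.choices[0].finish_reason drives is_complete.

    {
      "id": "chatcmpl-123",
      "object": "chat.completion",
      "created": 1723881600,
      "model": "openai/gpt-4o",
      "choices": [
        {
          "index": 0,
          "message": { "role": "assistant", "content": "..." },
          "finish_reason": "stop"
        }
      ],
      "usage": {
        "prompt_tokens": 42,
        "completion_tokens": 128,
        "total_tokens": 170
      }
    }

With this body, is_complete evaluates to true (finish_reason is "stop"); since no cache_read_input_tokens field is present, is_cached would presumably evaluate to false and new_tokens falls back to total_tokens.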