# LiteLLM unified gateway profile
# Docs: https://litellm.ai/
# Revised 17-08-2025
# 17-08-2025 [TF]: Updated to match latest LiteLLM features and OpenAI compatibility
name: litellm
version: "1.0"
display_name: "LiteLLM"
description: "Unified gateway to 100+ LLM providers with automatic fallbacks and load balancing"

# Routing configuration
routing:
  prefixes:
    - litellm
    # - lite

# API compatibility
api:
  openai_compatible: true
  paths:
    # Core OpenAI-compatible endpoints (always available)
    - /                       # 0: root health check
    - /health                 # 1: health check
    - /v1/chat/completions    # 2: chat completions (primary)
    - /v1/completions         # 3: text completions
    - /v1/embeddings          # 4: embeddings
    - /v1/models              # 5: list models
    # Alternative paths (some deployments)
    - /chat/completions       # 6: chat completions (alt)
    - /completions            # 7: text completions (alt)
    - /embeddings             # 8: embeddings (alt)
    - /models                 # 9: list models (alt)
    # Health probes (Kubernetes deployments)
    # Unsure if these are needed in proxy mode, but included for completeness
    - /health/readiness       # 10: readiness probe
    - /health/liveness        # 11: liveness probe
    # Note: Management endpoints (/key/*, /user/*, /team/*, /spend/*)
    # require a database backend and are not available in basic proxy mode

  model_discovery_path: /v1/models
  health_check_path: /health
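  # Quick sanity check (illustrative only, not part of the profile schema):
  # assuming a local proxy on the default port 4000 and a virtual key in $LITELLM_KEY,
  # the health and discovery paths above can be probed with:
  #   curl http://localhost:4000/health -H "Authorization: Bearer $LITELLM_KEY"
  #   curl http://localhost:4000/v1/models -H "Authorization: Bearer $LITELLM_KEY"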

# Platform characteristics
characteristics:
  timeout: 5m                   # Remote providers can be slow
  max_concurrent_requests: 100  # LiteLLM handles high concurrency well
  default_priority: 95          # High priority as a unified gateway
  streaming_support: true

# Detection hints for auto-discovery
detection:
  user_agent_patterns:
    - "litellm/"
  headers:
    - "X-LiteLLM-Version"
    - "X-LiteLLM-Provider"
  path_indicators:
    - "/v1/models"
    - "/health"
    - "/model/info"
    - "/key/generate"
  default_ports:
    - 4000  # LiteLLM proxy default
    - 8000  # Common alternative
    - 8080  # Another common port
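  # Illustrative probe (assumption - whether these headers are emitted depends on the
  # LiteLLM version and configuration):
  #   curl -s -D - -o /dev/null http://localhost:4000/health
  # A response carrying an x-litellm-* header is a strong signal of a LiteLLM proxy;
  # the path indicators above are the fallback check.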

# Model handling
models:
  name_format: "{{.Name}}"
  # LiteLLM uses provider-prefixed model names (see the commented example after this list)
  provider_prefixes:
    - "openai/"
    - "azure/"
    - "bedrock/"
    - "anthropic/"
    - "cohere/"
    - "together_ai/"
    - "replicate/"
    - "huggingface/"
    - "vertex_ai/"
    - "palm/"
    - "gemini/"
    - "groq/"
    - "mistral/"
    - "deepinfra/"
    - "perplexity/"
    - "anyscale/"
    - "cloudflare/"
    - "voyage/"
    - "databricks/"
    - "ai21/"
    - "nlp_cloud/"
    - "aleph_alpha/"
    - "baseten/"
    - "openrouter/"
    - "custom/"  # Custom endpoints
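  # Example (illustrative, an assumption about how the prefixes are consumed):
  # a client-facing name such as "openai/gpt-4o" or "anthropic/claude-3-opus" is
  # split on the first "/" - the prefix selects the upstream provider and the
  # remainder is that provider's own model id.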

  capability_patterns:
    chat:
      # OpenAI models
      - "gpt-5*"
      - "gpt-4*"
      - "gpt-3.5*"
      - "chatgpt*"
      # Anthropic models
      - "claude-*"
      - "anthropic/*"
      # Google models
      - "gemini*"
      - "palm*"
      - "chat-bison*"
      # Open models
      - "llama*"
      - "mistral*"
      - "mixtral*"
      - "vicuna*"
      - "alpaca*"
      - "wizardlm*"
      - "mpt*"
      - "falcon*"
      - "starchat*"
      # Cohere models
      - "command*"
      # Provider-prefixed
      - "*/gpt-*"
      - "*/claude-*"
      - "*/llama*"
      - "*/mistral*"

    embeddings:
      - "*embedding*"
      - "voyage-*"
      - "embed-*"
      - "text-embedding-*"
      - "*/embedding*"
      - "cohere/embed-*"
      - "openai/text-embedding-*"
      - "bedrock/amazon.titan-embed*"

    vision:
      - "gpt-5-vision*"
      - "gpt-5-turbo*"
      - "gpt-4-vision*"
      - "gpt-4-turbo*"
      - "claude-3-*"
      - "claude-4-*"
      - "gemini-*vision*"
      - "gemini-*pro*"
      - "llava*"
      - "bakllava*"
      - "*/vision*"
      - "anthropic/claude-3-*"
      - "anthropic/claude-4-*"

    code:
      - "*code*"
      - "codellama*"
      - "deepseek-coder*"
      - "starcoder*"
      - "codegen*"
      - "replit*"
      - "wizardcoder*"
      - "phind*"
      - "*/code*"

    function_calling:
      - "gpt-5*"
      - "gpt-4*"
      - "gpt-3.5-turbo*"
      - "claude-3-*"
      - "claude-4-*"
      - "mistral-large*"
      - "mixtral*"
      - "gemini*"
      - "*/function*"
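    # How these globs are read (illustrative): "gpt-4o" matches chat and
    # function_calling via "gpt-4*"; "text-embedding-3-small" matches embeddings via
    # "text-embedding-*"; a prefixed name like "openai/gpt-4o" is caught by the
    # "*/gpt-*" entries. A model can therefore carry several capabilities at once.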

  # Context window detection patterns (LiteLLM handles many model variants);
  # a worked example follows this list
  context_patterns:
    - pattern: "*-128k*"
      context: 131072
    - pattern: "*-100k*"
      context: 102400
    - pattern: "*-64k*"
      context: 65536
    - pattern: "*-32k*"
      context: 32768
    - pattern: "*-16k*"
      context: 16384
    - pattern: "*-8k*"
      context: 8192
    - pattern: "*-4k*"
      context: 4096
    - pattern: "gpt-4-turbo*"
      context: 128000
    - pattern: "gpt-4-32k*"
      context: 32768
    - pattern: "gpt-4*"
      context: 8192
    - pattern: "claude-3-opus*"
      context: 200000
    - pattern: "claude-3-sonnet*"
      context: 200000
    - pattern: "claude-3-haiku*"
      context: 200000
    - pattern: "claude-2*"
      context: 100000
    - pattern: "gemini-1.5-pro*"
      context: 1048576  # 1M context
    - pattern: "gemini-1.5-flash*"
      context: 1048576  # 1M context
    - pattern: "mistral-large*"
      context: 32768
    - pattern: "mixtral*"
      context: 32768
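  # Worked example (illustrative): "claude-3-opus-20240229" matches "claude-3-opus*"
  # and resolves to 200000 tokens; "gpt-4-32k-0613" hits "gpt-4-32k*" (32768) rather
  # than the generic "gpt-4*" (8192), which is why the specific patterns are listed
  # before the generic ones (assumption: first match wins).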
214
+
215
+ # Request/response handling
216
+ request :
217
+ model_field_paths :
218
+ - " model"
219
+ response_format : " openai" # LiteLLM uses OpenAI-compatible format
220
+ parsing_rules :
221
+ chat_completions_path : " /v1/chat/completions"
222
+ completions_path : " /v1/completions"
223
+ embeddings_path : " /v1/embeddings"
224
+ model_field_name : " model"
225
+ supports_streaming : true
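  # Illustrative request that these rules parse - the model name is read from the
  # top-level "model" field of a standard OpenAI-style body (shown as a comment only):
  #   POST /v1/chat/completions
  #   {"model": "openai/gpt-4o", "messages": [{"role": "user", "content": "hi"}], "stream": true}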

# Path indices for specific functions
path_indices:
  health: 1
  chat_completions: 2
  completions: 3
  embeddings: 4
  models: 5
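  # These are zero-based offsets into api.paths above, e.g. chat_completions: 2
  # points at /v1/chat/completions and models: 5 at /v1/models.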

# Resource management
resources:
  # LiteLLM proxy itself is lightweight - actual models run elsewhere
  defaults:
    min_memory_gb: 0.5
    recommended_memory_gb: 1
    requires_gpu: false
    estimated_load_time_ms: 100

  # Concurrency is handled by remote providers
  concurrency_limits:
    - min_memory_gb: 0
      max_concurrent: 1000  # LiteLLM can handle many concurrent requests

  # Basic timeout configuration
  timeout_scaling:
    base_timeout_seconds: 60
    load_time_buffer: false  # No model loading for a proxy

# Metrics extraction for LiteLLM responses
# 18-08-2025 [TF]: These are based on standard OpenAI response formats
metrics:
  extraction:
    enabled: true
    source: "response_body"
    format: "json"

    # LiteLLM returns standard OpenAI-format JSON responses
    paths:
      # Basic response fields
      request_id: "$.id"
      model: "$.model"
      created: "$.created"
      object_type: "$.object"

      # Completion status - finish_reason can be: stop, length, function_call, content_filter, null (streaming)
      finish_reason: "$.choices[0].finish_reason"

      # Token usage (always present in non-streaming responses)
      input_tokens: "$.usage.prompt_tokens"
      output_tokens: "$.usage.completion_tokens"
      total_tokens: "$.usage.total_tokens"

      # Cache tokens (present when caching is enabled)
      cache_read_tokens: "$.usage.cache_read_input_tokens"
      cache_creation_tokens: "$.usage.cache_creation_input_tokens"
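    # Response fragment these JSONPaths read from (illustrative; the cache fields only
    # appear when the upstream provider reports them):
    #   {"id": "chatcmpl-abc123", "object": "chat.completion", "model": "gpt-4o",
    #    "choices": [{"finish_reason": "stop", ...}],
    #    "usage": {"prompt_tokens": 46, "completion_tokens": 34, "total_tokens": 80,
    #              "cache_read_input_tokens": 30}}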

    calculations:
      # Response is complete when finish_reason is present and not null
      # Valid completion reasons: stop (normal), length (max tokens), function_call, content_filter
      is_complete: 'finish_reason != null && finish_reason != ""'

      # Check if the response was served from cache (cache tokens present and > 0)
      is_cached: 'cache_read_tokens != null && cache_read_tokens > 0'

      # Calculate actual new tokens (total minus cached)
      new_tokens: 'cache_read_tokens != null ? total_tokens - cache_read_tokens : total_tokens'
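      # Worked example (using the illustrative usage block above): total_tokens 80 with
      # cache_read_tokens 30 gives new_tokens 50; with no cache fields present,
      # new_tokens falls back to the full total_tokens.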