139 changes: 134 additions & 5 deletions graphiti_core/llm_client/openai_generic_client.py
@@ -92,6 +92,91 @@ def __init__(
else:
self.client = client

# Instance-level fallback state for providers that don't support json_schema
# (e.g., LiteLLM with Gemini). Once set to True, remains True for client lifetime.
self._use_json_object_mode: bool = False

def _is_schema_returned_as_data(self, response: dict[str, Any]) -> bool:
"""Detect if the model returned the schema definition instead of data.

When some providers (e.g., LiteLLM with Gemini) receive json_schema format,
they return the schema definition itself instead of data conforming to the schema.

Args:
response: The parsed JSON response from the LLM

Returns:
True if the response appears to be a JSON Schema definition
"""
# Immediate detection: JSON Schema keywords that are NEVER present in real data
schema_keywords = {'$defs', '$schema', 'definitions', 'properties'}
if any(key in response for key in schema_keywords):
return True

# Also detect "type": "object" at top level (another JSON Schema pattern)
return response.get('type') == 'object'

def _extract_json(self, text: str) -> dict[str, Any]:
"""Extract the first valid JSON object from text that may contain trailing content.

Some LLM providers return JSON followed by explanatory text, which breaks
standard JSON parsing. This method finds and extracts the first complete
JSON object.

Args:
text: Raw response text that may contain JSON with trailing content

Returns:
Parsed JSON as a dictionary

Raises:
json.JSONDecodeError: If no valid JSON object can be extracted
"""
text = text.strip()

# Try standard parsing first (fast path)
try:
return json.loads(text)
except json.JSONDecodeError as e:
# Only handle "Extra data" errors - other errors should propagate
if 'Extra data' not in str(e):
raise

# Find the first complete JSON object by matching braces
if not text.startswith('{'):
raise json.JSONDecodeError('No JSON object found', text, 0)

depth = 0
in_string = False
escape_next = False

for i, char in enumerate(text):
if escape_next:
escape_next = False
continue

if char == '\\' and in_string:
escape_next = True
continue

if char == '"' and not escape_next:
in_string = not in_string
continue

if in_string:
continue

if char == '{':
depth += 1
elif char == '}':
depth -= 1
if depth == 0:
# Found complete JSON object
json_str = text[: i + 1]
return json.loads(json_str)

raise json.JSONDecodeError('Incomplete JSON object', text, len(text))

async def _generate_response(
self,
messages: list[Message],
@@ -107,9 +192,10 @@ async def _generate_response(
elif m.role == 'system':
openai_messages.append({'role': 'system', 'content': m.content})
try:
# Prepare response format
response_format: dict[str, Any] = {'type': 'json_object'}
if response_model is not None:
# Prepare response format based on mode
response_format: dict[str, Any]
if response_model is not None and not self._use_json_object_mode:
# Preferred mode: use json_schema format (works with OpenAI, vLLM, etc.)
schema_name = getattr(response_model, '__name__', 'structured_response')
json_schema = response_model.model_json_schema()
response_format = {
@@ -119,6 +205,21 @@ async def _generate_response(
'schema': json_schema,
},
}
else:
# Fallback mode: use json_object format with schema embedded in prompt
# (for providers that don't support json_schema, e.g., LiteLLM with Gemini)
response_format = {'type': 'json_object'}
if response_model is not None:
# Append schema to last user message (like base class does)
serialized_model = json.dumps(response_model.model_json_schema())
for i in range(len(openai_messages) - 1, -1, -1):
if openai_messages[i]['role'] == 'user':
content = openai_messages[i].get('content', '')
openai_messages[i]['content'] = (
f'{content}\n\nRespond with a JSON object in the following '
f'format:\n\n{serialized_model}'
)
break

response = await self.client.chat.completions.create(
model=self.model or DEFAULT_MODEL,
@@ -128,7 +229,7 @@
response_format=response_format, # type: ignore[arg-type]
)
result = response.choices[0].message.content or ''
return json.loads(result)
return self._extract_json(result)
except openai.RateLimitError as e:
raise RateLimitError from e
except Exception as e:
@@ -153,22 +254,50 @@ async def generate_response(
# Wrap entire operation in tracing span
with self.tracer.start_span('llm.generate') as span:
attributes = {
'llm.provider': 'openai',
'llm.provider': 'openai_generic',
'model.size': model_size.value,
'max_tokens': max_tokens,
'structured_output.mode': 'json_object'
if self._use_json_object_mode
else 'json_schema',
}
if prompt_name:
attributes['prompt.name'] = prompt_name
span.add_attributes(attributes)

retry_count = 0
last_error = None
# Track if we've already attempted fallback in this call
fallback_attempted_this_call = False

while retry_count <= self.MAX_RETRIES:
try:
response = await self._generate_response(
messages, response_model, max_tokens=max_tokens, model_size=model_size
)

# Check for schema-as-data pattern (only if using json_schema mode)
if (
response_model is not None
and not self._use_json_object_mode
and self._is_schema_returned_as_data(response)
):
if not fallback_attempted_this_call:
logger.warning(
'Provider returned schema definition instead of data. '
'Switching to json_object mode with embedded schema.'
)
self._use_json_object_mode = True
fallback_attempted_this_call = True
span.add_attributes({'structured_output.fallback_triggered': True})
# Retry immediately with fallback mode (does NOT count against MAX_RETRIES)
continue
else:
# Fallback already attempted but still got schema - treat as error
raise ValueError(
'Provider returned schema definition even in fallback mode'
)

return response
except (RateLimitError, RefusalError):
# These errors should not trigger retries
5 changes: 5 additions & 0 deletions mcp_server/.env.example
@@ -14,6 +14,11 @@ MODEL_NAME=gpt-4.1-mini
# Optional: Only needed for non-standard OpenAI endpoints
# OPENAI_BASE_URL=https://api.openai.com/v1

# OpenAI Generic Provider Configuration (LiteLLM, Ollama, vLLM)
# Use provider: "openai_generic" in config.yaml for OpenAI-compatible APIs
# OPENAI_BASE_URL=http://localhost:4000/v1 # LiteLLM proxy
# OPENAI_BASE_URL=http://localhost:11434/v1 # Ollama

# Optional: Group ID for namespacing graph data
# GROUP_ID=my_project

32 changes: 24 additions & 8 deletions mcp_server/README.md
@@ -21,7 +21,7 @@ The Graphiti MCP server provides comprehensive knowledge graph capabilities:
- **Group Management**: Organize and manage groups of related data with group_id filtering
- **Graph Maintenance**: Clear the graph and rebuild indices
- **Graph Database Support**: Multiple backend options including FalkorDB (default) and Neo4j
- **Multiple LLM Providers**: Support for OpenAI, Anthropic, Gemini, Groq, and Azure OpenAI
- **Multiple LLM Providers**: Support for OpenAI, Anthropic, Gemini, Groq, Azure OpenAI, and OpenAI-compatible APIs (LiteLLM, Ollama, vLLM)
- **Multiple Embedding Providers**: Support for OpenAI, Voyage, Sentence Transformers, and Gemini embeddings
- **Rich Entity Types**: Built-in entity types including Preferences, Requirements, Procedures, Locations, Events, Organizations, Documents, and more for structured knowledge extraction
- **HTTP Transport**: Default HTTP transport with MCP endpoint at `/mcp/` for broad client compatibility
@@ -164,23 +164,37 @@ server:
transport: "http" # Default. Options: stdio, http

llm:
provider: "openai" # or "anthropic", "gemini", "groq", "azure_openai"
provider: "openai" # or "anthropic", "gemini", "groq", "azure_openai", "openai_generic"
model: "gpt-4.1" # Default model

database:
provider: "falkordb" # Default. Options: "falkordb", "neo4j"
```

### Using Ollama for Local LLM
### Using OpenAI-Compatible APIs (LiteLLM, Ollama, vLLM)

To use Ollama with the MCP server, configure it as an OpenAI-compatible endpoint:
The `openai_generic` provider works with any OpenAI-compatible API, including LiteLLM proxy, Ollama, and vLLM. It uses the standard `/chat/completions` endpoint and falls back automatically for providers that don't support the `json_schema` response format.

**LiteLLM Proxy Example:**
```yaml
llm:
provider: "openai"
model: "gpt-oss:120b" # or your preferred Ollama model
api_base: "http://localhost:11434/v1"
api_key: "ollama" # dummy key required
provider: "openai_generic"
model: "gemini/gemini-2.0-flash" # or any LiteLLM model
providers:
openai:
api_key: "your-litellm-key"
base_url: "http://localhost:4000/v1"
```

**Ollama Example:**
```yaml
llm:
provider: "openai_generic"
model: "llama3.2" # or your preferred Ollama model
providers:
openai:
api_key: "ollama" # dummy key required
base_url: "http://localhost:11434/v1"

embedder:
provider: "sentence_transformers" # recommended for local setup
@@ -189,6 +203,8 @@

Make sure Ollama is running locally with: `ollama serve`

> **Note:** The `openai_generic` provider automatically handles JSON response parsing for providers that return extra text after JSON output.
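
For illustration, this is the kind of raw output the parser tolerates. The response text below is hypothetical, and the snippet is a conceptual sketch rather than the client's actual brace-matching implementation:

```python
import json

# Hypothetical provider output: a JSON object followed by trailing commentary.
raw = '{"entities": ["Acme Corp"], "confidence": 0.9}\nNote: I extracted only one entity.'

# Conceptually similar to OpenAIGenericClient._extract_json: parse the first
# complete JSON value and ignore whatever follows it.
obj, end = json.JSONDecoder().raw_decode(raw.strip())
print(obj)  # {'entities': ['Acme Corp'], 'confidence': 0.9}
```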

### Entity Types

Graphiti MCP Server includes built-in entity types for structured knowledge extraction. These entity types are always enabled and configured via the `entity_types` section in your `config.yaml`:
34 changes: 34 additions & 0 deletions mcp_server/src/services/factories.py
@@ -70,6 +70,14 @@
HAS_GROQ = True
except ImportError:
HAS_GROQ = False

try:
from graphiti_core.llm_client.openai_generic_client import OpenAIGenericClient

HAS_OPENAI_GENERIC = True
except ImportError:
HAS_OPENAI_GENERIC = False

from utils.utils import create_azure_credential_token_provider


@@ -246,6 +254,32 @@ def create(config: LLMConfig) -> LLMClient:
)
return GroqClient(config=llm_config)

case 'openai_generic':
# OpenAI Generic client - uses /chat/completions with response_format
# instead of /responses endpoint. Compatible with LiteLLM, Ollama, vLLM, etc.
if not HAS_OPENAI_GENERIC:
raise ValueError(
'OpenAI Generic client not available in current graphiti-core version'
)
if not config.providers.openai:
raise ValueError('OpenAI provider configuration not found')

api_key = config.providers.openai.api_key
base_url = config.providers.openai.api_url
_validate_api_key('OpenAI Generic', api_key, logger)

from graphiti_core.llm_client.config import LLMConfig as CoreLLMConfig

llm_config = CoreLLMConfig(
api_key=api_key,
base_url=base_url,
model=config.model,
small_model='gpt-4.1-mini',
temperature=config.temperature,
max_tokens=config.max_tokens,
)
return OpenAIGenericClient(config=llm_config)

case _:
raise ValueError(f'Unsupported LLM provider: {provider}')
