diff --git a/.env.example b/.env.example new file mode 100644 index 000000000..104fbf7ba --- /dev/null +++ b/.env.example @@ -0,0 +1,152 @@ +# ================================================================================ +# LLM COUNCIL - Environment Configuration +# ================================================================================ +# +# SECURITY WARNING: +# - Never commit the actual .env file to version control +# - Keep your API keys secret and secure +# - The .env file is already listed in .gitignore +# +# SETUP INSTRUCTIONS: +# 1. Copy this file to .env: cp .env.example .env +# 2. Fill in your actual values in the .env file +# 3. Choose your provider mode and configure accordingly +# +# ================================================================================ + +# -------------------------------------------------------------------------------- +# LLM PROVIDER MODE +# -------------------------------------------------------------------------------- +# Determines which LLM provider(s) to use for the council +# +# Valid options: +# - openrouter: Use OpenRouter API for all models (cloud-based, requires API key) +# - ollama: Use local Ollama server for all models (self-hosted, free) +# - mixed: Use both providers with explicit prefixes per model +# +# Default: openrouter (for backward compatibility) +# -------------------------------------------------------------------------------- +LLM_PROVIDER=openrouter + + +# -------------------------------------------------------------------------------- +# OPENROUTER CONFIGURATION +# -------------------------------------------------------------------------------- +# Required for: 'openrouter' and 'mixed' modes +# Get your API key from: https://openrouter.ai/keys +# +# SECURITY: This is a secret key - never share it or commit it to git! 
+# -------------------------------------------------------------------------------- +OPENROUTER_API_KEY=your_openrouter_api_key_here + + +# -------------------------------------------------------------------------------- +# OLLAMA CONFIGURATION +# -------------------------------------------------------------------------------- +# Required for: 'ollama' and 'mixed' modes +# Default: http://localhost:11434 (standard Ollama installation) +# +# Change this if: +# - Running Ollama on a different port +# - Using a remote Ollama server +# - Using Docker with custom networking +# -------------------------------------------------------------------------------- +OLLAMA_BASE_URL=http://localhost:11434 + + +# ================================================================================ +# CONFIGURATION EXAMPLES BY MODE +# ================================================================================ +# +# The council models and chairman are configured in backend/config.py, but here +# are examples of how to set up each mode: +# +# -------------------------------------------------------------------------------- +# EXAMPLE 1: OpenRouter Mode (Cloud-based) +# -------------------------------------------------------------------------------- +# LLM_PROVIDER=openrouter +# OPENROUTER_API_KEY=sk-or-v1-your-actual-key-here +# +# In backend/config.py, use models like: +# COUNCIL_MODELS = [ +# "openai/gpt-4", +# "openai/gpt-5.1", +# "google/gemini-3-pro-preview", +# "anthropic/claude-sonnet-4.5", +# "x-ai/grok-4" +# ] +# CHAIRMAN_MODEL = "google/gemini-3-pro-preview" +# +# Available OpenRouter models: https://openrouter.ai/models +# +# -------------------------------------------------------------------------------- +# EXAMPLE 2: Ollama Mode (Local/Self-hosted) +# -------------------------------------------------------------------------------- +# LLM_PROVIDER=ollama +# OLLAMA_BASE_URL=http://localhost:11434 +# +# In backend/config.py, use models like: +# COUNCIL_MODELS = [ +# "llama3.1:8b", +# "mistral:latest", +# "qwen2.5:3b", +# "phi3:latest" +# ] +# CHAIRMAN_MODEL = "llama3.1:8b" +# +# Note: You must have these models installed locally via: +# ollama pull llama3.1:8b +# ollama pull mistral:latest +# (etc.) 
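+#   To check which models are already installed locally: ollama list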
+# +# Available Ollama models: https://ollama.ai/library +# +# -------------------------------------------------------------------------------- +# EXAMPLE 3: Mixed Mode (Hybrid Cloud + Local) +# -------------------------------------------------------------------------------- +# LLM_PROVIDER=mixed +# OPENROUTER_API_KEY=sk-or-v1-your-actual-key-here +# OLLAMA_BASE_URL=http://localhost:11434 +# +# In backend/config.py, prefix each model with provider: +# COUNCIL_MODELS = [ +# "ollama:llama3.1:8b", # Local model (fast, free) +# "ollama:mistral:latest", # Local model (fast, free) +# "openrouter:google/gemini-2.5-flash-lite", # Cloud model (paid) +# "openrouter:anthropic/claude-3.5-haiku" # Cloud model (paid) +# ] +# CHAIRMAN_MODEL = "openrouter:google/gemini-2.5-flash-lite" +# +# Benefits of mixed mode: +# - Use free local models for bulk processing +# - Use premium cloud models for final synthesis +# - Optimize cost vs quality trade-offs +# +# ================================================================================ + + +# -------------------------------------------------------------------------------- +# ADDITIONAL NOTES +# -------------------------------------------------------------------------------- +# +# MODEL NAMING CONVENTIONS: +# - OpenRouter: Uses "provider/model-name" format (e.g., "openai/gpt-4") +# - Ollama: Uses "model-name:tag" format (e.g., "llama3.1:8b") +# - Mixed mode: Uses "provider:model-identifier" format +# +# COST CONSIDERATIONS: +# - OpenRouter charges per token (varies by model) +# - Ollama is free but requires local compute resources +# - Mixed mode allows cost optimization strategies +# +# PERFORMANCE: +# - OpenRouter: Fast API, no local setup required +# - Ollama: Speed depends on hardware, no network latency +# - Mixed mode: Balance based on your infrastructure +# +# PRIVACY: +# - OpenRouter: Data sent to third-party cloud services +# - Ollama: All processing happens locally (fully private) +# - Mixed mode: Be aware which models process sensitive data +# +# ================================================================================ diff --git a/.gitignore b/.gitignore index 4c2041a54..156cf440f 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,11 @@ data/ # Frontend frontend/node_modules/ frontend/dist/ -frontend/.vite/ \ No newline at end of file +frontend/.vite/ + +#superclaude settings +.claude +.serena + +#backlog.md files +backlog/ diff --git a/CLAUDE.md b/CLAUDE.md index b803720fa..8d9c1e114 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -8,20 +8,71 @@ LLM Council is a 3-stage deliberation system where multiple LLMs collaboratively ## Architecture +### Multi-Provider System + +The application now supports three LLM provider modes via a provider abstraction layer: + +1. **OpenRouter Mode** (default): All models use OpenRouter API (cloud-based, paid) +2. **Ollama Mode**: All models use local Ollama server (self-hosted, free) +3. 
**Mixed Mode**: Combine both providers with explicit prefixes per model + +**Key Design Decisions:** +- **Backward Compatibility**: Existing OpenRouter configs work without changes +- **Zero-Configuration Local**: Ollama mode requires no API keys +- **Cost Optimization**: Mixed mode allows free local + premium cloud models +- **Provider Abstraction**: Clean separation between provider logic and core council logic + ### Backend Structure (`backend/`) **`config.py`** -- Contains `COUNCIL_MODELS` (list of OpenRouter model identifiers) +- Contains `LLM_PROVIDER` (determines provider mode: openrouter/ollama/mixed) +- Contains `COUNCIL_MODELS` (list of model identifiers, format depends on mode) - Contains `CHAIRMAN_MODEL` (model that synthesizes final answer) -- Uses environment variable `OPENROUTER_API_KEY` from `.env` +- Adapts model configurations based on `LLM_PROVIDER` setting +- Environment variables from `.env`: + - `OPENROUTER_API_KEY`: Required for openrouter/mixed modes + - `OLLAMA_BASE_URL`: Required for ollama/mixed modes (default: http://localhost:11434) - Backend runs on **port 8001** (NOT 8000 - user had another app on 8000) -**`openrouter.py`** -- `query_model()`: Single async model query +**Model Specification Formats:** +- **OpenRouter mode**: `"openai/gpt-4o"`, `"google/gemini-2.5-flash-lite"` +- **Ollama mode**: `"llama3.1:8b"`, `"mistral:latest"` +- **Mixed mode**: `"openrouter:openai/gpt-4o"`, `"ollama:llama3.1:8b"` + +**`providers/` directory** - Provider Abstraction Layer + +**`providers/base.py`** +- `LLMProvider` abstract base class defining provider interface +- All providers must implement: `query_model()` and `query_models_parallel()` +- Ensures consistent API across different provider implementations + +**`providers/openrouter.py`** +- `OpenRouterProvider`: Implementation for OpenRouter API +- Handles OpenRouter-specific API format and authentication +- Supports both standard and reasoning models (o1, etc.) 
+- `query_model()`: Single async model query to OpenRouter - `query_models_parallel()`: Parallel queries using `asyncio.gather()` - Returns dict with 'content' and optional 'reasoning_details' - Graceful degradation: returns None on failure, continues with successful responses +**`providers/ollama.py`** +- `OllamaProvider`: Implementation for local Ollama server +- Communicates with Ollama HTTP API (default: localhost:11434) +- Supports streaming and non-streaming responses +- `query_model()`: Single async model query to Ollama +- `query_models_parallel()`: Parallel queries to local Ollama server +- Same return format as OpenRouter for consistency +- Error handling for common issues (server not running, model not found) + +**`providers/__init__.py`** - Factory and Routing Logic +- `get_provider(provider_name)`: Factory function with singleton pattern +- `parse_model_spec(model_spec)`: Parses model specifications into (provider, model_id) +- `query_model(model_spec, messages)`: Routes single query to appropriate provider +- `query_models_parallel(model_specs, messages)`: Intelligent parallel routing +- Supports both simple mode (global provider) and mixed mode (per-model provider) +- Groups queries by provider for efficient batch execution +- All queries execute concurrently (both within and across providers) + **`council.py`** - The Core Logic - `stage1_collect_responses()`: Parallel queries to all council models - `stage2_collect_rankings()`: @@ -109,11 +160,224 @@ This strict format allows reliable parsing while still getting thoughtful evalua - Users can verify system's interpretation of model outputs - This builds trust and allows debugging of edge cases +## Provider Abstraction Pattern + +### Architecture Pattern: Abstract Factory + +The multi-provider system uses the **Abstract Factory** and **Strategy** patterns: + +``` +LLMProvider (Abstract Base Class) + ↓ + ├── OpenRouterProvider (Concrete Implementation) + ├── OllamaProvider (Concrete Implementation) + └── Future providers... (Extensible) + +Factory Function: get_provider(name) → LLMProvider instance +Routing Layer: parse_model_spec() + query_model() + query_models_parallel() +``` + +**Benefits:** +- **Encapsulation**: Provider-specific logic isolated in dedicated classes +- **Extensibility**: Add new providers by implementing `LLMProvider` interface +- **Consistency**: All providers return same response format +- **Testability**: Mock providers easily for testing +- **Performance**: Singleton pattern for provider instances + +### Model Specification Parsing + +The system intelligently routes models to providers: + +1. **Simple Mode** (openrouter or ollama): + - `"llama3.1:8b"` → Uses `LLM_PROVIDER` from config + - No prefix needed, provider determined globally + +2. **Mixed Mode**: + - `"ollama:llama3.1:8b"` → Explicitly routes to Ollama + - `"openrouter:openai/gpt-4"` → Explicitly routes to OpenRouter + - All models MUST have prefix in mixed mode + +3. **Parsing Logic**: + ```python + # Check for explicit prefix + if model_spec.startswith("ollama:"): + return ("ollama", model_spec[7:]) + elif model_spec.startswith("openrouter:"): + return ("openrouter", model_spec[11:]) + + # Fall back to global provider (if not mixed mode) + return (LLM_PROVIDER, model_spec) + ``` + +### Parallel Execution Strategy + +The routing layer optimizes parallel execution: + +1. **Input**: List of model specs (potentially mixed providers) +2. **Grouping**: Group models by provider for batch execution +3. 
**Parallel Provider Queries**: Each provider batch runs concurrently +4. **Result Aggregation**: Map provider results back to original model specs +5. **Output**: Dictionary of all results (including failures as None) + +**Example Flow:** +```python +# Input +model_specs = [ + "ollama:llama3.1:8b", + "ollama:mistral:latest", + "openrouter:openai/gpt-4", + "openrouter:anthropic/claude-3.5-sonnet" +] + +# Grouping +{ + "ollama": ["llama3.1:8b", "mistral:latest"], + "openrouter": ["openai/gpt-4", "anthropic/claude-3.5-sonnet"] +} + +# Execution (concurrent) +await asyncio.gather( + ollama_provider.query_models_parallel([...]), # 2 models + openrouter_provider.query_models_parallel([...]) # 2 models +) + +# Result mapping back to original specs +``` + +### Error Handling Philosophy + +**Graceful Degradation at Every Layer:** + +1. **Provider Level**: + - Individual model failures return None + - Don't propagate exceptions to council logic + - Log errors but continue with successful responses + +2. **Routing Level**: + - Invalid model specs → None result + - Provider instantiation failures → None for affected models + - Parse errors → None result + +3. **Council Level**: + - Continue with successful responses from Stage 1 + - Adapt Stage 2 ranking to available responses + - Stage 3 synthesis uses whatever data is available + +**Rationale**: Better to get partial results than complete failure. The council system is designed to aggregate multiple perspectives, so it's robust to individual model failures. + +## Migration Guide: Single Provider to Multi-Provider + +### For Existing Users (Minimal Changes) + +If you're already using OpenRouter, **no changes required**. The system defaults to `openrouter` mode for backward compatibility. + +**Optional**: Add `LLM_PROVIDER=openrouter` to `.env` for explicitness. + +### Migrating to Ollama (Local) + +1. Install Ollama (see README.md) +2. Pull models: + ```bash + ollama pull llama3.1:8b + ollama pull mistral:latest + ollama pull qwen2.5:3b + ``` +3. Update `.env`: + ```bash + LLM_PROVIDER=ollama + OLLAMA_BASE_URL=http://localhost:11434 + ``` +4. Update `backend/config.py`: + ```python + COUNCIL_MODELS = [ + "llama3.1:8b", + "mistral:latest", + "qwen2.5:3b", + ] + CHAIRMAN_MODEL = "llama3.1:8b" + ``` +5. Restart backend + +### Migrating to Mixed Mode (Hybrid) + +1. Set up both OpenRouter and Ollama +2. Update `.env`: + ```bash + LLM_PROVIDER=mixed + OPENROUTER_API_KEY=sk-or-v1-... + OLLAMA_BASE_URL=http://localhost:11434 + ``` +3. Update `backend/config.py` with prefixes: + ```python + COUNCIL_MODELS = [ + "ollama:llama3.1:8b", # Local, free + "ollama:mistral:latest", # Local, free + "openrouter:google/gemini-2.5-flash-lite", # Cloud, paid + "openrouter:anthropic/claude-3.5-haiku", # Cloud, paid + ] + CHAIRMAN_MODEL = "openrouter:google/gemini-2.5-flash-lite" + ``` +4. Restart backend + +### Adding a New Provider (For Developers) + +To add support for a new LLM provider (e.g., HuggingFace, Anthropic Direct, etc.): + +1. 
Create `backend/providers/your_provider.py`: + ```python + from backend.providers.base import LLMProvider + + class YourProvider(LLMProvider): + def __init__(self, api_key: str): + self.api_key = api_key + + async def query_model(self, model_id, messages, timeout=120.0): + # Implement API call to your provider + # Return {"content": str, "reasoning_details": dict or None} + # Return None on failure + pass + + async def query_models_parallel(self, model_ids, messages): + # Implement parallel queries + # Return {model_id: result_dict} for all models + pass + ``` + +2. Update `backend/providers/__init__.py`: + ```python + from backend.providers.your_provider import YourProvider + + def get_provider(provider_name: str) -> LLMProvider: + if provider_name == "your_provider": + return YourProvider(api_key=YOUR_PROVIDER_API_KEY) + # ... existing code + + def parse_model_spec(model_spec: str): + if model_spec.startswith("your_provider:"): + return ("your_provider", model_spec[14:]) + # ... existing code + ``` + +3. Update `backend/config.py`: + ```python + VALID_PROVIDERS = ["openrouter", "ollama", "mixed", "your_provider"] + YOUR_PROVIDER_API_KEY = os.getenv("YOUR_PROVIDER_API_KEY") + ``` + +4. Test thoroughly with both simple and mixed modes + ## Important Implementation Details ### Relative Imports All backend modules use relative imports (e.g., `from .config import ...`) not absolute imports. This is critical for Python's module system to work correctly when running as `python -m backend.main`. +### Provider Imports in council.py +The core council logic now imports from `backend.providers` instead of `backend.openrouter`: +```python +from backend.providers import query_model, query_models_parallel +``` +This maintains backward compatibility while routing to the appropriate provider. + ### Port Configuration - Backend: 8001 (changed from 8000 to avoid conflict) - Frontend: 5173 (Vite default) diff --git a/README.md b/README.md index 23599b3cf..0b7e9fed4 100644 --- a/README.md +++ b/README.md @@ -32,19 +32,30 @@ npm install cd .. ``` -### 2. Configure API Key +### 2. Choose Your LLM Provider -Create a `.env` file in the project root: +LLM Council supports three provider modes: -```bash -OPENROUTER_API_KEY=sk-or-v1-... -``` +- **OpenRouter** (cloud-based): Access to 100+ models via API, requires paid credits +- **Ollama** (local): Run models on your own hardware, completely free and private +- **Mixed** (hybrid): Combine both providers for cost/quality optimization + +### 3. Provider Configuration + +#### Option A: OpenRouter (Cloud-based) -Get your API key at [openrouter.ai](https://openrouter.ai/). Make sure to purchase the credits you need, or sign up for automatic top up. +**1. Get API Key** -### 3. Configure Models (Optional) +Sign up at [openrouter.ai](https://openrouter.ai/) and get your API key. Make sure to purchase credits or enable automatic top-up. -Edit `backend/config.py` to customize the council: +**2. Create `.env` file** + +```bash +LLM_PROVIDER=openrouter +OPENROUTER_API_KEY=sk-or-v1-your-actual-key-here +``` + +**3. Configure models in `backend/config.py`** (optional) ```python COUNCIL_MODELS = [ @@ -57,6 +68,97 @@ COUNCIL_MODELS = [ CHAIRMAN_MODEL = "google/gemini-3-pro-preview" ``` +See available models at [openrouter.ai/models](https://openrouter.ai/models) + +#### Option B: Ollama (Local) + +**1. Install Ollama** + +```bash +# Linux & Mac +curl -fsSL https://ollama.com/install.sh | sh + +# Windows +# Download from https://ollama.com/download +``` + +**2. 
Start Ollama service** + +```bash +# Ollama typically starts automatically after installation +# To verify it's running: +curl http://localhost:11434/api/version +``` + +**3. Pull models** + +```bash +# Download the models you want to use +ollama pull llama3.1:latest +ollama pull mistral:latest +ollama pull qwen2.5:3b +ollama pull phi3:latest +``` + +See available models at [ollama.com/library](https://ollama.com/library) + +**4. Create `.env` file** + +```bash +LLM_PROVIDER=ollama +OLLAMA_BASE_URL=http://localhost:11434 +``` + +**5. Configure models in `backend/config.py`** (optional) + +```python +COUNCIL_MODELS = [ + "llama3.1:8b", + "mistral:latest", + "qwen2.5:3b", + "phi3:latest", +] + +CHAIRMAN_MODEL = "llama3.1:8b" +``` + +#### Option C: Mixed (Hybrid) + +Combine local and cloud models for cost optimization. + +**1. Set up both providers** + +Follow installation steps for both OpenRouter and Ollama above. + +**2. Create `.env` file** + +```bash +LLM_PROVIDER=mixed +OPENROUTER_API_KEY=sk-or-v1-your-actual-key-here +OLLAMA_BASE_URL=http://localhost:11434 +``` + +**3. Configure models in `backend/config.py`** (optional) + +Prefix each model with its provider: + +```python +COUNCIL_MODELS = [ + "ollama:llama3.1:8b", # Local, fast, free + "ollama:mistral:latest", # Local, fast, free + "openrouter:google/gemini-2.5-flash-lite", # Cloud, paid + "openrouter:anthropic/claude-3.5-haiku", # Cloud, paid +] + +CHAIRMAN_MODEL = "openrouter:google/gemini-2.5-flash-lite" +``` + +**Benefits of mixed mode:** +- Use free local models for council deliberation +- Use premium cloud models for final synthesis +- Optimize cost vs quality trade-offs +- Keep sensitive data local while using cloud for general queries + ## Running the Application **Option 1: Use the start script** @@ -81,7 +183,7 @@ Then open http://localhost:5173 in your browser. ## Tech Stack -- **Backend:** FastAPI (Python 3.10+), async httpx, OpenRouter API +- **Backend:** FastAPI (Python 3.10+), async httpx, multi-provider support (OpenRouter, Ollama) - **Frontend:** React + Vite, react-markdown for rendering - **Storage:** JSON files in `data/conversations/` - **Package Management:** uv for Python, npm for JavaScript diff --git a/backend/config.py b/backend/config.py index a9cf7c473..c202b4caa 100644 --- a/backend/config.py +++ b/backend/config.py @@ -1,23 +1,74 @@ -"""Configuration for the LLM Council.""" +"""Configuration for the LLM Council. + +Supports three provider modes: +1. 'openrouter': All models use OpenRouter API (default) +2. 'ollama': All models use local Ollama server +3. 'mixed': Prefix each model with provider (e.g., 'ollama:llama3.1:8b', 'openrouter:google/gemini-2.5-flash-lite') +""" import os from dotenv import load_dotenv load_dotenv() -# OpenRouter API key +# LLM Provider configuration +# Valid values: 'openrouter', 'ollama', 'mixed' +# Default: 'openrouter' for backward compatibility +LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openrouter") +print(f"LLM_PROVIDER set to: {LLM_PROVIDER}") +# Validate LLM_PROVIDER +VALID_PROVIDERS = ["openrouter", "ollama", "mixed"] +if LLM_PROVIDER not in VALID_PROVIDERS: + raise ValueError( + f"Invalid LLM_PROVIDER '{LLM_PROVIDER}'. 
Must be one of {VALID_PROVIDERS}" + ) + +# OpenRouter API key (required for 'openrouter' and 'mixed' modes) OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") -# Council members - list of OpenRouter model identifiers -COUNCIL_MODELS = [ - "openai/gpt-5.1", - "google/gemini-3-pro-preview", - "anthropic/claude-sonnet-4.5", - "x-ai/grok-4", -] +# Ollama base URL (required for 'ollama' and 'mixed' modes) +# Default: http://localhost:11434 +OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434") + +# Council members - adapts based on LLM_PROVIDER +# +# Examples: +# - openrouter mode: ["openai/gpt-5.1", "google/gemini-3-pro-preview"] +# - ollama mode: ["llama3.1:8b", "mistral:latest", "qwen2.5:3b"] +# - mixed mode: ["ollama:llama3.1:8b", "openrouter:google/gemini-2.5-flash-lite"] +if LLM_PROVIDER == "openrouter": + # Original OpenRouter configuration (backward compatible) + COUNCIL_MODELS = [ + "openai/gpt-5.1", + "google/gemini-3-pro-preview", + "anthropic/claude-sonnet-4.5", + "x-ai/grok-4", + ] + # Original OpenRouter configuration (backward compatible) + CHAIRMAN_MODEL = "google/gemini-3-pro-preview" + TITLE_MODEL = "google/gemini-2.5-flash-lite" +elif LLM_PROVIDER == "ollama": + # Local Ollama models + COUNCIL_MODELS = [ + "mistral:latest", + "qwen2.5:3b", + "phi3:latest", + ] + # Local Ollama chairman + CHAIRMAN_MODEL = "mistral:latest" + TITLE_MODEL = "phi3:latest" +else: # mixed mode + # Mixed provider configuration with explicit prefixes + COUNCIL_MODELS = [ + "ollama:phi3:latest", + "ollama:mistral:latest", + "openrouter:google/gemini-3-pro-preview", + "openrouter:anthropic/claude-3.5-haiku", + ] + # Mixed mode chairman (can use either provider) + CHAIRMAN_MODEL = "openrouter:google/gemini-3-pro-preview" + TITLE_MODEL = "openrouter:google/gemini-2.5-flash-lite" -# Chairman model - synthesizes final response -CHAIRMAN_MODEL = "google/gemini-3-pro-preview" # OpenRouter API endpoint OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions" diff --git a/backend/council.py b/backend/council.py index 5069abec9..19b2c7d1c 100644 --- a/backend/council.py +++ b/backend/council.py @@ -1,8 +1,8 @@ """3-stage LLM Council orchestration.""" from typing import List, Dict, Any, Tuple -from .openrouter import query_models_parallel, query_model -from .config import COUNCIL_MODELS, CHAIRMAN_MODEL +from .providers import query_models_parallel, query_model +from .config import COUNCIL_MODELS, CHAIRMAN_MODEL, TITLE_MODEL async def stage1_collect_responses(user_query: str) -> List[Dict[str, Any]]: @@ -95,6 +95,7 @@ async def stage2_collect_rankings( messages = [{"role": "user", "content": ranking_prompt}] # Get rankings from all council models in parallel + print(f"Stage 2: Querying council models for rankings...{COUNCIL_MODELS}") responses = await query_models_parallel(COUNCIL_MODELS, messages) # Format results @@ -274,8 +275,8 @@ async def generate_conversation_title(user_query: str) -> str: messages = [{"role": "user", "content": title_prompt}] - # Use gemini-2.5-flash for title generation (fast and cheap) - response = await query_model("google/gemini-2.5-flash", messages, timeout=30.0) + # Use TITLE_MODEL for title generation (fast and cheap) + response = await query_model(TITLE_MODEL, messages, timeout=30.0) if response is None: # Fallback to a generic title diff --git a/backend/providers/__init__.py b/backend/providers/__init__.py new file mode 100644 index 000000000..be573e97d --- /dev/null +++ b/backend/providers/__init__.py @@ -0,0 +1,286 @@ +"""LLM Provider 
abstractions for the council system. + +This module provides a factory pattern for provider instantiation and intelligent +routing logic for both simple and mixed provider configurations. + +Key Functions: + - get_provider(provider_name): Get singleton provider instance + - parse_model_spec(model_spec): Parse model spec into (provider, model) tuple + - query_model(): Route single query to appropriate provider + - query_models_parallel(): Route parallel queries with cross-provider support +""" + +import asyncio +from typing import Any, Dict, List, Optional, Tuple + +from backend.config import LLM_PROVIDER, OPENROUTER_API_KEY, OLLAMA_BASE_URL +from backend.providers.base import LLMProvider +from backend.providers.ollama import OllamaProvider +from backend.providers.openrouter import OpenRouterProvider + + +# Singleton provider instances +_provider_instances: Dict[str, LLMProvider] = {} + + +def get_provider(provider_name: str) -> LLMProvider: + """Get a singleton instance of the specified provider. + + Args: + provider_name: Provider name ("openrouter" or "ollama") + + Returns: + LLMProvider instance for the specified provider + + Raises: + ValueError: If provider_name is not "openrouter" or "ollama" + + Example: + provider = get_provider("ollama") + result = await provider.query_model("llama3.1:8b", messages) + """ + if provider_name not in ["openrouter", "ollama"]: + raise ValueError( + f"Invalid provider name '{provider_name}'. Must be 'openrouter' or 'ollama'" + ) + + # Return existing instance if already created + if provider_name in _provider_instances: + return _provider_instances[provider_name] + + # Create new instance based on provider type + if provider_name == "openrouter": + if not OPENROUTER_API_KEY: + raise ValueError( + "OPENROUTER_API_KEY is required for openrouter provider. " + "Set it in your .env file." + ) + _provider_instances[provider_name] = OpenRouterProvider( + api_key=OPENROUTER_API_KEY + ) + else: # ollama + _provider_instances[provider_name] = OllamaProvider( + base_url=OLLAMA_BASE_URL + ) + + return _provider_instances[provider_name] + + +def parse_model_spec(model_spec: str) -> Tuple[str, str]: + """Parse a model specification into (provider, model) tuple. + + Handles two formats: + 1. Simple mode: "model_name" -> uses LLM_PROVIDER from config + 2. 
Mixed mode: "provider:model_name" -> explicit provider + + Args: + model_spec: Model specification string + - Simple: "llama3.1:8b" or "openai/gpt-4o" + - Mixed: "ollama:llama3.1:8b" or "openrouter:openai/gpt-4o" + + Returns: + Tuple of (provider_name, model_identifier) + - provider_name: "openrouter" or "ollama" + - model_identifier: The model name to pass to the provider + + Raises: + ValueError: If mixed mode format is invalid or provider is unknown + + Examples: + # Simple mode (uses LLM_PROVIDER from config) + parse_model_spec("llama3.1:8b") -> ("ollama", "llama3.1:8b") + parse_model_spec("openai/gpt-4o") -> ("openrouter", "openai/gpt-4o") + + # Mixed mode (explicit provider prefix) + parse_model_spec("ollama:llama3.1:8b") -> ("ollama", "llama3.1:8b") + parse_model_spec("openrouter:openai/gpt-4o") -> ("openrouter", "openai/gpt-4o") + """ + # Check if this is mixed mode format (provider:model) + if model_spec.startswith("ollama:"): + return ("ollama", model_spec[7:]) # Remove "ollama:" prefix + elif model_spec.startswith("openrouter:"): + return ("openrouter", model_spec[11:]) # Remove "openrouter:" prefix + + # Simple mode - use global LLM_PROVIDER setting + if LLM_PROVIDER == "mixed": + raise ValueError( + f"In mixed mode, model spec '{model_spec}' must include provider prefix " + "(e.g., 'ollama:llama3.1:8b' or 'openrouter:openai/gpt-4o')" + ) + + return (LLM_PROVIDER, model_spec) + + +async def query_model( + model_spec: str, + messages: List[Dict[str, str]], + timeout: float = 120.0 +) -> Optional[Dict[str, Any]]: + """Query a single model, routing to the appropriate provider. + + This is the main routing function for single model queries. It parses the + model specification to determine the provider, gets the provider instance, + and routes the query accordingly. + + Args: + model_spec: Model specification string (see parse_model_spec for format) + messages: List of message dicts with 'role' and 'content' keys + timeout: Maximum time in seconds to wait for response (default: 120.0) + + Returns: + Dictionary with 'content' and optional 'reasoning_details' on success, + None on any failure (provider error, network error, timeout, etc.) + + Example: + # Simple mode + result = await query_model( + "llama3.1:8b", + [{"role": "user", "content": "Hello"}] + ) + + # Mixed mode + result = await query_model( + "openrouter:openai/gpt-4o", + [{"role": "user", "content": "Hello"}] + ) + + Notes: + - Maintains backward compatibility with original openrouter.py API + - Returns same format as original implementation + - Handles all errors gracefully, returns None on failure + """ + try: + provider_name, model_id = parse_model_spec(model_spec) + provider = get_provider(provider_name) + return await provider.query_model(model_id, messages, timeout) + except ValueError as e: + print(f"Error parsing model spec '{model_spec}': {e}") + return None + except Exception as e: + print(f"Unexpected error querying model '{model_spec}': {e}") + return None + + +async def query_models_parallel( + model_specs: List[str], + messages: List[Dict[str, str]] +) -> Dict[str, Optional[Dict[str, Any]]]: + """Query multiple models in parallel, with cross-provider support. + + This function intelligently routes queries to their respective providers and + executes them in parallel. For efficiency, it groups queries by provider to + leverage each provider's native parallel execution capabilities. 
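+
+    Invalid model specifications and provider initialization failures are
+    recorded as None entries in the result rather than raised, so a single
+    bad entry cannot abort the whole batch.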
+ + Args: + model_specs: List of model specification strings (see parse_model_spec) + messages: List of message dictionaries to send to all models + + Returns: + Dictionary mapping model specifications to their responses. + Successful queries return the standard response dict. + Failed queries have None as their value. + + Example: + # Mixed provider parallel queries + results = await query_models_parallel( + [ + "ollama:llama3.1:8b", + "ollama:mistral:latest", + "openrouter:openai/gpt-4o", + "openrouter:anthropic/claude-3.5-sonnet" + ], + [{"role": "user", "content": "Hello"}] + ) + # results = { + # "ollama:llama3.1:8b": {"content": "Hi!", "reasoning_details": None}, + # "ollama:mistral:latest": None, # Failed + # "openrouter:openai/gpt-4o": {"content": "Hello!", "reasoning_details": {...}}, + # "openrouter:anthropic/claude-3.5-sonnet": {"content": "Hi!", "reasoning_details": None} + # } + + Notes: + - Groups queries by provider for efficient execution + - All queries execute concurrently (both within and across providers) + - Individual model failures do not affect other queries + - Returns entry for every model in the input list + - Maintains backward compatibility with original openrouter.py API + """ + # Group models by provider for efficient parallel execution + provider_groups: Dict[str, List[Tuple[str, str]]] = {} # provider -> [(spec, model_id)] + + for model_spec in model_specs: + try: + provider_name, model_id = parse_model_spec(model_spec) + if provider_name not in provider_groups: + provider_groups[provider_name] = [] + provider_groups[provider_name].append((model_spec, model_id)) + except ValueError as e: + print(f"Error parsing model spec '{model_spec}': {e}") + # Add None entry for invalid specs + provider_groups.setdefault("_invalid", []).append((model_spec, None)) + + # Execute queries grouped by provider in parallel + all_results = {} + + # Create tasks for each provider group + provider_tasks = [] + for provider_name, specs_and_models in provider_groups.items(): + if provider_name == "_invalid": + # Handle invalid specs - add None entries + for model_spec, _ in specs_and_models: + all_results[model_spec] = None + continue + + # Get provider instance + try: + provider = get_provider(provider_name) + except ValueError as e: + print(f"Error getting provider '{provider_name}': {e}") + # Mark all models for this provider as failed + for model_spec, _ in specs_and_models: + all_results[model_spec] = None + continue + + # Extract just the model IDs for this provider + model_ids = [model_id for _, model_id in specs_and_models] + + # Create task for this provider's batch query + async def query_provider_batch(prov, model_ids_list, specs_list): + """Helper to query a provider's models and map back to original specs.""" + results = await prov.query_models_parallel(model_ids_list, messages) + # Map back from model_id to original model_spec + return { + spec: results[model_id] + for spec, model_id in specs_list + } + + provider_tasks.append( + query_provider_batch(provider, model_ids, specs_and_models) + ) + + # Wait for all provider batches to complete + if provider_tasks: + provider_results = await asyncio.gather(*provider_tasks) + + # Merge all results + for result_dict in provider_results: + all_results.update(result_dict) + + # Ensure we have an entry for every input model spec + for model_spec in model_specs: + if model_spec not in all_results: + all_results[model_spec] = None + + return all_results + + +# Export public API +__all__ = [ + 'LLMProvider', + 'OllamaProvider', + 
'OpenRouterProvider', + 'get_provider', + 'parse_model_spec', + 'query_model', + 'query_models_parallel', +] diff --git a/backend/providers/base.py b/backend/providers/base.py new file mode 100644 index 000000000..967c52d68 --- /dev/null +++ b/backend/providers/base.py @@ -0,0 +1,109 @@ +"""Abstract base class for LLM providers. + +This module defines the interface that all LLM provider implementations must follow. +""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional + + +class LLMProvider(ABC): + """Abstract base class for LLM providers. + + This class defines the interface contract that all LLM provider implementations + must follow. Providers are responsible for communicating with their respective + LLM APIs and handling errors gracefully. + + Interface Contract: + All methods should handle errors internally and return None on failure + rather than raising exceptions. This allows the system to continue + operating even when individual providers or models fail. + + Return Format: + All query methods must return a dictionary with the following structure: + { + "content": str, # The main response text from the model + "reasoning_details": Optional[Any] # Optional reasoning trace or metadata + } + + Returns None if the query fails for any reason (network error, API error, + timeout, invalid model, etc.). + + Error Handling: + Implementations should: + - Catch all exceptions internally + - Log errors appropriately + - Return None on any failure + - Never raise exceptions to callers + - Implement appropriate timeout handling + """ + + @abstractmethod + async def query_model( + self, + model: str, + messages: List[Dict[str, str]], + timeout: float = 120.0 + ) -> Optional[Dict[str, Any]]: + """Query a single model with the given messages. + + Args: + model: Model identifier (format depends on provider implementation) + messages: List of message dictionaries with 'role' and 'content' keys + following the standard chat completion format: + [{"role": "user", "content": "..."}, + {"role": "assistant", "content": "..."}, ...] + timeout: Maximum time in seconds to wait for response (default: 120.0) + + Returns: + Dictionary with 'content' and optional 'reasoning_details' on success, + None on any failure (timeout, network error, API error, etc.) + + Example: + result = await provider.query_model( + "gpt-4", + [{"role": "user", "content": "Hello"}] + ) + if result: + print(result["content"]) + """ + pass + + @abstractmethod + async def query_models_parallel( + self, + models: List[str], + messages: List[Dict[str, str]] + ) -> Dict[str, Optional[Dict[str, Any]]]: + """Query multiple models in parallel with the same messages. + + This method should execute all queries concurrently to minimize total + latency. Each model query is independent and failures should not affect + other queries. + + Args: + models: List of model identifiers to query + messages: List of message dictionaries to send to all models + (same format as query_model) + + Returns: + Dictionary mapping model identifiers to their responses. + Successful queries return the standard response dict. + Failed queries have None as their value. 
+ + Example: + results = await provider.query_models_parallel( + ["gpt-4", "claude-3"], + [{"role": "user", "content": "Hello"}] + ) + # results = { + # "gpt-4": {"content": "Hi there!", "reasoning_details": None}, + # "claude-3": None # This query failed + # } + + Notes: + - All queries execute concurrently using asyncio.gather or similar + - Individual model failures should not cause the entire operation to fail + - The returned dict should contain an entry for every model in the input list + """ + pass diff --git a/backend/providers/ollama.py b/backend/providers/ollama.py new file mode 100644 index 000000000..8f2fad443 --- /dev/null +++ b/backend/providers/ollama.py @@ -0,0 +1,185 @@ +"""Ollama provider for local LLM instances. + +This module implements the LLM provider interface for Ollama, which runs +models locally. Ollama exposes an OpenAI-compatible API endpoint, making +integration straightforward. + +Key Features: + - Uses OpenAI-compatible /v1/chat/completions endpoint + - No authentication required (local instance) + - Supports Ollama model tags (e.g., llama3.1:8b, mistral:latest) + - Graceful handling of connection errors and missing models + - Parallel query support via asyncio +""" + +import asyncio +import logging +from typing import Any, Dict, List, Optional + +import httpx + +from .base import LLMProvider + +# Configure logging +logger = logging.getLogger(__name__) + + +class OllamaProvider(LLMProvider): + """Provider implementation for local Ollama instances. + + Ollama is a tool for running LLMs locally. It exposes an OpenAI-compatible + API endpoint, which this provider uses for communication. + + Attributes: + base_url: Base URL for the Ollama instance (default: http://localhost:11434) + + Example: + provider = OllamaProvider() + result = await provider.query_model( + "llama3.1:8b", + [{"role": "user", "content": "Hello"}] + ) + + Common Error Scenarios: + - Connection refused: Ollama service not running + - 404 Not Found: Model not pulled/available locally + - Timeout: Model loading or generation taking too long + - Network errors: Local network issues + """ + + def __init__(self, base_url: str = 'http://localhost:11434'): + """Initialize the Ollama provider. + + Args: + base_url: Base URL for the Ollama instance. Should not include + trailing slash. Default is http://localhost:11434 + """ + self.base_url = base_url.rstrip('/') + self.api_endpoint = f"{self.base_url}/v1/chat/completions" + + async def query_model( + self, + model: str, + messages: List[Dict[str, str]], + timeout: float = 120.0 + ) -> Optional[Dict[str, Any]]: + """Query a single Ollama model with the given messages. + + This method sends a request to the local Ollama instance using the + OpenAI-compatible chat completions endpoint. + + Args: + model: Ollama model tag (e.g., "llama3.1:8b", "mistral:latest") + messages: List of message dictionaries with 'role' and 'content' keys + timeout: Maximum time in seconds to wait for response (default: 120.0) + + Returns: + Dictionary with 'content' and 'reasoning_details' on success, + None on any failure. 
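+
+            The 'reasoning_details' field is always None here, since the
+            Ollama chat completions endpoint does not return a reasoning trace.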
+ + Error Handling: + - Connection errors → None (logs warning about Ollama not running) + - 404 errors → None (logs warning about model not found) + - Timeouts → None (logs warning about timeout) + - Any other errors → None (logs error details) + """ + headers = { + "Content-Type": "application/json", + } + + payload = { + "model": model, + "messages": messages, + } + + try: + async with httpx.AsyncClient(timeout=timeout) as client: + response = await client.post( + self.api_endpoint, + headers=headers, + json=payload + ) + response.raise_for_status() + + data = response.json() + message = data['choices'][0]['message'] + + return { + 'content': message.get('content'), + 'reasoning_details': None # Ollama doesn't provide reasoning details + } + + except httpx.ConnectError as e: + logger.warning( + f"Failed to connect to Ollama at {self.base_url}. " + f"Is Ollama running? Error: {e}" + ) + return None + + except httpx.HTTPStatusError as e: + if e.response.status_code == 404: + logger.warning( + f"Model '{model}' not found in Ollama. " + f"Pull it with: ollama pull {model}" + ) + else: + logger.error( + f"HTTP error querying Ollama model {model}: " + f"Status {e.response.status_code}, {e}" + ) + return None + + except httpx.TimeoutException as e: + logger.warning( + f"Timeout querying Ollama model {model} after {timeout}s. " + f"Model may be loading or generation is slow. Error: {e}" + ) + return None + + except Exception as e: + logger.error(f"Unexpected error querying Ollama model {model}: {e}") + return None + + async def query_models_parallel( + self, + models: List[str], + messages: List[Dict[str, str]] + ) -> Dict[str, Optional[Dict[str, Any]]]: + """Query multiple Ollama models in parallel with the same messages. + + This method executes all queries concurrently to minimize total latency. + Each model query is independent, and failures do not affect other queries. + + Args: + models: List of Ollama model tags to query + messages: List of message dictionaries to send to all models + + Returns: + Dictionary mapping model tags to their responses. + Successful queries return the standard response dict. + Failed queries have None as their value. + + Example: + results = await provider.query_models_parallel( + ["llama3.1:8b", "mistral:latest"], + [{"role": "user", "content": "Hello"}] + ) + # results = { + # "llama3.1:8b": {"content": "Hi!", "reasoning_details": None}, + # "mistral:latest": None # This query failed + # } + + Notes: + - All queries execute concurrently using asyncio.gather + - Individual model failures do not cause the entire operation to fail + - The returned dict contains an entry for every model in the input list + """ + # Create tasks for all models + tasks = [self.query_model(model, messages) for model in models] + + # Wait for all to complete (return_exceptions=False means gather will + # not raise, but our query_model already catches all exceptions) + responses = await asyncio.gather(*tasks) + + # Map models to their responses + return {model: response for model, response in zip(models, responses)} diff --git a/backend/providers/openrouter.py b/backend/providers/openrouter.py new file mode 100644 index 000000000..24146f703 --- /dev/null +++ b/backend/providers/openrouter.py @@ -0,0 +1,125 @@ +"""OpenRouter LLM provider implementation. + +This module implements the LLMProvider interface for OpenRouter API, +preserving the exact behavior of the original openrouter.py implementation. 
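+
+Key Features:
+    - Bearer-token authentication against the OpenRouter chat completions endpoint
+    - Passes through optional 'reasoning_details' when a model returns them
+    - Graceful degradation: failed requests return None instead of raising
+    - Parallel query support via asyncio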
+""" + +import asyncio +import httpx +from typing import Any, Dict, List, Optional + +from backend.providers.base import LLMProvider + + +class OpenRouterProvider(LLMProvider): + """OpenRouter API provider implementation. + + This provider communicates with OpenRouter's API to query various LLM models. + It implements graceful degradation - failed requests return None without raising + exceptions, allowing the system to continue with successful responses. + + Args: + api_key: OpenRouter API key for authentication + api_url: OpenRouter API endpoint URL (default: https://openrouter.ai/api/v1/chat/completions) + + Example: + provider = OpenRouterProvider(api_key="sk-...") + result = await provider.query_model( + "openai/gpt-4o", + [{"role": "user", "content": "Hello"}] + ) + """ + + def __init__( + self, + api_key: str, + api_url: str = "https://openrouter.ai/api/v1/chat/completions" + ): + """Initialize the OpenRouter provider. + + Args: + api_key: OpenRouter API key for authentication + api_url: OpenRouter API endpoint URL (default: https://openrouter.ai/api/v1/chat/completions) + """ + self.api_key = api_key + self.api_url = api_url + + async def query_model( + self, + model: str, + messages: List[Dict[str, str]], + timeout: float = 120.0 + ) -> Optional[Dict[str, Any]]: + """Query a single model via OpenRouter API. + + Args: + model: OpenRouter model identifier (e.g., "openai/gpt-4o") + messages: List of message dicts with 'role' and 'content' + timeout: Request timeout in seconds (default: 120.0) + + Returns: + Response dict with 'content' and optional 'reasoning_details', or None if failed + + Notes: + - Handles all errors internally, returns None on failure + - Preserves exact behavior from original implementation + - Prints error messages to console for debugging + """ + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + payload = { + "model": model, + "messages": messages, + } + + try: + async with httpx.AsyncClient(timeout=timeout) as client: + response = await client.post( + self.api_url, + headers=headers, + json=payload + ) + response.raise_for_status() + + data = response.json() + message = data['choices'][0]['message'] + + return { + 'content': message.get('content'), + 'reasoning_details': message.get('reasoning_details') + } + + except Exception as e: + print(f"Error querying model {model}: {e}") + return None + + async def query_models_parallel( + self, + models: List[str], + messages: List[Dict[str, str]] + ) -> Dict[str, Optional[Dict[str, Any]]]: + """Query multiple models in parallel. 
+ + Args: + models: List of OpenRouter model identifiers + messages: List of message dicts to send to each model + + Returns: + Dict mapping model identifier to response dict (or None if failed) + + Notes: + - Uses asyncio.gather for concurrent execution + - Individual model failures don't affect other queries + - Returns entry for every model in input list + """ + # Create tasks for all models + tasks = [self.query_model(model, messages) for model in models] + + # Wait for all to complete + responses = await asyncio.gather(*tasks) + + # Map models to their responses + return {model: response for model, response in zip(models, responses)} diff --git a/frontend/package-lock.json b/frontend/package-lock.json index a6a7c3430..adc26cc05 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -2485,9 +2485,10 @@ } }, "node_modules/mdast-util-to-hast": { - "version": "13.2.0", - "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.0.tgz", - "integrity": "sha512-QGYKEuUsYT9ykKBCMOEDLsU5JRObWQusAolFMeko/tYPufNkRffBAQjIE+99jbA87xv6FgmjLtwjh9wBWajwAA==", + "version": "13.2.1", + "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-13.2.1.tgz", + "integrity": "sha512-cctsq2wp5vTsLIcaymblUriiTcZd0CwWtCbLvrOzYCDZoWyMNV8sZ7krj09FSnsiJi3WVsHLM4k6Dq/yaPyCXA==", + "license": "MIT", "dependencies": { "@types/hast": "^3.0.0", "@types/mdast": "^4.0.0",