From 12121b06c371353725b1a8b29c2d530e4b96499d Mon Sep 17 00:00:00 2001 From: FlynnCruse Date: Wed, 3 Dec 2025 16:14:40 -0500 Subject: [PATCH 1/5] added local model provisions and resource guard --- README.md | 55 +++++++++++++ backend/config.py | 50 ++++++++++++ backend/council.py | 38 +++++++-- backend/local.py | 178 ++++++++++++++++++++++++++++++++++++++++++ backend/openrouter.py | 23 ++++-- pyproject.toml | 1 + 6 files changed, 330 insertions(+), 15 deletions(-) create mode 100644 backend/local.py diff --git a/README.md b/README.md index 23599b3cf..9b8e074bb 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,61 @@ npm run dev Then open http://localhost:5173 in your browser. +## Run with Local Models (Ollama) + +You can run the council against local, open‑source models you installed with Ollama (e.g. `nemotron`, `nemotron9b`, `nemotron12b`). + +1) Ensure the Ollama daemon is running and models are installed: +```bash +ollama list +``` + +2) Option A — Switch the backend to local mode (with resource guard). Either set env vars: +```bash +export COUNCIL_PROVIDER=local +export LOCAL_MODELS="nemotron,nemotron9b" # customize as you like +# optional: +# export CHAIRMAN_LOCAL_MODEL=nemotron12b +# export COUNCIL_MAX_PARALLEL_LOCAL=2 # hard cap concurrent runs +# export COUNCIL_MEM_RESERVE_GB=6 # keep RAM reserved for OS/apps +``` +…or edit `backend/config.py` accordingly. + +3) Start the backend as usual: +```bash +uv run python -m backend.main +``` + +### Safeguards to avoid crashes +- Adaptive memory guard estimates each model’s memory weight (from `ollama /api/tags` or safe defaults) and keeps the total under a computed budget (≈ 60% of RAM or total minus reserve). +- Optional hard cap: set `COUNCIL_MAX_PARALLEL_LOCAL` to limit parallel local runs. +- Per-request timeout for local calls (default 180s) prevents hung requests. + +This lets you run multiple efficient models in parallel while avoiding overload on machines with limited memory. + +## 4. Custom Self‑Hosted Models (Optional) + +Alternatively, you can point the council at any OpenAI‑compatible endpoint (e.g., vLLM, Ollama’s OpenAI proxy) without enabling the local resource guard: + +In `backend/config.py`: + +```python +CUSTOM_MODELS = { + "ollama/llama3": { + "api_url": "http://localhost:11434/v1/chat/completions", + "api_key": "ollama" # Optional, defaults to "custom" + } +} + +# Don't forget to add it to the council list! +COUNCIL_MODELS = [ + # ... other models + "ollama/llama3", +] +``` + +With this setup, the backend will route those IDs to the specified endpoints via the same OpenRouter client code path (no adaptive guard). See related discussion and example in the PR for custom models. + ## Tech Stack - **Backend:** FastAPI (Python 3.10+), async httpx, OpenRouter API diff --git a/backend/config.py b/backend/config.py index a9cf7c473..755290ef3 100644 --- a/backend/config.py +++ b/backend/config.py @@ -8,6 +8,9 @@ # OpenRouter API key OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") +# Provider selection: "openrouter" (default) or "local" +PROVIDER = os.getenv("COUNCIL_PROVIDER", "openrouter").strip().lower() + # Council members - list of OpenRouter model identifiers COUNCIL_MODELS = [ "openai/gpt-5.1", @@ -16,6 +19,17 @@ "x-ai/grok-4", ] +# Custom models configuration (OpenAI-compatible endpoints) +# Format: { "model_id": { "api_url": "...", "api_key": "..." 
} } +# Example for a local Ollama OpenAI-compatible endpoint: +# CUSTOM_MODELS = { +# "ollama/llama3": { +# "api_url": "http://localhost:11434/v1/chat/completions", +# "api_key": "ollama" # often ignored by local servers but required by some clients +# } +# } +CUSTOM_MODELS = {} + # Chairman model - synthesizes final response CHAIRMAN_MODEL = "google/gemini-3-pro-preview" @@ -24,3 +38,39 @@ # Data directory for conversation storage DATA_DIR = "data/conversations" + +# ---------- Local (Ollama) settings ---------- +# Base URL for the Ollama daemon +OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").strip() + +# Comma-separated list of local model names (e.g. "nemotron,nemotron9b,nemotron12b") +_local_models_env = os.getenv("LOCAL_MODELS") +if _local_models_env: + LOCAL_MODELS = [m.strip() for m in _local_models_env.split(",") if m.strip()] +else: + # Defaults based on models you installed with Ollama + LOCAL_MODELS = ["nemotron", "nemotron9b", "nemotron12b"] + +# Chairman model to use when PROVIDER == "local" +CHAIRMAN_LOCAL_MODEL = os.getenv("CHAIRMAN_LOCAL_MODEL") +if not CHAIRMAN_LOCAL_MODEL: + if "nemotron12b" in LOCAL_MODELS: + CHAIRMAN_LOCAL_MODEL = "nemotron12b" + elif LOCAL_MODELS: + CHAIRMAN_LOCAL_MODEL = LOCAL_MODELS[0] + else: + CHAIRMAN_LOCAL_MODEL = "nemotron" + +# Keep at least this much RAM free for the OS/apps (in GiB) +COUNCIL_MEM_RESERVE_GB = float(os.getenv("COUNCIL_MEM_RESERVE_GB", "6")) + +# Hard cap on number of concurrent local model runs (optional). +# If unset, adaptive guard uses memory budget alone. +COUNCIL_MAX_PARALLEL_LOCAL = os.getenv("COUNCIL_MAX_PARALLEL_LOCAL") +COUNCIL_MAX_PARALLEL_LOCAL = int(COUNCIL_MAX_PARALLEL_LOCAL) if COUNCIL_MAX_PARALLEL_LOCAL else None + +# Enable/disable adaptive resource guard +COUNCIL_ADAPTIVE_RESOURCE_GUARD = os.getenv("COUNCIL_ADAPTIVE_RESOURCE_GUARD", "1").strip().lower() in {"1", "true", "yes", "on"} + +# Per-call timeout (seconds) for local model requests +COUNCIL_LOCAL_TIMEOUT_SEC = float(os.getenv("COUNCIL_LOCAL_TIMEOUT_SEC", "180")) diff --git a/backend/council.py b/backend/council.py index 5069abec9..9315f7cb7 100644 --- a/backend/council.py +++ b/backend/council.py @@ -1,8 +1,15 @@ """3-stage LLM Council orchestration.""" from typing import List, Dict, Any, Tuple -from .openrouter import query_models_parallel, query_model -from .config import COUNCIL_MODELS, CHAIRMAN_MODEL +from .openrouter import query_models_parallel as query_openrouter_parallel, query_model as query_openrouter_model +from .local import query_models_parallel as query_local_parallel, query_model as query_local_model +from .config import ( + COUNCIL_MODELS, + CHAIRMAN_MODEL, + PROVIDER, + LOCAL_MODELS, + CHAIRMAN_LOCAL_MODEL, +) async def stage1_collect_responses(user_query: str) -> List[Dict[str, Any]]: @@ -17,8 +24,13 @@ async def stage1_collect_responses(user_query: str) -> List[Dict[str, Any]]: """ messages = [{"role": "user", "content": user_query}] - # Query all models in parallel - responses = await query_models_parallel(COUNCIL_MODELS, messages) + # Decide provider and model set + if PROVIDER == "local": + model_list = LOCAL_MODELS + responses = await query_local_parallel(model_list, messages) + else: + model_list = COUNCIL_MODELS + responses = await query_openrouter_parallel(model_list, messages) # Format results stage1_results = [] @@ -95,7 +107,12 @@ async def stage2_collect_rankings( messages = [{"role": "user", "content": ranking_prompt}] # Get rankings from all council models in parallel - responses = await 
query_models_parallel(COUNCIL_MODELS, messages) + if PROVIDER == "local": + model_list = LOCAL_MODELS + responses = await query_local_parallel(model_list, messages) + else: + model_list = COUNCIL_MODELS + responses = await query_openrouter_parallel(model_list, messages) # Format results stage2_results = [] @@ -159,17 +176,22 @@ async def stage3_synthesize_final( messages = [{"role": "user", "content": chairman_prompt}] # Query the chairman model - response = await query_model(CHAIRMAN_MODEL, messages) + if PROVIDER == "local": + response = await query_local_model(CHAIRMAN_LOCAL_MODEL, messages) + chairman_name = CHAIRMAN_LOCAL_MODEL + else: + response = await query_openrouter_model(CHAIRMAN_MODEL, messages) + chairman_name = CHAIRMAN_MODEL if response is None: # Fallback if chairman fails return { - "model": CHAIRMAN_MODEL, + "model": chairman_name, "response": "Error: Unable to generate final synthesis." } return { - "model": CHAIRMAN_MODEL, + "model": chairman_name, "response": response.get('content', '') } diff --git a/backend/local.py b/backend/local.py new file mode 100644 index 000000000..22c819ff6 --- /dev/null +++ b/backend/local.py @@ -0,0 +1,178 @@ +"""Ollama (local) provider with adaptive resource safeguards.""" + +from __future__ import annotations + +import asyncio +from contextlib import asynccontextmanager +from typing import Any, Dict, List, Optional, Tuple + +import httpx + +from .config import ( + OLLAMA_BASE_URL, + COUNCIL_MEM_RESERVE_GB, + COUNCIL_MAX_PARALLEL_LOCAL, + COUNCIL_ADAPTIVE_RESOURCE_GUARD, + COUNCIL_LOCAL_TIMEOUT_SEC, +) + +GiB = 1024 ** 3 + + +async def _get_total_ram_bytes() -> int: + """Return total system RAM in bytes (best-effort).""" + try: + import psutil # type: ignore + return int(psutil.virtual_memory().total) + except Exception: + # Fallback to a conservative default if psutil is not available at runtime + return 8 * GiB + + +async def _fetch_installed_model_sizes() -> Dict[str, int]: + """ + Ask the Ollama daemon for installed models and sizes. + Returns map of model name -> size in bytes. + """ + url = f"{OLLAMA_BASE_URL}/api/tags" + try: + async with httpx.AsyncClient(timeout=10.0) as client: + r = await client.get(url) + r.raise_for_status() + data = r.json() + sizes: Dict[str, int] = {} + for m in data.get("models", []): + name = m.get("name") + size = m.get("size") + if isinstance(name, str) and isinstance(size, int): + sizes[name.split(":")[0]] = size # normalize "name:tag" -> "name" + return sizes + except Exception: + return {} + + +# Conservative default size hints if Ollama does not return sizes +DEFAULT_SIZE_HINTS_GB: Dict[str, float] = { + "nemotron": 4.9, + "nemotron9b": 6.5, + "nemotron12b": 7.5, +} + + +def _estimate_weight_bytes(model: str, size_bytes: Optional[int]) -> int: + """ + Estimate memory weight for scheduling. We multiply file size by 1.3 + as a rough upper bound for runtime memory, with a minimum floor of 2 GiB. + """ + if size_bytes is None or size_bytes <= 0: + size_gb = DEFAULT_SIZE_HINTS_GB.get(model.split(":")[0], 3.0) + size_bytes = int(size_gb * GiB) + weight = int(size_bytes * 1.3) + return max(weight, 2 * GiB) + + +def _compute_budget_bytes(total_ram_bytes: int) -> int: + """ + Compute a memory budget for concurrent model runs, keeping a reserve. + We take max(60% of total, total - reserve). 
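+    Example: with 48 GiB of RAM and the default 6 GiB reserve, the budget is max(0.6 * 48, 48 - 6) = max(28.8, 42) = 42 GiB.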
+ """ + reserve_bytes = int(COUNCIL_MEM_RESERVE_GB * GiB) + sixty_percent = int(total_ram_bytes * 0.60) + return max(sixty_percent, total_ram_bytes - reserve_bytes) + + +class WeightedResourcePool: + """Async weighted resource pool for memory-guarded concurrency.""" + + def __init__(self, total_budget_bytes: int): + self._total = total_budget_bytes + self._used = 0 + self._cond = asyncio.Condition() + + @asynccontextmanager + async def hold(self, weight_bytes: int): + async with self._cond: + while self._used + weight_bytes > self._total: + await self._cond.wait() + self._used += weight_bytes + try: + yield + finally: + async with self._cond: + self._used = max(0, self._used - weight_bytes) + self._cond.notify_all() + + +async def query_model( + model: str, + messages: List[Dict[str, str]], + timeout: float = COUNCIL_LOCAL_TIMEOUT_SEC +) -> Optional[Dict[str, Any]]: + """ + Query a single local model via Ollama chat API. + """ + url = f"{OLLAMA_BASE_URL}/api/chat" + payload = { + "model": model, + "messages": messages, + "stream": False, + } + try: + async with httpx.AsyncClient(timeout=timeout) as client: + r = await client.post(url, json=payload) + r.raise_for_status() + data = r.json() + # Non-stream response typically includes 'message': {'content': ...} + content = None + if isinstance(data, dict): + message = data.get("message") or {} + content = message.get("content") or data.get("response") + return {"content": content or ""} + except Exception as e: + print(f"[local] Error querying model {model}: {e}") + return None + + +async def query_models_parallel( + models: List[str], + messages: List[Dict[str, str]] +) -> Dict[str, Optional[Dict[str, Any]]]: + """ + Query multiple local models in parallel with adaptive safeguards: + - WeightedResourcePool to keep memory usage under a computed budget + - Optional hard cap on max concurrent local runs + """ + # Gather model size hints + installed_sizes = await _fetch_installed_model_sizes() + weights = {m: _estimate_weight_bytes(m, installed_sizes.get(m)) for m in models} + + # Compute memory budget + total_ram = await _get_total_ram_bytes() + budget = _compute_budget_bytes(total_ram) + pool = WeightedResourcePool(budget) if COUNCIL_ADAPTIVE_RESOURCE_GUARD else None + + # Optional hard cap + semaphore = asyncio.Semaphore(COUNCIL_MAX_PARALLEL_LOCAL) if COUNCIL_MAX_PARALLEL_LOCAL else None + + async def run_one(model: str) -> Optional[Dict[str, Any]]: + async def call(): + return await query_model(model, messages) + + if pool is not None and semaphore is not None: + async with semaphore: + async with pool.hold(weights[model]): + return await call() + elif pool is not None: + async with pool.hold(weights[model]): + return await call() + elif semaphore is not None: + async with semaphore: + return await call() + else: + return await call() + + tasks = [run_one(m) for m in models] + results = await asyncio.gather(*tasks) + return {m: r for m, r in zip(models, results)} + + diff --git a/backend/openrouter.py b/backend/openrouter.py index 118fb0b73..234ce626a 100644 --- a/backend/openrouter.py +++ b/backend/openrouter.py @@ -1,8 +1,8 @@ -"""OpenRouter API client for making LLM requests.""" +"""OpenRouter API client for making LLM requests (plus optional custom endpoints).""" import httpx from typing import List, Dict, Any, Optional -from .config import OPENROUTER_API_KEY, OPENROUTER_API_URL +from .config import OPENROUTER_API_KEY, OPENROUTER_API_URL, CUSTOM_MODELS async def query_model( @@ -11,18 +11,27 @@ async def query_model( timeout: float = 
120.0 ) -> Optional[Dict[str, Any]]: """ - Query a single model via OpenRouter API. + Query a single model via OpenRouter API or a custom endpoint. Args: - model: OpenRouter model identifier (e.g., "openai/gpt-4o") + model: Model identifier (e.g., "openai/gpt-4o" or custom ID) messages: List of message dicts with 'role' and 'content' timeout: Request timeout in seconds Returns: Response dict with 'content' and optional 'reasoning_details', or None if failed """ + # Choose endpoint and key + if model in CUSTOM_MODELS: + cfg = CUSTOM_MODELS[model] + api_url = cfg["api_url"] + api_key = cfg.get("api_key", "custom") + else: + api_url = OPENROUTER_API_URL + api_key = OPENROUTER_API_KEY + headers = { - "Authorization": f"Bearer {OPENROUTER_API_KEY}", + "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", } @@ -34,7 +43,7 @@ async def query_model( try: async with httpx.AsyncClient(timeout=timeout) as client: response = await client.post( - OPENROUTER_API_URL, + api_url, headers=headers, json=payload ) @@ -61,7 +70,7 @@ async def query_models_parallel( Query multiple models in parallel. Args: - models: List of OpenRouter model identifiers + models: List of model identifiers messages: List of message dicts to send to each model Returns: diff --git a/pyproject.toml b/pyproject.toml index 56356ebcb..2db5d0d54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,4 +10,5 @@ dependencies = [ "python-dotenv>=1.0.0", "httpx>=0.27.0", "pydantic>=2.9.0", + "psutil>=5.9.0", ] From 6e4b899a398c31d0522029e966abc90400a8a241 Mon Sep 17 00:00:00 2001 From: FlynnCruse Date: Wed, 3 Dec 2025 16:24:22 -0500 Subject: [PATCH 2/5] Add psutil dependency and update title generation logic to support local models --- backend/council.py | 10 ++++++++-- uv.lock | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/backend/council.py b/backend/council.py index 9315f7cb7..1f3c47ca7 100644 --- a/backend/council.py +++ b/backend/council.py @@ -296,8 +296,14 @@ async def generate_conversation_title(user_query: str) -> str: messages = [{"role": "user", "content": title_prompt}] - # Use gemini-2.5-flash for title generation (fast and cheap) - response = await query_model("google/gemini-2.5-flash", messages, timeout=30.0) + # Generate title using appropriate provider + if PROVIDER == "local": + # Use a local model (prefer the first configured, fallback to chairman) + model_for_title = LOCAL_MODELS[0] if LOCAL_MODELS else CHAIRMAN_LOCAL_MODEL + response = await query_local_model(model_for_title, messages, timeout=30.0) + else: + # Use a fast OpenRouter model for title generation + response = await query_openrouter_model("google/gemini-2.5-flash", messages, timeout=30.0) if response is None: # Fallback to a generic title diff --git a/uv.lock b/uv.lock index 079224681..9a54b4bd4 100644 --- a/uv.lock +++ b/uv.lock @@ -188,6 +188,7 @@ source = { virtual = "." 
} dependencies = [ { name = "fastapi" }, { name = "httpx" }, + { name = "psutil" }, { name = "pydantic" }, { name = "python-dotenv" }, { name = "uvicorn", extra = ["standard"] }, @@ -197,11 +198,38 @@ dependencies = [ requires-dist = [ { name = "fastapi", specifier = ">=0.115.0" }, { name = "httpx", specifier = ">=0.27.0" }, + { name = "psutil", specifier = ">=5.9.0" }, { name = "pydantic", specifier = ">=2.9.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.32.0" }, ] +[[package]] +name = "psutil" +version = "7.1.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e1/88/bdd0a41e5857d5d703287598cbf08dad90aed56774ea52ae071bae9071b6/psutil-7.1.3.tar.gz", hash = "sha256:6c86281738d77335af7aec228328e944b30930899ea760ecf33a4dba66be5e74", size = 489059, upload-time = "2025-11-02T12:25:54.619Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/93/0c49e776b8734fef56ec9c5c57f923922f2cf0497d62e0f419465f28f3d0/psutil-7.1.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0005da714eee687b4b8decd3d6cc7c6db36215c9e74e5ad2264b90c3df7d92dc", size = 239751, upload-time = "2025-11-02T12:25:58.161Z" }, + { url = "https://files.pythonhosted.org/packages/6f/8d/b31e39c769e70780f007969815195a55c81a63efebdd4dbe9e7a113adb2f/psutil-7.1.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:19644c85dcb987e35eeeaefdc3915d059dac7bd1167cdcdbf27e0ce2df0c08c0", size = 240368, upload-time = "2025-11-02T12:26:00.491Z" }, + { url = "https://files.pythonhosted.org/packages/62/61/23fd4acc3c9eebbf6b6c78bcd89e5d020cfde4acf0a9233e9d4e3fa698b4/psutil-7.1.3-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:95ef04cf2e5ba0ab9eaafc4a11eaae91b44f4ef5541acd2ee91d9108d00d59a7", size = 287134, upload-time = "2025-11-02T12:26:02.613Z" }, + { url = "https://files.pythonhosted.org/packages/30/1c/f921a009ea9ceb51aa355cb0cc118f68d354db36eae18174bab63affb3e6/psutil-7.1.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1068c303be3a72f8e18e412c5b2a8f6d31750fb152f9cb106b54090296c9d251", size = 289904, upload-time = "2025-11-02T12:26:05.207Z" }, + { url = "https://files.pythonhosted.org/packages/a6/82/62d68066e13e46a5116df187d319d1724b3f437ddd0f958756fc052677f4/psutil-7.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:18349c5c24b06ac5612c0428ec2a0331c26443d259e2a0144a9b24b4395b58fa", size = 249642, upload-time = "2025-11-02T12:26:07.447Z" }, + { url = "https://files.pythonhosted.org/packages/df/ad/c1cd5fe965c14a0392112f68362cfceb5230819dbb5b1888950d18a11d9f/psutil-7.1.3-cp313-cp313t-win_arm64.whl", hash = "sha256:c525ffa774fe4496282fb0b1187725793de3e7c6b29e41562733cae9ada151ee", size = 245518, upload-time = "2025-11-02T12:26:09.719Z" }, + { url = "https://files.pythonhosted.org/packages/2e/bb/6670bded3e3236eb4287c7bcdc167e9fae6e1e9286e437f7111caed2f909/psutil-7.1.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b403da1df4d6d43973dc004d19cee3b848e998ae3154cc8097d139b77156c353", size = 239843, upload-time = "2025-11-02T12:26:11.968Z" }, + { url = "https://files.pythonhosted.org/packages/b8/66/853d50e75a38c9a7370ddbeefabdd3d3116b9c31ef94dc92c6729bc36bec/psutil-7.1.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ad81425efc5e75da3f39b3e636293360ad8d0b49bed7df824c79764fb4ba9b8b", size = 240369, upload-time = "2025-11-02T12:26:14.358Z" }, + { url = 
"https://files.pythonhosted.org/packages/41/bd/313aba97cb5bfb26916dc29cf0646cbe4dd6a89ca69e8c6edce654876d39/psutil-7.1.3-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8f33a3702e167783a9213db10ad29650ebf383946e91bc77f28a5eb083496bc9", size = 288210, upload-time = "2025-11-02T12:26:16.699Z" }, + { url = "https://files.pythonhosted.org/packages/c2/fa/76e3c06e760927a0cfb5705eb38164254de34e9bd86db656d4dbaa228b04/psutil-7.1.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fac9cd332c67f4422504297889da5ab7e05fd11e3c4392140f7370f4208ded1f", size = 291182, upload-time = "2025-11-02T12:26:18.848Z" }, + { url = "https://files.pythonhosted.org/packages/0f/1d/5774a91607035ee5078b8fd747686ebec28a962f178712de100d00b78a32/psutil-7.1.3-cp314-cp314t-win_amd64.whl", hash = "sha256:3792983e23b69843aea49c8f5b8f115572c5ab64c153bada5270086a2123c7e7", size = 250466, upload-time = "2025-11-02T12:26:21.183Z" }, + { url = "https://files.pythonhosted.org/packages/00/ca/e426584bacb43a5cb1ac91fae1937f478cd8fbe5e4ff96574e698a2c77cd/psutil-7.1.3-cp314-cp314t-win_arm64.whl", hash = "sha256:31d77fcedb7529f27bb3a0472bea9334349f9a04160e8e6e5020f22c59893264", size = 245756, upload-time = "2025-11-02T12:26:23.148Z" }, + { url = "https://files.pythonhosted.org/packages/ef/94/46b9154a800253e7ecff5aaacdf8ebf43db99de4a2dfa18575b02548654e/psutil-7.1.3-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2bdbcd0e58ca14996a42adf3621a6244f1bb2e2e528886959c72cf1e326677ab", size = 238359, upload-time = "2025-11-02T12:26:25.284Z" }, + { url = "https://files.pythonhosted.org/packages/68/3a/9f93cff5c025029a36d9a92fef47220ab4692ee7f2be0fba9f92813d0cb8/psutil-7.1.3-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:bc31fa00f1fbc3c3802141eede66f3a2d51d89716a194bf2cd6fc68310a19880", size = 239171, upload-time = "2025-11-02T12:26:27.23Z" }, + { url = "https://files.pythonhosted.org/packages/ce/b1/5f49af514f76431ba4eea935b8ad3725cdeb397e9245ab919dbc1d1dc20f/psutil-7.1.3-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3bb428f9f05c1225a558f53e30ccbad9930b11c3fc206836242de1091d3e7dd3", size = 263261, upload-time = "2025-11-02T12:26:29.48Z" }, + { url = "https://files.pythonhosted.org/packages/e0/95/992c8816a74016eb095e73585d747e0a8ea21a061ed3689474fabb29a395/psutil-7.1.3-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56d974e02ca2c8eb4812c3f76c30e28836fffc311d55d979f1465c1feeb2b68b", size = 264635, upload-time = "2025-11-02T12:26:31.74Z" }, + { url = "https://files.pythonhosted.org/packages/55/4c/c3ed1a622b6ae2fd3c945a366e64eb35247a31e4db16cf5095e269e8eb3c/psutil-7.1.3-cp37-abi3-win_amd64.whl", hash = "sha256:f39c2c19fe824b47484b96f9692932248a54c43799a84282cfe58d05a6449efd", size = 247633, upload-time = "2025-11-02T12:26:33.887Z" }, + { url = "https://files.pythonhosted.org/packages/c9/ad/33b2ccec09bf96c2b2ef3f9a6f66baac8253d7565d8839e024a6b905d45d/psutil-7.1.3-cp37-abi3-win_arm64.whl", hash = "sha256:bd0d69cee829226a761e92f28140bec9a5ee9d5b4fb4b0cc589068dbfff559b1", size = 244608, upload-time = "2025-11-02T12:26:36.136Z" }, +] + [[package]] name = "pydantic" version = "2.12.4" From a16a95baca35b4127eff57b2f020a1e5406f7090 Mon Sep 17 00:00:00 2001 From: FlynnCruse Date: Wed, 3 Dec 2025 16:35:30 -0500 Subject: [PATCH 3/5] Add council.sh script for conversation management with API integration --- scripts/council.sh | 99 
++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 scripts/council.sh diff --git a/scripts/council.sh b/scripts/council.sh new file mode 100644 index 000000000..69aa4c7ac --- /dev/null +++ b/scripts/council.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Simple helper to: +# 1) Create a conversation +# 2) Verify and store the conversation ID +# 3) Send the first message (non-stream or stream) +# +# Usage: +# bash scripts/council.sh -m "Your question" +# bash scripts/council.sh --stream -m "Your question" +# bash scripts/council.sh # will prompt for message +# +# Env: +# COUNCIL_API_URL (default: http://127.0.0.1:8001) +# +# Outputs: +# - Prints the conversation ID +# - Saves it to .council_last_cid in project root + +API_URL="${COUNCIL_API_URL:-http://127.0.0.1:8001}" +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +CID_FILE="${PROJECT_ROOT}/.council_last_cid" + +STREAM=0 +MESSAGE="" + +usage() { + echo "Usage: $0 [--stream] [-m MESSAGE]" + echo " COUNCIL_API_URL can override API base (default: ${API_URL})" +} + +while (( "$#" )); do + case "$1" in + --stream) + STREAM=1 + shift + ;; + -m) + if [ "${2:-}" = "" ]; then + echo "Error: -m requires a message" + usage + exit 1 + fi + MESSAGE="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown arg: $1" + usage + exit 1 + ;; + esac +done + +command -v jq >/dev/null 2>&1 || { echo "Error: jq is required."; exit 1; } + +# Health check +if ! curl -fsS "${API_URL}/" >/dev/null; then + echo "Error: Council API not reachable at ${API_URL}" + exit 1 +fi + +# Create conversation +CID="$(curl -sS -H "Content-Type: application/json" -d '{}' "${API_URL}/api/conversations" | jq -r .id)" +if [ -z "${CID}" ] || [ "${CID}" = "null" ]; then + echo "Error: Failed to create conversation (no ID returned)." + exit 1 +fi + +echo "${CID}" > "${CID_FILE}" +echo "Conversation ID: ${CID}" +echo "(Saved to ${CID_FILE})" + +# Get message if not provided +if [ -z "${MESSAGE}" ]; then + echo -n "Enter your first message: " + IFS= read -r MESSAGE +fi + +# Build JSON safely +PAYLOAD="$(jq -n --arg content "$MESSAGE" '{content:$content}')" + +if [ "${STREAM}" -eq 1 ]; then + echo "Streaming response (SSE)..." + curl -N -H "Content-Type: application/json" \ + -d "${PAYLOAD}" \ + "${API_URL}/api/conversations/${CID}/message/stream" +else + curl -sS -H "Content-Type: application/json" \ + -d "${PAYLOAD}" \ + "${API_URL}/api/conversations/${CID}/message" | jq . +fi + + From cc0db90aaa81defeb92b68532993fc1842bf88f1 Mon Sep 17 00:00:00 2001 From: FlynnCruse Date: Fri, 5 Dec 2025 00:01:46 -0500 Subject: [PATCH 4/5] Add documentation for running NVIDIA's Llama-3.1-Nemotron-Nano-8B model locally via Ollama, including usage instructions, recommended settings, and troubleshooting tips. --- backend/Local-AI.md | 192 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 backend/Local-AI.md diff --git a/backend/Local-AI.md b/backend/Local-AI.md new file mode 100644 index 000000000..c2ee8884d --- /dev/null +++ b/backend/Local-AI.md @@ -0,0 +1,192 @@ +# Nemotron Nano 8B (Local) + +NVIDIA's Llama-3.1-Nemotron-Nano-8B-v1 running locally via Ollama. + +## About + +A reasoning model fine-tuned by NVIDIA for: +- **Tool calling / function calling** +- **RAG (Retrieval Augmented Generation)** +- **Math & code reasoning** +- **General chat** + +Based on Llama 3.1 8B Instruct. Supports 128K context length. 
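+
+### Quick API check (optional)
+
+The backend added in this PR (`backend/local.py`) reaches the model through Ollama's non-streaming `/api/chat` endpoint. As a minimal sketch of that same request (assuming the daemon is listening on the default `127.0.0.1:11434` and `nemotron` appears in `ollama list`), you can reproduce it with a few lines of Python:
+
+```python
+import httpx
+
+# Minimal non-streaming chat request against a local Ollama daemon.
+# The model name and port are assumptions; adjust them to match `ollama list`
+# and OLLAMA_BASE_URL if you changed the defaults.
+payload = {
+    "model": "nemotron",
+    "messages": [{"role": "user", "content": "Say hi in five words"}],
+    "stream": False,
+}
+r = httpx.post("http://127.0.0.1:11434/api/chat", json=payload, timeout=180.0)
+r.raise_for_status()
+print(r.json()["message"]["content"])
+```
+
+The non-streaming reply carries the text under `message.content`, which is also what the council backend extracts.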
+ +## Usage + +ollama run nemotron + +If you’ve registered additional models: + +- nemotron9b: `ollama run nemotron9b` +- nemotron12b: `ollama run nemotron12b` + +### Reasoning Mode + +Toggle reasoning with the system prompt "detailed thinking on" or "detailed thinking off". + +### Recommended Settings + +- **Reasoning ON**: temperature 0.6, top_p 0.95 +- **Reasoning OFF**: greedy decoding (temperature 0) + +## Download + +Download the GGUF weights locally: +(Already downloaded) +```bash +curl -L -o nemotron-nano.gguf "https://huggingface.co/bartowski/nvidia_Llama-3.1-Nemotron-Nano-8B-v1-GGUF/resolve/main/nvidia_Llama-3.1-Nemotron-Nano-8B-v1-Q4_K_M.gguf" +``` + +## Files + +- Modelfile - Ollama model config +- nemotron-nano.gguf - Model weights (Q4_K_M quantization, ~4.7GB) +- nemotron-9b-v2.gguf - Optional upgrade (Q4_K_M, ~6.1GB) +- nemotron-12b-v2.gguf - Optional upgrade (Q4_K_M, ~7.1GB) + +## Use with LLM Council (Local) + +Run Karpathy’s LLM Council fully offline using your local Ollama models. + +1) Verify Ollama and models + +```bash +ollama list +``` + +2) Install Council deps + +```bash +cd Council/llm-council +uv sync +cd frontend && npm install && cd .. +``` + +3) Configure local mode (.env in Council/llm-council) + +```bash +cat > .env <<'EOF' +COUNCIL_PROVIDER=local +LOCAL_MODELS=nemotron,nemotron9b,nemotron12b +CHAIRMAN_LOCAL_MODEL=nemotron12b +COUNCIL_MAX_PARALLEL_LOCAL=2 +COUNCIL_MEM_RESERVE_GB=6 +# OLLAMA_BASE_URL=http://127.0.0.1:11434 +EOF +``` + +4) Start servers + +- Option A: + +```bash +./start.sh +``` + +- Option B: + +```bash +uv run python -m backend.main +# in a new terminal: +cd frontend && npm run dev +``` + +Open http://localhost:5173. + +5) Smoke test via CLI (optional) + +```bash +bash scripts/council.sh -m "Say hi in five words" +bash scripts/council.sh --stream -m "One fun fact about space." +``` + +Notes +- Ensure `LOCAL_MODELS` names match `ollama list` (omit tags like `:latest`). +- If the machine is tight on RAM, lower `LOCAL_MODELS` count or set `COUNCIL_MAX_PARALLEL_LOCAL=1–2`. +- `COUNCIL_MEM_RESERVE_GB` keeps headroom for the OS/apps; increase if needed. + +## Additional Models (Optional) + +### NVIDIA Nemotron Nano 9B v2 + +Download: + +```bash +curl -L -o nemotron-9b-v2.gguf "https://huggingface.co/bartowski/nvidia_NVIDIA-Nemotron-Nano-9B-v2-GGUF/resolve/main/nvidia_NVIDIA-Nemotron-Nano-9B-v2-Q4_K_M.gguf" +``` + +Register with Ollama: + +```bash +cat > Modelfile-9b << 'EOF' +FROM ./nemotron-9b-v2.gguf +TEMPLATE """<|begin_of_text|><|start_header_id|>system<|end_header_id|> +{{ .System }}<|eot_id|><|start_header_id|>user<|end_header_id|> +{{ .Prompt }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> +""" +PARAMETER temperature 0.6 +PARAMETER top_p 0.95 +EOF + +ollama create nemotron9b -f Modelfile-9b +``` + +Run: + +```bash +ollama run nemotron9b "What are you?" 
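+
+# Optional sanity check: the newly registered model should now be listed with its size
+ollama list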
+``` + +--- + +### NVIDIA Nemotron Nano 12B v2 + +Download: + +```bash +curl -L -o nemotron-12b-v2.gguf "https://huggingface.co/bartowski/nvidia_NVIDIA-Nemotron-Nano-12B-v2-GGUF/resolve/main/nvidia_NVIDIA-Nemotron-Nano-12B-v2-Q4_K_M.gguf" +``` + +Register with Ollama: + +```bash +cat > Modelfile-12b << 'EOF' +FROM ./nemotron-12b-v2.gguf +TEMPLATE """<|begin_of_text|><|start_header_id|>system<|end_header_id|> +{{ .System }}<|eot_id|><|start_header_id|>user<|end_header_id|> +{{ .Prompt }}<|eot_id|><|start_header_id|>assistant<|end_header_id|> +""" +PARAMETER temperature 0.6 +PARAMETER top_p 0.95 +EOF + +ollama create nemotron12b -f Modelfile-12b +``` + +Run: + +```bash +ollama run nemotron12b "What are you?" +``` + +## Troubleshooting + +- Error: supplied file was not in GGUF format + This usually means the downloaded file was an HTML page, not a .gguf. Make sure you: + - Use the Hugging Face “resolve/main/... .gguf” URL. + - Pass -L to curl to follow redirects. + - Verify file size is several GB (ls -lh). Re-download if it’s only KB/MB. +- zsh: command not found: llama + Use `ollama run ...` instead of `llama run`. + +## Source + +- Official: https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-8B-v1 +- GGUF: https://huggingface.co/bartowski/nvidia_Llama-3.1-Nemotron-Nano-8B-v1-GGUF +- 9B v2: https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2 +- 12B v2: https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2 + +## License + +NVIDIA Open Model License + Llama 3.1 Community License From 6dac3bd15366a11e08882bb3bc80b6b0b9e857e4 Mon Sep 17 00:00:00 2001 From: FlynnCruse Date: Fri, 5 Dec 2025 00:09:40 -0500 Subject: [PATCH 5/5] Add detailed installation instructions for Ollama on macOS, Linux, and Windows, along with model management and troubleshooting tips. Include guidance on RAM sizing and safeguards for optimal performance. --- backend/Local-AI.md | 109 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/backend/Local-AI.md b/backend/Local-AI.md index c2ee8884d..ed8dc82b8 100644 --- a/backend/Local-AI.md +++ b/backend/Local-AI.md @@ -12,6 +12,88 @@ A reasoning model fine-tuned by NVIDIA for: Based on Llama 3.1 8B Instruct. Supports 128K context length. +## Install Ollama (macOS / Linux / Windows) + +### macOS (Apple Silicon or Intel) + +```bash +# Option A (recommended): Homebrew +brew install ollama + +# Option B: Official installer +curl -fsSL https://ollama.com/install.sh | sh + +# Verify and quick test +ollama --version +ollama run llama3.2:3b "hi" +``` + +Notes: +- Homebrew is for macOS/Linux only. For Windows, see the Windows section below. +- Apple Silicon (M‑series) accelerates with Metal automatically. +- If you prefer a foreground server: `ollama serve` (Ctrl+C to stop). + +### Linux (Ubuntu/Debian/Fedora/Arch) + +```bash +# Install +curl -fsSL https://ollama.com/install.sh | sh + +# Start/enable service (systemd distros) +sudo systemctl enable --now ollama + +# Verify +ollama --version +ollama list +``` + +GPU (optional): +- NVIDIA: install recent NVIDIA driver + CUDA; verify with `nvidia-smi`. +- If GPU isn’t available, Ollama falls back to CPU. + +### Windows 11/10 + +```powershell +# PowerShell (run as Administrator; requires winget) +winget install Ollama.Ollama +# If winget is unavailable, download the installer from: https://ollama.com + +# New terminal: +ollama --version +ollama run llama3.2:3b "hi" +``` + +Notes: +- Allow Ollama (port 11434) through Windows Firewall on first run. 
+- Alternative: WSL2 → install Ubuntu, then follow Linux steps inside WSL. + +## Add Models to Ollama + +You can either pull from the Ollama library or use local GGUF files. + +### Option A — Pull from library (1‑line) + +```bash +ollama pull llama3.2:3b +ollama run llama3.2:3b "hello" +``` + +### Option B — Use local GGUF (Nemotron examples below) + +See “Download” and “Additional Models (Optional)” sections for GGUF URLs and `Modelfile` examples to register: + +```bash +# Example flow +curl -L -o model.gguf "https://huggingface.co/.../model-Q4_K_M.gguf" +cat > Modelfile << 'EOF' +FROM ./model.gguf +PARAMETER temperature 0.6 +PARAMETER top_p 0.95 +EOF +ollama create mymodel -f Modelfile +ollama run mymodel "hi" +``` + ## Usage ollama run nemotron @@ -106,6 +188,13 @@ Notes - If the machine is tight on RAM, lower `LOCAL_MODELS` count or set `COUNCIL_MAX_PARALLEL_LOCAL=1–2`. - `COUNCIL_MEM_RESERVE_GB` keeps headroom for the OS/apps; increase if needed. +### RAM Sizing & Safeguards (Council Local Mode) +- Effective model budget ≈ `max(60% of RAM, RAM − COUNCIL_MEM_RESERVE_GB)`. +- Recommended on 48 GiB Macs: + - Light multitasking: `COUNCIL_MEM_RESERVE_GB=8` + - Heavy multitasking: `COUNCIL_MEM_RESERVE_GB=10–12` +- To reduce pressure: lower `COUNCIL_MAX_PARALLEL_LOCAL` or remove a model from `LOCAL_MODELS`. + ## Additional Models (Optional) ### NVIDIA Nemotron Nano 9B v2 @@ -170,6 +259,20 @@ Run: ollama run nemotron12b "What are you?" ``` +## Manage Models & Disk Space + +```bash +# See installed models and sizes +ollama list + +# Remove a model to free space +ollama rm nemotron9b +``` + +Notes: +- GGUF files are several GB each; keep an eye on free disk space before downloads. +- If using Council in local mode, ensure `LOCAL_MODELS` only includes models you actually need. + ## Troubleshooting - Error: supplied file was not in GGUF format @@ -179,6 +282,12 @@ ollama run nemotron12b "What are you?" - Verify file size is several GB (ls -lh). Re-download if it’s only KB/MB. - zsh: command not found: llama Use `ollama run ...` instead of `llama run`. +- Connection refused to http://127.0.0.1:11434 + Start Ollama: + - macOS/Windows: launch the app or run `ollama serve` + - Linux: `sudo systemctl enable --now ollama` +- Port 11434 already in use + Stop whatever is bound to 11434, or change Ollama port/host (e.g., `OLLAMA_HOST=127.0.0.1:11435 ollama serve`) and update `OLLAMA_BASE_URL` in `.env`. ## Source