Commit fea4bd0

Merge branch 'main' into kaiming/BackendBenchIntegration
2 parents: b357267 + d11026e

File tree: 3 files changed (+37, -29 lines)


.gitignore

Lines changed: 2 additions & 1 deletion

```diff
@@ -117,6 +117,7 @@ triton_kernel_logs/
 *.log
 session_*/
 worker_*/
+.fuse/
 
 # Generated kernels
 kernel.py
@@ -139,6 +140,6 @@ CLAUDE.md
 .Spotlight-V100
 .Trashes
 ehthumbs.db
-Thumbs.db
+Thumbs.db
 # Local batch runner
 scripts/run_kernelbench_batch.py
```

README.md

Lines changed: 32 additions & 26 deletions

````diff
@@ -7,7 +7,9 @@ KernelAgent turns PyTorch programs into verified Triton kernels. It was designed
 - Parallel Triton kernel generation with strict runtime verification
 - End‑to‑end composition that rebuilds the original forward pass using only the synthesized kernels
 
-Blog post: [TBD] • Additional docs: coming soon
+Blog post: [PyTorch KernelFalcon](https://pytorch.org/blog/kernelfalcon-autonomous-gpu-kernel-generation-via-deep-agents/)
+
+Additional docs: coming soon
 
 ## Pipeline Overview
 
@@ -18,42 +20,46 @@ Every stage writes artifacts to a run directory under `.fuse/<run_id>/`, includi
 ## Quickstart
 
 ### Requirements
+- Python 3.8 – 3.12
 - Linux or macOS; CUDA‑capable GPU for Triton execution
-- Python 3.8–3.12
-- Triton (install separately: `pip install triton` or nightly from source)
-- At least one LLM provider:
-  - OpenAI (`OPENAI_API_KEY`, models like `o4-mini`, `gpt-5`)
-  - Anthropic (`ANTHROPIC_API_KEY`; default fallback model is `claude-sonnet-4-20250514` when `OPENAI_MODEL` is unset)
-  - Any OpenAI‑compatible relay endpoint (`LLM_RELAY_URL`, optional `LLM_RELAY_API_KEY`; see `triton_kernel_agent/providers/relay_provider.py`)
-- Gradio (UI dependencies; installed as part of the core package)
+- Triton (installed separately: `pip install triton` or nightly from source)
 - PyTorch (https://pytorch.org/get-started/locally/)
+- LLM provider ([OpenAI](https://openai.com/api/), [Anthropic](https://www.anthropic.com/), or a self-hosted relay)
 
-### Installation
+### Install
 ```bash
-git clone https://github.com/pytorch-labs/KernelAgent.git
-cd KernelAgent
-python -m venv .venv && source .venv/bin/activate  # choose your own env manager
-pip install -e .[dev]  # project + tooling deps
-pip install triton     # not part of extras; install the version you need
+pip install -e .
+```
 
-# (optional) Install KernelBench for problem examples
+#### (Optional) Install KernelBench for problem examples
+```bash
 git clone https://github.com/ScalingIntelligence/KernelBench.git
 ```
+Note: By default, the KernelAgent UI searches for KernelBench at the same level as `KernelAgent` (i.e. `../KernelBench`).
 
-### Configure credentials
-You can export keys directly or use an `.env` file that the CLIs load automatically:
+### Configure
+You can export keys directly or use an `.env` file that the CLIs load automatically.
 
 ```bash
-OPENAI_API_KEY=sk-...
-OPENAI_MODEL=gpt-5        # override default fallback (claude-sonnet-4-20250514)
+OPENAI_MODEL=gpt-5        # default model for extraction
 NUM_KERNEL_SEEDS=4        # parallel workers per kernel
 MAX_REFINEMENT_ROUNDS=10  # retry budget per worker
-LOG_LEVEL=INFO
+LOG_LEVEL=INFO            # logging level
+```
+
+#### LLM Providers
+KernelAgent currently supports OpenAI and Anthropic out of the box. You can also use a custom OpenAI-compatible endpoint.
+These can be configured in `.env` or via environment variables.
+```bash
+# OpenAI (models like `o4-mini`, `gpt-5`)
+OPENAI_API_KEY=sk-...
+
+# Anthropic (default; `claude-sonnet-4-20250514` is used when `OPENAI_MODEL` is unset)
+ANTHROPIC_API_KEY=sk-ant-...
 
-# Optional relay configuration for self-hosted gateways
-# LLM_RELAY_URL=http://127.0.0.1:11434
-# LLM_RELAY_API_KEY=your-relay-token
-# LLM_RELAY_TIMEOUT_S=120
+# Relay configuration for self-hosted gateways
+LLM_RELAY_URL=http://127.0.0.1:11434
+LLM_RELAY_TIMEOUT_S=120
 ```
 
 More knobs live in `triton_kernel_agent/agent.py` and `Fuser/config.py`.
@@ -153,9 +159,9 @@ These artifacts are designed for reproducibility: you can re-run a single kernel
 
 ## Documentation & Community
 
-- Architecture and deep-dive docs: `docs/kernelfalcon_overview.html`, `docs/kernelfalcon_agents2_overview.html`, `docs/FuserAgent_sketch.html`, `docs/fuser_agent_compare.html`
+- Architecture and deep-dive docs: `Coming Soon`
 - Issues: https://github.com/pytorch-labs/KernelAgent/issues
-- Discussions & blog posts: [TBD]
+- Blog post: https://pytorch.org/blog/kernelfalcon-autonomous-gpu-kernel-generation-via-deep-agents/
 
 ## License
 
````
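The Configure step above relies on the CLIs auto-loading an `.env` file. A minimal sketch of that pattern, assuming the common `python-dotenv` package (the diff does not name which loader KernelAgent's CLIs actually use):

```python
import os

# Assumption: python-dotenv (`pip install python-dotenv`); the README diff
# above does not say which .env loader the CLIs use.
from dotenv import load_dotenv

load_dotenv()  # merge KEY=value pairs from ./.env into os.environ; existing vars win

num_seeds = int(os.environ.get("NUM_KERNEL_SEEDS", "4"))         # parallel workers per kernel
max_rounds = int(os.environ.get("MAX_REFINEMENT_ROUNDS", "10"))  # retry budget per worker
model = os.environ.get("OPENAI_MODEL")  # None -> claude-sonnet-4-20250514 fallback, per the README
print(model, num_seeds, max_rounds)
```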
triton_kernel_agent/providers/relay_provider.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -18,6 +18,7 @@
 
 import requests
 import logging
+import os
 
 from .base import BaseProvider, LLMResponse
 
@@ -34,7 +35,7 @@ class RelayProvider(BaseProvider):
     """
 
     def __init__(self):
-        self.server_url = "http://127.0.0.1:11434"
+        self.server_url = os.environ.get("LLM_RELAY_URL", "http://127.0.0.1:11434")
         self.is_available_flag = False
         super().__init__()
 
@@ -68,7 +69,7 @@ def get_response(
             self.server_url,
             json=request_data,
             headers={"Content-Type": "application/json"},
-            timeout=120.0,
+            timeout=int(os.environ.get("LLM_RELAY_TIMEOUT_S", 120)),
         )
 
         if response.status_code != 200:
```
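With both changes applied, the relay endpoint and timeout are driven by the same `LLM_RELAY_URL` and `LLM_RELAY_TIMEOUT_S` variables the README now documents. A minimal sketch of how the lookups resolve (the override values here are illustrative, not project defaults):

```python
import os

# Illustrative overrides; when unset, the code falls back to the defaults in the diff.
os.environ["LLM_RELAY_URL"] = "http://gateway.internal:8080"  # hypothetical gateway
os.environ["LLM_RELAY_TIMEOUT_S"] = "300"

server_url = os.environ.get("LLM_RELAY_URL", "http://127.0.0.1:11434")
timeout = int(os.environ.get("LLM_RELAY_TIMEOUT_S", 120))

print(server_url, timeout)  # http://gateway.internal:8080 300
```

One behavioral note: the old hard-coded `timeout=120.0` was a float, while `int(...)` only parses whole-second strings, so a value like `LLM_RELAY_TIMEOUT_S=1.5` now raises `ValueError`.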
