3 changes: 3 additions & 0 deletions .gitignore
@@ -213,3 +213,6 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/

# rlm artifacts
.rlm_workspace/
1 change: 1 addition & 0 deletions README.md
@@ -85,6 +85,7 @@ rlm = RLM(
environment_kwargs={...},
)
```
Environment kwargs also accept `llm_query_timeout`, `llm_query_timeout_step`, and `llm_query_timeout_min` to control depth-based `llm_query()` timeouts.
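For example, a minimal sketch (the values are illustrative, not required defaults):
```python
rlm = RLM(
    ...
    environment_kwargs={
        "llm_query_timeout": 900,       # root timeout, in seconds
        "llm_query_timeout_step": 120,  # subtracted per routing depth
        "llm_query_timeout_min": 300,   # floor, in seconds
    },
)
```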

### Local Environments
The default `local` environment `LocalREPL` runs in the same process as the RLM itself, with specified global and local namespaces for minimal security. Using this REPL is generally safe, but should not be used for production settings. It also shares the same virtual environment (e.g. Conda or uv) as the host process.
40 changes: 27 additions & 13 deletions docs/api/rlm.md
@@ -43,7 +43,7 @@ RLM(
environment: str = "local",
environment_kwargs: dict | None = None,
depth: int = 0,
max_depth: int = 1,
recursive_max_depth: int = 1,
max_iterations: int = 30,
custom_system_prompt: str | None = None,
other_backends: list[str] | None = None,
@@ -150,17 +150,30 @@ environment_kwargs = {
}
```

**Common (all environments):**
```python
environment_kwargs = {
"llm_query_timeout": 900, # Root timeout in seconds
"llm_query_timeout_step": 120, # Subtract per routing depth
"llm_query_timeout_min": 300, # Floor in seconds
}
```
The effective timeout at `llm_query()` routing depth `d` is:
```
max(llm_query_timeout_min, llm_query_timeout - d * llm_query_timeout_step)
```
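
A minimal sketch of this schedule, assuming only the kwarg names above (the helper itself is illustrative, not part of the API):

```python
def effective_llm_query_timeout(
    depth: int,
    root_timeout: int = 900,
    step: int = 120,
    min_timeout: int = 300,
) -> int:
    # Decay the timeout linearly with routing depth, clamped at the floor.
    return max(min_timeout, root_timeout - depth * step)

assert effective_llm_query_timeout(0) == 900
assert effective_llm_query_timeout(1) == 780
assert effective_llm_query_timeout(5) == 300  # clamped at the floor
```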

---

#### `max_depth`
#### `recursive_max_depth`
{: .no_toc }

**Type:** `int`
**Default:** `1`

Maximum recursion depth for nested RLM calls. Currently only depth 1 is fully supported.
Maximum recursion depth for nested RLM calls.

When `depth >= max_depth`, the RLM falls back to a regular LM completion.
Global recursion cap across all nested RLM calls. The value is decremented at each recursive layer; when it reaches 0, the RLM falls back to a regular LM completion.
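
A rough sketch of the decrement-and-fall-back behavior (simplified pseudologic; `lm_completion` and `run_repl_loop` are hypothetical stand-ins, not the library's internals):

```python
def rlm_completion(prompt: str, remaining_depth: int) -> str:
    if remaining_depth <= 0:
        # Budget exhausted: answer with a plain LM completion, no REPL loop.
        return lm_completion(prompt)
    # Otherwise run the REPL loop; nested llm_query() calls recurse
    # with remaining_depth - 1, shrinking the budget at each layer.
    return run_repl_loop(prompt, remaining_depth - 1)
```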

---

@@ -171,6 +184,7 @@ When `depth >= max_depth`, the RLM falls back to a regular LM completion.
**Default:** `30`

Maximum number of REPL iterations before forcing a final answer.
For recursive sub-calls, this value is halved per depth with a floor of 1.
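
A minimal sketch of the halving rule (the helper name is illustrative):

```python
def effective_max_iterations(max_iterations: int, depth: int) -> int:
    # Halve the iteration budget at each recursion depth, with a floor of 1.
    return max(1, max_iterations // (2 ** depth))

assert effective_max_iterations(30, 0) == 30
assert effective_max_iterations(30, 1) == 15
assert effective_max_iterations(30, 5) == 1  # floored
```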

Each iteration consists of:
1. LM generates response (potentially with code blocks)
@@ -217,23 +231,24 @@ rlm = RLM(
**Type:** `list[str] | None` / `list[dict] | None`
**Default:** `None`

Register additional LM backends available for sub-calls via `llm_query()`.
Depth-specific LM backends for recursive sub-calls. Entry `other_backends[i]` (and matching
`other_backend_kwargs[i]`) is used at recursion depth `i + 1`. If the list is shorter than
the recursion depth, the root backend is used as a default.

```python
rlm = RLM(
backend="openai",
backend_kwargs={"model_name": "gpt-4o"},
other_backends=["anthropic", "openai"],
recursive_max_depth=3,
other_backends=["anthropic", "openai", "openai"],
other_backend_kwargs=[
{"model_name": "claude-sonnet-4-20250514"},
{"model_name": "gpt-4o-mini"},
{"model_name": "gpt-4o-nano"},
],
)

# Inside REPL, code can call:
# llm_query(prompt) # Uses default (gpt-4o)
# llm_query(prompt, model="claude-sonnet-4-20250514") # Uses Claude
# llm_query(prompt, model="gpt-4o-mini") # Uses GPT-4o-mini
# Depth 1 uses Claude, depth 2 uses GPT-4o-mini, depth 3 uses GPT-4o-nano.
```

---
@@ -437,7 +452,7 @@ rlm = RLM(
"image": "python:3.11-slim",
},

# Additional models for sub-calls
# Depth-specific backends for recursion
other_backends=["openai"],
other_backend_kwargs=[{
"api_key": os.getenv("OPENAI_API_KEY"),
@@ -446,7 +461,7 @@

# Behavior
max_iterations=40,
max_depth=1,
recursive_max_depth=1,

# Debugging
logger=logger,
@@ -458,4 +473,3 @@ result = rlm.completion(
root_prompt="Summarize the key findings"
)
```

24 changes: 20 additions & 4 deletions docs/getting-started.md
@@ -131,11 +131,11 @@ This will display:
| `backend_kwargs` | `dict` | `None` | Backend-specific configuration |
| `environment` | `str` | `"local"` | Execution environment type |
| `environment_kwargs` | `dict` | `None` | Environment configuration |
| `max_depth` | `int` | `1` | Maximum recursion depth |
| `recursive_max_depth` | `int` | `1` | Maximum recursion depth across all nested RLM calls |
| `max_iterations` | `int` | `30` | Max REPL iterations per call |
| `custom_system_prompt` | `str` | `None` | Override default system prompt |
| `other_backends` | `list` | `None` | Additional backends for sub-calls |
| `other_backend_kwargs` | `list` | `None` | Configs for additional backends |
| `other_backends` | `list` | `None` | Depth-specific backends for recursive sub-calls |
| `other_backend_kwargs` | `list` | `None` | Configs for depth-specific backends |
| `logger` | `RLMLogger` | `None` | Logger for trajectory tracking |
| `verbose` | `bool` | `False` | Enable console output |

@@ -219,6 +219,23 @@ rlm = RLM(

---

### llm_query Timeouts

`llm_query()` calls inherit a depth-based timeout. Configure via `environment_kwargs`:

```python
environment_kwargs = {
"llm_query_timeout": 900,
"llm_query_timeout_step": 120,
"llm_query_timeout_min": 300,
}
```

The effective timeout at `llm_query()` routing depth `d` is:
```
max(llm_query_timeout_min, llm_query_timeout - d * llm_query_timeout_step)
```
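
As a quick check of the schedule with the values above (a worked illustration, not output from the library):

```python
for d in range(6):
    print(d, max(300, 900 - d * 120))
# 0 900, 1 780, 2 660, 3 540, 4 420, 5 300 (floor reached)
```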

## Choosing a Backend

### OpenAI
@@ -340,4 +357,3 @@ Upload `.jsonl` log files to visualize:
- [API Reference](api/rlm.md) - Complete RLM class documentation
- [Environments](environments/) - Deep dive into each environment
- [Backends](backends.md) - Detailed backend configuration

27 changes: 13 additions & 14 deletions docs/src/app/api/page.tsx
@@ -36,7 +36,7 @@ print(result.response)`} />
environment: str = "local",
environment_kwargs: dict | None = None,
depth: int = 0,
max_depth: int = 1,
recursive_max_depth: int = 1,
max_iterations: int = 30,
custom_system_prompt: str | None = None,
other_backends: list[str] | None = None,
@@ -140,23 +140,18 @@ environment_kwargs={"setup_code": "import numpy as np"}`} />
<span className="text-xs px-2 py-1 rounded-md bg-muted text-muted-foreground font-mono">int</span>
<span className="text-xs px-2 py-1 rounded-md bg-blue-100 text-blue-700 font-mono">default: 30</span>
</div>
<p className="text-muted-foreground">Maximum REPL iterations before forcing a final answer.</p>
<p className="text-muted-foreground">Maximum REPL iterations before forcing a final answer. Recursive sub-calls halve this per depth (floor 1).</p>
</div>

{/* max_depth */}
{/* recursive_max_depth */}
<div className="border-l-4 border-blue-500 pl-6">
<div className="flex items-baseline gap-3 mb-2">
<code className="text-lg font-semibold text-foreground">max_depth</code>
<code className="text-lg font-semibold text-foreground">recursive_max_depth</code>
<span className="text-xs px-2 py-1 rounded-md bg-muted text-muted-foreground font-mono">int</span>
<span className="text-xs px-2 py-1 rounded-md bg-blue-100 text-blue-700 font-mono">default: 1</span>
</div>
<div className="mb-2 p-3 bg-amber-50 border border-amber-200 rounded-md">
<p className="text-sm text-amber-800">
<strong>Note:</strong> This is a TODO. Only <code className="px-1.5 py-0.5 rounded bg-amber-100 text-amber-900 text-xs font-semibold">max_depth=1</code> is currently supported.
</p>
</div>
<p className="text-muted-foreground">
Maximum recursion depth. When <code className="px-1.5 py-0.5 rounded bg-muted text-foreground text-sm">depth {">="} max_depth</code>, falls back to regular LM completion.
Global recursion cap across all nested RLM calls. The value is decremented at each recursive layer; when it reaches 0, the RLM falls back to a regular LM completion.
</p>
</div>

@@ -177,12 +172,16 @@
<span className="text-xs px-2 py-1 rounded-md bg-muted text-muted-foreground font-mono">list[str] | None</span>
<span className="text-xs px-2 py-1 rounded-md bg-blue-100 text-blue-700 font-mono">default: None</span>
</div>
<p className="text-muted-foreground mb-4">Additional backends available for sub-LM calls within the REPL.</p>
<p className="text-muted-foreground mb-4">Depth-specific backends for recursive sub-calls. Entry 0 is used at depth 1, entry 1 at depth 2, etc.</p>
<CodeBlock code={`rlm = RLM(
backend="openai",
backend_kwargs={"model_name": "gpt-5-mini"},
other_backends=["anthropic"],
other_backend_kwargs=[{"model_name": "claude-sonnet-4-20250514"}],
recursive_max_depth=2,
other_backends=["anthropic", "openai"],
other_backend_kwargs=[
{"model_name": "claude-sonnet-4-20250514"},
{"model_name": "gpt-4o-mini"},
],
)`} />
</div>

@@ -193,7 +192,7 @@ environment_kwargs={"setup_code": "import numpy as np"}`} />
<span className="text-xs px-2 py-1 rounded-md bg-muted text-muted-foreground font-mono">list[dict] | None</span>
<span className="text-xs px-2 py-1 rounded-md bg-blue-100 text-blue-700 font-mono">default: None</span>
</div>
<p className="text-muted-foreground">Configurations for <code className="px-1.5 py-0.5 rounded bg-muted text-foreground text-sm">other_backends</code> (must match order).</p>
<p className="text-muted-foreground">Configurations for <code className="px-1.5 py-0.5 rounded bg-muted text-foreground text-sm">other_backends</code> (must match order). Missing depths fall back to the root backend.</p>
</div>

{/* logger */}
22 changes: 9 additions & 13 deletions docs/src/app/backends/page.tsx
@@ -94,31 +94,27 @@ python -m vllm.entrypoints.openai.api_server \\

<hr className="my-8 border-border" />

<h2 className="text-2xl font-semibold mb-4">Multiple Backends (Experimental)</h2>
<h2 className="text-2xl font-semibold mb-4">Depth-Specific Backends</h2>
<p className="text-muted-foreground mb-4">
<strong>Experimental:</strong> This feature allows you to specify <em>ordered</em> lists of backends and model kwargs, so that RLMs can sub-call different language models from within execution code.
The order of <code>other_backends</code> and <code>other_backend_kwargs</code> must match: e.g., the 0th element of <code>other_backends</code> is used with the 0th dict in <code>other_backend_kwargs</code>.
Provide an ordered list of backends and model kwargs, one per recursion depth.
The order of <code>other_backends</code> and <code>other_backend_kwargs</code> must match: the 0th
entry is used at depth 1, the 1st entry at depth 2, and so on. Missing depths fall back to the
root backend.
<br />
<br />
<span className="font-medium">
This functionality is for advanced use and is currently experimental.
</span>
It will become more useful as RLMs get the ability to orchestrate and delegate between different LMs within a workflow.
This is an advanced feature for controlling cost/quality across recursive calls.
</p>
<CodeBlock code={`rlm = RLM(
backend="openai",
backend_kwargs={"model_name": "gpt-5-mini"},
other_backends=["anthropic", "openai"], # ORDER MATTERS!
recursive_max_depth=3,
other_backends=["anthropic", "openai", "openai"], # depth 1..3
other_backend_kwargs=[
{"model_name": "claude-sonnet-4-20250514"},
{"model_name": "gpt-4o-mini"},
{"model_name": "gpt-4o-nano"},
], # ORDER MATCHES other_backends
)`} />
<p className="text-muted-foreground mt-4">Inside REPL (future releases):</p>
<CodeBlock code={`llm_query("prompt") # Uses default (gpt-5-mini)
llm_query("prompt", model="claude-sonnet-4-20250514") # Uses Claude
llm_query("prompt", model="gpt-4o-mini") # Uses GPT-4o-mini`} />
</div>
);
}

2 changes: 1 addition & 1 deletion examples/prime_repl_example.py
@@ -21,7 +21,7 @@
"docker_image": "python:3.11-slim",
"timeout_minutes": 30,
},
max_depth=1,
recursive_max_depth=1,
logger=logger,
verbose=True,
)
14 changes: 11 additions & 3 deletions examples/quickstart.py
@@ -12,16 +12,24 @@
rlm = RLM(
backend="openai", # or "portkey", etc.
backend_kwargs={
"model_name": "gpt-5-nano",
"model_name": "gpt-5",
"api_key": os.getenv("OPENAI_API_KEY"),
},
environment="docker",
environment_kwargs={},
max_depth=1,
recursive_max_depth=2,
max_iterations=4,
other_backends=["openai", "openai"], # depth 1 and depth 2 (leaf)
other_backend_kwargs=[
{"model_name": "gpt-5-nano", "api_key": os.getenv("OPENAI_API_KEY")},
{"model_name": "gpt-5-nano", "api_key": os.getenv("OPENAI_API_KEY")},
],
logger=logger,
verbose=True, # For printing to console with rich, disabled by default.
)

result = rlm.completion("Print me the first 5 powers of two, each on a newline.")
prompt = "Let ${\\triangle ABC}$ be a right triangle with $\\angle A = 90^{\\circ}$ and $BC = 38.$ There exist points $K$ and $L$ inside the triangle such that $AK = AL = BK = CL = KL = 14.$ The area of the quadrilateral $BKLC$ can be expressed as $n\\sqrt{3}$ for some positive integer $n.$ Find $n.$"

result = rlm.completion(prompt)

print(result)
62 changes: 62 additions & 0 deletions examples/quickstart_vllm_docker.py
@@ -0,0 +1,62 @@
"""
Quickstart: Local vLLM backend with Docker execution.

Setup:
1. Start vLLM OpenAI-compatible server, e.g.:
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3-70b --port 8000
2. Ensure Docker is running.
3. Run: python -m examples.quickstart_vllm_docker
"""

import os

from dotenv import load_dotenv

from rlm import RLM
from rlm.logger import RLMLogger

load_dotenv()

logger = RLMLogger(log_dir="./logs")

# OpenAI client requires an api_key; vLLM ignores it unless configured server-side.
api_key = os.getenv("VLLM_API_KEY", "EMPTY")

rlm = RLM(
backend="vllm",
backend_kwargs={
"base_url": "http://localhost:8000/v1",
"model_name": "meta-llama/Llama-3-70b",
"api_key": api_key,
},
environment="docker",
environment_kwargs={},
recursive_max_depth=3,
max_iterations=4,
other_backends=["vllm", "vllm"], # depth 1 and depth 2
other_backend_kwargs=[
{
"base_url": "http://localhost:8001/v1",
"model_name": "meta-llama/Llama-3-8b",
"api_key": api_key,
},
{
"base_url": "http://localhost:8002/v1",
"model_name": "mistralai/Mistral-7B-Instruct-v0.2",
"api_key": api_key,
},
],
logger=logger,
verbose=True,
)

prompts = [
"Summarize the key idea of recursive language models in 2 sentences.",
"Given a list of numbers [3, 7, 2, 9], return the max and min.",
]

for prompt in prompts:
result = rlm.completion(prompt)
print("=" * 60)
print(f"Prompt: {prompt}")
print(f"Response: {result.response}")