2 changes: 2 additions & 0 deletions .gitignore
@@ -52,6 +52,8 @@ htmlcov/
fastagent.secrets.yaml
outputs/
output*/
results/
experiments/
fastagent.jsonl
test_script_*.py
.claude/
37 changes: 34 additions & 3 deletions docs/evals.md
@@ -79,12 +79,43 @@ appworld download data

Validate existing logs without running new tests:

**BFCL:**
```bash
# Validate logs from default directory
# Validate logs (auto-detects from outputs/raw/)
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py --validate-only

# Validate logs from specific directory
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py --validate-only --log-dir outputs/experiment1/raw
# Or specify custom output directory
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py --validate-only --output-dir outputs/experiment1
```

**AppWorld:**
```bash
# Validate logs (auto-detects from results/{model}/{dataset}/)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
```

### AppWorld Results Organization

AppWorld tests automatically organize results during execution:

```bash
# Run tests - results automatically organized
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o

# Results automatically written to:
# - results/gpt-4o/train/outputs/raw/ (conversation logs)
# - results/gpt-4o/train/failure_reports/ (auto-generated for failed tests)
# - experiments/outputs/gpt-4o/train/ (AppWorld evaluation data)

# Clean up large experiment directories after tests
rm -rf experiments/outputs/gpt-4o/ # Frees ~15GB
```
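
If you prefer a scripted cleanup, a minimal Python sketch is below (illustrative only; the model name mirrors the example above):

```python
import shutil
from pathlib import Path

# Illustrative cleanup helper; "gpt-4o" mirrors the example above.
exp_dir = Path("experiments/outputs/gpt-4o")

if exp_dir.exists():
    size_gb = sum(f.stat().st_size for f in exp_dir.rglob("*") if f.is_file()) / 1e9
    print(f"Removing {exp_dir} ({size_gb:.1f} GB)")
    shutil.rmtree(exp_dir)
```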

**AppWorld-specific options:**
```bash
--dataset DATASET # Dataset: train, dev, test_normal, test_challenge (default: train)
--limit N # Run only first N tasks from dataset
--start-from TASK_ID # Resume from specific task ID
```

### Parallel Execution
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
name = "wags"
version = "0.0.0.dev0"
description = "WAGS - Middleware orchestration for MCP servers with FastMCP"
requires-python = ">=3.13.5"
requires-python = "~=3.13.5"

dependencies = [
"mcp @ git+https://github.com/chughtapan/python-sdk.git@wags-dev",
@@ -74,6 +74,8 @@ ignore = ["PERF203", "PLC0415", "PLR0402"]
"src/wags/cli/main.py" = ["PLR0913"]
# Pytest fixtures require many parameters
"tests/benchmarks/appworld/test_appworld.py" = ["PLR0913"]
# Reporting functions naturally have high complexity due to formatting logic
"tests/benchmarks/appworld/reporting.py" = ["C901", "PLR0912", "PLR0915"]

[tool.ruff.lint.pylint]
allow-magic-value-types = ["bytes", "float", "int", "str"]
32 changes: 21 additions & 11 deletions tests/README.md
@@ -90,7 +90,7 @@ UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"
# Run specific category
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py -k "multi_turn_base"

# Validate existing logs
# Validate existing logs (auto-detects from outputs/raw/)
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py --validate-only
```

@@ -99,14 +99,14 @@ UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"
appworld install
appworld download data

# Run all train tasks
# Run all train tasks (results automatically organized to results/{model}/{dataset}/)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o

# Run specific task
.venv/bin/pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[train_001]'

# Validate existing results
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only
# Validate existing results (auto-detects from results/{model}/{dataset}/)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
```

### 5. Smoke Tests (`smoke/`)
@@ -211,8 +211,13 @@ async def test_with_client():
Global fixtures available to all tests:

- `model`: Model name from `--model` CLI option (default: gpt-4o-mini)
- `output_dir`: Output directory from `--output-dir` (default: outputs)
- `fast_agent`: FastAgent instance (e2e tests only)
- `temperature`: Temperature from `--temperature` CLI option (default: 0.001)

Benchmark-specific fixtures:

- `output_dir`: Output directory (benchmark-specific override)
  - BFCL: Uses `--output-dir` (default: outputs)
  - AppWorld: Automatically uses `results/{model}/{dataset}/outputs/`
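
For orientation, a minimal sketch of a test consuming these fixtures (a hypothetical example; real benchmark tests are more involved):

```python
from pathlib import Path


def test_fixture_usage(model: str, temperature: float, output_dir: Path) -> None:
    # pytest injects the CLI-derived values listed above
    log_file = output_dir / "raw" / f"{model}_example.jsonl"
    log_file.parent.mkdir(parents=True, exist_ok=True)
    assert temperature >= 0
```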

### Custom Markers

@@ -222,16 +227,21 @@ Registered markers:

### CLI Options

**Global options:**
```bash
--model MODEL # Model to use (default: gpt-4o-mini)
--temperature FLOAT # Temperature for sampling (default: 0.001)
--output-dir DIR # Output directory (default: outputs)
--validate-only # Only validate existing logs (benchmarks)
--log-dir DIR # Log directory for validation
--max-workers N # Max concurrent tests (default: 4)
--validate-only # Only validate existing logs
```

# AppWorld-specific options
--dataset DATASET # AppWorld dataset: train, dev, test_normal, test_challenge
**AppWorld-specific options:**
```bash
--dataset DATASET # Dataset: train, dev, test_normal, test_challenge (default: train)
--limit N # Run only first N tasks from dataset
--api-mode MODE # API prediction mode (default: app_oracle)
--experiment-dir DIR # Custom experiment directory name
--start-from TASK_ID # Resume from specific task ID
```

## Common Commands
43 changes: 37 additions & 6 deletions tests/benchmarks/appworld/README.md
@@ -16,20 +16,20 @@ appworld download data
## Run Tests

```bash
# Run first task from train dataset
# Run first task from train dataset (automatically organized to results/gpt-4o/train/)
pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 1 --model gpt-4o

# Run first 5 train tasks
pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 5
pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 5 --model gpt-4o

# Run all dev tasks
pytest tests/benchmarks/appworld/test_appworld.py --dataset dev --model gpt-4o

# Run specific task (use actual task IDs like 82e2fac_1, not train_001)
pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[82e2fac_1]'

# Validate existing results without re-running
pytest tests/benchmarks/appworld/test_appworld.py --validate-only
# Validate existing results (auto-detects from results/gpt-4o/train/)
pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
```

## CLI Options
@@ -84,13 +84,44 @@ tests/benchmarks/appworld/
4. **Validation**: AppWorld evaluator compares DB state to ground truth
5. **Result**: Pass/fail based on whether task requirements were met

## Results Organization

AppWorld tests automatically organize results during execution:

```
results/{model}/{dataset}/
├── outputs/
│   └── raw/
│       ├── {task_id}_complete.json      # Conversation logs
│       └── {task_id}_structured.jsonl   # Turn-by-turn events
└── failure_reports/
    └── failure_report_{task_id}.md      # Auto-generated for failed tests

experiments/outputs/{model}/{dataset}/   # AppWorld evaluation data (~15GB)
└── tasks/{task_id}/
    ├── dbs/                             # Database snapshots
    └── evaluation/
        └── report.md                    # Evaluation results
```

### Cleanup

After tests complete, clean up large experiment directories:

```bash
rm -rf experiments/outputs/gpt-4o/ # Frees ~15GB
```

## Debugging

### Inspect Test Output
```bash
# Structured logs
cat outputs/raw/<task_id>_structured.jsonl
cat results/gpt-4o/train/outputs/raw/<task_id>_structured.jsonl

# Complete message history
cat outputs/raw/<task_id>_complete.json
cat results/gpt-4o/train/outputs/raw/<task_id>_complete.json

# Failure report (auto-generated for failed tests)
cat results/gpt-4o/train/failure_reports/failure_report_<task_id>.md
```
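
For programmatic inspection, a short sketch that pretty-prints the turn-by-turn events (paths mirror the examples above; no event field names are assumed):

```python
import json
from pathlib import Path

# Illustrative only: the task ID and model/dataset mirror the examples above.
log_path = Path("results/gpt-4o/train/outputs/raw/82e2fac_1_structured.jsonl")

for line in log_path.read_text().splitlines():
    event = json.loads(line)  # one JSON event per line
    print(json.dumps(event, indent=2))
```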
63 changes: 62 additions & 1 deletion tests/benchmarks/appworld/conftest.py
@@ -1,8 +1,26 @@
"""Pytest configuration and fixtures for AppWorld benchmark."""

from pathlib import Path

import pytest


@pytest.fixture
def output_dir(request: pytest.FixtureRequest) -> Path:
"""AppWorld-specific output directory.

Overrides the global output_dir fixture to write directly to
results/{model}/{dataset}/outputs/ for organized storage.
"""
model = str(request.config.getoption("--model"))
dataset = str(request.config.getoption("--dataset"))

# Write directly to results directory
path = Path("results") / model / dataset / "outputs"
path.mkdir(parents=True, exist_ok=True)
return path


def pytest_addoption(parser: pytest.Parser) -> None:
"""Add AppWorld-specific CLI options."""
parser.addoption(
@@ -19,13 +37,32 @@ def pytest_addoption(parser: pytest.Parser) -> None:
    )
    parser.addoption(
        "--api-mode",
        default="ground_truth",
        default="app_oracle",
        choices=["predicted", "ground_truth", "app_oracle", "all"],
        help=(
            "API prediction mode: predicted (LLM), ground_truth (API-level oracle), "
            "app_oracle (app-level oracle), all (default: ground_truth)"
        ),
    )
    parser.addoption(
        "--experiment-dir",
        default=None,
        type=str,
        help=(
            "Experiment directory name (e.g., 'gpt-5/train' or 'claude-sonnet-4-5/dev'). "
            "If not specified, auto-generates timestamp-based name. "
            "Results will be saved to experiments/outputs/{experiment-dir}/"
        ),
    )
    parser.addoption(
        "--start-from",
        default=None,
        type=str,
        help=(
            "Start from specified task_id (skip all tests before it). "
            "Example: --start-from 692c77d_1. Useful for resuming interrupted benchmark runs."
        ),
    )


@pytest.fixture
@@ -53,3 +90,27 @@ def api_mode(request: pytest.FixtureRequest) -> str:
"all": Use all available APIs (no limit)
"""
return str(request.config.getoption("--api-mode"))


@pytest.fixture(scope="session")
def experiment_name(request: pytest.FixtureRequest) -> str:
"""
Get or generate experiment directory name for the test session.

All tests in this session will write to the same experiment directory,
organized by task_id in subdirectories: experiments/outputs/{experiment_name}/tasks/{task_id}/

Automatically uses {model}/{dataset} pattern for organized experiment tracking.
"""

experiment_dir = request.config.getoption("--experiment-dir", None)

if experiment_dir:
# Use specified experiment directory
return str(experiment_dir)
else:
# Use model/dataset pattern for organized experiment tracking
# This works for both normal runs and validation
model = str(request.config.getoption("--model"))
dataset = str(request.config.getoption("--dataset"))
return f"{model}/{dataset}"
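
For context, one plausible way the new `--start-from` option could be applied at collection time is sketched below (a hypothetical hook for illustration; the actual implementation in this PR may differ):

```python
import pytest


def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
    """Hypothetical sketch: skip AppWorld tests that precede --start-from."""
    start_from = config.getoption("--start-from", None)
    if not start_from:
        return
    skip = pytest.mark.skip(reason=f"before --start-from {start_from}")
    reached = False
    for item in items:
        # Parametrized test IDs look like test_appworld[82e2fac_1]
        task_id = item.name.split("[")[-1].rstrip("]")
        if task_id == start_from:
            reached = True
        if not reached:
            item.add_marker(skip)
```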
4 changes: 2 additions & 2 deletions tests/benchmarks/appworld/mcp_server.py
@@ -201,7 +201,7 @@ async def serve_task_mcp(task_id: str, experiment_name: str = "wags-benchmark")
    server = Server(server_name)

    # Register list_tools handler
    @server.list_tools()
    @server.list_tools()  # type: ignore[untyped-decorator]
    async def list_tools() -> list[Tool]:
        """List available tools with AppWorld's MCP schemas."""
        tools: list[Tool] = []
@@ -216,7 +216,7 @@ async def list_tools() -> list[Tool]:
        return tools

    # Register call_tool handler
    @server.call_tool()
    @server.call_tool()  # type: ignore[untyped-decorator]
    async def call_tool(name: str, arguments: dict[str, Any]) -> Any:
        """Call AppWorld API and save databases on task completion."""
        app_name = api_name = name