9 changes: 6 additions & 3 deletions .github/workflows/ci.yml
@@ -27,7 +27,10 @@ jobs:
run: uv pip install --system pre-commit

- name: Run pre-commit
run: pre-commit run --all-files
run: SKIP=mypy pre-commit run --all-files

- name: Run mypy
run: uv run mypy src/ tests/ --exclude 'tests/benchmarks/appworld/'

test:
runs-on: ubuntu-latest
@@ -50,7 +53,7 @@ jobs:
run: uv pip install --system -e ".[dev]"

- name: Run unit tests
run: uv run pytest tests/unit/ -v
run: uv run pytest tests/unit/ -v -p no:warnings

- name: Run integration tests
run: uv run pytest tests/integration/ -v
run: uv run pytest tests/integration/ -v -p no:warnings
5 changes: 4 additions & 1 deletion .gitignore
@@ -58,4 +58,7 @@ test_script_*.py
CLAUDE.md

# mkdocs output
site/
site/

# Appworld data
data/
31 changes: 28 additions & 3 deletions README.md
@@ -153,9 +153,11 @@ pre-commit run --all-files

## Running Benchmarks

WAGS includes evaluation support for the Berkeley Function Call Leaderboard (BFCL).
WAGS includes evaluation support for:
- **BFCL**: Berkeley Function Call Leaderboard
- **AppWorld**: Realistic task evaluation across 9 day-to-day apps

### Setup
### BFCL Setup
First, install the evaluation dependencies:

```bash
@@ -166,8 +168,22 @@ git submodule update --init --recursive
uv pip install -e ".[dev,evals]"
```

### AppWorld Setup

```bash
# Install evaluation dependencies
UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"

# Initialize AppWorld environment
appworld install

# Download benchmark data
appworld download data
```

### Run Benchmark Tests

**BFCL:**
```bash
# Run all BFCL tests
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py
@@ -179,8 +195,17 @@ uv pip install -e ".[dev,evals]"
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py --model gpt-4o
```

**AppWorld:**
```bash
# Run all train tasks
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o

# Run a specific task (task IDs look like 82e2fac_1)
.venv/bin/pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[82e2fac_1]'
```

For detailed information, see:
- **Benchmark architecture and categories**: [docs/benchmarks.md](https://chughtapan.github.io/wags/benchmarks/)
- **Evaluation guide**: [docs/evals.md](https://chughtapan.github.io/wags/evals/)
- **Test organization and patterns**: [tests/README.md](tests/README.md)

## License
27 changes: 25 additions & 2 deletions docs/evals.md
@@ -1,6 +1,8 @@
# Running Evals

Here's how to run BFCL multi-turn evaluations with <span class="wags-brand">wags</span>.
Here's how to run benchmark evaluations with <span class="wags-brand">wags</span>. We currently support:
- **BFCL**: Berkeley Function Call Leaderboard multi-turn tests
- **AppWorld**: Realistic task evaluation across 9 day-to-day apps

## Setup

@@ -23,6 +25,21 @@ If you already have the submodule initialized:
git submodule update --remote
```

### AppWorld Setup

For AppWorld benchmark evaluation:

```bash
# Install evaluation dependencies
UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"

# Initialize AppWorld environment
appworld install

# Download benchmark data
appworld download data
```

## Running Tests

### Basic Usage
@@ -72,5 +89,11 @@ Validate existing logs without running new tests:

## Further Reading

### BFCL
- **BFCL leaderboard**: Visit [gorilla.cs.berkeley.edu](https://gorilla.cs.berkeley.edu/leaderboard.html)
- **Official BFCL repository**: [github.com/ShishirPatil/gorilla](https://github.com/ShishirPatil/gorilla)
- **Official BFCL repository**: [github.com/ShishirPatil/gorilla](https://github.com/ShishirPatil/gorilla)

### AppWorld
- **AppWorld Website**: [appworld.dev](https://appworld.dev/)
- **AppWorld GitHub**: [github.com/StonyBrookNLP/appworld](https://github.com/StonyBrookNLP/appworld)
- **AppWorld Paper**: [arxiv.org/abs/2407.18901](https://arxiv.org/abs/2407.18901)
26 changes: 26 additions & 0 deletions pyproject.toml
@@ -33,10 +33,19 @@ dev = [

evals = [
"bfcl-eval",
"appworld",
"appworld-experiments[simplified]",
]

[tool.uv]
override-dependencies = [
"openai>=1.108.0", # Override appworld-experiments' openai<=1.99.8 constraint
]

[tool.uv.sources]
bfcl-eval = { git = "https://github.com/chughtapan/gorilla.git", subdirectory = "berkeley-function-call-leaderboard", branch = "wags-dev" }
appworld = { git = "https://github.com/StonyBrookNLP/appworld.git", rev = "f31e671ea7e5acc5eb093c0d2761aee3b7a6567b" }
appworld-experiments = { git = "https://github.com/StonyBrookNLP/appworld.git", rev = "f31e671ea7e5acc5eb093c0d2761aee3b7a6567b", subdirectory = "experiments" }

[tool.ruff]
line-length = 120
@@ -61,6 +70,8 @@ ignore = ["PERF203", "PLC0415", "PLR0402"]
"servers/*/handlers.py" = ["PLR0913"]
# CLI commands naturally have many parameters (one per flag/option)
"src/wags/cli/main.py" = ["PLR0913"]
# Pytest fixtures require many parameters
"tests/benchmarks/appworld/test_appworld.py" = ["PLR0913"]

[tool.ruff.lint.pylint]
allow-magic-value-types = ["bytes", "float", "int", "str"]
@@ -71,6 +82,10 @@ asyncio_mode = "auto"
addopts = "-v --tb=short"
# Only run unit and integration tests by default
norecursedirs = ["tests/benchmarks", "tests/e2e", "tests/smoke"]
filterwarnings = [
"ignore::sqlalchemy.exc.SADeprecationWarning:appworld.apps.lib.models.db",
"ignore::pydantic.warnings.PydanticDeprecatedSince20:appworld.apps.*",
]

[tool.mypy]
warn_return_any = true
@@ -90,3 +105,14 @@ disallow_untyped_calls = true
module = "bfcl_eval.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"appworld.*",
"appworld_experiments.*",
]
follow_untyped_imports = true
ignore_errors = true

[[tool.mypy.overrides]]
module = "tests.benchmarks.appworld.mcp_server"
disable_error_code = ["attr-defined", "no-untyped-call", "misc"]
28 changes: 26 additions & 2 deletions tests/README.md
@@ -66,8 +66,12 @@ Tests with real LLMs using FastAgent patterns.
```

### 4. Benchmark Tests (`benchmarks/`)
BFCL (Berkeley Function Call Leaderboard) evaluation tests.
Large-scale evaluation suites including BFCL and AppWorld.

```bash
# First-time setup
UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"
```
**Characteristics:**
- Large-scale evaluation suites
- Excluded from default test runs
@@ -76,8 +80,9 @@ BFCL (Berkeley Function Call Leaderboard) evaluation tests.

**Examples:**
- `benchmarks/bfcl/test_bfcl.py` - BFCL multi-turn tests
- `benchmarks/appworld/test_appworld.py` - AppWorld realistic task tests

**Run:**
**Run BFCL:**
```bash
# Run all BFCL tests
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py --model gpt-4o
@@ -89,6 +94,21 @@ BFCL (Berkeley Function Call Leaderboard) evaluation tests.
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py --validate-only
```

**Run AppWorld:**
```bash
appworld install
appworld download data

# Run all train tasks
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o

# Run a specific task (task IDs look like 82e2fac_1)
.venv/bin/pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[82e2fac_1]'

# Validate existing results
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only
```

### 5. Smoke Tests (`smoke/`)
Quick validation and initial experimentation.

@@ -208,6 +228,10 @@ Registered markers:
--validate-only # Only validate existing logs (benchmarks)
--log-dir DIR # Log directory for validation
--max-workers N # Max concurrent tests (default: 4)

# AppWorld-specific options
--dataset DATASET # AppWorld dataset: train, dev, test_normal, test_challenge
--limit N # Run only first N tasks from dataset
```
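
As a rough illustration, custom options like `--dataset` and `--limit` are typically registered in a `conftest.py` via `pytest_addoption` and used to parametrize tests per task. The sketch below is hypothetical (the actual `tests/benchmarks/appworld/conftest.py` may differ) and assumes AppWorld's `load_task_ids` helper:

```python
# Hypothetical conftest.py sketch; the repo's real fixtures may differ.
import pytest
from appworld import load_task_ids  # provided by the appworld package


def pytest_addoption(parser: pytest.Parser) -> None:
    parser.addoption(
        "--dataset",
        default="train",
        choices=["train", "dev", "test_normal", "test_challenge"],
        help="AppWorld dataset split to run",
    )
    parser.addoption(
        "--limit",
        type=int,
        default=None,
        help="Run only the first N tasks from the selected dataset",
    )


def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
    # Parametrize any test that takes a `task_id` argument with the task ids
    # of the chosen split, truncated by --limit when it is set.
    if "task_id" in metafunc.fixturenames:
        task_ids = load_task_ids(metafunc.config.getoption("--dataset"))
        limit = metafunc.config.getoption("--limit")
        metafunc.parametrize("task_id", task_ids[:limit] if limit else task_ids)
```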

## Common Commands
Expand Down
12 changes: 12 additions & 0 deletions tests/benchmarks/appworld/.gitignore
@@ -0,0 +1,12 @@
# AppWorld data directories
data/
experiments/

# Docker volumes
*.bundle

# Test outputs
outputs/
*.jsonl
*_complete.json
*_structured.jsonl
87 changes: 87 additions & 0 deletions tests/benchmarks/appworld/README.md
@@ -0,0 +1,87 @@
# AppWorld Benchmark

## Install AppWorld

```bash
# Install evaluation dependencies
UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"

# Initialize AppWorld environment (unpacks encrypted code)
appworld install

# Download benchmark data (if not already present)
appworld download data
```

## Run Tests

```bash
# Run first task from train dataset
pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 1 --model gpt-4o

# Run first 5 train tasks
pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 5

# Run all dev tasks
pytest tests/benchmarks/appworld/test_appworld.py --dataset dev --model gpt-4o

# Run specific task (use actual task IDs like 82e2fac_1, not train_001)
pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[82e2fac_1]'

# Validate existing results without re-running
pytest tests/benchmarks/appworld/test_appworld.py --validate-only
```

## CLI Options

### Dataset Selection
```bash
--dataset {train,dev,test_normal,test_challenge} # Default: train
```
- `train`: Training dataset (90 tasks)
- `dev`: Development/validation dataset
- `test_normal`: Standard test set
- `test_challenge`: Challenging test set

### Test Limits
```bash
--limit N # Only run first N tasks from dataset
```

### Model Selection
```bash
--model gpt-4o # LLM model to use (default: gpt-4o-mini)
--temperature 0.001 # Temperature for sampling (default: 0.001)
```

## File Structure

```
tests/benchmarks/appworld/
├── mcp_server.py # Custom MCP wrapper (accepts task_id)
├── appworld_helpers.py # API prediction and instruction loading
├── test_appworld.py # Pytest test cases and evaluation
├── fastagent.config.yaml # MCP connection config (uses env vars)
├── conftest.py # Pytest fixtures (dataset, limit, api_mode)
├── system_instruction.txt # Agent system instructions template
└── README.md # This file
```
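
The `mcp_server.py` wrapper exposes AppWorld to the agent as MCP tools. Below is a minimal, hypothetical sketch of such a wrapper (the real server may expose more tools and manage the task lifecycle differently); it assumes the `mcp` Python SDK's `FastMCP` and AppWorld's code-execution API, with the task id passed via an illustrative environment variable:

```python
# Hypothetical sketch of a task-scoped MCP wrapper; the repo's mcp_server.py
# may be organized differently. APPWORLD_TASK_ID / APPWORLD_EXPERIMENT are
# example environment variables, not confirmed names.
import os

from appworld import AppWorld
from mcp.server.fastmcp import FastMCP

mcp = FastMCP("appworld")

# One AppWorld environment per server process, selected by the launching test.
world = AppWorld(
    task_id=os.environ["APPWORLD_TASK_ID"],
    experiment_name=os.environ.get("APPWORLD_EXPERIMENT", "wags"),
)


@mcp.tool()
def execute(code: str) -> str:
    """Run Python code against the AppWorld apps and return its output."""
    return world.execute(code)


if __name__ == "__main__":
    mcp.run()  # serves over stdio by default
```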

## Evaluation Process

1. **Agent Execution**: Agent interacts with apps via MCP tools
2. **Task Completion**: Agent calls `apis.supervisor.complete_task()`
3. **State Capture**: Final database state is saved
4. **Validation**: AppWorld evaluator compares DB state to ground truth
5. **Result**: Pass/fail based on whether task requirements were met
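
A rough end-to-end sketch of this flow using AppWorld's Python API (the actual `test_appworld.py` harness may wire this up differently):

```python
# Hypothetical sketch of steps 1-5; the task id and experiment name are examples.
from appworld import AppWorld

with AppWorld(task_id="82e2fac_1", experiment_name="wags-eval") as world:
    print(world.task.instruction)   # the task the agent must accomplish
    # ... the agent drives the apps (here via MCP tools) and finishes by
    # calling apis.supervisor.complete_task(); the DB state is then final.
    report = world.evaluate()       # compare final DB state to ground truth
    print(report)                   # per-requirement pass/fail summary
```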

## Debugging

### Inspect Test Output
```bash
# Structured logs
cat outputs/raw/<task_id>_structured.jsonl

# Complete message history
cat outputs/raw/<task_id>_complete.json
```
1 change: 1 addition & 0 deletions tests/benchmarks/appworld/__init__.py
@@ -0,0 +1 @@
"""AppWorld benchmark integration for WAGS."""