Commit 0168d09

Merge pull request #36 from chughtapan/add-mcp-universe-benchmark
Add mcp universe benchmark
2 parents 93aa1ba + 2493834 commit 0168d09

14 files changed: +2132 −95 lines changed

pyproject.toml

Lines changed: 14 additions & 1 deletion

```diff
@@ -35,17 +35,26 @@ dev = [
 
 evals = [
     "bfcl-eval",
+    "mcpuniverse",
+    "jinja2>=3.0.0",
+    "python-dotenv>=1.0.0",
     "appworld",
     "appworld-experiments[simplified]",
 ]
 
 [tool.uv]
 override-dependencies = [
-    "openai[aiohttp]>=1.108.0", # Override appworld-experiments' openai<=1.99.8 constraint
+    "openai[aiohttp]>=1.108.0", # Override appworld-experiments' openai<=1.99.8 constraint and mcpuniverse's openai==1.106.1
+    "anthropic>=0.68.0", # Override mcpuniverse's anthropic==0.49.0 constraint
+    "google-genai>=1.33.0", # Override mcpuniverse's google-genai==1.16.1 constraint
+    "mistralai>=1.7.0", # Override mcpuniverse's mistralai==1.6.0 constraint
+    "python-dotenv>=1.1.0", # Override mcpuniverse's python-dotenv==1.0.1 constraint
+    "mcp @ git+https://github.com/chughtapan/python-sdk.git@wags-dev", # Align with core dependency
 ]
 
 [tool.uv.sources]
 bfcl-eval = { git = "https://github.com/chughtapan/gorilla.git", subdirectory = "berkeley-function-call-leaderboard", branch = "wags-dev" }
+mcpuniverse = { git = "https://github.com/vinamra57/MCP-Universe.git", branch = "wags-dev" }
 appworld = { git = "https://github.com/StonyBrookNLP/appworld.git", rev = "f31e671ea7e5acc5eb093c0d2761aee3b7a6567b" }
 appworld-experiments = { git = "https://github.com/StonyBrookNLP/appworld.git", rev = "f31e671ea7e5acc5eb093c0d2761aee3b7a6567b", subdirectory = "experiments" }
 
@@ -111,6 +120,10 @@ disallow_untyped_calls = true
 module = "bfcl_eval.*"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "mcpuniverse.*"
+ignore_missing_imports = true
+
 [[tool.mypy.overrides]]
 module = [
     "appworld.*",
```

tests/benchmarks/appworld/mcp_server.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -201,7 +201,7 @@ async def serve_task_mcp(task_id: str, experiment_name: str = "wags-benchmark")
     server = Server(server_name)
 
     # Register list_tools handler
-    @server.list_tools()  # type: ignore[untyped-decorator]
+    @server.list_tools()
     async def list_tools() -> list[Tool]:
         """List available tools with AppWorld's MCP schemas."""
         tools: list[Tool] = []
@@ -216,7 +216,7 @@ async def list_tools() -> list[Tool]:
         return tools
 
     # Register call_tool handler
-    @server.call_tool()  # type: ignore[untyped-decorator]
+    @server.call_tool()
     async def call_tool(name: str, arguments: dict[str, Any]) -> Any:
         """Call AppWorld API and save databases on task completion."""
         app_name = api_name = name
```
Lines changed: 122 additions & 0 deletions (new file)

# MCP-Universe Repository Management Benchmark Integration

This directory contains the integration of the MCP-Universe repository management benchmark into WAGS.

## Overview

MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evaluates LLMs on realistic tasks using real-world MCP servers. This integration focuses on the **repository management domain** with:

- **28 pure GitHub tasks** (github_task_0001 through github_task_0030, excluding 0013 and 0020)
- Realistic GitHub operations, including:
  - Creating repositories and branches
  - Managing files and commits
  - Creating pull requests
  - Copying files between repositories
  - Managing issues and labels

## Quick Start

### Prerequisites

1. **Docker** - Required to run the GitHub MCP server
   - Install Docker Desktop: https://www.docker.com/products/docker-desktop
   - **Start Docker Desktop** before running tests
   - Verify the installation: `docker --version`
   - **Note**: The GitHub MCP server image is pinned to v0.15.0 for research reproducibility (before PR #1091, which added automatic instruction generation)
   - If the `docker` command is not found, make sure Docker Desktop is running and restart your terminal
2. **GitHub Personal Access Token** - For GitHub API access
   - **CRITICAL**: Use a dedicated test GitHub account for safety
   - Create a token at https://github.com/settings/tokens
3. **OpenAI API Key** (or an Anthropic key for Claude models) - For running the LLM agent
4. **Python 3.13+** with [uv](https://docs.astral.sh/uv/)

### Installation

```bash
# Clone the repository (if not already done)
git clone https://github.com/chughtapan/wags.git
cd wags

# Install dependencies (pulls the forked MCP-Universe package via the evals extra)
UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"

# Verify Docker is working
docker run --rm hello-world

# Pre-pull the GitHub MCP server image (recommended for faster test startup)
docker pull ghcr.io/github/github-mcp-server:v0.15.0
```

**Note**: The `.[dev,evals]` extras install:

- `mcpuniverse` from the fork [`vinamra57/MCP-Universe@72389d8`](https://github.com/vinamra57/MCP-Universe/tree/72389d8a04044dceb855f733a938d0344ac58813), which removes heavy 3D dependencies while keeping the repository-management configs
- `bfcl-eval` for Berkeley Function Call Leaderboard evaluation
- Other shared evaluation dependencies

All repository management task JSON files are bundled inside the installed `mcpuniverse` wheel, so no git submodules or manual data checkout are required; the sketch below shows one way to confirm they are present.
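
As a quick sanity check (not part of the benchmark code), you can list the bundled task configs with `importlib.resources`; the package path below mirrors the one used by the evaluation module added in this commit.

```python
# Sanity-check sketch: list the bundled repository-management task configs.
# The package path mirrors the _DATA_DIR used by the evaluation code in this commit.
from importlib import resources

data_dir = resources.files("mcpuniverse").joinpath(
    "benchmark/configs/test/repository_management"
)
print(sorted(p.name for p in data_dir.iterdir() if p.name.endswith(".json")))
```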
### Configuration

Environment variables are automatically loaded from `servers/github/.env`. Create this file with:

```bash
# servers/github/.env
GITHUB_PERSONAL_ACCESS_TOKEN=your_github_token_here
GITHUB_PERSONAL_ACCOUNT_NAME=your_github_username
OPENAI_API_KEY=your_openai_key_here
```

**IMPORTANT**: Use a dedicated test GitHub account. The AI agent will perform real operations on GitHub repositories.

Alternatively, you can manually export the environment variables:

```bash
export GITHUB_PERSONAL_ACCESS_TOKEN="your_github_token_here"
export GITHUB_PERSONAL_ACCOUNT_NAME="your_github_username"
export OPENAI_API_KEY="your_openai_key_here"
```
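
For reference, the automatic loading can be reproduced with `python-dotenv` (added to the `evals` extra in this commit). The benchmark's own fixtures handle this for you, so treat the path and call below as an illustrative sketch rather than the harness's exact code.

```python
# Illustrative sketch only: load servers/github/.env with python-dotenv.
# The benchmark fixtures do this automatically; the path is the one documented above.
from pathlib import Path

from dotenv import load_dotenv

load_dotenv(Path("servers/github/.env"))
```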

### Running Tests

Run all 28 repository management tasks:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
    --model gpt-4o-mini \
    --output-dir outputs/mcp_universe \
    -v
```

Run a single task:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] \
    --model gpt-4o-mini \
    --output-dir outputs/mcp_universe \
    -v
```

Run with different models:

```bash
# Use GPT-4o
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
    --model gpt-4o \
    --output-dir outputs/mcp_universe

# Use Claude (requires ANTHROPIC_API_KEY)
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
    --model claude-3-5-sonnet-20241022 \
    --output-dir outputs/mcp_universe
```

### Validate Mode

If you have existing output files, you can validate them without re-running the agent:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
    --validate-only \
    --log-dir outputs/mcp_universe/raw \
    --output-dir outputs/mcp_universe
```
Lines changed: 1 addition & 0 deletions (new file)

```python
"""MCP-Universe benchmark integration for wags."""
```
Lines changed: 63 additions & 0 deletions (new file)

```python
"""Evaluation logic for MCP-Universe test results."""

import json
from importlib import resources
from pathlib import Path
from typing import Any, cast

from mcpuniverse.common.context import Context
from mcpuniverse.evaluator.evaluator import EvaluationResult, Evaluator

_DATA_DIR = resources.files("mcpuniverse").joinpath("benchmark/configs/test/repository_management")


async def run_evaluation(
    task_id: str,
    context: Context,
) -> dict[str, Any]:
    """
    Run evaluation for a repository management task.

    Args:
        task_id: Test case identifier
        context: MCP-Universe context with environment variables

    Returns:
        Dictionary with evaluation results
    """
    # Load task data
    task_file = _DATA_DIR.joinpath(f"{task_id}.json")
    with task_file.open("r", encoding="utf-8") as f:
        task = cast(dict[str, Any], json.load(f))

    # Set MCP server config path for the evaluator
    # This tells MCPManager where to find the GitHub MCP server configuration
    mcp_config_path = Path(__file__).parent / "mcp_server_config.json"
    context.env["MCP_SERVER_CONFIG"] = str(mcp_config_path)

    # Run all evaluators
    evaluation_results: list[EvaluationResult] = []

    for evaluator_config in task.get("evaluators", []):
        evaluator = Evaluator(evaluator_config, context=context)
        result = await evaluator.evaluate({})
        evaluation_results.append(result)

    # Calculate overall pass/fail
    all_passed = all(result.passed for result in evaluation_results)

    return {
        "task_id": task_id,
        "passed": all_passed,
        "evaluation_results": [
            {
                "func": result.config.func,
                "op": result.config.op,
                "passed": result.passed,
                "reason": result.reason,
                "error": result.error,
            }
            for result in evaluation_results
        ],
        "task_data": task,
    }
```
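
A minimal driver sketch for the function above, assuming `Context()` can be constructed without arguments and exposes an `env` mapping (check `mcpuniverse.common.context` for the actual constructor); the import path `evaluate` is likewise an assumption for illustration.

```python
# Minimal driver sketch (assumptions: Context() takes no required arguments and
# exposes an `env` mapping; the import path `evaluate` is illustrative).
import asyncio
import os

from mcpuniverse.common.context import Context

from evaluate import run_evaluation


async def main() -> None:
    context = Context()
    # The GitHub evaluators need the same token the agent used.
    context.env["GITHUB_PERSONAL_ACCESS_TOKEN"] = os.environ["GITHUB_PERSONAL_ACCESS_TOKEN"]
    result = await run_evaluation("github_task_0001", context)
    print(result["passed"], [e["func"] for e in result["evaluation_results"]])


if __name__ == "__main__":
    asyncio.run(main())
```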
