Commit 0168d09

Merge pull request #36 from chughtapan/add-mcp-universe-benchmark
Add mcp universe benchmark
2 parents 93aa1ba + 2493834 commit 0168d09

14 files changed: +2132 −95 lines changed

pyproject.toml

Lines changed: 14 additions & 1 deletion

```diff
@@ -35,17 +35,26 @@ dev = [
 
 evals = [
     "bfcl-eval",
+    "mcpuniverse",
+    "jinja2>=3.0.0",
+    "python-dotenv>=1.0.0",
     "appworld",
     "appworld-experiments[simplified]",
 ]
 
 [tool.uv]
 override-dependencies = [
-    "openai[aiohttp]>=1.108.0", # Override appworld-experiments' openai<=1.99.8 constraint
+    "openai[aiohttp]>=1.108.0", # Override appworld-experiments' openai<=1.99.8 constraint and mcpuniverse's openai==1.106.1
+    "anthropic>=0.68.0", # Override mcpuniverse's anthropic==0.49.0 constraint
+    "google-genai>=1.33.0", # Override mcpuniverse's google-genai==1.16.1 constraint
+    "mistralai>=1.7.0", # Override mcpuniverse's mistralai==1.6.0 constraint
+    "python-dotenv>=1.1.0", # Override mcpuniverse's python-dotenv==1.0.1 constraint
+    "mcp @ git+https://github.com/chughtapan/python-sdk.git@wags-dev", # Align with core dependency
 ]
 
 [tool.uv.sources]
 bfcl-eval = { git = "https://github.com/chughtapan/gorilla.git", subdirectory = "berkeley-function-call-leaderboard", branch = "wags-dev" }
+mcpuniverse = { git = "https://github.com/vinamra57/MCP-Universe.git", branch = "wags-dev" }
 appworld = { git = "https://github.com/StonyBrookNLP/appworld.git", rev = "f31e671ea7e5acc5eb093c0d2761aee3b7a6567b" }
 appworld-experiments = { git = "https://github.com/StonyBrookNLP/appworld.git", rev = "f31e671ea7e5acc5eb093c0d2761aee3b7a6567b", subdirectory = "experiments" }
 
@@ -111,6 +120,10 @@ disallow_untyped_calls = true
 module = "bfcl_eval.*"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "mcpuniverse.*"
+ignore_missing_imports = true
+
 [[tool.mypy.overrides]]
 module = [
     "appworld.*",
```

tests/benchmarks/appworld/mcp_server.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -201,7 +201,7 @@ async def serve_task_mcp(task_id: str, experiment_name: str = "wags-benchmark")
     server = Server(server_name)
 
     # Register list_tools handler
-    @server.list_tools()  # type: ignore[untyped-decorator]
+    @server.list_tools()
     async def list_tools() -> list[Tool]:
         """List available tools with AppWorld's MCP schemas."""
         tools: list[Tool] = []
@@ -216,7 +216,7 @@ async def list_tools() -> list[Tool]:
         return tools
 
     # Register call_tool handler
-    @server.call_tool()  # type: ignore[untyped-decorator]
+    @server.call_tool()
     async def call_tool(name: str, arguments: dict[str, Any]) -> Any:
         """Call AppWorld API and save databases on task completion."""
         app_name = api_name = name
```
Lines changed: 122 additions & 0 deletions (new file)

# MCP-Universe Repository Management Benchmark Integration

This directory contains the integration of the MCP-Universe repository management benchmark into WAGS.

## Overview

MCP-Universe is a comprehensive benchmark from Salesforce AI Research that evaluates LLMs on realistic tasks using real-world MCP servers. This integration focuses on the **repository management domain** with:

- **28 pure GitHub tasks** (github_task_0001 through github_task_0030, excluding 0013 and 0020)
- Realistic GitHub operations, including:
  - Creating repositories and branches
  - Managing files and commits
  - Creating pull requests
  - Copying files between repositories
  - Managing issues and labels

## Quick Start

### Prerequisites

1. **Docker** - Required to run the GitHub MCP server
   - Install Docker Desktop: https://www.docker.com/products/docker-desktop
   - **Start Docker Desktop** before running tests
   - Verify the installation: `docker --version`
   - **Note**: The GitHub MCP server image is pinned to v0.15.0 for research reproducibility (before PR #1091, which added automatic instruction generation)
   - If the `docker` command is not found, make sure Docker Desktop is running and restart your terminal
2. **GitHub Personal Access Token** - For GitHub API access
   - **CRITICAL**: Use a dedicated test GitHub account for safety
   - Create a token at https://github.com/settings/tokens
3. **OpenAI API Key** (or an Anthropic key for Claude models) - For running the LLM agent
4. **Python 3.13+** with [uv](https://docs.astral.sh/uv/)

### Installation

```bash
# Clone the repository (if not already done)
git clone https://github.com/chughtapan/wags.git
cd wags

# Install dependencies (pulls the forked MCP-Universe package via the evals extra)
UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"

# Verify Docker is working
docker run --rm hello-world

# Pre-pull the GitHub MCP server image (recommended for faster test startup)
docker pull ghcr.io/github/github-mcp-server:v0.15.0
```

**Note**: The `.[dev,evals]` extras install:

- `mcpuniverse` from the fork [`vinamra57/MCP-Universe@72389d8`](https://github.com/vinamra57/MCP-Universe/tree/72389d8a04044dceb855f733a938d0344ac58813), which removes heavy 3D dependencies while keeping the repository-management configs
- `bfcl-eval` for Berkeley Function Call Leaderboard evaluation
- Other shared evaluation dependencies

All repository management task JSON files are bundled inside the installed `mcpuniverse` wheel, so no git submodules or manual data checkout are required; the sketch below shows one way to confirm they are present.
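
As a quick sanity check (not part of the benchmark code), you can list the bundled task configs with `importlib.resources`; the package path below mirrors the one used by the evaluation module added in this commit.

```python
# Sanity-check sketch: list the bundled repository-management task configs.
# The package path mirrors the _DATA_DIR used by the evaluation code in this commit.
from importlib import resources

data_dir = resources.files("mcpuniverse").joinpath(
    "benchmark/configs/test/repository_management"
)
print(sorted(p.name for p in data_dir.iterdir() if p.name.endswith(".json")))
```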
### Configuration

Environment variables are automatically loaded from `servers/github/.env`. Create this file with:

```bash
# servers/github/.env
GITHUB_PERSONAL_ACCESS_TOKEN=your_github_token_here
GITHUB_PERSONAL_ACCOUNT_NAME=your_github_username
OPENAI_API_KEY=your_openai_key_here
```

**IMPORTANT**: Use a dedicated test GitHub account. The AI agent will perform real operations on GitHub repositories.

Alternatively, you can manually export the environment variables:

```bash
export GITHUB_PERSONAL_ACCESS_TOKEN="your_github_token_here"
export GITHUB_PERSONAL_ACCOUNT_NAME="your_github_username"
export OPENAI_API_KEY="your_openai_key_here"
```
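
For reference, the automatic loading can be reproduced with `python-dotenv` (added to the `evals` extra in this commit). The benchmark's own fixtures handle this for you, so treat the path and call below as an illustrative sketch rather than the harness's exact code.

```python
# Illustrative sketch only: load servers/github/.env with python-dotenv.
# The benchmark fixtures do this automatically; the path is the one documented above.
from pathlib import Path

from dotenv import load_dotenv

load_dotenv(Path("servers/github/.env"))
```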

### Running Tests

Run all 28 repository management tasks:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
    --model gpt-4o-mini \
    --output-dir outputs/mcp_universe \
    -v
```

Run a single task:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py::test_mcp_universe[github_task_0001] \
    --model gpt-4o-mini \
    --output-dir outputs/mcp_universe \
    -v
```

Run with different models:

```bash
# Use GPT-4o
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
    --model gpt-4o \
    --output-dir outputs/mcp_universe

# Use Claude (requires ANTHROPIC_API_KEY)
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
    --model claude-3-5-sonnet-20241022 \
    --output-dir outputs/mcp_universe
```

### Validate Mode

If you have existing output files, you can validate them without re-running the agent:

```bash
uv run pytest tests/benchmarks/mcp_universe/test_mcp_universe.py \
    --validate-only \
    --log-dir outputs/mcp_universe/raw \
    --output-dir outputs/mcp_universe
```
Lines changed: 1 addition & 0 deletions (new file)

```python
"""MCP-Universe benchmark integration for wags."""
```
Lines changed: 63 additions & 0 deletions (new file)

```python
"""Evaluation logic for MCP-Universe test results."""

import json
from importlib import resources
from pathlib import Path
from typing import Any, cast

from mcpuniverse.common.context import Context
from mcpuniverse.evaluator.evaluator import EvaluationResult, Evaluator

_DATA_DIR = resources.files("mcpuniverse").joinpath("benchmark/configs/test/repository_management")


async def run_evaluation(
    task_id: str,
    context: Context,
) -> dict[str, Any]:
    """
    Run evaluation for a repository management task.

    Args:
        task_id: Test case identifier
        context: MCP-Universe context with environment variables

    Returns:
        Dictionary with evaluation results
    """
    # Load task data
    task_file = _DATA_DIR.joinpath(f"{task_id}.json")
    with task_file.open("r", encoding="utf-8") as f:
        task = cast(dict[str, Any], json.load(f))

    # Set MCP server config path for the evaluator
    # This tells MCPManager where to find the GitHub MCP server configuration
    mcp_config_path = Path(__file__).parent / "mcp_server_config.json"
    context.env["MCP_SERVER_CONFIG"] = str(mcp_config_path)

    # Run all evaluators
    evaluation_results: list[EvaluationResult] = []

    for evaluator_config in task.get("evaluators", []):
        evaluator = Evaluator(evaluator_config, context=context)
        result = await evaluator.evaluate({})
        evaluation_results.append(result)

    # Calculate overall pass/fail
    all_passed = all(result.passed for result in evaluation_results)

    return {
        "task_id": task_id,
        "passed": all_passed,
        "evaluation_results": [
            {
                "func": result.config.func,
                "op": result.config.op,
                "passed": result.passed,
                "reason": result.reason,
                "error": result.error,
            }
            for result in evaluation_results
        ],
        "task_data": task,
    }
```
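
A minimal driver sketch for the function above, assuming `Context()` can be constructed without arguments and exposes an `env` mapping (check `mcpuniverse.common.context` for the actual constructor); the import path `evaluate` is likewise an assumption for illustration.

```python
# Minimal driver sketch (assumptions: Context() takes no required arguments and
# exposes an `env` mapping; the import path `evaluate` is illustrative).
import asyncio
import os

from mcpuniverse.common.context import Context

from evaluate import run_evaluation


async def main() -> None:
    context = Context()
    # The GitHub evaluators need the same token the agent used.
    context.env["GITHUB_PERSONAL_ACCESS_TOKEN"] = os.environ["GITHUB_PERSONAL_ACCESS_TOKEN"]
    result = await run_evaluation("github_task_0001", context)
    print(result["passed"], [e["func"] for e in result["evaluation_results"]])


if __name__ == "__main__":
    asyncio.run(main())
```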
