Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/benchmarks/bfcl/data
Submodule data updated 43 files
+3 −0 .gitignore
+1 −1 berkeley-function-call-leaderboard/README.md
+9 −9 berkeley-function-call-leaderboard/SUPPORTED_MODELS.md
+16 −9 berkeley-function-call-leaderboard/bfcl_eval/_llm_response_generation.py
+3 −0 berkeley-function-call-leaderboard/bfcl_eval/constants/eval_config.py
+31 −31 berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py
+9 −9 berkeley-function-call-leaderboard/bfcl_eval/constants/supported_models.py
+1 −1 berkeley-function-call-leaderboard/bfcl_eval/data/BFCL_v4_irrelevance.json
+1 −1 berkeley-function-call-leaderboard/bfcl_eval/data/BFCL_v4_live_multiple.json
+2 −2 berkeley-function-call-leaderboard/bfcl_eval/data/BFCL_v4_live_simple.json
+120 −120 berkeley-function-call-leaderboard/bfcl_eval/data/BFCL_v4_multi_turn_base.json
+121 −121 berkeley-function-call-leaderboard/bfcl_eval/data/BFCL_v4_multi_turn_long_context.json
+122 −122 berkeley-function-call-leaderboard/bfcl_eval/data/BFCL_v4_multi_turn_miss_func.json
+122 −122 berkeley-function-call-leaderboard/bfcl_eval/data/BFCL_v4_multi_turn_miss_param.json
+2 −2 berkeley-function-call-leaderboard/bfcl_eval/data/BFCL_v4_parallel.json
+3 −3 berkeley-function-call-leaderboard/bfcl_eval/data/multi_turn_func_doc/gorilla_file_system.json
+1 −1 berkeley-function-call-leaderboard/bfcl_eval/data/multi_turn_func_doc/message_api.json
+2 −2 berkeley-function-call-leaderboard/bfcl_eval/data/multi_turn_func_doc/ticket_api.json
+3 −5 berkeley-function-call-leaderboard/bfcl_eval/data/multi_turn_func_doc/trading_bot.json
+3 −2 berkeley-function-call-leaderboard/bfcl_eval/data/multi_turn_func_doc/travel_booking.json
+1 −1 berkeley-function-call-leaderboard/bfcl_eval/data/multi_turn_func_doc/vehicle_control.json
+2 −2 berkeley-function-call-leaderboard/bfcl_eval/data/possible_answer/BFCL_v4_live_simple.json
+80 −80 berkeley-function-call-leaderboard/bfcl_eval/data/possible_answer/BFCL_v4_multi_turn_base.json
+75 −75 berkeley-function-call-leaderboard/bfcl_eval/data/possible_answer/BFCL_v4_multi_turn_long_context.json
+75 −75 berkeley-function-call-leaderboard/bfcl_eval/data/possible_answer/BFCL_v4_multi_turn_miss_func.json
+79 −79 berkeley-function-call-leaderboard/bfcl_eval/data/possible_answer/BFCL_v4_multi_turn_miss_param.json
+2 −2 berkeley-function-call-leaderboard/bfcl_eval/data/possible_answer/BFCL_v4_parallel.json
+2 −1 berkeley-function-call-leaderboard/bfcl_eval/eval_checker/eval_runner.py
+37 −17 ...ey-function-call-leaderboard/bfcl_eval/eval_checker/multi_turn_eval/func_source_code/gorilla_file_system.py
+4 −5 berkeley-function-call-leaderboard/bfcl_eval/eval_checker/multi_turn_eval/func_source_code/message_api.py
+3 −3 berkeley-function-call-leaderboard/bfcl_eval/eval_checker/multi_turn_eval/func_source_code/posting_api.py
+4 −4 berkeley-function-call-leaderboard/bfcl_eval/eval_checker/multi_turn_eval/func_source_code/ticket_api.py
+41 −92 berkeley-function-call-leaderboard/bfcl_eval/eval_checker/multi_turn_eval/func_source_code/trading_bot.py
+62 −25 berkeley-function-call-leaderboard/bfcl_eval/eval_checker/multi_turn_eval/func_source_code/travel_booking.py
+1 −1 berkeley-function-call-leaderboard/bfcl_eval/eval_checker/multi_turn_eval/func_source_code/vehicle_control.py
+2 −2 berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/claude.py
+8 −22 berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/deepseek.py
+1 −1 berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/grok.py
+2 −2 berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/kimi.py
+2 −8 berkeley-function-call-leaderboard/bfcl_eval/model_handler/utils.py
+15 −7 berkeley-function-call-leaderboard/bfcl_eval/scripts/visualize_multi_turn_ground_truth_conversation.py
+71 −63 berkeley-function-call-leaderboard/bfcl_eval/utils.py
+2 −1 berkeley-function-call-leaderboard/pyproject.toml
127 changes: 84 additions & 43 deletions tests/benchmarks/bfcl/mcp_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,78 +12,119 @@
import sys
from typing import Any

from bfcl_eval.constants.eval_config import MULTI_TURN_FUNC_DOC_PATH
from bfcl_eval.constants.executable_backend_config import (
CLASS_FILE_PATH_MAPPING,
MULTI_TURN_FUNC_DOC_FILE_MAPPING,
STATELESS_CLASSES,
)
from mcp.server.fastmcp import FastMCP


def load_api_class(target_class_name: str) -> Any:
"""Load the specified API class dynamically and return instance."""
if target_class_name not in CLASS_FILE_PATH_MAPPING:
raise ValueError(f"Unknown class: {target_class_name}")
def load_api_class(class_name: str) -> Any:
    """Import the module registered for ``class_name`` and return a new instance.

    The module path is looked up in ``CLASS_FILE_PATH_MAPPING``, the class is
    fetched from that module by name, and it is instantiated with no arguments.
    An unregistered name surfaces as the ``KeyError`` raised by the mapping
    lookup.
    """
    module_path = CLASS_FILE_PATH_MAPPING[class_name]
    api_cls = getattr(importlib.import_module(module_path), class_name)
    return api_cls()

# Load the class
module = importlib.import_module(CLASS_FILE_PATH_MAPPING[target_class_name])
instance = getattr(module, target_class_name)()
return instance

def load_func_docs(class_name: str) -> dict[str, dict[str, Any]]:
"""Load BFCL's function documentation for a class.

def load_scenario_from_test(test_file: str, test_id: str, target_class_name: str) -> dict[str, Any]:
"""Load scenario configuration from test file."""
scenario = {}
if test_file and test_id:
try:
with open(test_file) as f:
for line in f:
if line.strip():
entry = json.loads(line)
if entry.get("id") == test_id:
if "initial_config" in entry and target_class_name in entry["initial_config"]:
scenario = entry["initial_config"][target_class_name]
break
except Exception as e:
print(f"Warning: Could not load scenario: {e}", file=sys.stderr)
return scenario
Returns a dict mapping function names to their full documentation,
including rich descriptions and parameter schemas.
"""
if class_name not in MULTI_TURN_FUNC_DOC_FILE_MAPPING:
return {}

doc_path = MULTI_TURN_FUNC_DOC_PATH / MULTI_TURN_FUNC_DOC_FILE_MAPPING[class_name]
if not doc_path.exists():
return {}

docs = {}
with open(doc_path) as f:
for line in f:
if line.strip():
doc = json.loads(line)
docs[doc["name"]] = doc
return docs


def load_scenario_from_test(test_file: str, test_id: str, class_name: str) -> dict[str, Any]:
    """Load the initial scenario configuration for one class from a test file.

    The test file is read as JSON lines (one JSON object per non-blank line).
    The first entry whose ``"id"`` equals ``test_id`` is selected, and the
    value stored under ``entry["initial_config"][class_name]`` is returned.

    Args:
        test_file: Path to the JSON-lines test file. A falsy value (empty
            string or ``None``) means "no scenario" and yields ``{}``.
        test_id: Identifier of the test entry to look up. A falsy value
            yields ``{}``.
        class_name: Key to read inside the entry's ``"initial_config"``.

    Returns:
        The configuration for ``class_name``, or ``{}`` when the entry is not
        found, has no ``"initial_config"``, or has no config for the class.

    Raises:
        OSError: If ``test_file`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not test_file or not test_id:
        return {}

    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default locale encoding, which can mis-decode non-ASCII test data.
    with open(test_file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank separator lines
            entry = json.loads(line)
            if entry.get("id") == test_id:
                config: dict[str, Any] = entry.get("initial_config", {}).get(class_name, {})
                return config

    return {}


def patch_tool_with_func_doc(server: FastMCP, func_docs: dict[str, dict[str, Any]]) -> None:
    """Overlay BFCL's function documentation onto the server's registered tools.

    Per the original rationale, FastMCP's introspection does not pull
    parameter descriptions out of docstrings, while BFCL ships pre-compiled
    JSON docs that carry them. For every registered tool that has an entry in
    ``func_docs``, the tool description and any matching parameter
    descriptions are replaced in place. Tools without a doc entry are left
    untouched.

    Args:
        server: FastMCP server whose tools have already been registered.
        func_docs: Mapping of function name to its BFCL documentation dict.
    """
    # NOTE: this reaches into FastMCP's private tool registry.
    registry = server._tool_manager._tools

    for name, registered_tool in registry.items():
        if name in func_docs:
            spec = func_docs[name]

            # Tool-level description; keep the existing one if the doc has none.
            registered_tool.description = spec.get("description", registered_tool.description)

            # Parameter-level descriptions, only for parameters the tool's
            # own schema already declares.
            documented = spec.get("parameters", {}).get("properties", {})
            schema_props = registered_tool.parameters.get("properties", {})

            for param, param_spec in documented.items():
                if param in schema_props and "description" in param_spec:
                    schema_props[param]["description"] = param_spec["description"]


async def main() -> None:
parser = argparse.ArgumentParser(description="MCP Server for BFCL API classes")
parser.add_argument("class_name", help="API class name to load")
parser.add_argument("test_file", nargs="?", help="Test file path (optional)")
parser.add_argument("test_id", nargs="?", help="Test ID (optional)")

args = parser.parse_args()

target_class_name = args.class_name
class_name = args.class_name

if target_class_name not in CLASS_FILE_PATH_MAPPING:
print("Usage: python api_server.py <ClassName> [test_file.json test_id]", file=sys.stderr)
if class_name not in CLASS_FILE_PATH_MAPPING:
print("Usage: python mcp_server.py <ClassName> [test_file.json test_id]", file=sys.stderr)
print(f"Available classes: {', '.join(CLASS_FILE_PATH_MAPPING.keys())}", file=sys.stderr)
sys.exit(1)

try:
api_instance = load_api_class(target_class_name)
print(f"Successfully loaded {target_class_name}", file=sys.stderr)
# Load the API class
api = load_api_class(class_name)
print(f"Loaded {class_name}", file=sys.stderr)

# Load scenario if needed
if hasattr(api_instance, "_load_scenario") and target_class_name not in STATELESS_CLASSES:
scenario = load_scenario_from_test(args.test_file, args.test_id, target_class_name)
api_instance._load_scenario(scenario)
except Exception as e:
print(f"Error loading {target_class_name}: {e}", file=sys.stderr)
sys.exit(1)
# Initialize scenario state if needed
if hasattr(api, "_load_scenario") and class_name not in STATELESS_CLASSES:
scenario = load_scenario_from_test(args.test_file, args.test_id, class_name)
api._load_scenario(scenario)

# Create FastMCP server
server = FastMCP(f"{target_class_name.lower()}-api")
# Load BFCL's function documentation
func_docs = load_func_docs(class_name)

# Register all API methods as tools
for method_name, method in inspect.getmembers(api_instance, predicate=inspect.ismethod):
# Create server and register tools
server = FastMCP(f"{class_name.lower()}-api")

for method_name, method in inspect.getmembers(api, predicate=inspect.ismethod):
if not method_name.startswith("_"):
server.add_tool(method, name=method_name)

# Run the server
# Patch tools with BFCL's richer descriptions
patch_tool_with_func_doc(server, func_docs)

await server.run_stdio_async()


Expand Down