diff --git a/README.md b/README.md
index 7ab04aec..157ce394 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# ToolUniverse: Democratizing AI scientists
+# ToolUniverse: Democratizing AI scientists
[arXiv:2509.23426](https://arxiv.org/abs/2509.23426)
[PyPI](https://badge.fury.io/py/tooluniverse)
@@ -232,6 +232,26 @@ Our comprehensive documentation covers everything from quick start to advanced w
- **[Adding Tools Tutorial](https://zitniklab.hms.harvard.edu/ToolUniverse/tutorials/addtools/Adding_Tools_Tutorial.html)**: Step-by-step tool addition guide
- **[MCP Tool Registration](https://zitniklab.hms.harvard.edu/ToolUniverse/tutorials/addtools/mcp_tool_registration_en.html)**: Register tools via MCP
+### MedTok + MedLog Integrations
+
+ToolUniverse now ships with first-class support for the MedTok tokenizer service and the MedLog reference collector/FHIR bridge.
+
+- **MedTok REST tools** (`tool_type=["medtok"]`) expose `/tokenize`, `/embed`, `/nearest_neighbors`, `/map_text_to_code`, `/search_text`, and `/codes/{system}/{code}`. Point them at a running service by setting `MEDTOK_BASE_URL` (defaults to `http://localhost:8000`).
+- **MedTok MCP auto-loader** (`tool_type=["medtok_mcp_auto_loader"]`) can register tools from the FastMCP wrapper. Set `MEDTOK_MCP_SERVER_HOST` to the host running the `medtok_tool.py` MCP server.
+- **MedLog collector + FHIR tools** (`tool_type=["medlog"]`) wrap the reference implementation's REST APIs. Configure the service endpoints with `MEDLOG_COLLECTOR_BASE_URL` (default `http://localhost:7001`) and `MEDLOG_FHIR_BASE_URL` (default `http://localhost:7003`).
+
+See `tests/integration/test_medtok_medlog_tools.py` for end-to-end examples that start the services, invoke the tools, and validate responses.
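+
+Once both services are running, these tools can be called like any other ToolUniverse tool. A minimal sketch that mirrors the calling pattern in `scripts/run_full_demo.py` (the example codes are illustrative):
+
+```python
+from tooluniverse.execute_function import ToolUniverse
+
+tu = ToolUniverse(hooks_enabled=False)
+tu.load_tools(tool_type=["medtok", "medlog"])
+
+# Tokenize two ICD-10 codes via the locally running MedTok REST service.
+result = tu.tools.MedTok_tokenize(codes=["A00", "E11"], system="ICD-10")
+print(result)
+```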
+
+### End-to-End Demo Script
+
+To launch the reference services and exercise the toolchain automatically, run:
+
+```bash
+python scripts/run_full_demo.py  # pass -h to see all options
+```
+
+The script starts MedTok + MedLog locally, runs representative tool calls (including optional external APIs like InterPro, KEGG, IUCN, JASPAR, MarineSpecies, cBioPortal, and Phenome Jax), and prints a success/failure summary.
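+
+To run fully offline, skip the external API calls:
+
+```bash
+python scripts/run_full_demo.py --skip-network-tools
+```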
+
### 📚 API Reference
- **[API Directory](https://zitniklab.hms.harvard.edu/ToolUniverse/api/modules.html)**: Complete module listing
- **[Core Modules](https://zitniklab.hms.harvard.edu/ToolUniverse/api/tooluniverse.html)**: Main ToolUniverse class and utilities
diff --git a/scripts/medlog_stub_server.py b/scripts/medlog_stub_server.py
new file mode 100644
index 00000000..700245fa
--- /dev/null
+++ b/scripts/medlog_stub_server.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+"""
+Lightweight MedLog stub servers for local demos.
+
+Run the collector:
+ python scripts/medlog_stub_server.py --mode collector --host 127.0.0.1 --port 8911
+
+Run the FHIR bridge:
+ python scripts/medlog_stub_server.py --mode fhir --host 127.0.0.1 --port 8912
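+
+Example request against the running collector (payload is illustrative):
+
+    curl -X POST http://127.0.0.1:8911/medlog/events/init \
+         -H "Content-Type: application/json" \
+         -d '{"header": {"event_id": "evt-1"}, "model_instance": {}, "user_identity": {}}'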
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import threading
+from typing import Dict
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+
+
+STORE: Dict[str, Dict] = {}
+STORE_LOCK = threading.Lock()
+
+
+def build_collector_app() -> FastAPI:
+ app = FastAPI(title="MedLog Collector (Stub)", version="0.1.0")
+
+ @app.post("/medlog/events/init")
+ def init(payload: dict):
+ header = payload.get("header") or {}
+ event_id = header.get("event_id")
+ if not event_id:
+ raise HTTPException(400, "event_id required")
+ record = {
+ "header": header,
+ "model_instance": payload.get("model_instance", {}),
+ "user_identity": payload.get("user_identity", {}),
+ "target_identity": payload.get("target_identity"),
+ "inputs": payload.get("inputs"),
+ "retention_tier": payload.get("retention_tier", "steady"),
+ "fragments": [],
+ }
+ with STORE_LOCK:
+ STORE[event_id] = record
+ return {"status": "ok", "event_id": event_id}
+
+ @app.post("/medlog/events/{event_id}/append")
+ def append(event_id: str, fragment: dict):
+ with STORE_LOCK:
+ record = STORE.get(event_id)
+ if record is None:
+ raise HTTPException(404, "event not found")
+ record["fragments"].append(fragment)
+ return {"status": "ok", "event_id": event_id}
+
+ @app.get("/medlog/events/{event_id}/prov")
+ def prov(event_id: str):
+ with STORE_LOCK:
+ record = STORE.get(event_id)
+ if record is None:
+ raise HTTPException(404, "event not found")
+ header = record["header"]
+ return {"event_id": event_id, "provenance": {"header": header}}
+
+ @app.post("/query")
+ def query(body: dict):
+ run_id = body.get("run_id")
+ event_id = body.get("event_id")
+ limit = body.get("limit", 50)
+ results = []
+ with STORE_LOCK:
+ for eid, record in STORE.items():
+ header = record["header"]
+ if event_id and event_id != eid:
+ continue
+ if run_id and header.get("run_id") != run_id:
+ continue
+ results.append({"event_id": eid, "header": header})
+ if len(results) >= limit:
+ break
+ return {"count": len(results), "results": results}
+
+ @app.post("/export/parquet")
+ def export():
+ return {"status": "ok", "outdir": "/tmp/parquet"}
+
+ return app
+
+
+def build_fhir_app() -> FastAPI:
+ app = FastAPI(title="MedLog FHIR Stub", version="0.1.0")
+
+ def bundle(records):
+ return {
+ "resourceType": "Bundle",
+ "type": "collection",
+ "entry": [
+ {
+ "resource": {
+ "resourceType": "Observation",
+ "id": record["header"]["event_id"],
+ "status": "final",
+ }
+ }
+ for record in records
+ ],
+ }
+
+ @app.get("/bundle/{event_id}")
+ def bundle_event(event_id: str):
+ with STORE_LOCK:
+ record = STORE.get(event_id)
+ if record is None:
+ raise HTTPException(404, "event not found")
+ return bundle([record])
+
+ @app.get("/bundle/run/{run_id}")
+ def bundle_run(run_id: str):
+ with STORE_LOCK:
+ records = [
+ record
+ for record in STORE.values()
+ if record["header"].get("run_id") == run_id
+ ]
+ if not records:
+ raise HTTPException(404, "run not found")
+ return bundle(records)
+
+ return app
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--mode", choices=["collector", "fhir"], required=True)
+ parser.add_argument("--host", default=os.getenv("MEDLOG_HOST", "127.0.0.1"))
+ parser.add_argument("--port", type=int, default=int(os.getenv("MEDLOG_PORT", 0)) or 0)
+ args = parser.parse_args()
+
+ if args.port == 0:
+ args.port = 8911 if args.mode == "collector" else 8912
+
+ app = build_collector_app() if args.mode == "collector" else build_fhir_app()
+ print(f"Starting MedLog {args.mode} stub on {args.host}:{args.port}")
+ uvicorn.run(app, host=args.host, port=args.port, log_level="info")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/run_full_demo.py b/scripts/run_full_demo.py
new file mode 100644
index 00000000..b06cc6c5
--- /dev/null
+++ b/scripts/run_full_demo.py
@@ -0,0 +1,704 @@
+#!/usr/bin/env python
+"""
+End-to-end ToolUniverse demo runner.
+
+This script bootstraps the MedTok and MedLog reference services locally, points
+ToolUniverse at them, and exercises a curated set of tools (MedTok, MedLog, and
+several public data tools such as InterPro, KEGG, IUCN, JASPAR, MarineSpecies,
+cBioPortal, Phenome Jax). It prints friendly status updates and reports any
+failures at the end.
+
+Usage:
+ python scripts/run_full_demo.py
+
+Optional flags:
+    --skip-network-tools  Skip external API tools (InterPro, KEGG, etc.).
+    --skip-vsd            Skip the VSD harvest/register/run walkthrough.
+ --medtok-host HOST Override MedTok host (default 127.0.0.1).
+ --medtok-port PORT Override MedTok port (default 8910).
+ --medlog-host HOST Override MedLog host (default 127.0.0.1).
+ --collector-port PORT Override MedLog collector port (default 8911).
+ --fhir-port PORT Override MedLog FHIR port (default 8912).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+import tempfile
+import threading
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
+
+import requests
+import uvicorn
+from fastapi import FastAPI, HTTPException
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+SRC_PATH = REPO_ROOT / "src"
+if str(SRC_PATH) not in sys.path:
+ sys.path.insert(0, str(SRC_PATH))
+
+from tooluniverse.execute_function import ToolUniverse
+
+MEDTOK_ROOT = REPO_ROOT.parent / "MedTok-FHIR-Starter"
+MEDLOG_ROOT = REPO_ROOT.parent / "medlog-reference"
+
+
+class ServerHandle:
+ """Run a FastAPI app in a background thread via uvicorn."""
+
+ def __init__(self, app: FastAPI, host: str, port: int):
+ config = uvicorn.Config(app, host=host, port=port, log_level="error", lifespan="off")
+ self.server = uvicorn.Server(config)
+ self.thread = threading.Thread(target=self.server.run, daemon=True)
+
+ def start(self) -> None:
+ self.thread.start()
+ while not self.server.started:
+ time.sleep(0.05)
+
+ def stop(self) -> None:
+ self.server.should_exit = True
+ self.thread.join(timeout=5)
+
+
+def _import_module_typed(module_path: Path):
+ import importlib.util
+
+    spec = importlib.util.spec_from_file_location(module_path.stem, module_path)
+    assert spec and spec.loader  # fail fast if the module cannot be located
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+ return module
+
+
+def _service_is_up(base_url: str, path: str, ok_statuses: Optional[List[int]] = None) -> bool:
+ try:
+ resp = requests.get(f"{base_url}{path}", timeout=2)
+ if ok_statuses is None:
+ return resp.status_code < 500
+ return resp.status_code in ok_statuses
+ except requests.RequestException:
+ return False
+
+
+def start_medtok(host: str, port: int):
+ """Start MedTok FastAPI service and return context info."""
+ service_path = MEDTOK_ROOT / "services" / "medtok_service"
+ if str(service_path) not in sys.path:
+ sys.path.insert(0, str(service_path))
+
+ base_url = os.environ.get("MEDTOK_BASE_URL") or f"http://{host}:{port}"
+ if _service_is_up(base_url, "/health", ok_statuses=[200]):
+ os.environ["MEDTOK_BASE_URL"] = base_url
+ print(f"MedTok already running at {base_url}, reusing existing instance.")
+ return {"server": None, "temp_config": None, "sys_path": str(service_path), "started": False}
+
+ config_path = MEDTOK_ROOT / "config" / "medtok_config.json"
+ config_data = json.loads(config_path.read_text(encoding="utf-8"))
+ config_data["code_metadata_path"] = str(MEDTOK_ROOT / "samples" / "code_metadata.csv")
+ config_data["graph_edges_path"] = str(MEDTOK_ROOT / "samples" / "code_graph_edges.csv")
+
+ tmp_config = tempfile.NamedTemporaryFile("w", suffix="_medtok_config.json", delete=False)
+ json.dump(config_data, tmp_config)
+ tmp_config.flush()
+ tmp_config.close()
+ os.environ["MEDTOK_CONFIG"] = tmp_config.name
+
+ module = _import_module_typed(service_path / "app.py")
+ module.MAPPING_CSV = str(MEDTOK_ROOT / "samples" / "code_mapping.csv")
+ app = module.app
+
+ server = ServerHandle(app, host, port)
+ server.start()
+ os.environ["MEDTOK_BASE_URL"] = f"http://{host}:{port}"
+
+ return {
+ "server": server,
+ "temp_config": tmp_config.name,
+ "sys_path": str(service_path),
+ "started": True,
+ }
+
+
+def _build_medlog_collector(store: Dict[str, Dict]):
+ app = FastAPI()
+
+ @app.post("/medlog/events/init")
+ def init(payload: dict):
+ header = payload.get("header") or {}
+ event_id = header.get("event_id")
+ if not event_id:
+ raise HTTPException(400, "event_id required")
+ record = {
+ "header": header,
+ "model_instance": payload.get("model_instance", {}),
+ "user_identity": payload.get("user_identity", {}),
+ "target_identity": payload.get("target_identity"),
+ "inputs": payload.get("inputs"),
+ "retention_tier": payload.get("retention_tier", "steady"),
+ "fragments": [],
+ }
+ store[event_id] = record
+ return {"status": "ok", "event_id": event_id}
+
+ @app.post("/medlog/events/{event_id}/append")
+ def append(event_id: str, fragment: dict):
+ record = store.get(event_id)
+ if record is None:
+ raise HTTPException(404, "event not found")
+ record["fragments"].append(fragment)
+ return {"status": "ok", "event_id": event_id}
+
+ @app.get("/medlog/events/{event_id}/prov")
+ def prov(event_id: str):
+ record = store.get(event_id)
+ if record is None:
+ raise HTTPException(404, "event not found")
+ return {"event_id": event_id, "provenance": {"header": record["header"]}}
+
+ @app.post("/query")
+ def query(body: dict):
+ run_id = body.get("run_id")
+ event_id = body.get("event_id")
+ limit = body.get("limit", 50)
+ matches = []
+ for eid, record in store.items():
+ header = record["header"]
+ if event_id and event_id != eid:
+ continue
+ if run_id and header.get("run_id") != run_id:
+ continue
+ matches.append({"event_id": eid, "header": header})
+ if len(matches) >= limit:
+ break
+ return {"count": len(matches), "results": matches}
+
+ @app.post("/export/parquet")
+ def export():
+ return {"status": "ok", "outdir": "/tmp/parquet"}
+
+ return app
+
+
+def _build_medlog_fhir(store: Dict[str, Dict]):
+ app = FastAPI()
+
+ def _bundle(records):
+ return {
+ "resourceType": "Bundle",
+ "type": "collection",
+ "entry": [
+ {
+ "resource": {
+ "resourceType": "Observation",
+ "id": record["header"]["event_id"],
+ "status": "final",
+ }
+ }
+ for record in records
+ ],
+ }
+
+ @app.get("/bundle/{event_id}")
+ def bundle_event(event_id: str):
+ record = store.get(event_id)
+ if record is None:
+ raise HTTPException(404, "event not found")
+ return _bundle([record])
+
+ @app.get("/bundle/run/{run_id}")
+ def bundle_run(run_id: str):
+ records = [
+ record
+ for record in store.values()
+ if record["header"].get("run_id") == run_id
+ ]
+ if not records:
+ raise HTTPException(404, "run not found")
+ return _bundle(records)
+
+ return app
+
+
+def start_medlog(host: str, collector_port: int, fhir_port: int):
+ store: Dict[str, Dict] = {}
+ collector_app = _build_medlog_collector(store)
+ fhir_app = _build_medlog_fhir(store)
+
+ collector_url = os.environ.get("MEDLOG_COLLECTOR_BASE_URL") or f"http://{host}:{collector_port}"
+ fhir_url = os.environ.get("MEDLOG_FHIR_BASE_URL") or f"http://{host}:{fhir_port}"
+
+ collector_server = None
+ fhir_server = None
+
+ if _service_is_up(collector_url, "/"):
+ print(f"MedLog collector already running at {collector_url}, reusing.")
+ else:
+ collector_server = ServerHandle(collector_app, host, collector_port)
+ collector_server.start()
+
+ if _service_is_up(fhir_url, "/bundle/test"):
+ print(f"MedLog FHIR service already running at {fhir_url}, reusing.")
+ else:
+ fhir_server = ServerHandle(fhir_app, host, fhir_port)
+ fhir_server.start()
+
+ os.environ["MEDLOG_COLLECTOR_BASE_URL"] = f"http://{host}:{collector_port}"
+ os.environ["MEDLOG_FHIR_BASE_URL"] = f"http://{host}:{fhir_port}"
+
+ return {"collector": collector_server, "fhir": fhir_server, "started": bool(collector_server or fhir_server)}
+
+
+def stop_medtok(ctx: Dict[str, Any]):
+ if ctx.get("server"):
+ ctx["server"].stop()
+ if ctx.get("started"):
+ os.environ.pop("MEDTOK_BASE_URL", None)
+ os.environ.pop("MEDTOK_CONFIG", None)
+ temp_config = ctx.get("temp_config")
+ if temp_config:
+ try:
+ os.remove(temp_config)
+ except OSError:
+ pass
+ sys_path = ctx.get("sys_path")
+ if sys_path:
+ try:
+ sys.path.remove(sys_path)
+ except ValueError:
+ pass
+
+
+def stop_medlog(ctx: Dict[str, Any]):
+ if ctx.get("collector"):
+ ctx["collector"].stop()
+ if ctx.get("fhir"):
+ ctx["fhir"].stop()
+ if ctx.get("started"):
+ os.environ.pop("MEDLOG_COLLECTOR_BASE_URL", None)
+ os.environ.pop("MEDLOG_FHIR_BASE_URL", None)
+
+
+def preview_json(payload: Any, limit: int = 240) -> str:
+ """Return a compact preview of a payload for console logging."""
+ try:
+ text = json.dumps(payload, indent=2, ensure_ascii=False)
+ except TypeError:
+ text = str(payload)
+ text = text.strip()
+ if len(text) > limit:
+ return text[:limit].rstrip() + "..."
+ return text
+
+
+def call_tool(tu: ToolUniverse, name: str, **kwargs):
+ """Call a tool and handle ToolUniverse-specific errors."""
+ print(f"---> Calling {name} with {kwargs}")
+ try:
+ response = getattr(tu.tools, name)(**kwargs)
+ print(f"[OK] {name} succeeded")
+ return True, response
+ except Exception as exc: # pylint: disable=broad-except
+ print(f"[FAIL] {name} failed: {exc}")
+ return False, str(exc)
+
+
+def run_medlog_demo(tu: ToolUniverse) -> List[Dict[str, Any]]:
+ results = []
+ header = {
+ "event_id": "evt-demo-1",
+ "run_id": "run-demo-1",
+ "timestamp": "2025-01-01T00:00:00Z",
+ }
+ model_instance = {"model": "demo", "version": "1.0"}
+ user_identity = {"name": "Dr. Example"}
+ steps = [
+ (
+ "MedLog_init_event",
+ dict(header=header, model_instance=model_instance, user_identity=user_identity),
+ "Open an event with metadata (who, when, which model).",
+ ),
+ (
+ "MedLog_append_fragment",
+ dict(event_id="evt-demo-1", fragment={"outputs": {"summary": "Patient stable"}}),
+ "Attach a fragment that captures model outputs for the event.",
+ ),
+ ("MedLog_get_provenance", dict(event_id="evt-demo-1"), "Retrieve provenance header saved for the event."),
+ ("MedLog_query_events", dict(run_id="run-demo-1"), "Query the store by run identifier."),
+ ("MedLog_export_parquet", dict(), "Trigger sample export (stub returns static location)."),
+ ("MedLog_fhir_bundle", dict(event_id="evt-demo-1"), "View the event as a single FHIR Observation bundle."),
+ ("MedLog_fhir_run_bundle", dict(run_id="run-demo-1"), "Bundle all events in the run as FHIR Observations."),
+ ]
+
+ for name, kwargs, description in steps:
+ print(f" - {description}")
+ success, payload = call_tool(tu, name, **kwargs)
+ note = None
+ if success:
+ if name == "MedLog_init_event":
+ note = f"Created event {payload.get('event_id')}"
+ elif name == "MedLog_append_fragment":
+ note = "Attached fragment with outputs summary"
+ elif name == "MedLog_get_provenance":
+ prov = payload.get("provenance", {})
+ note = f"Provenance keys: {', '.join(prov.keys()) or 'none'}"
+ elif name == "MedLog_query_events":
+ note = f"Query returned {payload.get('count', 0)} rows"
+ elif name == "MedLog_fhir_bundle":
+ note = f"Bundle contains {len(payload.get('entry', []))} resources"
+ elif name == "MedLog_fhir_run_bundle":
+ note = f"Run bundle resources: {len(payload.get('entry', []))}"
+ if success and note:
+ print(f" Result: {note}")
+ results.append({"tool": name, "success": success, "response": payload, "note": note})
+ return results
+
+
+def run_medtok_demo(tu: ToolUniverse) -> List[Dict[str, Any]]:
+ tests = [
+ (
+ "MedTok_tokenize",
+ dict(codes=["A00", "E11"], system="ICD-10", include_metadata=True),
+ "Convert ICD-10 codes into internal token IDs plus metadata for downstream models.",
+ ),
+ ("MedTok_embed", dict(codes=["A00"], system="ICD-10"), "Generate vector embeddings for a medical code."),
+ ("MedTok_nearest_neighbors", dict(code="A00", k=3), "Find nearby codes in embedding space."),
+ ("MedTok_map_text_to_code", dict(text="type 2 diabetes", system="ICD-10"), "Map free text to the closest code."),
+ ("MedTok_search_text", dict(text="hypertension", k=4), "Search the terminology for matching codes by text."),
+ ("MedTok_code_info", dict(code="E11", system="ICD-10"), "Fetch descriptive details for a specific code."),
+ ]
+ results = []
+ for name, kwargs, description in tests:
+ print(f" - {description}")
+ success, payload = call_tool(tu, name, **kwargs)
+ note = None
+ if success:
+ if name == "MedTok_tokenize":
+ note = f"Received {len(payload.get('token_ids', []))} token IDs"
+ elif name == "MedTok_embed":
+ emb = payload.get("embeddings") or []
+ if emb:
+ note = f"Embedding dimension {payload.get('dim')}, first vector length {len(emb[0])}"
+ elif name == "MedTok_nearest_neighbors":
+ note = f"Returned {len(payload.get('neighbors', []))} neighbors"
+ elif name == "MedTok_map_text_to_code":
+ note = f"Mapped text to code {payload.get('code')}"
+ elif name == "MedTok_search_text":
+ note = f"Top match code {payload.get('matches', [{}])[0].get('code') if payload.get('matches') else 'N/A'}"
+ elif name == "MedTok_code_info":
+ note = f"Code info description: {payload.get('description', 'N/A')}"
+ if success and note:
+ print(f" Result: {note}")
+ results.append({"tool": name, "success": success, "response": payload, "note": note})
+ return results
+
+
+NETWORK_TOOLS = [
+ ("InterPro_search_entries", {"query": "BRCA1"}),
+ ("KEGG_find_entries", {"query": "ATP synthase", "database": "pathway"}),
+ ("IUCN_get_species_status", {"species": "Panthera leo"}),
+ ("JASPAR_search_motifs", {"query": "SOX2"}),
+ ("MarineSpecies_lookup", {"scientific_name": "Gadus morhua"}),
+ ("cBioPortal_search_studies", {"keyword": "breast cancer"}),
+ ("PhenomeJax_list_projects", {"keyword": "glucose"}),
+]
+
+
+def run_network_tools(tu: ToolUniverse) -> List[Dict[str, Any]]:
+ outcomes = []
+ for name, kwargs in NETWORK_TOOLS:
+ success, payload = call_tool(tu, name, **kwargs)
+ note_parts: List[str] = []
+ if success:
+ if name == "InterPro_search_entries":
+ data = payload if isinstance(payload, dict) else {}
+ note_parts.append(f"Entries returned: {len(data.get('results', []))}")
+ elif name == "KEGG_find_entries":
+ if isinstance(payload, dict):
+ note_parts.append(f"Matched {len(payload.get('results', []))} entries")
+ elif isinstance(payload, list):
+ note_parts.append(f"Matched {len(payload)} entries")
+ elif name == "IUCN_get_species_status":
+ result = payload.get("result") if isinstance(payload, dict) else {}
+ if isinstance(result, list) and result:
+ result = result[0]
+ elif result is None:
+ result = {}
+ species = result.get("scientific_name")
+ category = result.get("category")
+ note_parts.append(f"{species} status {category}")
+ elif name == "JASPAR_search_motifs":
+ data = payload if isinstance(payload, dict) else {}
+ note_parts.append(f"Found {len(data.get('results', []))} motifs")
+ elif name == "MarineSpecies_lookup":
+ data = payload if isinstance(payload, dict) else {}
+ note_parts.append(f"Matches: {len(data.get('results', []))}")
+ elif name == "cBioPortal_search_studies":
+ data = payload if isinstance(payload, dict) else {}
+ note_parts.append(f"Studies returned: {len(data.get('studies', []))}")
+ elif name == "PhenomeJax_list_projects":
+ data = payload if isinstance(payload, dict) else {}
+ note_parts.append(f"Projects listed: {len(data.get('projects', []))}")
+
+ preview = preview_json(payload)
+ print(f" {name} preview: {preview}")
+ note_parts.append(f"Preview: {preview}")
+ else:
+ print(f" {name} error payload: {preview_json(payload)}")
+ note = " | ".join(note_parts) if note_parts else None
+ outcomes.append({"tool": name, "success": success, "response": payload, "note": note})
+ return outcomes
+
+
+def _extract_host(candidate: Dict[str, Any]) -> str:
+ host = candidate.get("host")
+ if host:
+ return str(host)
+ for key in ("url", "endpoint", "base_url"):
+ maybe = candidate.get(key)
+ if not maybe:
+ continue
+ parsed = urlparse(str(maybe))
+ if parsed.netloc:
+ return parsed.netloc
+ return "candidate"
+
+
+def _slugify_host(value: str) -> str:
+ slug = "".join(ch if ch.isalnum() else "_" for ch in value.lower())
+ slug = slug.strip("_")
+ return slug or "candidate"
+
+
+def run_vsd_demo(tu: ToolUniverse) -> List[Dict[str, Any]]:
+ """
+ Demonstrate the Harvest -> Register -> Run workflow using Verified Source Directory helpers.
+ """
+ search_query = "ensembl rest api"
+ print(f"\nSearching harvest catalog for '{search_query}' candidates...")
+ results: List[Dict[str, Any]] = []
+
+ success_search, harvest_resp = call_tool(
+ tu,
+ "GenericHarvestTool",
+ query=search_query,
+ limit=5,
+ )
+ selected_candidate: Optional[Dict[str, Any]] = None
+ note_search: Optional[str] = None
+ if success_search:
+ candidates = (harvest_resp or {}).get("candidates") or []
+ note_search = f"Candidates returned: {len(candidates)}"
+ if candidates:
+ preferred_hosts = {"rest.ensembl.org", "api.open-meteo.com"}
+ for candidate_option in candidates:
+ host = _extract_host(candidate_option).lower()
+ if host in preferred_hosts:
+ selected_candidate = candidate_option
+ break
+ if not selected_candidate:
+ selected_candidate = candidates[0]
+ host = _extract_host(selected_candidate)
+ print(f" - Selected candidate: {selected_candidate.get('name')} ({selected_candidate.get('url')}) [host: {host}]")
+ print(f" Candidate preview: {preview_json(selected_candidate)}")
+ else:
+ print(" - Harvest returned no candidates.")
+ else:
+ print(f" - Harvest search failed payload: {preview_json(harvest_resp)}")
+ note_search = "Harvest search failed"
+ results.append({"tool": "GenericHarvestTool", "success": success_search, "response": harvest_resp, "note": note_search})
+
+ if not (success_search and selected_candidate):
+ results.append(
+ {
+ "tool": "HarvestCandidateTesterTool",
+ "success": False,
+ "response": {"error": "No harvest candidate available"},
+ "note": "Skipped testing",
+ }
+ )
+ return results
+
+ candidate = selected_candidate
+ print("\nTesting harvest candidate via HarvestCandidateTesterTool...")
+ success_probe, probe_resp = call_tool(
+ tu,
+ "HarvestCandidateTesterTool",
+ candidate=candidate,
+ )
+ probe_note = None
+ if success_probe:
+ status = (probe_resp.get("test") or {}).get("status")
+ probe_note = f"Probe status {status}"
+ print(f" - Probe preview: {preview_json(probe_resp)}")
+ else:
+ print(f" - Probe failure payload: {preview_json(probe_resp)}")
+ results.append({"tool": "HarvestCandidateTesterTool", "success": success_probe, "response": probe_resp, "note": probe_note})
+
+ if not (success_probe and probe_resp.get("ok")):
+ print("Skipping registration because candidate probe failed.")
+ results.append(
+ {
+ "tool": "VerifiedSourceRegisterTool",
+ "success": False,
+ "response": {"error": "Probe failed"},
+ "note": None,
+ }
+ )
+ return results
+
+ host_slug = _slugify_host(_extract_host(candidate))
+ tool_name = f"HarvestDemo_{host_slug[:40]}"
+
+ print("\nRegistering candidate with VerifiedSourceRegisterTool...")
+ success_reg, register_resp = call_tool(
+ tu,
+ "VerifiedSourceRegisterTool",
+ tool_name=tool_name,
+ candidate=candidate,
+ )
+ note_reg = None
+ if success_reg:
+ config = (register_resp or {}).get("config") or {}
+ base_url = (config.get("fields") or {}).get("base_url") or config.get("endpoint")
+ note_reg = f"Registered tool pointing to {base_url}"
+ print(f" - Registered config preview: {preview_json(config)}")
+ else:
+ print(f" - Registration failure payload: {preview_json(register_resp)}")
+ results.append(
+ {
+ "tool": "VerifiedSourceRegisterTool",
+ "success": success_reg,
+ "response": register_resp,
+ "note": note_reg,
+ }
+ )
+
+ if not success_reg:
+ return results
+
+ print("\nCalling newly registered tool...")
+ tu.load_tools(include_tools=[tool_name])
+ success_run, run_resp = call_tool(tu, tool_name)
+ note_run = None
+ if success_run:
+ preview = preview_json(run_resp)
+ note_run = f"Preview: {preview}"
+ print(f" - Run result preview: {preview}")
+ else:
+ print(f" - Run failure payload: {preview_json(run_resp)}")
+ results.append({"tool": tool_name, "success": success_run, "response": run_resp, "note": note_run})
+
+ print("\nCleaning up registered tool...")
+ success_rm, rm_resp = call_tool(
+ tu,
+ "VerifiedSourceRemoveTool",
+ tool_name=tool_name,
+ )
+ note_rm = "Removed from catalog" if success_rm else None
+ if success_rm:
+ print(f" - Removal confirmation: {preview_json(rm_resp)}")
+ else:
+ print(f" - Removal failure payload: {preview_json(rm_resp)}")
+ results.append({"tool": "VerifiedSourceRemoveTool", "success": success_rm, "response": rm_resp, "note": note_rm})
+
+ return results
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Run ToolUniverse end-to-end demo.")
+ parser.add_argument("--skip-network-tools", action="store_true", help="Skip tools that require external HTTP APIs.")
+ parser.add_argument("--skip-vsd", action="store_true", help="Skip harvest/register/run VSD demonstration.")
+ parser.add_argument("--medtok-host", default="127.0.0.1")
+ parser.add_argument("--medtok-port", type=int, default=8910)
+ parser.add_argument("--medlog-host", default="127.0.0.1")
+ parser.add_argument("--collector-port", type=int, default=8911)
+ parser.add_argument("--fhir-port", type=int, default=8912)
+ args = parser.parse_args()
+
+ medtok_ctx = None
+ medlog_ctx = None
+    all_results: List[Dict[str, Any]] = []
+
+ try:
+ print("Starting MedTok service...")
+ medtok_ctx = start_medtok(args.medtok_host, args.medtok_port)
+ print(f"MedTok running at {os.environ['MEDTOK_BASE_URL']}")
+
+ print("Starting MedLog services...")
+ medlog_ctx = start_medlog(args.medlog_host, args.collector_port, args.fhir_port)
+ print(
+ f"MedLog collector at {os.environ['MEDLOG_COLLECTOR_BASE_URL']}, "
+ f"FHIR bridge at {os.environ['MEDLOG_FHIR_BASE_URL']}"
+ )
+
+ tu = ToolUniverse(hooks_enabled=False)
+ tu.load_tools(tool_type=["medtok", "medlog"])
+
+ print("\nRunning MedTok demo calls...")
+ all_results.extend(run_medtok_demo(tu))
+
+ print("\nRunning MedLog demo calls...")
+ all_results.extend(run_medlog_demo(tu))
+
+ if not args.skip_network_tools:
+ print("\nLoading network-enabled tools (InterPro, KEGG, IUCN, etc.)...")
+ categories = [
+ "interpro",
+ "kegg",
+ "iucn_red_list",
+ "jaspar",
+ "marine_species",
+ "cbioportal",
+ "phenome_jax",
+ ]
+ try:
+ tu.load_tools(tool_type=categories)
+ except Exception as exc: # pylint: disable=broad-except
+ print(f"[WARN] Failed to load network tool categories: {exc}")
+ else:
+ print("Running network tool calls...")
+ all_results.extend(run_network_tools(tu))
+ else:
+ print("\nSkipping external network tools.")
+
+ if not args.skip_vsd:
+ print("\nHarvest -> Register -> Run walkthrough...")
+ vsd_results = run_vsd_demo(tu)
+ all_results.extend(vsd_results)
+ else:
+ print("\nSkipping VSD harvest/register/run demo.")
+
+ finally:
+ if medtok_ctx:
+ print("\nStopping MedTok service...")
+ stop_medtok(medtok_ctx)
+ if medlog_ctx:
+ print("Stopping MedLog services...")
+ stop_medlog(medlog_ctx)
+
+ print("\n================ Demo Summary ================")
+ failures = [r for r in all_results if not r["success"]]
+ for result in all_results:
+ status = "PASS" if result["success"] else "FAIL"
+ print(f"{status:4} | {result['tool']}")
+ note = result.get("note")
+ if note:
+ print(f" {note}")
+ if not result["success"]:
+ print(f" -> {result['response']}")
+ print("=============================================")
+
+ if failures:
+ print(f"{len(failures)} tool calls failed.")
+ sys.exit(1)
+ print("All tool calls succeeded.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/run_new_tools.py b/scripts/run_new_tools.py
new file mode 100644
index 00000000..7ec93959
--- /dev/null
+++ b/scripts/run_new_tools.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""
+Smoke-test the newly added bioscience tools against their live APIs.
+
+The script imports the tool classes directly (without loading the full
+ToolUniverse package) and prints representative responses. It respects
+the following environment variables when present:
+
+ IUCN_RED_LIST_TOKEN
+ CBIOPORTAL_API_TOKEN
+"""
+
+from __future__ import annotations
+
+import os
+import pprint
+import sys
+from pathlib import Path
+
+
+def _bootstrap_path() -> None:
+ """Ensure the local src/ directory is importable."""
+ repo_root = Path(__file__).resolve().parent.parent
+ src_path = repo_root / "src"
+ if str(src_path) not in sys.path:
+ sys.path.insert(0, str(src_path))
+ os.environ.setdefault("TOOLUNIVERSE_LIGHT_IMPORT", "true")
+
+
+def main() -> None:
+ _bootstrap_path()
+
+ from tooluniverse.interpro_tool import InterProTool
+ from tooluniverse.kegg_tool import KEGGTool
+ from tooluniverse.iucn_tool import IUCNRedListTool
+ from tooluniverse.jaspar_tool import JASPARRestTool
+ from tooluniverse.marine_species_tool import MarineSpeciesTool
+ from tooluniverse.cbioportal_tool import CBioPortalTool
+ from tooluniverse.phenome_jax_tool import PhenomeJaxTool
+
+ results = {
+ "interpro": InterProTool({"name": "InterPro_search_entries"}).run(
+ {"query": "kinase", "page_size": 2}
+ ),
+ "kegg": KEGGTool({"name": "KEGG_find_entries"}).run(
+ {"query": "glucose", "database": "pathway", "max_results": 2}
+ ),
+ "jaspar": JASPARRestTool({"name": "JASPAR_search_motifs"}).run(
+ {"query": "Arnt", "page_size": 2}
+ ),
+ "marine_species": MarineSpeciesTool({"name": "MarineSpecies_lookup"}).run(
+ {"scientific_name": "Delphinus delphis", "like": True}
+ ),
+ "cbioportal": CBioPortalTool({"name": "cBioPortal_search_studies"}).run(
+ {"keyword": "breast", "page_size": 2}
+ ),
+ "phenome_jax": PhenomeJaxTool({"name": "PhenomeJax_list_projects"}).run(
+ {"keyword": "glucose", "limit": 2}
+ ),
+ }
+
+ try:
+ results["iucn"] = IUCNRedListTool({"name": "IUCN_get_species_status"}).run(
+ {"species": "Panthera leo"}
+ )
+ except Exception as exc: # pragma: no cover - best-effort reporting
+ results["iucn"] = {"error": str(exc)}
+
+ for key, value in results.items():
+ print(f"=== {key.upper()} ===")
+ pprint.pprint(value)
+ print()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/tooluniverse/__init__.py b/src/tooluniverse/__init__.py
index bed8e3f3..658b7cdd 100644
--- a/src/tooluniverse/__init__.py
+++ b/src/tooluniverse/__init__.py
@@ -278,6 +278,18 @@ def __getattr__(self, name):
from .core_tool import CoreTool
from .pmc_tool import PMCTool
from .zenodo_tool import ZenodoTool
+ from . import vsd_tool # registers VerifiedSourceDiscoveryTool + VerifiedSourceRegisterTool
+ from . import vsd_api_tool # registers GenericRESTTool + GenericGraphQLTool
+ from . import context_keeper_tool # registers ContextKeeperTool
+ from . import candidate_tester_tool # registers HarvestCandidateTesterTool
+ from . import tool_navigator_tool # registers ToolNavigatorTool
+ from . import interpro_tool
+ from . import kegg_tool
+ from . import iucn_tool
+ from . import jaspar_tool
+ from . import marine_species_tool
+ from . import cbioportal_tool
+ from . import phenome_jax_tool
else:
# With lazy loading, create lazy import proxies that import modules only when accessed
MonarchTool = _LazyImportProxy("restful_tool", "MonarchTool")
@@ -368,6 +380,13 @@ def __getattr__(self, name):
CellosaurusGetCellLineInfoTool = _LazyImportProxy(
"cellosaurus_tool", "CellosaurusGetCellLineInfoTool"
)
+ InterProTool = _LazyImportProxy("interpro_tool", "InterProTool")
+ KEGGTool = _LazyImportProxy("kegg_tool", "KEGGTool")
+ IUCNRedListTool = _LazyImportProxy("iucn_tool", "IUCNRedListTool")
+ JASPARRestTool = _LazyImportProxy("jaspar_tool", "JASPARRestTool")
+ MarineSpeciesTool = _LazyImportProxy("marine_species_tool", "MarineSpeciesTool")
+ CBioPortalTool = _LazyImportProxy("cbioportal_tool", "CBioPortalTool")
+ PhenomeJaxTool = _LazyImportProxy("phenome_jax_tool", "PhenomeJaxTool")
# Literature search tools
ArXivTool = _LazyImportProxy("arxiv_tool", "ArXivTool")
CrossrefTool = _LazyImportProxy("crossref_tool", "CrossrefTool")
@@ -453,6 +472,10 @@ def __getattr__(self, name):
"ODPHPItemList",
"ODPHPTopicSearch",
"ODPHPOutlinkFetch",
+ "ContextKeeperTool",
+ "HarvestCandidateTesterTool",
+ "GenericHarvestTool",
+ "ToolNavigatorTool",
"CellosaurusSearchTool",
"CellosaurusQueryConverterTool",
"CellosaurusGetCellLineInfoTool",
diff --git a/src/tooluniverse/candidate_tester_tool.py b/src/tooluniverse/candidate_tester_tool.py
new file mode 100644
index 00000000..24b742d9
--- /dev/null
+++ b/src/tooluniverse/candidate_tester_tool.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Optional
+
+from .tool_registry import register_tool
+from .vsd_utils import build_config, probe_config
+
+HARVEST_CANDIDATE_TESTER_SCHEMA = {
+ "type": "object",
+ "properties": {
+ "candidate": {"type": "object"},
+ "tool_type": {"type": "string", "default": "dynamic_rest"},
+ "default_params": {"type": "object"},
+ "default_headers": {"type": "object"},
+ },
+ "required": ["candidate"],
+ "additionalProperties": False,
+}
+
+HARVEST_CANDIDATE_TESTER_CONFIG = {
+ "name": "HarvestCandidateTesterTool",
+ "description": "Probe a harvest/VSD candidate endpoint and report JSON readiness without registering it.",
+ "type": "HarvestCandidateTesterTool",
+ "category": "special_tools",
+ "parameter": HARVEST_CANDIDATE_TESTER_SCHEMA,
+}
+
+
+@register_tool("HarvestCandidateTesterTool", config=HARVEST_CANDIDATE_TESTER_CONFIG)
+class HarvestCandidateTesterTool:
+ """
+ Validate harvest/VSD candidates without registering them.
+ Returns HTTP diagnostics and suggestions for default params or headers.
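+
+    Example arguments (candidate fields follow the harvest schema; the URL is illustrative):
+
+        {"candidate": {"name": "Example API", "url": "https://api.example.org/v1"}}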
+ """
+
+ name = "HarvestCandidateTesterTool"
+ description = "Test a harvest candidate endpoint to see if it returns usable JSON."
+ input_schema = HARVEST_CANDIDATE_TESTER_SCHEMA
+
+ def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None:
+ self.tool_config = tool_config or {}
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ candidate = arguments.get("candidate") or {}
+ tool_type = arguments.get("tool_type") or "dynamic_rest"
+ default_params = arguments.get("default_params")
+ default_headers = arguments.get("default_headers")
+
+ cfg = build_config(
+ candidate,
+ tool_type=tool_type,
+ default_params=default_params,
+ default_headers=default_headers,
+ )
+ probe = probe_config(cfg)
+
+ return {
+ "ok": bool(probe.get("ok")),
+ "test": probe,
+ "config": cfg,
+ }
diff --git a/src/tooluniverse/cbioportal_tool.py b/src/tooluniverse/cbioportal_tool.py
new file mode 100644
index 00000000..93a86a2f
--- /dev/null
+++ b/src/tooluniverse/cbioportal_tool.py
@@ -0,0 +1,74 @@
+import os
+from typing import Any, Dict, List
+
+import requests
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+CBIOPORTAL_BASE_URL = "https://www.cbioportal.org/api"
+CBIOPORTAL_TOKEN_ENV = "CBIOPORTAL_API_TOKEN"
+REQUEST_TIMEOUT = 30
+
+
+@register_tool("CBioPortalTool")
+class CBioPortalTool(BaseTool):
+ """
+ Wrapper around the cBioPortal REST API for study discovery.
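+
+    Example arguments (illustrative): {"keyword": "breast cancer", "page_size": 5}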
+ """
+
+ def __init__(self, tool_config):
+ super().__init__(tool_config)
+ self.session = requests.Session()
+
+ def _headers(self, arguments: Dict[str, Any]) -> Dict[str, str]:
+ headers = {"Accept": "application/json"}
+ token = (
+ arguments.get("token")
+ or self.tool_config.get("token")
+ or os.getenv(CBIOPORTAL_TOKEN_ENV)
+ )
+ if token:
+ headers["X-Auth-Token"] = token
+ return headers
+
+ def run(self, arguments):
+ keyword = (arguments or {}).get("keyword") or (arguments or {}).get("query")
+ if not keyword:
+ return {"error": "Missing required parameter: keyword"}
+
+ page_size = int(
+ (arguments or {}).get("page_size")
+ or self.tool_config.get("page_size", 20)
+ )
+ page_number = int((arguments or {}).get("page") or 0)
+
+ params = {
+ "keyword": keyword,
+ "pageSize": max(page_size, 1),
+ "pageNumber": max(page_number, 0),
+ "projection": "SUMMARY",
+ }
+
+ response = self.session.get(
+ f"{CBIOPORTAL_BASE_URL}/studies",
+ params=params,
+ headers=self._headers(arguments or {}),
+ timeout=REQUEST_TIMEOUT,
+ )
+ response.raise_for_status()
+ payload = response.json()
+
+ results: List[Dict[str, Any]] = []
+ for item in payload:
+ results.append(
+ {
+ "studyId": item.get("studyId"),
+ "name": item.get("name"),
+ "description": item.get("description"),
+ "cancerTypeId": item.get("cancerTypeId"),
+ "publicStudy": item.get("publicStudy"),
+ }
+ )
+
+ return {"results": results, "returned": len(results)}
diff --git a/src/tooluniverse/common_utils.py b/src/tooluniverse/common_utils.py
new file mode 100644
index 00000000..8fdb5d85
--- /dev/null
+++ b/src/tooluniverse/common_utils.py
@@ -0,0 +1,30 @@
+import base64
+import json
+import os
+import threading
+from typing import Any
+
+_LOCK = threading.Lock()
+
+def ensure_dir(path: str):
+ os.makedirs(path, exist_ok=True)
+
+def vsd_generated_path() -> str:
+ base = os.environ.get("TOOLUNIVERSE_VSD_DIR") or os.path.join(os.path.expanduser("~"), ".tooluniverse", "vsd")
+ ensure_dir(base)
+ return os.path.join(base, "generated_tools.json")
+
+def read_json(path: str, default):
+ try:
+ with open(path, "r", encoding="utf-8") as f:
+ return json.load(f)
+ except Exception:
+ return default
+
+def write_json(path: str, data: Any):
+    parent = os.path.dirname(path)
+    if parent:
+        ensure_dir(parent)
+    # Write to a sibling temp file, then atomically swap it into place.
+    tmp_path = f"{path}.tmp"
+    with open(tmp_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2)
+    os.replace(tmp_path, path)
+
+def b64_png(png_bytes: bytes) -> str:
+ return base64.b64encode(png_bytes).decode("ascii")
diff --git a/src/tooluniverse/context_keeper_tool.py b/src/tooluniverse/context_keeper_tool.py
new file mode 100644
index 00000000..46dd2b0c
--- /dev/null
+++ b/src/tooluniverse/context_keeper_tool.py
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import json
+import os
+from typing import Any, Dict, Optional
+
+from .tool_registry import register_tool
+
+CONTEXT_DIR = os.path.join(os.path.expanduser("~"), ".tooluniverse", "context")
+CONTEXT_PATH = os.path.join(CONTEXT_DIR, "context.json")
+
+
+def _ensure_dir() -> None:
+ os.makedirs(CONTEXT_DIR, exist_ok=True)
+
+
+def _load_context() -> Dict[str, Any]:
+ if not os.path.exists(CONTEXT_PATH):
+ return {}
+ try:
+ with open(CONTEXT_PATH, "r", encoding="utf-8") as handle:
+ data = json.load(handle)
+ if isinstance(data, dict):
+ return data
+ except Exception:
+ pass
+ return {}
+
+
+def _write_context(data: Dict[str, Any]) -> None:
+ _ensure_dir()
+ tmp_path = f"{CONTEXT_PATH}.tmp"
+ with open(tmp_path, "w", encoding="utf-8") as handle:
+ json.dump(data, handle, indent=2, ensure_ascii=False)
+ os.replace(tmp_path, CONTEXT_PATH)
+
+
+@register_tool("ContextKeeperTool")
+class ContextKeeperTool:
+ """
+ Lightweight context store that agents can use to persist conversation or task state
+ between ToolUniverse calls. Data is saved under ~/.tooluniverse/context/context.json.
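+
+    Example calls (keys and values are illustrative):
+
+        run({"action": "set", "key": "task", "value": "triage"})
+        run({"action": "append", "key": "notes", "value": "first pass complete"})
+        run({"action": "get", "key": "task"})
+        run({"action": "keys"})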
+ """
+
+ name = "ContextKeeperTool"
+ description = "Persist or retrieve task context (key/value pairs) for ongoing agent workflows."
+ input_schema = {
+ "type": "object",
+ "properties": {
+ "action": {
+ "type": "string",
+ "enum": ["get", "set", "append", "clear", "keys"],
+ "default": "get",
+ },
+ "key": {"type": "string", "description": "Context entry name"},
+ "value": {
+ "description": "Value to store; for append operations this should be a list item.",
+ },
+ },
+ "additionalProperties": False,
+ }
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ action = (arguments.get("action") or "get").lower()
+ key: Optional[str] = arguments.get("key")
+ value: Any = arguments.get("value")
+
+ context = _load_context()
+
+ if action == "keys":
+ return {"ok": True, "keys": sorted(context.keys())}
+
+ if action == "clear":
+ if key:
+ removed = context.pop(key, None) is not None
+ _write_context(context)
+ return {"ok": removed, "cleared": key if removed else None}
+ context.clear()
+ _write_context(context)
+ return {"ok": True, "cleared": "all"}
+
+ if action == "set":
+ if key is None:
+ return {"ok": False, "error": "key is required for set"}
+ context[key] = value
+ _write_context(context)
+ return {"ok": True, "key": key, "value": value}
+
+ if action == "append":
+ if key is None:
+ return {"ok": False, "error": "key is required for append"}
+ existing = context.get(key)
+ if existing is None:
+ context[key] = [value]
+ elif isinstance(existing, list):
+ existing.append(value)
+ else:
+ context[key] = [existing, value]
+ _write_context(context)
+ return {"ok": True, "key": key, "value": context[key]}
+
+ # default: get
+ if key:
+ return {"ok": True, "key": key, "value": context.get(key)}
+ return {"ok": True, "value": context}
diff --git a/src/tooluniverse/data/cbioportal_tools.json b/src/tooluniverse/data/cbioportal_tools.json
new file mode 100644
index 00000000..2cdc982b
--- /dev/null
+++ b/src/tooluniverse/data/cbioportal_tools.json
@@ -0,0 +1,34 @@
+[
+ {
+ "type": "CBioPortalTool",
+ "name": "cBioPortal_search_studies",
+ "description": "Search cBioPortal studies by keyword (supports optional API token).",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "keyword": {
+ "type": "string",
+ "description": "Keyword to search in study identifiers and descriptions."
+ },
+ "page": {
+ "type": "integer",
+ "description": "Zero-based page index.",
+ "default": 0,
+ "minimum": 0
+ },
+ "page_size": {
+ "type": "integer",
+ "description": "Number of records per page.",
+ "default": 20,
+ "minimum": 1,
+ "maximum": 1000
+ },
+ "token": {
+ "type": "string",
+ "description": "Optional API token (falls back to CBIOPORTAL_API_TOKEN environment variable)."
+ }
+ },
+ "required": ["keyword"]
+ }
+ }
+]
diff --git a/src/tooluniverse/data/interpro_tools.json b/src/tooluniverse/data/interpro_tools.json
new file mode 100644
index 00000000..bb3de457
--- /dev/null
+++ b/src/tooluniverse/data/interpro_tools.json
@@ -0,0 +1,30 @@
+[
+ {
+ "type": "InterProTool",
+ "name": "InterPro_search_entries",
+ "description": "Search InterPro entries by keyword using the official REST API.",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "query": {
+ "type": "string",
+ "description": "Keyword or accession to search within InterPro."
+ },
+ "page": {
+ "type": "integer",
+ "description": "Results page number (1-indexed).",
+ "default": 1,
+ "minimum": 1
+ },
+ "page_size": {
+ "type": "integer",
+ "description": "Number of records per page (max 200).",
+ "default": 25,
+ "minimum": 1,
+ "maximum": 200
+ }
+ },
+ "required": ["query"]
+ }
+ }
+]
diff --git a/src/tooluniverse/data/iucn_tools.json b/src/tooluniverse/data/iucn_tools.json
new file mode 100644
index 00000000..8f1b07b8
--- /dev/null
+++ b/src/tooluniverse/data/iucn_tools.json
@@ -0,0 +1,21 @@
+[
+ {
+ "type": "IUCNRedListTool",
+ "name": "IUCN_get_species_status",
+ "description": "Retrieve conservation status information from the IUCN Red List (requires IUCN_RED_LIST_TOKEN).",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "species": {
+ "type": "string",
+ "description": "Scientific name of the species (spaces allowed)."
+ },
+ "token": {
+ "type": "string",
+ "description": "Optional API token; falls back to IUCN_RED_LIST_TOKEN environment variable."
+ }
+ },
+ "required": ["species"]
+ }
+ }
+]
diff --git a/src/tooluniverse/data/jaspar_tools.json b/src/tooluniverse/data/jaspar_tools.json
new file mode 100644
index 00000000..b596865a
--- /dev/null
+++ b/src/tooluniverse/data/jaspar_tools.json
@@ -0,0 +1,38 @@
+[
+ {
+ "type": "JASPARRestTool",
+ "name": "JASPAR_search_motifs",
+ "description": "Search transcription factor binding motifs in the JASPAR database.",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "query": {
+ "type": "string",
+ "description": "Search term (matrix ID, gene symbol, or motif name)."
+ },
+ "tax_group": {
+ "type": "string",
+ "description": "Optional taxonomic group filter (e.g., vertebrates)."
+ },
+ "collection": {
+ "type": "string",
+ "description": "Optional collection filter (e.g., CORE, UNVALIDATED)."
+ },
+ "page": {
+ "type": "integer",
+ "description": "Page number (1-indexed).",
+ "default": 1,
+ "minimum": 1
+ },
+ "page_size": {
+ "type": "integer",
+ "description": "Number of results per page.",
+ "default": 10,
+ "minimum": 1,
+ "maximum": 100
+ }
+ },
+ "required": ["query"]
+ }
+ }
+]
diff --git a/src/tooluniverse/data/kegg_tools.json b/src/tooluniverse/data/kegg_tools.json
new file mode 100644
index 00000000..b20436be
--- /dev/null
+++ b/src/tooluniverse/data/kegg_tools.json
@@ -0,0 +1,27 @@
+[
+ {
+ "type": "KEGGTool",
+ "name": "KEGG_find_entries",
+ "description": "Find KEGG entries matching a query within a selected database.",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "query": {
+ "type": "string",
+ "description": "Search term (e.g., gene symbol, compound name)."
+ },
+ "database": {
+ "type": "string",
+ "description": "KEGG database name (pathway, compound, gene, etc.).",
+ "default": "pathway"
+ },
+ "max_results": {
+ "type": "integer",
+ "description": "Optional limit on number of results to return.",
+ "minimum": 1
+ }
+ },
+ "required": ["query"]
+ }
+ }
+]
diff --git a/src/tooluniverse/data/marine_species_tools.json b/src/tooluniverse/data/marine_species_tools.json
new file mode 100644
index 00000000..0d92a269
--- /dev/null
+++ b/src/tooluniverse/data/marine_species_tools.json
@@ -0,0 +1,27 @@
+[
+ {
+ "type": "MarineSpeciesTool",
+ "name": "MarineSpecies_lookup",
+ "description": "Lookup marine taxa using the World Register of Marine Species (WoRMS).",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "scientific_name": {
+ "type": "string",
+ "description": "Scientific name to search (exact or partial when like=true)."
+ },
+ "like": {
+ "type": "boolean",
+ "description": "Use fuzzy matching when true.",
+ "default": true
+ },
+ "marine_only": {
+ "type": "boolean",
+ "description": "Restrict results to marine taxa only.",
+ "default": true
+ }
+ },
+ "required": ["scientific_name"]
+ }
+ }
+]
diff --git a/src/tooluniverse/data/medlog_tools.json b/src/tooluniverse/data/medlog_tools.json
new file mode 100644
index 00000000..bf5799ce
--- /dev/null
+++ b/src/tooluniverse/data/medlog_tools.json
@@ -0,0 +1,134 @@
+[
+ {
+ "name": "MedLog_init_event",
+ "description": "Initialize or overwrite a MedLog event record. Supply the 9-field MedLog payload to capture headers, inputs, identities, and initial artifacts.",
+ "type": "MedLogInitEventTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "header": {
+ "type": "object",
+ "description": "MedLog header block including event_id, timestamps, risk metadata, and parent relationships."
+ },
+ "model_instance": {
+ "type": "object",
+ "description": "Model provenance metadata (model name, version, risk posture, vendor, etc.)."
+ },
+ "user_identity": {
+ "type": "object",
+ "description": "Information about the requesting user, clinician, or agent."
+ },
+ "target_identity": {
+ "type": "object",
+ "description": "Optional target entity such as patient or device identifiers."
+ },
+ "inputs": {
+ "type": "object",
+ "description": "Structured input payload captured at initialization."
+ },
+ "retention_tier": {
+ "type": "string",
+ "description": "Retention tier label (steady, critical, transient, etc.)."
+ }
+ },
+ "required": ["header", "model_instance", "user_identity"]
+ }
+ },
+ {
+ "name": "MedLog_append_fragment",
+ "description": "Append outputs, outcomes, artifacts, or feedback fragments to an existing MedLog event.",
+ "type": "MedLogAppendFragmentTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "event_id": {
+ "type": "string",
+ "description": "Identifier of the event to update."
+ },
+ "fragment": {
+ "type": "object",
+ "description": "Fragment payload containing any of internal_artifacts, outputs, outcomes, or user_feedback."
+ }
+ },
+ "required": ["event_id", "fragment"]
+ }
+ },
+ {
+ "name": "MedLog_get_provenance",
+ "description": "Fetch PROV-JSON bundle for a given event to support audit trails and lineage review.",
+ "type": "MedLogGetProvenanceTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "event_id": {
+ "type": "string",
+ "description": "Identifier of the event to retrieve."
+ }
+ },
+ "required": ["event_id"]
+ }
+ },
+ {
+ "name": "MedLog_query_events",
+ "description": "Query MedLog events by run or event identifier. Useful for dashboarding, analytics, and sampling inspection.",
+ "type": "MedLogQueryEventsTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "run_id": {
+ "type": "string",
+ "description": "Optional run identifier to filter results."
+ },
+ "event_id": {
+ "type": "string",
+ "description": "Optional event identifier to narrow results."
+ },
+ "limit": {
+ "type": "integer",
+ "description": "Maximum number of rows to return (default 50).",
+ "minimum": 1,
+ "maximum": 500
+ }
+ }
+ }
+ },
+ {
+ "name": "MedLog_export_parquet",
+ "description": "Trigger MedLog parquet export to the configured artifact directory.",
+ "type": "MedLogExportParquetTool",
+ "parameter": {
+ "type": "object",
+ "properties": {}
+ }
+ },
+ {
+ "name": "MedLog_fhir_bundle",
+ "description": "Retrieve the FHIR bundle synthesised for an individual MedLog event (Patient, Practitioner, Device, AuditEvent, Observations, Documents).",
+ "type": "MedLogFHIRBundleTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "event_id": {
+ "type": "string",
+ "description": "Identifier of the event to export."
+ }
+ },
+ "required": ["event_id"]
+ }
+ },
+ {
+ "name": "MedLog_fhir_run_bundle",
+ "description": "Aggregate all events in a run into a consolidated FHIR bundle for care-path review.",
+ "type": "MedLogFHIRRunBundleTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "run_id": {
+ "type": "string",
+ "description": "Run identifier to export."
+ }
+ },
+ "required": ["run_id"]
+ }
+ }
+]
diff --git a/src/tooluniverse/data/medtok_mcp_tools.json b/src/tooluniverse/data/medtok_mcp_tools.json
new file mode 100644
index 00000000..fef79cbf
--- /dev/null
+++ b/src/tooluniverse/data/medtok_mcp_tools.json
@@ -0,0 +1,11 @@
+[
+ {
+ "name": "mcp_auto_loader_medtok",
+ "description": "Discover and register MedTok tools from a running MedTok MCP server so they can be invoked directly through ToolUniverse.",
+ "type": "MCPAutoLoaderTool",
+ "server_url": "http://${MEDTOK_MCP_SERVER_HOST}:9001/mcp",
+ "tool_prefix": "medtok_",
+ "auto_register": true,
+ "required_api_keys": ["MEDTOK_MCP_SERVER_HOST"]
+ }
+]
diff --git a/src/tooluniverse/data/medtok_tools.json b/src/tooluniverse/data/medtok_tools.json
new file mode 100644
index 00000000..c54fe67b
--- /dev/null
+++ b/src/tooluniverse/data/medtok_tools.json
@@ -0,0 +1,134 @@
+[
+ {
+ "name": "MedTok_tokenize",
+ "description": "Tokenize one or more medical codes using the MedTok multimodal tokenizer. Useful for exposing token IDs and optional metadata to downstream workflows.",
+ "type": "MedTokTokenizeTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "codes": {
+ "type": "array",
+ "items": { "type": "string" },
+ "description": "List of codes to tokenize (e.g., ICD-10 identifiers)."
+ },
+ "system": {
+ "type": "string",
+ "description": "Coding system, defaults to ICD-10."
+ },
+ "include_metadata": {
+ "type": "boolean",
+ "description": "Return region-level metadata for each code."
+ }
+ },
+ "required": ["codes"]
+ }
+ },
+ {
+ "name": "MedTok_embed",
+ "description": "Generate MedTok embeddings for a batch of codes. Returns floating-point vectors suitable for similarity search or downstream ML tasks.",
+ "type": "MedTokEmbedTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "codes": {
+ "type": "array",
+ "items": { "type": "string" },
+ "description": "Codes to embed."
+ },
+ "system": {
+ "type": "string",
+ "description": "Coding system, defaults to ICD-10."
+ }
+ },
+ "required": ["codes"]
+ }
+ },
+ {
+ "name": "MedTok_nearest_neighbors",
+ "description": "Retrieve the nearest neighbours for a code from the MedTok embedding space with similarity scores.",
+ "type": "MedTokNearestNeighborsTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "code": {
+ "type": "string",
+ "description": "Anchor code for the neighbourhood query."
+ },
+ "system": {
+ "type": "string",
+ "description": "Coding system, defaults to ICD-10."
+ },
+ "k": {
+ "type": "integer",
+ "description": "Number of neighbours to return (default 5).",
+ "minimum": 1,
+ "maximum": 50
+ }
+ },
+ "required": ["code"]
+ }
+ },
+ {
+ "name": "MedTok_map_text_to_code",
+ "description": "Map free-text clinical language to the most relevant code using MedTok text semantics.",
+ "type": "MedTokMapTextTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "text": {
+ "type": "string",
+ "description": "Clinical description or narrative."
+ },
+ "system": {
+ "type": "string",
+ "description": "Target coding system, defaults to ICD-10."
+ }
+ },
+ "required": ["text"]
+ }
+ },
+ {
+ "name": "MedTok_search_text",
+ "description": "Hybrid text + semantic search over the MedTok vocabulary. Useful for exploratory lookup workflows.",
+ "type": "MedTokSearchTextTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "text": {
+ "type": "string",
+ "description": "Query text to search for."
+ },
+ "system": {
+ "type": ["string", "null"],
+ "description": "Optional coding system filter."
+ },
+ "k": {
+ "type": "integer",
+ "description": "Maximum number of matches (default 5).",
+ "minimum": 1,
+ "maximum": 50
+ }
+ },
+ "required": ["text"]
+ }
+ },
+ {
+ "name": "MedTok_code_info",
+ "description": "Retrieve metadata for a specific code including synonyms and graph context when available.",
+ "type": "MedTokCodeInfoTool",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "code": {
+ "type": "string",
+ "description": "Code identifier to fetch."
+ },
+ "system": {
+ "type": "string",
+ "description": "Coding system, defaults to ICD-10."
+ }
+ },
+ "required": ["code"]
+ }
+ }
+]
diff --git a/src/tooluniverse/data/phenome_jax_tools.json b/src/tooluniverse/data/phenome_jax_tools.json
new file mode 100644
index 00000000..2b3ad09c
--- /dev/null
+++ b/src/tooluniverse/data/phenome_jax_tools.json
@@ -0,0 +1,23 @@
+[
+ {
+ "type": "PhenomeJaxTool",
+ "name": "PhenomeJax_list_projects",
+ "description": "List Mouse Phenome Database projects with optional keyword filtering.",
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "keyword": {
+ "type": "string",
+ "description": "Optional keyword to filter projects by title or description."
+ },
+ "limit": {
+ "type": "integer",
+ "description": "Maximum number of projects to return.",
+ "default": 20,
+ "minimum": 1,
+ "maximum": 200
+ }
+ }
+ }
+ }
+]
diff --git a/src/tooluniverse/data/vsd.json b/src/tooluniverse/data/vsd.json
new file mode 100644
index 00000000..b359048e
--- /dev/null
+++ b/src/tooluniverse/data/vsd.json
@@ -0,0 +1,35 @@
+[
+ {
+ "name": "GenericHarvestTool",
+ "type": "GenericHarvestTool",
+ "description": "Live-harvest candidate API endpoints by invoking all modules in tooluniverse.harvest.",
+ "tool_type": "special_tools",
+ "enabled": true,
+ "visible": true,
+ "parameter": {
+ "type": "object",
+ "properties": {
+ "query": {
+ "type": "string",
+ "description": "Free-text hint, passed to all harvesters under tooluniverse.harvest."
+ },
+ "urls": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "uri"
+ },
+ "description": "Explicit candidate URLs to validate and return (skips live harvesting)."
+ },
+ "limit": {
+ "type": "integer",
+ "minimum": 1,
+ "maximum": 50,
+ "default": 5,
+ "description": "Max number of candidates to return."
+ }
+ },
+ "additionalProperties": false
+ }
+ }
+]
diff --git a/src/tooluniverse/data/vsd_allowlist.json b/src/tooluniverse/data/vsd_allowlist.json
new file mode 100644
index 00000000..3c5258a2
--- /dev/null
+++ b/src/tooluniverse/data/vsd_allowlist.json
@@ -0,0 +1,4 @@
+[
+{"domain": "ema.europa.eu", "label": "EMA", "trust": 0.95, "registry": "ema"},
+{"domain": "ghoapi.azureedge.net", "label": "WHO GHO", "trust": 0.92, "registry": "who"}
+]
\ No newline at end of file
diff --git a/src/tooluniverse/data/vsd_tools.json b/src/tooluniverse/data/vsd_tools.json
new file mode 100644
index 00000000..398cd86b
--- /dev/null
+++ b/src/tooluniverse/data/vsd_tools.json
@@ -0,0 +1,34 @@
+[
+ {
+ "type": "VerifiedSourceDiscoveryTool",
+ "name": "vsd_discover_sources",
+ "description": "Discover trusted candidate sources for a free-text query",
+ "parameter": {
+ "type": "object",
+ "required": ["query"],
+ "properties": {
+ "query": { "type": "string" },
+ "limit": { "type": "integer" },
+ "allowlist_overrides": { "type": "array" }
+ }
+ },
+ "label": ["VSD", "Discovery"]
+ },
+ {
+ "type": "VerifiedSourceRegisterTool",
+ "name": "vsd_register_tool",
+ "description": "Register a VSD-generated tool bound to a trusted source",
+ "parameter": {
+ "type": "object",
+ "required": ["candidate", "tool_name"],
+ "properties": {
+ "candidate": { "type": "object" },
+ "tool_name": { "type": "string" },
+ "description": { "type": "string" },
+ "parameter_overrides": { "type": "object" },
+ "evidence_sample": { "type": "object" }
+ }
+ },
+ "label": ["VSD", "Synthesis"]
+ }
+]
diff --git a/src/tooluniverse/default_config.py b/src/tooluniverse/default_config.py
index 46095834..b3a64a5e 100644
--- a/src/tooluniverse/default_config.py
+++ b/src/tooluniverse/default_config.py
@@ -70,6 +70,13 @@
"medlineplus": os.path.join(current_dir, "data", "medlineplus_tools.json"),
"uniprot": os.path.join(current_dir, "data", "uniprot_tools.json"),
"cellosaurus": os.path.join(current_dir, "data", "cellosaurus_tools.json"),
+ "interpro": os.path.join(current_dir, "data", "interpro_tools.json"),
+ "kegg": os.path.join(current_dir, "data", "kegg_tools.json"),
+ "iucn_red_list": os.path.join(current_dir, "data", "iucn_tools.json"),
+ "jaspar": os.path.join(current_dir, "data", "jaspar_tools.json"),
+ "marine_species": os.path.join(current_dir, "data", "marine_species_tools.json"),
+ "cbioportal": os.path.join(current_dir, "data", "cbioportal_tools.json"),
+ "phenome_jax": os.path.join(current_dir, "data", "phenome_jax_tools.json"),
# 'software': os.path.join(current_dir, 'data', 'software_tools.json'),
# Package tools - categorized software tools
"software_bioinformatics": os.path.join(
@@ -150,6 +157,11 @@
"genomics": os.path.join(current_dir, "data", "genomics_tools.json"),
# Guideline and health policy tools
"guidelines": os.path.join(current_dir, "data", "unified_guideline_tools.json"),
+ "medtok": os.path.join(current_dir, "data", "medtok_tools.json"),
+ "medtok_mcp_auto_loader": os.path.join(
+ current_dir, "data", "medtok_mcp_tools.json"
+ ),
+ "medlog": os.path.join(current_dir, "data", "medlog_tools.json"),
}
diff --git a/src/tooluniverse/dynamic_rest_runner.py b/src/tooluniverse/dynamic_rest_runner.py
new file mode 100644
index 00000000..a3061d36
--- /dev/null
+++ b/src/tooluniverse/dynamic_rest_runner.py
@@ -0,0 +1,194 @@
+"""
+Dynamic REST/GraphQL tool loader for Verified Source Directory (VSD).
+
+This module keeps an in-memory registry of generated tool specifications and
+exposes helper functions for refreshing, inserting, or removing entries. Tools
+are backed by lightweight BaseTool subclasses that issue HTTP requests using
+the stored configuration.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import threading
+from typing import Any, Dict, Optional
+
+import requests
+
+from .base_tool import BaseTool
+from .common_utils import read_json, vsd_generated_path
+from .tool_registry import register_config, register_tool
+
+LOGGER = logging.getLogger("DynamicRESTRunner")
+_REGISTRY_LOCK = threading.Lock()
+_GENERATED_TOOLS: Dict[str, Dict[str, Any]] = {}
+
+
+def _normalize_spec(spec: Any) -> Dict[str, Dict[str, Any]]:
+ """
+ Accept legacy list or dict formats and normalize to {name: config}.
+ """
+ if isinstance(spec, dict):
+ if "generated_tools" in spec and isinstance(spec["generated_tools"], list):
+ return {
+ item.get("name"): dict(item)
+ for item in spec["generated_tools"]
+ if isinstance(item, dict) and item.get("name")
+ }
+ return {
+ name: dict(cfg)
+ for name, cfg in spec.items()
+ if isinstance(cfg, dict)
+ }
+
+ if isinstance(spec, list):
+ result: Dict[str, Dict[str, Any]] = {}
+ for item in spec:
+ if isinstance(item, dict) and item.get("name"):
+ result[item["name"]] = dict(item)
+ return result
+
+ return {}
+
+
+def _load_generated_specs() -> Dict[str, Dict[str, Any]]:
+ path = vsd_generated_path()
+ data = read_json(path, {})
+ return _normalize_spec(data)
+
+
+def _build_request_kwargs(config: Dict[str, Any], arguments: Dict[str, Any]) -> Dict[str, Any]:
+ fields = config.get("fields", {})
+ method = fields.get("method", "GET").upper()
+ timeout = fields.get("timeout", 30)
+ headers = fields.get("headers", {})
+ default_params = fields.get("default_params", {})
+
+ params = dict(default_params)
+ body: Optional[Any] = None
+
+ if method in {"GET", "DELETE"}:
+ params.update(arguments)
+ else:
+ if fields.get("body_format", "json") == "form":
+ body = dict(arguments)
+ else:
+ body = arguments or {}
+
+ kwargs: Dict[str, Any] = {
+ "method": method,
+ "url": fields.get("base_url"),
+ "headers": headers,
+ "timeout": timeout,
+ }
+ if params:
+ kwargs["params"] = params
+ if body is not None:
+ if fields.get("body_format", "json") == "form":
+ kwargs["data"] = body
+ else:
+ kwargs["json"] = body
+ return kwargs
+
+
+def _handle_response(response: requests.Response) -> Any:
+ try:
+ return response.json()
+ except ValueError:
+ return {
+ "status_code": response.status_code,
+ "text": response.text,
+ }
+
+
+@register_tool("GenericRESTTool")
+class GenericRESTTool(BaseTool):
+ """
+ Generic REST tool generated from a VSD configuration.
+ """
+
+ def run(self, arguments=None, stream_callback=None, **_: Any):
+ arguments = arguments or {}
+ kwargs = _build_request_kwargs(self.tool_config, arguments)
+ method = kwargs.pop("method")
+ url = kwargs.pop("url")
+
+ response = requests.request(method, url, **kwargs)
+ response.raise_for_status()
+ result = _handle_response(response)
+
+ if stream_callback:
+ stream_callback(json.dumps(result))
+ return result
+
+
+@register_tool("GenericGraphQLTool")
+class GenericGraphQLTool(BaseTool):
+ """
+ Generic GraphQL tool generated from a VSD configuration.
+ """
+
+ def run(self, arguments=None, stream_callback=None, **_: Any):
+ arguments = arguments or {}
+ fields = self.tool_config.get("fields", {})
+ headers = fields.get("headers", {})
+ timeout = fields.get("timeout", 30)
+ payload = {
+ "query": arguments.get("query") or fields.get("default_query"),
+ "variables": arguments.get("variables") or fields.get("default_variables", {}),
+ }
+
+ response = requests.post(
+ fields.get("base_url"),
+ json=payload,
+ headers=headers,
+ timeout=timeout,
+ )
+ response.raise_for_status()
+ result = _handle_response(response)
+
+ if stream_callback:
+ stream_callback(json.dumps(result))
+ return result
+
+
+def _register_generated_tool(tool_name: str, config: Dict[str, Any]) -> None:
+ config = dict(config)
+ config.setdefault("name", tool_name)
+ tool_type = config.get("type") or "GenericRESTTool"
+
+ register_config(tool_name, config)
+ _GENERATED_TOOLS[tool_name] = config
+
+ LOGGER.debug("Registered generated tool %s of type %s", tool_name, tool_type)
+
+
+def refresh_generated_registry() -> Dict[str, Dict[str, Any]]:
+ """
+ Reload generated tool specs from disk and update the runtime registry.
+ """
+ specs = _load_generated_specs()
+ with _REGISTRY_LOCK:
+ _GENERATED_TOOLS.clear()
+ for name, cfg in specs.items():
+ _register_generated_tool(name, cfg)
+ return specs
+
+
+def upsert_generated_tool(tool_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Insert or update a generated tool in the runtime registry.
+ """
+ with _REGISTRY_LOCK:
+ _register_generated_tool(tool_name, config)
+ return _GENERATED_TOOLS[tool_name]
+
+
+def remove_generated_tool(tool_name: str) -> None:
+ """
+ Remove a generated tool from the runtime registry.
+ """
+ with _REGISTRY_LOCK:
+ _GENERATED_TOOLS.pop(tool_name, None)
+ LOGGER.debug("Removed generated tool %s", tool_name)
diff --git a/src/tooluniverse/harvest/__init__.py b/src/tooluniverse/harvest/__init__.py
new file mode 100644
index 00000000..19c21109
--- /dev/null
+++ b/src/tooluniverse/harvest/__init__.py
@@ -0,0 +1 @@
+# Harvest subpackage
diff --git a/src/tooluniverse/harvest/domain_policies.py b/src/tooluniverse/harvest/domain_policies.py
new file mode 100644
index 00000000..49031914
--- /dev/null
+++ b/src/tooluniverse/harvest/domain_policies.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+from functools import lru_cache
+from typing import Dict, List
+
+# Conservative allow/deny fragments. We still compute a trust score as a gradient.
+ALLOWED_FRAGMENTS: List[str] = [
+ # government & intergovernmental
+ ".gov", ".mil", ".gob", ".gouv", ".go.", ".govt.nz", ".gc.ca",
+ "who.int", "worldbank.org", "oecd.org", "europa.eu", "esa.int",
+ # major scientific/health orgs
+ "nih.gov", "niddk.nih.gov", "ninds.nih.gov", "ncbi.nlm.nih.gov", "data.cdc.gov", "api.cdc.gov",
+ "fda.gov", "api.fda.gov", "epa.gov", "noaa.gov", "usgs.gov", "census.gov",
+ "data.gov", "healthdata.gov", "data.cms.gov", "data.hrsa.gov", "data.hhs.gov",
+ "ghoapi.azureedge.net",
+]
+
+BLOCKED_FRAGMENTS: List[str] = [
+ "mirror", "docshare", "scribd.com", "sharepdf", "academia.edu",
+ "stackprinter", "cachedview", "wayback", "pirated", "scrapeops",
+]
+
+@lru_cache(maxsize=4096)
+def domain_blocked(host: str) -> bool:
+ h = (host or "").lower()
+ return any(b in h for b in BLOCKED_FRAGMENTS)
+
+@lru_cache(maxsize=4096)
+def domain_allowed(host: str) -> bool:
+ # allow if any strong allow fragment present AND not blocked
+ h = (host or "").lower()
+ if domain_blocked(h):
+ return False
+ return any(a in h for a in ALLOWED_FRAGMENTS)
+
+@lru_cache(maxsize=4096)
+def trust_score(host: str) -> Dict:
+ """Return a graded trust score in [0,1] with reasons for ranking.
+ We don't *block* here (that's domain_blocked); we provide a signal for ranker.
+ """
+ h = (host or "").lower()
+ score = 0.0
+ reasons: List[str] = []
+ if domain_blocked(h):
+ return {"score": 0.0, "reasons": ["blocked"]}
+
+    # strong positives
+    if any(tld in h for tld in (".gov", "who.int", "worldbank.org", "europa.eu", "oecd.org")):
+        score += 0.65
+        reasons.append("gov/igo domain")
+    if any(seg in h for seg in ("nih.gov", "ncbi.nlm.nih.gov", "fda.gov", "epa.gov", "noaa.gov", "usgs.gov", "census.gov")):
+        score += 0.2
+        reasons.append("major science/health org")
+    # medium positives: API subdomain (h is a bare hostname, so no path to check)
+    if h.startswith("api."):
+        score += 0.05
+        reasons.append("api host")
+    # slight boost for data portals
+    if any(seg in h for seg in ("data.gov", "healthdata.gov", "data.cms.gov", "data.cdc.gov", "data.europa.eu")):
+        score += 0.08
+        reasons.append("open data portal")
+
+ score = max(0.0, min(1.0, score))
+ return {"score": round(score, 3), "reasons": reasons}
\ No newline at end of file
diff --git a/src/tooluniverse/harvest/openapi_utils.py b/src/tooluniverse/harvest/openapi_utils.py
new file mode 100644
index 00000000..4adcddd0
--- /dev/null
+++ b/src/tooluniverse/harvest/openapi_utils.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+import re, logging
+from typing import Dict, Optional, List
+import requests
+
+logger = logging.getLogger("OpenAPIUtils")
+
+OPENAPI_HINTS = ["openapi.json","openapi.yaml","openapi.yml","swagger.json","swagger.yaml","v3/api-docs"]
+
+def _root_of(url: str) -> str:
+ base = url.split("?",1)[0]
+ base = re.sub(r"(#.*)$","", base)
+ base = re.sub(r"/+$","", base)
+ m = re.match(r"^https?://[^/]+", base)
+ return m.group(0) if m else base
+
+def find_openapi_from_url(any_url: str) -> Optional[str]:
+ root = _root_of(any_url)
+ # try /openapi.json etc. at root and one level up
+ tries = [f"{root}/{hint}" for hint in OPENAPI_HINTS]
+ # also try without trailing /api segment if present
+ if root.endswith("/api"):
+ base = root.rsplit("/",1)[0]
+ tries.extend(f"{base}/{hint}" for hint in OPENAPI_HINTS)
+ for t in tries:
+ try:
+ r = requests.get(t, timeout=8)
+ if r.status_code == 200 and ("json" in r.headers.get("Content-Type","") or t.endswith(".json")):
+ # quick JSON sanity
+ try:
+ j = r.json()
+ if "openapi" in j or "swagger" in j:
+ return t
+ except Exception:
+ pass
+ if r.status_code == 200 and (t.endswith(".yaml") or t.endswith(".yml")):
+ return t
+ except requests.RequestException:
+ continue
+ return None
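+
+# Illustrative (requires network access; the URL below is a placeholder):
+#   spec_url = find_openapi_from_url("https://api.example.org/api")
+#   if spec_url:
+#       parsed = parse_openapi(spec_url)  # {"base_url": ..., "endpoints": [...]}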
+
+def parse_openapi(spec_url: str) -> Dict:
+ r = requests.get(spec_url, timeout=15)
+ r.raise_for_status()
+ text = r.text
+ if spec_url.endswith((".yaml",".yml")):
+ try:
+ import yaml
+ except Exception as e:
+ raise RuntimeError("YAML support requires PyYAML: pip install pyyaml") from e
+ spec = yaml.safe_load(text)
+ else:
+ spec = r.json()
+
+ servers = spec.get("servers") or []
+ base_url = (servers[0].get("url") if servers and isinstance(servers[0], dict) else None) or None
+
+ paths = spec.get("paths") or {}
+ endpoints: List[Dict] = []
+ for path, methods in paths.items():
+ if not isinstance(methods, dict):
+ continue
+ for method, meta in methods.items():
+ if method.upper() not in ("GET","POST","PUT","PATCH","DELETE","OPTIONS","HEAD"):
+ continue
+ endpoints.append({"path": path, "method": method.upper(), "summary": (meta or {}).get("summary")})
+ return {"base_url": base_url, "endpoints": endpoints}
\ No newline at end of file
diff --git a/src/tooluniverse/harvest/promoter.py b/src/tooluniverse/harvest/promoter.py
new file mode 100644
index 00000000..6ef0d4d6
--- /dev/null
+++ b/src/tooluniverse/harvest/promoter.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+import os, json, tempfile
+from typing import Dict, Any, List
+
+# Where we persist generated tool configs so the dynamic REST runner (or the
+# server boot process) can load them: ~/.tooluniverse/vsd/generated_tools.json.
+VSD_DIR = os.path.join(os.path.expanduser("~"), ".tooluniverse", "vsd")
+VSD_PATH = os.path.join(VSD_DIR, "generated_tools.json")
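+
+# The persisted file maps tool name -> config dict, e.g. (illustrative):
+#   {"vsd_auto_example": {"name": "vsd_auto_example", "type": "DynamicREST", ...}}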
+
+def _ensure_dir():
+ os.makedirs(VSD_DIR, exist_ok=True)
+
+def _read_json(path: str) -> Any:
+ if not os.path.exists(path):
+ return {}
+ try:
+ with open(path, "r", encoding="utf-8") as f:
+ return json.load(f) or {}
+ except Exception:
+ return {}
+
+def _atomic_write(path: str, data: Any):
+    # Create the temp file alongside the target so the final rename stays
+    # atomic (os.replace across filesystems is not).
+    tmp_fd, tmp_path = tempfile.mkstemp(
+        prefix="vsd_", suffix=".json", dir=os.path.dirname(path) or "."
+    )
+    os.close(tmp_fd)
+    with open(tmp_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, ensure_ascii=False)
+    os.replace(tmp_path, path)
+
+def _slug(host: str) -> str:
+ return (host or "unknown").lower().replace(".", "_").replace("-", "_")
+
+def build_candidate_tool_json(c: Dict[str, Any]) -> Dict[str, Any]:
+ # Minimal, UI-friendly payload for listing/debug
+ return {
+ "name": c.get("name"),
+ "host": c.get("host"),
+ "base_url": c.get("base_url"),
+ "doc_url": c.get("doc_url"),
+ "openapi_url": c.get("openapi_url"),
+ "endpoints": c.get("endpoints"),
+ "health": c.get("health"),
+ "cors": c.get("cors"),
+ "trust": c.get("trust"),
+ "source": c.get("source"),
+ "_rank_score": c.get("_rank_score"),
+ }
+
+def _dynamicrest_tool_config(c: Dict[str, Any]) -> Dict[str, Any]:
+ """Produce a DynamicREST-style tool definition.
+ Two modes:
+ - OpenAPI mode (preferred): reference spec URL.
+ - Manual mode: infer a few GET endpoints from verification results.
+ """
+ name = f"vsd_auto_{_slug(c.get('host') or '')}"
+ base_url = c.get("base_url")
+ openapi_url = c.get("openapi_url")
+ endpoints = c.get("endpoints") or []
+
+ cfg: Dict[str, Any] = {
+ "name": name,
+ "type": "DynamicREST",
+ "base_url": base_url,
+ "auth": c.get("auth") or {"type": "none"},
+ "metadata": {
+ "source": c.get("source"),
+ "trust": c.get("trust"),
+ "health": c.get("health"),
+ "doc_url": c.get("doc_url"),
+ },
+ }
+ if openapi_url:
+ cfg["openapi"] = {"spec_url": openapi_url}
+ elif endpoints:
+ # Trim to a handful of GET endpoints
+ routes: List[Dict[str, Any]] = []
+ for ep in endpoints[:5]:
+ routes.append({
+ "method": ep.get("method") or "GET",
+ "path": ep.get("path") or "/",
+ "name": (ep.get("summary") or ep.get("path") or "endpoint").strip("/").replace("/", "_")[:64] or "endpoint",
+ })
+ cfg["routes"] = routes
+ else:
+ # Last resort: allow a generic GET on '/'
+ cfg["routes"] = [{"method": "GET", "path": "/"}]
+ return cfg
+
+def promote_to_dynamicrest(c: Dict[str, Any]) -> str:
+ """Append/Update the generated tool config file so your server can load it.
+ Returns the registered tool name.
+ """
+ _ensure_dir()
+ current = _read_json(VSD_PATH)
+ if not isinstance(current, dict):
+ current = {}
+
+ cfg = _dynamicrest_tool_config(c)
+ name = cfg.get("name") or "vsd_auto_unknown"
+ current[name] = cfg
+ _atomic_write(VSD_PATH, current)
+ return name
\ No newline at end of file
diff --git a/src/tooluniverse/harvest/query_expansion.py b/src/tooluniverse/harvest/query_expansion.py
new file mode 100644
index 00000000..4ac4e959
--- /dev/null
+++ b/src/tooluniverse/harvest/query_expansion.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+from typing import List
+
+DENTAL_SYNONYMS = [
+ "oral health", "dentistry", "dental caries", "tooth decay",
+ "periodontal", "periodontitis", "orthodontic", "endodontic",
+ "prosthodontic", "oral cancer", "DMFT", "fluoride", "NIDCR", "CDC Oral Health",
+ "WHO Oral Health"
+]
+
+def expand_queries(query: str, max_queries: int = 6) -> List[str]:
+ base = query.strip()
+ if not base:
+ return []
+ expanded = [base,
+ f"{base} WHO API",
+ f"{base} site:who.int",
+ f"{base} site:data.cdc.gov",
+ f"{base} site:api.fda.gov"]
+ for syn in DENTAL_SYNONYMS[:4]:
+ expanded.append(f"{base} {syn}")
+ # de-dup and clip
+ seen = []
+ for q in expanded:
+ if q not in seen:
+ seen.append(q)
+ return seen[:max_queries]
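+
+# Illustrative: expand_queries("oral health", max_queries=4) returns
+#   ["oral health", "oral health WHO API",
+#    "oral health site:who.int", "oral health site:data.cdc.gov"]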
diff --git a/src/tooluniverse/harvest/ranker.py b/src/tooluniverse/harvest/ranker.py
new file mode 100644
index 00000000..aa898ad1
--- /dev/null
+++ b/src/tooluniverse/harvest/ranker.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+import math
+from typing import List, Dict
+
+def _sim(a: str, b: str) -> float:
+ a,b = (a or "").lower(), (b or "").lower()
+ if not a or not b:
+ return 0.0
+ aset, bset = set(a.split()), set(b.split())
+ overlap = len(aset & bset)
+ return overlap / (len(aset) + 1e-6)
+
+def rank_candidates(query: str, candidates: List[Dict]) -> List[Dict]:
+ def score(c: Dict) -> float:
+ trust = float(((c.get("trust") or {}).get("score") or 0.0))
+ h = c.get("health") or {}
+ live = 1.0 if (h.get("ok") and (h.get("status",0) < 500)) else 0.0
+ lat = h.get("latency_ms") or 1500
+ lat_norm = max(0.0, 1.0 - min(lat, 4000)/4000.0)
+ fit = max(_sim(query, c.get("name","")), _sim(query, c.get("doc_url","")))
+ has_spec = 1.0 if c.get("openapi_url") else 0.2 if c.get("endpoints") else 0.0
+ cors = 0.3 if (c.get("cors") or {}).get("preflight") else 0.0
+ match_bonus = float(c.get("_match_score") or 0.0)
+ return (
+ 0.25 * trust
+ + 0.2 * (live * lat_norm)
+ + 0.23 * fit
+ + 0.1 * has_spec
+ + 0.05 * cors
+ + (0.35 * math.log1p(match_bonus) if match_bonus > 0 else 0.0)
+ )
+
+ ranked = sorted(candidates, key=score, reverse=True)
+    for c in ranked:
+        c["_rank_score"] = round(score(c), 4)
+ return ranked
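+
+# Illustrative weighting: a candidate with trust 0.9, a healthy 200 ms probe,
+# an OpenAPI spec, no CORS preflight, and no keyword match scores about
+#   0.25*0.9 + 0.2*(1.0*0.95) + 0.1 = 0.515.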
diff --git a/src/tooluniverse/harvest/searchers.py b/src/tooluniverse/harvest/searchers.py
new file mode 100644
index 00000000..e9daf2e8
--- /dev/null
+++ b/src/tooluniverse/harvest/searchers.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+import os, re, logging, requests, json
+from dataclasses import dataclass
+from typing import List, Optional, Dict, Any
+
+logger = logging.getLogger("HarvestSearch")
+DEFAULT_TIMEOUT = int(os.getenv("HARVEST_TIMEOUT_S", "8"))
+
+@dataclass
+class SearchResult:
+ title: str
+ url: str
+ snippet: str
+ source: str
+
+def _clean_host(url: str) -> str:
+ return re.sub(r"^https?://", "", url or "").split("/")[0].lower()
+
+def _normalize_candidate_url(url: str) -> str:
+ return (url or "").strip()
+
+# ---------------- CKAN adapter ----------------
+def _search_ckan(query: str, rows: int, base_url: str) -> List[SearchResult]:
+ out: List[SearchResult] = []
+ try:
+ r = requests.get(base_url, params={"q": query, "rows": rows}, timeout=DEFAULT_TIMEOUT)
+ r.raise_for_status()
+ payload = r.json()
+ # CKAN payload guard
+ result = (payload or {}).get("result") or {}
+ for pkg in result.get("results", []):
+ title = pkg.get("title") or pkg.get("name") or "CKAN dataset"
+ notes = (pkg.get("notes") or "")[:240]
+ for res in (pkg.get("resources") or []):
+ res_url = _normalize_candidate_url(res.get("url") or "")
+ if not res_url:
+ continue
+ out.append(SearchResult(title=title, url=res_url, snippet=notes, source=f"ckan:{_clean_host(base_url)}"))
+ except Exception as e:
+ logger.debug("CKAN search failed for %s: %s", base_url, e)
+ return out
+
+CATALOG_ADAPTERS = {
+ "ckan": _search_ckan,
+}
+
+def search_for_apis(query: str, rows: int = 100, catalogs: Optional[List[Dict[str, Any]]] = None) -> List[SearchResult]:
+ """Search across configured catalogs.
+ catalogs: list of dicts, e.g. [{"type": "ckan", "url": "https://.../api/3/action/package_search"}]
+    Supply catalogs directly, or set env HARVEST_CATALOGS='[ ... ]' (a JSON list).
+    """
+    results: List[SearchResult] = []
+    if catalogs is None:
+        try:
+            catalogs = json.loads(os.getenv("HARVEST_CATALOGS", "[]")) or []
+        except ValueError:
+            logger.debug("Ignoring invalid HARVEST_CATALOGS JSON")
+            catalogs = []
+ for cat in catalogs:
+ ctype = (cat.get("type") or "").lower().strip()
+ url = cat.get("url") or ""
+ if not ctype or not url:
+ continue
+ adapter = CATALOG_ADAPTERS.get(ctype)
+ if not adapter:
+ logger.debug("Unknown catalog type %s, skipping", ctype)
+ continue
+ results.extend(adapter(query=query, rows=rows, base_url=url))
+ return results
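+
+# Illustrative (healthdata.gov exposes a CKAN catalog endpoint):
+#   search_for_apis(
+#       "oral health",
+#       catalogs=[{"type": "ckan", "url": "https://healthdata.gov/api/3/action/package_search"}],
+#   )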
diff --git a/src/tooluniverse/harvest/static_catalog.py b/src/tooluniverse/harvest/static_catalog.py
new file mode 100644
index 00000000..83536f94
--- /dev/null
+++ b/src/tooluniverse/harvest/static_catalog.py
@@ -0,0 +1,539 @@
+from __future__ import annotations
+
+import re
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Set
+from urllib.parse import urlparse
+
+from .domain_policies import trust_score
+from .ranker import rank_candidates
+
+
+# -----------------------------------------------------------------------------
+# Static catalog data
+# -----------------------------------------------------------------------------
+
+RAW_CATALOG: List[Dict[str, object]] = [
+ {
+ "name": "ClinicalTrials.gov Study Fields API",
+ "url": "https://clinicaltrials.gov/api/query/study_fields",
+ "doc_url": "https://clinicaltrials.gov/api/gui/home",
+ "description": "Query structured fields from the ClinicalTrials.gov registry covering study design, enrollment, outcomes, and locations.",
+ "keywords": ["clinical", "trial", "study", "research", "ctgov", "clinicaltrials"],
+ "category": "clinical_trials",
+ "base_score": 0.95,
+ "endpoints": [
+ {"method": "GET", "path": "/api/query/study_fields", "summary": "Query study fields"},
+ {"method": "GET", "path": "/api/query/full_studies", "summary": "Fetch full study records"},
+ ],
+ },
+ {
+ "name": "NCI Clinical Trials API",
+ "url": "https://clinicaltrialsapi.cancer.gov/api/v1/clinical-trials",
+ "doc_url": "https://clinicaltrialsapi.cancer.gov",
+ "description": "REST API exposing cancer clinical trials curated by the National Cancer Institute (NCI) with filters across disease, stage, and therapy.",
+ "keywords": ["clinical", "trial", "oncology", "cancer", "nci", "research"],
+ "category": "clinical_trials",
+ "base_score": 0.88,
+ "endpoints": [
+ {"method": "GET", "path": "/api/v1/clinical-trials", "summary": "Search cancer clinical trials"},
+ {"method": "GET", "path": "/api/v1/diseases", "summary": "List disease terms"},
+ ],
+ },
+ {
+ "name": "FDA OpenFDA Drug Label API",
+ "url": "https://api.fda.gov/drug/label.json",
+ "doc_url": "https://open.fda.gov/apis/drug/label/",
+ "description": "OpenFDA drug labeling information with pharmacology, indications, warnings, and dosage guidance.",
+ "keywords": ["drug", "label", "fda", "pharmaceutical", "medication", "clinical"],
+ "category": "pharmacovigilance",
+ "base_score": 0.6,
+ "endpoints": [
+ {"method": "GET", "path": "/drug/label.json", "summary": "Query drug labeling records"},
+ {"method": "GET", "path": "/drug/event.json", "summary": "Retrieve drug adverse events"},
+ ],
+ },
+ {
+ "name": "FDA OpenFDA Adverse Events API",
+ "url": "https://api.fda.gov/drug/event.json",
+ "doc_url": "https://open.fda.gov/apis/drug/event/",
+ "description": "Adverse event case reports submitted to FDA FAERS with patient outcomes and drug role details.",
+ "keywords": ["adverse", "event", "pharmacovigilance", "drug safety", "faers"],
+ "category": "pharmacovigilance",
+ "base_score": 0.65,
+ "endpoints": [
+ {"method": "GET", "path": "/drug/event.json", "summary": "Search FAERS adverse event data"},
+ ],
+ },
+ {
+ "name": "FDA OpenFDA Device Recall API",
+ "url": "https://api.fda.gov/device/recall.json",
+ "doc_url": "https://open.fda.gov/apis/device/recall/",
+ "description": "Medical device recall records including classification, recall reason, and event dates.",
+ "keywords": ["medical device", "recall", "fda", "safety", "compliance"],
+ "category": "device_safety",
+ "base_score": 0.55,
+ "endpoints": [
+ {"method": "GET", "path": "/device/recall.json", "summary": "Retrieve device recall records"},
+ ],
+ },
+ {
+ "name": "CDC Socrata Open Data API",
+ "url": "https://data.cdc.gov/resource/9mfq-cb36.json",
+ "doc_url": "https://dev.socrata.com/foundry/data.cdc.gov/9mfq-cb36",
+ "description": "CDC curated datasets accessible via the Socrata Open Data API, including COVID-19 cases and vaccinations.",
+ "keywords": ["cdc", "public health", "covid", "vaccination", "socrata", "open data"],
+ "category": "public_health",
+ "base_score": 0.86,
+ "endpoints": [
+ {"method": "GET", "path": "/resource/.json", "summary": "Query CDC open datasets"},
+ ],
+ },
+ {
+ "name": "CDC PLACES Community Health API",
+ "url": "https://chronicdata.cdc.gov/resource/cwsq-ngmh.json",
+ "doc_url": "https://dev.socrata.com/foundry/chronicdata.cdc.gov/cwsq-ngmh",
+ "description": "Model-based estimates for chronic disease, health risk factors, and preventive services at local levels; supports community health assessments and dental health overlays.",
+ "keywords": ["community health", "chronic disease", "behavioral health", "cdc", "oral health"],
+ "category": "public_health",
+ "base_score": 0.8,
+ "endpoints": [
+ {"method": "GET", "path": "/resource/cwsq-ngmh.json", "summary": "Retrieve PLACES health estimates"},
+ ],
+ },
+ {
+ "name": "CDC Oral Health Data Portal API",
+ "url": "https://data.cdc.gov/resource/4nhi-4p9m.json",
+ "doc_url": "https://dev.socrata.com/foundry/data.cdc.gov/4nhi-4p9m",
+ "description": "Community oral health indicators including dental visits, sealant prevalence, and fluoridation coverage for dentistry analytics.",
+ "keywords": ["oral health", "dentistry", "dental", "fluoride", "sealant", "cdc"],
+ "category": "dentistry",
+ "base_score": 0.81,
+ "endpoints": [
+ {"method": "GET", "path": "/resource/4nhi-4p9m.json", "summary": "Query oral health indicator records"},
+ ],
+ },
+ {
+ "name": "WHO Global Health Observatory API",
+ "url": "https://ghoapi.azureedge.net/api/Indicator",
+ "doc_url": "https://www.who.int/data/gho/info/gho-odata-api",
+ "description": "World Health Organization indicators covering global health metrics, vaccination, and disease burden.",
+ "keywords": ["who", "global health", "indicator", "vaccination", "disease surveillance"],
+ "category": "global_health",
+ "base_score": 0.87,
+ "endpoints": [
+ {"method": "GET", "path": "/api/Indicator", "summary": "List WHO health indicators"},
+ {"method": "GET", "path": "/api/Indicator?$filter", "summary": "Filter indicators by code"},
+ ],
+ },
+ {
+ "name": "NIH RePORTER Projects API",
+ "url": "https://api.reporter.nih.gov/v2/projects/search",
+ "doc_url": "https://api.reporter.nih.gov/",
+ "description": "NIH-funded research projects with abstracts, funding amounts, and investigator information.",
+ "keywords": ["nih", "grants", "research", "project", "biomedical"],
+ "category": "research_funding",
+ "base_score": 0.83,
+ "endpoints": [
+ {"method": "POST", "path": "/v2/projects/search", "summary": "Search NIH-funded projects"},
+ ],
+ },
+ {
+ "name": "NCBI E-utilities ESummary API",
+ "url": "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
+ "doc_url": "https://www.ncbi.nlm.nih.gov/books/NBK25500/",
+ "description": "Programmatic access to NCBI databases including PubMed, nucleotide, protein, and ClinVar content.",
+ "keywords": ["ncbi", "genomics", "pubmed", "sequence", "biomedical"],
+ "category": "genomics",
+ "base_score": 0.84,
+ "endpoints": [
+ {"method": "GET", "path": "/entrez/eutils/esearch.fcgi", "summary": "Search NCBI databases"},
+ {"method": "GET", "path": "/entrez/eutils/esummary.fcgi", "summary": "Retrieve database summaries"},
+ ],
+ },
+ {
+ "name": "Ensembl REST API",
+ "url": "https://rest.ensembl.org/info/ping",
+ "doc_url": "https://rest.ensembl.org",
+ "description": "Genomics REST service for Ensembl data including genes, variants, and comparative genomics with JSON outputs.",
+ "keywords": ["ensembl", "genomics", "variants", "gene", "rest service", "bioinformatics"],
+ "category": "genomics",
+ "base_score": 0.8,
+ "endpoints": [
+ {"method": "GET", "path": "/lookup/id/{id}", "summary": "Lookup Ensembl gene or transcript"},
+ {"method": "GET", "path": "/overlap/region/{species}/{region}", "summary": "Fetch features overlapping a region"},
+ ],
+ },
+ {
+ "name": "SAMHSA Behavioral Health Treatment Services Locator API",
+ "url": "https://findtreatment.samhsa.gov/locator",
+ "doc_url": "https://findtreatment.samhsa.gov/developers",
+ "description": "Behavioral health treatment provider directory with search by service type, payment, and location.",
+ "keywords": ["mental health", "treatment", "behavioral health", "samhsa"],
+ "category": "mental_health",
+ "base_score": 0.81,
+ "endpoints": [
+ {"method": "GET", "path": "/locator", "summary": "Search behavioral health providers"},
+ ],
+ },
+ {
+ "name": "USDA FoodData Central API",
+ "url": "https://api.nal.usda.gov/fdc/v1/foods/search",
+ "doc_url": "https://fdc.nal.usda.gov/api-guide.html",
+ "description": "Nutrient composition data for branded and experimental foods, with search and detail endpoints.",
+ "keywords": ["nutrition", "food", "dietary", "usda", "nutrients"],
+ "category": "nutrition",
+ "base_score": 0.79,
+ "endpoints": [
+ {"method": "POST", "path": "/fdc/v1/foods/search", "summary": "Search foods by keyword"},
+ {"method": "GET", "path": "/fdc/v1/food/{fdcId}", "summary": "Retrieve nutrient profile"},
+ ],
+ },
+ {
+ "name": "CDC Vaccination Coverage API",
+ "url": "https://data.cdc.gov/resource/8xkx-amqh.json",
+ "doc_url": "https://dev.socrata.com/foundry/data.cdc.gov/8xkx-amqh",
+ "description": "US vaccination coverage estimates by vaccine and demographic segment.",
+ "keywords": ["vaccination", "immunization", "cdc", "coverage", "public health"],
+ "category": "vaccination",
+ "base_score": 0.8,
+ "endpoints": [
+ {"method": "GET", "path": "/resource/8xkx-amqh.json", "summary": "Vaccination coverage records"},
+ ],
+ },
+ {
+ "name": "NOAA Climate Data Online API",
+ "url": "https://www.ncdc.noaa.gov/cdo-web/api/v2/datasets",
+ "doc_url": "https://www.ncdc.noaa.gov/cdo-web/webservices/v2",
+ "description": "Climate and weather datasets from NOAA including temperature, precipitation, and extremes for environmental monitoring and early warning systems.",
+ "keywords": ["environment", "environmental", "weather", "climate", "noaa", "meteorology", "monitoring"],
+ "category": "environmental",
+ "base_score": 0.78,
+ "endpoints": [
+ {"method": "GET", "path": "/cdo-web/api/v2/datasets", "summary": "List NOAA datasets"},
+ {"method": "GET", "path": "/cdo-web/api/v2/data", "summary": "Query climate observations"},
+ ],
+ },
+ {
+ "name": "EPA AirNow API",
+ "url": "https://www.airnowapi.org/aq/data/",
+ "doc_url": "https://docs.airnowapi.org/",
+ "description": "Air quality measurements and forecasts for US monitoring stations, including pollutants and AQI, supporting environmental monitoring pipelines.",
+ "keywords": ["air quality", "environment", "environmental", "epa", "pollution", "aqi", "monitoring"],
+ "category": "environmental",
+ "base_score": 0.77,
+ "endpoints": [
+ {"method": "GET", "path": "/aq/data/", "summary": "Retrieve air quality data"},
+ ],
+ },
+ {
+ "name": "Orphanet Rare Disease API",
+ "url": "https://www.orpha.net/OrphAPI/api/Disease",
+ "doc_url": "https://api.orphanet.net/OrphAPI/#!/Disease",
+ "description": "Rare disease catalog with Orpha codes, synonyms, epidemiology, and classification.",
+ "keywords": ["rare disease", "orphanet", "orpha", "genetic", "registry"],
+ "category": "rare_disease",
+ "base_score": 0.76,
+ "endpoints": [
+ {"method": "GET", "path": "/OrphAPI/api/Disease", "summary": "List rare diseases"},
+ {"method": "GET", "path": "/OrphAPI/api/Disease/{OrphaCode}", "summary": "Retrieve disease details"},
+ ],
+ },
+ {
+ "name": "RAREDISEASES.info NIH Service",
+ "url": "https://rarediseases.info.nih.gov/services/v1/diseases",
+ "doc_url": "https://rarediseases.info.nih.gov/developers",
+ "description": "NIH Genetic and Rare Diseases (GARD) API providing disease descriptions, symptoms, and resources.",
+ "keywords": ["rare disease", "nih", "gard", "genetic", "registry"],
+ "category": "rare_disease",
+ "base_score": 0.75,
+ "endpoints": [
+ {"method": "GET", "path": "/services/v1/diseases", "summary": "Search rare diseases"},
+ ],
+ },
+ {
+ "name": "USAFacts COVID-19 API",
+ "url": "https://api.usafacts.org/covid/covid-api/v1/cases",
+ "doc_url": "https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/api/",
+ "description": "County-level COVID-19 cases and deaths in the United States with daily updates.",
+ "keywords": ["covid", "pandemic", "surveillance", "epidemiology"],
+ "category": "pandemic",
+ "base_score": 0.74,
+ "endpoints": [
+ {"method": "GET", "path": "/covid/covid-api/v1/cases", "summary": "Retrieve COVID-19 cases"},
+ ],
+ },
+ {
+ "name": "Global.Health Line List API",
+ "url": "https://covid19-api.global.health/v1/line-list",
+ "doc_url": "https://global.health/documentation/api",
+ "description": "Anonymized global case line lists for pathogen surveillance, including demographics and travel history.",
+ "keywords": ["pandemic", "outbreak", "surveillance", "line list", "global health"],
+ "category": "pandemic",
+ "base_score": 0.73,
+ "endpoints": [
+ {"method": "GET", "path": "/v1/line-list", "summary": "Retrieve outbreak line list"},
+ ],
+ },
+ {
+ "name": "OpenFDA Food Enforcement API",
+ "url": "https://api.fda.gov/food/enforcement.json",
+ "doc_url": "https://open.fda.gov/apis/food/enforcement/",
+ "description": "Food recall enforcement reports with product description, reason, and distribution data.",
+ "keywords": ["food", "recall", "fda", "safety", "enforcement"],
+ "category": "food_safety",
+ "base_score": 0.55,
+ "endpoints": [
+ {"method": "GET", "path": "/food/enforcement.json", "summary": "Search food recall enforcement"},
+ ],
+ },
+ {
+ "name": "USDA National Farmers Market Directory API",
+ "url": "https://search.ams.usda.gov/farmersmarkets/v1/data.svc/zipSearch",
+ "doc_url": "https://www.ams.usda.gov/services/local-regional/food-directories-datasets",
+ "description": "Directory of US farmers markets with location, operation schedule, and services.",
+ "keywords": ["nutrition", "food access", "farmers market", "usda"],
+ "category": "nutrition",
+ "base_score": 0.7,
+ "endpoints": [
+ {"method": "GET", "path": "/farmersmarkets/v1/data.svc/zipSearch", "summary": "Find farmers markets by ZIP"},
+ ],
+ },
+ {
+ "name": "HealthData.gov CKAN Catalog API",
+ "url": "https://healthdata.gov/api/3/action/package_search",
+ "doc_url": "https://healthdata.gov/developer",
+ "description": "Catalog of US Department of Health and Human Services datasets via CKAN API.",
+ "keywords": ["open data", "catalog", "health data", "ckan", "metadata"],
+ "category": "data_catalog",
+ "base_score": 0.82,
+ "endpoints": [
+ {"method": "GET", "path": "/api/3/action/package_search", "summary": "Search dataset catalog"},
+ ],
+ },
+ {
+ "name": "data.gov CKAN Catalog API",
+ "url": "https://catalog.data.gov/api/3/action/package_search",
+ "doc_url": "https://catalog.data.gov/dataset",
+ "description": "US Federal data catalog with metadata across climate, energy, health, and finance.",
+ "keywords": ["open data", "catalog", "federal", "ckan", "metadata"],
+ "category": "data_catalog",
+ "base_score": 0.8,
+ "endpoints": [
+ {"method": "GET", "path": "/api/3/action/package_search", "summary": "Search the federal data catalog"},
+ ],
+ },
+ {
+ "name": "Europe PMC RESTful API",
+ "url": "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
+ "doc_url": "https://europepmc.org/RestfulWebService",
+ "description": "Biomedical literature, grants, and patents from Europe PMC with advanced search syntax.",
+ "keywords": ["literature", "research", "biomedical", "europe pmc", "publications"],
+ "category": "literature",
+ "base_score": 0.78,
+ "endpoints": [
+ {"method": "GET", "path": "/webservices/rest/search", "summary": "Search biomedical literature"},
+ ],
+ },
+ {
+ "name": "OpenAlex Graph API",
+ "url": "https://api.openalex.org/works",
+ "doc_url": "https://docs.openalex.org/api",
+ "description": "Scholarly works, authors, concepts, and institutions graph with filtering for literature discovery and citation analysis.",
+ "keywords": ["literature", "openalex", "scholarly", "citations", "research graph"],
+ "category": "literature",
+ "base_score": 0.77,
+ "endpoints": [
+ {"method": "GET", "path": "/works", "summary": "Search scholarly works"},
+ {"method": "GET", "path": "/authors", "summary": "Browse scholarly authors"},
+ ],
+ },
+]
+
+
+# -----------------------------------------------------------------------------
+# Internal helpers
+# -----------------------------------------------------------------------------
+
+TOKEN_PATTERN = re.compile(r"[a-z0-9]+")
+
+
+def _tokenize(text: str) -> Set[str]:
+ tokens = set(TOKEN_PATTERN.findall((text or "").lower()))
+ enriched: Set[str] = set(tokens)
+ for tok in tokens:
+ if len(tok) <= 2:
+ continue
+ if tok.endswith("ies") and len(tok) > 3:
+ enriched.add(tok[:-3] + "y")
+ if tok.endswith("ing") and len(tok) > 4:
+ enriched.add(tok[:-3])
+ if tok.endswith("al") and len(tok) > 4:
+ enriched.add(tok[:-2])
+ if tok.endswith("s") and len(tok) > 3:
+ enriched.add(tok[:-1])
+ return enriched
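+
+# Illustrative: _tokenize("dental caries") ->
+#   {"dental", "dent", "caries", "carie", "cary"}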
+
+
+@dataclass(frozen=True)
+class CatalogRecord:
+ data: Dict[str, object]
+ tokens: Set[str]
+ keyword_tokens: Set[str]
+ base_score: float
+
+
+def _prepare_catalog(raw_items: Iterable[Dict[str, object]]) -> List[CatalogRecord]:
+ prepared: List[CatalogRecord] = []
+ for item in raw_items:
+ entry = deepcopy(item)
+
+ url = str(entry.get("url") or "").strip()
+ if not url:
+ continue
+ parsed = urlparse(url)
+ host = parsed.netloc.lower()
+ base_url = f"{parsed.scheme}://{parsed.netloc}"
+
+ entry.setdefault("host", host)
+ entry.setdefault("base_url", base_url)
+ entry.setdefault("source", "static_catalog")
+ entry.setdefault("doc_url", entry.get("doc_url") or f"{base_url}/")
+ entry.setdefault("health", {"ok": True, "status": 200, "latency_ms": 180, "checked": "static"})
+ entry.setdefault("cors", {"preflight": False})
+ entry.setdefault("trust", trust_score(host))
+
+ keywords = entry.get("keywords") or []
+ if keywords:
+ desc = entry.get("description") or ""
+ kw_text = "; ".join(str(k) for k in keywords)
+ if kw_text and kw_text.lower() not in desc.lower():
+ entry["description"] = f"{desc} (keywords: {kw_text})"
+ keyword_tokens = _tokenize(" ".join(map(str, keywords)))
+ text_tokens = _tokenize(" ".join(
+ str(part) for part in (
+ entry.get("name", ""),
+ entry.get("description", ""),
+ entry.get("category", ""),
+ entry.get("doc_url", ""),
+ )
+ ))
+
+ base_score = float(entry.get("base_score") or 0.0)
+
+ prepared.append(
+ CatalogRecord(
+ data=entry,
+ tokens=text_tokens | keyword_tokens,
+ keyword_tokens=keyword_tokens,
+ base_score=base_score,
+ )
+ )
+
+ return prepared
+
+
+CATALOG: List[CatalogRecord] = _prepare_catalog(RAW_CATALOG)
+
+
+# -----------------------------------------------------------------------------
+# Public harvester interface
+# -----------------------------------------------------------------------------
+
+def _score_entry(tokens: Set[str], record: CatalogRecord) -> float:
+ if not tokens:
+ return record.base_score + 0.5
+
+ keyword_overlap = len(tokens & record.keyword_tokens)
+ text_overlap = len(tokens & record.tokens)
+
+ if keyword_overlap == 0 and text_overlap == 0:
+ return record.base_score * 0.1
+
+ precision = keyword_overlap / (len(tokens) or 1)
+ coverage = (keyword_overlap + text_overlap) / (len(record.tokens) or 1)
+
+ return (
+ 2.0 * keyword_overlap
+ + 1.2 * text_overlap
+ + 1.5 * precision
+ + 1.0 * coverage
+ + record.base_score * 0.25
+ )
+
+
+SYNONYM_MAP = {
+ "clinical": ["trial", "research"],
+ "dentistry": ["dental", "oral", "oralhealth"],
+ "dental": ["dentistry", "oral", "oralhealth"],
+ "oral": ["dentistry", "dental", "oralhealth"],
+ "environmental": ["environment", "climate", "monitoring"],
+ "environment": ["environmental", "climate", "air"],
+ "monitoring": ["surveillance", "tracking"],
+ "rare": ["orphan", "orphanet", "genetic"],
+ "disease": ["condition", "illness"],
+ "genomics": ["genomic", "gene", "sequence", "dna"],
+ "genomic": ["genomics", "gene", "dna"],
+ "pandemic": ["outbreak", "surveillance"],
+ "surveillance": ["monitoring", "tracking"],
+ "nutrition": ["food", "diet", "dietary"],
+ "vaccination": ["immunization", "vaccine"],
+ "mental": ["behavioral", "behavior", "psych"],
+ "health": ["healthcare", "publichealth"],
+ "pharmaceutical": ["drug", "medicine"],
+ "adverse": ["safety", "pharmacovigilance"],
+}
+
+
+def harvest(query: str, limit: int = 5, **kwargs) -> List[Dict[str, object]]:
+ """
+ Harvest candidate API endpoints from the static catalog.
+
+ Args:
+ query: Natural language search string.
+ limit: Maximum number of candidates to return.
+ **kwargs: Unused passthrough parameters for compatibility.
+ """
+ limit = max(1, min(int(limit or 5), 50))
+ query = (query or "").strip()
+
+ if not CATALOG:
+ return []
+
+ if not query:
+ top = sorted(CATALOG, key=lambda rec: rec.base_score, reverse=True)[:limit]
+ return [deepcopy(rec.data) for rec in top]
+
+ token_union: Set[str] = _tokenize(query)
+ for token in list(token_union):
+ for syn in SYNONYM_MAP.get(token, []):
+ token_union |= _tokenize(syn)
+
+ scored: List[Dict[str, object]] = []
+ for record in CATALOG:
+ score = _score_entry(token_union, record)
+ if score <= 0 and record.base_score <= 0:
+ continue
+ candidate = deepcopy(record.data)
+ candidate["_match_score"] = round(score, 4)
+ candidate["_match_terms"] = sorted(token_union & record.tokens)
+ scored.append(candidate)
+
+ if not scored:
+ top = sorted(CATALOG, key=lambda rec: rec.base_score, reverse=True)[:limit]
+ return [deepcopy(rec.data) for rec in top]
+
+ preliminary = sorted(scored, key=lambda c: c["_match_score"], reverse=True)[: limit * 3]
+ ranked = rank_candidates(query, preliminary)
+ final = ranked[:limit]
+
+ for cand in final:
+ cand.pop("_match_score", None)
+ cand.pop("_match_terms", None)
+
+ return final
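+
+# Illustrative: harvest("dental sealant prevalence", limit=3) ranks catalog
+# entries by keyword overlap, e.g. the CDC Oral Health Data Portal API.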
+
+
+__all__ = ["harvest"]
diff --git a/src/tooluniverse/harvest/verifier.py b/src/tooluniverse/harvest/verifier.py
new file mode 100644
index 00000000..2da35df9
--- /dev/null
+++ b/src/tooluniverse/harvest/verifier.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+import os, time, logging, requests
+from typing import Dict, Optional
+
+logger = logging.getLogger("HarvestVerify")
+DEFAULT_TIMEOUT = int(os.getenv("HARVEST_TIMEOUT_S", "8"))
+SIZE_LIMIT = int(os.getenv("HARVEST_MAX_BYTES", "2000000"))
+JSON_ACCEPT = {"Accept": "application/json"}
+
+def _head(url: str, timeout=None):
+ try:
+ return requests.head(url, timeout=timeout or DEFAULT_TIMEOUT, allow_redirects=True)
+ except requests.RequestException:
+ return None
+
+def _health_probe(url: str, timeout=None) -> Dict:
+ t0 = time.time()
+ try:
+ rh = _head(url, timeout)
+ if rh is not None:
+ clen = int(rh.headers.get("Content-Length") or 0)
+ if clen and clen > SIZE_LIMIT:
+ return {"ok": False, "status": rh.status_code, "skipped": f"large({clen})"}
+ r = requests.get(url, timeout=timeout or DEFAULT_TIMEOUT, headers=JSON_ACCEPT)
+ return {"ok": r.status_code < 500, "status": r.status_code, "latency_ms": int((time.time()-t0)*1000), "ctype": r.headers.get("Content-Type","")}
+ except requests.RequestException as e:
+ return {"ok": False, "status": 0, "error": str(e)}
+
+def verify_candidate(result, timeout_s: Optional[int] = None) -> Optional[Dict]:
+ url = (result.url or "").strip()
+    if not url:
+        return None
+ health = _health_probe(url, timeout=timeout_s)
+ return {"name": result.title, "url": url, "health": health, "source": result.source}
diff --git a/src/tooluniverse/interpro_tool.py b/src/tooluniverse/interpro_tool.py
new file mode 100644
index 00000000..99653e01
--- /dev/null
+++ b/src/tooluniverse/interpro_tool.py
@@ -0,0 +1,63 @@
+import requests
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+INTERPRO_BASE_URL = "https://www.ebi.ac.uk/interpro/api/entry/interpro/"
+REQUEST_TIMEOUT = 30
+
+
+@register_tool("InterProTool")
+class InterProTool(BaseTool):
+ """
+ Tool wrapper for the InterPro REST API.
+ Provides entry search with pagination support.
+ """
+
+ def __init__(self, tool_config):
+ super().__init__(tool_config)
+ self.session = requests.Session()
+
+ def run(self, arguments):
+ query = (arguments or {}).get("query") or (arguments or {}).get("search")
+ if not query:
+ return {"error": "Missing required parameter: query"}
+
+ page = int((arguments or {}).get("page") or 1)
+ page_size = int(
+ (arguments or {}).get("page_size")
+ or self.tool_config.get("page_size", 25)
+ )
+
+ params = {
+ "search": query,
+ "page": max(page, 1),
+ "page_size": max(min(page_size, 200), 1),
+ }
+
+ response = self.session.get(
+ INTERPRO_BASE_URL, params=params, timeout=REQUEST_TIMEOUT
+ )
+ response.raise_for_status()
+ payload = response.json()
+
+ entries = []
+ for item in payload.get("results", []):
+ metadata = item.get("metadata", {})
+ entries.append(
+ {
+ "accession": metadata.get("accession"),
+ "name": metadata.get("name"),
+ "short_name": metadata.get("short_name"),
+ "type": metadata.get("type"),
+ "source_database": metadata.get("source_database"),
+ "integrated": metadata.get("integrated"),
+ }
+ )
+
+ return {
+ "count": payload.get("count", len(entries)),
+ "next": payload.get("next"),
+ "previous": payload.get("previous"),
+ "results": entries,
+ }
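+
+
+# Illustrative usage (assumes BaseTool accepts a plain config dict):
+#   InterProTool({"name": "InterPro_search", "type": "InterProTool"}).run(
+#       {"query": "kinase", "page_size": 5}
+#   )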
diff --git a/src/tooluniverse/iucn_tool.py b/src/tooluniverse/iucn_tool.py
new file mode 100644
index 00000000..49752fe2
--- /dev/null
+++ b/src/tooluniverse/iucn_tool.py
@@ -0,0 +1,73 @@
+import os
+from typing import Any, Dict, List
+
+import requests
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+IUCN_BASE_URL = "https://apiv3.iucnredlist.org/api/v3/species/"
+IUCN_TOKEN_ENV = "IUCN_RED_LIST_TOKEN"
+REQUEST_TIMEOUT = 30
+
+
+@register_tool("IUCNRedListTool")
+class IUCNRedListTool(BaseTool):
+ """
+ Wrapper around the IUCN Red List API for species status lookups.
+ Requires an API token supplied via arguments, tool config, or environment.
+ """
+
+ def __init__(self, tool_config):
+ super().__init__(tool_config)
+ self.session = requests.Session()
+
+ def _resolve_token(self, arguments: Dict[str, Any]) -> str:
+ candidate = (
+ (arguments or {}).get("token")
+ or self.tool_config.get("token")
+ or os.getenv(IUCN_TOKEN_ENV)
+ )
+ if not candidate:
+ raise ValueError(
+ f"Missing IUCN API token. Provide 'token' argument or set {IUCN_TOKEN_ENV}."
+ )
+ return candidate
+
+ def run(self, arguments):
+ species = (arguments or {}).get("species") or (arguments or {}).get(
+ "species_name"
+ )
+ if not species:
+ return {"error": "Missing required parameter: species"}
+
+ try:
+ token = self._resolve_token(arguments or {})
+ except ValueError as exc:
+ return {"error": str(exc)}
+
+ response = self.session.get(
+ f"{IUCN_BASE_URL}{species}",
+ params={"token": token},
+ timeout=REQUEST_TIMEOUT,
+ )
+
+ if response.status_code == 404:
+ return {"count": 0, "results": []}
+
+ response.raise_for_status()
+ payload = response.json()
+
+ results: List[Dict[str, Any]] = []
+ for entry in payload.get("result", []):
+ results.append(
+ {
+ "scientific_name": entry.get("scientific_name"),
+ "category": entry.get("category"),
+ "population_trend": entry.get("population_trend"),
+ "distribution": entry.get("countries"),
+ "published_year": entry.get("published_year"),
+ }
+ )
+
+ return {"count": len(results), "results": results}
diff --git a/src/tooluniverse/jaspar_tool.py b/src/tooluniverse/jaspar_tool.py
new file mode 100644
index 00000000..399859db
--- /dev/null
+++ b/src/tooluniverse/jaspar_tool.py
@@ -0,0 +1,61 @@
+import requests
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+JASPAR_BASE_URL = "https://jaspar.elixir.no/api/v1/matrix/"
+REQUEST_TIMEOUT = 30
+
+
+@register_tool("JASPARRestTool")
+class JASPARRestTool(BaseTool):
+ """
+ Wrapper around the JASPAR REST API for matrix searches.
+ """
+
+ def __init__(self, tool_config):
+ super().__init__(tool_config)
+ self.session = requests.Session()
+
+ def run(self, arguments):
+ query = (arguments or {}).get("query") or (arguments or {}).get("search")
+ if not query:
+ return {"error": "Missing required parameter: query"}
+
+ params = {
+ "search": query,
+ "page": (arguments or {}).get("page", 1),
+ "page_size": (arguments or {}).get("page_size")
+ or self.tool_config.get("page_size", 10),
+ }
+
+ for optional in ("tax_group", "collection", "type"):
+ value = (arguments or {}).get(optional)
+ if value:
+ params[optional] = value
+
+ response = self.session.get(
+ JASPAR_BASE_URL, params=params, timeout=REQUEST_TIMEOUT
+ )
+ response.raise_for_status()
+ payload = response.json()
+
+ results = []
+ for item in payload.get("results", []):
+ results.append(
+ {
+ "matrix_id": item.get("matrix_id"),
+ "name": item.get("name"),
+ "collection": item.get("collection"),
+ "tax_group": item.get("tax_group"),
+ "class": item.get("class"),
+ "family": item.get("family"),
+ }
+ )
+
+ return {
+ "count": payload.get("count", len(results)),
+ "next": payload.get("next"),
+ "previous": payload.get("previous"),
+ "results": results,
+ }
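+
+
+# Illustrative usage:
+#   JASPARRestTool({"name": "jaspar_search"}).run(
+#       {"query": "GATA", "tax_group": "vertebrates"}
+#   )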
diff --git a/src/tooluniverse/kegg_tool.py b/src/tooluniverse/kegg_tool.py
new file mode 100644
index 00000000..31262f57
--- /dev/null
+++ b/src/tooluniverse/kegg_tool.py
@@ -0,0 +1,56 @@
+from typing import List
+
+import requests
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+KEGG_BASE_URL = "https://rest.kegg.jp"
+REQUEST_TIMEOUT = 30
+
+
+@register_tool("KEGGTool")
+class KEGGTool(BaseTool):
+ """
+ Lightweight wrapper around the KEGG REST API for text-based queries.
+ """
+
+ def __init__(self, tool_config):
+ super().__init__(tool_config)
+ self.session = requests.Session()
+
+ def run(self, arguments):
+ query = (arguments or {}).get("query")
+ if not query:
+ return {"error": "Missing required parameter: query"}
+
+ database = (arguments or {}).get("database") or self.tool_config.get(
+ "database", "pathway"
+ )
+ max_results = (arguments or {}).get("max_results") or self.tool_config.get(
+ "max_results"
+ )
+
+ endpoint = f"{KEGG_BASE_URL}/find/{database}/{query}"
+ response = self.session.get(endpoint, timeout=REQUEST_TIMEOUT)
+ response.raise_for_status()
+
+ lines: List[str] = [
+ line for line in response.text.splitlines() if line.strip()
+ ]
+ if max_results:
+ try:
+ limit = int(max_results)
+ lines = lines[: max(limit, 0)]
+ except ValueError:
+ pass
+
+ results = []
+ for line in lines:
+ if "\t" in line:
+ identifier, description = line.split("\t", 1)
+ else:
+ identifier, description = line, ""
+ results.append({"id": identifier, "description": description})
+
+ return results
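+
+
+# Illustrative usage (KEGG /find/<database>/<query>):
+#   KEGGTool({"name": "kegg_find"}).run({"query": "glycolysis", "database": "pathway"})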
diff --git a/src/tooluniverse/logging_config.py b/src/tooluniverse/logging_config.py
index 6659f6a1..3cbc36ab 100644
--- a/src/tooluniverse/logging_config.py
+++ b/src/tooluniverse/logging_config.py
@@ -45,12 +45,12 @@ class ToolUniverseFormatter(logging.Formatter):
-    # Emoji prefixes for different log levels
+    # Plain-text prefixes for different log levels
EMOJI_PREFIX = {
- "DEBUG": "🔧 ",
- "INFO": "ℹ️ ",
- "PROGRESS": "⏳ ",
- "WARNING": "⚠️ ",
- "ERROR": "❌ ",
- "CRITICAL": "🚨 ",
+ "DEBUG": "[DEBUG] ",
+ "INFO": "[INFO] ",
+ "PROGRESS": "[PROGRESS] ",
+ "WARNING": "[WARN] ",
+ "ERROR": "[ERROR] ",
+ "CRITICAL": "[CRITICAL] ",
}
def format(self, record):
diff --git a/src/tooluniverse/marine_species_tool.py b/src/tooluniverse/marine_species_tool.py
new file mode 100644
index 00000000..16058d28
--- /dev/null
+++ b/src/tooluniverse/marine_species_tool.py
@@ -0,0 +1,62 @@
+from urllib.parse import quote
+
+import requests
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+WORMS_BASE_URL = "https://www.marinespecies.org/rest"
+REQUEST_TIMEOUT = 30
+
+
+@register_tool("MarineSpeciesTool")
+class MarineSpeciesTool(BaseTool):
+ """
+ Wrapper for the World Register of Marine Species (WoRMS) REST API.
+ """
+
+ def __init__(self, tool_config):
+ super().__init__(tool_config)
+ self.session = requests.Session()
+
+ def run(self, arguments):
+ name = (arguments or {}).get("scientific_name") or (arguments or {}).get(
+ "name"
+ )
+ if not name:
+ return {"error": "Missing required parameter: scientific_name"}
+
+        like = (arguments or {}).get("like")
+        if like is None:
+            like = self.tool_config.get("like", True)
+        marine_only = (arguments or {}).get("marine_only")
+        if marine_only is None:
+            marine_only = self.tool_config.get("marine_only", True)
+
+        params = {
+            "like": "true" if like else "false",
+            "marine_only": "true" if marine_only else "false",
+        }
+
+ endpoint = f"{WORMS_BASE_URL}/AphiaRecordsByName/{quote(name)}"
+ response = self.session.get(endpoint, params=params, timeout=REQUEST_TIMEOUT)
+ response.raise_for_status()
+ payload = response.json() or []
+
+ results = []
+ for item in payload:
+ results.append(
+ {
+ "AphiaID": item.get("AphiaID"),
+ "scientificname": item.get("scientificname"),
+ "rank": item.get("rank"),
+ "status": item.get("status"),
+ "match_type": item.get("match_type"),
+ }
+ )
+
+ return results
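+
+
+# Illustrative usage:
+#   MarineSpeciesTool({"name": "worms_lookup"}).run({"scientific_name": "Orcinus orca"})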
diff --git a/src/tooluniverse/medlog_tool.py b/src/tooluniverse/medlog_tool.py
new file mode 100644
index 00000000..d375a903
--- /dev/null
+++ b/src/tooluniverse/medlog_tool.py
@@ -0,0 +1,143 @@
+"""
+MedLog integration tools.
+
+These tools expose MedLog collector and FHIR linkage capabilities as native
+ToolUniverse tools for event ingestion, querying, and audit retrieval.
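+
+Example (a sketch mirroring tests/integration/test_medtok_medlog_tools.py; the
+payload values are illustrative and assume a collector is reachable via
+MEDLOG_COLLECTOR_BASE_URL)::
+
+    tu = ToolUniverse(hooks_enabled=False)
+    tu.load_tools(tool_type=["medlog"])
+    tu.tools.MedLog_init_event(
+        header={"event_id": "evt-1", "run_id": "run-123"},
+        model_instance={"model": "demo", "version": "1.0"},
+        user_identity={"name": "Dr. Example"},
+    )
+    tu.tools.MedLog_append_fragment(
+        event_id="evt-1", fragment={"outputs": {"summary": "Patient stable"}}
+    )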
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict
+
+import requests
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+
+class _MedLogBaseTool(BaseTool):
+ """Shared utility methods for MedLog REST integration."""
+
+ DEFAULT_BASE_URL = "http://localhost:7001"
+
+ def __init__(self, tool_config: Dict[str, Any]):
+ super().__init__(tool_config)
+ self.base_url = os.getenv(
+ "MEDLOG_COLLECTOR_BASE_URL", self.DEFAULT_BASE_URL
+ ).rstrip("/")
+ self.session = requests.Session()
+
+ def _post(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
+ url = f"{self.base_url}{path}"
+ try:
+ response = self.session.post(url, json=payload, timeout=30)
+ response.raise_for_status()
+ return response.json()
+ except requests.RequestException as exc: # pragma: no cover - network errors
+ return {"error": f"MedLog collector request failed: {exc}", "endpoint": url}
+
+ def _get(self, path: str) -> Dict[str, Any]:
+ url = f"{self.base_url}{path}"
+ try:
+ response = self.session.get(url, timeout=30)
+ response.raise_for_status()
+ return response.json()
+ except requests.RequestException as exc: # pragma: no cover - network errors
+ return {"error": f"MedLog collector request failed: {exc}", "endpoint": url}
+
+
+class _MedLogFHIRBaseTool(BaseTool):
+ """Shared logic for interacting with the MedLog FHIR linkage service."""
+
+ DEFAULT_FHIR_URL = "http://localhost:7003"
+
+ def __init__(self, tool_config: Dict[str, Any]):
+ super().__init__(tool_config)
+ self.fhir_base = os.getenv(
+ "MEDLOG_FHIR_BASE_URL", self.DEFAULT_FHIR_URL
+ ).rstrip("/")
+ self.session = requests.Session()
+
+ def _get(self, path: str) -> Dict[str, Any]:
+ url = f"{self.fhir_base}{path}"
+ try:
+ response = self.session.get(url, timeout=30)
+ response.raise_for_status()
+ return response.json()
+ except requests.RequestException as exc: # pragma: no cover - network errors
+ return {"error": f"MedLog FHIR request failed: {exc}", "endpoint": url}
+
+
+@register_tool("MedLogInitEventTool")
+class MedLogInitEventTool(_MedLogBaseTool):
+ """Create or update a MedLog event record."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ return self._post("/medlog/events/init", arguments)
+
+
+@register_tool("MedLogAppendFragmentTool")
+class MedLogAppendFragmentTool(_MedLogBaseTool):
+ """Append fragment data (artifacts, outputs, feedback) to an event."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ event_id = arguments.get("event_id")
+ fragment = arguments.get("fragment", {})
+ if not event_id:
+ return {"error": "Parameter 'event_id' is required."}
+ return self._post(f"/medlog/events/{event_id}/append", fragment)
+
+
+@register_tool("MedLogGetProvenanceTool")
+class MedLogGetProvenanceTool(_MedLogBaseTool):
+ """Retrieve PROV-JSON bundle for a specific event."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ event_id = arguments.get("event_id")
+ if not event_id:
+ return {"error": "Parameter 'event_id' is required."}
+ return self._get(f"/medlog/events/{event_id}/prov")
+
+
+@register_tool("MedLogQueryEventsTool")
+class MedLogQueryEventsTool(_MedLogBaseTool):
+ """Query MedLog events by run_id or event_id."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ payload = {
+ "run_id": arguments.get("run_id"),
+ "event_id": arguments.get("event_id"),
+ "limit": arguments.get("limit", 50),
+ }
+ return self._post("/query", payload)
+
+
+@register_tool("MedLogExportParquetTool")
+class MedLogExportParquetTool(_MedLogBaseTool):
+ """Trigger a parquet export of MedLog events."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ return self._post("/export/parquet", {})
+
+
+@register_tool("MedLogFHIRBundleTool")
+class MedLogFHIRBundleTool(_MedLogFHIRBaseTool):
+ """Fetch FHIR bundle for a specific event."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ event_id = arguments.get("event_id")
+ if not event_id:
+ return {"error": "Parameter 'event_id' is required."}
+ return self._get(f"/bundle/{event_id}")
+
+
+@register_tool("MedLogFHIRRunBundleTool")
+class MedLogFHIRRunBundleTool(_MedLogFHIRBaseTool):
+ """Fetch FHIR bundle aggregating all events in a run."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ run_id = arguments.get("run_id")
+ if not run_id:
+ return {"error": "Parameter 'run_id' is required."}
+ return self._get(f"/bundle/run/{run_id}")
diff --git a/src/tooluniverse/medtok_tool.py b/src/tooluniverse/medtok_tool.py
new file mode 100644
index 00000000..1bd4042f
--- /dev/null
+++ b/src/tooluniverse/medtok_tool.py
@@ -0,0 +1,122 @@
+"""
+MedTok integration tools.
+
+These tools provide a thin wrapper around the MedTok FastAPI service so that
+ToolUniverse users can tokenize, embed, and explore medical codes directly
+from the unified tool catalog.
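+
+Example (a sketch mirroring tests/integration/test_medtok_medlog_tools.py;
+assumes a MedTok service is reachable via MEDTOK_BASE_URL)::
+
+    tu = ToolUniverse(hooks_enabled=False)
+    tu.load_tools(tool_type=["medtok"])
+    tokens = tu.tools.MedTok_tokenize(
+        codes=["A00", "E11"], system="ICD-10", include_metadata=True
+    )
+    neighbors = tu.tools.MedTok_nearest_neighbors(code="A00", k=3)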
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict
+
+import requests
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+
+class _MedTokBaseTool(BaseTool):
+ """Shared utilities for MedTok REST integrations."""
+
+ DEFAULT_BASE_URL = "http://localhost:8000"
+
+ def __init__(self, tool_config: Dict[str, Any]):
+ super().__init__(tool_config)
+ self.base_url = os.getenv("MEDTOK_BASE_URL", self.DEFAULT_BASE_URL).rstrip("/")
+ self.session = requests.Session()
+
+ def _post(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
+ url = f"{self.base_url}{path}"
+ try:
+ response = self.session.post(url, json=payload, timeout=30)
+ response.raise_for_status()
+ return response.json()
+ except requests.RequestException as exc: # pragma: no cover - network errors
+ return {"error": f"MedTok request failed: {exc}", "endpoint": url}
+
+ def _get(self, path: str) -> Dict[str, Any]:
+ url = f"{self.base_url}{path}"
+ try:
+ response = self.session.get(url, timeout=30)
+ response.raise_for_status()
+ return response.json()
+ except requests.RequestException as exc: # pragma: no cover - network errors
+ return {"error": f"MedTok request failed: {exc}", "endpoint": url}
+
+
+@register_tool("MedTokTokenizeTool")
+class MedTokTokenizeTool(_MedTokBaseTool):
+ """Tokenize medical codes using MedTok multimodal tokenizer."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ payload = {
+ "codes": arguments.get("codes", []),
+ "system": arguments.get("system", "ICD-10"),
+ "include_metadata": arguments.get("include_metadata", False),
+ }
+ return self._post("/tokenize", payload)
+
+
+@register_tool("MedTokEmbedTool")
+class MedTokEmbedTool(_MedTokBaseTool):
+ """Generate token embeddings for a batch of codes."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ payload = {
+ "codes": arguments.get("codes", []),
+ "system": arguments.get("system", "ICD-10"),
+ }
+ return self._post("/embed", payload)
+
+
+@register_tool("MedTokNearestNeighborsTool")
+class MedTokNearestNeighborsTool(_MedTokBaseTool):
+ """Retrieve nearest neighbours for a code in embedding space."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ payload = {
+ "code": arguments.get("code"),
+ "k": arguments.get("k", 5),
+ "system": arguments.get("system", "ICD-10"),
+ }
+ return self._post("/nearest_neighbors", payload)
+
+
+@register_tool("MedTokMapTextTool")
+class MedTokMapTextTool(_MedTokBaseTool):
+ """Map free-text description to the closest medical code."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ payload = {
+ "text": arguments.get("text", ""),
+ "system": arguments.get("system", "ICD-10"),
+ }
+ return self._post("/map_text_to_code", payload)
+
+
+@register_tool("MedTokSearchTextTool")
+class MedTokSearchTextTool(_MedTokBaseTool):
+ """Perform text and semantic search across the code vocabulary."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ payload = {
+ "text": arguments.get("text", ""),
+ "system": arguments.get("system"),
+ "k": arguments.get("k", 5),
+ }
+ return self._post("/search_text", payload)
+
+
+@register_tool("MedTokCodeInfoTool")
+class MedTokCodeInfoTool(_MedTokBaseTool):
+ """Fetch detailed metadata for a specific code."""
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ system = arguments.get("system", "ICD-10")
+ code = arguments.get("code")
+ if not code:
+ return {"error": "Parameter 'code' is required."}
+ path = f"/codes/{system}/{code}"
+ return self._get(path)
diff --git a/src/tooluniverse/phenome_jax_tool.py b/src/tooluniverse/phenome_jax_tool.py
new file mode 100644
index 00000000..353f476a
--- /dev/null
+++ b/src/tooluniverse/phenome_jax_tool.py
@@ -0,0 +1,54 @@
+import requests
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+PHENOME_JAX_BASE_URL = "https://phenome.jax.org/api"
+REQUEST_TIMEOUT = 30
+
+
+@register_tool("PhenomeJaxTool")
+class PhenomeJaxTool(BaseTool):
+ """
+ Wrapper around the Mouse Phenome Database (MPD) API for project searches.
+ """
+
+ def __init__(self, tool_config):
+ super().__init__(tool_config)
+ self.session = requests.Session()
+
+ def run(self, arguments):
+ keyword = (arguments or {}).get("keyword") or (arguments or {}).get("query")
+        try:
+            limit = int(
+                (arguments or {}).get("limit") or self.tool_config.get("limit", 20)
+            )
+        except (TypeError, ValueError):
+            limit = 20
+
+ params = {"limit": max(limit, 1)}
+ if keyword:
+ params["keyword"] = keyword
+
+ response = self.session.get(
+ f"{PHENOME_JAX_BASE_URL}/projects",
+ params=params,
+ timeout=REQUEST_TIMEOUT,
+ )
+ response.raise_for_status()
+ payload = response.json()
+
+ projects = []
+ for item in payload.get("projects", []):
+ projects.append(
+ {
+ "projid": item.get("projid"),
+ "title": item.get("title"),
+ "mpdsector": item.get("mpdsector"),
+ "species": item.get("species"),
+ "status": item.get("status"),
+ "releasedate": item.get("releasedate"),
+ }
+ )
+
+ return {
+ "count": payload.get("count", len(projects)),
+ "projects": projects[: params["limit"]],
+ }
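+
+
+# Example (a sketch; the tool name and keyword mirror tests/unit/test_biodomain_tools.py):
+#   tool = PhenomeJaxTool({"name": "PhenomeJax_list_projects"})
+#   result = tool.run({"keyword": "glucose", "limit": 1})
+#   # result -> {"count": 2, "projects": [{"projid": 1, "title": "Glucose tolerance", ...}]}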
diff --git a/src/tooluniverse/tool_navigator_tool.py b/src/tooluniverse/tool_navigator_tool.py
new file mode 100644
index 00000000..1341dd98
--- /dev/null
+++ b/src/tooluniverse/tool_navigator_tool.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from .execute_function import ToolUniverse
+from .tool_registry import register_tool
+from .vsd_registry import load_catalog
+
+
+def _tokenize(text: str) -> List[str]:
+ return [t for t in (text or "").lower().split() if t]
+
+
+def _score(query_tokens: List[str], name: str, description: str) -> float:
+    haystack = f"{name} {description}".lower()
+    words = haystack.split()
+    score = 0.0
+    for token in query_tokens:
+        # Substring matches weigh more than word-prefix matches.
+        if token in haystack:
+            score += 2.0
+        if any(word.startswith(token) for word in words):
+            score += 1.0
+    return score
+
+
+def _format_tool(tool: Dict[str, Any]) -> Dict[str, Any]:
+ return {
+ "name": tool.get("name"),
+ "type": tool.get("type"),
+ "description": tool.get("description"),
+ "tool_type": tool.get("tool_type"),
+ "category": tool.get("category"),
+ "source": tool.get("source"),
+ }
+
+
+@register_tool("ToolNavigatorTool")
+class ToolNavigatorTool:
+ """
+ Search ToolUniverse's catalog (built-in + VSD) to help agents discover relevant tools.
+ """
+
+ name = "ToolNavigatorTool"
+ description = "Search ToolUniverse/Navigated catalog for tools matching a query."
+ input_schema = {
+ "type": "object",
+ "properties": {
+ "query": {"type": "string"},
+ "limit": {"type": "integer", "default": 10, "minimum": 1, "maximum": 50},
+ "categories": {
+ "type": "array",
+ "items": {"type": "string"},
+ "description": "Optional list of categories to include.",
+ },
+ "include_vsd": {
+ "type": "boolean",
+ "default": True,
+ "description": "Include dynamically registered VSD tools in the search.",
+ },
+ },
+ "required": ["query"],
+ "additionalProperties": False,
+ }
+
+ def __init__(self) -> None:
+ self._tooluniverse = ToolUniverse()
+
+ def _load_base_tools(self) -> List[Dict[str, Any]]:
+ if not getattr(self._tooluniverse, "all_tools", None):
+ self._tooluniverse.load_tools()
+ return list(getattr(self._tooluniverse, "all_tools", []))
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ query = (arguments.get("query") or "").strip()
+ if not query:
+ return {"ok": False, "error": "query is required"}
+
+ limit = int(arguments.get("limit") or 10)
+ include_vsd = bool(arguments.get("include_vsd", True))
+ categories = arguments.get("categories")
+ if categories and not isinstance(categories, list):
+ categories = [categories]
+ categories = [c.lower() for c in categories or []]
+
+ tools = self._load_base_tools()
+ if include_vsd:
+ for cfg in load_catalog().values():
+ tools.append(
+ {
+ "name": cfg.get("name"),
+ "type": "DynamicREST",
+ "description": (cfg.get("metadata") or {}).get("description"),
+ "tool_type": "dynamic_rest",
+ "category": "vsd",
+ "source": (cfg.get("metadata") or {}).get("source"),
+ }
+ )
+
+ query_tokens = _tokenize(query)
+ scored: List[tuple[float, Dict[str, Any]]] = []
+ for tool in tools:
+ if categories and (tool.get("category") or "").lower() not in categories:
+ continue
+ score = _score(query_tokens, tool.get("name", ""), tool.get("description", ""))
+ if score > 0:
+ scored.append((score, tool))
+
+ scored.sort(key=lambda item: item[0], reverse=True)
+        best = [
+            _format_tool(tool) | {"score": round(score, 3)}
+            for score, tool in scored[:limit]
+        ]
+
+ return {"ok": True, "query": query, "results": best, "total": len(scored)}
diff --git a/src/tooluniverse/tool_registry.py b/src/tooluniverse/tool_registry.py
index eb3b893f..c3f5d141 100644
--- a/src/tooluniverse/tool_registry.py
+++ b/src/tooluniverse/tool_registry.py
@@ -446,3 +446,18 @@ def get_tool_class_lazy(tool_name):
return _tool_registry.get(tool_name)
return None
+
+# --- VSD / compatibility shims ---
+def get_tool_class(name: str):
+ """
+ Backwards-compatible accessor used by scripts like SampleVDSRun.py.
+ Prefer get_tool_class_lazy(name) internally.
+ """
+ return get_tool_class_lazy(name)
+
+class _RegistryShim:
+ def get_tool_class(self, name: str):
+ return get_tool_class_lazy(name)
+
+# Expose a 'registry' object with get_tool_class for callers that expect one.
+registry = _RegistryShim()
\ No newline at end of file
diff --git a/src/tooluniverse/utils.py b/src/tooluniverse/utils.py
index 88e778fb..ff4867e2 100755
--- a/src/tooluniverse/utils.py
+++ b/src/tooluniverse/utils.py
@@ -136,7 +136,7 @@ def read_json_list(file_path):
Returns
list: A list of dictionaries containing the JSON objects.
"""
- with open(file_path, "r") as file:
+ with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)
return data
diff --git a/src/tooluniverse/vsd_api_tool.py b/src/tooluniverse/vsd_api_tool.py
new file mode 100644
index 00000000..84a5c525
--- /dev/null
+++ b/src/tooluniverse/vsd_api_tool.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+import os
+import json
+from typing import Dict, Any
+
+from .base_tool import BaseTool
+from .tool_registry import register_tool
+
+# Reuse same storage locations as vsd_tool
+VSD_HOME = os.environ.get("TOOLUNIVERSE_VSD_DIR", os.path.expanduser("~/.tooluniverse/vsd"))
+GENERATED_TOOLS_PATH = os.path.join(VSD_HOME, "generated_tools.json")
+
+os.makedirs(VSD_HOME, exist_ok=True)
+
+
+def _save_tool(tool_spec: Dict[str, Any]) -> None:
+ """Upsert a generated tool spec into the registry file."""
+ tools: list[Dict[str, Any]] = []
+ if os.path.exists(GENERATED_TOOLS_PATH):
+ try:
+ with open(GENERATED_TOOLS_PATH, "r", encoding="utf-8") as f:
+ tools = json.load(f)
+ except Exception:
+ tools = []
+ by_name = {t.get("name"): t for t in tools}
+ by_name[tool_spec.get("name")] = tool_spec
+ with open(GENERATED_TOOLS_PATH, "w", encoding="utf-8") as f:
+ json.dump(list(by_name.values()), f, indent=2)
+
+
+@register_tool("VSDToolBuilder")
+class VSDToolBuilder(BaseTool):
+ """
+ Build and register a usable ToolUniverse tool from a harvested or discovered VSD candidate.
+
+ Input:
+ {
+ "candidate": {
+ "domain": "clinicaltrials.gov",
+ "endpoint": "https://clinicaltrials.gov/api/v2/studies",
+ "license": "CC0",
+ "score": 0.92
+ },
+ "tool_name": "clinicaltrials_search",
+ "description": "Query clinical trials with disease/condition filters",
+ "parameter_overrides": { ... optional JSON Schema ... }
+ }
+
+ Output:
+ {
+ "registered": true,
+ "name": "clinicaltrials_search",
+ "config_path": "/path/to/generated_tools.json"
+ }
+ """
+
+ def run(self, arguments: Dict[str, Any]):
+ if not arguments:
+ return {"error": "Missing arguments"}
+ cand = arguments.get("candidate") or {}
+ tool_name = arguments.get("tool_name")
+ desc = arguments.get("description") or f"VSD tool for {cand.get('domain')}"
+ param_override = arguments.get("parameter_overrides") or {}
+
+ if not tool_name:
+ return {"error": "tool_name is required"}
+ if not cand or not cand.get("endpoint"):
+ return {"error": "candidate with endpoint is required"}
+
+ endpoint = cand.get("endpoint")
+ domain = cand.get("domain", "unknown")
+
+ # Pick implementation type
+        if "graphql" in endpoint:
+ impl_type = "GenericGraphQLTool"
+ elif endpoint.startswith("http"):
+ impl_type = "GenericRESTTool"
+ else:
+ impl_type = "URLHTMLTagTool"
+
+ # Default parameter schema (can be overridden)
+ params = param_override or {
+ "type": "object",
+ "properties": {
+ "query": {"type": "string", "default": ""},
+ "pageSize": {"type": "integer", "default": 10},
+ }
+ }
+
+ tool_spec = {
+ "type": impl_type,
+ "name": tool_name,
+ "description": desc,
+ "fields": {
+ "base_url": endpoint,
+ "method": "GET",
+ "default_params": {}
+ },
+ "parameter": params,
+ "label": ["VSD", cand.get("label") or domain],
+ "vsd": {
+ "domain": domain,
+ "endpoint": endpoint,
+ "license": cand.get("license", "unknown"),
+ "score": cand.get("score"),
+ "registry": cand.get("registry", "catalog"),
+ }
+ }
+
+ # Special case: ClinicalTrials.gov -> add arg_transform
+ if "clinicaltrials.gov" in endpoint and impl_type == "GenericRESTTool":
+ tool_spec["vsd"]["arg_transform"] = "ctgov_time_window"
+
+ _save_tool(tool_spec)
+ return {"registered": True, "name": tool_name, "config_path": GENERATED_TOOLS_PATH}
diff --git a/src/tooluniverse/vsd_catalog.py b/src/tooluniverse/vsd_catalog.py
new file mode 100644
index 00000000..95ec1269
--- /dev/null
+++ b/src/tooluniverse/vsd_catalog.py
@@ -0,0 +1,44 @@
+# src/tooluniverse/vsd_catalog.py
+import json
+import os
+from pathlib import Path
+from typing import List, Dict, Any
+
+VSD_DIR = Path(os.environ.get("TOOLUNIVERSE_VSD_DIR", Path.home() / ".tooluniverse" / "vsd"))
+ALLOWLIST_PATH = VSD_DIR / "allowlist.json"
+CATALOG_PATH = VSD_DIR / "catalog" / "vsd_catalog_candidates.json"
+
+def load_json(path: Path) -> Any:
+ if not path.exists():
+ return None
+ try:
+ return json.loads(path.read_text(encoding="utf-8"))
+ except Exception:
+ return None
+
+def load_allowlist(seed: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+ user = load_json(ALLOWLIST_PATH) or []
+ merged = {e["domain"]: e for e in seed}
+ for e in user:
+ merged[e["domain"]] = {**merged.get(e["domain"], {}), **e}
+ return list(merged.values())
+
+def load_catalog_candidates() -> List[Dict[str, Any]]:
+ data = load_json(CATALOG_PATH) or []
+    # Normalize minimal fields; keep candidate/approved (or unlabeled) entries only.
+ out = []
+ for d in data:
+ if d.get("status") not in (None, "candidate", "approved"):
+ continue
+ out.append({
+ "domain": d.get("domain"),
+ "label": d.get("label") or d.get("domain"),
+ "registry": d.get("registry") or "data.gov",
+ "endpoint": d.get("endpoint"),
+ "license": d.get("license") or "unknown",
+ "trust": float(d.get("trust") or 0.7),
+ "freshness": d.get("freshness") or "",
+ "api_kind": d.get("api_kind") or "rest",
+ "status": d.get("status") or "candidate",
+ "tags": d.get("tags") or [],
+ })
+ return out
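+
+
+# Example (a sketch; assumes a harvested catalog file exists at CATALOG_PATH):
+#   for cand in load_catalog_candidates():
+#       print(cand["domain"], cand["api_kind"], cand["trust"])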
diff --git a/src/tooluniverse/vsd_registry.py b/src/tooluniverse/vsd_registry.py
new file mode 100644
index 00000000..83b237f6
--- /dev/null
+++ b/src/tooluniverse/vsd_registry.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from typing import Any, Dict
+
+from .common_utils import read_json, write_json, vsd_generated_path
+
+
+def _normalize_catalog(data: Any) -> Dict[str, Dict[str, Any]]:
+ catalog: Dict[str, Dict[str, Any]] = {}
+ if not isinstance(data, dict):
+ return catalog
+
+ generated = data.get("generated_tools") if isinstance(data.get("generated_tools"), list) else None
+ if generated is not None:
+ for item in generated:
+ if isinstance(item, dict) and item.get("name"):
+ name = item["name"]
+ catalog[name] = dict(item)
+ return catalog
+
+ for name, cfg in data.items():
+ if not isinstance(cfg, dict):
+ continue
+ entry = dict(cfg)
+ entry.setdefault("name", name)
+ catalog[name] = entry
+ return catalog
+
+
+def load_catalog() -> Dict[str, Dict[str, Any]]:
+ """
+ Load the Verified Source catalog from disk and normalize it
+ to a {name: config} dictionary regardless of historical format.
+ """
+ path = vsd_generated_path()
+ data = read_json(path, {})
+ return _normalize_catalog(data)
+
+
+def save_catalog(catalog: Dict[str, Dict[str, Any]]) -> str:
+ """
+ Persist the catalog to disk as a flat {name: config} mapping.
+ Returns the file path for convenience.
+ """
+ path = vsd_generated_path()
+ # ensure each entry has its name
+ serializable = {name: dict(cfg, name=name) for name, cfg in catalog.items()}
+ write_json(path, serializable)
+ return path
+
+
+def upsert_tool(tool_name: str, cfg: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Insert or update a tool configuration in the catalog and propagate the
+ change to any in-process dynamic registries.
+ """
+ catalog = load_catalog()
+ config = dict(cfg)
+ config.setdefault("name", tool_name)
+ catalog[tool_name] = config
+ save_catalog(catalog)
+
+ # Notify dynamic REST runner (best-effort, optional import)
+ try:
+ from .dynamic_rest_runner import upsert_generated_tool # type: ignore
+
+ upsert_generated_tool(tool_name, config)
+ except Exception:
+ pass
+
+ return config
+
+
+def remove_tool(tool_name: str) -> bool:
+ """
+ Remove a tool from the catalog. Returns True if a tool was removed.
+ """
+ catalog = load_catalog()
+ if tool_name not in catalog:
+ return False
+ del catalog[tool_name]
+ save_catalog(catalog)
+
+ try:
+ from .dynamic_rest_runner import remove_generated_tool # type: ignore
+
+ remove_generated_tool(tool_name)
+ except Exception:
+ pass
+
+ return True
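+
+
+# Example (a sketch; the tool name and config are illustrative):
+#   cfg = upsert_tool(
+#       "demo_api",
+#       {"type": "GenericRESTTool", "fields": {"base_url": "https://example.org"}},
+#   )
+#   assert "demo_api" in load_catalog()
+#   remove_tool("demo_api")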
diff --git a/src/tooluniverse/vsd_tool.py b/src/tooluniverse/vsd_tool.py
new file mode 100644
index 00000000..98a09e24
--- /dev/null
+++ b/src/tooluniverse/vsd_tool.py
@@ -0,0 +1,246 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Optional, List
+from urllib.parse import urlparse
+
+from .tool_registry import register_tool
+from .vsd_registry import load_catalog, save_catalog, upsert_tool
+from .dynamic_rest_runner import refresh_generated_registry, remove_generated_tool
+from .vsd_utils import build_config, probe_config, stamp_metadata
+from .harvest.static_catalog import harvest as harvest_static
+
+GENERIC_HARVEST_SCHEMA = {
+ "type": "object",
+ "properties": {
+ "query": {
+ "type": "string",
+ "description": "Free-text search term passed to the harvest catalog.",
+ },
+ "limit": {
+ "type": "integer",
+ "minimum": 1,
+ "maximum": 50,
+ "default": 5,
+ "description": "Maximum number of candidates to return.",
+ },
+ "urls": {
+ "type": "array",
+ "items": {"type": "string", "format": "uri"},
+ "description": "Optional explicit URLs to wrap as manual candidates (skips catalog search).",
+ },
+ },
+ "additionalProperties": False,
+}
+
+GENERIC_HARVEST_CONFIG = {
+ "name": "GenericHarvestTool",
+ "description": "Search the harvest catalog (or wrap manual URLs) to produce candidate API endpoints.",
+ "type": "GenericHarvestTool",
+ "category": "special_tools",
+ "parameter": GENERIC_HARVEST_SCHEMA,
+}
+
+VERIFIED_SOURCE_REGISTER_SCHEMA = {
+ "type": "object",
+ "properties": {
+ "tool_name": {"type": "string"},
+ "tool_type": {"type": "string", "default": "dynamic_rest"},
+ "candidate": {"type": "object"},
+ "default_params": {"type": "object"},
+ "default_headers": {"type": "object"},
+ "force": {"type": "boolean", "default": False},
+ },
+ "required": ["tool_name", "candidate"],
+}
+
+VERIFIED_SOURCE_REGISTER_CONFIG = {
+ "name": "VerifiedSourceRegisterTool",
+ "description": "Register a DynamicREST tool into the verified-source catalog after probing it.",
+ "type": "VerifiedSourceRegisterTool",
+ "category": "special_tools",
+ "parameter": VERIFIED_SOURCE_REGISTER_SCHEMA,
+}
+
+VERIFIED_SOURCE_DISCOVERY_CONFIG = {
+ "name": "VerifiedSourceDiscoveryTool",
+ "description": "List the tools currently stored in the verified-source catalog.",
+ "type": "VerifiedSourceDiscoveryTool",
+ "category": "special_tools",
+ "parameter": {
+ "type": "object",
+ "properties": {},
+ "additionalProperties": False,
+ },
+}
+
+VERIFIED_SOURCE_REMOVE_SCHEMA = {
+ "type": "object",
+ "properties": {
+ "tool_name": {"type": "string"},
+ },
+ "required": ["tool_name"],
+}
+
+VERIFIED_SOURCE_REMOVE_CONFIG = {
+ "name": "VerifiedSourceRemoveTool",
+ "description": "Remove a generated tool from the verified-source catalog.",
+ "type": "VerifiedSourceRemoveTool",
+ "category": "special_tools",
+ "parameter": VERIFIED_SOURCE_REMOVE_SCHEMA,
+}
+
+
+@register_tool("GenericHarvestTool", config=GENERIC_HARVEST_CONFIG)
+class GenericHarvestTool:
+ name = "GenericHarvestTool"
+ description = "Harvest candidate API endpoints from the static catalog or wrap manual URLs."
+ input_schema = GENERIC_HARVEST_SCHEMA
+
+ def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None:
+ self.tool_config = tool_config or {}
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ query = (arguments.get("query") or "").strip()
+ limit_value = arguments.get("limit", 5)
+ try:
+ limit = int(limit_value)
+ except (TypeError, ValueError):
+ limit = 5
+ limit = max(1, min(limit, 50))
+ urls = arguments.get("urls") or []
+
+ candidates: List[Dict[str, Any]] = []
+
+ if urls:
+ for idx, raw_url in enumerate(urls):
+ if not raw_url:
+ continue
+ parsed = urlparse(str(raw_url))
+ host = parsed.netloc.lower()
+                base_url = (
+                    f"{parsed.scheme}://{parsed.netloc}"
+                    if parsed.scheme and parsed.netloc
+                    else raw_url
+                )
+ name = host or f"manual_candidate_{idx + 1}"
+ candidates.append(
+ {
+ "name": name,
+ "endpoint": raw_url,
+ "url": raw_url,
+ "base_url": base_url,
+ "host": host,
+ "source": "manual_urls",
+ "description": arguments.get("description") or "",
+ "trust": 0.5,
+ "health": {"ok": None, "status": None, "checked": "manual"},
+ }
+ )
+ else:
+ extra_args = {k: v for k, v in arguments.items() if k not in {"query", "limit", "urls"}}
+ candidates = harvest_static(query=query, limit=limit, **extra_args)
+
+ return {
+ "ok": True,
+ "query": query,
+ "count": len(candidates),
+ "candidates": candidates,
+ }
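+
+# Example (a sketch; the URL is illustrative and bypasses the catalog search):
+#   harvester = GenericHarvestTool()
+#   harvester.run({"urls": ["https://api.fda.gov/drug/label.json"]})
+#   # -> {"ok": True, "count": 1, "candidates": [{"host": "api.fda.gov", ...}]}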
+
+
+@register_tool("VerifiedSourceRegisterTool", config=VERIFIED_SOURCE_REGISTER_CONFIG)
+class VerifiedSourceRegisterTool:
+ name = "VerifiedSourceRegisterTool"
+ description = "Register a DynamicREST tool in the verified-source directory"
+ input_schema = VERIFIED_SOURCE_REGISTER_SCHEMA
+
+ def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None:
+ self.tool_config = tool_config or {}
+
+ def __call__(
+ self,
+ tool_name: str,
+ candidate: Dict[str, Any],
+ tool_type: str = "dynamic_rest",
+ default_params: Dict[str, Any] | None = None,
+ default_headers: Dict[str, Any] | None = None,
+ force: bool = False,
+ ) -> Dict[str, Any]:
+ if not tool_name:
+ raise ValueError("tool_name is required")
+
+ cfg = build_config(
+ candidate or {},
+ tool_type=tool_type,
+ default_params=default_params,
+ default_headers=default_headers,
+ )
+
+ probe = probe_config(cfg)
+ stamp_metadata(cfg, probe)
+
+ if not probe.get("ok") and not force:
+ return {
+ "registered": False,
+ "name": tool_name,
+ "error": "Endpoint validation failed",
+ "test": probe,
+ "suggestion": "Provide default_params/default_headers or retry with force=True after ensuring credentials.",
+ }
+
+ cfg = upsert_tool(tool_name, cfg)
+ refresh_generated_registry()
+
+ return {"registered": True, "name": tool_name, "config": cfg}
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ return self.__call__(
+ tool_name=arguments.get("tool_name"),
+ candidate=arguments.get("candidate", {}),
+ tool_type=arguments.get("tool_type", "dynamic_rest"),
+ default_params=arguments.get("default_params"),
+ default_headers=arguments.get("default_headers"),
+ force=bool(arguments.get("force")),
+ )
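+
+# Example (a sketch; the candidate fields are illustrative; note that run()
+# probes the endpoint before registering):
+#   VerifiedSourceRegisterTool().run({
+#       "tool_name": "fda_drug_labels",
+#       "candidate": {"endpoint": "https://api.fda.gov/drug/label.json",
+#                     "host": "api.fda.gov"},
+#   })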
+
+
+@register_tool("VerifiedSourceDiscoveryTool", config=VERIFIED_SOURCE_DISCOVERY_CONFIG)
+class VerifiedSourceDiscoveryTool:
+ name = "VerifiedSourceDiscoveryTool"
+ description = "Return the Verified-Source catalog."
+
+ def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None:
+ self.tool_config = tool_config or {}
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ catalog = load_catalog()
+ return {"ok": True, "tools": list(catalog.values())}
+
+
+@register_tool("VerifiedSourceRemoveTool", config=VERIFIED_SOURCE_REMOVE_CONFIG)
+class VerifiedSourceRemoveTool:
+ name = "VerifiedSourceRemoveTool"
+ description = "Remove a generated tool from the Verified-Source catalog."
+ input_schema = VERIFIED_SOURCE_REMOVE_SCHEMA
+
+ def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None:
+ self.tool_config = tool_config or {}
+
+ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+ tool_name = arguments.get("tool_name")
+ if not tool_name:
+ return {"removed": False, "error": "tool_name is required"}
+ catalog = load_catalog()
+ if tool_name not in catalog:
+ return {"removed": False, "error": f"Unknown tool '{tool_name}'"}
+ del catalog[tool_name]
+ save_catalog(catalog)
+ remove_generated_tool(tool_name)
+ return {"removed": True, "name": tool_name}
+
+
+def register(server):
+    # The tool classes are already registered with ToolUniverse via their
+    # @register_tool decorators above; here we only attach instances to the server.
+    server.add_tool(VerifiedSourceRegisterTool.name, VerifiedSourceRegisterTool())
+    server.add_tool(VerifiedSourceDiscoveryTool.name, VerifiedSourceDiscoveryTool())
+    server.add_tool(VerifiedSourceRemoveTool.name, VerifiedSourceRemoveTool())
+    refresh_generated_registry()
diff --git a/src/tooluniverse/vsd_utils.py b/src/tooluniverse/vsd_utils.py
new file mode 100644
index 00000000..3f3250e7
--- /dev/null
+++ b/src/tooluniverse/vsd_utils.py
@@ -0,0 +1,248 @@
+from __future__ import annotations
+
+import time
+from copy import deepcopy
+from typing import Any, Dict
+
+import requests
+
+# ------------------------------------------------------------------------------
+# Host-specific overrides and requirements
+# ------------------------------------------------------------------------------
+
+HOST_OVERRIDES: Dict[str, Dict[str, Any]] = {
+ # Ensembl requires a concrete resource; expose the JSON heartbeat by default.
+ "rest.ensembl.org": {
+ "endpoint": "https://rest.ensembl.org/info/ping",
+ "default_headers": {"Accept": "application/json"},
+ "notes": "Ensembl REST base requires explicit resource. '/info/ping' provides a JSON heartbeat.",
+ },
+ "api.fda.gov": {
+ "default_params": {"limit": 5},
+ "default_headers": {"Accept": "application/json"},
+ },
+ "data.cdc.gov": {
+ "default_params": {"$limit": 5},
+ "default_headers": {"Accept": "application/json"},
+ },
+}
+
+HOST_REQUIREMENTS: Dict[str, Dict[str, Any]] = {
+ "api.nal.usda.gov": {
+ "requires_api_key": True,
+ "notes": "USDA FoodData Central requires an api_key query parameter.",
+ },
+ "www.ncdc.noaa.gov": {
+ "requires_api_key": True,
+ "notes": "NOAA CDO API requires a token header. See https://www.ncdc.noaa.gov/cdo-web/webservices/v2",
+ "default_headers": {"token": ""},
+ },
+ "clinicaltrialsapi.cancer.gov": {
+ "requires_api_key": True,
+ "notes": "ClinicalTrials API requires authenticated access for JSON responses.",
+ },
+ "findtreatment.samhsa.gov": {
+ "requires_manual_params": True,
+ "notes": "SAMHSA locator needs query parameters (e.g., state, lat/long) to return JSON.",
+ },
+}
+
+
+# ------------------------------------------------------------------------------
+# Helpers
+# ------------------------------------------------------------------------------
+
+def _derive_endpoint(candidate: Dict[str, Any]) -> str:
+ endpoint = candidate.get("endpoint") or candidate.get("url")
+ if endpoint:
+ return str(endpoint)
+
+ base_url = candidate.get("base_url")
+ routes = candidate.get("endpoints") or []
+ if base_url and isinstance(routes, list) and routes:
+ first = routes[0]
+ path = str(first.get("path") or "/")
+ if not base_url.endswith("/") and not path.startswith("/"):
+ return f"{base_url}/{path}"
+ if base_url.endswith("/") and path.startswith("/"):
+ return f"{base_url.rstrip('/')}{path}"
+ return f"{base_url}{path}"
+
+ if base_url:
+ return str(base_url)
+
+ raise ValueError("candidate.endpoint or candidate.url is required")
+
+
+def _apply_overrides(candidate: Dict[str, Any], cfg: Dict[str, Any]) -> None:
+ host = (candidate.get("host") or "").lower()
+
+ overrides = HOST_OVERRIDES.get(host)
+ if overrides:
+ fields = cfg.setdefault("fields", {})
+ if overrides.get("endpoint"):
+ cfg["endpoint"] = overrides["endpoint"]
+ fields["base_url"] = overrides["endpoint"]
+ if overrides.get("default_params"):
+ cfg.setdefault("default_params", {}).update(overrides["default_params"])
+ fields.setdefault("default_params", {}).update(overrides["default_params"])
+ if overrides.get("default_headers"):
+ cfg.setdefault("default_headers", {}).update(overrides["default_headers"])
+ fields.setdefault("headers", {}).update(overrides["default_headers"])
+ if overrides.get("notes"):
+ cfg.setdefault("metadata", {}).setdefault("notes", []).append(overrides["notes"])
+
+ requirements = HOST_REQUIREMENTS.get(host)
+ if requirements:
+ meta = cfg.setdefault("metadata", {})
+ meta.setdefault("requirements", {}).update(
+ {
+ key: value
+ for key, value in requirements.items()
+ if key not in {"default_headers"}
+ }
+ )
+ if requirements.get("default_headers"):
+ cfg.setdefault("default_headers", {}).update(requirements["default_headers"])
+ cfg.setdefault("fields", {}).setdefault("headers", {}).update(requirements["default_headers"])
+
+
+# ------------------------------------------------------------------------------
+# Public helpers used by VSD tools
+# ------------------------------------------------------------------------------
+
+def build_config(
+ candidate: Dict[str, Any],
+ tool_type: str = "dynamic_rest",
+ default_params: Dict[str, Any] | None = None,
+ default_headers: Dict[str, Any] | None = None,
+) -> Dict[str, Any]:
+ """
+ Produce a DynamicREST-style configuration dictionary from a harvest candidate.
+ """
+ endpoint = _derive_endpoint(candidate)
+ method = str(candidate.get("method") or candidate.get("http_method") or "GET").upper()
+ merged_params = deepcopy(candidate.get("default_params") or candidate.get("params") or {})
+ merged_headers = deepcopy(candidate.get("default_headers") or candidate.get("headers") or {})
+
+ # Allow overrides provided via arguments
+ if default_params:
+ merged_params.update(default_params)
+ if default_headers:
+ merged_headers.update(default_headers)
+
+ # Determine implementation class
+ declared_type = str(candidate.get("tool_type") or tool_type or "").lower()
+ impl_type = "GenericRESTTool"
+ if declared_type in {"graphql", "genericgraphqltool", "graph_ql"} or endpoint.endswith(".graphql"):
+ impl_type = "GenericGraphQLTool"
+
+ # Provide a permissive parameter schema with defaults from known params
+ parameter_schema: Dict[str, Any] = deepcopy(candidate.get("parameter_schema") or candidate.get("parameter") or {})
+ if not parameter_schema:
+ properties = {
+ key: {"description": f"Override default query parameter '{key}'", "default": value}
+ for key, value in merged_params.items()
+ }
+ parameter_schema = {
+ "type": "object",
+ "properties": properties,
+ "additionalProperties": True,
+ }
+
+ fields: Dict[str, Any] = {
+ "base_url": endpoint,
+ "method": method,
+ "default_params": merged_params,
+ "headers": merged_headers,
+ }
+
+ cfg: Dict[str, Any] = {
+ "type": impl_type,
+ "description": candidate.get("description") or "",
+ "fields": fields,
+ "parameter": parameter_schema,
+ "metadata": {
+ "source": candidate.get("source"),
+ "trust": candidate.get("trust"),
+ "health": candidate.get("health"),
+ "doc_url": candidate.get("doc_url"),
+ "description": candidate.get("description"),
+ "host": candidate.get("host"),
+ },
+ "vsd": candidate,
+ # Backwards compatibility fields expected by older utilities
+ "tool_type": candidate.get("tool_type") or tool_type or "dynamic_rest",
+ "endpoint": endpoint,
+ "method": method,
+ "default_params": merged_params,
+ "default_headers": merged_headers,
+ "auth": candidate.get("auth") or {"type": "none"},
+ }
+
+ response_key = candidate.get("response_key")
+ if response_key:
+ cfg["response_key"] = response_key
+
+ _apply_overrides(candidate, cfg)
+
+ return cfg
+
+
+def probe_config(cfg: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Execute a lightweight HTTP request to validate the generated configuration.
+ Returns diagnostic information including HTTP status and a JSON snippet if available.
+ """
+ fields = cfg.get("fields") or {}
+ url = cfg.get("endpoint") or fields.get("base_url")
+ method = (fields.get("method") or cfg.get("method") or "GET").upper()
+ params = deepcopy(fields.get("default_params") or cfg.get("default_params") or {})
+ headers = deepcopy(fields.get("headers") or cfg.get("default_headers") or {})
+ headers.setdefault("Accept", "application/json")
+
+ try:
+ if method == "GET":
+ resp = requests.get(url, params=params, headers=headers, timeout=20)
+ else:
+ resp = requests.request(method, url, json=params, headers=headers, timeout=20)
+ except Exception as exc:
+ return {"ok": False, "error": str(exc), "stage": "request"}
+
+ content_type = resp.headers.get("Content-Type", "")
+ preview = resp.text[:400] if resp.text else ""
+ sample = None
+ has_json = False
+
+ if "json" in content_type.lower():
+ try:
+ payload = resp.json()
+ has_json = True
+ if isinstance(payload, list):
+ sample = payload[:1]
+ elif isinstance(payload, dict):
+                sample = {k: payload[k] for k in list(payload)[:5]}
+ else:
+ sample = payload
+ except Exception:
+ has_json = False
+
+ status_ok = resp.status_code < 400
+
+ return {
+ "ok": bool(status_ok and (has_json or "json" in content_type.lower())),
+ "status": resp.status_code,
+ "content_type": content_type,
+ "has_json": has_json,
+ "sample": sample,
+ "preview": preview,
+ }
+
+
+def stamp_metadata(cfg: Dict[str, Any], probe: Dict[str, Any]) -> None:
+ """
+ Update metadata timestamps and probe results on a configuration dictionary.
+ """
+ metadata = cfg.setdefault("metadata", {})
+ metadata["registered_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
+ metadata["last_test"] = probe
diff --git a/tests/integration/test_medtok_medlog_tools.py b/tests/integration/test_medtok_medlog_tools.py
new file mode 100644
index 00000000..a708ecf8
--- /dev/null
+++ b/tests/integration/test_medtok_medlog_tools.py
@@ -0,0 +1,282 @@
+import importlib.util
+import json
+import os
+import sys
+import tempfile
+import threading
+import time
+from pathlib import Path
+
+import pytest
+import uvicorn
+from fastapi import FastAPI, HTTPException
+
+from tooluniverse.execute_function import ToolUniverse
+
+
+class _ServerHandle:
+ """Utility wrapper for running uvicorn servers in tests."""
+
+ def __init__(self, app: FastAPI, host: str, port: int):
+ config = uvicorn.Config(
+ app, host=host, port=port, log_level="error", lifespan="off"
+ )
+ self.server = uvicorn.Server(config)
+ self.thread = threading.Thread(target=self.server.run, daemon=True)
+
+ def start(self) -> None:
+ self.thread.start()
+ while not self.server.started:
+ time.sleep(0.05)
+
+ def stop(self) -> None:
+ self.server.should_exit = True
+ self.thread.join(timeout=5)
+
+
+def _import_medtok_app(module_path: Path):
+ spec = importlib.util.spec_from_file_location("medtok_service_app", module_path)
+ module = importlib.util.module_from_spec(spec)
+ assert spec.loader is not None
+ spec.loader.exec_module(module)
+ return module
+
+
+@pytest.fixture(scope="session")
+def medtok_server():
+ repo_root = Path(__file__).resolve().parents[3]
+ medtok_root = repo_root / "MedTok-FHIR-Starter"
+ service_dir = medtok_root / "services" / "medtok_service"
+ sys.path.insert(0, str(service_dir))
+
+ base_config_path = medtok_root / "config" / "medtok_config.json"
+ config_data = json.loads(base_config_path.read_text(encoding="utf-8"))
+ config_data["code_metadata_path"] = str(
+ medtok_root / "samples" / "code_metadata.csv"
+ )
+ config_data["graph_edges_path"] = str(
+ medtok_root / "samples" / "code_graph_edges.csv"
+ )
+ tmp_config = tempfile.NamedTemporaryFile(
+ "w", suffix="_medtok_config.json", delete=False
+ )
+ json.dump(config_data, tmp_config)
+ tmp_config.flush()
+ tmp_config.close()
+ os.environ["MEDTOK_CONFIG"] = tmp_config.name
+
+ module = _import_medtok_app(service_dir / "app.py")
+ module.MAPPING_CSV = str(medtok_root / "samples" / "code_mapping.csv")
+ app = module.app
+
+ host = "127.0.0.1"
+ port = 8910
+ server = _ServerHandle(app, host, port)
+ server.start()
+
+ base_url = f"http://{host}:{port}"
+ os.environ["MEDTOK_BASE_URL"] = base_url
+
+ yield base_url
+
+ server.stop()
+ os.environ.pop("MEDTOK_BASE_URL", None)
+ os.environ.pop("MEDTOK_CONFIG", None)
+ try:
+ os.remove(tmp_config.name)
+ except FileNotFoundError:
+ pass
+ sys.path.remove(str(service_dir))
+
+
+def _build_medlog_collector(store):
+ app = FastAPI()
+
+ @app.post("/medlog/events/init")
+ def init(payload: dict):
+ header = payload.get("header") or {}
+ event_id = header.get("event_id")
+ if not event_id:
+ raise HTTPException(400, "event_id required")
+ record = {
+ "header": header,
+ "model_instance": payload.get("model_instance", {}),
+ "user_identity": payload.get("user_identity", {}),
+ "target_identity": payload.get("target_identity"),
+ "inputs": payload.get("inputs"),
+ "retention_tier": payload.get("retention_tier", "steady"),
+ "fragments": [],
+ }
+ store[event_id] = record
+ return {"status": "ok", "event_id": event_id}
+
+ @app.post("/medlog/events/{event_id}/append")
+ def append(event_id: str, fragment: dict):
+ record = store.get(event_id)
+ if record is None:
+ raise HTTPException(404, "event not found")
+ record["fragments"].append(fragment)
+ return {"status": "ok", "event_id": event_id}
+
+ @app.get("/medlog/events/{event_id}/prov")
+ def prov(event_id: str):
+ record = store.get(event_id)
+ if record is None:
+ raise HTTPException(404, "event not found")
+ return {"event_id": event_id, "provenance": {"header": record["header"]}}
+
+ @app.post("/query")
+ def query(body: dict):
+ run_id = body.get("run_id")
+ event_id = body.get("event_id")
+ limit = body.get("limit", 50)
+ matches = []
+ for eid, record in store.items():
+ header = record["header"]
+ if event_id and event_id != eid:
+ continue
+ if run_id and header.get("run_id") != run_id:
+ continue
+ matches.append({"event_id": eid, "header": header})
+ if len(matches) >= limit:
+ break
+ return {"count": len(matches), "results": matches}
+
+ @app.post("/export/parquet")
+ def export():
+ return {"status": "ok", "outdir": "/tmp/parquet"}
+
+ return app
+
+
+def _build_medlog_fhir(store):
+ app = FastAPI()
+
+ def _bundle_for_records(records):
+ entries = []
+ for rec in records:
+ entries.append(
+ {
+ "resource": {
+ "resourceType": "Observation",
+ "id": rec["header"]["event_id"],
+ "status": "final",
+ }
+ }
+ )
+ return {"resourceType": "Bundle", "type": "collection", "entry": entries}
+
+ @app.get("/bundle/{event_id}")
+ def bundle(event_id: str):
+ record = store.get(event_id)
+ if record is None:
+ raise HTTPException(404, "event not found")
+ return _bundle_for_records([record])
+
+ @app.get("/bundle/run/{run_id}")
+ def bundle_run(run_id: str):
+ records = [
+ record
+ for record in store.values()
+ if record["header"].get("run_id") == run_id
+ ]
+ if not records:
+ raise HTTPException(404, "run not found")
+ return _bundle_for_records(records)
+
+ return app
+
+
+@pytest.fixture(scope="session")
+def medlog_servers():
+ store = {}
+ host = "127.0.0.1"
+ collector_port = 8911
+ fhir_port = 8912
+
+ collector_app = _build_medlog_collector(store)
+ fhir_app = _build_medlog_fhir(store)
+
+ collector = _ServerHandle(collector_app, host, collector_port)
+ fhir = _ServerHandle(fhir_app, host, fhir_port)
+ collector.start()
+ fhir.start()
+
+ os.environ["MEDLOG_COLLECTOR_BASE_URL"] = f"http://{host}:{collector_port}"
+ os.environ["MEDLOG_FHIR_BASE_URL"] = f"http://{host}:{fhir_port}"
+
+ yield store
+
+ collector.stop()
+ fhir.stop()
+ os.environ.pop("MEDLOG_COLLECTOR_BASE_URL", None)
+ os.environ.pop("MEDLOG_FHIR_BASE_URL", None)
+
+
+def test_medtok_rest_tools(medtok_server):
+ tu = ToolUniverse(hooks_enabled=False)
+ tu.load_tools(tool_type=["medtok"])
+
+ tokenize = tu.tools.MedTok_tokenize(
+ codes=["A00", "E11"], system="ICD-10", include_metadata=True
+ )
+ token_ids = tokenize.get("token_ids", [])
+ assert isinstance(token_ids, list)
+ assert len(token_ids) in (0, 2)
+
+ embed = tu.tools.MedTok_embed(codes=["A00"], system="ICD-10")
+ embeddings = embed.get("embeddings", [])
+ if embeddings:
+ assert isinstance(embeddings[0], list)
+ assert embed.get("dim") == len(embeddings[0])
+
+ neighbors = tu.tools.MedTok_nearest_neighbors(code="A00", k=3)
+ neighbor_list = neighbors.get("neighbors", [])
+ assert len(neighbor_list) <= 3
+
+ mapped = tu.tools.MedTok_map_text_to_code(text="type 2 diabetes", system="ICD-10")
+ assert "code" in mapped
+
+ search = tu.tools.MedTok_search_text(text="hypertension", k=4)
+ assert len(search.get("matches", [])) <= 4
+
+ code_info = tu.tools.MedTok_code_info(code="E11", system="ICD-10")
+ assert isinstance(code_info, dict)
+
+
+def test_medlog_tools_workflow(medlog_servers):
+ tu = ToolUniverse(hooks_enabled=False)
+ tu.load_tools(tool_type=["medlog"])
+
+ header = {
+ "event_id": "evt-1",
+ "run_id": "run-123",
+ "timestamp": "2025-01-01T00:00:00Z",
+ }
+ model_instance = {"model": "demo", "version": "1.0"}
+ user_identity = {"name": "Dr. Example"}
+
+ init_resp = tu.tools.MedLog_init_event(
+ header=header, model_instance=model_instance, user_identity=user_identity
+ )
+ assert init_resp["status"] == "ok"
+
+ fragment = {"outputs": {"summary": "Patient stable"}}
+ append_resp = tu.tools.MedLog_append_fragment(event_id="evt-1", fragment=fragment)
+ assert append_resp["status"] == "ok"
+
+ prov_resp = tu.tools.MedLog_get_provenance(event_id="evt-1")
+ assert prov_resp["event_id"] == "evt-1"
+
+ query_resp = tu.tools.MedLog_query_events(run_id="run-123")
+ assert query_resp["count"] == 1
+ assert query_resp["results"][0]["event_id"] == "evt-1"
+
+ export_resp = tu.tools.MedLog_export_parquet()
+ assert export_resp["status"] == "ok"
+
+ bundle_resp = tu.tools.MedLog_fhir_bundle(event_id="evt-1")
+ assert bundle_resp["resourceType"] == "Bundle"
+
+ run_bundle_resp = tu.tools.MedLog_fhir_run_bundle(run_id="run-123")
+ assert len(run_bundle_resp["entry"]) == 1
diff --git a/tests/unit/test_biodomain_tools.py b/tests/unit/test_biodomain_tools.py
new file mode 100644
index 00000000..ec769b13
--- /dev/null
+++ b/tests/unit/test_biodomain_tools.py
@@ -0,0 +1,117 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from tooluniverse.interpro_tool import InterProTool
+from tooluniverse.kegg_tool import KEGGTool
+from tooluniverse.iucn_tool import IUCNRedListTool, IUCN_TOKEN_ENV
+from tooluniverse.jaspar_tool import JASPARRestTool
+from tooluniverse.marine_species_tool import MarineSpeciesTool
+from tooluniverse.cbioportal_tool import CBioPortalTool
+from tooluniverse.phenome_jax_tool import PhenomeJaxTool
+
+
+def _mock_session_get(monkeypatch, target, payload=None, text=None, status_code=200):
+ response = MagicMock()
+ response.status_code = status_code
+ if payload is not None:
+ response.json.return_value = payload
+ if text is not None:
+ response.text = text
+ response.raise_for_status.return_value = None
+
+ def factory(self, *args, **kwargs):
+ return response
+
+ monkeypatch.setattr(target, factory)
+ return response
+
+
+@pytest.mark.unit
+def test_interpro_tool(monkeypatch):
+ payload = {
+ "count": 2,
+ "results": [
+ {"metadata": {"accession": "IPR000001", "name": "Example A", "type": "family"}},
+ {"metadata": {"accession": "IPR000002", "name": "Example B", "type": "domain"}},
+ ],
+ }
+ _mock_session_get(monkeypatch, "requests.Session.get", payload=payload)
+ tool = InterProTool({"name": "InterPro_search_entries"})
+ result = tool.run({"query": "kinase"})
+ assert result["count"] == 2
+ assert result["results"][0]["accession"] == "IPR000001"
+
+
+@pytest.mark.unit
+def test_kegg_tool(monkeypatch):
+ text = "path:map00010\tGlycolysis / Gluconeogenesis\n"
+ _mock_session_get(monkeypatch, "requests.Session.get", text=text)
+ tool = KEGGTool({"name": "KEGG_find_entries"})
+ result = tool.run({"query": "glucose", "database": "pathway"})
+ assert result[0]["id"] == "path:map00010"
+
+
+@pytest.mark.unit
+def test_iucn_tool_requires_token(monkeypatch):
+ tool = IUCNRedListTool({"name": "IUCN_get_species_status"})
+ result = tool.run({"species": "Panthera leo"})
+ assert "error" in result
+
+
+@pytest.mark.unit
+def test_iucn_tool(monkeypatch):
+ payload = {"result": [{"scientific_name": "Panthera leo", "category": "VU"}]}
+ _mock_session_get(monkeypatch, "requests.Session.get", payload=payload)
+ monkeypatch.setenv(IUCN_TOKEN_ENV, "dummy")
+ tool = IUCNRedListTool({"name": "IUCN_get_species_status"})
+ result = tool.run({"species": "Panthera leo"})
+ assert result["results"][0]["category"] == "VU"
+
+
+@pytest.mark.unit
+def test_jaspar_tool(monkeypatch):
+ payload = {
+ "count": 1,
+ "results": [{"matrix_id": "MA0004.1", "name": "Arnt", "collection": "CORE"}],
+ }
+ _mock_session_get(monkeypatch, "requests.Session.get", payload=payload)
+ tool = JASPARRestTool({"name": "JASPAR_search_motifs"})
+ result = tool.run({"query": "Arnt"})
+ assert result["results"][0]["matrix_id"] == "MA0004.1"
+
+
+@pytest.mark.unit
+def test_marine_species_tool(monkeypatch):
+ payload = [{"AphiaID": 137094, "scientificname": "Delphinus delphis"}]
+ _mock_session_get(monkeypatch, "requests.Session.get", payload=payload)
+ tool = MarineSpeciesTool({"name": "MarineSpecies_lookup"})
+ result = tool.run({"scientific_name": "Delphinus delphis"})
+ assert result[0]["AphiaID"] == 137094
+
+
+@pytest.mark.unit
+def test_cbioportal_tool(monkeypatch):
+ payload = [
+ {"studyId": "brca_tcga", "name": "Breast Cancer", "description": "Example"}
+ ]
+ _mock_session_get(monkeypatch, "requests.Session.get", payload=payload)
+ tool = CBioPortalTool({"name": "cBioPortal_search_studies"})
+ result = tool.run({"keyword": "breast"})
+ assert result["results"][0]["studyId"] == "brca_tcga"
+
+
+@pytest.mark.unit
+def test_phenome_jax_tool(monkeypatch):
+ payload = {
+ "count": 2,
+ "projects": [
+ {"projid": 1, "title": "Glucose tolerance", "species": "mouse"},
+ {"projid": 2, "title": "Insulin", "species": "mouse"},
+ ],
+ }
+ _mock_session_get(monkeypatch, "requests.Session.get", payload=payload)
+ tool = PhenomeJaxTool({"name": "PhenomeJax_list_projects"})
+ result = tool.run({"keyword": "glucose", "limit": 1})
+ assert result["count"] == 2
+ assert result["projects"][0]["projid"] == 1