diff --git a/README.md b/README.md
index 7ab04aec..157ce394 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# ToolUniverse Logo ToolUniverse: Democratizing AI scientists
+# ToolUniverse Logo ToolUniverse: Democratizing AI scientists
 [![Paper](https://img.shields.io/badge/Paper-Arxiv-blue)](https://arxiv.org/abs/2509.23426)
 [![PyPI version](https://badge.fury.io/py/tooluniverse.svg)](https://badge.fury.io/py/tooluniverse)
@@ -232,6 +232,26 @@ Our comprehensive documentation covers everything from quick start to advanced w
 - **[Adding Tools Tutorial](https://zitniklab.hms.harvard.edu/ToolUniverse/tutorials/addtools/Adding_Tools_Tutorial.html)**: Step-by-step tool addition guide
 - **[MCP Tool Registration](https://zitniklab.hms.harvard.edu/ToolUniverse/tutorials/addtools/mcp_tool_registration_en.html)**: Register tools via MCP
+### MedTok + MedLog Integrations
+
+ToolUniverse now ships with first-class support for the MedTok tokenizer service and the MedLog reference collector/FHIR bridge.
+
+- **MedTok REST tools** (`tool_type=["medtok"]`) expose `/tokenize`, `/embed`, `/nearest_neighbors`, `/map_text_to_code`, `/search_text`, and `/codes/{system}/{code}`. Point them at a running service by setting `MEDTOK_BASE_URL` (defaults to `http://localhost:8000`).
+- **MedTok MCP auto-loader** (`tool_type=["medtok_mcp_auto_loader"]`) can register tools from the FastMCP wrapper. Set `MEDTOK_MCP_SERVER_HOST` to the host running the `medtok_tool.py` MCP server.
+- **MedLog collector + FHIR tools** (`tool_type=["medlog"]`) wrap the reference implementation's REST APIs. Configure the collector and FHIR bridge endpoints with `MEDLOG_COLLECTOR_BASE_URL` (default `http://localhost:7001`) and `MEDLOG_FHIR_BASE_URL` (default `http://localhost:7003`).
+
+See `tests/integration/test_medtok_medlog_tools.py` for end-to-end examples that start the services, invoke the tools, and validate responses.
+
+### End-to-End Demo Script
+
+To launch the reference services and exercise the toolchain automatically, run:
+
+```bash
+python scripts/run_full_demo.py # add -h for options
+```
+
+The script starts MedTok + MedLog locally, runs representative tool calls (including optional external APIs like InterPro, KEGG, IUCN, JASPAR, MarineSpecies, cBioPortal, and Phenome Jax), and prints a success/failure summary.
+
 ### 📚 API Reference
 - **[API Directory](https://zitniklab.hms.harvard.edu/ToolUniverse/api/modules.html)**: Complete module listing
 - **[Core Modules](https://zitniklab.hms.harvard.edu/ToolUniverse/api/tooluniverse.html)**: Main ToolUniverse class and utilities
diff --git a/scripts/medlog_stub_server.py b/scripts/medlog_stub_server.py
new file mode 100644
index 00000000..700245fa
--- /dev/null
+++ b/scripts/medlog_stub_server.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+"""
+Lightweight MedLog stub servers for local demos.
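+
+Each stub keeps its records in an in-process dict, so state is lost on
+restart and is not shared between separately launched collector and FHIR
+processes.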
+ +Run the collector: + python scripts/medlog_stub_server.py --mode collector --host 127.0.0.1 --port 8911 + +Run the FHIR bridge: + python scripts/medlog_stub_server.py --mode fhir --host 127.0.0.1 --port 8912 +""" + +from __future__ import annotations + +import argparse +import os +import threading +import time +from typing import Dict + +import uvicorn +from fastapi import FastAPI, HTTPException + + +STORE: Dict[str, Dict] = {} +STORE_LOCK = threading.Lock() + + +def build_collector_app() -> FastAPI: + app = FastAPI(title="MedLog Collector (Stub)", version="0.1.0") + + @app.post("/medlog/events/init") + def init(payload: dict): + header = payload.get("header") or {} + event_id = header.get("event_id") + if not event_id: + raise HTTPException(400, "event_id required") + record = { + "header": header, + "model_instance": payload.get("model_instance", {}), + "user_identity": payload.get("user_identity", {}), + "target_identity": payload.get("target_identity"), + "inputs": payload.get("inputs"), + "retention_tier": payload.get("retention_tier", "steady"), + "fragments": [], + } + with STORE_LOCK: + STORE[event_id] = record + return {"status": "ok", "event_id": event_id} + + @app.post("/medlog/events/{event_id}/append") + def append(event_id: str, fragment: dict): + with STORE_LOCK: + record = STORE.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + record["fragments"].append(fragment) + return {"status": "ok", "event_id": event_id} + + @app.get("/medlog/events/{event_id}/prov") + def prov(event_id: str): + with STORE_LOCK: + record = STORE.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + header = record["header"] + return {"event_id": event_id, "provenance": {"header": header}} + + @app.post("/query") + def query(body: dict): + run_id = body.get("run_id") + event_id = body.get("event_id") + limit = body.get("limit", 50) + results = [] + with STORE_LOCK: + for eid, record in STORE.items(): + header = record["header"] + if event_id and event_id != eid: + continue + if run_id and header.get("run_id") != run_id: + continue + results.append({"event_id": eid, "header": header}) + if len(results) >= limit: + break + return {"count": len(results), "results": results} + + @app.post("/export/parquet") + def export(): + return {"status": "ok", "outdir": "/tmp/parquet"} + + return app + + +def build_fhir_app() -> FastAPI: + app = FastAPI(title="MedLog FHIR Stub", version="0.1.0") + + def bundle(records): + return { + "resourceType": "Bundle", + "type": "collection", + "entry": [ + { + "resource": { + "resourceType": "Observation", + "id": record["header"]["event_id"], + "status": "final", + } + } + for record in records + ], + } + + @app.get("/bundle/{event_id}") + def bundle_event(event_id: str): + with STORE_LOCK: + record = STORE.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + return bundle([record]) + + @app.get("/bundle/run/{run_id}") + def bundle_run(run_id: str): + with STORE_LOCK: + records = [ + record + for record in STORE.values() + if record["header"].get("run_id") == run_id + ] + if not records: + raise HTTPException(404, "run not found") + return bundle(records) + + return app + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["collector", "fhir"], required=True) + parser.add_argument("--host", default=os.getenv("MEDLOG_HOST", "127.0.0.1")) + parser.add_argument("--port", type=int, default=int(os.getenv("MEDLOG_PORT", 0)) or 0) + args = 
parser.parse_args() + + if args.port == 0: + args.port = 8911 if args.mode == "collector" else 8912 + + app = build_collector_app() if args.mode == "collector" else build_fhir_app() + print(f"Starting MedLog {args.mode} stub on {args.host}:{args.port}") + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_full_demo.py b/scripts/run_full_demo.py new file mode 100644 index 00000000..b06cc6c5 --- /dev/null +++ b/scripts/run_full_demo.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python +""" +End-to-end ToolUniverse demo runner. + +This script bootstraps the MedTok and MedLog reference services locally, points +ToolUniverse at them, and exercises a curated set of tools (MedTok, MedLog, and +several public data tools such as InterPro, KEGG, IUCN, JASPAR, MarineSpecies, +cBioPortal, Phenome Jax). It prints friendly status updates and reports any +failures at the end. + +Usage: + python scripts/run_full_demo.py + +Optional flags: + --skip-network-tools Skip external API tools (InterPro, KEGG, etc.). + --medtok-host HOST Override MedTok host (default 127.0.0.1). + --medtok-port PORT Override MedTok port (default 8910). + --medlog-host HOST Override MedLog host (default 127.0.0.1). + --collector-port PORT Override MedLog collector port (default 8911). + --fhir-port PORT Override MedLog FHIR port (default 8912). +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import tempfile +import threading +import time +from pathlib import Path +from typing import Any, Dict, List, Optional +from urllib.parse import urlparse + +import requests +import uvicorn +from fastapi import FastAPI, HTTPException + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC_PATH = REPO_ROOT / "src" +if str(SRC_PATH) not in sys.path: + sys.path.insert(0, str(SRC_PATH)) + +from tooluniverse.execute_function import ToolUniverse + +MEDTOK_ROOT = REPO_ROOT.parent / "MedTok-FHIR-Starter" +MEDLOG_ROOT = REPO_ROOT.parent / "medlog-reference" + + +class ServerHandle: + """Run a FastAPI app in a background thread via uvicorn.""" + + def __init__(self, app: FastAPI, host: str, port: int): + config = uvicorn.Config(app, host=host, port=port, log_level="error", lifespan="off") + self.server = uvicorn.Server(config) + self.thread = threading.Thread(target=self.server.run, daemon=True) + + def start(self) -> None: + self.thread.start() + while not self.server.started: + time.sleep(0.05) + + def stop(self) -> None: + self.server.should_exit = True + self.thread.join(timeout=5) + + +def _import_module_typed(module_path: Path): + import importlib.util + + spec = importlib.util.spec_from_file_location(module_path.stem, module_path) + module = importlib.util.module_from_spec(spec) + assert spec and spec.loader + spec.loader.exec_module(module) + return module + + +def _service_is_up(base_url: str, path: str, ok_statuses: Optional[List[int]] = None) -> bool: + try: + resp = requests.get(f"{base_url}{path}", timeout=2) + if ok_statuses is None: + return resp.status_code < 500 + return resp.status_code in ok_statuses + except requests.RequestException: + return False + + +def start_medtok(host: str, port: int): + """Start MedTok FastAPI service and return context info.""" + service_path = MEDTOK_ROOT / "services" / "medtok_service" + if str(service_path) not in sys.path: + sys.path.insert(0, str(service_path)) + + base_url = os.environ.get("MEDTOK_BASE_URL") or f"http://{host}:{port}" + if _service_is_up(base_url, "/health", 
ok_statuses=[200]): + os.environ["MEDTOK_BASE_URL"] = base_url + print(f"MedTok already running at {base_url}, reusing existing instance.") + return {"server": None, "temp_config": None, "sys_path": str(service_path), "started": False} + + config_path = MEDTOK_ROOT / "config" / "medtok_config.json" + config_data = json.loads(config_path.read_text(encoding="utf-8")) + config_data["code_metadata_path"] = str(MEDTOK_ROOT / "samples" / "code_metadata.csv") + config_data["graph_edges_path"] = str(MEDTOK_ROOT / "samples" / "code_graph_edges.csv") + + tmp_config = tempfile.NamedTemporaryFile("w", suffix="_medtok_config.json", delete=False) + json.dump(config_data, tmp_config) + tmp_config.flush() + tmp_config.close() + os.environ["MEDTOK_CONFIG"] = tmp_config.name + + module = _import_module_typed(service_path / "app.py") + module.MAPPING_CSV = str(MEDTOK_ROOT / "samples" / "code_mapping.csv") + app = module.app + + server = ServerHandle(app, host, port) + server.start() + os.environ["MEDTOK_BASE_URL"] = f"http://{host}:{port}" + + return { + "server": server, + "temp_config": tmp_config.name, + "sys_path": str(service_path), + "started": True, + } + + +def _build_medlog_collector(store: Dict[str, Dict]): + app = FastAPI() + + @app.post("/medlog/events/init") + def init(payload: dict): + header = payload.get("header") or {} + event_id = header.get("event_id") + if not event_id: + raise HTTPException(400, "event_id required") + record = { + "header": header, + "model_instance": payload.get("model_instance", {}), + "user_identity": payload.get("user_identity", {}), + "target_identity": payload.get("target_identity"), + "inputs": payload.get("inputs"), + "retention_tier": payload.get("retention_tier", "steady"), + "fragments": [], + } + store[event_id] = record + return {"status": "ok", "event_id": event_id} + + @app.post("/medlog/events/{event_id}/append") + def append(event_id: str, fragment: dict): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + record["fragments"].append(fragment) + return {"status": "ok", "event_id": event_id} + + @app.get("/medlog/events/{event_id}/prov") + def prov(event_id: str): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + return {"event_id": event_id, "provenance": {"header": record["header"]}} + + @app.post("/query") + def query(body: dict): + run_id = body.get("run_id") + event_id = body.get("event_id") + limit = body.get("limit", 50) + matches = [] + for eid, record in store.items(): + header = record["header"] + if event_id and event_id != eid: + continue + if run_id and header.get("run_id") != run_id: + continue + matches.append({"event_id": eid, "header": header}) + if len(matches) >= limit: + break + return {"count": len(matches), "results": matches} + + @app.post("/export/parquet") + def export(): + return {"status": "ok", "outdir": "/tmp/parquet"} + + return app + + +def _build_medlog_fhir(store: Dict[str, Dict]): + app = FastAPI() + + def _bundle(records): + return { + "resourceType": "Bundle", + "type": "collection", + "entry": [ + { + "resource": { + "resourceType": "Observation", + "id": record["header"]["event_id"], + "status": "final", + } + } + for record in records + ], + } + + @app.get("/bundle/{event_id}") + def bundle_event(event_id: str): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + return _bundle([record]) + + @app.get("/bundle/run/{run_id}") + def bundle_run(run_id: str): + records = [ + 
record + for record in store.values() + if record["header"].get("run_id") == run_id + ] + if not records: + raise HTTPException(404, "run not found") + return _bundle(records) + + return app + + +def start_medlog(host: str, collector_port: int, fhir_port: int): + store: Dict[str, Dict] = {} + collector_app = _build_medlog_collector(store) + fhir_app = _build_medlog_fhir(store) + + collector_url = os.environ.get("MEDLOG_COLLECTOR_BASE_URL") or f"http://{host}:{collector_port}" + fhir_url = os.environ.get("MEDLOG_FHIR_BASE_URL") or f"http://{host}:{fhir_port}" + + collector_server = None + fhir_server = None + + if _service_is_up(collector_url, "/"): + print(f"MedLog collector already running at {collector_url}, reusing.") + else: + collector_server = ServerHandle(collector_app, host, collector_port) + collector_server.start() + + if _service_is_up(fhir_url, "/bundle/test"): + print(f"MedLog FHIR service already running at {fhir_url}, reusing.") + else: + fhir_server = ServerHandle(fhir_app, host, fhir_port) + fhir_server.start() + + os.environ["MEDLOG_COLLECTOR_BASE_URL"] = f"http://{host}:{collector_port}" + os.environ["MEDLOG_FHIR_BASE_URL"] = f"http://{host}:{fhir_port}" + + return {"collector": collector_server, "fhir": fhir_server, "started": bool(collector_server or fhir_server)} + + +def stop_medtok(ctx: Dict[str, str]): + if ctx.get("server"): + ctx["server"].stop() + if ctx.get("started"): + os.environ.pop("MEDTOK_BASE_URL", None) + os.environ.pop("MEDTOK_CONFIG", None) + temp_config = ctx.get("temp_config") + if temp_config: + try: + os.remove(temp_config) + except OSError: + pass + sys_path = ctx.get("sys_path") + if sys_path: + try: + sys.path.remove(sys_path) + except ValueError: + pass + + +def stop_medlog(ctx: Dict[str, ServerHandle]): + if ctx.get("collector"): + ctx["collector"].stop() + if ctx.get("fhir"): + ctx["fhir"].stop() + if ctx.get("started"): + os.environ.pop("MEDLOG_COLLECTOR_BASE_URL", None) + os.environ.pop("MEDLOG_FHIR_BASE_URL", None) + + +def preview_json(payload: Any, limit: int = 240) -> str: + """Return a compact preview of a payload for console logging.""" + try: + text = json.dumps(payload, indent=2, ensure_ascii=False) + except TypeError: + text = str(payload) + text = text.strip() + if len(text) > limit: + return text[:limit].rstrip() + "..." + return text + + +def call_tool(tu: ToolUniverse, name: str, **kwargs): + """Call a tool and handle ToolUniverse-specific errors.""" + print(f"---> Calling {name} with {kwargs}") + try: + response = getattr(tu.tools, name)(**kwargs) + print(f"[OK] {name} succeeded") + return True, response + except Exception as exc: # pylint: disable=broad-except + print(f"[FAIL] {name} failed: {exc}") + return False, str(exc) + + +def run_medlog_demo(tu: ToolUniverse) -> List[Dict[str, str]]: + results = [] + header = { + "event_id": "evt-demo-1", + "run_id": "run-demo-1", + "timestamp": "2025-01-01T00:00:00Z", + } + model_instance = {"model": "demo", "version": "1.0"} + user_identity = {"name": "Dr. 
Example"} + steps = [ + ( + "MedLog_init_event", + dict(header=header, model_instance=model_instance, user_identity=user_identity), + "Open an event with metadata (who, when, which model).", + ), + ( + "MedLog_append_fragment", + dict(event_id="evt-demo-1", fragment={"outputs": {"summary": "Patient stable"}}), + "Attach a fragment that captures model outputs for the event.", + ), + ("MedLog_get_provenance", dict(event_id="evt-demo-1"), "Retrieve provenance header saved for the event."), + ("MedLog_query_events", dict(run_id="run-demo-1"), "Query the store by run identifier."), + ("MedLog_export_parquet", dict(), "Trigger sample export (stub returns static location)."), + ("MedLog_fhir_bundle", dict(event_id="evt-demo-1"), "View the event as a single FHIR Observation bundle."), + ("MedLog_fhir_run_bundle", dict(run_id="run-demo-1"), "Bundle all events in the run as FHIR Observations."), + ] + + for name, kwargs, description in steps: + print(f" - {description}") + success, payload = call_tool(tu, name, **kwargs) + note = None + if success: + if name == "MedLog_init_event": + note = f"Created event {payload.get('event_id')}" + elif name == "MedLog_append_fragment": + note = "Attached fragment with outputs summary" + elif name == "MedLog_get_provenance": + prov = payload.get("provenance", {}) + note = f"Provenance keys: {', '.join(prov.keys()) or 'none'}" + elif name == "MedLog_query_events": + note = f"Query returned {payload.get('count', 0)} rows" + elif name == "MedLog_fhir_bundle": + note = f"Bundle contains {len(payload.get('entry', []))} resources" + elif name == "MedLog_fhir_run_bundle": + note = f"Run bundle resources: {len(payload.get('entry', []))}" + if success and note: + print(f" Result: {note}") + results.append({"tool": name, "success": success, "response": payload, "note": note}) + return results + + +def run_medtok_demo(tu: ToolUniverse) -> List[Dict[str, str]]: + tests = [ + ( + "MedTok_tokenize", + dict(codes=["A00", "E11"], system="ICD-10", include_metadata=True), + "Convert ICD-10 codes into internal token IDs plus metadata for downstream models.", + ), + ("MedTok_embed", dict(codes=["A00"], system="ICD-10"), "Generate vector embeddings for a medical code."), + ("MedTok_nearest_neighbors", dict(code="A00", k=3), "Find nearby codes in embedding space."), + ("MedTok_map_text_to_code", dict(text="type 2 diabetes", system="ICD-10"), "Map free text to the closest code."), + ("MedTok_search_text", dict(text="hypertension", k=4), "Search the terminology for matching codes by text."), + ("MedTok_code_info", dict(code="E11", system="ICD-10"), "Fetch descriptive details for a specific code."), + ] + results = [] + for name, kwargs, description in tests: + print(f" - {description}") + success, payload = call_tool(tu, name, **kwargs) + note = None + if success: + if name == "MedTok_tokenize": + note = f"Received {len(payload.get('token_ids', []))} token IDs" + elif name == "MedTok_embed": + emb = payload.get("embeddings") or [] + if emb: + note = f"Embedding dimension {payload.get('dim')}, first vector length {len(emb[0])}" + elif name == "MedTok_nearest_neighbors": + note = f"Returned {len(payload.get('neighbors', []))} neighbors" + elif name == "MedTok_map_text_to_code": + note = f"Mapped text to code {payload.get('code')}" + elif name == "MedTok_search_text": + note = f"Top match code {payload.get('matches', [{}])[0].get('code') if payload.get('matches') else 'N/A'}" + elif name == "MedTok_code_info": + note = f"Code info description: {payload.get('description', 'N/A')}" + if 
success and note: + print(f" Result: {note}") + results.append({"tool": name, "success": success, "response": payload, "note": note}) + return results + + +NETWORK_TOOLS = [ + ("InterPro_search_entries", {"query": "BRCA1"}), + ("KEGG_find_entries", {"query": "ATP synthase", "database": "pathway"}), + ("IUCN_get_species_status", {"species": "Panthera leo"}), + ("JASPAR_search_motifs", {"query": "SOX2"}), + ("MarineSpecies_lookup", {"scientific_name": "Gadus morhua"}), + ("cBioPortal_search_studies", {"keyword": "breast cancer"}), + ("PhenomeJax_list_projects", {"keyword": "glucose"}), +] + + +def run_network_tools(tu: ToolUniverse) -> List[Dict[str, str]]: + outcomes = [] + for name, kwargs in NETWORK_TOOLS: + success, payload = call_tool(tu, name, **kwargs) + note_parts: List[str] = [] + if success: + if name == "InterPro_search_entries": + data = payload if isinstance(payload, dict) else {} + note_parts.append(f"Entries returned: {len(data.get('results', []))}") + elif name == "KEGG_find_entries": + if isinstance(payload, dict): + note_parts.append(f"Matched {len(payload.get('results', []))} entries") + elif isinstance(payload, list): + note_parts.append(f"Matched {len(payload)} entries") + elif name == "IUCN_get_species_status": + result = payload.get("result") if isinstance(payload, dict) else {} + if isinstance(result, list) and result: + result = result[0] + elif result is None: + result = {} + species = result.get("scientific_name") + category = result.get("category") + note_parts.append(f"{species} status {category}") + elif name == "JASPAR_search_motifs": + data = payload if isinstance(payload, dict) else {} + note_parts.append(f"Found {len(data.get('results', []))} motifs") + elif name == "MarineSpecies_lookup": + data = payload if isinstance(payload, dict) else {} + note_parts.append(f"Matches: {len(data.get('results', []))}") + elif name == "cBioPortal_search_studies": + data = payload if isinstance(payload, dict) else {} + note_parts.append(f"Studies returned: {len(data.get('studies', []))}") + elif name == "PhenomeJax_list_projects": + data = payload if isinstance(payload, dict) else {} + note_parts.append(f"Projects listed: {len(data.get('projects', []))}") + + preview = preview_json(payload) + print(f" {name} preview: {preview}") + note_parts.append(f"Preview: {preview}") + else: + print(f" {name} error payload: {preview_json(payload)}") + note = " | ".join(note_parts) if note_parts else None + outcomes.append({"tool": name, "success": success, "response": payload, "note": note}) + return outcomes + + +def _extract_host(candidate: Dict[str, Any]) -> str: + host = candidate.get("host") + if host: + return str(host) + for key in ("url", "endpoint", "base_url"): + maybe = candidate.get(key) + if not maybe: + continue + parsed = urlparse(str(maybe)) + if parsed.netloc: + return parsed.netloc + return "candidate" + + +def _slugify_host(value: str) -> str: + slug = "".join(ch if ch.isalnum() else "_" for ch in value.lower()) + slug = slug.strip("_") + return slug or "candidate" + + +def run_vsd_demo(tu: ToolUniverse) -> List[Dict[str, str]]: + """ + Demonstrate the Harvest -> Register -> Run workflow using Verified Source Directory helpers. 
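+
+    The walkthrough runs in order: GenericHarvestTool searches for candidate
+    endpoints, HarvestCandidateTesterTool probes the selected candidate,
+    VerifiedSourceRegisterTool registers it under a HarvestDemo_* name, the
+    newly registered tool is invoked once, and VerifiedSourceRemoveTool
+    removes it again.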
+ """ + search_query = "ensembl rest api" + print(f"\nSearching harvest catalog for '{search_query}' candidates...") + results: List[Dict[str, Any]] = [] + + success_search, harvest_resp = call_tool( + tu, + "GenericHarvestTool", + query=search_query, + limit=5, + ) + selected_candidate: Optional[Dict[str, Any]] = None + note_search: Optional[str] = None + if success_search: + candidates = (harvest_resp or {}).get("candidates") or [] + note_search = f"Candidates returned: {len(candidates)}" + if candidates: + preferred_hosts = {"rest.ensembl.org", "api.open-meteo.com"} + for candidate_option in candidates: + host = _extract_host(candidate_option).lower() + if host in preferred_hosts: + selected_candidate = candidate_option + break + if not selected_candidate: + selected_candidate = candidates[0] + host = _extract_host(selected_candidate) + print(f" - Selected candidate: {selected_candidate.get('name')} ({selected_candidate.get('url')}) [host: {host}]") + print(f" Candidate preview: {preview_json(selected_candidate)}") + else: + print(" - Harvest returned no candidates.") + else: + print(f" - Harvest search failed payload: {preview_json(harvest_resp)}") + note_search = "Harvest search failed" + results.append({"tool": "GenericHarvestTool", "success": success_search, "response": harvest_resp, "note": note_search}) + + if not (success_search and selected_candidate): + results.append( + { + "tool": "HarvestCandidateTesterTool", + "success": False, + "response": {"error": "No harvest candidate available"}, + "note": "Skipped testing", + } + ) + return results + + candidate = selected_candidate + print("\nTesting harvest candidate via HarvestCandidateTesterTool...") + success_probe, probe_resp = call_tool( + tu, + "HarvestCandidateTesterTool", + candidate=candidate, + ) + probe_note = None + if success_probe: + status = (probe_resp.get("test") or {}).get("status") + probe_note = f"Probe status {status}" + print(f" - Probe preview: {preview_json(probe_resp)}") + else: + print(f" - Probe failure payload: {preview_json(probe_resp)}") + results.append({"tool": "HarvestCandidateTesterTool", "success": success_probe, "response": probe_resp, "note": probe_note}) + + if not (success_probe and probe_resp.get("ok")): + print("Skipping registration because candidate probe failed.") + results.append( + { + "tool": "VerifiedSourceRegisterTool", + "success": False, + "response": {"error": "Probe failed"}, + "note": None, + } + ) + return results + + host_slug = _slugify_host(_extract_host(candidate)) + tool_name = f"HarvestDemo_{host_slug[:40]}" + + print("\nRegistering candidate with VerifiedSourceRegisterTool...") + success_reg, register_resp = call_tool( + tu, + "VerifiedSourceRegisterTool", + tool_name=tool_name, + candidate=candidate, + ) + note_reg = None + if success_reg: + config = (register_resp or {}).get("config") or {} + base_url = (config.get("fields") or {}).get("base_url") or config.get("endpoint") + note_reg = f"Registered tool pointing to {base_url}" + print(f" - Registered config preview: {preview_json(config)}") + else: + print(f" - Registration failure payload: {preview_json(register_resp)}") + results.append( + { + "tool": "VerifiedSourceRegisterTool", + "success": success_reg, + "response": register_resp, + "note": note_reg, + } + ) + + if not success_reg: + return results + + print("\nCalling newly registered tool...") + tu.load_tools(include_tools=[tool_name]) + success_run, run_resp = call_tool(tu, tool_name) + note_run = None + if success_run: + preview = preview_json(run_resp) + 
note_run = f"Preview: {preview}" + print(f" - Run result preview: {preview}") + else: + print(f" - Run failure payload: {preview_json(run_resp)}") + results.append({"tool": tool_name, "success": success_run, "response": run_resp, "note": note_run}) + + print("\nCleaning up registered tool...") + success_rm, rm_resp = call_tool( + tu, + "VerifiedSourceRemoveTool", + tool_name=tool_name, + ) + note_rm = "Removed from catalog" if success_rm else None + if success_rm: + print(f" - Removal confirmation: {preview_json(rm_resp)}") + else: + print(f" - Removal failure payload: {preview_json(rm_resp)}") + results.append({"tool": "VerifiedSourceRemoveTool", "success": success_rm, "response": rm_resp, "note": note_rm}) + + return results + + +def main(): + parser = argparse.ArgumentParser(description="Run ToolUniverse end-to-end demo.") + parser.add_argument("--skip-network-tools", action="store_true", help="Skip tools that require external HTTP APIs.") + parser.add_argument("--skip-vsd", action="store_true", help="Skip harvest/register/run VSD demonstration.") + parser.add_argument("--medtok-host", default="127.0.0.1") + parser.add_argument("--medtok-port", type=int, default=8910) + parser.add_argument("--medlog-host", default="127.0.0.1") + parser.add_argument("--collector-port", type=int, default=8911) + parser.add_argument("--fhir-port", type=int, default=8912) + args = parser.parse_args() + + medtok_ctx = None + medlog_ctx = None + all_results: List[Dict[str, str]] = [] + + try: + print("Starting MedTok service...") + medtok_ctx = start_medtok(args.medtok_host, args.medtok_port) + print(f"MedTok running at {os.environ['MEDTOK_BASE_URL']}") + + print("Starting MedLog services...") + medlog_ctx = start_medlog(args.medlog_host, args.collector_port, args.fhir_port) + print( + f"MedLog collector at {os.environ['MEDLOG_COLLECTOR_BASE_URL']}, " + f"FHIR bridge at {os.environ['MEDLOG_FHIR_BASE_URL']}" + ) + + tu = ToolUniverse(hooks_enabled=False) + tu.load_tools(tool_type=["medtok", "medlog"]) + + print("\nRunning MedTok demo calls...") + all_results.extend(run_medtok_demo(tu)) + + print("\nRunning MedLog demo calls...") + all_results.extend(run_medlog_demo(tu)) + + if not args.skip_network_tools: + print("\nLoading network-enabled tools (InterPro, KEGG, IUCN, etc.)...") + categories = [ + "interpro", + "kegg", + "iucn_red_list", + "jaspar", + "marine_species", + "cbioportal", + "phenome_jax", + ] + try: + tu.load_tools(tool_type=categories) + except Exception as exc: # pylint: disable=broad-except + print(f"[WARN] Failed to load network tool categories: {exc}") + else: + print("Running network tool calls...") + all_results.extend(run_network_tools(tu)) + else: + print("\nSkipping external network tools.") + + if not args.skip_vsd: + print("\nHarvest -> Register -> Run walkthrough...") + vsd_results = run_vsd_demo(tu) + all_results.extend(vsd_results) + else: + print("\nSkipping VSD harvest/register/run demo.") + + finally: + if medtok_ctx: + print("\nStopping MedTok service...") + stop_medtok(medtok_ctx) + if medlog_ctx: + print("Stopping MedLog services...") + stop_medlog(medlog_ctx) + + print("\n================ Demo Summary ================") + failures = [r for r in all_results if not r["success"]] + for result in all_results: + status = "PASS" if result["success"] else "FAIL" + print(f"{status:4} | {result['tool']}") + note = result.get("note") + if note: + print(f" {note}") + if not result["success"]: + print(f" -> {result['response']}") + print("=============================================") + 
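+    # Exit non-zero when any tool call failed so shell callers and CI can
+    # detect regressions.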
+ if failures: + print(f"{len(failures)} tool calls failed.") + sys.exit(1) + print("All tool calls succeeded.") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_new_tools.py b/scripts/run_new_tools.py new file mode 100644 index 00000000..7ec93959 --- /dev/null +++ b/scripts/run_new_tools.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +""" +Smoke-test the newly added bioscience tools against their live APIs. + +The script imports the tool classes directly (without loading the full +ToolUniverse package) and prints representative responses. It respects +the following environment variables when present: + + IUCN_RED_LIST_TOKEN + CBIOPORTAL_API_TOKEN +""" + +from __future__ import annotations + +import os +import pprint +import sys +from pathlib import Path + + +def _bootstrap_path() -> None: + """Ensure the local src/ directory is importable.""" + repo_root = Path(__file__).resolve().parent.parent + src_path = repo_root / "src" + if str(src_path) not in sys.path: + sys.path.insert(0, str(src_path)) + os.environ.setdefault("TOOLUNIVERSE_LIGHT_IMPORT", "true") + + +def main() -> None: + _bootstrap_path() + + from tooluniverse.interpro_tool import InterProTool + from tooluniverse.kegg_tool import KEGGTool + from tooluniverse.iucn_tool import IUCNRedListTool + from tooluniverse.jaspar_tool import JASPARRestTool + from tooluniverse.marine_species_tool import MarineSpeciesTool + from tooluniverse.cbioportal_tool import CBioPortalTool + from tooluniverse.phenome_jax_tool import PhenomeJaxTool + + results = { + "interpro": InterProTool({"name": "InterPro_search_entries"}).run( + {"query": "kinase", "page_size": 2} + ), + "kegg": KEGGTool({"name": "KEGG_find_entries"}).run( + {"query": "glucose", "database": "pathway", "max_results": 2} + ), + "jaspar": JASPARRestTool({"name": "JASPAR_search_motifs"}).run( + {"query": "Arnt", "page_size": 2} + ), + "marine_species": MarineSpeciesTool({"name": "MarineSpecies_lookup"}).run( + {"scientific_name": "Delphinus delphis", "like": True} + ), + "cbioportal": CBioPortalTool({"name": "cBioPortal_search_studies"}).run( + {"keyword": "breast", "page_size": 2} + ), + "phenome_jax": PhenomeJaxTool({"name": "PhenomeJax_list_projects"}).run( + {"keyword": "glucose", "limit": 2} + ), + } + + try: + results["iucn"] = IUCNRedListTool({"name": "IUCN_get_species_status"}).run( + {"species": "Panthera leo"} + ) + except Exception as exc: # pragma: no cover - best-effort reporting + results["iucn"] = {"error": str(exc)} + + for key, value in results.items(): + print(f"=== {key.upper()} ===") + pprint.pprint(value) + print() + + +if __name__ == "__main__": + main() diff --git a/src/tooluniverse/__init__.py b/src/tooluniverse/__init__.py index bed8e3f3..658b7cdd 100644 --- a/src/tooluniverse/__init__.py +++ b/src/tooluniverse/__init__.py @@ -278,6 +278,18 @@ def __getattr__(self, name): from .core_tool import CoreTool from .pmc_tool import PMCTool from .zenodo_tool import ZenodoTool + from . import vsd_tool # registers VerifiedSourceDiscoveryTool + VerifiedSourceRegisterTool + from . import vsd_api_tool # registers GenericRESTTool + GenericGraphQLTool + from . import context_keeper_tool # registers ContextKeeperTool + from . import candidate_tester_tool # registers HarvestCandidateTesterTool + from . import tool_navigator_tool # registers ToolNavigatorTool + from . import interpro_tool + from . import kegg_tool + from . import iucn_tool + from . import jaspar_tool + from . import marine_species_tool + from . import cbioportal_tool + from . 
import phenome_jax_tool else: # With lazy loading, create lazy import proxies that import modules only when accessed MonarchTool = _LazyImportProxy("restful_tool", "MonarchTool") @@ -368,6 +380,13 @@ def __getattr__(self, name): CellosaurusGetCellLineInfoTool = _LazyImportProxy( "cellosaurus_tool", "CellosaurusGetCellLineInfoTool" ) + InterProTool = _LazyImportProxy("interpro_tool", "InterProTool") + KEGGTool = _LazyImportProxy("kegg_tool", "KEGGTool") + IUCNRedListTool = _LazyImportProxy("iucn_tool", "IUCNRedListTool") + JASPARRestTool = _LazyImportProxy("jaspar_tool", "JASPARRestTool") + MarineSpeciesTool = _LazyImportProxy("marine_species_tool", "MarineSpeciesTool") + CBioPortalTool = _LazyImportProxy("cbioportal_tool", "CBioPortalTool") + PhenomeJaxTool = _LazyImportProxy("phenome_jax_tool", "PhenomeJaxTool") # Literature search tools ArXivTool = _LazyImportProxy("arxiv_tool", "ArXivTool") CrossrefTool = _LazyImportProxy("crossref_tool", "CrossrefTool") @@ -453,6 +472,10 @@ def __getattr__(self, name): "ODPHPItemList", "ODPHPTopicSearch", "ODPHPOutlinkFetch", + "ContextKeeperTool", + "HarvestCandidateTesterTool", + "GenericHarvestTool", + "ToolNavigatorTool", "CellosaurusSearchTool", "CellosaurusQueryConverterTool", "CellosaurusGetCellLineInfoTool", diff --git a/src/tooluniverse/candidate_tester_tool.py b/src/tooluniverse/candidate_tester_tool.py new file mode 100644 index 00000000..24b742d9 --- /dev/null +++ b/src/tooluniverse/candidate_tester_tool.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from typing import Any, Dict, Optional + +from .tool_registry import register_tool +from .vsd_utils import build_config, probe_config + +HARVEST_CANDIDATE_TESTER_SCHEMA = { + "type": "object", + "properties": { + "candidate": {"type": "object"}, + "tool_type": {"type": "string", "default": "dynamic_rest"}, + "default_params": {"type": "object"}, + "default_headers": {"type": "object"}, + }, + "required": ["candidate"], + "additionalProperties": False, +} + +HARVEST_CANDIDATE_TESTER_CONFIG = { + "name": "HarvestCandidateTesterTool", + "description": "Probe a harvest/VSD candidate endpoint and report JSON readiness without registering it.", + "type": "HarvestCandidateTesterTool", + "category": "special_tools", + "parameter": HARVEST_CANDIDATE_TESTER_SCHEMA, +} + + +@register_tool("HarvestCandidateTesterTool", config=HARVEST_CANDIDATE_TESTER_CONFIG) +class HarvestCandidateTesterTool: + """ + Validate harvest/VSD candidates without registering them. + Returns HTTP diagnostics and suggestions for default params or headers. + """ + + name = "HarvestCandidateTesterTool" + description = "Test a harvest candidate endpoint to see if it returns usable JSON." 
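+    # Example arguments (hypothetical endpoint, shown for illustration only):
+    #   {"candidate": {"url": "https://api.example.org/v1/ping"}}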
+ input_schema = HARVEST_CANDIDATE_TESTER_SCHEMA + + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + candidate = arguments.get("candidate") or {} + tool_type = arguments.get("tool_type") or "dynamic_rest" + default_params = arguments.get("default_params") + default_headers = arguments.get("default_headers") + + cfg = build_config( + candidate, + tool_type=tool_type, + default_params=default_params, + default_headers=default_headers, + ) + probe = probe_config(cfg) + + return { + "ok": bool(probe.get("ok")), + "test": probe, + "config": cfg, + } diff --git a/src/tooluniverse/cbioportal_tool.py b/src/tooluniverse/cbioportal_tool.py new file mode 100644 index 00000000..93a86a2f --- /dev/null +++ b/src/tooluniverse/cbioportal_tool.py @@ -0,0 +1,74 @@ +import os +from typing import Any, Dict, List + +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + +CBIOPORTAL_BASE_URL = "https://www.cbioportal.org/api" +CBIOPORTAL_TOKEN_ENV = "CBIOPORTAL_API_TOKEN" +REQUEST_TIMEOUT = 30 + + +@register_tool("CBioPortalTool") +class CBioPortalTool(BaseTool): + """ + Wrapper around the cBioPortal REST API for study discovery. + """ + + def __init__(self, tool_config): + super().__init__(tool_config) + self.session = requests.Session() + + def _headers(self, arguments: Dict[str, Any]) -> Dict[str, str]: + headers = {"Accept": "application/json"} + token = ( + arguments.get("token") + or self.tool_config.get("token") + or os.getenv(CBIOPORTAL_TOKEN_ENV) + ) + if token: + headers["X-Auth-Token"] = token + return headers + + def run(self, arguments): + keyword = (arguments or {}).get("keyword") or (arguments or {}).get("query") + if not keyword: + return {"error": "Missing required parameter: keyword"} + + page_size = int( + (arguments or {}).get("page_size") + or self.tool_config.get("page_size", 20) + ) + page_number = int((arguments or {}).get("page") or 0) + + params = { + "keyword": keyword, + "pageSize": max(page_size, 1), + "pageNumber": max(page_number, 0), + "projection": "SUMMARY", + } + + response = self.session.get( + f"{CBIOPORTAL_BASE_URL}/studies", + params=params, + headers=self._headers(arguments or {}), + timeout=REQUEST_TIMEOUT, + ) + response.raise_for_status() + payload = response.json() + + results: List[Dict[str, Any]] = [] + for item in payload: + results.append( + { + "studyId": item.get("studyId"), + "name": item.get("name"), + "description": item.get("description"), + "cancerTypeId": item.get("cancerTypeId"), + "publicStudy": item.get("publicStudy"), + } + ) + + return {"results": results, "returned": len(results)} diff --git a/src/tooluniverse/common_utils.py b/src/tooluniverse/common_utils.py new file mode 100644 index 00000000..8fdb5d85 --- /dev/null +++ b/src/tooluniverse/common_utils.py @@ -0,0 +1,30 @@ + +import os, json, time, threading, base64, io +from typing import Any, Dict, Tuple + +_LOCK = threading.Lock() + +def ensure_dir(path: str): + os.makedirs(path, exist_ok=True) + +def vsd_generated_path() -> str: + base = os.environ.get("TOOLUNIVERSE_VSD_DIR") or os.path.join(os.path.expanduser("~"), ".tooluniverse", "vsd") + ensure_dir(base) + return os.path.join(base, "generated_tools.json") + +def read_json(path: str, default): + try: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception: + return default + +def write_json(path: str, data: Any): + 
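+    """Write JSON atomically: dump to a .tmp sibling, then os.replace it into place."""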
ensure_dir(os.path.dirname(path)) + tmp_path = f"{path}.tmp" + with open(tmp_path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2) + os.replace(tmp_path, path) + +def b64_png(png_bytes: bytes) -> str: + return base64.b64encode(png_bytes).decode("ascii") diff --git a/src/tooluniverse/context_keeper_tool.py b/src/tooluniverse/context_keeper_tool.py new file mode 100644 index 00000000..46dd2b0c --- /dev/null +++ b/src/tooluniverse/context_keeper_tool.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +import json +import os +from typing import Any, Dict, Optional + +from .tool_registry import register_tool + +CONTEXT_DIR = os.path.join(os.path.expanduser("~"), ".tooluniverse", "context") +CONTEXT_PATH = os.path.join(CONTEXT_DIR, "context.json") + + +def _ensure_dir() -> None: + os.makedirs(CONTEXT_DIR, exist_ok=True) + + +def _load_context() -> Dict[str, Any]: + if not os.path.exists(CONTEXT_PATH): + return {} + try: + with open(CONTEXT_PATH, "r", encoding="utf-8") as handle: + data = json.load(handle) + if isinstance(data, dict): + return data + except Exception: + pass + return {} + + +def _write_context(data: Dict[str, Any]) -> None: + _ensure_dir() + tmp_path = f"{CONTEXT_PATH}.tmp" + with open(tmp_path, "w", encoding="utf-8") as handle: + json.dump(data, handle, indent=2, ensure_ascii=False) + os.replace(tmp_path, CONTEXT_PATH) + + +@register_tool("ContextKeeperTool") +class ContextKeeperTool: + """ + Lightweight context store that agents can use to persist conversation or task state + between ToolUniverse calls. Data is saved under ~/.tooluniverse/context/context.json. + """ + + name = "ContextKeeperTool" + description = "Persist or retrieve task context (key/value pairs) for ongoing agent workflows." + input_schema = { + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["get", "set", "append", "clear", "keys"], + "default": "get", + }, + "key": {"type": "string", "description": "Context entry name"}, + "value": { + "description": "Value to store; for append operations this should be a list item.", + }, + }, + "additionalProperties": False, + } + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + action = (arguments.get("action") or "get").lower() + key: Optional[str] = arguments.get("key") + value: Any = arguments.get("value") + + context = _load_context() + + if action == "keys": + return {"ok": True, "keys": sorted(context.keys())} + + if action == "clear": + if key: + removed = context.pop(key, None) is not None + _write_context(context) + return {"ok": removed, "cleared": key if removed else None} + context.clear() + _write_context(context) + return {"ok": True, "cleared": "all"} + + if action == "set": + if key is None: + return {"ok": False, "error": "key is required for set"} + context[key] = value + _write_context(context) + return {"ok": True, "key": key, "value": value} + + if action == "append": + if key is None: + return {"ok": False, "error": "key is required for append"} + existing = context.get(key) + if existing is None: + context[key] = [value] + elif isinstance(existing, list): + existing.append(value) + else: + context[key] = [existing, value] + _write_context(context) + return {"ok": True, "key": key, "value": context[key]} + + # default: get + if key: + return {"ok": True, "key": key, "value": context.get(key)} + return {"ok": True, "value": context} diff --git a/src/tooluniverse/data/cbioportal_tools.json b/src/tooluniverse/data/cbioportal_tools.json new file mode 100644 index 00000000..2cdc982b --- 
/dev/null +++ b/src/tooluniverse/data/cbioportal_tools.json @@ -0,0 +1,34 @@ +[ + { + "type": "CBioPortalTool", + "name": "cBioPortal_search_studies", + "description": "Search cBioPortal studies by keyword (supports optional API token).", + "parameter": { + "type": "object", + "properties": { + "keyword": { + "type": "string", + "description": "Keyword to search in study identifiers and descriptions." + }, + "page": { + "type": "integer", + "description": "Zero-based page index.", + "default": 0, + "minimum": 0 + }, + "page_size": { + "type": "integer", + "description": "Number of records per page.", + "default": 20, + "minimum": 1, + "maximum": 1000 + }, + "token": { + "type": "string", + "description": "Optional API token (falls back to CBIOPORTAL_API_TOKEN environment variable)." + } + }, + "required": ["keyword"] + } + } +] diff --git a/src/tooluniverse/data/interpro_tools.json b/src/tooluniverse/data/interpro_tools.json new file mode 100644 index 00000000..bb3de457 --- /dev/null +++ b/src/tooluniverse/data/interpro_tools.json @@ -0,0 +1,30 @@ +[ + { + "type": "InterProTool", + "name": "InterPro_search_entries", + "description": "Search InterPro entries by keyword using the official REST API.", + "parameter": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Keyword or accession to search within InterPro." + }, + "page": { + "type": "integer", + "description": "Results page number (1-indexed).", + "default": 1, + "minimum": 1 + }, + "page_size": { + "type": "integer", + "description": "Number of records per page (max 200).", + "default": 25, + "minimum": 1, + "maximum": 200 + } + }, + "required": ["query"] + } + } +] diff --git a/src/tooluniverse/data/iucn_tools.json b/src/tooluniverse/data/iucn_tools.json new file mode 100644 index 00000000..8f1b07b8 --- /dev/null +++ b/src/tooluniverse/data/iucn_tools.json @@ -0,0 +1,21 @@ +[ + { + "type": "IUCNRedListTool", + "name": "IUCN_get_species_status", + "description": "Retrieve conservation status information from the IUCN Red List (requires IUCN_RED_LIST_TOKEN).", + "parameter": { + "type": "object", + "properties": { + "species": { + "type": "string", + "description": "Scientific name of the species (spaces allowed)." + }, + "token": { + "type": "string", + "description": "Optional API token; falls back to IUCN_RED_LIST_TOKEN environment variable." + } + }, + "required": ["species"] + } + } +] diff --git a/src/tooluniverse/data/jaspar_tools.json b/src/tooluniverse/data/jaspar_tools.json new file mode 100644 index 00000000..b596865a --- /dev/null +++ b/src/tooluniverse/data/jaspar_tools.json @@ -0,0 +1,38 @@ +[ + { + "type": "JASPARRestTool", + "name": "JASPAR_search_motifs", + "description": "Search transcription factor binding motifs in the JASPAR database.", + "parameter": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search term (matrix ID, gene symbol, or motif name)." + }, + "tax_group": { + "type": "string", + "description": "Optional taxonomic group filter (e.g., vertebrates)." + }, + "collection": { + "type": "string", + "description": "Optional collection filter (e.g., CORE, UNVALIDATED)." 
+ }, + "page": { + "type": "integer", + "description": "Page number (1-indexed).", + "default": 1, + "minimum": 1 + }, + "page_size": { + "type": "integer", + "description": "Number of results per page.", + "default": 10, + "minimum": 1, + "maximum": 100 + } + }, + "required": ["query"] + } + } +] diff --git a/src/tooluniverse/data/kegg_tools.json b/src/tooluniverse/data/kegg_tools.json new file mode 100644 index 00000000..b20436be --- /dev/null +++ b/src/tooluniverse/data/kegg_tools.json @@ -0,0 +1,27 @@ +[ + { + "type": "KEGGTool", + "name": "KEGG_find_entries", + "description": "Find KEGG entries matching a query within a selected database.", + "parameter": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search term (e.g., gene symbol, compound name)." + }, + "database": { + "type": "string", + "description": "KEGG database name (pathway, compound, gene, etc.).", + "default": "pathway" + }, + "max_results": { + "type": "integer", + "description": "Optional limit on number of results to return.", + "minimum": 1 + } + }, + "required": ["query"] + } + } +] diff --git a/src/tooluniverse/data/marine_species_tools.json b/src/tooluniverse/data/marine_species_tools.json new file mode 100644 index 00000000..0d92a269 --- /dev/null +++ b/src/tooluniverse/data/marine_species_tools.json @@ -0,0 +1,27 @@ +[ + { + "type": "MarineSpeciesTool", + "name": "MarineSpecies_lookup", + "description": "Lookup marine taxa using the World Register of Marine Species (WoRMS).", + "parameter": { + "type": "object", + "properties": { + "scientific_name": { + "type": "string", + "description": "Scientific name to search (exact or partial when like=true)." + }, + "like": { + "type": "boolean", + "description": "Use fuzzy matching when true.", + "default": true + }, + "marine_only": { + "type": "boolean", + "description": "Restrict results to marine taxa only.", + "default": true + } + }, + "required": ["scientific_name"] + } + } +] diff --git a/src/tooluniverse/data/medlog_tools.json b/src/tooluniverse/data/medlog_tools.json new file mode 100644 index 00000000..bf5799ce --- /dev/null +++ b/src/tooluniverse/data/medlog_tools.json @@ -0,0 +1,134 @@ +[ + { + "name": "MedLog_init_event", + "description": "Initialize or overwrite a MedLog event record. Supply the 9-field MedLog payload to capture headers, inputs, identities, and initial artifacts.", + "type": "MedLogInitEventTool", + "parameter": { + "type": "object", + "properties": { + "header": { + "type": "object", + "description": "MedLog header block including event_id, timestamps, risk metadata, and parent relationships." + }, + "model_instance": { + "type": "object", + "description": "Model provenance metadata (model name, version, risk posture, vendor, etc.)." + }, + "user_identity": { + "type": "object", + "description": "Information about the requesting user, clinician, or agent." + }, + "target_identity": { + "type": "object", + "description": "Optional target entity such as patient or device identifiers." + }, + "inputs": { + "type": "object", + "description": "Structured input payload captured at initialization." + }, + "retention_tier": { + "type": "string", + "description": "Retention tier label (steady, critical, transient, etc.)." 
+ } + }, + "required": ["header", "model_instance", "user_identity"] + } + }, + { + "name": "MedLog_append_fragment", + "description": "Append outputs, outcomes, artifacts, or feedback fragments to an existing MedLog event.", + "type": "MedLogAppendFragmentTool", + "parameter": { + "type": "object", + "properties": { + "event_id": { + "type": "string", + "description": "Identifier of the event to update." + }, + "fragment": { + "type": "object", + "description": "Fragment payload containing any of internal_artifacts, outputs, outcomes, or user_feedback." + } + }, + "required": ["event_id", "fragment"] + } + }, + { + "name": "MedLog_get_provenance", + "description": "Fetch PROV-JSON bundle for a given event to support audit trails and lineage review.", + "type": "MedLogGetProvenanceTool", + "parameter": { + "type": "object", + "properties": { + "event_id": { + "type": "string", + "description": "Identifier of the event to retrieve." + } + }, + "required": ["event_id"] + } + }, + { + "name": "MedLog_query_events", + "description": "Query MedLog events by run or event identifier. Useful for dashboarding, analytics, and sampling inspection.", + "type": "MedLogQueryEventsTool", + "parameter": { + "type": "object", + "properties": { + "run_id": { + "type": "string", + "description": "Optional run identifier to filter results." + }, + "event_id": { + "type": "string", + "description": "Optional event identifier to narrow results." + }, + "limit": { + "type": "integer", + "description": "Maximum number of rows to return (default 50).", + "minimum": 1, + "maximum": 500 + } + } + } + }, + { + "name": "MedLog_export_parquet", + "description": "Trigger MedLog parquet export to the configured artifact directory.", + "type": "MedLogExportParquetTool", + "parameter": { + "type": "object", + "properties": {} + } + }, + { + "name": "MedLog_fhir_bundle", + "description": "Retrieve the FHIR bundle synthesised for an individual MedLog event (Patient, Practitioner, Device, AuditEvent, Observations, Documents).", + "type": "MedLogFHIRBundleTool", + "parameter": { + "type": "object", + "properties": { + "event_id": { + "type": "string", + "description": "Identifier of the event to export." + } + }, + "required": ["event_id"] + } + }, + { + "name": "MedLog_fhir_run_bundle", + "description": "Aggregate all events in a run into a consolidated FHIR bundle for care-path review.", + "type": "MedLogFHIRRunBundleTool", + "parameter": { + "type": "object", + "properties": { + "run_id": { + "type": "string", + "description": "Run identifier to export." 
+ } + }, + "required": ["run_id"] + } + } +] diff --git a/src/tooluniverse/data/medtok_mcp_tools.json b/src/tooluniverse/data/medtok_mcp_tools.json new file mode 100644 index 00000000..fef79cbf --- /dev/null +++ b/src/tooluniverse/data/medtok_mcp_tools.json @@ -0,0 +1,11 @@ +[ + { + "name": "mcp_auto_loader_medtok", + "description": "Discover and register MedTok tools from a running MedTok MCP server so they can be invoked directly through ToolUniverse.", + "type": "MCPAutoLoaderTool", + "server_url": "http://${MEDTOK_MCP_SERVER_HOST}:9001/mcp", + "tool_prefix": "medtok_", + "auto_register": true, + "required_api_keys": ["MEDTOK_MCP_SERVER_HOST"] + } +] diff --git a/src/tooluniverse/data/medtok_tools.json b/src/tooluniverse/data/medtok_tools.json new file mode 100644 index 00000000..c54fe67b --- /dev/null +++ b/src/tooluniverse/data/medtok_tools.json @@ -0,0 +1,134 @@ +[ + { + "name": "MedTok_tokenize", + "description": "Tokenize one or more medical codes using the MedTok multimodal tokenizer. Useful for exposing token IDs and optional metadata to downstream workflows.", + "type": "MedTokTokenizeTool", + "parameter": { + "type": "object", + "properties": { + "codes": { + "type": "array", + "items": { "type": "string" }, + "description": "List of codes to tokenize (e.g., ICD-10 identifiers)." + }, + "system": { + "type": "string", + "description": "Coding system, defaults to ICD-10." + }, + "include_metadata": { + "type": "boolean", + "description": "Return region-level metadata for each code." + } + }, + "required": ["codes"] + } + }, + { + "name": "MedTok_embed", + "description": "Generate MedTok embeddings for a batch of codes. Returns floating-point vectors suitable for similarity search or downstream ML tasks.", + "type": "MedTokEmbedTool", + "parameter": { + "type": "object", + "properties": { + "codes": { + "type": "array", + "items": { "type": "string" }, + "description": "Codes to embed." + }, + "system": { + "type": "string", + "description": "Coding system, defaults to ICD-10." + } + }, + "required": ["codes"] + } + }, + { + "name": "MedTok_nearest_neighbors", + "description": "Retrieve the nearest neighbours for a code from the MedTok embedding space with similarity scores.", + "type": "MedTokNearestNeighborsTool", + "parameter": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Anchor code for the neighbourhood query." + }, + "system": { + "type": "string", + "description": "Coding system, defaults to ICD-10." + }, + "k": { + "type": "integer", + "description": "Number of neighbours to return (default 5).", + "minimum": 1, + "maximum": 50 + } + }, + "required": ["code"] + } + }, + { + "name": "MedTok_map_text_to_code", + "description": "Map free-text clinical language to the most relevant code using MedTok text semantics.", + "type": "MedTokMapTextTool", + "parameter": { + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "Clinical description or narrative." + }, + "system": { + "type": "string", + "description": "Target coding system, defaults to ICD-10." + } + }, + "required": ["text"] + } + }, + { + "name": "MedTok_search_text", + "description": "Hybrid text + semantic search over the MedTok vocabulary. Useful for exploratory lookup workflows.", + "type": "MedTokSearchTextTool", + "parameter": { + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "Query text to search for." 
+ }, + "system": { + "type": ["string", "null"], + "description": "Optional coding system filter." + }, + "k": { + "type": "integer", + "description": "Maximum number of matches (default 5).", + "minimum": 1, + "maximum": 50 + } + }, + "required": ["text"] + } + }, + { + "name": "MedTok_code_info", + "description": "Retrieve metadata for a specific code including synonyms and graph context when available.", + "type": "MedTokCodeInfoTool", + "parameter": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Code identifier to fetch." + }, + "system": { + "type": "string", + "description": "Coding system, defaults to ICD-10." + } + }, + "required": ["code"] + } + } +] diff --git a/src/tooluniverse/data/phenome_jax_tools.json b/src/tooluniverse/data/phenome_jax_tools.json new file mode 100644 index 00000000..2b3ad09c --- /dev/null +++ b/src/tooluniverse/data/phenome_jax_tools.json @@ -0,0 +1,23 @@ +[ + { + "type": "PhenomeJaxTool", + "name": "PhenomeJax_list_projects", + "description": "List Mouse Phenome Database projects with optional keyword filtering.", + "parameter": { + "type": "object", + "properties": { + "keyword": { + "type": "string", + "description": "Optional keyword to filter projects by title or description." + }, + "limit": { + "type": "integer", + "description": "Maximum number of projects to return.", + "default": 20, + "minimum": 1, + "maximum": 200 + } + } + } + } +] diff --git a/src/tooluniverse/data/vsd.json b/src/tooluniverse/data/vsd.json new file mode 100644 index 00000000..b359048e --- /dev/null +++ b/src/tooluniverse/data/vsd.json @@ -0,0 +1,35 @@ +[ + { + "name": "GenericHarvestTool", + "type": "GenericHarvestTool", + "description": "Live-harvest candidate API endpoints by invoking all modules in tooluniverse.harvest.", + "tool_type": "special_tools", + "enabled": true, + "visible": true, + "parameter": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Free-text hint, passed to all harvesters under tooluniverse.harvest." + }, + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "description": "Explicit candidate URLs to validate and return (skips live harvesting)." + }, + "limit": { + "type": "integer", + "minimum": 1, + "maximum": 50, + "default": 5, + "description": "Max number of candidates to return." 
+ } + }, + "additionalProperties": false + } + } +] diff --git a/src/tooluniverse/data/vsd_allowlist.json b/src/tooluniverse/data/vsd_allowlist.json new file mode 100644 index 00000000..3c5258a2 --- /dev/null +++ b/src/tooluniverse/data/vsd_allowlist.json @@ -0,0 +1,4 @@ +[ +{"domain": "ema.europa.eu", "label": "EMA", "trust": 0.95, "registry": "ema"}, +{"domain": "ghoapi.azureedge.net", "label": "WHO GHO", "trust": 0.92, "registry": "who"} +] \ No newline at end of file diff --git a/src/tooluniverse/data/vsd_tools.json b/src/tooluniverse/data/vsd_tools.json new file mode 100644 index 00000000..398cd86b --- /dev/null +++ b/src/tooluniverse/data/vsd_tools.json @@ -0,0 +1,34 @@ +[ + { + "type": "VerifiedSourceDiscoveryTool", + "name": "vsd_discover_sources", + "description": "Discover trusted candidate sources for a free-text query", + "parameter": { + "type": "object", + "required": ["query"], + "properties": { + "query": { "type": "string" }, + "limit": { "type": "integer" }, + "allowlist_overrides": { "type": "array" } + } + }, + "label": ["VSD", "Discovery"] + }, + { + "type": "VerifiedSourceRegisterTool", + "name": "vsd_register_tool", + "description": "Register a VSD-generated tool bound to a trusted source", + "parameter": { + "type": "object", + "required": ["candidate", "tool_name"], + "properties": { + "candidate": { "type": "object" }, + "tool_name": { "type": "string" }, + "description": { "type": "string" }, + "parameter_overrides": { "type": "object" }, + "evidence_sample": { "type": "object" } + } + }, + "label": ["VSD", "Synthesis"] + } +] diff --git a/src/tooluniverse/default_config.py b/src/tooluniverse/default_config.py index 46095834..b3a64a5e 100644 --- a/src/tooluniverse/default_config.py +++ b/src/tooluniverse/default_config.py @@ -70,6 +70,13 @@ "medlineplus": os.path.join(current_dir, "data", "medlineplus_tools.json"), "uniprot": os.path.join(current_dir, "data", "uniprot_tools.json"), "cellosaurus": os.path.join(current_dir, "data", "cellosaurus_tools.json"), + "interpro": os.path.join(current_dir, "data", "interpro_tools.json"), + "kegg": os.path.join(current_dir, "data", "kegg_tools.json"), + "iucn_red_list": os.path.join(current_dir, "data", "iucn_tools.json"), + "jaspar": os.path.join(current_dir, "data", "jaspar_tools.json"), + "marine_species": os.path.join(current_dir, "data", "marine_species_tools.json"), + "cbioportal": os.path.join(current_dir, "data", "cbioportal_tools.json"), + "phenome_jax": os.path.join(current_dir, "data", "phenome_jax_tools.json"), # 'software': os.path.join(current_dir, 'data', 'software_tools.json'), # Package tools - categorized software tools "software_bioinformatics": os.path.join( @@ -150,6 +157,11 @@ "genomics": os.path.join(current_dir, "data", "genomics_tools.json"), # Guideline and health policy tools "guidelines": os.path.join(current_dir, "data", "unified_guideline_tools.json"), + "medtok": os.path.join(current_dir, "data", "medtok_tools.json"), + "medtok_mcp_auto_loader": os.path.join( + current_dir, "data", "medtok_mcp_tools.json" + ), + "medlog": os.path.join(current_dir, "data", "medlog_tools.json"), } diff --git a/src/tooluniverse/dynamic_rest_runner.py b/src/tooluniverse/dynamic_rest_runner.py new file mode 100644 index 00000000..a3061d36 --- /dev/null +++ b/src/tooluniverse/dynamic_rest_runner.py @@ -0,0 +1,194 @@ +""" +Dynamic REST/GraphQL tool loader for Verified Source Directory (VSD). 
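+
+A minimal illustrative flow (names are defined in this module; the on-disk
+location is typically ~/.tooluniverse/vsd/generated_tools.json, written by
+tooluniverse.harvest.promoter):
+
+    from tooluniverse.dynamic_rest_runner import refresh_generated_registry
+    specs = refresh_generated_registry()  # reload {name: config} from disk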
+ +This module keeps an in-memory registry of generated tool specifications and +exposes helper functions for refreshing, inserting, or removing entries. Tools +are backed by lightweight BaseTool subclasses that issue HTTP requests using +the stored configuration. +""" + +from __future__ import annotations + +import json +import logging +import threading +from typing import Any, Dict, Optional + +import requests + +from .base_tool import BaseTool +from .common_utils import read_json, vsd_generated_path +from .tool_registry import register_config, register_tool + +LOGGER = logging.getLogger("DynamicRESTRunner") +_REGISTRY_LOCK = threading.Lock() +_GENERATED_TOOLS: Dict[str, Dict[str, Any]] = {} + + +def _normalize_spec(spec: Any) -> Dict[str, Dict[str, Any]]: + """ + Accept legacy list or dict formats and normalize to {name: config}. + """ + if isinstance(spec, dict): + if "generated_tools" in spec and isinstance(spec["generated_tools"], list): + return { + item.get("name"): dict(item) + for item in spec["generated_tools"] + if isinstance(item, dict) and item.get("name") + } + return { + name: dict(cfg) + for name, cfg in spec.items() + if isinstance(cfg, dict) + } + + if isinstance(spec, list): + result: Dict[str, Dict[str, Any]] = {} + for item in spec: + if isinstance(item, dict) and item.get("name"): + result[item["name"]] = dict(item) + return result + + return {} + + +def _load_generated_specs() -> Dict[str, Dict[str, Any]]: + path = vsd_generated_path() + data = read_json(path, {}) + return _normalize_spec(data) + + +def _build_request_kwargs(config: Dict[str, Any], arguments: Dict[str, Any]) -> Dict[str, Any]: + fields = config.get("fields", {}) + method = fields.get("method", "GET").upper() + timeout = fields.get("timeout", 30) + headers = fields.get("headers", {}) + default_params = fields.get("default_params", {}) + + params = dict(default_params) + body: Optional[Any] = None + + if method in {"GET", "DELETE"}: + params.update(arguments) + else: + if fields.get("body_format", "json") == "form": + body = dict(arguments) + else: + body = arguments or {} + + kwargs: Dict[str, Any] = { + "method": method, + "url": fields.get("base_url"), + "headers": headers, + "timeout": timeout, + } + if params: + kwargs["params"] = params + if body is not None: + if fields.get("body_format", "json") == "form": + kwargs["data"] = body + else: + kwargs["json"] = body + return kwargs + + +def _handle_response(response: requests.Response) -> Any: + try: + return response.json() + except ValueError: + return { + "status_code": response.status_code, + "text": response.text, + } + + +@register_tool("GenericRESTTool") +class GenericRESTTool(BaseTool): + """ + Generic REST tool generated from a VSD configuration. + """ + + def run(self, arguments=None, stream_callback=None, **_: Any): + arguments = arguments or {} + kwargs = _build_request_kwargs(self.tool_config, arguments) + method = kwargs.pop("method") + url = kwargs.pop("url") + + response = requests.request(method, url, **kwargs) + response.raise_for_status() + result = _handle_response(response) + + if stream_callback: + stream_callback(json.dumps(result)) + return result + + +@register_tool("GenericGraphQLTool") +class GenericGraphQLTool(BaseTool): + """ + Generic GraphQL tool generated from a VSD configuration. 
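+
+    Reads the following keys from the stored config's "fields" mapping (as
+    used in run() below): base_url, headers, timeout, default_query, and
+    default_variables.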
+ """ + + def run(self, arguments=None, stream_callback=None, **_: Any): + arguments = arguments or {} + fields = self.tool_config.get("fields", {}) + headers = fields.get("headers", {}) + timeout = fields.get("timeout", 30) + payload = { + "query": arguments.get("query") or fields.get("default_query"), + "variables": arguments.get("variables") or fields.get("default_variables", {}), + } + + response = requests.post( + fields.get("base_url"), + json=payload, + headers=headers, + timeout=timeout, + ) + response.raise_for_status() + result = _handle_response(response) + + if stream_callback: + stream_callback(json.dumps(result)) + return result + + +def _register_generated_tool(tool_name: str, config: Dict[str, Any]) -> None: + config = dict(config) + config.setdefault("name", tool_name) + tool_type = config.get("type") or "GenericRESTTool" + + register_config(tool_name, config) + _GENERATED_TOOLS[tool_name] = config + + LOGGER.debug("Registered generated tool %s of type %s", tool_name, tool_type) + + +def refresh_generated_registry() -> Dict[str, Dict[str, Any]]: + """ + Reload generated tool specs from disk and update the runtime registry. + """ + specs = _load_generated_specs() + with _REGISTRY_LOCK: + _GENERATED_TOOLS.clear() + for name, cfg in specs.items(): + _register_generated_tool(name, cfg) + return specs + + +def upsert_generated_tool(tool_name: str, config: Dict[str, Any]) -> Dict[str, Any]: + """ + Insert or update a generated tool in the runtime registry. + """ + with _REGISTRY_LOCK: + _register_generated_tool(tool_name, config) + return _GENERATED_TOOLS[tool_name] + + +def remove_generated_tool(tool_name: str) -> None: + """ + Remove a generated tool from the runtime registry. + """ + with _REGISTRY_LOCK: + _GENERATED_TOOLS.pop(tool_name, None) + LOGGER.debug("Removed generated tool %s", tool_name) diff --git a/src/tooluniverse/harvest/__init__.py b/src/tooluniverse/harvest/__init__.py new file mode 100644 index 00000000..19c21109 --- /dev/null +++ b/src/tooluniverse/harvest/__init__.py @@ -0,0 +1 @@ +# Harvest subpackage diff --git a/src/tooluniverse/harvest/domain_policies.py b/src/tooluniverse/harvest/domain_policies.py new file mode 100644 index 00000000..49031914 --- /dev/null +++ b/src/tooluniverse/harvest/domain_policies.py @@ -0,0 +1,59 @@ +from __future__ import annotations +from functools import lru_cache +from typing import Dict, List + +# Conservative allow/deny fragments. We still compute a trust score as a gradient. 
+ALLOWED_FRAGMENTS: List[str] = [
+    # government & intergovernmental
+    ".gov", ".mil", ".gob", ".gouv", ".go.", ".govt.nz", ".gc.ca",
+    "who.int", "worldbank.org", "oecd.org", "europa.eu", "esa.int",
+    # major scientific/health orgs
+    "nih.gov", "niddk.nih.gov", "ninds.nih.gov", "ncbi.nlm.nih.gov", "data.cdc.gov", "api.cdc.gov",
+    "fda.gov", "api.fda.gov", "epa.gov", "noaa.gov", "usgs.gov", "census.gov",
+    "data.gov", "healthdata.gov", "data.cms.gov", "data.hrsa.gov", "data.hhs.gov",
+    "ghoapi.azureedge.net",
+]
+
+BLOCKED_FRAGMENTS: List[str] = [
+    "mirror", "docshare", "scribd.com", "sharepdf", "academia.edu",
+    "stackprinter", "cachedview", "wayback", "pirated", "scrapeops",
+]
+
+@lru_cache(maxsize=4096)
+def domain_blocked(host: str) -> bool:
+    h = (host or "").lower()
+    return any(b in h for b in BLOCKED_FRAGMENTS)
+
+@lru_cache(maxsize=4096)
+def domain_allowed(host: str) -> bool:
+    # allow if any strong allow fragment is present AND the host is not blocked
+    h = (host or "").lower()
+    if domain_blocked(h):
+        return False
+    return any(a in h for a in ALLOWED_FRAGMENTS)
+
+@lru_cache(maxsize=4096)
+def trust_score(host: str) -> Dict:
+    """Return a graded trust score in [0,1] with reasons for ranking.
+    We don't *block* here (that's domain_blocked); we provide a signal for the ranker.
+    """
+    h = (host or "").lower()
+    score = 0.0
+    reasons: List[str] = []
+    if domain_blocked(h):
+        return {"score": 0.0, "reasons": ["blocked"]}
+
+    # strong positives
+    if any(tld in h for tld in (".gov", "who.int", "worldbank.org", "europa.eu", "oecd.org")):
+        score += 0.65
+        reasons.append("gov/igo domain")
+    if any(seg in h for seg in ("nih.gov","ncbi.nlm.nih.gov","fda.gov","epa.gov","noaa.gov","usgs.gov","census.gov")):
+        score += 0.2
+        reasons.append("major science/health org")
+    # medium positives (hostnames never contain a path, so check subdomain labels)
+    if h.startswith("api.") or ".api." in h:
+        score += 0.05
+        reasons.append("api host")
+    # slight boost for data portals
+    if any(seg in h for seg in ("data.gov","healthdata.gov","data.cms.gov","data.cdc.gov","data.europa.eu")):
+        score += 0.08
+        reasons.append("open data portal")
+
+    score = max(0.0, min(1.0, score))
+    return {"score": round(score, 3), "reasons": reasons}
\ No newline at end of file
diff --git a/src/tooluniverse/harvest/openapi_utils.py b/src/tooluniverse/harvest/openapi_utils.py
new file mode 100644
index 00000000..4adcddd0
--- /dev/null
+++ b/src/tooluniverse/harvest/openapi_utils.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+import re, logging
+from typing import Dict, Optional, List
+import requests
+
+logger = logging.getLogger("OpenAPIUtils")
+
+OPENAPI_HINTS = ["openapi.json","openapi.yaml","openapi.yml","swagger.json","swagger.yaml","v3/api-docs"]
+
+def _root_of(url: str) -> str:
+    base = url.split("?",1)[0]
+    base = re.sub(r"(#.*)$","", base)
+    base = re.sub(r"/+$","", base)
+    m = re.match(r"^https?://[^/]+", base)
+    return m.group(0) if m else base
+
+def find_openapi_from_url(any_url: str) -> Optional[str]:
+    root = _root_of(any_url)
+    # try /openapi.json etc.
at root and one level up + tries = [f"{root}/{hint}" for hint in OPENAPI_HINTS] + # also try without trailing /api segment if present + if root.endswith("/api"): + base = root.rsplit("/",1)[0] + tries.extend(f"{base}/{hint}" for hint in OPENAPI_HINTS) + for t in tries: + try: + r = requests.get(t, timeout=8) + if r.status_code == 200 and ("json" in r.headers.get("Content-Type","") or t.endswith(".json")): + # quick JSON sanity + try: + j = r.json() + if "openapi" in j or "swagger" in j: + return t + except Exception: + pass + if r.status_code == 200 and (t.endswith(".yaml") or t.endswith(".yml")): + return t + except requests.RequestException: + continue + return None + +def parse_openapi(spec_url: str) -> Dict: + r = requests.get(spec_url, timeout=15) + r.raise_for_status() + text = r.text + if spec_url.endswith((".yaml",".yml")): + try: + import yaml + except Exception as e: + raise RuntimeError("YAML support requires PyYAML: pip install pyyaml") from e + spec = yaml.safe_load(text) + else: + spec = r.json() + + servers = spec.get("servers") or [] + base_url = (servers[0].get("url") if servers and isinstance(servers[0], dict) else None) or None + + paths = spec.get("paths") or {} + endpoints: List[Dict] = [] + for path, methods in paths.items(): + if not isinstance(methods, dict): + continue + for method, meta in methods.items(): + if method.upper() not in ("GET","POST","PUT","PATCH","DELETE","OPTIONS","HEAD"): + continue + endpoints.append({"path": path, "method": method.upper(), "summary": (meta or {}).get("summary")}) + return {"base_url": base_url, "endpoints": endpoints} \ No newline at end of file diff --git a/src/tooluniverse/harvest/promoter.py b/src/tooluniverse/harvest/promoter.py new file mode 100644 index 00000000..6ef0d4d6 --- /dev/null +++ b/src/tooluniverse/harvest/promoter.py @@ -0,0 +1,101 @@ +from __future__ import annotations +import os, json, tempfile, shutil +from typing import Dict, Any, List + +# Where we persist generated tool configs so DynamicREST (or your server boot) +# can load them. Mirrors your earlier logs (~/.tooluniverse/vsd/generated_tools.json). +VSD_DIR = os.path.join(os.path.expanduser("~"), ".tooluniverse", "vsd") +VSD_PATH = os.path.join(VSD_DIR, "generated_tools.json") + +def _ensure_dir(): + os.makedirs(VSD_DIR, exist_ok=True) + +def _read_json(path: str) -> Any: + if not os.path.exists(path): + return {} + try: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) or {} + except Exception: + return {} + +def _atomic_write(path: str, data: Any): + tmp_fd, tmp_path = tempfile.mkstemp(prefix="vsd_", suffix=".json") + os.close(tmp_fd) + with open(tmp_path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + shutil.move(tmp_path, path) + +def _slug(host: str) -> str: + return (host or "unknown").lower().replace(".", "_").replace("-", "_") + +def build_candidate_tool_json(c: Dict[str, Any]) -> Dict[str, Any]: + # Minimal, UI-friendly payload for listing/debug + return { + "name": c.get("name"), + "host": c.get("host"), + "base_url": c.get("base_url"), + "doc_url": c.get("doc_url"), + "openapi_url": c.get("openapi_url"), + "endpoints": c.get("endpoints"), + "health": c.get("health"), + "cors": c.get("cors"), + "trust": c.get("trust"), + "source": c.get("source"), + "_rank_score": c.get("_rank_score"), + } + +def _dynamicrest_tool_config(c: Dict[str, Any]) -> Dict[str, Any]: + """Produce a DynamicREST-style tool definition. + Two modes: + - OpenAPI mode (preferred): reference spec URL. 
+ - Manual mode: infer a few GET endpoints from verification results. + """ + name = f"vsd_auto_{_slug(c.get('host') or '')}" + base_url = c.get("base_url") + openapi_url = c.get("openapi_url") + endpoints = c.get("endpoints") or [] + + cfg: Dict[str, Any] = { + "name": name, + "type": "DynamicREST", + "base_url": base_url, + "auth": c.get("auth") or {"type": "none"}, + "metadata": { + "source": c.get("source"), + "trust": c.get("trust"), + "health": c.get("health"), + "doc_url": c.get("doc_url"), + }, + } + if openapi_url: + cfg["openapi"] = {"spec_url": openapi_url} + elif endpoints: + # Trim to a handful of GET endpoints + routes: List[Dict[str, Any]] = [] + for ep in endpoints[:5]: + routes.append({ + "method": ep.get("method") or "GET", + "path": ep.get("path") or "/", + "name": (ep.get("summary") or ep.get("path") or "endpoint").strip("/").replace("/", "_")[:64] or "endpoint", + }) + cfg["routes"] = routes + else: + # Last resort: allow a generic GET on '/' + cfg["routes"] = [{"method": "GET", "path": "/"}] + return cfg + +def promote_to_dynamicrest(c: Dict[str, Any]) -> str: + """Append/Update the generated tool config file so your server can load it. + Returns the registered tool name. + """ + _ensure_dir() + current = _read_json(VSD_PATH) + if not isinstance(current, dict): + current = {} + + cfg = _dynamicrest_tool_config(c) + name = cfg.get("name") or "vsd_auto_unknown" + current[name] = cfg + _atomic_write(VSD_PATH, current) + return name \ No newline at end of file diff --git a/src/tooluniverse/harvest/query_expansion.py b/src/tooluniverse/harvest/query_expansion.py new file mode 100644 index 00000000..4ac4e959 --- /dev/null +++ b/src/tooluniverse/harvest/query_expansion.py @@ -0,0 +1,28 @@ + +from __future__ import annotations +from typing import List + +DENTAL_SYNONYMS = [ + "oral health", "dentistry", "dental caries", "tooth decay", + "periodontal", "periodontitis", "orthodontic", "endodontic", + "prosthodontic", "oral cancer", "DMFT", "fluoride", "NIDCR", "CDC Oral Health", + "WHO Oral Health" +] + +def expand_queries(query: str, max_queries: int = 6) -> List[str]: + base = query.strip() + if not base: + return [] + expanded = [base, + f"{base} WHO API", + f"{base} site:who.int", + f"{base} site:data.cdc.gov", + f"{base} site:api.fda.gov"] + for syn in DENTAL_SYNONYMS[:4]: + expanded.append(f"{base} {syn}") + # de-dup and clip + seen = [] + for q in expanded: + if q not in seen: + seen.append(q) + return seen[:max_queries] diff --git a/src/tooluniverse/harvest/ranker.py b/src/tooluniverse/harvest/ranker.py new file mode 100644 index 00000000..aa898ad1 --- /dev/null +++ b/src/tooluniverse/harvest/ranker.py @@ -0,0 +1,36 @@ +from __future__ import annotations +import math +from typing import List, Dict + +def _sim(a: str, b: str) -> float: + a,b = (a or "").lower(), (b or "").lower() + if not a or not b: + return 0.0 + aset, bset = set(a.split()), set(b.split()) + overlap = len(aset & bset) + return overlap / (len(aset) + 1e-6) + +def rank_candidates(query: str, candidates: List[Dict]) -> List[Dict]: + def score(c: Dict) -> float: + trust = float(((c.get("trust") or {}).get("score") or 0.0)) + h = c.get("health") or {} + live = 1.0 if (h.get("ok") and (h.get("status",0) < 500)) else 0.0 + lat = h.get("latency_ms") or 1500 + lat_norm = max(0.0, 1.0 - min(lat, 4000)/4000.0) + fit = max(_sim(query, c.get("name","")), _sim(query, c.get("doc_url",""))) + has_spec = 1.0 if c.get("openapi_url") else 0.2 if c.get("endpoints") else 0.0 + cors = 0.3 if (c.get("cors") or 
{}).get("preflight") else 0.0 + match_bonus = float(c.get("_match_score") or 0.0) + return ( + 0.25 * trust + + 0.2 * (live * lat_norm) + + 0.23 * fit + + 0.1 * has_spec + + 0.05 * cors + + (0.35 * math.log1p(match_bonus) if match_bonus > 0 else 0.0) + ) + + ranked = sorted(candidates, key=score, reverse=True) + for i, c in enumerate(ranked): + c["_rank_score"] = round(score(c), 4) + return ranked diff --git a/src/tooluniverse/harvest/searchers.py b/src/tooluniverse/harvest/searchers.py new file mode 100644 index 00000000..e9daf2e8 --- /dev/null +++ b/src/tooluniverse/harvest/searchers.py @@ -0,0 +1,64 @@ +from __future__ import annotations +import os, re, logging, requests, json +from dataclasses import dataclass +from typing import List, Optional, Dict, Any + +logger = logging.getLogger("HarvestSearch") +DEFAULT_TIMEOUT = int(os.getenv("HARVEST_TIMEOUT_S", "8")) + +@dataclass +class SearchResult: + title: str + url: str + snippet: str + source: str + +def _clean_host(url: str) -> str: + return re.sub(r"^https?://", "", url or "").split("/")[0].lower() + +def _normalize_candidate_url(url: str) -> str: + return (url or "").strip() + +# ---------------- CKAN adapter ---------------- +def _search_ckan(query: str, rows: int, base_url: str) -> List[SearchResult]: + out: List[SearchResult] = [] + try: + r = requests.get(base_url, params={"q": query, "rows": rows}, timeout=DEFAULT_TIMEOUT) + r.raise_for_status() + payload = r.json() + # CKAN payload guard + result = (payload or {}).get("result") or {} + for pkg in result.get("results", []): + title = pkg.get("title") or pkg.get("name") or "CKAN dataset" + notes = (pkg.get("notes") or "")[:240] + for res in (pkg.get("resources") or []): + res_url = _normalize_candidate_url(res.get("url") or "") + if not res_url: + continue + out.append(SearchResult(title=title, url=res_url, snippet=notes, source=f"ckan:{_clean_host(base_url)}")) + except Exception as e: + logger.debug("CKAN search failed for %s: %s", base_url, e) + return out + +CATALOG_ADAPTERS = { + "ckan": _search_ckan, +} + +def search_for_apis(query: str, rows: int = 100, catalogs: Optional[List[Dict[str, Any]]] = None) -> List[SearchResult]: + """Search across configured catalogs. + catalogs: list of dicts, e.g. [{"type": "ckan", "url": "https://.../api/3/action/package_search"}] + You can supply this via env HARVEST_CATALOGS='[ ... ]' or pass in directly. 
+ """ + results: List[SearchResult] = [] + catalogs = catalogs or [] + for cat in catalogs: + ctype = (cat.get("type") or "").lower().strip() + url = cat.get("url") or "" + if not ctype or not url: + continue + adapter = CATALOG_ADAPTERS.get(ctype) + if not adapter: + logger.debug("Unknown catalog type %s, skipping", ctype) + continue + results.extend(adapter(query=query, rows=rows, base_url=url)) + return results diff --git a/src/tooluniverse/harvest/static_catalog.py b/src/tooluniverse/harvest/static_catalog.py new file mode 100644 index 00000000..83536f94 --- /dev/null +++ b/src/tooluniverse/harvest/static_catalog.py @@ -0,0 +1,539 @@ +from __future__ import annotations + +import math +import re +from copy import deepcopy +from dataclasses import dataclass +from typing import Dict, Iterable, List, Set +from urllib.parse import urlparse + +from .domain_policies import trust_score +from .ranker import rank_candidates + + +# ----------------------------------------------------------------------------- +# Static catalog data +# ----------------------------------------------------------------------------- + +RAW_CATALOG: List[Dict[str, object]] = [ + { + "name": "ClinicalTrials.gov Study Fields API", + "url": "https://clinicaltrials.gov/api/query/study_fields", + "doc_url": "https://clinicaltrials.gov/api/gui/home", + "description": "Query structured fields from the ClinicalTrials.gov registry covering study design, enrollment, outcomes, and locations.", + "keywords": ["clinical", "trial", "study", "research", "ctgov", "clinicaltrials"], + "category": "clinical_trials", + "base_score": 0.95, + "endpoints": [ + {"method": "GET", "path": "/api/query/study_fields", "summary": "Query study fields"}, + {"method": "GET", "path": "/api/query/full_studies", "summary": "Fetch full study records"}, + ], + }, + { + "name": "NCI Clinical Trials API", + "url": "https://clinicaltrialsapi.cancer.gov/api/v1/clinical-trials", + "doc_url": "https://clinicaltrialsapi.cancer.gov", + "description": "REST API exposing cancer clinical trials curated by the National Cancer Institute (NCI) with filters across disease, stage, and therapy.", + "keywords": ["clinical", "trial", "oncology", "cancer", "nci", "research"], + "category": "clinical_trials", + "base_score": 0.88, + "endpoints": [ + {"method": "GET", "path": "/api/v1/clinical-trials", "summary": "Search cancer clinical trials"}, + {"method": "GET", "path": "/api/v1/diseases", "summary": "List disease terms"}, + ], + }, + { + "name": "FDA OpenFDA Drug Label API", + "url": "https://api.fda.gov/drug/label.json", + "doc_url": "https://open.fda.gov/apis/drug/label/", + "description": "OpenFDA drug labeling information with pharmacology, indications, warnings, and dosage guidance.", + "keywords": ["drug", "label", "fda", "pharmaceutical", "medication", "clinical"], + "category": "pharmacovigilance", + "base_score": 0.6, + "endpoints": [ + {"method": "GET", "path": "/drug/label.json", "summary": "Query drug labeling records"}, + {"method": "GET", "path": "/drug/event.json", "summary": "Retrieve drug adverse events"}, + ], + }, + { + "name": "FDA OpenFDA Adverse Events API", + "url": "https://api.fda.gov/drug/event.json", + "doc_url": "https://open.fda.gov/apis/drug/event/", + "description": "Adverse event case reports submitted to FDA FAERS with patient outcomes and drug role details.", + "keywords": ["adverse", "event", "pharmacovigilance", "drug safety", "faers"], + "category": "pharmacovigilance", + "base_score": 0.65, + "endpoints": [ + {"method": "GET", "path": 
"/drug/event.json", "summary": "Search FAERS adverse event data"}, + ], + }, + { + "name": "FDA OpenFDA Device Recall API", + "url": "https://api.fda.gov/device/recall.json", + "doc_url": "https://open.fda.gov/apis/device/recall/", + "description": "Medical device recall records including classification, recall reason, and event dates.", + "keywords": ["medical device", "recall", "fda", "safety", "compliance"], + "category": "device_safety", + "base_score": 0.55, + "endpoints": [ + {"method": "GET", "path": "/device/recall.json", "summary": "Retrieve device recall records"}, + ], + }, + { + "name": "CDC Socrata Open Data API", + "url": "https://data.cdc.gov/resource/9mfq-cb36.json", + "doc_url": "https://dev.socrata.com/foundry/data.cdc.gov/9mfq-cb36", + "description": "CDC curated datasets accessible via the Socrata Open Data API, including COVID-19 cases and vaccinations.", + "keywords": ["cdc", "public health", "covid", "vaccination", "socrata", "open data"], + "category": "public_health", + "base_score": 0.86, + "endpoints": [ + {"method": "GET", "path": "/resource/.json", "summary": "Query CDC open datasets"}, + ], + }, + { + "name": "CDC PLACES Community Health API", + "url": "https://chronicdata.cdc.gov/resource/cwsq-ngmh.json", + "doc_url": "https://dev.socrata.com/foundry/chronicdata.cdc.gov/cwsq-ngmh", + "description": "Model-based estimates for chronic disease, health risk factors, and preventive services at local levels; supports community health assessments and dental health overlays.", + "keywords": ["community health", "chronic disease", "behavioral health", "cdc", "oral health"], + "category": "public_health", + "base_score": 0.8, + "endpoints": [ + {"method": "GET", "path": "/resource/cwsq-ngmh.json", "summary": "Retrieve PLACES health estimates"}, + ], + }, + { + "name": "CDC Oral Health Data Portal API", + "url": "https://data.cdc.gov/resource/4nhi-4p9m.json", + "doc_url": "https://dev.socrata.com/foundry/data.cdc.gov/4nhi-4p9m", + "description": "Community oral health indicators including dental visits, sealant prevalence, and fluoridation coverage for dentistry analytics.", + "keywords": ["oral health", "dentistry", "dental", "fluoride", "sealant", "cdc"], + "category": "dentistry", + "base_score": 0.81, + "endpoints": [ + {"method": "GET", "path": "/resource/4nhi-4p9m.json", "summary": "Query oral health indicator records"}, + ], + }, + { + "name": "WHO Global Health Observatory API", + "url": "https://ghoapi.azureedge.net/api/Indicator", + "doc_url": "https://www.who.int/data/gho/info/gho-odata-api", + "description": "World Health Organization indicators covering global health metrics, vaccination, and disease burden.", + "keywords": ["who", "global health", "indicator", "vaccination", "disease surveillance"], + "category": "global_health", + "base_score": 0.87, + "endpoints": [ + {"method": "GET", "path": "/api/Indicator", "summary": "List WHO health indicators"}, + {"method": "GET", "path": "/api/Indicator?$filter", "summary": "Filter indicators by code"}, + ], + }, + { + "name": "NIH RePORTER Projects API", + "url": "https://api.reporter.nih.gov/v2/projects/search", + "doc_url": "https://api.reporter.nih.gov/", + "description": "NIH-funded research projects with abstracts, funding amounts, and investigator information.", + "keywords": ["nih", "grants", "research", "project", "biomedical"], + "category": "research_funding", + "base_score": 0.83, + "endpoints": [ + {"method": "POST", "path": "/v2/projects/search", "summary": "Search NIH-funded projects"}, + ], + }, 
+ { + "name": "NCBI E-utilities ESummary API", + "url": "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", + "doc_url": "https://www.ncbi.nlm.nih.gov/books/NBK25500/", + "description": "Programmatic access to NCBI databases including PubMed, nucleotide, protein, and ClinVar content.", + "keywords": ["ncbi", "genomics", "pubmed", "sequence", "biomedical"], + "category": "genomics", + "base_score": 0.84, + "endpoints": [ + {"method": "GET", "path": "/entrez/eutils/esearch.fcgi", "summary": "Search NCBI databases"}, + {"method": "GET", "path": "/entrez/eutils/esummary.fcgi", "summary": "Retrieve database summaries"}, + ], + }, + { + "name": "Ensembl REST API", + "url": "https://rest.ensembl.org/info/ping", + "doc_url": "https://rest.ensembl.org", + "description": "Genomics REST service for Ensembl data including genes, variants, and comparative genomics with JSON outputs.", + "keywords": ["ensembl", "genomics", "variants", "gene", "rest service", "bioinformatics"], + "category": "genomics", + "base_score": 0.8, + "endpoints": [ + {"method": "GET", "path": "/lookup/id/{id}", "summary": "Lookup Ensembl gene or transcript"}, + {"method": "GET", "path": "/overlap/region/{species}/{region}", "summary": "Fetch features overlapping a region"}, + ], + }, + { + "name": "SAMHSA Behavioral Health Treatment Services Locator API", + "url": "https://findtreatment.samhsa.gov/locator", + "doc_url": "https://findtreatment.samhsa.gov/developers", + "description": "Behavioral health treatment provider directory with search by service type, payment, and location.", + "keywords": ["mental health", "treatment", "behavioral health", "samhsa"], + "category": "mental_health", + "base_score": 0.81, + "endpoints": [ + {"method": "GET", "path": "/locator", "summary": "Search behavioral health providers"}, + ], + }, + { + "name": "USDA FoodData Central API", + "url": "https://api.nal.usda.gov/fdc/v1/foods/search", + "doc_url": "https://fdc.nal.usda.gov/api-guide.html", + "description": "Nutrient composition data for branded and experimental foods, with search and detail endpoints.", + "keywords": ["nutrition", "food", "dietary", "usda", "nutrients"], + "category": "nutrition", + "base_score": 0.79, + "endpoints": [ + {"method": "POST", "path": "/fdc/v1/foods/search", "summary": "Search foods by keyword"}, + {"method": "GET", "path": "/fdc/v1/food/{fdcId}", "summary": "Retrieve nutrient profile"}, + ], + }, + { + "name": "CDC Vaccination Coverage API", + "url": "https://data.cdc.gov/resource/8xkx-amqh.json", + "doc_url": "https://dev.socrata.com/foundry/data.cdc.gov/8xkx-amqh", + "description": "US vaccination coverage estimates by vaccine and demographic segment.", + "keywords": ["vaccination", "immunization", "cdc", "coverage", "public health"], + "category": "vaccination", + "base_score": 0.8, + "endpoints": [ + {"method": "GET", "path": "/resource/8xkx-amqh.json", "summary": "Vaccination coverage records"}, + ], + }, + { + "name": "NOAA Climate Data Online API", + "url": "https://www.ncdc.noaa.gov/cdo-web/api/v2/datasets", + "doc_url": "https://www.ncdc.noaa.gov/cdo-web/webservices/v2", + "description": "Climate and weather datasets from NOAA including temperature, precipitation, and extremes for environmental monitoring and early warning systems.", + "keywords": ["environment", "environmental", "weather", "climate", "noaa", "meteorology", "monitoring"], + "category": "environmental", + "base_score": 0.78, + "endpoints": [ + {"method": "GET", "path": "/cdo-web/api/v2/datasets", "summary": "List NOAA 
datasets"}, + {"method": "GET", "path": "/cdo-web/api/v2/data", "summary": "Query climate observations"}, + ], + }, + { + "name": "EPA AirNow API", + "url": "https://www.airnowapi.org/aq/data/", + "doc_url": "https://docs.airnowapi.org/", + "description": "Air quality measurements and forecasts for US monitoring stations, including pollutants and AQI, supporting environmental monitoring pipelines.", + "keywords": ["air quality", "environment", "environmental", "epa", "pollution", "aqi", "monitoring"], + "category": "environmental", + "base_score": 0.77, + "endpoints": [ + {"method": "GET", "path": "/aq/data/", "summary": "Retrieve air quality data"}, + ], + }, + { + "name": "Orphanet Rare Disease API", + "url": "https://www.orpha.net/OrphAPI/api/Disease", + "doc_url": "https://api.orphanet.net/OrphAPI/#!/Disease", + "description": "Rare disease catalog with Orpha codes, synonyms, epidemiology, and classification.", + "keywords": ["rare disease", "orphanet", "orpha", "genetic", "registry"], + "category": "rare_disease", + "base_score": 0.76, + "endpoints": [ + {"method": "GET", "path": "/OrphAPI/api/Disease", "summary": "List rare diseases"}, + {"method": "GET", "path": "/OrphAPI/api/Disease/{OrphaCode}", "summary": "Retrieve disease details"}, + ], + }, + { + "name": "RAREDISEASES.info NIH Service", + "url": "https://rarediseases.info.nih.gov/services/v1/diseases", + "doc_url": "https://rarediseases.info.nih.gov/developers", + "description": "NIH Genetic and Rare Diseases (GARD) API providing disease descriptions, symptoms, and resources.", + "keywords": ["rare disease", "nih", "gard", "genetic", "registry"], + "category": "rare_disease", + "base_score": 0.75, + "endpoints": [ + {"method": "GET", "path": "/services/v1/diseases", "summary": "Search rare diseases"}, + ], + }, + { + "name": "USAFacts COVID-19 API", + "url": "https://api.usafacts.org/covid/covid-api/v1/cases", + "doc_url": "https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/api/", + "description": "County-level COVID-19 cases and deaths in the United States with daily updates.", + "keywords": ["covid", "pandemic", "surveillance", "epidemiology"], + "category": "pandemic", + "base_score": 0.74, + "endpoints": [ + {"method": "GET", "path": "/covid/covid-api/v1/cases", "summary": "Retrieve COVID-19 cases"}, + ], + }, + { + "name": "Global.Health Line List API", + "url": "https://covid19-api.global.health/v1/line-list", + "doc_url": "https://global.health/documentation/api", + "description": "Anonymized global case line lists for pathogen surveillance, including demographics and travel history.", + "keywords": ["pandemic", "outbreak", "surveillance", "line list", "global health"], + "category": "pandemic", + "base_score": 0.73, + "endpoints": [ + {"method": "GET", "path": "/v1/line-list", "summary": "Retrieve outbreak line list"}, + ], + }, + { + "name": "OpenFDA Food Enforcement API", + "url": "https://api.fda.gov/food/enforcement.json", + "doc_url": "https://open.fda.gov/apis/food/enforcement/", + "description": "Food recall enforcement reports with product description, reason, and distribution data.", + "keywords": ["food", "recall", "fda", "safety", "enforcement"], + "category": "food_safety", + "base_score": 0.55, + "endpoints": [ + {"method": "GET", "path": "/food/enforcement.json", "summary": "Search food recall enforcement"}, + ], + }, + { + "name": "USDA National Farmers Market Directory API", + "url": "https://search.ams.usda.gov/farmersmarkets/v1/data.svc/zipSearch", + "doc_url": 
"https://www.ams.usda.gov/services/local-regional/food-directories-datasets", + "description": "Directory of US farmers markets with location, operation schedule, and services.", + "keywords": ["nutrition", "food access", "farmers market", "usda"], + "category": "nutrition", + "base_score": 0.7, + "endpoints": [ + {"method": "GET", "path": "/farmersmarkets/v1/data.svc/zipSearch", "summary": "Find farmers markets by ZIP"}, + ], + }, + { + "name": "HealthData.gov CKAN Catalog API", + "url": "https://healthdata.gov/api/3/action/package_search", + "doc_url": "https://healthdata.gov/developer", + "description": "Catalog of US Department of Health and Human Services datasets via CKAN API.", + "keywords": ["open data", "catalog", "health data", "ckan", "metadata"], + "category": "data_catalog", + "base_score": 0.82, + "endpoints": [ + {"method": "GET", "path": "/api/3/action/package_search", "summary": "Search dataset catalog"}, + ], + }, + { + "name": "data.gov CKAN Catalog API", + "url": "https://catalog.data.gov/api/3/action/package_search", + "doc_url": "https://catalog.data.gov/dataset", + "description": "US Federal data catalog with metadata across climate, energy, health, and finance.", + "keywords": ["open data", "catalog", "federal", "ckan", "metadata"], + "category": "data_catalog", + "base_score": 0.8, + "endpoints": [ + {"method": "GET", "path": "/api/3/action/package_search", "summary": "Search the federal data catalog"}, + ], + }, + { + "name": "Europe PMC RESTful API", + "url": "https://www.ebi.ac.uk/europepmc/webservices/rest/search", + "doc_url": "https://europepmc.org/RestfulWebService", + "description": "Biomedical literature, grants, and patents from Europe PMC with advanced search syntax.", + "keywords": ["literature", "research", "biomedical", "europe pmc", "publications"], + "category": "literature", + "base_score": 0.78, + "endpoints": [ + {"method": "GET", "path": "/webservices/rest/search", "summary": "Search biomedical literature"}, + ], + }, + { + "name": "OpenAlex Graph API", + "url": "https://api.openalex.org/works", + "doc_url": "https://docs.openalex.org/api", + "description": "Scholarly works, authors, concepts, and institutions graph with filtering for literature discovery and citation analysis.", + "keywords": ["literature", "openalex", "scholarly", "citations", "research graph"], + "category": "literature", + "base_score": 0.77, + "endpoints": [ + {"method": "GET", "path": "/works", "summary": "Search scholarly works"}, + {"method": "GET", "path": "/authors", "summary": "Browse scholarly authors"}, + ], + }, +] + + +# ----------------------------------------------------------------------------- +# Internal helpers +# ----------------------------------------------------------------------------- + +TOKEN_PATTERN = re.compile(r"[a-z0-9]+") + + +def _tokenize(text: str) -> Set[str]: + tokens = set(TOKEN_PATTERN.findall((text or "").lower())) + enriched: Set[str] = set(tokens) + for tok in tokens: + if len(tok) <= 2: + continue + if tok.endswith("ies") and len(tok) > 3: + enriched.add(tok[:-3] + "y") + if tok.endswith("ing") and len(tok) > 4: + enriched.add(tok[:-3]) + if tok.endswith("al") and len(tok) > 4: + enriched.add(tok[:-2]) + if tok.endswith("s") and len(tok) > 3: + enriched.add(tok[:-1]) + return enriched + + +@dataclass(frozen=True) +class CatalogRecord: + data: Dict[str, object] + tokens: Set[str] + keyword_tokens: Set[str] + base_score: float + + +def _prepare_catalog(raw_items: Iterable[Dict[str, object]]) -> List[CatalogRecord]: + prepared: 
List[CatalogRecord] = [] + for item in raw_items: + entry = deepcopy(item) + + url = str(entry.get("url") or "").strip() + if not url: + continue + parsed = urlparse(url) + host = parsed.netloc.lower() + base_url = f"{parsed.scheme}://{parsed.netloc}" + + entry.setdefault("host", host) + entry.setdefault("base_url", base_url) + entry.setdefault("source", "static_catalog") + entry.setdefault("doc_url", entry.get("doc_url") or f"{base_url}/") + entry.setdefault("health", {"ok": True, "status": 200, "latency_ms": 180, "checked": "static"}) + entry.setdefault("cors", {"preflight": False}) + entry.setdefault("trust", trust_score(host)) + + keywords = entry.get("keywords") or [] + if keywords: + desc = entry.get("description") or "" + kw_text = "; ".join(str(k) for k in keywords) + if kw_text and kw_text.lower() not in desc.lower(): + entry["description"] = f"{desc} (keywords: {kw_text})" + keyword_tokens = _tokenize(" ".join(map(str, keywords))) + text_tokens = _tokenize(" ".join( + str(part) for part in ( + entry.get("name", ""), + entry.get("description", ""), + entry.get("category", ""), + entry.get("doc_url", ""), + ) + )) + + base_score = float(entry.get("base_score") or 0.0) + + prepared.append( + CatalogRecord( + data=entry, + tokens=text_tokens | keyword_tokens, + keyword_tokens=keyword_tokens, + base_score=base_score, + ) + ) + + return prepared + + +CATALOG: List[CatalogRecord] = _prepare_catalog(RAW_CATALOG) + + +# ----------------------------------------------------------------------------- +# Public harvester interface +# ----------------------------------------------------------------------------- + +def _score_entry(tokens: Set[str], record: CatalogRecord) -> float: + if not tokens: + return record.base_score + 0.5 + + keyword_overlap = len(tokens & record.keyword_tokens) + text_overlap = len(tokens & record.tokens) + + if keyword_overlap == 0 and text_overlap == 0: + return record.base_score * 0.1 + + precision = keyword_overlap / (len(tokens) or 1) + coverage = (keyword_overlap + text_overlap) / (len(record.tokens) or 1) + + return ( + 2.0 * keyword_overlap + + 1.2 * text_overlap + + 1.5 * precision + + 1.0 * coverage + + record.base_score * 0.25 + ) + + +SYNONYM_MAP = { + "clinical": ["trial", "research"], + "dentistry": ["dental", "oral", "oralhealth"], + "dental": ["dentistry", "oral", "oralhealth"], + "oral": ["dentistry", "dental", "oralhealth"], + "environmental": ["environment", "climate", "monitoring"], + "environment": ["environmental", "climate", "air"], + "monitoring": ["surveillance", "tracking"], + "rare": ["orphan", "orphanet", "genetic"], + "disease": ["condition", "illness"], + "genomics": ["genomic", "gene", "sequence", "dna"], + "genomic": ["genomics", "gene", "dna"], + "pandemic": ["outbreak", "surveillance"], + "surveillance": ["monitoring", "tracking"], + "nutrition": ["food", "diet", "dietary"], + "vaccination": ["immunization", "vaccine"], + "mental": ["behavioral", "behavior", "psych"], + "health": ["healthcare", "publichealth"], + "pharmaceutical": ["drug", "medicine"], + "adverse": ["safety", "pharmacovigilance"], +} + + +def harvest(query: str, limit: int = 5, **kwargs) -> List[Dict[str, object]]: + """ + Harvest candidate API endpoints from the static catalog. + + Args: + query: Natural language search string. + limit: Maximum number of candidates to return. + **kwargs: Unused passthrough parameters for compatibility. 
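+
+    Returns:
+        Up to `limit` candidate dicts (name, base_url, doc_url, trust,
+        health, endpoints), ordered by rank_candidates.
+
+    Example (illustrative):
+        harvest("dental sealant prevalence", limit=3)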
+ """ + limit = max(1, min(int(limit or 5), 50)) + query = (query or "").strip() + + if not CATALOG: + return [] + + if not query: + top = sorted(CATALOG, key=lambda rec: rec.base_score, reverse=True)[:limit] + return [deepcopy(rec.data) for rec in top] + + token_union: Set[str] = _tokenize(query) + for token in list(token_union): + for syn in SYNONYM_MAP.get(token, []): + token_union |= _tokenize(syn) + + scored: List[Dict[str, object]] = [] + for record in CATALOG: + score = _score_entry(token_union, record) + if score <= 0 and record.base_score <= 0: + continue + candidate = deepcopy(record.data) + candidate["_match_score"] = round(score, 4) + candidate["_match_terms"] = sorted(token_union & record.tokens) + scored.append(candidate) + + if not scored: + top = sorted(CATALOG, key=lambda rec: rec.base_score, reverse=True)[:limit] + return [deepcopy(rec.data) for rec in top] + + preliminary = sorted(scored, key=lambda c: c["_match_score"], reverse=True)[: limit * 3] + ranked = rank_candidates(query, preliminary) + final = ranked[:limit] + + for cand in final: + cand.pop("_match_score", None) + cand.pop("_match_terms", None) + + return final + + +__all__ = ["harvest"] diff --git a/src/tooluniverse/harvest/verifier.py b/src/tooluniverse/harvest/verifier.py new file mode 100644 index 00000000..2da35df9 --- /dev/null +++ b/src/tooluniverse/harvest/verifier.py @@ -0,0 +1,33 @@ +from __future__ import annotations +import os, time, logging, requests +from typing import Dict, Optional + +logger = logging.getLogger("HarvestVerify") +DEFAULT_TIMEOUT = int(os.getenv("HARVEST_TIMEOUT_S", "8")) +SIZE_LIMIT = int(os.getenv("HARVEST_MAX_BYTES", "2000000")) +JSON_ACCEPT = {"Accept": "application/json"} + +def _head(url: str, timeout=None): + try: + return requests.head(url, timeout=timeout or DEFAULT_TIMEOUT, allow_redirects=True) + except requests.RequestException: + return None + +def _health_probe(url: str, timeout=None) -> Dict: + t0 = time.time() + try: + rh = _head(url, timeout) + if rh is not None: + clen = int(rh.headers.get("Content-Length") or 0) + if clen and clen > SIZE_LIMIT: + return {"ok": False, "status": rh.status_code, "skipped": f"large({clen})"} + r = requests.get(url, timeout=timeout or DEFAULT_TIMEOUT, headers=JSON_ACCEPT) + return {"ok": r.status_code < 500, "status": r.status_code, "latency_ms": int((time.time()-t0)*1000), "ctype": r.headers.get("Content-Type","")} + except requests.RequestException as e: + return {"ok": False, "status": 0, "error": str(e)} + +def verify_candidate(result, timeout_s: Optional[int] = None) -> Optional[Dict]: + url = (result.url or "").strip() + if not url: return None + health = _health_probe(url, timeout=timeout_s) + return {"name": result.title, "url": url, "health": health, "source": result.source} diff --git a/src/tooluniverse/interpro_tool.py b/src/tooluniverse/interpro_tool.py new file mode 100644 index 00000000..99653e01 --- /dev/null +++ b/src/tooluniverse/interpro_tool.py @@ -0,0 +1,63 @@ +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + +INTERPRO_BASE_URL = "https://www.ebi.ac.uk/interpro/api/entry/interpro/" +REQUEST_TIMEOUT = 30 + + +@register_tool("InterProTool") +class InterProTool(BaseTool): + """ + Tool wrapper for the InterPro REST API. + Provides entry search with pagination support. 
+ """ + + def __init__(self, tool_config): + super().__init__(tool_config) + self.session = requests.Session() + + def run(self, arguments): + query = (arguments or {}).get("query") or (arguments or {}).get("search") + if not query: + return {"error": "Missing required parameter: query"} + + page = int((arguments or {}).get("page") or 1) + page_size = int( + (arguments or {}).get("page_size") + or self.tool_config.get("page_size", 25) + ) + + params = { + "search": query, + "page": max(page, 1), + "page_size": max(min(page_size, 200), 1), + } + + response = self.session.get( + INTERPRO_BASE_URL, params=params, timeout=REQUEST_TIMEOUT + ) + response.raise_for_status() + payload = response.json() + + entries = [] + for item in payload.get("results", []): + metadata = item.get("metadata", {}) + entries.append( + { + "accession": metadata.get("accession"), + "name": metadata.get("name"), + "short_name": metadata.get("short_name"), + "type": metadata.get("type"), + "source_database": metadata.get("source_database"), + "integrated": metadata.get("integrated"), + } + ) + + return { + "count": payload.get("count", len(entries)), + "next": payload.get("next"), + "previous": payload.get("previous"), + "results": entries, + } diff --git a/src/tooluniverse/iucn_tool.py b/src/tooluniverse/iucn_tool.py new file mode 100644 index 00000000..49752fe2 --- /dev/null +++ b/src/tooluniverse/iucn_tool.py @@ -0,0 +1,73 @@ +import os +from typing import Any, Dict, List + +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + +IUCN_BASE_URL = "https://apiv3.iucnredlist.org/api/v3/species/" +IUCN_TOKEN_ENV = "IUCN_RED_LIST_TOKEN" +REQUEST_TIMEOUT = 30 + + +@register_tool("IUCNRedListTool") +class IUCNRedListTool(BaseTool): + """ + Wrapper around the IUCN Red List API for species status lookups. + Requires an API token supplied via arguments, tool config, or environment. + """ + + def __init__(self, tool_config): + super().__init__(tool_config) + self.session = requests.Session() + + def _resolve_token(self, arguments: Dict[str, Any]) -> str: + candidate = ( + (arguments or {}).get("token") + or self.tool_config.get("token") + or os.getenv(IUCN_TOKEN_ENV) + ) + if not candidate: + raise ValueError( + f"Missing IUCN API token. Provide 'token' argument or set {IUCN_TOKEN_ENV}." 
+ ) + return candidate + + def run(self, arguments): + species = (arguments or {}).get("species") or (arguments or {}).get( + "species_name" + ) + if not species: + return {"error": "Missing required parameter: species"} + + try: + token = self._resolve_token(arguments or {}) + except ValueError as exc: + return {"error": str(exc)} + + response = self.session.get( + f"{IUCN_BASE_URL}{species}", + params={"token": token}, + timeout=REQUEST_TIMEOUT, + ) + + if response.status_code == 404: + return {"count": 0, "results": []} + + response.raise_for_status() + payload = response.json() + + results: List[Dict[str, Any]] = [] + for entry in payload.get("result", []): + results.append( + { + "scientific_name": entry.get("scientific_name"), + "category": entry.get("category"), + "population_trend": entry.get("population_trend"), + "distribution": entry.get("countries"), + "published_year": entry.get("published_year"), + } + ) + + return {"count": len(results), "results": results} diff --git a/src/tooluniverse/jaspar_tool.py b/src/tooluniverse/jaspar_tool.py new file mode 100644 index 00000000..399859db --- /dev/null +++ b/src/tooluniverse/jaspar_tool.py @@ -0,0 +1,61 @@ +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + +JASPAR_BASE_URL = "https://jaspar.elixir.no/api/v1/matrix/" +REQUEST_TIMEOUT = 30 + + +@register_tool("JASPARRestTool") +class JASPARRestTool(BaseTool): + """ + Wrapper around the JASPAR REST API for matrix searches. + """ + + def __init__(self, tool_config): + super().__init__(tool_config) + self.session = requests.Session() + + def run(self, arguments): + query = (arguments or {}).get("query") or (arguments or {}).get("search") + if not query: + return {"error": "Missing required parameter: query"} + + params = { + "search": query, + "page": (arguments or {}).get("page", 1), + "page_size": (arguments or {}).get("page_size") + or self.tool_config.get("page_size", 10), + } + + for optional in ("tax_group", "collection", "type"): + value = (arguments or {}).get(optional) + if value: + params[optional] = value + + response = self.session.get( + JASPAR_BASE_URL, params=params, timeout=REQUEST_TIMEOUT + ) + response.raise_for_status() + payload = response.json() + + results = [] + for item in payload.get("results", []): + results.append( + { + "matrix_id": item.get("matrix_id"), + "name": item.get("name"), + "collection": item.get("collection"), + "tax_group": item.get("tax_group"), + "class": item.get("class"), + "family": item.get("family"), + } + ) + + return { + "count": payload.get("count", len(results)), + "next": payload.get("next"), + "previous": payload.get("previous"), + "results": results, + } diff --git a/src/tooluniverse/kegg_tool.py b/src/tooluniverse/kegg_tool.py new file mode 100644 index 00000000..31262f57 --- /dev/null +++ b/src/tooluniverse/kegg_tool.py @@ -0,0 +1,56 @@ +from typing import List + +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + +KEGG_BASE_URL = "https://rest.kegg.jp" +REQUEST_TIMEOUT = 30 + + +@register_tool("KEGGTool") +class KEGGTool(BaseTool): + """ + Lightweight wrapper around the KEGG REST API for text-based queries. 
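+
+    Example (illustrative): run({"query": "glycolysis"}) calls
+    GET https://rest.kegg.jp/find/pathway/glycolysis (the database defaults
+    to "pathway") and parses each tab-separated line into {"id", "description"}.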
+ """ + + def __init__(self, tool_config): + super().__init__(tool_config) + self.session = requests.Session() + + def run(self, arguments): + query = (arguments or {}).get("query") + if not query: + return {"error": "Missing required parameter: query"} + + database = (arguments or {}).get("database") or self.tool_config.get( + "database", "pathway" + ) + max_results = (arguments or {}).get("max_results") or self.tool_config.get( + "max_results" + ) + + endpoint = f"{KEGG_BASE_URL}/find/{database}/{query}" + response = self.session.get(endpoint, timeout=REQUEST_TIMEOUT) + response.raise_for_status() + + lines: List[str] = [ + line for line in response.text.splitlines() if line.strip() + ] + if max_results: + try: + limit = int(max_results) + lines = lines[: max(limit, 0)] + except ValueError: + pass + + results = [] + for line in lines: + if "\t" in line: + identifier, description = line.split("\t", 1) + else: + identifier, description = line, "" + results.append({"id": identifier, "description": description}) + + return results diff --git a/src/tooluniverse/logging_config.py b/src/tooluniverse/logging_config.py index 6659f6a1..3cbc36ab 100644 --- a/src/tooluniverse/logging_config.py +++ b/src/tooluniverse/logging_config.py @@ -45,12 +45,12 @@ class ToolUniverseFormatter(logging.Formatter): # Emoji prefixes for different log levels EMOJI_PREFIX = { - "DEBUG": "🔧 ", - "INFO": "ℹ️ ", - "PROGRESS": "⏳ ", - "WARNING": "⚠️ ", - "ERROR": "❌ ", - "CRITICAL": "🚨 ", + "DEBUG": "[DEBUG] ", + "INFO": "[INFO] ", + "PROGRESS": "[PROGRESS] ", + "WARNING": "[WARN] ", + "ERROR": "[ERROR] ", + "CRITICAL": "[CRITICAL] ", } def format(self, record): diff --git a/src/tooluniverse/marine_species_tool.py b/src/tooluniverse/marine_species_tool.py new file mode 100644 index 00000000..16058d28 --- /dev/null +++ b/src/tooluniverse/marine_species_tool.py @@ -0,0 +1,62 @@ +from urllib.parse import quote + +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + +WORMS_BASE_URL = "https://www.marinespecies.org/rest" +REQUEST_TIMEOUT = 30 + + +@register_tool("MarineSpeciesTool") +class MarineSpeciesTool(BaseTool): + """ + Wrapper for the World Register of Marine Species (WoRMS) REST API. 
+ """ + + def __init__(self, tool_config): + super().__init__(tool_config) + self.session = requests.Session() + + def run(self, arguments): + name = (arguments or {}).get("scientific_name") or (arguments or {}).get( + "name" + ) + if not name: + return {"error": "Missing required parameter: scientific_name"} + + like = (arguments or {}).get("like") + marine_only = (arguments or {}).get("marine_only") + + params = { + "like": "true" + if (like if like is not None else self.tool_config.get("like", True)) + else "false", + "marine_only": "true" + if ( + marine_only + if marine_only is not None + else self.tool_config.get("marine_only", True) + ) + else "false", + } + + endpoint = f"{WORMS_BASE_URL}/AphiaRecordsByName/{quote(name)}" + response = self.session.get(endpoint, params=params, timeout=REQUEST_TIMEOUT) + response.raise_for_status() + payload = response.json() or [] + + results = [] + for item in payload: + results.append( + { + "AphiaID": item.get("AphiaID"), + "scientificname": item.get("scientificname"), + "rank": item.get("rank"), + "status": item.get("status"), + "match_type": item.get("match_type"), + } + ) + + return results diff --git a/src/tooluniverse/medlog_tool.py b/src/tooluniverse/medlog_tool.py new file mode 100644 index 00000000..d375a903 --- /dev/null +++ b/src/tooluniverse/medlog_tool.py @@ -0,0 +1,143 @@ +""" +MedLog integration tools. + +These tools expose MedLog collector and FHIR linkage capabilities as native +ToolUniverse tools for event ingestion, querying, and audit retrieval. +""" + +from __future__ import annotations + +import os +from typing import Any, Dict + +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + + +class _MedLogBaseTool(BaseTool): + """Shared utility methods for MedLog REST integration.""" + + DEFAULT_BASE_URL = "http://localhost:7001" + + def __init__(self, tool_config: Dict[str, Any]): + super().__init__(tool_config) + self.base_url = os.getenv( + "MEDLOG_COLLECTOR_BASE_URL", self.DEFAULT_BASE_URL + ).rstrip("/") + self.session = requests.Session() + + def _post(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]: + url = f"{self.base_url}{path}" + try: + response = self.session.post(url, json=payload, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedLog collector request failed: {exc}", "endpoint": url} + + def _get(self, path: str) -> Dict[str, Any]: + url = f"{self.base_url}{path}" + try: + response = self.session.get(url, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedLog collector request failed: {exc}", "endpoint": url} + + +class _MedLogFHIRBaseTool(BaseTool): + """Shared logic for interacting with the MedLog FHIR linkage service.""" + + DEFAULT_FHIR_URL = "http://localhost:7003" + + def __init__(self, tool_config: Dict[str, Any]): + super().__init__(tool_config) + self.fhir_base = os.getenv( + "MEDLOG_FHIR_BASE_URL", self.DEFAULT_FHIR_URL + ).rstrip("/") + self.session = requests.Session() + + def _get(self, path: str) -> Dict[str, Any]: + url = f"{self.fhir_base}{path}" + try: + response = self.session.get(url, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedLog FHIR request failed: {exc}", "endpoint": url} + + 
+@register_tool("MedLogInitEventTool") +class MedLogInitEventTool(_MedLogBaseTool): + """Create or update a MedLog event record.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + return self._post("/medlog/events/init", arguments) + + +@register_tool("MedLogAppendFragmentTool") +class MedLogAppendFragmentTool(_MedLogBaseTool): + """Append fragment data (artifacts, outputs, feedback) to an event.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + event_id = arguments.get("event_id") + fragment = arguments.get("fragment", {}) + if not event_id: + return {"error": "Parameter 'event_id' is required."} + return self._post(f"/medlog/events/{event_id}/append", fragment) + + +@register_tool("MedLogGetProvenanceTool") +class MedLogGetProvenanceTool(_MedLogBaseTool): + """Retrieve PROV-JSON bundle for a specific event.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + event_id = arguments.get("event_id") + if not event_id: + return {"error": "Parameter 'event_id' is required."} + return self._get(f"/medlog/events/{event_id}/prov") + + +@register_tool("MedLogQueryEventsTool") +class MedLogQueryEventsTool(_MedLogBaseTool): + """Query MedLog events by run_id or event_id.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "run_id": arguments.get("run_id"), + "event_id": arguments.get("event_id"), + "limit": arguments.get("limit", 50), + } + return self._post("/query", payload) + + +@register_tool("MedLogExportParquetTool") +class MedLogExportParquetTool(_MedLogBaseTool): + """Trigger a parquet export of MedLog events.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + return self._post("/export/parquet", {}) + + +@register_tool("MedLogFHIRBundleTool") +class MedLogFHIRBundleTool(_MedLogFHIRBaseTool): + """Fetch FHIR bundle for a specific event.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + event_id = arguments.get("event_id") + if not event_id: + return {"error": "Parameter 'event_id' is required."} + return self._get(f"/bundle/{event_id}") + + +@register_tool("MedLogFHIRRunBundleTool") +class MedLogFHIRRunBundleTool(_MedLogFHIRBaseTool): + """Fetch FHIR bundle aggregating all events in a run.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + run_id = arguments.get("run_id") + if not run_id: + return {"error": "Parameter 'run_id' is required."} + return self._get(f"/bundle/run/{run_id}") diff --git a/src/tooluniverse/medtok_tool.py b/src/tooluniverse/medtok_tool.py new file mode 100644 index 00000000..1bd4042f --- /dev/null +++ b/src/tooluniverse/medtok_tool.py @@ -0,0 +1,122 @@ +""" +MedTok integration tools. + +These tools provide a thin wrapper around the MedTok FastAPI service so that +ToolUniverse users can tokenize, embed, and explore medical codes directly +from the unified tool catalog. 
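+
+A hedged usage sketch (assumes a MedTok service reachable via MEDTOK_BASE_URL,
+default http://localhost:8000; the tool_config name and codes are illustrative):
+
+    tool = MedTokTokenizeTool({"name": "MedTok_tokenize"})
+    result = tool.run({"codes": ["A00", "E11"], "system": "ICD-10"})
+    # -> {"token_ids": [...], ...} on success, or
+    #    {"error": "MedTok request failed: ...", "endpoint": ...} when the
+    #    service is unreachable.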
+""" + +from __future__ import annotations + +import os +from typing import Any, Dict + +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + + +class _MedTokBaseTool(BaseTool): + """Shared utilities for MedTok REST integrations.""" + + DEFAULT_BASE_URL = "http://localhost:8000" + + def __init__(self, tool_config: Dict[str, Any]): + super().__init__(tool_config) + self.base_url = os.getenv("MEDTOK_BASE_URL", self.DEFAULT_BASE_URL).rstrip("/") + self.session = requests.Session() + + def _post(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]: + url = f"{self.base_url}{path}" + try: + response = self.session.post(url, json=payload, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedTok request failed: {exc}", "endpoint": url} + + def _get(self, path: str) -> Dict[str, Any]: + url = f"{self.base_url}{path}" + try: + response = self.session.get(url, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedTok request failed: {exc}", "endpoint": url} + + +@register_tool("MedTokTokenizeTool") +class MedTokTokenizeTool(_MedTokBaseTool): + """Tokenize medical codes using MedTok multimodal tokenizer.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "codes": arguments.get("codes", []), + "system": arguments.get("system", "ICD-10"), + "include_metadata": arguments.get("include_metadata", False), + } + return self._post("/tokenize", payload) + + +@register_tool("MedTokEmbedTool") +class MedTokEmbedTool(_MedTokBaseTool): + """Generate token embeddings for a batch of codes.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "codes": arguments.get("codes", []), + "system": arguments.get("system", "ICD-10"), + } + return self._post("/embed", payload) + + +@register_tool("MedTokNearestNeighborsTool") +class MedTokNearestNeighborsTool(_MedTokBaseTool): + """Retrieve nearest neighbours for a code in embedding space.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "code": arguments.get("code"), + "k": arguments.get("k", 5), + "system": arguments.get("system", "ICD-10"), + } + return self._post("/nearest_neighbors", payload) + + +@register_tool("MedTokMapTextTool") +class MedTokMapTextTool(_MedTokBaseTool): + """Map free-text description to the closest medical code.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "text": arguments.get("text", ""), + "system": arguments.get("system", "ICD-10"), + } + return self._post("/map_text_to_code", payload) + + +@register_tool("MedTokSearchTextTool") +class MedTokSearchTextTool(_MedTokBaseTool): + """Perform text and semantic search across the code vocabulary.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "text": arguments.get("text", ""), + "system": arguments.get("system"), + "k": arguments.get("k", 5), + } + return self._post("/search_text", payload) + + +@register_tool("MedTokCodeInfoTool") +class MedTokCodeInfoTool(_MedTokBaseTool): + """Fetch detailed metadata for a specific code.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + system = arguments.get("system", "ICD-10") + code = arguments.get("code") + if not code: + return {"error": "Parameter 'code' is required."} + path = 
f"/codes/{system}/{code}" + return self._get(path) diff --git a/src/tooluniverse/phenome_jax_tool.py b/src/tooluniverse/phenome_jax_tool.py new file mode 100644 index 00000000..353f476a --- /dev/null +++ b/src/tooluniverse/phenome_jax_tool.py @@ -0,0 +1,54 @@ +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + +PHENOME_JAX_BASE_URL = "https://phenome.jax.org/api" +REQUEST_TIMEOUT = 30 + + +@register_tool("PhenomeJaxTool") +class PhenomeJaxTool(BaseTool): + """ + Wrapper around the Mouse Phenome Database (MPD) API for project searches. + """ + + def __init__(self, tool_config): + super().__init__(tool_config) + self.session = requests.Session() + + def run(self, arguments): + keyword = (arguments or {}).get("keyword") or (arguments or {}).get("query") + limit = int( + (arguments or {}).get("limit") or self.tool_config.get("limit", 20) + ) + + params = {"limit": max(limit, 1)} + if keyword: + params["keyword"] = keyword + + response = self.session.get( + f"{PHENOME_JAX_BASE_URL}/projects", + params=params, + timeout=REQUEST_TIMEOUT, + ) + response.raise_for_status() + payload = response.json() + + projects = [] + for item in payload.get("projects", []): + projects.append( + { + "projid": item.get("projid"), + "title": item.get("title"), + "mpdsector": item.get("mpdsector"), + "species": item.get("species"), + "status": item.get("status"), + "releasedate": item.get("releasedate"), + } + ) + + return { + "count": payload.get("count", len(projects)), + "projects": projects[: params["limit"]], + } diff --git a/src/tooluniverse/tool_navigator_tool.py b/src/tooluniverse/tool_navigator_tool.py new file mode 100644 index 00000000..1341dd98 --- /dev/null +++ b/src/tooluniverse/tool_navigator_tool.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import math +from typing import Any, Dict, List, Optional + +from .execute_function import ToolUniverse +from .tool_registry import register_tool +from .vsd_registry import load_catalog + + +def _tokenize(text: str) -> List[str]: + return [t for t in (text or "").lower().split() if t] + + +def _score(query_tokens: List[str], name: str, description: str) -> float: + haystack = f"{name} {description}".lower() + score = 0.0 + for token in query_tokens: + if token in haystack: + score += 2.0 + score += sum(1.0 for token in query_tokens if any(word.startswith(token) for word in haystack.split())) + return score + + +def _format_tool(tool: Dict[str, Any]) -> Dict[str, Any]: + return { + "name": tool.get("name"), + "type": tool.get("type"), + "description": tool.get("description"), + "tool_type": tool.get("tool_type"), + "category": tool.get("category"), + "source": tool.get("source"), + } + + +@register_tool("ToolNavigatorTool") +class ToolNavigatorTool: + """ + Search ToolUniverse's catalog (built-in + VSD) to help agents discover relevant tools. + """ + + name = "ToolNavigatorTool" + description = "Search ToolUniverse/Navigated catalog for tools matching a query." 
+ input_schema = { + "type": "object", + "properties": { + "query": {"type": "string"}, + "limit": {"type": "integer", "default": 10, "minimum": 1, "maximum": 50}, + "categories": { + "type": "array", + "items": {"type": "string"}, + "description": "Optional list of categories to include.", + }, + "include_vsd": { + "type": "boolean", + "default": True, + "description": "Include dynamically registered VSD tools in the search.", + }, + }, + "required": ["query"], + "additionalProperties": False, + } + + def __init__(self) -> None: + self._tooluniverse = ToolUniverse() + + def _load_base_tools(self) -> List[Dict[str, Any]]: + if not getattr(self._tooluniverse, "all_tools", None): + self._tooluniverse.load_tools() + return list(getattr(self._tooluniverse, "all_tools", [])) + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + query = (arguments.get("query") or "").strip() + if not query: + return {"ok": False, "error": "query is required"} + + limit = int(arguments.get("limit") or 10) + include_vsd = bool(arguments.get("include_vsd", True)) + categories = arguments.get("categories") + if categories and not isinstance(categories, list): + categories = [categories] + categories = [c.lower() for c in categories or []] + + tools = self._load_base_tools() + if include_vsd: + for cfg in load_catalog().values(): + tools.append( + { + "name": cfg.get("name"), + "type": "DynamicREST", + "description": (cfg.get("metadata") or {}).get("description"), + "tool_type": "dynamic_rest", + "category": "vsd", + "source": (cfg.get("metadata") or {}).get("source"), + } + ) + + query_tokens = _tokenize(query) + scored: List[tuple[float, Dict[str, Any]]] = [] + for tool in tools: + if categories and (tool.get("category") or "").lower() not in categories: + continue + score = _score(query_tokens, tool.get("name", ""), tool.get("description", "")) + if score > 0: + scored.append((score, tool)) + + scored.sort(key=lambda item: item[0], reverse=True) + best = [_format_tool(tool) | {"score": round(score, 3)} for score, tool in scored[:limit]] + + return {"ok": True, "query": query, "results": best, "total": len(scored)} diff --git a/src/tooluniverse/tool_registry.py b/src/tooluniverse/tool_registry.py index eb3b893f..c3f5d141 100644 --- a/src/tooluniverse/tool_registry.py +++ b/src/tooluniverse/tool_registry.py @@ -446,3 +446,18 @@ def get_tool_class_lazy(tool_name): return _tool_registry.get(tool_name) return None + +# --- VSD / compatibility shims --- +def get_tool_class(name: str): + """ + Backwards-compatible accessor used by scripts like SampleVDSRun.py. + Prefer get_tool_class_lazy(name) internally. + """ + return get_tool_class_lazy(name) + +class _RegistryShim: + def get_tool_class(self, name: str): + return get_tool_class_lazy(name) + +# Expose a 'registry' object with get_tool_class, if callers expect it +registry = _RegistryShim() \ No newline at end of file diff --git a/src/tooluniverse/utils.py b/src/tooluniverse/utils.py index 88e778fb..ff4867e2 100755 --- a/src/tooluniverse/utils.py +++ b/src/tooluniverse/utils.py @@ -136,7 +136,7 @@ def read_json_list(file_path): Returns list: A list of dictionaries containing the JSON objects. 
""" - with open(file_path, "r") as file: + with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) return data diff --git a/src/tooluniverse/vsd_api_tool.py b/src/tooluniverse/vsd_api_tool.py new file mode 100644 index 00000000..84a5c525 --- /dev/null +++ b/src/tooluniverse/vsd_api_tool.py @@ -0,0 +1,115 @@ +from __future__ import annotations +import os +import json +from typing import Dict, Any + +from .base_tool import BaseTool +from .tool_registry import register_tool + +# Reuse same storage locations as vsd_tool +VSD_HOME = os.environ.get("TOOLUNIVERSE_VSD_DIR", os.path.expanduser("~/.tooluniverse/vsd")) +GENERATED_TOOLS_PATH = os.path.join(VSD_HOME, "generated_tools.json") + +os.makedirs(VSD_HOME, exist_ok=True) + + +def _save_tool(tool_spec: Dict[str, Any]) -> None: + """Upsert a generated tool spec into the registry file.""" + tools: list[Dict[str, Any]] = [] + if os.path.exists(GENERATED_TOOLS_PATH): + try: + with open(GENERATED_TOOLS_PATH, "r", encoding="utf-8") as f: + tools = json.load(f) + except Exception: + tools = [] + by_name = {t.get("name"): t for t in tools} + by_name[tool_spec.get("name")] = tool_spec + with open(GENERATED_TOOLS_PATH, "w", encoding="utf-8") as f: + json.dump(list(by_name.values()), f, indent=2) + + +@register_tool("VSDToolBuilder") +class VSDToolBuilder(BaseTool): + """ + Build and register a usable ToolUniverse tool from a harvested or discovered VSD candidate. + + Input: + { + "candidate": { + "domain": "clinicaltrials.gov", + "endpoint": "https://clinicaltrials.gov/api/v2/studies", + "license": "CC0", + "score": 0.92 + }, + "tool_name": "clinicaltrials_search", + "description": "Query clinical trials with disease/condition filters", + "parameter_overrides": { ... optional JSON Schema ... } + } + + Output: + { + "registered": true, + "name": "clinicaltrials_search", + "config_path": "/path/to/generated_tools.json" + } + """ + + def run(self, arguments: Dict[str, Any]): + if not arguments: + return {"error": "Missing arguments"} + cand = arguments.get("candidate") or {} + tool_name = arguments.get("tool_name") + desc = arguments.get("description") or f"VSD tool for {cand.get('domain')}" + param_override = arguments.get("parameter_overrides") or {} + + if not tool_name: + return {"error": "tool_name is required"} + if not cand or not cand.get("endpoint"): + return {"error": "candidate with endpoint is required"} + + endpoint = cand.get("endpoint") + domain = cand.get("domain", "unknown") + + # Pick implementation type + if endpoint.endswith(".graphql") or "graphql" in endpoint: + impl_type = "GenericGraphQLTool" + elif endpoint.startswith("http"): + impl_type = "GenericRESTTool" + else: + impl_type = "URLHTMLTagTool" + + # Default parameter schema (can be overridden) + params = param_override or { + "type": "object", + "properties": { + "query": {"type": "string", "default": ""}, + "pageSize": {"type": "integer", "default": 10}, + } + } + + tool_spec = { + "type": impl_type, + "name": tool_name, + "description": desc, + "fields": { + "base_url": endpoint, + "method": "GET", + "default_params": {} + }, + "parameter": params, + "label": ["VSD", cand.get("label") or domain], + "vsd": { + "domain": domain, + "endpoint": endpoint, + "license": cand.get("license", "unknown"), + "score": cand.get("score"), + "registry": cand.get("registry", "catalog"), + } + } + + # Special case: ClinicalTrials.gov -> add arg_transform + if "clinicaltrials.gov" in endpoint and impl_type == "GenericRESTTool": + tool_spec["vsd"]["arg_transform"] = 
"ctgov_time_window" + + _save_tool(tool_spec) + return {"registered": True, "name": tool_name, "config_path": GENERATED_TOOLS_PATH} diff --git a/src/tooluniverse/vsd_catalog.py b/src/tooluniverse/vsd_catalog.py new file mode 100644 index 00000000..95ec1269 --- /dev/null +++ b/src/tooluniverse/vsd_catalog.py @@ -0,0 +1,44 @@ +# src/tooluniverse/vsd_catalog.py +import os, json +from pathlib import Path +from typing import List, Dict, Any + +VSD_DIR = Path(os.environ.get("TOOLUNIVERSE_VSD_DIR", Path.home() / ".tooluniverse" / "vsd")) +ALLOWLIST_PATH = VSD_DIR / "allowlist.json" +CATALOG_PATH = VSD_DIR / "catalog" / "vsd_catalog_candidates.json" + +def load_json(path: Path) -> Any: + if not path.exists(): + return None + try: + return json.loads(path.read_text(encoding="utf-8")) + except Exception: + return None + +def load_allowlist(seed: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + user = load_json(ALLOWLIST_PATH) or [] + merged = {e["domain"]: e for e in seed} + for e in user: + merged[e["domain"]] = {**merged.get(e["domain"], {}), **e} + return list(merged.values()) + +def load_catalog_candidates() -> List[Dict[str, Any]]: + data = load_json(CATALOG_PATH) or [] + # normalize minimal fields and keep only candidates + out = [] + for d in data: + if d.get("status") not in (None, "candidate", "approved"): + continue + out.append({ + "domain": d.get("domain"), + "label": d.get("label") or d.get("domain"), + "registry": d.get("registry") or "data.gov", + "endpoint": d.get("endpoint"), + "license": d.get("license") or "unknown", + "trust": float(d.get("trust") or 0.7), + "freshness": d.get("freshness") or "", + "api_kind": d.get("api_kind") or "rest", + "status": d.get("status") or "candidate", + "tags": d.get("tags") or [], + }) + return out diff --git a/src/tooluniverse/vsd_registry.py b/src/tooluniverse/vsd_registry.py new file mode 100644 index 00000000..83b237f6 --- /dev/null +++ b/src/tooluniverse/vsd_registry.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from typing import Any, Dict + +from .common_utils import read_json, write_json, vsd_generated_path + + +def _normalize_catalog(data: Any) -> Dict[str, Dict[str, Any]]: + catalog: Dict[str, Dict[str, Any]] = {} + if not isinstance(data, dict): + return catalog + + generated = data.get("generated_tools") if isinstance(data.get("generated_tools"), list) else None + if generated is not None: + for item in generated: + if isinstance(item, dict) and item.get("name"): + name = item["name"] + catalog[name] = dict(item) + return catalog + + for name, cfg in data.items(): + if not isinstance(cfg, dict): + continue + entry = dict(cfg) + entry.setdefault("name", name) + catalog[name] = entry + return catalog + + +def load_catalog() -> Dict[str, Dict[str, Any]]: + """ + Load the Verified Source catalog from disk and normalize it + to a {name: config} dictionary regardless of historical format. + """ + path = vsd_generated_path() + data = read_json(path, {}) + return _normalize_catalog(data) + + +def save_catalog(catalog: Dict[str, Dict[str, Any]]) -> str: + """ + Persist the catalog to disk as a flat {name: config} mapping. + Returns the file path for convenience. 
+ """ + path = vsd_generated_path() + # ensure each entry has its name + serializable = {name: dict(cfg, name=name) for name, cfg in catalog.items()} + write_json(path, serializable) + return path + + +def upsert_tool(tool_name: str, cfg: Dict[str, Any]) -> Dict[str, Any]: + """ + Insert or update a tool configuration in the catalog and propagate the + change to any in-process dynamic registries. + """ + catalog = load_catalog() + config = dict(cfg) + config.setdefault("name", tool_name) + catalog[tool_name] = config + save_catalog(catalog) + + # Notify dynamic REST runner (best-effort, optional import) + try: + from .dynamic_rest_runner import upsert_generated_tool # type: ignore + + upsert_generated_tool(tool_name, config) + except Exception: + pass + + return config + + +def remove_tool(tool_name: str) -> bool: + """ + Remove a tool from the catalog. Returns True if a tool was removed. + """ + catalog = load_catalog() + if tool_name not in catalog: + return False + del catalog[tool_name] + save_catalog(catalog) + + try: + from .dynamic_rest_runner import remove_generated_tool # type: ignore + + remove_generated_tool(tool_name) + except Exception: + pass + + return True diff --git a/src/tooluniverse/vsd_tool.py b/src/tooluniverse/vsd_tool.py new file mode 100644 index 00000000..98a09e24 --- /dev/null +++ b/src/tooluniverse/vsd_tool.py @@ -0,0 +1,246 @@ +from __future__ import annotations + +from typing import Any, Dict, Optional, List +from urllib.parse import urlparse + +from .tool_registry import register_tool +from .vsd_registry import load_catalog, save_catalog, upsert_tool +from .dynamic_rest_runner import refresh_generated_registry, remove_generated_tool +from .vsd_utils import build_config, probe_config, stamp_metadata +from .harvest.static_catalog import harvest as harvest_static + +GENERIC_HARVEST_SCHEMA = { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Free-text search term passed to the harvest catalog.", + }, + "limit": { + "type": "integer", + "minimum": 1, + "maximum": 50, + "default": 5, + "description": "Maximum number of candidates to return.", + }, + "urls": { + "type": "array", + "items": {"type": "string", "format": "uri"}, + "description": "Optional explicit URLs to wrap as manual candidates (skips catalog search).", + }, + }, + "additionalProperties": False, +} + +GENERIC_HARVEST_CONFIG = { + "name": "GenericHarvestTool", + "description": "Search the harvest catalog (or wrap manual URLs) to produce candidate API endpoints.", + "type": "GenericHarvestTool", + "category": "special_tools", + "parameter": GENERIC_HARVEST_SCHEMA, +} + +VERIFIED_SOURCE_REGISTER_SCHEMA = { + "type": "object", + "properties": { + "tool_name": {"type": "string"}, + "tool_type": {"type": "string", "default": "dynamic_rest"}, + "candidate": {"type": "object"}, + "default_params": {"type": "object"}, + "default_headers": {"type": "object"}, + "force": {"type": "boolean", "default": False}, + }, + "required": ["tool_name", "candidate"], +} + +VERIFIED_SOURCE_REGISTER_CONFIG = { + "name": "VerifiedSourceRegisterTool", + "description": "Register a DynamicREST tool into the verified-source catalog after probing it.", + "type": "VerifiedSourceRegisterTool", + "category": "special_tools", + "parameter": VERIFIED_SOURCE_REGISTER_SCHEMA, +} + +VERIFIED_SOURCE_DISCOVERY_CONFIG = { + "name": "VerifiedSourceDiscoveryTool", + "description": "List the tools currently stored in the verified-source catalog.", + "type": "VerifiedSourceDiscoveryTool", + "category": 
"special_tools", + "parameter": { + "type": "object", + "properties": {}, + "additionalProperties": False, + }, +} + +VERIFIED_SOURCE_REMOVE_SCHEMA = { + "type": "object", + "properties": { + "tool_name": {"type": "string"}, + }, + "required": ["tool_name"], +} + +VERIFIED_SOURCE_REMOVE_CONFIG = { + "name": "VerifiedSourceRemoveTool", + "description": "Remove a generated tool from the verified-source catalog.", + "type": "VerifiedSourceRemoveTool", + "category": "special_tools", + "parameter": VERIFIED_SOURCE_REMOVE_SCHEMA, +} + + +@register_tool("GenericHarvestTool", config=GENERIC_HARVEST_CONFIG) +class GenericHarvestTool: + name = "GenericHarvestTool" + description = "Harvest candidate API endpoints from the static catalog or wrap manual URLs." + input_schema = GENERIC_HARVEST_SCHEMA + + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + query = (arguments.get("query") or "").strip() + limit_value = arguments.get("limit", 5) + try: + limit = int(limit_value) + except (TypeError, ValueError): + limit = 5 + limit = max(1, min(limit, 50)) + urls = arguments.get("urls") or [] + + candidates: List[Dict[str, Any]] = [] + + if urls: + for idx, raw_url in enumerate(urls): + if not raw_url: + continue + parsed = urlparse(str(raw_url)) + host = parsed.netloc.lower() + base_url = f"{parsed.scheme}://{parsed.netloc}" if parsed.scheme and parsed.netloc else raw_url + name = host or f"manual_candidate_{idx + 1}" + candidates.append( + { + "name": name, + "endpoint": raw_url, + "url": raw_url, + "base_url": base_url, + "host": host, + "source": "manual_urls", + "description": arguments.get("description") or "", + "trust": 0.5, + "health": {"ok": None, "status": None, "checked": "manual"}, + } + ) + else: + extra_args = {k: v for k, v in arguments.items() if k not in {"query", "limit", "urls"}} + candidates = harvest_static(query=query, limit=limit, **extra_args) + + return { + "ok": True, + "query": query, + "count": len(candidates), + "candidates": candidates, + } + + +@register_tool("VerifiedSourceRegisterTool", config=VERIFIED_SOURCE_REGISTER_CONFIG) +class VerifiedSourceRegisterTool: + name = "VerifiedSourceRegisterTool" + description = "Register a DynamicREST tool in the verified-source directory" + input_schema = VERIFIED_SOURCE_REGISTER_SCHEMA + + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} + + def __call__( + self, + tool_name: str, + candidate: Dict[str, Any], + tool_type: str = "dynamic_rest", + default_params: Dict[str, Any] | None = None, + default_headers: Dict[str, Any] | None = None, + force: bool = False, + ) -> Dict[str, Any]: + if not tool_name: + raise ValueError("tool_name is required") + + cfg = build_config( + candidate or {}, + tool_type=tool_type, + default_params=default_params, + default_headers=default_headers, + ) + + probe = probe_config(cfg) + stamp_metadata(cfg, probe) + + if not probe.get("ok") and not force: + return { + "registered": False, + "name": tool_name, + "error": "Endpoint validation failed", + "test": probe, + "suggestion": "Provide default_params/default_headers or retry with force=True after ensuring credentials.", + } + + cfg = upsert_tool(tool_name, cfg) + refresh_generated_registry() + + return {"registered": True, "name": tool_name, "config": cfg} + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + return self.__call__( + 
tool_name=arguments.get("tool_name"), + candidate=arguments.get("candidate", {}), + tool_type=arguments.get("tool_type", "dynamic_rest"), + default_params=arguments.get("default_params"), + default_headers=arguments.get("default_headers"), + force=bool(arguments.get("force")), + ) + + +@register_tool("VerifiedSourceDiscoveryTool", config=VERIFIED_SOURCE_DISCOVERY_CONFIG) +class VerifiedSourceDiscoveryTool: + name = "VerifiedSourceDiscoveryTool" + description = "Return the Verified-Source catalog." + + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + catalog = load_catalog() + return {"ok": True, "tools": list(catalog.values())} + + +@register_tool("VerifiedSourceRemoveTool", config=VERIFIED_SOURCE_REMOVE_CONFIG) +class VerifiedSourceRemoveTool: + name = "VerifiedSourceRemoveTool" + description = "Remove a generated tool from the Verified-Source catalog." + input_schema = VERIFIED_SOURCE_REMOVE_SCHEMA + + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + tool_name = arguments.get("tool_name") + if not tool_name: + return {"removed": False, "error": "tool_name is required"} + catalog = load_catalog() + if tool_name not in catalog: + return {"removed": False, "error": f"Unknown tool '{tool_name}'"} + del catalog[tool_name] + save_catalog(catalog) + remove_generated_tool(tool_name) + return {"removed": True, "name": tool_name} + + +def register(server): + register_tool(VerifiedSourceRegisterTool.name, VerifiedSourceRegisterTool) + register_tool(VerifiedSourceDiscoveryTool.name, VerifiedSourceDiscoveryTool) + register_tool(VerifiedSourceRemoveTool.name, VerifiedSourceRemoveTool) + + server.add_tool(VerifiedSourceRegisterTool.name, VerifiedSourceRegisterTool()) + server.add_tool(VerifiedSourceDiscoveryTool.name, VerifiedSourceDiscoveryTool()) + server.add_tool(VerifiedSourceRemoveTool.name, VerifiedSourceRemoveTool()) + refresh_generated_registry() diff --git a/src/tooluniverse/vsd_utils.py b/src/tooluniverse/vsd_utils.py new file mode 100644 index 00000000..3f3250e7 --- /dev/null +++ b/src/tooluniverse/vsd_utils.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +import time +from copy import deepcopy +from typing import Any, Dict + +import requests + +# ------------------------------------------------------------------------------ +# Host-specific overrides and requirements +# ------------------------------------------------------------------------------ + +HOST_OVERRIDES: Dict[str, Dict[str, Any]] = { + # Ensembl requires a concrete resource; expose the JSON heartbeat by default. + "rest.ensembl.org": { + "endpoint": "https://rest.ensembl.org/info/ping", + "default_headers": {"Accept": "application/json"}, + "notes": "Ensembl REST base requires explicit resource. '/info/ping' provides a JSON heartbeat.", + }, + "api.fda.gov": { + "default_params": {"limit": 5}, + "default_headers": {"Accept": "application/json"}, + }, + "data.cdc.gov": { + "default_params": {"$limit": 5}, + "default_headers": {"Accept": "application/json"}, + }, +} + +HOST_REQUIREMENTS: Dict[str, Dict[str, Any]] = { + "api.nal.usda.gov": { + "requires_api_key": True, + "notes": "USDA FoodData Central requires an api_key query parameter.", + }, + "www.ncdc.noaa.gov": { + "requires_api_key": True, + "notes": "NOAA CDO API requires a token header. 
See https://www.ncdc.noaa.gov/cdo-web/webservices/v2", + "default_headers": {"token": ""}, + }, + "clinicaltrialsapi.cancer.gov": { + "requires_api_key": True, + "notes": "ClinicalTrials API requires authenticated access for JSON responses.", + }, + "findtreatment.samhsa.gov": { + "requires_manual_params": True, + "notes": "SAMHSA locator needs query parameters (e.g., state, lat/long) to return JSON.", + }, +} + + +# ------------------------------------------------------------------------------ +# Helpers +# ------------------------------------------------------------------------------ + +def _derive_endpoint(candidate: Dict[str, Any]) -> str: + endpoint = candidate.get("endpoint") or candidate.get("url") + if endpoint: + return str(endpoint) + + base_url = candidate.get("base_url") + routes = candidate.get("endpoints") or [] + if base_url and isinstance(routes, list) and routes: + first = routes[0] + path = str(first.get("path") or "/") + if not base_url.endswith("/") and not path.startswith("/"): + return f"{base_url}/{path}" + if base_url.endswith("/") and path.startswith("/"): + return f"{base_url.rstrip('/')}{path}" + return f"{base_url}{path}" + + if base_url: + return str(base_url) + + raise ValueError("candidate.endpoint or candidate.url is required") + + +def _apply_overrides(candidate: Dict[str, Any], cfg: Dict[str, Any]) -> None: + host = (candidate.get("host") or "").lower() + + overrides = HOST_OVERRIDES.get(host) + if overrides: + fields = cfg.setdefault("fields", {}) + if overrides.get("endpoint"): + cfg["endpoint"] = overrides["endpoint"] + fields["base_url"] = overrides["endpoint"] + if overrides.get("default_params"): + cfg.setdefault("default_params", {}).update(overrides["default_params"]) + fields.setdefault("default_params", {}).update(overrides["default_params"]) + if overrides.get("default_headers"): + cfg.setdefault("default_headers", {}).update(overrides["default_headers"]) + fields.setdefault("headers", {}).update(overrides["default_headers"]) + if overrides.get("notes"): + cfg.setdefault("metadata", {}).setdefault("notes", []).append(overrides["notes"]) + + requirements = HOST_REQUIREMENTS.get(host) + if requirements: + meta = cfg.setdefault("metadata", {}) + meta.setdefault("requirements", {}).update( + { + key: value + for key, value in requirements.items() + if key not in {"default_headers"} + } + ) + if requirements.get("default_headers"): + cfg.setdefault("default_headers", {}).update(requirements["default_headers"]) + cfg.setdefault("fields", {}).setdefault("headers", {}).update(requirements["default_headers"]) + + +# ------------------------------------------------------------------------------ +# Public helpers used by VSD tools +# ------------------------------------------------------------------------------ + +def build_config( + candidate: Dict[str, Any], + tool_type: str = "dynamic_rest", + default_params: Dict[str, Any] | None = None, + default_headers: Dict[str, Any] | None = None, +) -> Dict[str, Any]: + """ + Produce a DynamicREST-style configuration dictionary from a harvest candidate. 
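+
+    A hedged sketch of the resulting shape (candidate values are illustrative;
+    the HOST_OVERRIDES table above injects defaults such as {"limit": 5} here):
+
+        cfg = build_config({"endpoint": "https://api.fda.gov/drug/label.json",
+                            "host": "api.fda.gov"})
+        cfg["type"]                      # "GenericRESTTool"
+        cfg["fields"]["default_params"]  # {"limit": 5}, via _apply_overrides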
+ """ + endpoint = _derive_endpoint(candidate) + method = str(candidate.get("method") or candidate.get("http_method") or "GET").upper() + merged_params = deepcopy(candidate.get("default_params") or candidate.get("params") or {}) + merged_headers = deepcopy(candidate.get("default_headers") or candidate.get("headers") or {}) + + # Allow overrides provided via arguments + if default_params: + merged_params.update(default_params) + if default_headers: + merged_headers.update(default_headers) + + # Determine implementation class + declared_type = str(candidate.get("tool_type") or tool_type or "").lower() + impl_type = "GenericRESTTool" + if declared_type in {"graphql", "genericgraphqltool", "graph_ql"} or endpoint.endswith(".graphql"): + impl_type = "GenericGraphQLTool" + + # Provide a permissive parameter schema with defaults from known params + parameter_schema: Dict[str, Any] = deepcopy(candidate.get("parameter_schema") or candidate.get("parameter") or {}) + if not parameter_schema: + properties = { + key: {"description": f"Override default query parameter '{key}'", "default": value} + for key, value in merged_params.items() + } + parameter_schema = { + "type": "object", + "properties": properties, + "additionalProperties": True, + } + + fields: Dict[str, Any] = { + "base_url": endpoint, + "method": method, + "default_params": merged_params, + "headers": merged_headers, + } + + cfg: Dict[str, Any] = { + "type": impl_type, + "description": candidate.get("description") or "", + "fields": fields, + "parameter": parameter_schema, + "metadata": { + "source": candidate.get("source"), + "trust": candidate.get("trust"), + "health": candidate.get("health"), + "doc_url": candidate.get("doc_url"), + "description": candidate.get("description"), + "host": candidate.get("host"), + }, + "vsd": candidate, + # Backwards compatibility fields expected by older utilities + "tool_type": candidate.get("tool_type") or tool_type or "dynamic_rest", + "endpoint": endpoint, + "method": method, + "default_params": merged_params, + "default_headers": merged_headers, + "auth": candidate.get("auth") or {"type": "none"}, + } + + response_key = candidate.get("response_key") + if response_key: + cfg["response_key"] = response_key + + _apply_overrides(candidate, cfg) + + return cfg + + +def probe_config(cfg: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute a lightweight HTTP request to validate the generated configuration. + Returns diagnostic information including HTTP status and a JSON snippet if available. 
+ """ + fields = cfg.get("fields") or {} + url = cfg.get("endpoint") or fields.get("base_url") + method = (fields.get("method") or cfg.get("method") or "GET").upper() + params = deepcopy(fields.get("default_params") or cfg.get("default_params") or {}) + headers = deepcopy(fields.get("headers") or cfg.get("default_headers") or {}) + headers.setdefault("Accept", "application/json") + + try: + if method == "GET": + resp = requests.get(url, params=params, headers=headers, timeout=20) + else: + resp = requests.request(method, url, json=params, headers=headers, timeout=20) + except Exception as exc: + return {"ok": False, "error": str(exc), "stage": "request"} + + content_type = resp.headers.get("Content-Type", "") + preview = resp.text[:400] if resp.text else "" + sample = None + has_json = False + + if "json" in content_type.lower(): + try: + payload = resp.json() + has_json = True + if isinstance(payload, list): + sample = payload[:1] + elif isinstance(payload, dict): + sample = {k: payload[k] for i, k in enumerate(payload) if i < 5} + else: + sample = payload + except Exception: + has_json = False + + status_ok = resp.status_code < 400 + + return { + "ok": bool(status_ok and (has_json or "json" in content_type.lower())), + "status": resp.status_code, + "content_type": content_type, + "has_json": has_json, + "sample": sample, + "preview": preview, + } + + +def stamp_metadata(cfg: Dict[str, Any], probe: Dict[str, Any]) -> None: + """ + Update metadata timestamps and probe results on a configuration dictionary. + """ + metadata = cfg.setdefault("metadata", {}) + metadata["registered_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ") + metadata["last_test"] = probe diff --git a/tests/integration/test_medtok_medlog_tools.py b/tests/integration/test_medtok_medlog_tools.py new file mode 100644 index 00000000..a708ecf8 --- /dev/null +++ b/tests/integration/test_medtok_medlog_tools.py @@ -0,0 +1,282 @@ +import importlib.util +import json +import os +import sys +import tempfile +import threading +import time +from pathlib import Path + +import pytest +import uvicorn +from fastapi import FastAPI, HTTPException + +from tooluniverse.execute_function import ToolUniverse + + +class _ServerHandle: + """Utility wrapper for running uvicorn servers in tests.""" + + def __init__(self, app: FastAPI, host: str, port: int): + config = uvicorn.Config( + app, host=host, port=port, log_level="error", lifespan="off" + ) + self.server = uvicorn.Server(config) + self.thread = threading.Thread(target=self.server.run, daemon=True) + + def start(self) -> None: + self.thread.start() + while not self.server.started: + time.sleep(0.05) + + def stop(self) -> None: + self.server.should_exit = True + self.thread.join(timeout=5) + + +def _import_medtok_app(module_path: Path): + spec = importlib.util.spec_from_file_location("medtok_service_app", module_path) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +@pytest.fixture(scope="session") +def medtok_server(): + repo_root = Path(__file__).resolve().parents[3] + medtok_root = repo_root / "MedTok-FHIR-Starter" + service_dir = medtok_root / "services" / "medtok_service" + sys.path.insert(0, str(service_dir)) + + base_config_path = medtok_root / "config" / "medtok_config.json" + config_data = json.loads(base_config_path.read_text(encoding="utf-8")) + config_data["code_metadata_path"] = str( + medtok_root / "samples" / "code_metadata.csv" + ) + config_data["graph_edges_path"] = str( + medtok_root / 
"samples" / "code_graph_edges.csv" + ) + tmp_config = tempfile.NamedTemporaryFile( + "w", suffix="_medtok_config.json", delete=False + ) + json.dump(config_data, tmp_config) + tmp_config.flush() + tmp_config.close() + os.environ["MEDTOK_CONFIG"] = tmp_config.name + + module = _import_medtok_app(service_dir / "app.py") + module.MAPPING_CSV = str(medtok_root / "samples" / "code_mapping.csv") + app = module.app + + host = "127.0.0.1" + port = 8910 + server = _ServerHandle(app, host, port) + server.start() + + base_url = f"http://{host}:{port}" + os.environ["MEDTOK_BASE_URL"] = base_url + + yield base_url + + server.stop() + os.environ.pop("MEDTOK_BASE_URL", None) + os.environ.pop("MEDTOK_CONFIG", None) + try: + os.remove(tmp_config.name) + except FileNotFoundError: + pass + sys.path.remove(str(service_dir)) + + +def _build_medlog_collector(store): + app = FastAPI() + + @app.post("/medlog/events/init") + def init(payload: dict): + header = payload.get("header") or {} + event_id = header.get("event_id") + if not event_id: + raise HTTPException(400, "event_id required") + record = { + "header": header, + "model_instance": payload.get("model_instance", {}), + "user_identity": payload.get("user_identity", {}), + "target_identity": payload.get("target_identity"), + "inputs": payload.get("inputs"), + "retention_tier": payload.get("retention_tier", "steady"), + "fragments": [], + } + store[event_id] = record + return {"status": "ok", "event_id": event_id} + + @app.post("/medlog/events/{event_id}/append") + def append(event_id: str, fragment: dict): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + record["fragments"].append(fragment) + return {"status": "ok", "event_id": event_id} + + @app.get("/medlog/events/{event_id}/prov") + def prov(event_id: str): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + return {"event_id": event_id, "provenance": {"header": record["header"]}} + + @app.post("/query") + def query(body: dict): + run_id = body.get("run_id") + event_id = body.get("event_id") + limit = body.get("limit", 50) + matches = [] + for eid, record in store.items(): + header = record["header"] + if event_id and event_id != eid: + continue + if run_id and header.get("run_id") != run_id: + continue + matches.append({"event_id": eid, "header": header}) + if len(matches) >= limit: + break + return {"count": len(matches), "results": matches} + + @app.post("/export/parquet") + def export(): + return {"status": "ok", "outdir": "/tmp/parquet"} + + return app + + +def _build_medlog_fhir(store): + app = FastAPI() + + def _bundle_for_records(records): + entries = [] + for rec in records: + entries.append( + { + "resource": { + "resourceType": "Observation", + "id": rec["header"]["event_id"], + "status": "final", + } + } + ) + return {"resourceType": "Bundle", "type": "collection", "entry": entries} + + @app.get("/bundle/{event_id}") + def bundle(event_id: str): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + return _bundle_for_records([record]) + + @app.get("/bundle/run/{run_id}") + def bundle_run(run_id: str): + records = [ + record + for record in store.values() + if record["header"].get("run_id") == run_id + ] + if not records: + raise HTTPException(404, "run not found") + return _bundle_for_records(records) + + return app + + +@pytest.fixture(scope="session") +def medlog_servers(): + store = {} + host = "127.0.0.1" + collector_port = 8911 + fhir_port = 8912 
+ + collector_app = _build_medlog_collector(store) + fhir_app = _build_medlog_fhir(store) + + collector = _ServerHandle(collector_app, host, collector_port) + fhir = _ServerHandle(fhir_app, host, fhir_port) + collector.start() + fhir.start() + + os.environ["MEDLOG_COLLECTOR_BASE_URL"] = f"http://{host}:{collector_port}" + os.environ["MEDLOG_FHIR_BASE_URL"] = f"http://{host}:{fhir_port}" + + yield store + + collector.stop() + fhir.stop() + os.environ.pop("MEDLOG_COLLECTOR_BASE_URL", None) + os.environ.pop("MEDLOG_FHIR_BASE_URL", None) + + +def test_medtok_rest_tools(medtok_server): + tu = ToolUniverse(hooks_enabled=False) + tu.load_tools(tool_type=["medtok"]) + + tokenize = tu.tools.MedTok_tokenize( + codes=["A00", "E11"], system="ICD-10", include_metadata=True + ) + token_ids = tokenize.get("token_ids", []) + assert isinstance(token_ids, list) + assert len(token_ids) in (0, 2) + + embed = tu.tools.MedTok_embed(codes=["A00"], system="ICD-10") + embeddings = embed.get("embeddings", []) + if embeddings: + assert isinstance(embeddings[0], list) + assert embed.get("dim") == len(embeddings[0]) + + neighbors = tu.tools.MedTok_nearest_neighbors(code="A00", k=3) + neighbor_list = neighbors.get("neighbors", []) + assert len(neighbor_list) <= 3 + + mapped = tu.tools.MedTok_map_text_to_code(text="type 2 diabetes", system="ICD-10") + assert "code" in mapped + + search = tu.tools.MedTok_search_text(text="hypertension", k=4) + assert len(search.get("matches", [])) <= 4 + + code_info = tu.tools.MedTok_code_info(code="E11", system="ICD-10") + assert isinstance(code_info, dict) + + +def test_medlog_tools_workflow(medlog_servers): + tu = ToolUniverse(hooks_enabled=False) + tu.load_tools(tool_type=["medlog"]) + + header = { + "event_id": "evt-1", + "run_id": "run-123", + "timestamp": "2025-01-01T00:00:00Z", + } + model_instance = {"model": "demo", "version": "1.0"} + user_identity = {"name": "Dr. 
Example"} + + init_resp = tu.tools.MedLog_init_event( + header=header, model_instance=model_instance, user_identity=user_identity + ) + assert init_resp["status"] == "ok" + + fragment = {"outputs": {"summary": "Patient stable"}} + append_resp = tu.tools.MedLog_append_fragment(event_id="evt-1", fragment=fragment) + assert append_resp["status"] == "ok" + + prov_resp = tu.tools.MedLog_get_provenance(event_id="evt-1") + assert prov_resp["event_id"] == "evt-1" + + query_resp = tu.tools.MedLog_query_events(run_id="run-123") + assert query_resp["count"] == 1 + assert query_resp["results"][0]["event_id"] == "evt-1" + + export_resp = tu.tools.MedLog_export_parquet() + assert export_resp["status"] == "ok" + + bundle_resp = tu.tools.MedLog_fhir_bundle(event_id="evt-1") + assert bundle_resp["resourceType"] == "Bundle" + + run_bundle_resp = tu.tools.MedLog_fhir_run_bundle(run_id="run-123") + assert len(run_bundle_resp["entry"]) == 1 diff --git a/tests/unit/test_biodomain_tools.py b/tests/unit/test_biodomain_tools.py new file mode 100644 index 00000000..ec769b13 --- /dev/null +++ b/tests/unit/test_biodomain_tools.py @@ -0,0 +1,117 @@ +from unittest.mock import MagicMock + +import pytest + +from tooluniverse.interpro_tool import InterProTool +from tooluniverse.kegg_tool import KEGGTool +from tooluniverse.iucn_tool import IUCNRedListTool, IUCN_TOKEN_ENV +from tooluniverse.jaspar_tool import JASPARRestTool +from tooluniverse.marine_species_tool import MarineSpeciesTool +from tooluniverse.cbioportal_tool import CBioPortalTool +from tooluniverse.phenome_jax_tool import PhenomeJaxTool + + +def _mock_session_get(monkeypatch, target, payload=None, text=None, status_code=200): + response = MagicMock() + response.status_code = status_code + if payload is not None: + response.json.return_value = payload + if text is not None: + response.text = text + response.raise_for_status.return_value = None + + def factory(self, *args, **kwargs): + return response + + monkeypatch.setattr(target, factory) + return response + + +@pytest.mark.unit +def test_interpro_tool(monkeypatch): + payload = { + "count": 2, + "results": [ + {"metadata": {"accession": "IPR000001", "name": "Example A", "type": "family"}}, + {"metadata": {"accession": "IPR000002", "name": "Example B", "type": "domain"}}, + ], + } + _mock_session_get(monkeypatch, "requests.Session.get", payload=payload) + tool = InterProTool({"name": "InterPro_search_entries"}) + result = tool.run({"query": "kinase"}) + assert result["count"] == 2 + assert result["results"][0]["accession"] == "IPR000001" + + +@pytest.mark.unit +def test_kegg_tool(monkeypatch): + text = "path:map00010\tGlycolysis / Gluconeogenesis\n" + _mock_session_get(monkeypatch, "requests.Session.get", text=text) + tool = KEGGTool({"name": "KEGG_find_entries"}) + result = tool.run({"query": "glucose", "database": "pathway"}) + assert result[0]["id"] == "path:map00010" + + +@pytest.mark.unit +def test_iucn_tool_requires_token(monkeypatch): + tool = IUCNRedListTool({"name": "IUCN_get_species_status"}) + result = tool.run({"species": "Panthera leo"}) + assert "error" in result + + +@pytest.mark.unit +def test_iucn_tool(monkeypatch): + payload = {"result": [{"scientific_name": "Panthera leo", "category": "VU"}]} + _mock_session_get(monkeypatch, "requests.Session.get", payload=payload) + monkeypatch.setenv(IUCN_TOKEN_ENV, "dummy") + tool = IUCNRedListTool({"name": "IUCN_get_species_status"}) + result = tool.run({"species": "Panthera leo"}) + assert result["results"][0]["category"] == "VU" + + +@pytest.mark.unit 
+def test_jaspar_tool(monkeypatch): + payload = { + "count": 1, + "results": [{"matrix_id": "MA0004.1", "name": "Arnt", "collection": "CORE"}], + } + _mock_session_get(monkeypatch, "requests.Session.get", payload=payload) + tool = JASPARRestTool({"name": "JASPAR_search_motifs"}) + result = tool.run({"query": "Arnt"}) + assert result["results"][0]["matrix_id"] == "MA0004.1" + + +@pytest.mark.unit +def test_marine_species_tool(monkeypatch): + payload = [{"AphiaID": 137094, "scientificname": "Delphinus delphis"}] + _mock_session_get(monkeypatch, "requests.Session.get", payload=payload) + tool = MarineSpeciesTool({"name": "MarineSpecies_lookup"}) + result = tool.run({"scientific_name": "Delphinus delphis"}) + assert result[0]["AphiaID"] == 137094 + + +@pytest.mark.unit +def test_cbioportal_tool(monkeypatch): + payload = [ + {"studyId": "brca_tcga", "name": "Breast Cancer", "description": "Example"} + ] + _mock_session_get(monkeypatch, "requests.Session.get", payload=payload) + tool = CBioPortalTool({"name": "cBioPortal_search_studies"}) + result = tool.run({"keyword": "breast"}) + assert result["results"][0]["studyId"] == "brca_tcga" + + +@pytest.mark.unit +def test_phenome_jax_tool(monkeypatch): + payload = { + "count": 2, + "projects": [ + {"projid": 1, "title": "Glucose tolerance", "species": "mouse"}, + {"projid": 2, "title": "Insulin", "species": "mouse"}, + ], + } + _mock_session_get(monkeypatch, "requests.Session.get", payload=payload) + tool = PhenomeJaxTool({"name": "PhenomeJax_list_projects"}) + result = tool.run({"keyword": "glucose", "limit": 1}) + assert result["count"] == 2 + assert result["projects"][0]["projid"] == 1
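+
+
+# None of the tests above touch the network: every HTTP call is intercepted by
+# the _mock_session_get helper. They can be run with e.g. ``pytest -m unit``,
+# assuming the "unit" marker is registered in the project's pytest config.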