From 1abe40ef0a9c1e411e82550137c1791c69d2b73c Mon Sep 17 00:00:00 2001 From: SufianTA Date: Fri, 24 Oct 2025 09:42:51 -0700 Subject: [PATCH 1/8] Commit message for new feature --- src/tooluniverse/__init__.py | 8 + src/tooluniverse/candidate_tester_tool.py | 48 ++ src/tooluniverse/common_utils.py | 30 ++ src/tooluniverse/context_keeper_tool.py | 105 ++++ src/tooluniverse/data/vsd.json | 35 ++ src/tooluniverse/data/vsd_allowlist.json | 4 + src/tooluniverse/data/vsd_tools.json | 34 ++ src/tooluniverse/harvest/__init__.py | 1 + src/tooluniverse/harvest/domain_policies.py | 59 +++ src/tooluniverse/harvest/openapi_utils.py | 67 +++ src/tooluniverse/harvest/promoter.py | 101 ++++ src/tooluniverse/harvest/query_expansion.py | 28 + src/tooluniverse/harvest/ranker.py | 36 ++ src/tooluniverse/harvest/searchers.py | 64 +++ src/tooluniverse/harvest/static_catalog.py | 539 ++++++++++++++++++++ src/tooluniverse/harvest/verifier.py | 33 ++ src/tooluniverse/tool_navigator_tool.py | 110 ++++ src/tooluniverse/tool_registry.py | 15 + src/tooluniverse/vsd_api_tool.py | 115 +++++ src/tooluniverse/vsd_catalog.py | 44 ++ src/tooluniverse/vsd_registry.py | 91 ++++ src/tooluniverse/vsd_tool.py | 115 +++++ src/tooluniverse/vsd_utils.py | 212 ++++++++ 23 files changed, 1894 insertions(+) create mode 100644 src/tooluniverse/candidate_tester_tool.py create mode 100644 src/tooluniverse/common_utils.py create mode 100644 src/tooluniverse/context_keeper_tool.py create mode 100644 src/tooluniverse/data/vsd.json create mode 100644 src/tooluniverse/data/vsd_allowlist.json create mode 100644 src/tooluniverse/data/vsd_tools.json create mode 100644 src/tooluniverse/harvest/__init__.py create mode 100644 src/tooluniverse/harvest/domain_policies.py create mode 100644 src/tooluniverse/harvest/openapi_utils.py create mode 100644 src/tooluniverse/harvest/promoter.py create mode 100644 src/tooluniverse/harvest/query_expansion.py create mode 100644 src/tooluniverse/harvest/ranker.py create mode 100644 src/tooluniverse/harvest/searchers.py create mode 100644 src/tooluniverse/harvest/static_catalog.py create mode 100644 src/tooluniverse/harvest/verifier.py create mode 100644 src/tooluniverse/tool_navigator_tool.py create mode 100644 src/tooluniverse/vsd_api_tool.py create mode 100644 src/tooluniverse/vsd_catalog.py create mode 100644 src/tooluniverse/vsd_registry.py create mode 100644 src/tooluniverse/vsd_tool.py create mode 100644 src/tooluniverse/vsd_utils.py diff --git a/src/tooluniverse/__init__.py b/src/tooluniverse/__init__.py index bed8e3f3..6a001040 100644 --- a/src/tooluniverse/__init__.py +++ b/src/tooluniverse/__init__.py @@ -278,6 +278,11 @@ def __getattr__(self, name): from .core_tool import CoreTool from .pmc_tool import PMCTool from .zenodo_tool import ZenodoTool + from . import vsd_tool # registers VerifiedSourceDiscoveryTool + VerifiedSourceRegisterTool + from . import vsd_api_tool # registers GenericRESTTool + GenericGraphQLTool + from . import context_keeper_tool # registers ContextKeeperTool + from . import candidate_tester_tool # registers HarvestCandidateTesterTool + from . 
import tool_navigator_tool # registers ToolNavigatorTool else: # With lazy loading, create lazy import proxies that import modules only when accessed MonarchTool = _LazyImportProxy("restful_tool", "MonarchTool") @@ -453,6 +458,9 @@ def __getattr__(self, name): "ODPHPItemList", "ODPHPTopicSearch", "ODPHPOutlinkFetch", + "ContextKeeperTool", + "HarvestCandidateTesterTool", + "ToolNavigatorTool", "CellosaurusSearchTool", "CellosaurusQueryConverterTool", "CellosaurusGetCellLineInfoTool", diff --git a/src/tooluniverse/candidate_tester_tool.py b/src/tooluniverse/candidate_tester_tool.py new file mode 100644 index 00000000..d37f95b5 --- /dev/null +++ b/src/tooluniverse/candidate_tester_tool.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import Any, Dict + +from .tool_registry import register_tool +from .vsd_utils import build_config, probe_config + + +@register_tool("HarvestCandidateTesterTool") +class HarvestCandidateTesterTool: + """ + Validate harvest/VSD candidates without registering them. + Returns HTTP diagnostics and suggestions for default params or headers. + """ + + name = "HarvestCandidateTesterTool" + description = "Test a harvest candidate endpoint to see if it returns usable JSON." + input_schema = { + "type": "object", + "properties": { + "candidate": {"type": "object"}, + "tool_type": {"type": "string", "default": "dynamic_rest"}, + "default_params": {"type": "object"}, + "default_headers": {"type": "object"}, + }, + "required": ["candidate"], + "additionalProperties": False, + } + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + candidate = arguments.get("candidate") or {} + tool_type = arguments.get("tool_type") or "dynamic_rest" + default_params = arguments.get("default_params") + default_headers = arguments.get("default_headers") + + cfg = build_config( + candidate, + tool_type=tool_type, + default_params=default_params, + default_headers=default_headers, + ) + probe = probe_config(cfg) + + return { + "ok": bool(probe.get("ok")), + "test": probe, + "config": cfg, + } diff --git a/src/tooluniverse/common_utils.py b/src/tooluniverse/common_utils.py new file mode 100644 index 00000000..8fdb5d85 --- /dev/null +++ b/src/tooluniverse/common_utils.py @@ -0,0 +1,30 @@ + +import os, json, time, threading, base64, io +from typing import Any, Dict, Tuple + +_LOCK = threading.Lock() + +def ensure_dir(path: str): + os.makedirs(path, exist_ok=True) + +def vsd_generated_path() -> str: + base = os.environ.get("TOOLUNIVERSE_VSD_DIR") or os.path.join(os.path.expanduser("~"), ".tooluniverse", "vsd") + ensure_dir(base) + return os.path.join(base, "generated_tools.json") + +def read_json(path: str, default): + try: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception: + return default + +def write_json(path: str, data: Any): + ensure_dir(os.path.dirname(path)) + tmp_path = f"{path}.tmp" + with open(tmp_path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2) + os.replace(tmp_path, path) + +def b64_png(png_bytes: bytes) -> str: + return base64.b64encode(png_bytes).decode("ascii") diff --git a/src/tooluniverse/context_keeper_tool.py b/src/tooluniverse/context_keeper_tool.py new file mode 100644 index 00000000..46dd2b0c --- /dev/null +++ b/src/tooluniverse/context_keeper_tool.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +import json +import os +from typing import Any, Dict, Optional + +from .tool_registry import register_tool + +CONTEXT_DIR = os.path.join(os.path.expanduser("~"), ".tooluniverse", 
"context") +CONTEXT_PATH = os.path.join(CONTEXT_DIR, "context.json") + + +def _ensure_dir() -> None: + os.makedirs(CONTEXT_DIR, exist_ok=True) + + +def _load_context() -> Dict[str, Any]: + if not os.path.exists(CONTEXT_PATH): + return {} + try: + with open(CONTEXT_PATH, "r", encoding="utf-8") as handle: + data = json.load(handle) + if isinstance(data, dict): + return data + except Exception: + pass + return {} + + +def _write_context(data: Dict[str, Any]) -> None: + _ensure_dir() + tmp_path = f"{CONTEXT_PATH}.tmp" + with open(tmp_path, "w", encoding="utf-8") as handle: + json.dump(data, handle, indent=2, ensure_ascii=False) + os.replace(tmp_path, CONTEXT_PATH) + + +@register_tool("ContextKeeperTool") +class ContextKeeperTool: + """ + Lightweight context store that agents can use to persist conversation or task state + between ToolUniverse calls. Data is saved under ~/.tooluniverse/context/context.json. + """ + + name = "ContextKeeperTool" + description = "Persist or retrieve task context (key/value pairs) for ongoing agent workflows." + input_schema = { + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["get", "set", "append", "clear", "keys"], + "default": "get", + }, + "key": {"type": "string", "description": "Context entry name"}, + "value": { + "description": "Value to store; for append operations this should be a list item.", + }, + }, + "additionalProperties": False, + } + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + action = (arguments.get("action") or "get").lower() + key: Optional[str] = arguments.get("key") + value: Any = arguments.get("value") + + context = _load_context() + + if action == "keys": + return {"ok": True, "keys": sorted(context.keys())} + + if action == "clear": + if key: + removed = context.pop(key, None) is not None + _write_context(context) + return {"ok": removed, "cleared": key if removed else None} + context.clear() + _write_context(context) + return {"ok": True, "cleared": "all"} + + if action == "set": + if key is None: + return {"ok": False, "error": "key is required for set"} + context[key] = value + _write_context(context) + return {"ok": True, "key": key, "value": value} + + if action == "append": + if key is None: + return {"ok": False, "error": "key is required for append"} + existing = context.get(key) + if existing is None: + context[key] = [value] + elif isinstance(existing, list): + existing.append(value) + else: + context[key] = [existing, value] + _write_context(context) + return {"ok": True, "key": key, "value": context[key]} + + # default: get + if key: + return {"ok": True, "key": key, "value": context.get(key)} + return {"ok": True, "value": context} diff --git a/src/tooluniverse/data/vsd.json b/src/tooluniverse/data/vsd.json new file mode 100644 index 00000000..b359048e --- /dev/null +++ b/src/tooluniverse/data/vsd.json @@ -0,0 +1,35 @@ +[ + { + "name": "GenericHarvestTool", + "type": "GenericHarvestTool", + "description": "Live-harvest candidate API endpoints by invoking all modules in tooluniverse.harvest.", + "tool_type": "special_tools", + "enabled": true, + "visible": true, + "parameter": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Free-text hint, passed to all harvesters under tooluniverse.harvest." + }, + "urls": { + "type": "array", + "items": { + "type": "string", + "format": "uri" + }, + "description": "Explicit candidate URLs to validate and return (skips live harvesting)." 
+ }, + "limit": { + "type": "integer", + "minimum": 1, + "maximum": 50, + "default": 5, + "description": "Max number of candidates to return." + } + }, + "additionalProperties": false + } + } +] diff --git a/src/tooluniverse/data/vsd_allowlist.json b/src/tooluniverse/data/vsd_allowlist.json new file mode 100644 index 00000000..3c5258a2 --- /dev/null +++ b/src/tooluniverse/data/vsd_allowlist.json @@ -0,0 +1,4 @@ +[ +{"domain": "ema.europa.eu", "label": "EMA", "trust": 0.95, "registry": "ema"}, +{"domain": "ghoapi.azureedge.net", "label": "WHO GHO", "trust": 0.92, "registry": "who"} +] \ No newline at end of file diff --git a/src/tooluniverse/data/vsd_tools.json b/src/tooluniverse/data/vsd_tools.json new file mode 100644 index 00000000..398cd86b --- /dev/null +++ b/src/tooluniverse/data/vsd_tools.json @@ -0,0 +1,34 @@ +[ + { + "type": "VerifiedSourceDiscoveryTool", + "name": "vsd_discover_sources", + "description": "Discover trusted candidate sources for a free-text query", + "parameter": { + "type": "object", + "required": ["query"], + "properties": { + "query": { "type": "string" }, + "limit": { "type": "integer" }, + "allowlist_overrides": { "type": "array" } + } + }, + "label": ["VSD", "Discovery"] + }, + { + "type": "VerifiedSourceRegisterTool", + "name": "vsd_register_tool", + "description": "Register a VSD-generated tool bound to a trusted source", + "parameter": { + "type": "object", + "required": ["candidate", "tool_name"], + "properties": { + "candidate": { "type": "object" }, + "tool_name": { "type": "string" }, + "description": { "type": "string" }, + "parameter_overrides": { "type": "object" }, + "evidence_sample": { "type": "object" } + } + }, + "label": ["VSD", "Synthesis"] + } +] diff --git a/src/tooluniverse/harvest/__init__.py b/src/tooluniverse/harvest/__init__.py new file mode 100644 index 00000000..19c21109 --- /dev/null +++ b/src/tooluniverse/harvest/__init__.py @@ -0,0 +1 @@ +# Harvest subpackage diff --git a/src/tooluniverse/harvest/domain_policies.py b/src/tooluniverse/harvest/domain_policies.py new file mode 100644 index 00000000..49031914 --- /dev/null +++ b/src/tooluniverse/harvest/domain_policies.py @@ -0,0 +1,59 @@ +from __future__ import annotations +from functools import lru_cache +from typing import Dict, List + +# Conservative allow/deny fragments. We still compute a trust score as a gradient. 
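+# Illustrative outputs for trust_score() below (assuming these fragment lists
+# stay unchanged; scores follow the additive weights inside trust_score):
+#   trust_score("data.cdc.gov") -> {"score": 0.73, "reasons": ["gov/igo domain", "open data portal"]}
+#   trust_score("scribd.com")   -> {"score": 0.0, "reasons": ["blocked"]}
+#   trust_score("example.com")  -> {"score": 0.0, "reasons": []}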
+ALLOWED_FRAGMENTS: List[str] = [ + # government & intergovernmental + ".gov", ".mil", ".gob", ".gouv", ".go.", ".govt.nz", ".gc.ca", + "who.int", "worldbank.org", "oecd.org", "europa.eu", "esa.int", + # major scientific/health orgs + "nih.gov", "niddk.nih.gov", "ninds.nih.gov", "ncbi.nlm.nih.gov", "data.cdc.gov", "api.cdc.gov", + "fda.gov", "api.fda.gov", "epa.gov", "noaa.gov", "usgs.gov", "census.gov", + "data.gov", "healthdata.gov", "data.cms.gov", "data.hrsa.gov", "data.hhs.gov", + "ghoapi.azureedge.net", +] + +BLOCKED_FRAGMENTS: List[str] = [ + "mirror", "docshare", "scribd.com", "sharepdf", "academia.edu", + "stackprinter", "cachedview", "wayback", "pirated", "scrapeops", +] + +@lru_cache(maxsize=4096) +def domain_blocked(host: str) -> bool: + h = (host or "").lower() + return any(b in h for b in BLOCKED_FRAGMENTS) + +@lru_cache(maxsize=4096) +def domain_allowed(host: str) -> bool: + # allow if any strong allow fragment present AND not blocked + h = (host or "").lower() + if domain_blocked(h): + return False + return any(a in h for a in ALLOWED_FRAGMENTS) + +@lru_cache(maxsize=4096) +def trust_score(host: str) -> Dict: + """Return a graded trust score in [0,1] with reasons for ranking. + We don't *block* here (that's domain_blocked); we provide a signal for ranker. + """ + h = (host or "").lower() + score = 0.0 + reasons: List[str] = [] + if domain_blocked(h): + return {"score": 0.0, "reasons": ["blocked"]} + + # strong positives + if any(tld in h for tld in (".gov", "who.int", "worldbank.org", "europa.eu", "oecd.org")): + score += 0.65; reasons.append("gov/igo domain") + if any(seg in h for seg in ("nih.gov","ncbi.nlm.nih.gov","fda.gov","epa.gov","noaa.gov","usgs.gov","census.gov")): + score += 0.2; reasons.append("major science/health org") + # medium positives + if h.startswith("api.") or "/api" in h: + score += 0.05; reasons.append("api host") + # slight boost for data portals + if any(seg in h for seg in ("data.gov","healthdata.gov","data.cms.gov","data.cdc.gov","data.europa.eu")): + score += 0.08; reasons.append("open data portal") + + score = max(0.0, min(1.0, score)) + return {"score": round(score, 3), "reasons": reasons} \ No newline at end of file diff --git a/src/tooluniverse/harvest/openapi_utils.py b/src/tooluniverse/harvest/openapi_utils.py new file mode 100644 index 00000000..4adcddd0 --- /dev/null +++ b/src/tooluniverse/harvest/openapi_utils.py @@ -0,0 +1,67 @@ +from __future__ import annotations +import re, logging, json +from typing import Dict, Optional, List +import requests + +logger = logging.getLogger("OpenAPIUtils") + +OPENAPI_HINTS = ["openapi.json","openapi.yaml","openapi.yml","swagger.json","swagger.yaml","v3/api-docs"] + +def _root_of(url: str) -> str: + base = url.split("?",1)[0] + base = re.sub(r"(#.*)$","", base) + base = re.sub(r"/+$","", base) + m = re.match(r"^https?://[^/]+", base) + return m.group(0) if m else base + +def find_openapi_from_url(any_url: str) -> Optional[str]: + root = _root_of(any_url) + # try /openapi.json etc. 
at root and one level up + tries = [f"{root}/{hint}" for hint in OPENAPI_HINTS] + # also try without trailing /api segment if present + if root.endswith("/api"): + base = root.rsplit("/",1)[0] + tries.extend(f"{base}/{hint}" for hint in OPENAPI_HINTS) + for t in tries: + try: + r = requests.get(t, timeout=8) + if r.status_code == 200 and ("json" in r.headers.get("Content-Type","") or t.endswith(".json")): + # quick JSON sanity + try: + j = r.json() + if "openapi" in j or "swagger" in j: + return t + except Exception: + pass + if r.status_code == 200 and (t.endswith(".yaml") or t.endswith(".yml")): + return t + except requests.RequestException: + continue + return None + +def parse_openapi(spec_url: str) -> Dict: + r = requests.get(spec_url, timeout=15) + r.raise_for_status() + text = r.text + if spec_url.endswith((".yaml",".yml")): + try: + import yaml + except Exception as e: + raise RuntimeError("YAML support requires PyYAML: pip install pyyaml") from e + spec = yaml.safe_load(text) + else: + spec = r.json() + + servers = spec.get("servers") or [] + base_url = (servers[0].get("url") if servers and isinstance(servers[0], dict) else None) or None + + paths = spec.get("paths") or {} + endpoints: List[Dict] = [] + for path, methods in paths.items(): + if not isinstance(methods, dict): + continue + for method, meta in methods.items(): + if method.upper() not in ("GET","POST","PUT","PATCH","DELETE","OPTIONS","HEAD"): + continue + endpoints.append({"path": path, "method": method.upper(), "summary": (meta or {}).get("summary")}) + return {"base_url": base_url, "endpoints": endpoints} \ No newline at end of file diff --git a/src/tooluniverse/harvest/promoter.py b/src/tooluniverse/harvest/promoter.py new file mode 100644 index 00000000..6ef0d4d6 --- /dev/null +++ b/src/tooluniverse/harvest/promoter.py @@ -0,0 +1,101 @@ +from __future__ import annotations +import os, json, tempfile, shutil +from typing import Dict, Any, List + +# Where we persist generated tool configs so DynamicREST (or your server boot) +# can load them. Mirrors your earlier logs (~/.tooluniverse/vsd/generated_tools.json). +VSD_DIR = os.path.join(os.path.expanduser("~"), ".tooluniverse", "vsd") +VSD_PATH = os.path.join(VSD_DIR, "generated_tools.json") + +def _ensure_dir(): + os.makedirs(VSD_DIR, exist_ok=True) + +def _read_json(path: str) -> Any: + if not os.path.exists(path): + return {} + try: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) or {} + except Exception: + return {} + +def _atomic_write(path: str, data: Any): + tmp_fd, tmp_path = tempfile.mkstemp(prefix="vsd_", suffix=".json") + os.close(tmp_fd) + with open(tmp_path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + shutil.move(tmp_path, path) + +def _slug(host: str) -> str: + return (host or "unknown").lower().replace(".", "_").replace("-", "_") + +def build_candidate_tool_json(c: Dict[str, Any]) -> Dict[str, Any]: + # Minimal, UI-friendly payload for listing/debug + return { + "name": c.get("name"), + "host": c.get("host"), + "base_url": c.get("base_url"), + "doc_url": c.get("doc_url"), + "openapi_url": c.get("openapi_url"), + "endpoints": c.get("endpoints"), + "health": c.get("health"), + "cors": c.get("cors"), + "trust": c.get("trust"), + "source": c.get("source"), + "_rank_score": c.get("_rank_score"), + } + +def _dynamicrest_tool_config(c: Dict[str, Any]) -> Dict[str, Any]: + """Produce a DynamicREST-style tool definition. + Two modes: + - OpenAPI mode (preferred): reference spec URL. 
+ - Manual mode: infer a few GET endpoints from verification results. + """ + name = f"vsd_auto_{_slug(c.get('host') or '')}" + base_url = c.get("base_url") + openapi_url = c.get("openapi_url") + endpoints = c.get("endpoints") or [] + + cfg: Dict[str, Any] = { + "name": name, + "type": "DynamicREST", + "base_url": base_url, + "auth": c.get("auth") or {"type": "none"}, + "metadata": { + "source": c.get("source"), + "trust": c.get("trust"), + "health": c.get("health"), + "doc_url": c.get("doc_url"), + }, + } + if openapi_url: + cfg["openapi"] = {"spec_url": openapi_url} + elif endpoints: + # Trim to a handful of GET endpoints + routes: List[Dict[str, Any]] = [] + for ep in endpoints[:5]: + routes.append({ + "method": ep.get("method") or "GET", + "path": ep.get("path") or "/", + "name": (ep.get("summary") or ep.get("path") or "endpoint").strip("/").replace("/", "_")[:64] or "endpoint", + }) + cfg["routes"] = routes + else: + # Last resort: allow a generic GET on '/' + cfg["routes"] = [{"method": "GET", "path": "/"}] + return cfg + +def promote_to_dynamicrest(c: Dict[str, Any]) -> str: + """Append/Update the generated tool config file so your server can load it. + Returns the registered tool name. + """ + _ensure_dir() + current = _read_json(VSD_PATH) + if not isinstance(current, dict): + current = {} + + cfg = _dynamicrest_tool_config(c) + name = cfg.get("name") or "vsd_auto_unknown" + current[name] = cfg + _atomic_write(VSD_PATH, current) + return name \ No newline at end of file diff --git a/src/tooluniverse/harvest/query_expansion.py b/src/tooluniverse/harvest/query_expansion.py new file mode 100644 index 00000000..4ac4e959 --- /dev/null +++ b/src/tooluniverse/harvest/query_expansion.py @@ -0,0 +1,28 @@ + +from __future__ import annotations +from typing import List + +DENTAL_SYNONYMS = [ + "oral health", "dentistry", "dental caries", "tooth decay", + "periodontal", "periodontitis", "orthodontic", "endodontic", + "prosthodontic", "oral cancer", "DMFT", "fluoride", "NIDCR", "CDC Oral Health", + "WHO Oral Health" +] + +def expand_queries(query: str, max_queries: int = 6) -> List[str]: + base = query.strip() + if not base: + return [] + expanded = [base, + f"{base} WHO API", + f"{base} site:who.int", + f"{base} site:data.cdc.gov", + f"{base} site:api.fda.gov"] + for syn in DENTAL_SYNONYMS[:4]: + expanded.append(f"{base} {syn}") + # de-dup and clip + seen = [] + for q in expanded: + if q not in seen: + seen.append(q) + return seen[:max_queries] diff --git a/src/tooluniverse/harvest/ranker.py b/src/tooluniverse/harvest/ranker.py new file mode 100644 index 00000000..aa898ad1 --- /dev/null +++ b/src/tooluniverse/harvest/ranker.py @@ -0,0 +1,36 @@ +from __future__ import annotations +import math +from typing import List, Dict + +def _sim(a: str, b: str) -> float: + a,b = (a or "").lower(), (b or "").lower() + if not a or not b: + return 0.0 + aset, bset = set(a.split()), set(b.split()) + overlap = len(aset & bset) + return overlap / (len(aset) + 1e-6) + +def rank_candidates(query: str, candidates: List[Dict]) -> List[Dict]: + def score(c: Dict) -> float: + trust = float(((c.get("trust") or {}).get("score") or 0.0)) + h = c.get("health") or {} + live = 1.0 if (h.get("ok") and (h.get("status",0) < 500)) else 0.0 + lat = h.get("latency_ms") or 1500 + lat_norm = max(0.0, 1.0 - min(lat, 4000)/4000.0) + fit = max(_sim(query, c.get("name","")), _sim(query, c.get("doc_url",""))) + has_spec = 1.0 if c.get("openapi_url") else 0.2 if c.get("endpoints") else 0.0 + cors = 0.3 if (c.get("cors") or 
{}).get("preflight") else 0.0 + match_bonus = float(c.get("_match_score") or 0.0) + return ( + 0.25 * trust + + 0.2 * (live * lat_norm) + + 0.23 * fit + + 0.1 * has_spec + + 0.05 * cors + + (0.35 * math.log1p(match_bonus) if match_bonus > 0 else 0.0) + ) + + ranked = sorted(candidates, key=score, reverse=True) + for i, c in enumerate(ranked): + c["_rank_score"] = round(score(c), 4) + return ranked diff --git a/src/tooluniverse/harvest/searchers.py b/src/tooluniverse/harvest/searchers.py new file mode 100644 index 00000000..e9daf2e8 --- /dev/null +++ b/src/tooluniverse/harvest/searchers.py @@ -0,0 +1,64 @@ +from __future__ import annotations +import os, re, logging, requests, json +from dataclasses import dataclass +from typing import List, Optional, Dict, Any + +logger = logging.getLogger("HarvestSearch") +DEFAULT_TIMEOUT = int(os.getenv("HARVEST_TIMEOUT_S", "8")) + +@dataclass +class SearchResult: + title: str + url: str + snippet: str + source: str + +def _clean_host(url: str) -> str: + return re.sub(r"^https?://", "", url or "").split("/")[0].lower() + +def _normalize_candidate_url(url: str) -> str: + return (url or "").strip() + +# ---------------- CKAN adapter ---------------- +def _search_ckan(query: str, rows: int, base_url: str) -> List[SearchResult]: + out: List[SearchResult] = [] + try: + r = requests.get(base_url, params={"q": query, "rows": rows}, timeout=DEFAULT_TIMEOUT) + r.raise_for_status() + payload = r.json() + # CKAN payload guard + result = (payload or {}).get("result") or {} + for pkg in result.get("results", []): + title = pkg.get("title") or pkg.get("name") or "CKAN dataset" + notes = (pkg.get("notes") or "")[:240] + for res in (pkg.get("resources") or []): + res_url = _normalize_candidate_url(res.get("url") or "") + if not res_url: + continue + out.append(SearchResult(title=title, url=res_url, snippet=notes, source=f"ckan:{_clean_host(base_url)}")) + except Exception as e: + logger.debug("CKAN search failed for %s: %s", base_url, e) + return out + +CATALOG_ADAPTERS = { + "ckan": _search_ckan, +} + +def search_for_apis(query: str, rows: int = 100, catalogs: Optional[List[Dict[str, Any]]] = None) -> List[SearchResult]: + """Search across configured catalogs. + catalogs: list of dicts, e.g. [{"type": "ckan", "url": "https://.../api/3/action/package_search"}] + You can supply this via env HARVEST_CATALOGS='[ ... ]' or pass in directly. 
+ """ + results: List[SearchResult] = [] + catalogs = catalogs or [] + for cat in catalogs: + ctype = (cat.get("type") or "").lower().strip() + url = cat.get("url") or "" + if not ctype or not url: + continue + adapter = CATALOG_ADAPTERS.get(ctype) + if not adapter: + logger.debug("Unknown catalog type %s, skipping", ctype) + continue + results.extend(adapter(query=query, rows=rows, base_url=url)) + return results diff --git a/src/tooluniverse/harvest/static_catalog.py b/src/tooluniverse/harvest/static_catalog.py new file mode 100644 index 00000000..83536f94 --- /dev/null +++ b/src/tooluniverse/harvest/static_catalog.py @@ -0,0 +1,539 @@ +from __future__ import annotations + +import math +import re +from copy import deepcopy +from dataclasses import dataclass +from typing import Dict, Iterable, List, Set +from urllib.parse import urlparse + +from .domain_policies import trust_score +from .ranker import rank_candidates + + +# ----------------------------------------------------------------------------- +# Static catalog data +# ----------------------------------------------------------------------------- + +RAW_CATALOG: List[Dict[str, object]] = [ + { + "name": "ClinicalTrials.gov Study Fields API", + "url": "https://clinicaltrials.gov/api/query/study_fields", + "doc_url": "https://clinicaltrials.gov/api/gui/home", + "description": "Query structured fields from the ClinicalTrials.gov registry covering study design, enrollment, outcomes, and locations.", + "keywords": ["clinical", "trial", "study", "research", "ctgov", "clinicaltrials"], + "category": "clinical_trials", + "base_score": 0.95, + "endpoints": [ + {"method": "GET", "path": "/api/query/study_fields", "summary": "Query study fields"}, + {"method": "GET", "path": "/api/query/full_studies", "summary": "Fetch full study records"}, + ], + }, + { + "name": "NCI Clinical Trials API", + "url": "https://clinicaltrialsapi.cancer.gov/api/v1/clinical-trials", + "doc_url": "https://clinicaltrialsapi.cancer.gov", + "description": "REST API exposing cancer clinical trials curated by the National Cancer Institute (NCI) with filters across disease, stage, and therapy.", + "keywords": ["clinical", "trial", "oncology", "cancer", "nci", "research"], + "category": "clinical_trials", + "base_score": 0.88, + "endpoints": [ + {"method": "GET", "path": "/api/v1/clinical-trials", "summary": "Search cancer clinical trials"}, + {"method": "GET", "path": "/api/v1/diseases", "summary": "List disease terms"}, + ], + }, + { + "name": "FDA OpenFDA Drug Label API", + "url": "https://api.fda.gov/drug/label.json", + "doc_url": "https://open.fda.gov/apis/drug/label/", + "description": "OpenFDA drug labeling information with pharmacology, indications, warnings, and dosage guidance.", + "keywords": ["drug", "label", "fda", "pharmaceutical", "medication", "clinical"], + "category": "pharmacovigilance", + "base_score": 0.6, + "endpoints": [ + {"method": "GET", "path": "/drug/label.json", "summary": "Query drug labeling records"}, + {"method": "GET", "path": "/drug/event.json", "summary": "Retrieve drug adverse events"}, + ], + }, + { + "name": "FDA OpenFDA Adverse Events API", + "url": "https://api.fda.gov/drug/event.json", + "doc_url": "https://open.fda.gov/apis/drug/event/", + "description": "Adverse event case reports submitted to FDA FAERS with patient outcomes and drug role details.", + "keywords": ["adverse", "event", "pharmacovigilance", "drug safety", "faers"], + "category": "pharmacovigilance", + "base_score": 0.65, + "endpoints": [ + {"method": "GET", "path": 
"/drug/event.json", "summary": "Search FAERS adverse event data"}, + ], + }, + { + "name": "FDA OpenFDA Device Recall API", + "url": "https://api.fda.gov/device/recall.json", + "doc_url": "https://open.fda.gov/apis/device/recall/", + "description": "Medical device recall records including classification, recall reason, and event dates.", + "keywords": ["medical device", "recall", "fda", "safety", "compliance"], + "category": "device_safety", + "base_score": 0.55, + "endpoints": [ + {"method": "GET", "path": "/device/recall.json", "summary": "Retrieve device recall records"}, + ], + }, + { + "name": "CDC Socrata Open Data API", + "url": "https://data.cdc.gov/resource/9mfq-cb36.json", + "doc_url": "https://dev.socrata.com/foundry/data.cdc.gov/9mfq-cb36", + "description": "CDC curated datasets accessible via the Socrata Open Data API, including COVID-19 cases and vaccinations.", + "keywords": ["cdc", "public health", "covid", "vaccination", "socrata", "open data"], + "category": "public_health", + "base_score": 0.86, + "endpoints": [ + {"method": "GET", "path": "/resource/.json", "summary": "Query CDC open datasets"}, + ], + }, + { + "name": "CDC PLACES Community Health API", + "url": "https://chronicdata.cdc.gov/resource/cwsq-ngmh.json", + "doc_url": "https://dev.socrata.com/foundry/chronicdata.cdc.gov/cwsq-ngmh", + "description": "Model-based estimates for chronic disease, health risk factors, and preventive services at local levels; supports community health assessments and dental health overlays.", + "keywords": ["community health", "chronic disease", "behavioral health", "cdc", "oral health"], + "category": "public_health", + "base_score": 0.8, + "endpoints": [ + {"method": "GET", "path": "/resource/cwsq-ngmh.json", "summary": "Retrieve PLACES health estimates"}, + ], + }, + { + "name": "CDC Oral Health Data Portal API", + "url": "https://data.cdc.gov/resource/4nhi-4p9m.json", + "doc_url": "https://dev.socrata.com/foundry/data.cdc.gov/4nhi-4p9m", + "description": "Community oral health indicators including dental visits, sealant prevalence, and fluoridation coverage for dentistry analytics.", + "keywords": ["oral health", "dentistry", "dental", "fluoride", "sealant", "cdc"], + "category": "dentistry", + "base_score": 0.81, + "endpoints": [ + {"method": "GET", "path": "/resource/4nhi-4p9m.json", "summary": "Query oral health indicator records"}, + ], + }, + { + "name": "WHO Global Health Observatory API", + "url": "https://ghoapi.azureedge.net/api/Indicator", + "doc_url": "https://www.who.int/data/gho/info/gho-odata-api", + "description": "World Health Organization indicators covering global health metrics, vaccination, and disease burden.", + "keywords": ["who", "global health", "indicator", "vaccination", "disease surveillance"], + "category": "global_health", + "base_score": 0.87, + "endpoints": [ + {"method": "GET", "path": "/api/Indicator", "summary": "List WHO health indicators"}, + {"method": "GET", "path": "/api/Indicator?$filter", "summary": "Filter indicators by code"}, + ], + }, + { + "name": "NIH RePORTER Projects API", + "url": "https://api.reporter.nih.gov/v2/projects/search", + "doc_url": "https://api.reporter.nih.gov/", + "description": "NIH-funded research projects with abstracts, funding amounts, and investigator information.", + "keywords": ["nih", "grants", "research", "project", "biomedical"], + "category": "research_funding", + "base_score": 0.83, + "endpoints": [ + {"method": "POST", "path": "/v2/projects/search", "summary": "Search NIH-funded projects"}, + ], + }, 
+ { + "name": "NCBI E-utilities ESummary API", + "url": "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", + "doc_url": "https://www.ncbi.nlm.nih.gov/books/NBK25500/", + "description": "Programmatic access to NCBI databases including PubMed, nucleotide, protein, and ClinVar content.", + "keywords": ["ncbi", "genomics", "pubmed", "sequence", "biomedical"], + "category": "genomics", + "base_score": 0.84, + "endpoints": [ + {"method": "GET", "path": "/entrez/eutils/esearch.fcgi", "summary": "Search NCBI databases"}, + {"method": "GET", "path": "/entrez/eutils/esummary.fcgi", "summary": "Retrieve database summaries"}, + ], + }, + { + "name": "Ensembl REST API", + "url": "https://rest.ensembl.org/info/ping", + "doc_url": "https://rest.ensembl.org", + "description": "Genomics REST service for Ensembl data including genes, variants, and comparative genomics with JSON outputs.", + "keywords": ["ensembl", "genomics", "variants", "gene", "rest service", "bioinformatics"], + "category": "genomics", + "base_score": 0.8, + "endpoints": [ + {"method": "GET", "path": "/lookup/id/{id}", "summary": "Lookup Ensembl gene or transcript"}, + {"method": "GET", "path": "/overlap/region/{species}/{region}", "summary": "Fetch features overlapping a region"}, + ], + }, + { + "name": "SAMHSA Behavioral Health Treatment Services Locator API", + "url": "https://findtreatment.samhsa.gov/locator", + "doc_url": "https://findtreatment.samhsa.gov/developers", + "description": "Behavioral health treatment provider directory with search by service type, payment, and location.", + "keywords": ["mental health", "treatment", "behavioral health", "samhsa"], + "category": "mental_health", + "base_score": 0.81, + "endpoints": [ + {"method": "GET", "path": "/locator", "summary": "Search behavioral health providers"}, + ], + }, + { + "name": "USDA FoodData Central API", + "url": "https://api.nal.usda.gov/fdc/v1/foods/search", + "doc_url": "https://fdc.nal.usda.gov/api-guide.html", + "description": "Nutrient composition data for branded and experimental foods, with search and detail endpoints.", + "keywords": ["nutrition", "food", "dietary", "usda", "nutrients"], + "category": "nutrition", + "base_score": 0.79, + "endpoints": [ + {"method": "POST", "path": "/fdc/v1/foods/search", "summary": "Search foods by keyword"}, + {"method": "GET", "path": "/fdc/v1/food/{fdcId}", "summary": "Retrieve nutrient profile"}, + ], + }, + { + "name": "CDC Vaccination Coverage API", + "url": "https://data.cdc.gov/resource/8xkx-amqh.json", + "doc_url": "https://dev.socrata.com/foundry/data.cdc.gov/8xkx-amqh", + "description": "US vaccination coverage estimates by vaccine and demographic segment.", + "keywords": ["vaccination", "immunization", "cdc", "coverage", "public health"], + "category": "vaccination", + "base_score": 0.8, + "endpoints": [ + {"method": "GET", "path": "/resource/8xkx-amqh.json", "summary": "Vaccination coverage records"}, + ], + }, + { + "name": "NOAA Climate Data Online API", + "url": "https://www.ncdc.noaa.gov/cdo-web/api/v2/datasets", + "doc_url": "https://www.ncdc.noaa.gov/cdo-web/webservices/v2", + "description": "Climate and weather datasets from NOAA including temperature, precipitation, and extremes for environmental monitoring and early warning systems.", + "keywords": ["environment", "environmental", "weather", "climate", "noaa", "meteorology", "monitoring"], + "category": "environmental", + "base_score": 0.78, + "endpoints": [ + {"method": "GET", "path": "/cdo-web/api/v2/datasets", "summary": "List NOAA 
datasets"}, + {"method": "GET", "path": "/cdo-web/api/v2/data", "summary": "Query climate observations"}, + ], + }, + { + "name": "EPA AirNow API", + "url": "https://www.airnowapi.org/aq/data/", + "doc_url": "https://docs.airnowapi.org/", + "description": "Air quality measurements and forecasts for US monitoring stations, including pollutants and AQI, supporting environmental monitoring pipelines.", + "keywords": ["air quality", "environment", "environmental", "epa", "pollution", "aqi", "monitoring"], + "category": "environmental", + "base_score": 0.77, + "endpoints": [ + {"method": "GET", "path": "/aq/data/", "summary": "Retrieve air quality data"}, + ], + }, + { + "name": "Orphanet Rare Disease API", + "url": "https://www.orpha.net/OrphAPI/api/Disease", + "doc_url": "https://api.orphanet.net/OrphAPI/#!/Disease", + "description": "Rare disease catalog with Orpha codes, synonyms, epidemiology, and classification.", + "keywords": ["rare disease", "orphanet", "orpha", "genetic", "registry"], + "category": "rare_disease", + "base_score": 0.76, + "endpoints": [ + {"method": "GET", "path": "/OrphAPI/api/Disease", "summary": "List rare diseases"}, + {"method": "GET", "path": "/OrphAPI/api/Disease/{OrphaCode}", "summary": "Retrieve disease details"}, + ], + }, + { + "name": "RAREDISEASES.info NIH Service", + "url": "https://rarediseases.info.nih.gov/services/v1/diseases", + "doc_url": "https://rarediseases.info.nih.gov/developers", + "description": "NIH Genetic and Rare Diseases (GARD) API providing disease descriptions, symptoms, and resources.", + "keywords": ["rare disease", "nih", "gard", "genetic", "registry"], + "category": "rare_disease", + "base_score": 0.75, + "endpoints": [ + {"method": "GET", "path": "/services/v1/diseases", "summary": "Search rare diseases"}, + ], + }, + { + "name": "USAFacts COVID-19 API", + "url": "https://api.usafacts.org/covid/covid-api/v1/cases", + "doc_url": "https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/api/", + "description": "County-level COVID-19 cases and deaths in the United States with daily updates.", + "keywords": ["covid", "pandemic", "surveillance", "epidemiology"], + "category": "pandemic", + "base_score": 0.74, + "endpoints": [ + {"method": "GET", "path": "/covid/covid-api/v1/cases", "summary": "Retrieve COVID-19 cases"}, + ], + }, + { + "name": "Global.Health Line List API", + "url": "https://covid19-api.global.health/v1/line-list", + "doc_url": "https://global.health/documentation/api", + "description": "Anonymized global case line lists for pathogen surveillance, including demographics and travel history.", + "keywords": ["pandemic", "outbreak", "surveillance", "line list", "global health"], + "category": "pandemic", + "base_score": 0.73, + "endpoints": [ + {"method": "GET", "path": "/v1/line-list", "summary": "Retrieve outbreak line list"}, + ], + }, + { + "name": "OpenFDA Food Enforcement API", + "url": "https://api.fda.gov/food/enforcement.json", + "doc_url": "https://open.fda.gov/apis/food/enforcement/", + "description": "Food recall enforcement reports with product description, reason, and distribution data.", + "keywords": ["food", "recall", "fda", "safety", "enforcement"], + "category": "food_safety", + "base_score": 0.55, + "endpoints": [ + {"method": "GET", "path": "/food/enforcement.json", "summary": "Search food recall enforcement"}, + ], + }, + { + "name": "USDA National Farmers Market Directory API", + "url": "https://search.ams.usda.gov/farmersmarkets/v1/data.svc/zipSearch", + "doc_url": 
"https://www.ams.usda.gov/services/local-regional/food-directories-datasets", + "description": "Directory of US farmers markets with location, operation schedule, and services.", + "keywords": ["nutrition", "food access", "farmers market", "usda"], + "category": "nutrition", + "base_score": 0.7, + "endpoints": [ + {"method": "GET", "path": "/farmersmarkets/v1/data.svc/zipSearch", "summary": "Find farmers markets by ZIP"}, + ], + }, + { + "name": "HealthData.gov CKAN Catalog API", + "url": "https://healthdata.gov/api/3/action/package_search", + "doc_url": "https://healthdata.gov/developer", + "description": "Catalog of US Department of Health and Human Services datasets via CKAN API.", + "keywords": ["open data", "catalog", "health data", "ckan", "metadata"], + "category": "data_catalog", + "base_score": 0.82, + "endpoints": [ + {"method": "GET", "path": "/api/3/action/package_search", "summary": "Search dataset catalog"}, + ], + }, + { + "name": "data.gov CKAN Catalog API", + "url": "https://catalog.data.gov/api/3/action/package_search", + "doc_url": "https://catalog.data.gov/dataset", + "description": "US Federal data catalog with metadata across climate, energy, health, and finance.", + "keywords": ["open data", "catalog", "federal", "ckan", "metadata"], + "category": "data_catalog", + "base_score": 0.8, + "endpoints": [ + {"method": "GET", "path": "/api/3/action/package_search", "summary": "Search the federal data catalog"}, + ], + }, + { + "name": "Europe PMC RESTful API", + "url": "https://www.ebi.ac.uk/europepmc/webservices/rest/search", + "doc_url": "https://europepmc.org/RestfulWebService", + "description": "Biomedical literature, grants, and patents from Europe PMC with advanced search syntax.", + "keywords": ["literature", "research", "biomedical", "europe pmc", "publications"], + "category": "literature", + "base_score": 0.78, + "endpoints": [ + {"method": "GET", "path": "/webservices/rest/search", "summary": "Search biomedical literature"}, + ], + }, + { + "name": "OpenAlex Graph API", + "url": "https://api.openalex.org/works", + "doc_url": "https://docs.openalex.org/api", + "description": "Scholarly works, authors, concepts, and institutions graph with filtering for literature discovery and citation analysis.", + "keywords": ["literature", "openalex", "scholarly", "citations", "research graph"], + "category": "literature", + "base_score": 0.77, + "endpoints": [ + {"method": "GET", "path": "/works", "summary": "Search scholarly works"}, + {"method": "GET", "path": "/authors", "summary": "Browse scholarly authors"}, + ], + }, +] + + +# ----------------------------------------------------------------------------- +# Internal helpers +# ----------------------------------------------------------------------------- + +TOKEN_PATTERN = re.compile(r"[a-z0-9]+") + + +def _tokenize(text: str) -> Set[str]: + tokens = set(TOKEN_PATTERN.findall((text or "").lower())) + enriched: Set[str] = set(tokens) + for tok in tokens: + if len(tok) <= 2: + continue + if tok.endswith("ies") and len(tok) > 3: + enriched.add(tok[:-3] + "y") + if tok.endswith("ing") and len(tok) > 4: + enriched.add(tok[:-3]) + if tok.endswith("al") and len(tok) > 4: + enriched.add(tok[:-2]) + if tok.endswith("s") and len(tok) > 3: + enriched.add(tok[:-1]) + return enriched + + +@dataclass(frozen=True) +class CatalogRecord: + data: Dict[str, object] + tokens: Set[str] + keyword_tokens: Set[str] + base_score: float + + +def _prepare_catalog(raw_items: Iterable[Dict[str, object]]) -> List[CatalogRecord]: + prepared: 
List[CatalogRecord] = [] + for item in raw_items: + entry = deepcopy(item) + + url = str(entry.get("url") or "").strip() + if not url: + continue + parsed = urlparse(url) + host = parsed.netloc.lower() + base_url = f"{parsed.scheme}://{parsed.netloc}" + + entry.setdefault("host", host) + entry.setdefault("base_url", base_url) + entry.setdefault("source", "static_catalog") + entry.setdefault("doc_url", entry.get("doc_url") or f"{base_url}/") + entry.setdefault("health", {"ok": True, "status": 200, "latency_ms": 180, "checked": "static"}) + entry.setdefault("cors", {"preflight": False}) + entry.setdefault("trust", trust_score(host)) + + keywords = entry.get("keywords") or [] + if keywords: + desc = entry.get("description") or "" + kw_text = "; ".join(str(k) for k in keywords) + if kw_text and kw_text.lower() not in desc.lower(): + entry["description"] = f"{desc} (keywords: {kw_text})" + keyword_tokens = _tokenize(" ".join(map(str, keywords))) + text_tokens = _tokenize(" ".join( + str(part) for part in ( + entry.get("name", ""), + entry.get("description", ""), + entry.get("category", ""), + entry.get("doc_url", ""), + ) + )) + + base_score = float(entry.get("base_score") or 0.0) + + prepared.append( + CatalogRecord( + data=entry, + tokens=text_tokens | keyword_tokens, + keyword_tokens=keyword_tokens, + base_score=base_score, + ) + ) + + return prepared + + +CATALOG: List[CatalogRecord] = _prepare_catalog(RAW_CATALOG) + + +# ----------------------------------------------------------------------------- +# Public harvester interface +# ----------------------------------------------------------------------------- + +def _score_entry(tokens: Set[str], record: CatalogRecord) -> float: + if not tokens: + return record.base_score + 0.5 + + keyword_overlap = len(tokens & record.keyword_tokens) + text_overlap = len(tokens & record.tokens) + + if keyword_overlap == 0 and text_overlap == 0: + return record.base_score * 0.1 + + precision = keyword_overlap / (len(tokens) or 1) + coverage = (keyword_overlap + text_overlap) / (len(record.tokens) or 1) + + return ( + 2.0 * keyword_overlap + + 1.2 * text_overlap + + 1.5 * precision + + 1.0 * coverage + + record.base_score * 0.25 + ) + + +SYNONYM_MAP = { + "clinical": ["trial", "research"], + "dentistry": ["dental", "oral", "oralhealth"], + "dental": ["dentistry", "oral", "oralhealth"], + "oral": ["dentistry", "dental", "oralhealth"], + "environmental": ["environment", "climate", "monitoring"], + "environment": ["environmental", "climate", "air"], + "monitoring": ["surveillance", "tracking"], + "rare": ["orphan", "orphanet", "genetic"], + "disease": ["condition", "illness"], + "genomics": ["genomic", "gene", "sequence", "dna"], + "genomic": ["genomics", "gene", "dna"], + "pandemic": ["outbreak", "surveillance"], + "surveillance": ["monitoring", "tracking"], + "nutrition": ["food", "diet", "dietary"], + "vaccination": ["immunization", "vaccine"], + "mental": ["behavioral", "behavior", "psych"], + "health": ["healthcare", "publichealth"], + "pharmaceutical": ["drug", "medicine"], + "adverse": ["safety", "pharmacovigilance"], +} + + +def harvest(query: str, limit: int = 5, **kwargs) -> List[Dict[str, object]]: + """ + Harvest candidate API endpoints from the static catalog. + + Args: + query: Natural language search string. + limit: Maximum number of candidates to return. + **kwargs: Unused passthrough parameters for compatibility. 
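+
+    Returns:
+        Up to ``limit`` candidate dicts from the static catalog, scored
+        against the synonym-expanded query tokens and re-ranked by
+        rank_candidates before the private scoring fields are stripped.
+
+    Example (illustrative)::
+
+        candidates = harvest("dental caries surveillance", limit=3)
+        top = candidates[0]["base_url"] if candidates else None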
+ """ + limit = max(1, min(int(limit or 5), 50)) + query = (query or "").strip() + + if not CATALOG: + return [] + + if not query: + top = sorted(CATALOG, key=lambda rec: rec.base_score, reverse=True)[:limit] + return [deepcopy(rec.data) for rec in top] + + token_union: Set[str] = _tokenize(query) + for token in list(token_union): + for syn in SYNONYM_MAP.get(token, []): + token_union |= _tokenize(syn) + + scored: List[Dict[str, object]] = [] + for record in CATALOG: + score = _score_entry(token_union, record) + if score <= 0 and record.base_score <= 0: + continue + candidate = deepcopy(record.data) + candidate["_match_score"] = round(score, 4) + candidate["_match_terms"] = sorted(token_union & record.tokens) + scored.append(candidate) + + if not scored: + top = sorted(CATALOG, key=lambda rec: rec.base_score, reverse=True)[:limit] + return [deepcopy(rec.data) for rec in top] + + preliminary = sorted(scored, key=lambda c: c["_match_score"], reverse=True)[: limit * 3] + ranked = rank_candidates(query, preliminary) + final = ranked[:limit] + + for cand in final: + cand.pop("_match_score", None) + cand.pop("_match_terms", None) + + return final + + +__all__ = ["harvest"] diff --git a/src/tooluniverse/harvest/verifier.py b/src/tooluniverse/harvest/verifier.py new file mode 100644 index 00000000..2da35df9 --- /dev/null +++ b/src/tooluniverse/harvest/verifier.py @@ -0,0 +1,33 @@ +from __future__ import annotations +import os, time, logging, requests +from typing import Dict, Optional + +logger = logging.getLogger("HarvestVerify") +DEFAULT_TIMEOUT = int(os.getenv("HARVEST_TIMEOUT_S", "8")) +SIZE_LIMIT = int(os.getenv("HARVEST_MAX_BYTES", "2000000")) +JSON_ACCEPT = {"Accept": "application/json"} + +def _head(url: str, timeout=None): + try: + return requests.head(url, timeout=timeout or DEFAULT_TIMEOUT, allow_redirects=True) + except requests.RequestException: + return None + +def _health_probe(url: str, timeout=None) -> Dict: + t0 = time.time() + try: + rh = _head(url, timeout) + if rh is not None: + clen = int(rh.headers.get("Content-Length") or 0) + if clen and clen > SIZE_LIMIT: + return {"ok": False, "status": rh.status_code, "skipped": f"large({clen})"} + r = requests.get(url, timeout=timeout or DEFAULT_TIMEOUT, headers=JSON_ACCEPT) + return {"ok": r.status_code < 500, "status": r.status_code, "latency_ms": int((time.time()-t0)*1000), "ctype": r.headers.get("Content-Type","")} + except requests.RequestException as e: + return {"ok": False, "status": 0, "error": str(e)} + +def verify_candidate(result, timeout_s: Optional[int] = None) -> Optional[Dict]: + url = (result.url or "").strip() + if not url: return None + health = _health_probe(url, timeout=timeout_s) + return {"name": result.title, "url": url, "health": health, "source": result.source} diff --git a/src/tooluniverse/tool_navigator_tool.py b/src/tooluniverse/tool_navigator_tool.py new file mode 100644 index 00000000..1341dd98 --- /dev/null +++ b/src/tooluniverse/tool_navigator_tool.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import math +from typing import Any, Dict, List, Optional + +from .execute_function import ToolUniverse +from .tool_registry import register_tool +from .vsd_registry import load_catalog + + +def _tokenize(text: str) -> List[str]: + return [t for t in (text or "").lower().split() if t] + + +def _score(query_tokens: List[str], name: str, description: str) -> float: + haystack = f"{name} {description}".lower() + score = 0.0 + for token in query_tokens: + if token in haystack: + score += 2.0 + score += 
sum(1.0 for token in query_tokens if any(word.startswith(token) for word in haystack.split())) + return score + + +def _format_tool(tool: Dict[str, Any]) -> Dict[str, Any]: + return { + "name": tool.get("name"), + "type": tool.get("type"), + "description": tool.get("description"), + "tool_type": tool.get("tool_type"), + "category": tool.get("category"), + "source": tool.get("source"), + } + + +@register_tool("ToolNavigatorTool") +class ToolNavigatorTool: + """ + Search ToolUniverse's catalog (built-in + VSD) to help agents discover relevant tools. + """ + + name = "ToolNavigatorTool" + description = "Search ToolUniverse/Navigated catalog for tools matching a query." + input_schema = { + "type": "object", + "properties": { + "query": {"type": "string"}, + "limit": {"type": "integer", "default": 10, "minimum": 1, "maximum": 50}, + "categories": { + "type": "array", + "items": {"type": "string"}, + "description": "Optional list of categories to include.", + }, + "include_vsd": { + "type": "boolean", + "default": True, + "description": "Include dynamically registered VSD tools in the search.", + }, + }, + "required": ["query"], + "additionalProperties": False, + } + + def __init__(self) -> None: + self._tooluniverse = ToolUniverse() + + def _load_base_tools(self) -> List[Dict[str, Any]]: + if not getattr(self._tooluniverse, "all_tools", None): + self._tooluniverse.load_tools() + return list(getattr(self._tooluniverse, "all_tools", [])) + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + query = (arguments.get("query") or "").strip() + if not query: + return {"ok": False, "error": "query is required"} + + limit = int(arguments.get("limit") or 10) + include_vsd = bool(arguments.get("include_vsd", True)) + categories = arguments.get("categories") + if categories and not isinstance(categories, list): + categories = [categories] + categories = [c.lower() for c in categories or []] + + tools = self._load_base_tools() + if include_vsd: + for cfg in load_catalog().values(): + tools.append( + { + "name": cfg.get("name"), + "type": "DynamicREST", + "description": (cfg.get("metadata") or {}).get("description"), + "tool_type": "dynamic_rest", + "category": "vsd", + "source": (cfg.get("metadata") or {}).get("source"), + } + ) + + query_tokens = _tokenize(query) + scored: List[tuple[float, Dict[str, Any]]] = [] + for tool in tools: + if categories and (tool.get("category") or "").lower() not in categories: + continue + score = _score(query_tokens, tool.get("name", ""), tool.get("description", "")) + if score > 0: + scored.append((score, tool)) + + scored.sort(key=lambda item: item[0], reverse=True) + best = [_format_tool(tool) | {"score": round(score, 3)} for score, tool in scored[:limit]] + + return {"ok": True, "query": query, "results": best, "total": len(scored)} diff --git a/src/tooluniverse/tool_registry.py b/src/tooluniverse/tool_registry.py index eb3b893f..c3f5d141 100644 --- a/src/tooluniverse/tool_registry.py +++ b/src/tooluniverse/tool_registry.py @@ -446,3 +446,18 @@ def get_tool_class_lazy(tool_name): return _tool_registry.get(tool_name) return None + +# --- VSD / compatibility shims --- +def get_tool_class(name: str): + """ + Backwards-compatible accessor used by scripts like SampleVDSRun.py. + Prefer get_tool_class_lazy(name) internally. 
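+
+    Example (illustrative; ContextKeeperTool is registered by this patch)::
+
+        ToolCls = get_tool_class("ContextKeeperTool")
+        tool = ToolCls() if ToolCls else None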
+ """ + return get_tool_class_lazy(name) + +class _RegistryShim: + def get_tool_class(self, name: str): + return get_tool_class_lazy(name) + +# Expose a 'registry' object with get_tool_class, if callers expect it +registry = _RegistryShim() \ No newline at end of file diff --git a/src/tooluniverse/vsd_api_tool.py b/src/tooluniverse/vsd_api_tool.py new file mode 100644 index 00000000..84a5c525 --- /dev/null +++ b/src/tooluniverse/vsd_api_tool.py @@ -0,0 +1,115 @@ +from __future__ import annotations +import os +import json +from typing import Dict, Any + +from .base_tool import BaseTool +from .tool_registry import register_tool + +# Reuse same storage locations as vsd_tool +VSD_HOME = os.environ.get("TOOLUNIVERSE_VSD_DIR", os.path.expanduser("~/.tooluniverse/vsd")) +GENERATED_TOOLS_PATH = os.path.join(VSD_HOME, "generated_tools.json") + +os.makedirs(VSD_HOME, exist_ok=True) + + +def _save_tool(tool_spec: Dict[str, Any]) -> None: + """Upsert a generated tool spec into the registry file.""" + tools: list[Dict[str, Any]] = [] + if os.path.exists(GENERATED_TOOLS_PATH): + try: + with open(GENERATED_TOOLS_PATH, "r", encoding="utf-8") as f: + tools = json.load(f) + except Exception: + tools = [] + by_name = {t.get("name"): t for t in tools} + by_name[tool_spec.get("name")] = tool_spec + with open(GENERATED_TOOLS_PATH, "w", encoding="utf-8") as f: + json.dump(list(by_name.values()), f, indent=2) + + +@register_tool("VSDToolBuilder") +class VSDToolBuilder(BaseTool): + """ + Build and register a usable ToolUniverse tool from a harvested or discovered VSD candidate. + + Input: + { + "candidate": { + "domain": "clinicaltrials.gov", + "endpoint": "https://clinicaltrials.gov/api/v2/studies", + "license": "CC0", + "score": 0.92 + }, + "tool_name": "clinicaltrials_search", + "description": "Query clinical trials with disease/condition filters", + "parameter_overrides": { ... optional JSON Schema ... 
} + } + + Output: + { + "registered": true, + "name": "clinicaltrials_search", + "config_path": "/path/to/generated_tools.json" + } + """ + + def run(self, arguments: Dict[str, Any]): + if not arguments: + return {"error": "Missing arguments"} + cand = arguments.get("candidate") or {} + tool_name = arguments.get("tool_name") + desc = arguments.get("description") or f"VSD tool for {cand.get('domain')}" + param_override = arguments.get("parameter_overrides") or {} + + if not tool_name: + return {"error": "tool_name is required"} + if not cand or not cand.get("endpoint"): + return {"error": "candidate with endpoint is required"} + + endpoint = cand.get("endpoint") + domain = cand.get("domain", "unknown") + + # Pick implementation type + if endpoint.endswith(".graphql") or "graphql" in endpoint: + impl_type = "GenericGraphQLTool" + elif endpoint.startswith("http"): + impl_type = "GenericRESTTool" + else: + impl_type = "URLHTMLTagTool" + + # Default parameter schema (can be overridden) + params = param_override or { + "type": "object", + "properties": { + "query": {"type": "string", "default": ""}, + "pageSize": {"type": "integer", "default": 10}, + } + } + + tool_spec = { + "type": impl_type, + "name": tool_name, + "description": desc, + "fields": { + "base_url": endpoint, + "method": "GET", + "default_params": {} + }, + "parameter": params, + "label": ["VSD", cand.get("label") or domain], + "vsd": { + "domain": domain, + "endpoint": endpoint, + "license": cand.get("license", "unknown"), + "score": cand.get("score"), + "registry": cand.get("registry", "catalog"), + } + } + + # Special case: ClinicalTrials.gov -> add arg_transform + if "clinicaltrials.gov" in endpoint and impl_type == "GenericRESTTool": + tool_spec["vsd"]["arg_transform"] = "ctgov_time_window" + + _save_tool(tool_spec) + return {"registered": True, "name": tool_name, "config_path": GENERATED_TOOLS_PATH} diff --git a/src/tooluniverse/vsd_catalog.py b/src/tooluniverse/vsd_catalog.py new file mode 100644 index 00000000..95ec1269 --- /dev/null +++ b/src/tooluniverse/vsd_catalog.py @@ -0,0 +1,44 @@ +# src/tooluniverse/vsd_catalog.py +import os, json +from pathlib import Path +from typing import List, Dict, Any + +VSD_DIR = Path(os.environ.get("TOOLUNIVERSE_VSD_DIR", Path.home() / ".tooluniverse" / "vsd")) +ALLOWLIST_PATH = VSD_DIR / "allowlist.json" +CATALOG_PATH = VSD_DIR / "catalog" / "vsd_catalog_candidates.json" + +def load_json(path: Path) -> Any: + if not path.exists(): + return None + try: + return json.loads(path.read_text(encoding="utf-8")) + except Exception: + return None + +def load_allowlist(seed: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + user = load_json(ALLOWLIST_PATH) or [] + merged = {e["domain"]: e for e in seed} + for e in user: + merged[e["domain"]] = {**merged.get(e["domain"], {}), **e} + return list(merged.values()) + +def load_catalog_candidates() -> List[Dict[str, Any]]: + data = load_json(CATALOG_PATH) or [] + # normalize minimal fields and keep only candidates + out = [] + for d in data: + if d.get("status") not in (None, "candidate", "approved"): + continue + out.append({ + "domain": d.get("domain"), + "label": d.get("label") or d.get("domain"), + "registry": d.get("registry") or "data.gov", + "endpoint": d.get("endpoint"), + "license": d.get("license") or "unknown", + "trust": float(d.get("trust") or 0.7), + "freshness": d.get("freshness") or "", + "api_kind": d.get("api_kind") or "rest", + "status": d.get("status") or "candidate", + "tags": d.get("tags") or [], + }) + return out diff --git 
a/src/tooluniverse/vsd_registry.py b/src/tooluniverse/vsd_registry.py new file mode 100644 index 00000000..83b237f6 --- /dev/null +++ b/src/tooluniverse/vsd_registry.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from typing import Any, Dict + +from .common_utils import read_json, write_json, vsd_generated_path + + +def _normalize_catalog(data: Any) -> Dict[str, Dict[str, Any]]: + catalog: Dict[str, Dict[str, Any]] = {} + if not isinstance(data, dict): + return catalog + + generated = data.get("generated_tools") if isinstance(data.get("generated_tools"), list) else None + if generated is not None: + for item in generated: + if isinstance(item, dict) and item.get("name"): + name = item["name"] + catalog[name] = dict(item) + return catalog + + for name, cfg in data.items(): + if not isinstance(cfg, dict): + continue + entry = dict(cfg) + entry.setdefault("name", name) + catalog[name] = entry + return catalog + + +def load_catalog() -> Dict[str, Dict[str, Any]]: + """ + Load the Verified Source catalog from disk and normalize it + to a {name: config} dictionary regardless of historical format. + """ + path = vsd_generated_path() + data = read_json(path, {}) + return _normalize_catalog(data) + + +def save_catalog(catalog: Dict[str, Dict[str, Any]]) -> str: + """ + Persist the catalog to disk as a flat {name: config} mapping. + Returns the file path for convenience. + """ + path = vsd_generated_path() + # ensure each entry has its name + serializable = {name: dict(cfg, name=name) for name, cfg in catalog.items()} + write_json(path, serializable) + return path + + +def upsert_tool(tool_name: str, cfg: Dict[str, Any]) -> Dict[str, Any]: + """ + Insert or update a tool configuration in the catalog and propagate the + change to any in-process dynamic registries. + """ + catalog = load_catalog() + config = dict(cfg) + config.setdefault("name", tool_name) + catalog[tool_name] = config + save_catalog(catalog) + + # Notify dynamic REST runner (best-effort, optional import) + try: + from .dynamic_rest_runner import upsert_generated_tool # type: ignore + + upsert_generated_tool(tool_name, config) + except Exception: + pass + + return config + + +def remove_tool(tool_name: str) -> bool: + """ + Remove a tool from the catalog. Returns True if a tool was removed. 
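+
+    Example (hypothetical tool name; a sketch of the expected behavior):
+        remove_tool("clinicaltrials_search")  # -> True if present, False otherwise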
+ """ + catalog = load_catalog() + if tool_name not in catalog: + return False + del catalog[tool_name] + save_catalog(catalog) + + try: + from .dynamic_rest_runner import remove_generated_tool # type: ignore + + remove_generated_tool(tool_name) + except Exception: + pass + + return True diff --git a/src/tooluniverse/vsd_tool.py b/src/tooluniverse/vsd_tool.py new file mode 100644 index 00000000..b765f8fe --- /dev/null +++ b/src/tooluniverse/vsd_tool.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +from typing import Any, Dict + +from .tool_registry import register_tool +from .vsd_registry import load_catalog, save_catalog, upsert_tool +from .dynamic_rest_runner import refresh_generated_registry, remove_generated_tool +from .vsd_utils import build_config, probe_config, stamp_metadata + + +class VerifiedSourceRegisterTool: + name = "VerifiedSourceRegisterTool" + description = "Register a DynamicREST tool in the verified-source directory" + input_schema = { + "type": "object", + "properties": { + "tool_name": {"type": "string"}, + "tool_type": {"type": "string", "default": "dynamic_rest"}, + "candidate": {"type": "object"}, + "default_params": {"type": "object"}, + "default_headers": {"type": "object"}, + "force": {"type": "boolean", "default": False}, + }, + "required": ["tool_name", "candidate"], + } + + def __call__( + self, + tool_name: str, + candidate: Dict[str, Any], + tool_type: str = "dynamic_rest", + default_params: Dict[str, Any] | None = None, + default_headers: Dict[str, Any] | None = None, + force: bool = False, + ) -> Dict[str, Any]: + if not tool_name: + raise ValueError("tool_name is required") + + cfg = build_config( + candidate or {}, + tool_type=tool_type, + default_params=default_params, + default_headers=default_headers, + ) + + probe = probe_config(cfg) + stamp_metadata(cfg, probe) + + if not probe.get("ok") and not force: + return { + "registered": False, + "name": tool_name, + "error": "Endpoint validation failed", + "test": probe, + "suggestion": "Provide default_params/default_headers or retry with force=True after ensuring credentials.", + } + + cfg = upsert_tool(tool_name, cfg) + refresh_generated_registry() + + return {"registered": True, "name": tool_name, "config": cfg} + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + return self.__call__( + tool_name=arguments.get("tool_name"), + candidate=arguments.get("candidate", {}), + tool_type=arguments.get("tool_type", "dynamic_rest"), + default_params=arguments.get("default_params"), + default_headers=arguments.get("default_headers"), + force=bool(arguments.get("force")), + ) + + +class VerifiedSourceDiscoveryTool: + name = "VerifiedSourceDiscoveryTool" + description = "Return the Verified-Source catalog." + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + catalog = load_catalog() + return {"ok": True, "tools": list(catalog.values())} + + +class VerifiedSourceRemoveTool: + name = "VerifiedSourceRemoveTool" + description = "Remove a generated tool from the Verified-Source catalog." 
+ input_schema = { + "type": "object", + "properties": { + "tool_name": {"type": "string"}, + }, + "required": ["tool_name"], + } + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + tool_name = arguments.get("tool_name") + if not tool_name: + return {"removed": False, "error": "tool_name is required"} + catalog = load_catalog() + if tool_name not in catalog: + return {"removed": False, "error": f"Unknown tool '{tool_name}'"} + del catalog[tool_name] + save_catalog(catalog) + remove_generated_tool(tool_name) + return {"removed": True, "name": tool_name} + + +def register(server): + register_tool(VerifiedSourceRegisterTool.name, VerifiedSourceRegisterTool) + register_tool(VerifiedSourceDiscoveryTool.name, VerifiedSourceDiscoveryTool) + register_tool(VerifiedSourceRemoveTool.name, VerifiedSourceRemoveTool) + + server.add_tool(VerifiedSourceRegisterTool.name, VerifiedSourceRegisterTool()) + server.add_tool(VerifiedSourceDiscoveryTool.name, VerifiedSourceDiscoveryTool()) + server.add_tool(VerifiedSourceRemoveTool.name, VerifiedSourceRemoveTool()) + refresh_generated_registry() diff --git a/src/tooluniverse/vsd_utils.py b/src/tooluniverse/vsd_utils.py new file mode 100644 index 00000000..a30c12dd --- /dev/null +++ b/src/tooluniverse/vsd_utils.py @@ -0,0 +1,212 @@ +from __future__ import annotations + +import time +from copy import deepcopy +from typing import Any, Dict + +import requests + +# ------------------------------------------------------------------------------ +# Host-specific overrides and requirements +# ------------------------------------------------------------------------------ + +HOST_OVERRIDES: Dict[str, Dict[str, Any]] = { + # Ensembl requires a concrete resource; expose the JSON heartbeat by default. + "rest.ensembl.org": { + "endpoint": "https://rest.ensembl.org/info/ping", + "default_headers": {"Accept": "application/json"}, + "notes": "Ensembl REST base requires explicit resource. '/info/ping' provides a JSON heartbeat.", + }, + "api.fda.gov": { + "default_params": {"limit": 5}, + "default_headers": {"Accept": "application/json"}, + }, + "data.cdc.gov": { + "default_params": {"$limit": 5}, + "default_headers": {"Accept": "application/json"}, + }, +} + +HOST_REQUIREMENTS: Dict[str, Dict[str, Any]] = { + "api.nal.usda.gov": { + "requires_api_key": True, + "notes": "USDA FoodData Central requires an api_key query parameter.", + }, + "www.ncdc.noaa.gov": { + "requires_api_key": True, + "notes": "NOAA CDO API requires a token header. 
See https://www.ncdc.noaa.gov/cdo-web/webservices/v2", + "default_headers": {"token": ""}, + }, + "clinicaltrialsapi.cancer.gov": { + "requires_api_key": True, + "notes": "ClinicalTrials API requires authenticated access for JSON responses.", + }, + "findtreatment.samhsa.gov": { + "requires_manual_params": True, + "notes": "SAMHSA locator needs query parameters (e.g., state, lat/long) to return JSON.", + }, +} + + +# ------------------------------------------------------------------------------ +# Helpers +# ------------------------------------------------------------------------------ + +def _derive_endpoint(candidate: Dict[str, Any]) -> str: + endpoint = candidate.get("endpoint") or candidate.get("url") + if endpoint: + return str(endpoint) + + base_url = candidate.get("base_url") + routes = candidate.get("endpoints") or [] + if base_url and isinstance(routes, list) and routes: + first = routes[0] + path = str(first.get("path") or "/") + if not base_url.endswith("/") and not path.startswith("/"): + return f"{base_url}/{path}" + if base_url.endswith("/") and path.startswith("/"): + return f"{base_url.rstrip('/')}{path}" + return f"{base_url}{path}" + + if base_url: + return str(base_url) + + raise ValueError("candidate.endpoint or candidate.url is required") + + +def _apply_overrides(candidate: Dict[str, Any], cfg: Dict[str, Any]) -> None: + host = (candidate.get("host") or "").lower() + + overrides = HOST_OVERRIDES.get(host) + if overrides: + if overrides.get("endpoint"): + cfg["endpoint"] = overrides["endpoint"] + if overrides.get("default_params"): + cfg.setdefault("default_params", {}).update(overrides["default_params"]) + if overrides.get("default_headers"): + cfg.setdefault("default_headers", {}).update(overrides["default_headers"]) + if overrides.get("notes"): + cfg.setdefault("metadata", {}).setdefault("notes", []).append(overrides["notes"]) + + requirements = HOST_REQUIREMENTS.get(host) + if requirements: + meta = cfg.setdefault("metadata", {}) + meta.setdefault("requirements", {}).update( + { + key: value + for key, value in requirements.items() + if key not in {"default_headers"} + } + ) + if requirements.get("default_headers"): + cfg.setdefault("default_headers", {}).update(requirements["default_headers"]) + + +# ------------------------------------------------------------------------------ +# Public helpers used by VSD tools +# ------------------------------------------------------------------------------ + +def build_config( + candidate: Dict[str, Any], + tool_type: str = "dynamic_rest", + default_params: Dict[str, Any] | None = None, + default_headers: Dict[str, Any] | None = None, +) -> Dict[str, Any]: + """ + Produce a DynamicREST-style configuration dictionary from a harvest candidate. 
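+
+    Illustrative sketch (relies on the api.fda.gov entry in HOST_OVERRIDES above):
+        build_config({"endpoint": "https://api.fda.gov/drug/label.json",
+                      "host": "api.fda.gov"})
+        yields a "dynamic_rest" config with method "GET", default_params
+        {"limit": 5}, and an "Accept: application/json" default header.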
+ """ + endpoint = _derive_endpoint(candidate) + method = str(candidate.get("method") or candidate.get("http_method") or "GET").upper() + merged_params = deepcopy(candidate.get("default_params") or candidate.get("params") or {}) + merged_headers = deepcopy(candidate.get("default_headers") or candidate.get("headers") or {}) + + cfg: Dict[str, Any] = { + "type": tool_type, + "endpoint": endpoint, + "method": method, + "default_params": merged_params, + "default_headers": merged_headers, + "auth": candidate.get("auth") or {"type": "none"}, + "description": candidate.get("description") or "", + "tool_type": candidate.get("tool_type") or "dynamic_rest", + "metadata": { + "source": candidate.get("source"), + "trust": candidate.get("trust"), + "health": candidate.get("health"), + "doc_url": candidate.get("doc_url"), + "description": candidate.get("description"), + "host": candidate.get("host"), + }, + "vsd": candidate, + } + + response_key = candidate.get("response_key") + if response_key: + cfg["response_key"] = response_key + + if default_params: + cfg["default_params"].update(default_params) + if default_headers: + cfg["default_headers"].update(default_headers) + + _apply_overrides(candidate, cfg) + + return cfg + + +def probe_config(cfg: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute a lightweight HTTP request to validate the generated configuration. + Returns diagnostic information including HTTP status and a JSON snippet if available. + """ + url = cfg.get("endpoint") + method = (cfg.get("method") or "GET").upper() + params = deepcopy(cfg.get("default_params") or {}) + headers = deepcopy(cfg.get("default_headers") or {}) + headers.setdefault("Accept", "application/json") + + try: + if method == "GET": + resp = requests.get(url, params=params, headers=headers, timeout=20) + else: + resp = requests.request(method, url, json=params, headers=headers, timeout=20) + except Exception as exc: + return {"ok": False, "error": str(exc), "stage": "request"} + + content_type = resp.headers.get("Content-Type", "") + preview = resp.text[:400] if resp.text else "" + sample = None + has_json = False + + if "json" in content_type.lower(): + try: + payload = resp.json() + has_json = True + if isinstance(payload, list): + sample = payload[:1] + elif isinstance(payload, dict): + sample = {k: payload[k] for i, k in enumerate(payload) if i < 5} + else: + sample = payload + except Exception: + has_json = False + + status_ok = resp.status_code < 400 + + return { + "ok": bool(status_ok and (has_json or "json" in content_type.lower())), + "status": resp.status_code, + "content_type": content_type, + "has_json": has_json, + "sample": sample, + "preview": preview, + } + + +def stamp_metadata(cfg: Dict[str, Any], probe: Dict[str, Any]) -> None: + """ + Update metadata timestamps and probe results on a configuration dictionary. 
+ """ + metadata = cfg.setdefault("metadata", {}) + metadata["registered_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ") + metadata["last_test"] = probe From a11fcc23657d88be64b5e3e73f4d4362a44371fe Mon Sep 17 00:00:00 2001 From: SufianTA Date: Sun, 26 Oct 2025 19:06:47 -0700 Subject: [PATCH 2/8] Add VSD dynamic REST stack and Harvest helpers --- README.md | 20 + scripts/medlog_stub_server.py | 151 ++++ scripts/run_full_demo.py | 704 ++++++++++++++++++ src/tooluniverse/__init__.py | 1 + src/tooluniverse/candidate_tester_tool.py | 41 +- src/tooluniverse/data/medlog_tools.json | 134 ++++ src/tooluniverse/data/medtok_mcp_tools.json | 11 + src/tooluniverse/data/medtok_tools.json | 134 ++++ src/tooluniverse/default_config.py | 5 + src/tooluniverse/dynamic_rest_runner.py | 194 +++++ src/tooluniverse/logging_config.py | 12 +- src/tooluniverse/medlog_tool.py | 143 ++++ src/tooluniverse/medtok_tool.py | 122 +++ src/tooluniverse/utils.py | 2 +- src/tooluniverse/vsd_tool.py | 173 ++++- src/tooluniverse/vsd_utils.py | 66 +- tests/integration/test_medtok_medlog_tools.py | 282 +++++++ 17 files changed, 2138 insertions(+), 57 deletions(-) create mode 100644 scripts/medlog_stub_server.py create mode 100644 scripts/run_full_demo.py create mode 100644 src/tooluniverse/data/medlog_tools.json create mode 100644 src/tooluniverse/data/medtok_mcp_tools.json create mode 100644 src/tooluniverse/data/medtok_tools.json create mode 100644 src/tooluniverse/dynamic_rest_runner.py create mode 100644 src/tooluniverse/medlog_tool.py create mode 100644 src/tooluniverse/medtok_tool.py create mode 100644 tests/integration/test_medtok_medlog_tools.py diff --git a/README.md b/README.md index 7ab04aec..3ecf4433 100644 --- a/README.md +++ b/README.md @@ -232,6 +232,26 @@ Our comprehensive documentation covers everything from quick start to advanced w - **[Adding Tools Tutorial](https://zitniklab.hms.harvard.edu/ToolUniverse/tutorials/addtools/Adding_Tools_Tutorial.html)**: Step-by-step tool addition guide - **[MCP Tool Registration](https://zitniklab.hms.harvard.edu/ToolUniverse/tutorials/addtools/mcp_tool_registration_en.html)**: Register tools via MCP +### MedTok + MedLog Integrations + +ToolUniverse now ships with first-class support for the MedTok tokenizer service and the MedLog reference collector/FHIR bridge. + +- **MedTok REST tools** (`tool_type=["medtok"]`) expose `/tokenize`, `/embed`, `/nearest_neighbors`, `/map_text_to_code`, `/search_text`, and `/codes/{system}/{code}`. Point them at a running service by setting `MEDTOK_BASE_URL` (defaults to `http://localhost:8000`). +- **MedTok MCP auto-loader** (`tool_type=["medtok_mcp_auto_loader"]`) can register tools from the FastMCP wrapper. Set `MEDTOK_MCP_SERVER_HOST` to the host running the `medtok_tool.py` MCP server. +- **MedLog collector + FHIR tools** (`tool_type=["medlog"]`) wrap the reference implementation's REST APIs. Configure the collectors' endpoints with `MEDLOG_COLLECTOR_BASE_URL` (default `http://localhost:7001`) and `MEDLOG_FHIR_BASE_URL` (default `http://localhost:7003`). + +See `tests/integration/test_medtok_medlog_tools.py` for end-to-end examples that start the services, invoke the tools, and validate responses. 
+ +### End-to-End Demo Script + +To launch the reference services and exercise the toolchain automatically, run: + +```bash +python scripts/run_full_demo.py # adds -h for options +``` + +The script starts MedTok + MedLog locally, runs representative tool calls (including optional external APIs like InterPro, KEGG, IUCN, JASPAR, MarineSpecies, cBioPortal, and Phenome Jax), and prints a success/failure summary. + ### 📚 API Reference - **[API Directory](https://zitniklab.hms.harvard.edu/ToolUniverse/api/modules.html)**: Complete module listing - **[Core Modules](https://zitniklab.hms.harvard.edu/ToolUniverse/api/tooluniverse.html)**: Main ToolUniverse class and utilities diff --git a/scripts/medlog_stub_server.py b/scripts/medlog_stub_server.py new file mode 100644 index 00000000..700245fa --- /dev/null +++ b/scripts/medlog_stub_server.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python +""" +Lightweight MedLog stub servers for local demos. + +Run the collector: + python scripts/medlog_stub_server.py --mode collector --host 127.0.0.1 --port 8911 + +Run the FHIR bridge: + python scripts/medlog_stub_server.py --mode fhir --host 127.0.0.1 --port 8912 +""" + +from __future__ import annotations + +import argparse +import os +import threading +import time +from typing import Dict + +import uvicorn +from fastapi import FastAPI, HTTPException + + +STORE: Dict[str, Dict] = {} +STORE_LOCK = threading.Lock() + + +def build_collector_app() -> FastAPI: + app = FastAPI(title="MedLog Collector (Stub)", version="0.1.0") + + @app.post("/medlog/events/init") + def init(payload: dict): + header = payload.get("header") or {} + event_id = header.get("event_id") + if not event_id: + raise HTTPException(400, "event_id required") + record = { + "header": header, + "model_instance": payload.get("model_instance", {}), + "user_identity": payload.get("user_identity", {}), + "target_identity": payload.get("target_identity"), + "inputs": payload.get("inputs"), + "retention_tier": payload.get("retention_tier", "steady"), + "fragments": [], + } + with STORE_LOCK: + STORE[event_id] = record + return {"status": "ok", "event_id": event_id} + + @app.post("/medlog/events/{event_id}/append") + def append(event_id: str, fragment: dict): + with STORE_LOCK: + record = STORE.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + record["fragments"].append(fragment) + return {"status": "ok", "event_id": event_id} + + @app.get("/medlog/events/{event_id}/prov") + def prov(event_id: str): + with STORE_LOCK: + record = STORE.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + header = record["header"] + return {"event_id": event_id, "provenance": {"header": header}} + + @app.post("/query") + def query(body: dict): + run_id = body.get("run_id") + event_id = body.get("event_id") + limit = body.get("limit", 50) + results = [] + with STORE_LOCK: + for eid, record in STORE.items(): + header = record["header"] + if event_id and event_id != eid: + continue + if run_id and header.get("run_id") != run_id: + continue + results.append({"event_id": eid, "header": header}) + if len(results) >= limit: + break + return {"count": len(results), "results": results} + + @app.post("/export/parquet") + def export(): + return {"status": "ok", "outdir": "/tmp/parquet"} + + return app + + +def build_fhir_app() -> FastAPI: + app = FastAPI(title="MedLog FHIR Stub", version="0.1.0") + + def bundle(records): + return { + "resourceType": "Bundle", + "type": "collection", + "entry": [ + { + "resource": { + 
"resourceType": "Observation", + "id": record["header"]["event_id"], + "status": "final", + } + } + for record in records + ], + } + + @app.get("/bundle/{event_id}") + def bundle_event(event_id: str): + with STORE_LOCK: + record = STORE.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + return bundle([record]) + + @app.get("/bundle/run/{run_id}") + def bundle_run(run_id: str): + with STORE_LOCK: + records = [ + record + for record in STORE.values() + if record["header"].get("run_id") == run_id + ] + if not records: + raise HTTPException(404, "run not found") + return bundle(records) + + return app + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["collector", "fhir"], required=True) + parser.add_argument("--host", default=os.getenv("MEDLOG_HOST", "127.0.0.1")) + parser.add_argument("--port", type=int, default=int(os.getenv("MEDLOG_PORT", 0)) or 0) + args = parser.parse_args() + + if args.port == 0: + args.port = 8911 if args.mode == "collector" else 8912 + + app = build_collector_app() if args.mode == "collector" else build_fhir_app() + print(f"Starting MedLog {args.mode} stub on {args.host}:{args.port}") + uvicorn.run(app, host=args.host, port=args.port, log_level="info") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_full_demo.py b/scripts/run_full_demo.py new file mode 100644 index 00000000..b06cc6c5 --- /dev/null +++ b/scripts/run_full_demo.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python +""" +End-to-end ToolUniverse demo runner. + +This script bootstraps the MedTok and MedLog reference services locally, points +ToolUniverse at them, and exercises a curated set of tools (MedTok, MedLog, and +several public data tools such as InterPro, KEGG, IUCN, JASPAR, MarineSpecies, +cBioPortal, Phenome Jax). It prints friendly status updates and reports any +failures at the end. + +Usage: + python scripts/run_full_demo.py + +Optional flags: + --skip-network-tools Skip external API tools (InterPro, KEGG, etc.). + --medtok-host HOST Override MedTok host (default 127.0.0.1). + --medtok-port PORT Override MedTok port (default 8910). + --medlog-host HOST Override MedLog host (default 127.0.0.1). + --collector-port PORT Override MedLog collector port (default 8911). + --fhir-port PORT Override MedLog FHIR port (default 8912). 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import tempfile +import threading +import time +from pathlib import Path +from typing import Any, Dict, List, Optional +from urllib.parse import urlparse + +import requests +import uvicorn +from fastapi import FastAPI, HTTPException + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC_PATH = REPO_ROOT / "src" +if str(SRC_PATH) not in sys.path: + sys.path.insert(0, str(SRC_PATH)) + +from tooluniverse.execute_function import ToolUniverse + +MEDTOK_ROOT = REPO_ROOT.parent / "MedTok-FHIR-Starter" +MEDLOG_ROOT = REPO_ROOT.parent / "medlog-reference" + + +class ServerHandle: + """Run a FastAPI app in a background thread via uvicorn.""" + + def __init__(self, app: FastAPI, host: str, port: int): + config = uvicorn.Config(app, host=host, port=port, log_level="error", lifespan="off") + self.server = uvicorn.Server(config) + self.thread = threading.Thread(target=self.server.run, daemon=True) + + def start(self) -> None: + self.thread.start() + while not self.server.started: + time.sleep(0.05) + + def stop(self) -> None: + self.server.should_exit = True + self.thread.join(timeout=5) + + +def _import_module_typed(module_path: Path): + import importlib.util + + spec = importlib.util.spec_from_file_location(module_path.stem, module_path) + module = importlib.util.module_from_spec(spec) + assert spec and spec.loader + spec.loader.exec_module(module) + return module + + +def _service_is_up(base_url: str, path: str, ok_statuses: Optional[List[int]] = None) -> bool: + try: + resp = requests.get(f"{base_url}{path}", timeout=2) + if ok_statuses is None: + return resp.status_code < 500 + return resp.status_code in ok_statuses + except requests.RequestException: + return False + + +def start_medtok(host: str, port: int): + """Start MedTok FastAPI service and return context info.""" + service_path = MEDTOK_ROOT / "services" / "medtok_service" + if str(service_path) not in sys.path: + sys.path.insert(0, str(service_path)) + + base_url = os.environ.get("MEDTOK_BASE_URL") or f"http://{host}:{port}" + if _service_is_up(base_url, "/health", ok_statuses=[200]): + os.environ["MEDTOK_BASE_URL"] = base_url + print(f"MedTok already running at {base_url}, reusing existing instance.") + return {"server": None, "temp_config": None, "sys_path": str(service_path), "started": False} + + config_path = MEDTOK_ROOT / "config" / "medtok_config.json" + config_data = json.loads(config_path.read_text(encoding="utf-8")) + config_data["code_metadata_path"] = str(MEDTOK_ROOT / "samples" / "code_metadata.csv") + config_data["graph_edges_path"] = str(MEDTOK_ROOT / "samples" / "code_graph_edges.csv") + + tmp_config = tempfile.NamedTemporaryFile("w", suffix="_medtok_config.json", delete=False) + json.dump(config_data, tmp_config) + tmp_config.flush() + tmp_config.close() + os.environ["MEDTOK_CONFIG"] = tmp_config.name + + module = _import_module_typed(service_path / "app.py") + module.MAPPING_CSV = str(MEDTOK_ROOT / "samples" / "code_mapping.csv") + app = module.app + + server = ServerHandle(app, host, port) + server.start() + os.environ["MEDTOK_BASE_URL"] = f"http://{host}:{port}" + + return { + "server": server, + "temp_config": tmp_config.name, + "sys_path": str(service_path), + "started": True, + } + + +def _build_medlog_collector(store: Dict[str, Dict]): + app = FastAPI() + + @app.post("/medlog/events/init") + def init(payload: dict): + header = payload.get("header") or {} + event_id = header.get("event_id") + if not event_id: + raise 
HTTPException(400, "event_id required") + record = { + "header": header, + "model_instance": payload.get("model_instance", {}), + "user_identity": payload.get("user_identity", {}), + "target_identity": payload.get("target_identity"), + "inputs": payload.get("inputs"), + "retention_tier": payload.get("retention_tier", "steady"), + "fragments": [], + } + store[event_id] = record + return {"status": "ok", "event_id": event_id} + + @app.post("/medlog/events/{event_id}/append") + def append(event_id: str, fragment: dict): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + record["fragments"].append(fragment) + return {"status": "ok", "event_id": event_id} + + @app.get("/medlog/events/{event_id}/prov") + def prov(event_id: str): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + return {"event_id": event_id, "provenance": {"header": record["header"]}} + + @app.post("/query") + def query(body: dict): + run_id = body.get("run_id") + event_id = body.get("event_id") + limit = body.get("limit", 50) + matches = [] + for eid, record in store.items(): + header = record["header"] + if event_id and event_id != eid: + continue + if run_id and header.get("run_id") != run_id: + continue + matches.append({"event_id": eid, "header": header}) + if len(matches) >= limit: + break + return {"count": len(matches), "results": matches} + + @app.post("/export/parquet") + def export(): + return {"status": "ok", "outdir": "/tmp/parquet"} + + return app + + +def _build_medlog_fhir(store: Dict[str, Dict]): + app = FastAPI() + + def _bundle(records): + return { + "resourceType": "Bundle", + "type": "collection", + "entry": [ + { + "resource": { + "resourceType": "Observation", + "id": record["header"]["event_id"], + "status": "final", + } + } + for record in records + ], + } + + @app.get("/bundle/{event_id}") + def bundle_event(event_id: str): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + return _bundle([record]) + + @app.get("/bundle/run/{run_id}") + def bundle_run(run_id: str): + records = [ + record + for record in store.values() + if record["header"].get("run_id") == run_id + ] + if not records: + raise HTTPException(404, "run not found") + return _bundle(records) + + return app + + +def start_medlog(host: str, collector_port: int, fhir_port: int): + store: Dict[str, Dict] = {} + collector_app = _build_medlog_collector(store) + fhir_app = _build_medlog_fhir(store) + + collector_url = os.environ.get("MEDLOG_COLLECTOR_BASE_URL") or f"http://{host}:{collector_port}" + fhir_url = os.environ.get("MEDLOG_FHIR_BASE_URL") or f"http://{host}:{fhir_port}" + + collector_server = None + fhir_server = None + + if _service_is_up(collector_url, "/"): + print(f"MedLog collector already running at {collector_url}, reusing.") + else: + collector_server = ServerHandle(collector_app, host, collector_port) + collector_server.start() + + if _service_is_up(fhir_url, "/bundle/test"): + print(f"MedLog FHIR service already running at {fhir_url}, reusing.") + else: + fhir_server = ServerHandle(fhir_app, host, fhir_port) + fhir_server.start() + + os.environ["MEDLOG_COLLECTOR_BASE_URL"] = f"http://{host}:{collector_port}" + os.environ["MEDLOG_FHIR_BASE_URL"] = f"http://{host}:{fhir_port}" + + return {"collector": collector_server, "fhir": fhir_server, "started": bool(collector_server or fhir_server)} + + +def stop_medtok(ctx: Dict[str, str]): + if ctx.get("server"): + ctx["server"].stop() + if 
ctx.get("started"): + os.environ.pop("MEDTOK_BASE_URL", None) + os.environ.pop("MEDTOK_CONFIG", None) + temp_config = ctx.get("temp_config") + if temp_config: + try: + os.remove(temp_config) + except OSError: + pass + sys_path = ctx.get("sys_path") + if sys_path: + try: + sys.path.remove(sys_path) + except ValueError: + pass + + +def stop_medlog(ctx: Dict[str, ServerHandle]): + if ctx.get("collector"): + ctx["collector"].stop() + if ctx.get("fhir"): + ctx["fhir"].stop() + if ctx.get("started"): + os.environ.pop("MEDLOG_COLLECTOR_BASE_URL", None) + os.environ.pop("MEDLOG_FHIR_BASE_URL", None) + + +def preview_json(payload: Any, limit: int = 240) -> str: + """Return a compact preview of a payload for console logging.""" + try: + text = json.dumps(payload, indent=2, ensure_ascii=False) + except TypeError: + text = str(payload) + text = text.strip() + if len(text) > limit: + return text[:limit].rstrip() + "..." + return text + + +def call_tool(tu: ToolUniverse, name: str, **kwargs): + """Call a tool and handle ToolUniverse-specific errors.""" + print(f"---> Calling {name} with {kwargs}") + try: + response = getattr(tu.tools, name)(**kwargs) + print(f"[OK] {name} succeeded") + return True, response + except Exception as exc: # pylint: disable=broad-except + print(f"[FAIL] {name} failed: {exc}") + return False, str(exc) + + +def run_medlog_demo(tu: ToolUniverse) -> List[Dict[str, str]]: + results = [] + header = { + "event_id": "evt-demo-1", + "run_id": "run-demo-1", + "timestamp": "2025-01-01T00:00:00Z", + } + model_instance = {"model": "demo", "version": "1.0"} + user_identity = {"name": "Dr. Example"} + steps = [ + ( + "MedLog_init_event", + dict(header=header, model_instance=model_instance, user_identity=user_identity), + "Open an event with metadata (who, when, which model).", + ), + ( + "MedLog_append_fragment", + dict(event_id="evt-demo-1", fragment={"outputs": {"summary": "Patient stable"}}), + "Attach a fragment that captures model outputs for the event.", + ), + ("MedLog_get_provenance", dict(event_id="evt-demo-1"), "Retrieve provenance header saved for the event."), + ("MedLog_query_events", dict(run_id="run-demo-1"), "Query the store by run identifier."), + ("MedLog_export_parquet", dict(), "Trigger sample export (stub returns static location)."), + ("MedLog_fhir_bundle", dict(event_id="evt-demo-1"), "View the event as a single FHIR Observation bundle."), + ("MedLog_fhir_run_bundle", dict(run_id="run-demo-1"), "Bundle all events in the run as FHIR Observations."), + ] + + for name, kwargs, description in steps: + print(f" - {description}") + success, payload = call_tool(tu, name, **kwargs) + note = None + if success: + if name == "MedLog_init_event": + note = f"Created event {payload.get('event_id')}" + elif name == "MedLog_append_fragment": + note = "Attached fragment with outputs summary" + elif name == "MedLog_get_provenance": + prov = payload.get("provenance", {}) + note = f"Provenance keys: {', '.join(prov.keys()) or 'none'}" + elif name == "MedLog_query_events": + note = f"Query returned {payload.get('count', 0)} rows" + elif name == "MedLog_fhir_bundle": + note = f"Bundle contains {len(payload.get('entry', []))} resources" + elif name == "MedLog_fhir_run_bundle": + note = f"Run bundle resources: {len(payload.get('entry', []))}" + if success and note: + print(f" Result: {note}") + results.append({"tool": name, "success": success, "response": payload, "note": note}) + return results + + +def run_medtok_demo(tu: ToolUniverse) -> List[Dict[str, str]]: + tests = [ + ( + 
"MedTok_tokenize", + dict(codes=["A00", "E11"], system="ICD-10", include_metadata=True), + "Convert ICD-10 codes into internal token IDs plus metadata for downstream models.", + ), + ("MedTok_embed", dict(codes=["A00"], system="ICD-10"), "Generate vector embeddings for a medical code."), + ("MedTok_nearest_neighbors", dict(code="A00", k=3), "Find nearby codes in embedding space."), + ("MedTok_map_text_to_code", dict(text="type 2 diabetes", system="ICD-10"), "Map free text to the closest code."), + ("MedTok_search_text", dict(text="hypertension", k=4), "Search the terminology for matching codes by text."), + ("MedTok_code_info", dict(code="E11", system="ICD-10"), "Fetch descriptive details for a specific code."), + ] + results = [] + for name, kwargs, description in tests: + print(f" - {description}") + success, payload = call_tool(tu, name, **kwargs) + note = None + if success: + if name == "MedTok_tokenize": + note = f"Received {len(payload.get('token_ids', []))} token IDs" + elif name == "MedTok_embed": + emb = payload.get("embeddings") or [] + if emb: + note = f"Embedding dimension {payload.get('dim')}, first vector length {len(emb[0])}" + elif name == "MedTok_nearest_neighbors": + note = f"Returned {len(payload.get('neighbors', []))} neighbors" + elif name == "MedTok_map_text_to_code": + note = f"Mapped text to code {payload.get('code')}" + elif name == "MedTok_search_text": + note = f"Top match code {payload.get('matches', [{}])[0].get('code') if payload.get('matches') else 'N/A'}" + elif name == "MedTok_code_info": + note = f"Code info description: {payload.get('description', 'N/A')}" + if success and note: + print(f" Result: {note}") + results.append({"tool": name, "success": success, "response": payload, "note": note}) + return results + + +NETWORK_TOOLS = [ + ("InterPro_search_entries", {"query": "BRCA1"}), + ("KEGG_find_entries", {"query": "ATP synthase", "database": "pathway"}), + ("IUCN_get_species_status", {"species": "Panthera leo"}), + ("JASPAR_search_motifs", {"query": "SOX2"}), + ("MarineSpecies_lookup", {"scientific_name": "Gadus morhua"}), + ("cBioPortal_search_studies", {"keyword": "breast cancer"}), + ("PhenomeJax_list_projects", {"keyword": "glucose"}), +] + + +def run_network_tools(tu: ToolUniverse) -> List[Dict[str, str]]: + outcomes = [] + for name, kwargs in NETWORK_TOOLS: + success, payload = call_tool(tu, name, **kwargs) + note_parts: List[str] = [] + if success: + if name == "InterPro_search_entries": + data = payload if isinstance(payload, dict) else {} + note_parts.append(f"Entries returned: {len(data.get('results', []))}") + elif name == "KEGG_find_entries": + if isinstance(payload, dict): + note_parts.append(f"Matched {len(payload.get('results', []))} entries") + elif isinstance(payload, list): + note_parts.append(f"Matched {len(payload)} entries") + elif name == "IUCN_get_species_status": + result = payload.get("result") if isinstance(payload, dict) else {} + if isinstance(result, list) and result: + result = result[0] + elif result is None: + result = {} + species = result.get("scientific_name") + category = result.get("category") + note_parts.append(f"{species} status {category}") + elif name == "JASPAR_search_motifs": + data = payload if isinstance(payload, dict) else {} + note_parts.append(f"Found {len(data.get('results', []))} motifs") + elif name == "MarineSpecies_lookup": + data = payload if isinstance(payload, dict) else {} + note_parts.append(f"Matches: {len(data.get('results', []))}") + elif name == "cBioPortal_search_studies": + data = payload 
if isinstance(payload, dict) else {} + note_parts.append(f"Studies returned: {len(data.get('studies', []))}") + elif name == "PhenomeJax_list_projects": + data = payload if isinstance(payload, dict) else {} + note_parts.append(f"Projects listed: {len(data.get('projects', []))}") + + preview = preview_json(payload) + print(f" {name} preview: {preview}") + note_parts.append(f"Preview: {preview}") + else: + print(f" {name} error payload: {preview_json(payload)}") + note = " | ".join(note_parts) if note_parts else None + outcomes.append({"tool": name, "success": success, "response": payload, "note": note}) + return outcomes + + +def _extract_host(candidate: Dict[str, Any]) -> str: + host = candidate.get("host") + if host: + return str(host) + for key in ("url", "endpoint", "base_url"): + maybe = candidate.get(key) + if not maybe: + continue + parsed = urlparse(str(maybe)) + if parsed.netloc: + return parsed.netloc + return "candidate" + + +def _slugify_host(value: str) -> str: + slug = "".join(ch if ch.isalnum() else "_" for ch in value.lower()) + slug = slug.strip("_") + return slug or "candidate" + + +def run_vsd_demo(tu: ToolUniverse) -> List[Dict[str, str]]: + """ + Demonstrate the Harvest -> Register -> Run workflow using Verified Source Directory helpers. + """ + search_query = "ensembl rest api" + print(f"\nSearching harvest catalog for '{search_query}' candidates...") + results: List[Dict[str, Any]] = [] + + success_search, harvest_resp = call_tool( + tu, + "GenericHarvestTool", + query=search_query, + limit=5, + ) + selected_candidate: Optional[Dict[str, Any]] = None + note_search: Optional[str] = None + if success_search: + candidates = (harvest_resp or {}).get("candidates") or [] + note_search = f"Candidates returned: {len(candidates)}" + if candidates: + preferred_hosts = {"rest.ensembl.org", "api.open-meteo.com"} + for candidate_option in candidates: + host = _extract_host(candidate_option).lower() + if host in preferred_hosts: + selected_candidate = candidate_option + break + if not selected_candidate: + selected_candidate = candidates[0] + host = _extract_host(selected_candidate) + print(f" - Selected candidate: {selected_candidate.get('name')} ({selected_candidate.get('url')}) [host: {host}]") + print(f" Candidate preview: {preview_json(selected_candidate)}") + else: + print(" - Harvest returned no candidates.") + else: + print(f" - Harvest search failed payload: {preview_json(harvest_resp)}") + note_search = "Harvest search failed" + results.append({"tool": "GenericHarvestTool", "success": success_search, "response": harvest_resp, "note": note_search}) + + if not (success_search and selected_candidate): + results.append( + { + "tool": "HarvestCandidateTesterTool", + "success": False, + "response": {"error": "No harvest candidate available"}, + "note": "Skipped testing", + } + ) + return results + + candidate = selected_candidate + print("\nTesting harvest candidate via HarvestCandidateTesterTool...") + success_probe, probe_resp = call_tool( + tu, + "HarvestCandidateTesterTool", + candidate=candidate, + ) + probe_note = None + if success_probe: + status = (probe_resp.get("test") or {}).get("status") + probe_note = f"Probe status {status}" + print(f" - Probe preview: {preview_json(probe_resp)}") + else: + print(f" - Probe failure payload: {preview_json(probe_resp)}") + results.append({"tool": "HarvestCandidateTesterTool", "success": success_probe, "response": probe_resp, "note": probe_note}) + + if not (success_probe and probe_resp.get("ok")): + print("Skipping registration 
because candidate probe failed.") + results.append( + { + "tool": "VerifiedSourceRegisterTool", + "success": False, + "response": {"error": "Probe failed"}, + "note": None, + } + ) + return results + + host_slug = _slugify_host(_extract_host(candidate)) + tool_name = f"HarvestDemo_{host_slug[:40]}" + + print("\nRegistering candidate with VerifiedSourceRegisterTool...") + success_reg, register_resp = call_tool( + tu, + "VerifiedSourceRegisterTool", + tool_name=tool_name, + candidate=candidate, + ) + note_reg = None + if success_reg: + config = (register_resp or {}).get("config") or {} + base_url = (config.get("fields") or {}).get("base_url") or config.get("endpoint") + note_reg = f"Registered tool pointing to {base_url}" + print(f" - Registered config preview: {preview_json(config)}") + else: + print(f" - Registration failure payload: {preview_json(register_resp)}") + results.append( + { + "tool": "VerifiedSourceRegisterTool", + "success": success_reg, + "response": register_resp, + "note": note_reg, + } + ) + + if not success_reg: + return results + + print("\nCalling newly registered tool...") + tu.load_tools(include_tools=[tool_name]) + success_run, run_resp = call_tool(tu, tool_name) + note_run = None + if success_run: + preview = preview_json(run_resp) + note_run = f"Preview: {preview}" + print(f" - Run result preview: {preview}") + else: + print(f" - Run failure payload: {preview_json(run_resp)}") + results.append({"tool": tool_name, "success": success_run, "response": run_resp, "note": note_run}) + + print("\nCleaning up registered tool...") + success_rm, rm_resp = call_tool( + tu, + "VerifiedSourceRemoveTool", + tool_name=tool_name, + ) + note_rm = "Removed from catalog" if success_rm else None + if success_rm: + print(f" - Removal confirmation: {preview_json(rm_resp)}") + else: + print(f" - Removal failure payload: {preview_json(rm_resp)}") + results.append({"tool": "VerifiedSourceRemoveTool", "success": success_rm, "response": rm_resp, "note": note_rm}) + + return results + + +def main(): + parser = argparse.ArgumentParser(description="Run ToolUniverse end-to-end demo.") + parser.add_argument("--skip-network-tools", action="store_true", help="Skip tools that require external HTTP APIs.") + parser.add_argument("--skip-vsd", action="store_true", help="Skip harvest/register/run VSD demonstration.") + parser.add_argument("--medtok-host", default="127.0.0.1") + parser.add_argument("--medtok-port", type=int, default=8910) + parser.add_argument("--medlog-host", default="127.0.0.1") + parser.add_argument("--collector-port", type=int, default=8911) + parser.add_argument("--fhir-port", type=int, default=8912) + args = parser.parse_args() + + medtok_ctx = None + medlog_ctx = None + all_results: List[Dict[str, str]] = [] + + try: + print("Starting MedTok service...") + medtok_ctx = start_medtok(args.medtok_host, args.medtok_port) + print(f"MedTok running at {os.environ['MEDTOK_BASE_URL']}") + + print("Starting MedLog services...") + medlog_ctx = start_medlog(args.medlog_host, args.collector_port, args.fhir_port) + print( + f"MedLog collector at {os.environ['MEDLOG_COLLECTOR_BASE_URL']}, " + f"FHIR bridge at {os.environ['MEDLOG_FHIR_BASE_URL']}" + ) + + tu = ToolUniverse(hooks_enabled=False) + tu.load_tools(tool_type=["medtok", "medlog"]) + + print("\nRunning MedTok demo calls...") + all_results.extend(run_medtok_demo(tu)) + + print("\nRunning MedLog demo calls...") + all_results.extend(run_medlog_demo(tu)) + + if not args.skip_network_tools: + print("\nLoading network-enabled tools (InterPro, 
KEGG, IUCN, etc.)...") + categories = [ + "interpro", + "kegg", + "iucn_red_list", + "jaspar", + "marine_species", + "cbioportal", + "phenome_jax", + ] + try: + tu.load_tools(tool_type=categories) + except Exception as exc: # pylint: disable=broad-except + print(f"[WARN] Failed to load network tool categories: {exc}") + else: + print("Running network tool calls...") + all_results.extend(run_network_tools(tu)) + else: + print("\nSkipping external network tools.") + + if not args.skip_vsd: + print("\nHarvest -> Register -> Run walkthrough...") + vsd_results = run_vsd_demo(tu) + all_results.extend(vsd_results) + else: + print("\nSkipping VSD harvest/register/run demo.") + + finally: + if medtok_ctx: + print("\nStopping MedTok service...") + stop_medtok(medtok_ctx) + if medlog_ctx: + print("Stopping MedLog services...") + stop_medlog(medlog_ctx) + + print("\n================ Demo Summary ================") + failures = [r for r in all_results if not r["success"]] + for result in all_results: + status = "PASS" if result["success"] else "FAIL" + print(f"{status:4} | {result['tool']}") + note = result.get("note") + if note: + print(f" {note}") + if not result["success"]: + print(f" -> {result['response']}") + print("=============================================") + + if failures: + print(f"{len(failures)} tool calls failed.") + sys.exit(1) + print("All tool calls succeeded.") + + +if __name__ == "__main__": + main() diff --git a/src/tooluniverse/__init__.py b/src/tooluniverse/__init__.py index 6a001040..96edf7be 100644 --- a/src/tooluniverse/__init__.py +++ b/src/tooluniverse/__init__.py @@ -460,6 +460,7 @@ def __getattr__(self, name): "ODPHPOutlinkFetch", "ContextKeeperTool", "HarvestCandidateTesterTool", + "GenericHarvestTool", "ToolNavigatorTool", "CellosaurusSearchTool", "CellosaurusQueryConverterTool", diff --git a/src/tooluniverse/candidate_tester_tool.py b/src/tooluniverse/candidate_tester_tool.py index d37f95b5..24b742d9 100644 --- a/src/tooluniverse/candidate_tester_tool.py +++ b/src/tooluniverse/candidate_tester_tool.py @@ -1,12 +1,32 @@ from __future__ import annotations -from typing import Any, Dict +from typing import Any, Dict, Optional from .tool_registry import register_tool from .vsd_utils import build_config, probe_config - -@register_tool("HarvestCandidateTesterTool") +HARVEST_CANDIDATE_TESTER_SCHEMA = { + "type": "object", + "properties": { + "candidate": {"type": "object"}, + "tool_type": {"type": "string", "default": "dynamic_rest"}, + "default_params": {"type": "object"}, + "default_headers": {"type": "object"}, + }, + "required": ["candidate"], + "additionalProperties": False, +} + +HARVEST_CANDIDATE_TESTER_CONFIG = { + "name": "HarvestCandidateTesterTool", + "description": "Probe a harvest/VSD candidate endpoint and report JSON readiness without registering it.", + "type": "HarvestCandidateTesterTool", + "category": "special_tools", + "parameter": HARVEST_CANDIDATE_TESTER_SCHEMA, +} + + +@register_tool("HarvestCandidateTesterTool", config=HARVEST_CANDIDATE_TESTER_CONFIG) class HarvestCandidateTesterTool: """ Validate harvest/VSD candidates without registering them. @@ -15,17 +35,10 @@ class HarvestCandidateTesterTool: name = "HarvestCandidateTesterTool" description = "Test a harvest candidate endpoint to see if it returns usable JSON." 
- input_schema = { - "type": "object", - "properties": { - "candidate": {"type": "object"}, - "tool_type": {"type": "string", "default": "dynamic_rest"}, - "default_params": {"type": "object"}, - "default_headers": {"type": "object"}, - }, - "required": ["candidate"], - "additionalProperties": False, - } + input_schema = HARVEST_CANDIDATE_TESTER_SCHEMA + + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: candidate = arguments.get("candidate") or {} diff --git a/src/tooluniverse/data/medlog_tools.json b/src/tooluniverse/data/medlog_tools.json new file mode 100644 index 00000000..bf5799ce --- /dev/null +++ b/src/tooluniverse/data/medlog_tools.json @@ -0,0 +1,134 @@ +[ + { + "name": "MedLog_init_event", + "description": "Initialize or overwrite a MedLog event record. Supply the 9-field MedLog payload to capture headers, inputs, identities, and initial artifacts.", + "type": "MedLogInitEventTool", + "parameter": { + "type": "object", + "properties": { + "header": { + "type": "object", + "description": "MedLog header block including event_id, timestamps, risk metadata, and parent relationships." + }, + "model_instance": { + "type": "object", + "description": "Model provenance metadata (model name, version, risk posture, vendor, etc.)." + }, + "user_identity": { + "type": "object", + "description": "Information about the requesting user, clinician, or agent." + }, + "target_identity": { + "type": "object", + "description": "Optional target entity such as patient or device identifiers." + }, + "inputs": { + "type": "object", + "description": "Structured input payload captured at initialization." + }, + "retention_tier": { + "type": "string", + "description": "Retention tier label (steady, critical, transient, etc.)." + } + }, + "required": ["header", "model_instance", "user_identity"] + } + }, + { + "name": "MedLog_append_fragment", + "description": "Append outputs, outcomes, artifacts, or feedback fragments to an existing MedLog event.", + "type": "MedLogAppendFragmentTool", + "parameter": { + "type": "object", + "properties": { + "event_id": { + "type": "string", + "description": "Identifier of the event to update." + }, + "fragment": { + "type": "object", + "description": "Fragment payload containing any of internal_artifacts, outputs, outcomes, or user_feedback." + } + }, + "required": ["event_id", "fragment"] + } + }, + { + "name": "MedLog_get_provenance", + "description": "Fetch PROV-JSON bundle for a given event to support audit trails and lineage review.", + "type": "MedLogGetProvenanceTool", + "parameter": { + "type": "object", + "properties": { + "event_id": { + "type": "string", + "description": "Identifier of the event to retrieve." + } + }, + "required": ["event_id"] + } + }, + { + "name": "MedLog_query_events", + "description": "Query MedLog events by run or event identifier. Useful for dashboarding, analytics, and sampling inspection.", + "type": "MedLogQueryEventsTool", + "parameter": { + "type": "object", + "properties": { + "run_id": { + "type": "string", + "description": "Optional run identifier to filter results." + }, + "event_id": { + "type": "string", + "description": "Optional event identifier to narrow results." 
+ }, + "limit": { + "type": "integer", + "description": "Maximum number of rows to return (default 50).", + "minimum": 1, + "maximum": 500 + } + } + } + }, + { + "name": "MedLog_export_parquet", + "description": "Trigger MedLog parquet export to the configured artifact directory.", + "type": "MedLogExportParquetTool", + "parameter": { + "type": "object", + "properties": {} + } + }, + { + "name": "MedLog_fhir_bundle", + "description": "Retrieve the FHIR bundle synthesised for an individual MedLog event (Patient, Practitioner, Device, AuditEvent, Observations, Documents).", + "type": "MedLogFHIRBundleTool", + "parameter": { + "type": "object", + "properties": { + "event_id": { + "type": "string", + "description": "Identifier of the event to export." + } + }, + "required": ["event_id"] + } + }, + { + "name": "MedLog_fhir_run_bundle", + "description": "Aggregate all events in a run into a consolidated FHIR bundle for care-path review.", + "type": "MedLogFHIRRunBundleTool", + "parameter": { + "type": "object", + "properties": { + "run_id": { + "type": "string", + "description": "Run identifier to export." + } + }, + "required": ["run_id"] + } + } +] diff --git a/src/tooluniverse/data/medtok_mcp_tools.json b/src/tooluniverse/data/medtok_mcp_tools.json new file mode 100644 index 00000000..fef79cbf --- /dev/null +++ b/src/tooluniverse/data/medtok_mcp_tools.json @@ -0,0 +1,11 @@ +[ + { + "name": "mcp_auto_loader_medtok", + "description": "Discover and register MedTok tools from a running MedTok MCP server so they can be invoked directly through ToolUniverse.", + "type": "MCPAutoLoaderTool", + "server_url": "http://${MEDTOK_MCP_SERVER_HOST}:9001/mcp", + "tool_prefix": "medtok_", + "auto_register": true, + "required_api_keys": ["MEDTOK_MCP_SERVER_HOST"] + } +] diff --git a/src/tooluniverse/data/medtok_tools.json b/src/tooluniverse/data/medtok_tools.json new file mode 100644 index 00000000..c54fe67b --- /dev/null +++ b/src/tooluniverse/data/medtok_tools.json @@ -0,0 +1,134 @@ +[ + { + "name": "MedTok_tokenize", + "description": "Tokenize one or more medical codes using the MedTok multimodal tokenizer. Useful for exposing token IDs and optional metadata to downstream workflows.", + "type": "MedTokTokenizeTool", + "parameter": { + "type": "object", + "properties": { + "codes": { + "type": "array", + "items": { "type": "string" }, + "description": "List of codes to tokenize (e.g., ICD-10 identifiers)." + }, + "system": { + "type": "string", + "description": "Coding system, defaults to ICD-10." + }, + "include_metadata": { + "type": "boolean", + "description": "Return region-level metadata for each code." + } + }, + "required": ["codes"] + } + }, + { + "name": "MedTok_embed", + "description": "Generate MedTok embeddings for a batch of codes. Returns floating-point vectors suitable for similarity search or downstream ML tasks.", + "type": "MedTokEmbedTool", + "parameter": { + "type": "object", + "properties": { + "codes": { + "type": "array", + "items": { "type": "string" }, + "description": "Codes to embed." + }, + "system": { + "type": "string", + "description": "Coding system, defaults to ICD-10." + } + }, + "required": ["codes"] + } + }, + { + "name": "MedTok_nearest_neighbors", + "description": "Retrieve the nearest neighbours for a code from the MedTok embedding space with similarity scores.", + "type": "MedTokNearestNeighborsTool", + "parameter": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Anchor code for the neighbourhood query." 
+ }, + "system": { + "type": "string", + "description": "Coding system, defaults to ICD-10." + }, + "k": { + "type": "integer", + "description": "Number of neighbours to return (default 5).", + "minimum": 1, + "maximum": 50 + } + }, + "required": ["code"] + } + }, + { + "name": "MedTok_map_text_to_code", + "description": "Map free-text clinical language to the most relevant code using MedTok text semantics.", + "type": "MedTokMapTextTool", + "parameter": { + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "Clinical description or narrative." + }, + "system": { + "type": "string", + "description": "Target coding system, defaults to ICD-10." + } + }, + "required": ["text"] + } + }, + { + "name": "MedTok_search_text", + "description": "Hybrid text + semantic search over the MedTok vocabulary. Useful for exploratory lookup workflows.", + "type": "MedTokSearchTextTool", + "parameter": { + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "Query text to search for." + }, + "system": { + "type": ["string", "null"], + "description": "Optional coding system filter." + }, + "k": { + "type": "integer", + "description": "Maximum number of matches (default 5).", + "minimum": 1, + "maximum": 50 + } + }, + "required": ["text"] + } + }, + { + "name": "MedTok_code_info", + "description": "Retrieve metadata for a specific code including synonyms and graph context when available.", + "type": "MedTokCodeInfoTool", + "parameter": { + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Code identifier to fetch." + }, + "system": { + "type": "string", + "description": "Coding system, defaults to ICD-10." + } + }, + "required": ["code"] + } + } +] diff --git a/src/tooluniverse/default_config.py b/src/tooluniverse/default_config.py index 46095834..982dc50e 100644 --- a/src/tooluniverse/default_config.py +++ b/src/tooluniverse/default_config.py @@ -150,6 +150,11 @@ "genomics": os.path.join(current_dir, "data", "genomics_tools.json"), # Guideline and health policy tools "guidelines": os.path.join(current_dir, "data", "unified_guideline_tools.json"), + "medtok": os.path.join(current_dir, "data", "medtok_tools.json"), + "medtok_mcp_auto_loader": os.path.join( + current_dir, "data", "medtok_mcp_tools.json" + ), + "medlog": os.path.join(current_dir, "data", "medlog_tools.json"), } diff --git a/src/tooluniverse/dynamic_rest_runner.py b/src/tooluniverse/dynamic_rest_runner.py new file mode 100644 index 00000000..a3061d36 --- /dev/null +++ b/src/tooluniverse/dynamic_rest_runner.py @@ -0,0 +1,194 @@ +""" +Dynamic REST/GraphQL tool loader for Verified Source Directory (VSD). + +This module keeps an in-memory registry of generated tool specifications and +exposes helper functions for refreshing, inserting, or removing entries. Tools +are backed by lightweight BaseTool subclasses that issue HTTP requests using +the stored configuration. +""" + +from __future__ import annotations + +import json +import logging +import threading +from typing import Any, Dict, Optional + +import requests + +from .base_tool import BaseTool +from .common_utils import read_json, vsd_generated_path +from .tool_registry import register_config, register_tool + +LOGGER = logging.getLogger("DynamicRESTRunner") +_REGISTRY_LOCK = threading.Lock() +_GENERATED_TOOLS: Dict[str, Dict[str, Any]] = {} + + +def _normalize_spec(spec: Any) -> Dict[str, Dict[str, Any]]: + """ + Accept legacy list or dict formats and normalize to {name: config}. 
+ """ + if isinstance(spec, dict): + if "generated_tools" in spec and isinstance(spec["generated_tools"], list): + return { + item.get("name"): dict(item) + for item in spec["generated_tools"] + if isinstance(item, dict) and item.get("name") + } + return { + name: dict(cfg) + for name, cfg in spec.items() + if isinstance(cfg, dict) + } + + if isinstance(spec, list): + result: Dict[str, Dict[str, Any]] = {} + for item in spec: + if isinstance(item, dict) and item.get("name"): + result[item["name"]] = dict(item) + return result + + return {} + + +def _load_generated_specs() -> Dict[str, Dict[str, Any]]: + path = vsd_generated_path() + data = read_json(path, {}) + return _normalize_spec(data) + + +def _build_request_kwargs(config: Dict[str, Any], arguments: Dict[str, Any]) -> Dict[str, Any]: + fields = config.get("fields", {}) + method = fields.get("method", "GET").upper() + timeout = fields.get("timeout", 30) + headers = fields.get("headers", {}) + default_params = fields.get("default_params", {}) + + params = dict(default_params) + body: Optional[Any] = None + + if method in {"GET", "DELETE"}: + params.update(arguments) + else: + if fields.get("body_format", "json") == "form": + body = dict(arguments) + else: + body = arguments or {} + + kwargs: Dict[str, Any] = { + "method": method, + "url": fields.get("base_url"), + "headers": headers, + "timeout": timeout, + } + if params: + kwargs["params"] = params + if body is not None: + if fields.get("body_format", "json") == "form": + kwargs["data"] = body + else: + kwargs["json"] = body + return kwargs + + +def _handle_response(response: requests.Response) -> Any: + try: + return response.json() + except ValueError: + return { + "status_code": response.status_code, + "text": response.text, + } + + +@register_tool("GenericRESTTool") +class GenericRESTTool(BaseTool): + """ + Generic REST tool generated from a VSD configuration. + """ + + def run(self, arguments=None, stream_callback=None, **_: Any): + arguments = arguments or {} + kwargs = _build_request_kwargs(self.tool_config, arguments) + method = kwargs.pop("method") + url = kwargs.pop("url") + + response = requests.request(method, url, **kwargs) + response.raise_for_status() + result = _handle_response(response) + + if stream_callback: + stream_callback(json.dumps(result)) + return result + + +@register_tool("GenericGraphQLTool") +class GenericGraphQLTool(BaseTool): + """ + Generic GraphQL tool generated from a VSD configuration. 
+ """ + + def run(self, arguments=None, stream_callback=None, **_: Any): + arguments = arguments or {} + fields = self.tool_config.get("fields", {}) + headers = fields.get("headers", {}) + timeout = fields.get("timeout", 30) + payload = { + "query": arguments.get("query") or fields.get("default_query"), + "variables": arguments.get("variables") or fields.get("default_variables", {}), + } + + response = requests.post( + fields.get("base_url"), + json=payload, + headers=headers, + timeout=timeout, + ) + response.raise_for_status() + result = _handle_response(response) + + if stream_callback: + stream_callback(json.dumps(result)) + return result + + +def _register_generated_tool(tool_name: str, config: Dict[str, Any]) -> None: + config = dict(config) + config.setdefault("name", tool_name) + tool_type = config.get("type") or "GenericRESTTool" + + register_config(tool_name, config) + _GENERATED_TOOLS[tool_name] = config + + LOGGER.debug("Registered generated tool %s of type %s", tool_name, tool_type) + + +def refresh_generated_registry() -> Dict[str, Dict[str, Any]]: + """ + Reload generated tool specs from disk and update the runtime registry. + """ + specs = _load_generated_specs() + with _REGISTRY_LOCK: + _GENERATED_TOOLS.clear() + for name, cfg in specs.items(): + _register_generated_tool(name, cfg) + return specs + + +def upsert_generated_tool(tool_name: str, config: Dict[str, Any]) -> Dict[str, Any]: + """ + Insert or update a generated tool in the runtime registry. + """ + with _REGISTRY_LOCK: + _register_generated_tool(tool_name, config) + return _GENERATED_TOOLS[tool_name] + + +def remove_generated_tool(tool_name: str) -> None: + """ + Remove a generated tool from the runtime registry. + """ + with _REGISTRY_LOCK: + _GENERATED_TOOLS.pop(tool_name, None) + LOGGER.debug("Removed generated tool %s", tool_name) diff --git a/src/tooluniverse/logging_config.py b/src/tooluniverse/logging_config.py index 6659f6a1..3cbc36ab 100644 --- a/src/tooluniverse/logging_config.py +++ b/src/tooluniverse/logging_config.py @@ -45,12 +45,12 @@ class ToolUniverseFormatter(logging.Formatter): # Emoji prefixes for different log levels EMOJI_PREFIX = { - "DEBUG": "🔧 ", - "INFO": "ℹ️ ", - "PROGRESS": "⏳ ", - "WARNING": "⚠️ ", - "ERROR": "❌ ", - "CRITICAL": "🚨 ", + "DEBUG": "[DEBUG] ", + "INFO": "[INFO] ", + "PROGRESS": "[PROGRESS] ", + "WARNING": "[WARN] ", + "ERROR": "[ERROR] ", + "CRITICAL": "[CRITICAL] ", } def format(self, record): diff --git a/src/tooluniverse/medlog_tool.py b/src/tooluniverse/medlog_tool.py new file mode 100644 index 00000000..d375a903 --- /dev/null +++ b/src/tooluniverse/medlog_tool.py @@ -0,0 +1,143 @@ +""" +MedLog integration tools. + +These tools expose MedLog collector and FHIR linkage capabilities as native +ToolUniverse tools for event ingestion, querying, and audit retrieval. 
+""" + +from __future__ import annotations + +import os +from typing import Any, Dict + +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + + +class _MedLogBaseTool(BaseTool): + """Shared utility methods for MedLog REST integration.""" + + DEFAULT_BASE_URL = "http://localhost:7001" + + def __init__(self, tool_config: Dict[str, Any]): + super().__init__(tool_config) + self.base_url = os.getenv( + "MEDLOG_COLLECTOR_BASE_URL", self.DEFAULT_BASE_URL + ).rstrip("/") + self.session = requests.Session() + + def _post(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]: + url = f"{self.base_url}{path}" + try: + response = self.session.post(url, json=payload, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedLog collector request failed: {exc}", "endpoint": url} + + def _get(self, path: str) -> Dict[str, Any]: + url = f"{self.base_url}{path}" + try: + response = self.session.get(url, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedLog collector request failed: {exc}", "endpoint": url} + + +class _MedLogFHIRBaseTool(BaseTool): + """Shared logic for interacting with the MedLog FHIR linkage service.""" + + DEFAULT_FHIR_URL = "http://localhost:7003" + + def __init__(self, tool_config: Dict[str, Any]): + super().__init__(tool_config) + self.fhir_base = os.getenv( + "MEDLOG_FHIR_BASE_URL", self.DEFAULT_FHIR_URL + ).rstrip("/") + self.session = requests.Session() + + def _get(self, path: str) -> Dict[str, Any]: + url = f"{self.fhir_base}{path}" + try: + response = self.session.get(url, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedLog FHIR request failed: {exc}", "endpoint": url} + + +@register_tool("MedLogInitEventTool") +class MedLogInitEventTool(_MedLogBaseTool): + """Create or update a MedLog event record.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + return self._post("/medlog/events/init", arguments) + + +@register_tool("MedLogAppendFragmentTool") +class MedLogAppendFragmentTool(_MedLogBaseTool): + """Append fragment data (artifacts, outputs, feedback) to an event.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + event_id = arguments.get("event_id") + fragment = arguments.get("fragment", {}) + if not event_id: + return {"error": "Parameter 'event_id' is required."} + return self._post(f"/medlog/events/{event_id}/append", fragment) + + +@register_tool("MedLogGetProvenanceTool") +class MedLogGetProvenanceTool(_MedLogBaseTool): + """Retrieve PROV-JSON bundle for a specific event.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + event_id = arguments.get("event_id") + if not event_id: + return {"error": "Parameter 'event_id' is required."} + return self._get(f"/medlog/events/{event_id}/prov") + + +@register_tool("MedLogQueryEventsTool") +class MedLogQueryEventsTool(_MedLogBaseTool): + """Query MedLog events by run_id or event_id.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "run_id": arguments.get("run_id"), + "event_id": arguments.get("event_id"), + "limit": arguments.get("limit", 50), + } + return self._post("/query", payload) + + +@register_tool("MedLogExportParquetTool") +class 
MedLogExportParquetTool(_MedLogBaseTool): + """Trigger a parquet export of MedLog events.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + return self._post("/export/parquet", {}) + + +@register_tool("MedLogFHIRBundleTool") +class MedLogFHIRBundleTool(_MedLogFHIRBaseTool): + """Fetch FHIR bundle for a specific event.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + event_id = arguments.get("event_id") + if not event_id: + return {"error": "Parameter 'event_id' is required."} + return self._get(f"/bundle/{event_id}") + + +@register_tool("MedLogFHIRRunBundleTool") +class MedLogFHIRRunBundleTool(_MedLogFHIRBaseTool): + """Fetch FHIR bundle aggregating all events in a run.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + run_id = arguments.get("run_id") + if not run_id: + return {"error": "Parameter 'run_id' is required."} + return self._get(f"/bundle/run/{run_id}") diff --git a/src/tooluniverse/medtok_tool.py b/src/tooluniverse/medtok_tool.py new file mode 100644 index 00000000..1bd4042f --- /dev/null +++ b/src/tooluniverse/medtok_tool.py @@ -0,0 +1,122 @@ +""" +MedTok integration tools. + +These tools provide a thin wrapper around the MedTok FastAPI service so that +ToolUniverse users can tokenize, embed, and explore medical codes directly +from the unified tool catalog. +""" + +from __future__ import annotations + +import os +from typing import Any, Dict + +import requests + +from .base_tool import BaseTool +from .tool_registry import register_tool + + +class _MedTokBaseTool(BaseTool): + """Shared utilities for MedTok REST integrations.""" + + DEFAULT_BASE_URL = "http://localhost:8000" + + def __init__(self, tool_config: Dict[str, Any]): + super().__init__(tool_config) + self.base_url = os.getenv("MEDTOK_BASE_URL", self.DEFAULT_BASE_URL).rstrip("/") + self.session = requests.Session() + + def _post(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]: + url = f"{self.base_url}{path}" + try: + response = self.session.post(url, json=payload, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedTok request failed: {exc}", "endpoint": url} + + def _get(self, path: str) -> Dict[str, Any]: + url = f"{self.base_url}{path}" + try: + response = self.session.get(url, timeout=30) + response.raise_for_status() + return response.json() + except requests.RequestException as exc: # pragma: no cover - network errors + return {"error": f"MedTok request failed: {exc}", "endpoint": url} + + +@register_tool("MedTokTokenizeTool") +class MedTokTokenizeTool(_MedTokBaseTool): + """Tokenize medical codes using MedTok multimodal tokenizer.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "codes": arguments.get("codes", []), + "system": arguments.get("system", "ICD-10"), + "include_metadata": arguments.get("include_metadata", False), + } + return self._post("/tokenize", payload) + + +@register_tool("MedTokEmbedTool") +class MedTokEmbedTool(_MedTokBaseTool): + """Generate token embeddings for a batch of codes.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "codes": arguments.get("codes", []), + "system": arguments.get("system", "ICD-10"), + } + return self._post("/embed", payload) + + +@register_tool("MedTokNearestNeighborsTool") +class MedTokNearestNeighborsTool(_MedTokBaseTool): + """Retrieve nearest neighbours for a code in embedding space.""" + + def run(self, 
arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "code": arguments.get("code"), + "k": arguments.get("k", 5), + "system": arguments.get("system", "ICD-10"), + } + return self._post("/nearest_neighbors", payload) + + +@register_tool("MedTokMapTextTool") +class MedTokMapTextTool(_MedTokBaseTool): + """Map free-text description to the closest medical code.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "text": arguments.get("text", ""), + "system": arguments.get("system", "ICD-10"), + } + return self._post("/map_text_to_code", payload) + + +@register_tool("MedTokSearchTextTool") +class MedTokSearchTextTool(_MedTokBaseTool): + """Perform text and semantic search across the code vocabulary.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + payload = { + "text": arguments.get("text", ""), + "system": arguments.get("system"), + "k": arguments.get("k", 5), + } + return self._post("/search_text", payload) + + +@register_tool("MedTokCodeInfoTool") +class MedTokCodeInfoTool(_MedTokBaseTool): + """Fetch detailed metadata for a specific code.""" + + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + system = arguments.get("system", "ICD-10") + code = arguments.get("code") + if not code: + return {"error": "Parameter 'code' is required."} + path = f"/codes/{system}/{code}" + return self._get(path) diff --git a/src/tooluniverse/utils.py b/src/tooluniverse/utils.py index 88e778fb..ff4867e2 100755 --- a/src/tooluniverse/utils.py +++ b/src/tooluniverse/utils.py @@ -136,7 +136,7 @@ def read_json_list(file_path): Returns list: A list of dictionaries containing the JSON objects. """ - with open(file_path, "r") as file: + with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) return data diff --git a/src/tooluniverse/vsd_tool.py b/src/tooluniverse/vsd_tool.py index b765f8fe..98a09e24 100644 --- a/src/tooluniverse/vsd_tool.py +++ b/src/tooluniverse/vsd_tool.py @@ -1,28 +1,157 @@ from __future__ import annotations -from typing import Any, Dict +from typing import Any, Dict, Optional, List +from urllib.parse import urlparse from .tool_registry import register_tool from .vsd_registry import load_catalog, save_catalog, upsert_tool from .dynamic_rest_runner import refresh_generated_registry, remove_generated_tool from .vsd_utils import build_config, probe_config, stamp_metadata +from .harvest.static_catalog import harvest as harvest_static + +GENERIC_HARVEST_SCHEMA = { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Free-text search term passed to the harvest catalog.", + }, + "limit": { + "type": "integer", + "minimum": 1, + "maximum": 50, + "default": 5, + "description": "Maximum number of candidates to return.", + }, + "urls": { + "type": "array", + "items": {"type": "string", "format": "uri"}, + "description": "Optional explicit URLs to wrap as manual candidates (skips catalog search).", + }, + }, + "additionalProperties": False, +} + +GENERIC_HARVEST_CONFIG = { + "name": "GenericHarvestTool", + "description": "Search the harvest catalog (or wrap manual URLs) to produce candidate API endpoints.", + "type": "GenericHarvestTool", + "category": "special_tools", + "parameter": GENERIC_HARVEST_SCHEMA, +} + +VERIFIED_SOURCE_REGISTER_SCHEMA = { + "type": "object", + "properties": { + "tool_name": {"type": "string"}, + "tool_type": {"type": "string", "default": "dynamic_rest"}, + "candidate": {"type": "object"}, + "default_params": {"type": "object"}, + "default_headers": {"type": 
"object"}, + "force": {"type": "boolean", "default": False}, + }, + "required": ["tool_name", "candidate"], +} + +VERIFIED_SOURCE_REGISTER_CONFIG = { + "name": "VerifiedSourceRegisterTool", + "description": "Register a DynamicREST tool into the verified-source catalog after probing it.", + "type": "VerifiedSourceRegisterTool", + "category": "special_tools", + "parameter": VERIFIED_SOURCE_REGISTER_SCHEMA, +} + +VERIFIED_SOURCE_DISCOVERY_CONFIG = { + "name": "VerifiedSourceDiscoveryTool", + "description": "List the tools currently stored in the verified-source catalog.", + "type": "VerifiedSourceDiscoveryTool", + "category": "special_tools", + "parameter": { + "type": "object", + "properties": {}, + "additionalProperties": False, + }, +} + +VERIFIED_SOURCE_REMOVE_SCHEMA = { + "type": "object", + "properties": { + "tool_name": {"type": "string"}, + }, + "required": ["tool_name"], +} + +VERIFIED_SOURCE_REMOVE_CONFIG = { + "name": "VerifiedSourceRemoveTool", + "description": "Remove a generated tool from the verified-source catalog.", + "type": "VerifiedSourceRemoveTool", + "category": "special_tools", + "parameter": VERIFIED_SOURCE_REMOVE_SCHEMA, +} + + +@register_tool("GenericHarvestTool", config=GENERIC_HARVEST_CONFIG) +class GenericHarvestTool: + name = "GenericHarvestTool" + description = "Harvest candidate API endpoints from the static catalog or wrap manual URLs." + input_schema = GENERIC_HARVEST_SCHEMA + + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} - + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: + query = (arguments.get("query") or "").strip() + limit_value = arguments.get("limit", 5) + try: + limit = int(limit_value) + except (TypeError, ValueError): + limit = 5 + limit = max(1, min(limit, 50)) + urls = arguments.get("urls") or [] + + candidates: List[Dict[str, Any]] = [] + + if urls: + for idx, raw_url in enumerate(urls): + if not raw_url: + continue + parsed = urlparse(str(raw_url)) + host = parsed.netloc.lower() + base_url = f"{parsed.scheme}://{parsed.netloc}" if parsed.scheme and parsed.netloc else raw_url + name = host or f"manual_candidate_{idx + 1}" + candidates.append( + { + "name": name, + "endpoint": raw_url, + "url": raw_url, + "base_url": base_url, + "host": host, + "source": "manual_urls", + "description": arguments.get("description") or "", + "trust": 0.5, + "health": {"ok": None, "status": None, "checked": "manual"}, + } + ) + else: + extra_args = {k: v for k, v in arguments.items() if k not in {"query", "limit", "urls"}} + candidates = harvest_static(query=query, limit=limit, **extra_args) + + return { + "ok": True, + "query": query, + "count": len(candidates), + "candidates": candidates, + } + + +@register_tool("VerifiedSourceRegisterTool", config=VERIFIED_SOURCE_REGISTER_CONFIG) class VerifiedSourceRegisterTool: name = "VerifiedSourceRegisterTool" description = "Register a DynamicREST tool in the verified-source directory" - input_schema = { - "type": "object", - "properties": { - "tool_name": {"type": "string"}, - "tool_type": {"type": "string", "default": "dynamic_rest"}, - "candidate": {"type": "object"}, - "default_params": {"type": "object"}, - "default_headers": {"type": "object"}, - "force": {"type": "boolean", "default": False}, - }, - "required": ["tool_name", "candidate"], - } + input_schema = VERIFIED_SOURCE_REGISTER_SCHEMA + + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} def __call__( self, @@ -71,25 
+200,27 @@ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: ) +@register_tool("VerifiedSourceDiscoveryTool", config=VERIFIED_SOURCE_DISCOVERY_CONFIG) class VerifiedSourceDiscoveryTool: name = "VerifiedSourceDiscoveryTool" description = "Return the Verified-Source catalog." + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} + def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: catalog = load_catalog() return {"ok": True, "tools": list(catalog.values())} +@register_tool("VerifiedSourceRemoveTool", config=VERIFIED_SOURCE_REMOVE_CONFIG) class VerifiedSourceRemoveTool: name = "VerifiedSourceRemoveTool" description = "Remove a generated tool from the Verified-Source catalog." - input_schema = { - "type": "object", - "properties": { - "tool_name": {"type": "string"}, - }, - "required": ["tool_name"], - } + input_schema = VERIFIED_SOURCE_REMOVE_SCHEMA + + def __init__(self, tool_config: Optional[Dict[str, Any]] = None) -> None: + self.tool_config = tool_config or {} def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: tool_name = arguments.get("tool_name") diff --git a/src/tooluniverse/vsd_utils.py b/src/tooluniverse/vsd_utils.py index a30c12dd..3f3250e7 100644 --- a/src/tooluniverse/vsd_utils.py +++ b/src/tooluniverse/vsd_utils.py @@ -79,12 +79,16 @@ def _apply_overrides(candidate: Dict[str, Any], cfg: Dict[str, Any]) -> None: overrides = HOST_OVERRIDES.get(host) if overrides: + fields = cfg.setdefault("fields", {}) if overrides.get("endpoint"): cfg["endpoint"] = overrides["endpoint"] + fields["base_url"] = overrides["endpoint"] if overrides.get("default_params"): cfg.setdefault("default_params", {}).update(overrides["default_params"]) + fields.setdefault("default_params", {}).update(overrides["default_params"]) if overrides.get("default_headers"): cfg.setdefault("default_headers", {}).update(overrides["default_headers"]) + fields.setdefault("headers", {}).update(overrides["default_headers"]) if overrides.get("notes"): cfg.setdefault("metadata", {}).setdefault("notes", []).append(overrides["notes"]) @@ -100,6 +104,7 @@ def _apply_overrides(candidate: Dict[str, Any], cfg: Dict[str, Any]) -> None: ) if requirements.get("default_headers"): cfg.setdefault("default_headers", {}).update(requirements["default_headers"]) + cfg.setdefault("fields", {}).setdefault("headers", {}).update(requirements["default_headers"]) # ------------------------------------------------------------------------------ @@ -120,15 +125,43 @@ def build_config( merged_params = deepcopy(candidate.get("default_params") or candidate.get("params") or {}) merged_headers = deepcopy(candidate.get("default_headers") or candidate.get("headers") or {}) - cfg: Dict[str, Any] = { - "type": tool_type, - "endpoint": endpoint, + # Allow overrides provided via arguments + if default_params: + merged_params.update(default_params) + if default_headers: + merged_headers.update(default_headers) + + # Determine implementation class + declared_type = str(candidate.get("tool_type") or tool_type or "").lower() + impl_type = "GenericRESTTool" + if declared_type in {"graphql", "genericgraphqltool", "graph_ql"} or endpoint.endswith(".graphql"): + impl_type = "GenericGraphQLTool" + + # Provide a permissive parameter schema with defaults from known params + parameter_schema: Dict[str, Any] = deepcopy(candidate.get("parameter_schema") or candidate.get("parameter") or {}) + if not parameter_schema: + properties = { + key: {"description": f"Override default query parameter 
'{key}'", "default": value} + for key, value in merged_params.items() + } + parameter_schema = { + "type": "object", + "properties": properties, + "additionalProperties": True, + } + + fields: Dict[str, Any] = { + "base_url": endpoint, "method": method, "default_params": merged_params, - "default_headers": merged_headers, - "auth": candidate.get("auth") or {"type": "none"}, + "headers": merged_headers, + } + + cfg: Dict[str, Any] = { + "type": impl_type, "description": candidate.get("description") or "", - "tool_type": candidate.get("tool_type") or "dynamic_rest", + "fields": fields, + "parameter": parameter_schema, "metadata": { "source": candidate.get("source"), "trust": candidate.get("trust"), @@ -138,17 +171,19 @@ def build_config( "host": candidate.get("host"), }, "vsd": candidate, + # Backwards compatibility fields expected by older utilities + "tool_type": candidate.get("tool_type") or tool_type or "dynamic_rest", + "endpoint": endpoint, + "method": method, + "default_params": merged_params, + "default_headers": merged_headers, + "auth": candidate.get("auth") or {"type": "none"}, } response_key = candidate.get("response_key") if response_key: cfg["response_key"] = response_key - if default_params: - cfg["default_params"].update(default_params) - if default_headers: - cfg["default_headers"].update(default_headers) - _apply_overrides(candidate, cfg) return cfg @@ -159,10 +194,11 @@ def probe_config(cfg: Dict[str, Any]) -> Dict[str, Any]: Execute a lightweight HTTP request to validate the generated configuration. Returns diagnostic information including HTTP status and a JSON snippet if available. """ - url = cfg.get("endpoint") - method = (cfg.get("method") or "GET").upper() - params = deepcopy(cfg.get("default_params") or {}) - headers = deepcopy(cfg.get("default_headers") or {}) + fields = cfg.get("fields") or {} + url = cfg.get("endpoint") or fields.get("base_url") + method = (fields.get("method") or cfg.get("method") or "GET").upper() + params = deepcopy(fields.get("default_params") or cfg.get("default_params") or {}) + headers = deepcopy(fields.get("headers") or cfg.get("default_headers") or {}) headers.setdefault("Accept", "application/json") try: diff --git a/tests/integration/test_medtok_medlog_tools.py b/tests/integration/test_medtok_medlog_tools.py new file mode 100644 index 00000000..a708ecf8 --- /dev/null +++ b/tests/integration/test_medtok_medlog_tools.py @@ -0,0 +1,282 @@ +import importlib.util +import json +import os +import sys +import tempfile +import threading +import time +from pathlib import Path + +import pytest +import uvicorn +from fastapi import FastAPI, HTTPException + +from tooluniverse.execute_function import ToolUniverse + + +class _ServerHandle: + """Utility wrapper for running uvicorn servers in tests.""" + + def __init__(self, app: FastAPI, host: str, port: int): + config = uvicorn.Config( + app, host=host, port=port, log_level="error", lifespan="off" + ) + self.server = uvicorn.Server(config) + self.thread = threading.Thread(target=self.server.run, daemon=True) + + def start(self) -> None: + self.thread.start() + while not self.server.started: + time.sleep(0.05) + + def stop(self) -> None: + self.server.should_exit = True + self.thread.join(timeout=5) + + +def _import_medtok_app(module_path: Path): + spec = importlib.util.spec_from_file_location("medtok_service_app", module_path) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +@pytest.fixture(scope="session") +def 
medtok_server(): + repo_root = Path(__file__).resolve().parents[3] + medtok_root = repo_root / "MedTok-FHIR-Starter" + service_dir = medtok_root / "services" / "medtok_service" + sys.path.insert(0, str(service_dir)) + + base_config_path = medtok_root / "config" / "medtok_config.json" + config_data = json.loads(base_config_path.read_text(encoding="utf-8")) + config_data["code_metadata_path"] = str( + medtok_root / "samples" / "code_metadata.csv" + ) + config_data["graph_edges_path"] = str( + medtok_root / "samples" / "code_graph_edges.csv" + ) + tmp_config = tempfile.NamedTemporaryFile( + "w", suffix="_medtok_config.json", delete=False + ) + json.dump(config_data, tmp_config) + tmp_config.flush() + tmp_config.close() + os.environ["MEDTOK_CONFIG"] = tmp_config.name + + module = _import_medtok_app(service_dir / "app.py") + module.MAPPING_CSV = str(medtok_root / "samples" / "code_mapping.csv") + app = module.app + + host = "127.0.0.1" + port = 8910 + server = _ServerHandle(app, host, port) + server.start() + + base_url = f"http://{host}:{port}" + os.environ["MEDTOK_BASE_URL"] = base_url + + yield base_url + + server.stop() + os.environ.pop("MEDTOK_BASE_URL", None) + os.environ.pop("MEDTOK_CONFIG", None) + try: + os.remove(tmp_config.name) + except FileNotFoundError: + pass + sys.path.remove(str(service_dir)) + + +def _build_medlog_collector(store): + app = FastAPI() + + @app.post("/medlog/events/init") + def init(payload: dict): + header = payload.get("header") or {} + event_id = header.get("event_id") + if not event_id: + raise HTTPException(400, "event_id required") + record = { + "header": header, + "model_instance": payload.get("model_instance", {}), + "user_identity": payload.get("user_identity", {}), + "target_identity": payload.get("target_identity"), + "inputs": payload.get("inputs"), + "retention_tier": payload.get("retention_tier", "steady"), + "fragments": [], + } + store[event_id] = record + return {"status": "ok", "event_id": event_id} + + @app.post("/medlog/events/{event_id}/append") + def append(event_id: str, fragment: dict): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + record["fragments"].append(fragment) + return {"status": "ok", "event_id": event_id} + + @app.get("/medlog/events/{event_id}/prov") + def prov(event_id: str): + record = store.get(event_id) + if record is None: + raise HTTPException(404, "event not found") + return {"event_id": event_id, "provenance": {"header": record["header"]}} + + @app.post("/query") + def query(body: dict): + run_id = body.get("run_id") + event_id = body.get("event_id") + limit = body.get("limit", 50) + matches = [] + for eid, record in store.items(): + header = record["header"] + if event_id and event_id != eid: + continue + if run_id and header.get("run_id") != run_id: + continue + matches.append({"event_id": eid, "header": header}) + if len(matches) >= limit: + break + return {"count": len(matches), "results": matches} + + @app.post("/export/parquet") + def export(): + return {"status": "ok", "outdir": "/tmp/parquet"} + + return app + + +def _build_medlog_fhir(store): + app = FastAPI() + + def _bundle_for_records(records): + entries = [] + for rec in records: + entries.append( + { + "resource": { + "resourceType": "Observation", + "id": rec["header"]["event_id"], + "status": "final", + } + } + ) + return {"resourceType": "Bundle", "type": "collection", "entry": entries} + + @app.get("/bundle/{event_id}") + def bundle(event_id: str): + record = store.get(event_id) + if record is None: 
+ raise HTTPException(404, "event not found") + return _bundle_for_records([record]) + + @app.get("/bundle/run/{run_id}") + def bundle_run(run_id: str): + records = [ + record + for record in store.values() + if record["header"].get("run_id") == run_id + ] + if not records: + raise HTTPException(404, "run not found") + return _bundle_for_records(records) + + return app + + +@pytest.fixture(scope="session") +def medlog_servers(): + store = {} + host = "127.0.0.1" + collector_port = 8911 + fhir_port = 8912 + + collector_app = _build_medlog_collector(store) + fhir_app = _build_medlog_fhir(store) + + collector = _ServerHandle(collector_app, host, collector_port) + fhir = _ServerHandle(fhir_app, host, fhir_port) + collector.start() + fhir.start() + + os.environ["MEDLOG_COLLECTOR_BASE_URL"] = f"http://{host}:{collector_port}" + os.environ["MEDLOG_FHIR_BASE_URL"] = f"http://{host}:{fhir_port}" + + yield store + + collector.stop() + fhir.stop() + os.environ.pop("MEDLOG_COLLECTOR_BASE_URL", None) + os.environ.pop("MEDLOG_FHIR_BASE_URL", None) + + +def test_medtok_rest_tools(medtok_server): + tu = ToolUniverse(hooks_enabled=False) + tu.load_tools(tool_type=["medtok"]) + + tokenize = tu.tools.MedTok_tokenize( + codes=["A00", "E11"], system="ICD-10", include_metadata=True + ) + token_ids = tokenize.get("token_ids", []) + assert isinstance(token_ids, list) + assert len(token_ids) in (0, 2) + + embed = tu.tools.MedTok_embed(codes=["A00"], system="ICD-10") + embeddings = embed.get("embeddings", []) + if embeddings: + assert isinstance(embeddings[0], list) + assert embed.get("dim") == len(embeddings[0]) + + neighbors = tu.tools.MedTok_nearest_neighbors(code="A00", k=3) + neighbor_list = neighbors.get("neighbors", []) + assert len(neighbor_list) <= 3 + + mapped = tu.tools.MedTok_map_text_to_code(text="type 2 diabetes", system="ICD-10") + assert "code" in mapped + + search = tu.tools.MedTok_search_text(text="hypertension", k=4) + assert len(search.get("matches", [])) <= 4 + + code_info = tu.tools.MedTok_code_info(code="E11", system="ICD-10") + assert isinstance(code_info, dict) + + +def test_medlog_tools_workflow(medlog_servers): + tu = ToolUniverse(hooks_enabled=False) + tu.load_tools(tool_type=["medlog"]) + + header = { + "event_id": "evt-1", + "run_id": "run-123", + "timestamp": "2025-01-01T00:00:00Z", + } + model_instance = {"model": "demo", "version": "1.0"} + user_identity = {"name": "Dr. 
Example"} + + init_resp = tu.tools.MedLog_init_event( + header=header, model_instance=model_instance, user_identity=user_identity + ) + assert init_resp["status"] == "ok" + + fragment = {"outputs": {"summary": "Patient stable"}} + append_resp = tu.tools.MedLog_append_fragment(event_id="evt-1", fragment=fragment) + assert append_resp["status"] == "ok" + + prov_resp = tu.tools.MedLog_get_provenance(event_id="evt-1") + assert prov_resp["event_id"] == "evt-1" + + query_resp = tu.tools.MedLog_query_events(run_id="run-123") + assert query_resp["count"] == 1 + assert query_resp["results"][0]["event_id"] == "evt-1" + + export_resp = tu.tools.MedLog_export_parquet() + assert export_resp["status"] == "ok" + + bundle_resp = tu.tools.MedLog_fhir_bundle(event_id="evt-1") + assert bundle_resp["resourceType"] == "Bundle" + + run_bundle_resp = tu.tools.MedLog_fhir_run_bundle(run_id="run-123") + assert len(run_bundle_resp["entry"]) == 1 From e32eff7efa6e0b58be70ae35585a71c568875c5f Mon Sep 17 00:00:00 2001 From: SufianTA Date: Sun, 26 Oct 2025 19:22:46 -0700 Subject: [PATCH 3/8] Stub MedTok service in integration tests --- tests/integration/test_medtok_medlog_tools.py | 175 +++++++++++++----- 1 file changed, 133 insertions(+), 42 deletions(-) diff --git a/tests/integration/test_medtok_medlog_tools.py b/tests/integration/test_medtok_medlog_tools.py index a708ecf8..1d1cdadf 100644 --- a/tests/integration/test_medtok_medlog_tools.py +++ b/tests/integration/test_medtok_medlog_tools.py @@ -1,11 +1,6 @@ -import importlib.util -import json import os -import sys -import tempfile import threading import time -from pathlib import Path import pytest import uvicorn @@ -34,40 +29,142 @@ def stop(self) -> None: self.thread.join(timeout=5) -def _import_medtok_app(module_path: Path): - spec = importlib.util.spec_from_file_location("medtok_service_app", module_path) - module = importlib.util.module_from_spec(spec) - assert spec.loader is not None - spec.loader.exec_module(module) - return module - - @pytest.fixture(scope="session") def medtok_server(): - repo_root = Path(__file__).resolve().parents[3] - medtok_root = repo_root / "MedTok-FHIR-Starter" - service_dir = medtok_root / "services" / "medtok_service" - sys.path.insert(0, str(service_dir)) - - base_config_path = medtok_root / "config" / "medtok_config.json" - config_data = json.loads(base_config_path.read_text(encoding="utf-8")) - config_data["code_metadata_path"] = str( - medtok_root / "samples" / "code_metadata.csv" - ) - config_data["graph_edges_path"] = str( - medtok_root / "samples" / "code_graph_edges.csv" - ) - tmp_config = tempfile.NamedTemporaryFile( - "w", suffix="_medtok_config.json", delete=False - ) - json.dump(config_data, tmp_config) - tmp_config.flush() - tmp_config.close() - os.environ["MEDTOK_CONFIG"] = tmp_config.name + """ + Launch a minimal in-memory MedTok stub so MedTokTool wrappers can be tested + without cloning the full MedTok repository. 
+ """ + + codes = { + "ICD-10": { + "A00": { + "code": "A00", + "system": "ICD-10", + "name": "Cholera", + "description": "Infection caused by Vibrio cholerae", + "aliases": ["cholera"], + "embedding": [0.9, 0.05, 0.05, 0.0], + "token_id": 101, + }, + "E11": { + "code": "E11", + "system": "ICD-10", + "name": "Type 2 diabetes mellitus", + "description": "Chronic condition impacting glucose metabolism", + "aliases": ["type 2 diabetes", "diabetes"], + "embedding": [0.1, 0.8, 0.1, 0.0], + "token_id": 202, + }, + "I10": { + "code": "I10", + "system": "ICD-10", + "name": "Essential (primary) hypertension", + "description": "Persistently high blood pressure", + "aliases": ["hypertension", "high blood pressure"], + "embedding": [0.05, 0.05, 0.85, 0.05], + "token_id": 303, + }, + } + } + + def _build_stub_app() -> FastAPI: + app = FastAPI(title="MedTok Stub Service", version="0.0.1") + + def _normalise_system(system: str) -> str: + return (system or "ICD-10").upper() + + def _fetch_code(system: str, code: str): + return codes.get(system, {}).get(code) + + def _match_text(system: str, text: str): + text_lower = (text or "").lower() + for record in codes.get(system, {}).values(): + if text_lower in record["code"].lower(): + return record + for alias in record.get("aliases", []): + if text_lower in alias.lower(): + return record + values = list(codes.get(system, {}).values()) + return values[0] if values else None + + @app.post("/tokenize") + def tokenize(payload: dict): + system = _normalise_system(payload.get("system")) + include_metadata = bool(payload.get("include_metadata")) + token_ids = [] + metadata = [] + for code in payload.get("codes", []): + record = _fetch_code(system, code) + if not record: + continue + token_ids.append(record["token_id"]) + if include_metadata: + metadata.append(record) + response = {"token_ids": token_ids} + if include_metadata: + response["metadata"] = metadata + return response + + @app.post("/embed") + def embed(payload: dict): + embeddings = [] + for code in payload.get("codes", []): + record = _fetch_code(_normalise_system(payload.get("system")), code) + if record: + embeddings.append(record["embedding"]) + dim = len(embeddings[0]) if embeddings else 0 + return {"embeddings": embeddings, "dim": dim} + + @app.post("/nearest_neighbors") + def nearest_neighbors(payload: dict): + neighbors = [ + {"code": "E11", "score": 0.42}, + {"code": "I10", "score": 0.33}, + {"code": "A00", "score": 0.28}, + ] + k = max(1, min(int(payload.get("k", 5)), len(neighbors))) + return {"code": payload.get("code"), "neighbors": neighbors[:k]} + + @app.post("/map_text_to_code") + def map_text(payload: dict): + system = _normalise_system(payload.get("system")) + match = _match_text(system, payload.get("text")) + if not match: + raise HTTPException(404, "No matching code found") + return { + "code": match["code"], + "system": match["system"], + "name": match["name"], + } + + @app.post("/search_text") + def search_text(payload: dict): + system = _normalise_system(payload.get("system")) + match = _match_text(system, payload.get("text")) + results = [] + if match: + results.append( + { + "code": match["code"], + "system": match["system"], + "description": match["description"], + "score": 0.9, + } + ) + k = int(payload.get("k", 5)) + return {"query": payload.get("text"), "matches": results[:k]} + + @app.get("/codes/{system}/{code}") + def code_info(system: str, code: str): + record = _fetch_code(_normalise_system(system), code) + if not record: + raise HTTPException(404, "Code not found") + 
return record + + return app - module = _import_medtok_app(service_dir / "app.py") - module.MAPPING_CSV = str(medtok_root / "samples" / "code_mapping.csv") - app = module.app + app = _build_stub_app() host = "127.0.0.1" port = 8910 @@ -81,12 +178,6 @@ def medtok_server(): server.stop() os.environ.pop("MEDTOK_BASE_URL", None) - os.environ.pop("MEDTOK_CONFIG", None) - try: - os.remove(tmp_config.name) - except FileNotFoundError: - pass - sys.path.remove(str(service_dir)) def _build_medlog_collector(store): From 1981dc99f274bc97ba30be3f2dce8d82e8992539 Mon Sep 17 00:00:00 2001 From: SufianTA Date: Sun, 26 Oct 2025 20:00:31 -0700 Subject: [PATCH 4/8] Update Readme --- README.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/README.md b/README.md index 7ab04aec..9692da83 100644 --- a/README.md +++ b/README.md @@ -232,6 +232,44 @@ Our comprehensive documentation covers everything from quick start to advanced w - **[Adding Tools Tutorial](https://zitniklab.hms.harvard.edu/ToolUniverse/tutorials/addtools/Adding_Tools_Tutorial.html)**: Step-by-step tool addition guide - **[MCP Tool Registration](https://zitniklab.hms.harvard.edu/ToolUniverse/tutorials/addtools/mcp_tool_registration_en.html)**: Register tools via MCP +### Verified Source Discovery (VSD) + Harvest Workflow + +ToolUniverse ships a “harvest → verify → register” pipeline that turns external REST endpoints into first-class Dynamic REST tools. + +1. **Harvest candidates** – `GenericHarvestTool` searches the static Harvest catalog or wraps ad‑hoc URLs you supply with `{"urls": [...]}`. +2. **Probe readiness** – `HarvestCandidateTesterTool` (optional) performs a dry run against the candidate, suggesting default query params or headers. +3. **Register verified tools** – `VerifiedSourceRegisterTool` stamps metadata, persists the tool in `~/.tooluniverse/vsd/generated_tools.json` (override with `TOOLUNIVERSE_VSD_DIR`), and hot-loads it through the Dynamic REST runner. +4. **Inspect / prune** – `VerifiedSourceDiscoveryTool` lists everything in the verified catalog, while `VerifiedSourceRemoveTool` deletes entries and unregisters their dynamic bindings. + +```python +from tooluniverse.vsd_tool import ( + GenericHarvestTool, + HarvestCandidateTesterTool, + VerifiedSourceRegisterTool, + VerifiedSourceDiscoveryTool, + VerifiedSourceRemoveTool, +) + +harvest = GenericHarvestTool({}) +candidate = harvest.run({"query": "clinical"})["candidates"][0] + +tester = HarvestCandidateTesterTool({}) +probe = tester.run({"candidate": candidate}) + +register = VerifiedSourceRegisterTool({}) +register.run( + "ClinicalTrialsREST", + candidate, + default_params={"search": "cancer"}, + force=True, # bypass strict validation once endpoint is known-good +) + +print(VerifiedSourceDiscoveryTool({}).run({})["tools"]) +VerifiedSourceRemoveTool({}).run({"tool_name": "ClinicalTrialsREST"}) +``` + +Registered tools are immediately available to agents via normal loading (e.g., `ToolUniverse().load_tools(tool_type=["dynamic_rest"])`). This workflow keeps internal sources (Harvest/VSD) separate from public REST integrations so they can ship on their own release cadence. 
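+
+As a quick smoke test, a freshly registered tool can be invoked like any other catalog entry (a minimal sketch; substitute the tool name you registered above):
+
+```python
+from tooluniverse.execute_function import ToolUniverse
+
+tu = ToolUniverse()
+tu.load_tools(tool_type=["dynamic_rest"])  # picks up the VSD-generated tools
+print(tu.run_one_function({"name": "ClinicalTrialsREST", "arguments": {"search": "cancer"}}))
+```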
+
 ### 📚 API Reference
 - **[API Directory](https://zitniklab.hms.harvard.edu/ToolUniverse/api/modules.html)**: Complete module listing
 - **[Core Modules](https://zitniklab.hms.harvard.edu/ToolUniverse/api/tooluniverse.html)**: Main ToolUniverse class and utilities

From 83da56b250a12d9ffb8f0d48bd512f591deba0d6 Mon Sep 17 00:00:00 2001
From: SufianTA
Date: Wed, 29 Oct 2025 20:51:45 -0700
Subject: [PATCH 5/8] Added Docker Provisioner

---
 .env.template                                 |  17 ++
 docs/expand_tooluniverse/architecture.rst     |  32 +++
 scripts/provision_docker_llm.py               | 125 +++++++++
 scripts/run_insightlab_demo.py                | 172 ++++++++++++
 .../compose_scripts/docker_llm_provisioner.py |  69 +++++
 .../compose_scripts/harvest_auto_registrar.py | 182 +++++++++++++
 src/tooluniverse/data/compose_tools.json      | 178 ++++++++++++
 src/tooluniverse/execute_function.py          |  37 ++-
 .../remote/docker_llm/provision.py            | 256 ++++++++++++++++++
 src/tooluniverse/remote_tool.py               |  10 +-
 .../tools/DockerLLMProvisioner.py             |  69 +++++
 .../tools/HarvestAutoRegistrar.py             |  91 +++++++
 src/tooluniverse/tools/__init__.py            |   4 +
 tests/test_docker_llm_provision.py            |  81 ++++++
 tests/test_harvest_auto_registrar.py          |  83 ++++++
 15 files changed, 1395 insertions(+), 11 deletions(-)
 create mode 100644 .env.template
 create mode 100644 scripts/provision_docker_llm.py
 create mode 100644 scripts/run_insightlab_demo.py
 create mode 100644 src/tooluniverse/compose_scripts/docker_llm_provisioner.py
 create mode 100644 src/tooluniverse/compose_scripts/harvest_auto_registrar.py
 create mode 100644 src/tooluniverse/remote/docker_llm/provision.py
 create mode 100644 src/tooluniverse/tools/DockerLLMProvisioner.py
 create mode 100644 src/tooluniverse/tools/HarvestAutoRegistrar.py
 create mode 100644 tests/test_docker_llm_provision.py
 create mode 100644 tests/test_harvest_auto_registrar.py

diff --git a/.env.template b/.env.template
new file mode 100644
index 00000000..85ccc277
--- /dev/null
+++ b/.env.template
@@ -0,0 +1,17 @@
+# API keys and service endpoints for ToolUniverse
+# Copy this file to .env and fill in your actual values
+
+OPENAI_API_KEY=your_api_key_here  # or set AZURE_OPENAI_API_KEY instead
+
+BOLTZ_MCP_SERVER_HOST=your_server_host_here
+
+EXPERT_FEEDBACK_MCP_SERVER_URL=your_server_url_here
+
+HF_TOKEN=your_token_here
+
+TXAGENT_MCP_SERVER_HOST=your_server_host_here
+
+USPTO_API_KEY=your_api_key_here
+
+USPTO_MCP_SERVER_HOST=your_server_host_here
+
diff --git a/docs/expand_tooluniverse/architecture.rst b/docs/expand_tooluniverse/architecture.rst
index 5c90195c..eb901c7b 100644
--- a/docs/expand_tooluniverse/architecture.rst
+++ b/docs/expand_tooluniverse/architecture.rst
@@ -85,6 +85,8 @@ Repository Structure Tree
    │   ├── tool_finder_keyword.py    # Keyword-based tool search
    │   ├── tool_finder_embedding.py  # Embedding-based tool search
    │   ├── tool_finder_llm.py        # LLM-powered tool discovery
+   │   ├── remote/docker_llm/        # Docker-based LLM provisioning helpers
+   │   ├── DockerLLMProvisioner.py   # Compose tool for Docker LLM MCP auto-registration
    │   ├── embedding_database.py     # Tool embedding database
    │   └── embedding_sync.py         # Embedding synchronization
    │   │
@@ -314,6 +316,36 @@ Extension Points
 - Use `compose_tool.py` or add scripts in `compose_scripts/` for complex call chains
 - Leverage `tool_finder_*` for retrieval and routing assistance

+Tool Loading Cheat Sheet
+------------------------
+
+- Package data is loaded from the JSON files mapped in :mod:`default_config.py` plus everything under ``src/tooluniverse/data/``.
+- Remote/MCP entries are merged from both the packaged ``data/remote_tools`` directory **and** the user override folder ``~/.tooluniverse/remote_tools``. Dropping a JSON config there makes the tool visible without code changes. +- The runtime builds three main registries: + + 1. ``tool_files`` → category JSON manifests (local tools) + 2. ``data/remote_tools`` → bundled remote definitions + 3. ``~/.tooluniverse/remote_tools`` → user/automation supplied remote definitions + +- Use ``ToolUniverse.load_tools()`` to refresh the registry after adding new files without restarting the host process. + +Remote MCP Provisioning +----------------------- + +- ``DockerLLMProvisioner`` (compose tool) and ``scripts/provision_docker_llm.py`` automate standing up an MCP-enabled LLM in Docker, poll its ``/health`` endpoint, and emit the JSON configs under ``~/.tooluniverse/remote_tools`` so the new tool registers instantly. +- Remote stubs created from bundled configs (e.g., expert feedback, DepMap) are read-only until you connect ToolUniverse to the actual MCP server. You can: + + 1. Call ``ToolUniverse.load_mcp_tools(["http://server:port/mcp"])`` to ingest tools live, or + 2. Provision a local container via ``DockerLLMProvisioner`` or the CLI helper to host the endpoints yourself. +- The `RemoteTool` error message now includes these activation instructions when an agent accidentally calls an offline remote tool. + +Catalog Navigation Tips +----------------------- + +- ``ToolNavigatorTool`` combines the full catalog (including remote/VSD entries) with lightweight scoring—use it to shortlist relevant tools before running long compositions. +- ``ToolFinderKeyword`` / ``ToolFinderEmbedding`` provide complementary search modalities; both now benefit from the expanded metadata listed in ``~/.tooluniverse/remote_tools``. +- For big collections consider building category-specific shortlists in ``toolsets/`` and surfacing them via ``ToolNavigatorTool`` filters or custom compose tools. + Directory Quick Reference -------------------------- diff --git a/scripts/provision_docker_llm.py b/scripts/provision_docker_llm.py new file mode 100644 index 00000000..c3637b3a --- /dev/null +++ b/scripts/provision_docker_llm.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +""" +Provision a Docker-hosted LLM and register it with ToolUniverse. + +This script wraps the helper in tooluniverse.remote.docker_llm.provision so that +non-technical users can start the container and create the necessary MCP client +configurations with a single command. +""" + +from __future__ import annotations + +import argparse +import sys + +from tooluniverse.remote.docker_llm.provision import ( + DEFAULT_IMAGE, + ProvisionError, + provision_docker_llm, +) + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Start a Docker-hosted LLM and register it with ToolUniverse." + ) + parser.add_argument( + "--image", + default=DEFAULT_IMAGE, + help=f"Docker image to run (default: {DEFAULT_IMAGE})", + ) + parser.add_argument( + "--container-name", + help="Name for the Docker container. 
Generated automatically if omitted.", + ) + parser.add_argument( + "--host", + default="127.0.0.1", + help="Host interface to bind (default: 127.0.0.1).", + ) + parser.add_argument( + "--host-port", + type=int, + default=9000, + help="Host port to expose the MCP endpoint on (default: 9000).", + ) + parser.add_argument( + "--container-port", + type=int, + default=8000, + help="Internal container port (default: 8000).", + ) + parser.add_argument( + "--tool-name", + default="DockerLLMChat", + help="Tool name to register inside ToolUniverse.", + ) + parser.add_argument( + "--tool-prefix", + help="Prefix used when auto-registering tools from the MCP server.", + ) + parser.add_argument( + "--mcp-tool-name", + default="docker_llm_chat", + help="Underlying MCP tool name exposed by the container.", + ) + parser.add_argument( + "--health-path", + default="/health", + help="HTTP path used for readiness checks (default: /health).", + ) + parser.add_argument( + "--timeout", + type=int, + default=120, + help="Seconds to wait for container health (default: 120).", + ) + parser.add_argument( + "--no-reuse", + action="store_true", + help="Always recreate the container instead of reusing an existing one.", + ) + parser.add_argument( + "--docker-cli", + default="docker", + help="Docker CLI executable to invoke (default: docker).", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + try: + result = provision_docker_llm( + image=args.image, + container_name=args.container_name, + docker_cli=args.docker_cli, + host=args.host, + host_port=args.host_port, + container_port=args.container_port, + tool_name=args.tool_name, + tool_prefix=args.tool_prefix, + mcp_tool_name=args.mcp_tool_name, + health_path=args.health_path, + timeout_seconds=args.timeout, + reuse_container=not args.no_reuse, + ) + except ProvisionError as exc: + print(f"Provisioning failed: {exc}", file=sys.stderr) + return 1 + + print("Docker LLM provisioning complete.") + print(f" Container name : {result.container_name}") + print(f" MCP server URL : {result.server_url}") + print(f" Tool config : {result.config_path}") + print( + "Add the tool by reloading ToolUniverse or invoking " + "'DockerLLMProvisioner' from within the agent." + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_insightlab_demo.py b/scripts/run_insightlab_demo.py new file mode 100644 index 00000000..651f1188 --- /dev/null +++ b/scripts/run_insightlab_demo.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +""" +InsightLab end‑to‑end smoke test. + +1. Provisions a Docker-hosted MCP LLM if Docker is available. +2. Uses the InsightLab LLM for hypothesis drafting. +3. Runs the harvest → test → register flow and calls the registered tool. +4. Summarises findings with the LLM. + +Prerequisites: + - Run this on a machine with Docker installed and outbound HTTPS access. + - The image ghcr.io/tooluniverse/docker-llm-mcp:latest should be reachable (or already built). 
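+
+Usage:
+    python scripts/run_insightlab_demo.py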
+""" + +import json +import sys +import traceback +from pprint import pprint + +import os +import sys +import traceback +from pathlib import Path +from pprint import pprint + +# Ensure the repository's src directory is importable when running as a script +PROJECT_ROOT = Path(__file__).resolve().parents[1] +SRC_PATH = PROJECT_ROOT / "src" +if str(SRC_PATH) not in sys.path: + sys.path.insert(0, str(SRC_PATH)) + +from tooluniverse.execute_function import ToolUniverse # noqa: E402 + + +def pretty(title, payload): + print(f"\n=== {title} ===") + try: + print(json.dumps(payload, indent=2)) + except TypeError: + pprint(payload) + + +def main(): + tu = ToolUniverse() + + try: + tu.load_tools() + except Exception: + print("Failed to load tools:") + traceback.print_exc() + return 1 + + print("Tools loaded.") + + # 1. Provision Docker LLM + try: + provision = tu.run_one_function( + { + "name": "DockerLLMProvisioner", + "arguments": { + "docker_image": "ghcr.io/tooluniverse/docker-llm-mcp:latest", + "host_port": 9010, + "reuse_container": True, + "tool_name": "InsightLabLLM", + }, + } + ) + pretty("DockerLLMProvisioner result", provision) + llm_tool_name = provision.get("tool_name") or "InsightLabLLM" + except Exception: + print("Docker provision step failed (Docker must be available).") + traceback.print_exc() + return 1 + + # Refresh tools to ensure local MCP client is loaded + tu.load_tools() + + # 2. Draft a hypothesis with the local LLM + hypothesis_prompt = ( + "Draft a research hypothesis about the linkage between vitamin D deficiency " + "and autoimmune disorders. Provide two key questions to investigate." + ) + try: + hypothesis = tu.run_one_function( + { + "name": llm_tool_name, + "arguments": { + "prompt": hypothesis_prompt, + "temperature": 0.3, + "max_tokens": 300, + }, + } + ) + pretty("Hypothesis output", hypothesis) + except Exception: + print("InsightLabLLM call failed.") + traceback.print_exc() + return 1 + + # 3. Harvest → register a dataset API + try: + harvest = tu.run_one_function( + { + "name": "HarvestAutoRegistrar", + "arguments": { + "query": "vitamin D immune dataset", + "limit": 5, + "tool_name": "vitamin_d_immune_api", + "auto_run": False, + }, + } + ) + pretty("HarvestAutoRegistrar result", harvest) + except Exception: + print("Harvest auto-registration failed (network required).") + traceback.print_exc() + return 1 + + registered_name = harvest.get("registered_tool_name") + if not registered_name: + print("No tool was registered; check the attempts above.") + return 1 + + tu.load_tools() + + # Call the newly registered tool with sample arguments (may need adjusting) + try: + api_call = tu.run_one_function( + {"name": registered_name, "arguments": {"q": "vitamin D", "rows": 5}} + ) + pretty(f"Call to {registered_name}", api_call) + except Exception: + print(f"Call to {registered_name} failed.") + traceback.print_exc() + return 1 + + # 4. 
Summarise the results with the LLM + summary_prompt = f""" + You produced the hypothesis: + {hypothesis} + + And retrieved data from {registered_name}: + {api_call} + + Provide: + - 150-word summary + - Confidence level between 0 and 1 with explanation + - Two suggested follow-up experiments + """ + try: + summary = tu.run_one_function( + { + "name": llm_tool_name, + "arguments": { + "prompt": summary_prompt, + "temperature": 0.3, + "max_tokens": 400, + }, + } + ) + pretty("InsightLab summary", summary) + except Exception: + print("Final summarisation failed.") + traceback.print_exc() + return 1 + + print("\nInsightLab smoke test completed successfully.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/tooluniverse/compose_scripts/docker_llm_provisioner.py b/src/tooluniverse/compose_scripts/docker_llm_provisioner.py new file mode 100644 index 00000000..d238f74f --- /dev/null +++ b/src/tooluniverse/compose_scripts/docker_llm_provisioner.py @@ -0,0 +1,69 @@ +""" +Compose script that provisions a Docker-hosted LLM and refreshes ToolUniverse. +""" + +from __future__ import annotations + +from typing import Any, Dict, Optional + +from ..remote.docker_llm.provision import ProvisionError, provision_docker_llm + + +def compose( + arguments: Dict[str, Any], + tooluniverse, + call_tool, + stream_callback=None, + emit_event=None, + memory_manager=None, +) -> Dict[str, Any]: + """Provision a Docker LLM container and register MCP configs.""" + + args = dict(arguments or {}) + + try: + result = provision_docker_llm( + image=args.get("docker_image"), + container_name=args.get("container_name"), + docker_cli=args.get("docker_cli", "docker"), + host=args.get("host", "127.0.0.1"), + host_port=int(args.get("host_port", 9000)), + container_port=int(args.get("container_port", 8000)), + env=args.get("env"), + volumes=args.get("volumes"), + extra_args=args.get("extra_args"), + tool_name=args.get("tool_name", "DockerLLMChat"), + tool_prefix=args.get("tool_prefix"), + mcp_tool_name=args.get("mcp_tool_name", "docker_llm_chat"), + health_path=args.get("health_path", "/health"), + timeout_seconds=int(args.get("timeout_seconds", 120)), + poll_interval=float(args.get("poll_interval", 2.0)), + reuse_container=bool(args.get("reuse_container", True)), + server_url=args.get("server_url"), + description=args.get("description"), + ) + except ProvisionError as exc: + return {"ok": False, "error": str(exc)} + + load_error: Optional[str] = None + if tooluniverse is not None: + try: + # Reload tool registry so the new MCP configs are visible immediately. + tooluniverse.load_tools() + except Exception as exc: # pragma: no cover - defensive + load_error = str(exc) + + payload = { + "ok": True, + "container_name": result.container_name, + "server_url": result.server_url, + "config_path": str(result.config_path), + "tool_name": result.tool_name, + } + if load_error: + payload["load_warning"] = load_error + + return payload + + +__all__ = ["compose"] diff --git a/src/tooluniverse/compose_scripts/harvest_auto_registrar.py b/src/tooluniverse/compose_scripts/harvest_auto_registrar.py new file mode 100644 index 00000000..6e195564 --- /dev/null +++ b/src/tooluniverse/compose_scripts/harvest_auto_registrar.py @@ -0,0 +1,182 @@ +""" +HarvestAutoRegistrar compose script. + +Coordinates the GenericHarvestTool, HarvestCandidateTesterTool, and +VerifiedSourceRegisterTool to discover, validate, and register new +DynamicREST tools. 
Designed to keep all orchestration logic inside the +ComposeTool framework so agents can call a single tool to go from query +to a runnable verified-source entry. +""" + +from __future__ import annotations + +import json +import re +import uuid +from typing import Any, Dict, List, Optional, Tuple + + +class _ComposeError(Exception): + """Internal marker so we can bubble failures cleanly.""" + + +def _as_dict(result: Any) -> Dict[str, Any]: + if isinstance(result, dict): + return result + if isinstance(result, str): + try: + decoded = json.loads(result) + if isinstance(decoded, dict): + return decoded + except json.JSONDecodeError: + pass + return {"raw_result": result} + + +def _emit(emit_event, event_type: str, data: Dict[str, Any]) -> None: + if emit_event: + emit_event(event_type, data) + + +def _generate_tool_name(base: Optional[str], suffix: str | None = None) -> str: + if base: + slug = re.sub(r"[^a-zA-Z0-9]+", "_", base).strip("_").lower() + if not slug: + slug = "vsd_auto" + else: + slug = "vsd_auto" + suffix = suffix or uuid.uuid4().hex[:6] + return f"{slug}_{suffix}" + + +def _select_candidates(arguments: Dict[str, Any]) -> Tuple[Optional[str], Dict[str, Any]]: + # Backwards compatibility: allow top-level query/limit keys + harvest_overrides = dict(arguments.get("harvest", {}) or {}) + query = (arguments.get("query") or harvest_overrides.get("query") or "").strip() + + if query: + harvest_overrides.setdefault("query", query) + + limit = arguments.get("limit") + if limit is None: + limit = harvest_overrides.get("limit", 5) + harvest_overrides["limit"] = max(1, min(int(limit or 5), 50)) + + return query, harvest_overrides + + +def compose( + arguments: Dict[str, Any], + tooluniverse, + call_tool, + stream_callback=None, + emit_event=None, + memory_manager=None, +) -> Dict[str, Any]: + """ + Discover, test, and register a new verified-source tool from harvest results. + """ + + args = dict(arguments or {}) + manual_candidates = args.get("candidates") + + # Prepare harvest step arguments (even if we skip calling the tool) + _, harvest_args = _select_candidates(args) + + results: Dict[str, Any] = { + "ok": False, + "attempts": [], + "registered_tool_name": None, + "run_result": None, + } + + # Step 1: gather candidates + if manual_candidates: + candidates = list(manual_candidates) + harvest_summary = { + "ok": True, + "source": "manual", + "count": len(candidates), + "query": harvest_args.get("query", ""), + } + else: + harvest_response = call_tool("GenericHarvestTool", harvest_args) + harvest_summary = _as_dict(harvest_response) + candidates = list(harvest_summary.get("candidates") or []) + + results["harvest"] = harvest_summary + _emit(emit_event, "harvest_completed", harvest_summary) + + if not candidates: + results["error"] = "No candidates returned from harvest." 
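+        # The harvest summary stored above travels back with the error so
+        # callers can inspect what the empty discovery step actually returned.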
+ return results + + skip_tests = bool(args.get("skip_tests")) + force_register = bool(args.get("force_register") or args.get("force")) + tester_overrides = dict(args.get("tester", {}) or {}) + register_overrides = dict(args.get("register", {}) or {}) + desired_tool_name = args.get("tool_name") + auto_run = bool(args.get("auto_run")) + tool_arguments = args.get("tool_arguments") or {} + + for index, candidate in enumerate(candidates): + attempt_record: Dict[str, Any] = {"candidate_index": index, "candidate": candidate} + tester_result = {"skipped": skip_tests} + + if not skip_tests: + tester_payload = dict(tester_overrides) + tester_payload.setdefault("candidate", candidate) + tester_response = call_tool("HarvestCandidateTesterTool", tester_payload) + tester_result = _as_dict(tester_response) + attempt_record["tester"] = tester_result + if not tester_result.get("ok") and not force_register: + attempt_record["status"] = "tester_failed" + results["attempts"].append(attempt_record) + continue + + register_payload = dict(register_overrides) + register_payload.setdefault("candidate", candidate) + register_payload.setdefault("force", force_register) + + tool_name = register_payload.get("tool_name") or desired_tool_name + if not tool_name: + host = (candidate.get("host") or candidate.get("name") or "").strip() + tool_name = _generate_tool_name(host, suffix=f"cand{index+1}") + register_payload["tool_name"] = tool_name + + register_response = call_tool("VerifiedSourceRegisterTool", register_payload) + register_result = _as_dict(register_response) + attempt_record["register"] = register_result + + if not register_result.get("registered"): + attempt_record["status"] = "registration_failed" + results["attempts"].append(attempt_record) + continue + + # Registration succeeded + registered_name = register_result.get("name") or tool_name + results["ok"] = True + results["registered_tool_name"] = registered_name + results["registration"] = register_result + attempt_record["status"] = "registered" + results["attempts"].append(attempt_record) + _emit(emit_event, "registration_success", register_result) + + if auto_run: + try: + run_payload = { + "name": registered_name, + "arguments": tool_arguments if isinstance(tool_arguments, dict) else {}, + } + run_result = tooluniverse.run_one_function(run_payload) + results["run_result"] = run_result + except Exception as exc: # pragma: no cover - defensive + results["run_error"] = str(exc) + return results + + results["error"] = "All candidates failed testing or registration." + _emit(emit_event, "registration_failed", {"attempts": results["attempts"]}) + return results + + +__all__ = ["compose"] diff --git a/src/tooluniverse/data/compose_tools.json b/src/tooluniverse/data/compose_tools.json index 5af4f882..169f983e 100644 --- a/src/tooluniverse/data/compose_tools.json +++ b/src/tooluniverse/data/compose_tools.json @@ -314,5 +314,183 @@ ], "composition_file": "tool_graph_generation.py", "composition_function": "compose" + }, + { + "type": "ComposeTool", + "name": "HarvestAutoRegistrar", + "description": "Discover, validate, and register new verified-source tools by chaining GenericHarvestTool, HarvestCandidateTesterTool, and VerifiedSourceRegisterTool. Optionally runs the newly registered tool.", + "parameter": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query passed to GenericHarvestTool when no explicit candidates are provided." 
+ }, + "limit": { + "type": "integer", + "description": "Maximum number of harvest candidates to inspect.", + "default": 5, + "minimum": 1, + "maximum": 50 + }, + "harvest": { + "type": "object", + "description": "Additional arguments forwarded to GenericHarvestTool (e.g., urls, description, domain filters)." + }, + "candidates": { + "type": "array", + "description": "Optional pre-harvested candidates to evaluate instead of calling GenericHarvestTool.", + "items": { + "type": "object" + } + }, + "tester": { + "type": "object", + "description": "Overrides passed to HarvestCandidateTesterTool (e.g., default headers or params)." + }, + "register": { + "type": "object", + "description": "Overrides passed to VerifiedSourceRegisterTool (e.g., default headers, params, tool_type)." + }, + "tool_name": { + "type": "string", + "description": "Desired name for the registered tool. Automatically generated if omitted." + }, + "force_register": { + "type": "boolean", + "description": "Register the tool even if validation fails (mirrors the force flag on VerifiedSourceRegisterTool).", + "default": false + }, + "force": { + "type": "boolean", + "description": "Alias for force_register.", + "default": false + }, + "skip_tests": { + "type": "boolean", + "description": "Skip HarvestCandidateTesterTool and proceed straight to registration.", + "default": false + }, + "auto_run": { + "type": "boolean", + "description": "Invoke the registered tool immediately after successful registration.", + "default": false + }, + "tool_arguments": { + "type": "object", + "description": "Arguments forwarded to the registered tool when auto_run is true.", + "default": {} + } + } + }, + "auto_load_dependencies": true, + "fail_on_missing_tools": false, + "required_tools": [ + "GenericHarvestTool", + "HarvestCandidateTesterTool", + "VerifiedSourceRegisterTool" + ], + "composition_file": "harvest_auto_registrar.py", + "composition_function": "compose" + }, + { + "type": "ComposeTool", + "name": "DockerLLMProvisioner", + "description": "Launch a Docker-hosted LLM MCP server, wait for readiness, and register client/auto-loader configurations so the model appears inside ToolUniverse.", + "parameter": { + "type": "object", + "properties": { + "docker_image": { + "type": "string", + "description": "Docker image to run (default ghcr.io/tooluniverse/docker-llm-mcp:latest)." + }, + "container_name": { + "type": "string", + "description": "Optional container name. Generated automatically when omitted." + }, + "docker_cli": { + "type": "string", + "description": "Docker executable to invoke (default docker).", + "default": "docker" + }, + "host": { + "type": "string", + "description": "Host interface to bind (default 127.0.0.1).", + "default": "127.0.0.1" + }, + "host_port": { + "type": "integer", + "description": "Port exposed on the host for MCP traffic (default 9000).", + "default": 9000 + }, + "container_port": { + "type": "integer", + "description": "Internal container port that serves MCP (default 8000).", + "default": 8000 + }, + "env": { + "type": "object", + "description": "Environment variables to pass to docker run." + }, + "volumes": { + "type": "array", + "description": "Volume mappings for docker run (e.g. 
['/host/path:/container/path']).", + "items": { "type": "string" } + }, + "extra_args": { + "type": "array", + "description": "Additional arguments appended to docker run.", + "items": { "type": "string" } + }, + "tool_name": { + "type": "string", + "description": "Tool name registered within ToolUniverse (default DockerLLMChat).", + "default": "DockerLLMChat" + }, + "tool_prefix": { + "type": "string", + "description": "Prefix applied when auto-registering tools from the MCP server." + }, + "mcp_tool_name": { + "type": "string", + "description": "Underlying MCP tool name exposed by the server (default docker_llm_chat).", + "default": "docker_llm_chat" + }, + "health_path": { + "type": "string", + "description": "Health-check path polled for readiness (default /health).", + "default": "/health" + }, + "timeout_seconds": { + "type": "integer", + "description": "Seconds to wait for the container to become healthy.", + "default": 120 + }, + "poll_interval": { + "type": "number", + "description": "Seconds between health checks during provisioning.", + "default": 2.0 + }, + "reuse_container": { + "type": "boolean", + "description": "Reuse and restart an existing container if present (default true).", + "default": true + }, + "server_url": { + "type": "string", + "description": "Override MCP base URL if different from http://host:host_port." + }, + "description": { + "type": "string", + "description": "Custom description applied to the registered tool." + } + }, + "additionalProperties": false + }, + "auto_load_dependencies": false, + "fail_on_missing_tools": false, + "required_tools": [], + "composition_file": "docker_llm_provisioner.py", + "composition_function": "compose" } ] diff --git a/src/tooluniverse/execute_function.py b/src/tooluniverse/execute_function.py index 3469dd92..0847056a 100755 --- a/src/tooluniverse/execute_function.py +++ b/src/tooluniverse/execute_function.py @@ -1421,21 +1421,31 @@ def _scan_predefined_files(self): all_tools.extend(tools_in_category) all_tool_names.update([tool["name"] for tool in tools_in_category]) - # Also include remote tools + # Also include remote tools from package data and user overrides + def _include_remote_dir(base_dir): + if not os.path.isdir(base_dir): + return + for fname in os.listdir(base_dir): + if not fname.lower().endswith(".json"): + continue + fpath = os.path.join(base_dir, fname) + remote_tools = self._read_tools_from_file(fpath) + if remote_tools: + all_tools.extend(remote_tools) + all_tool_names.update([tool["name"] for tool in remote_tools]) + try: remote_dir = os.path.join(current_dir, "data", "remote_tools") - if os.path.isdir(remote_dir): - for fname in os.listdir(remote_dir): - if not fname.lower().endswith(".json"): - continue - fpath = os.path.join(remote_dir, fname) - remote_tools = self._read_tools_from_file(fpath) - if remote_tools: - all_tools.extend(remote_tools) - all_tool_names.update([tool["name"] for tool in remote_tools]) + _include_remote_dir(remote_dir) except Exception as e: warning(f"Warning: Failed to scan remote tools directory: {e}") + try: + user_remote_dir = os.path.join(str(Path.home()), ".tooluniverse", "remote_tools") + _include_remote_dir(user_remote_dir) + except Exception as e: + warning(f"Warning: Failed to scan user remote tools directory: {e}") + return all_tools, all_tool_names def _scan_all_json_files(self): @@ -1463,6 +1473,13 @@ def _scan_all_json_files(self): if file.lower().endswith(".json"): json_files.append(os.path.join(root, file)) + # Include user-level remote tool configurations + 
user_remote_dir = os.path.join(str(Path.home()), ".tooluniverse", "remote_tools") + if os.path.isdir(user_remote_dir): + for file in os.listdir(user_remote_dir): + if file.lower().endswith(".json"): + json_files.append(os.path.join(user_remote_dir, file)) + self.logger.debug(f"Found {len(json_files)} JSON files to scan") # Read tools from each JSON file using the common method diff --git a/src/tooluniverse/remote/docker_llm/provision.py b/src/tooluniverse/remote/docker_llm/provision.py new file mode 100644 index 00000000..64f7ba25 --- /dev/null +++ b/src/tooluniverse/remote/docker_llm/provision.py @@ -0,0 +1,256 @@ +""" +Docker-hosted LLM provisioning helpers for ToolUniverse. + +These helpers start (or reuse) a Docker container that exposes an MCP-compatible +LLM service, wait for it to become healthy, and register client/auto-loader +configurations in the user's ToolUniverse remote tool directory. +""" + +from __future__ import annotations + +import json +import subprocess +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Sequence + +import requests + +DEFAULT_IMAGE = "ghcr.io/tooluniverse/docker-llm-mcp:latest" +DEFAULT_CONTAINER_BASENAME = "tooluniverse-llm" +DEFAULT_INTERNAL_PORT = 8000 +DEFAULT_TOOL_NAME = "DockerLLMChat" +DEFAULT_MCP_TOOL_NAME = "docker_llm_chat" +DEFAULT_HEALTH_PATH = "/health" + + +class ProvisionError(RuntimeError): + """Raised when Docker provisioning fails.""" + + +@dataclass +class ProvisionResult: + container_name: str + server_url: str + config_path: Path + tool_name: str + + +def _ensure_remote_dir() -> Path: + target = Path.home() / ".tooluniverse" / "remote_tools" + target.mkdir(parents=True, exist_ok=True) + return target + + +def _run_docker( + args: Sequence[str], *, docker_cli: str = "docker", check: bool = True +) -> subprocess.CompletedProcess: + command = [docker_cli, *args] + return subprocess.run( + command, + check=check, + capture_output=True, + text=True, + ) + + +def _container_exists(container_name: str, docker_cli: str) -> bool: + proc = _run_docker( + ["ps", "-a", "--filter", f"name=^{container_name}$", "--format", "{{.Names}}"], + docker_cli=docker_cli, + check=True, + ) + return any(line.strip() == container_name for line in proc.stdout.splitlines()) + + +def _container_running(container_name: str, docker_cli: str) -> bool: + proc = _run_docker( + ["ps", "--filter", f"name=^{container_name}$", "--format", "{{.Names}}"], + docker_cli=docker_cli, + check=True, + ) + return any(line.strip() == container_name for line in proc.stdout.splitlines()) + + +def _start_existing(container_name: str, docker_cli: str) -> None: + _run_docker(["start", container_name], docker_cli=docker_cli, check=True) + + +def _run_new_container( + *, + docker_cli: str, + image: str, + container_name: str, + host: str, + host_port: int, + container_port: int, + env: Optional[Dict[str, str]], + volumes: Optional[List[str]], + extra_args: Optional[List[str]], +) -> None: + cmd: List[str] = [ + "run", + "-d", + "--name", + container_name, + "-p", + f"{host}:{host_port}:{container_port}", + ] + + for key, value in (env or {}).items(): + cmd.extend(["-e", f"{key}={value}"]) + + for volume in volumes or []: + cmd.extend(["-v", volume]) + + if extra_args: + cmd.extend(extra_args) + + cmd.append(image) + _run_docker(cmd, docker_cli=docker_cli, check=True) + + +def _wait_for_health( + url: str, + *, + timeout: int, + interval: float, +) -> None: + deadline = time.time() + timeout + while time.time() < deadline: 
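+        # Any 2xx/3xx response counts as healthy; connection errors are
+        # swallowed and the probe is retried until the deadline expires.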
+ try: + response = requests.get(url, timeout=5) + if 200 <= response.status_code < 400: + return + except requests.RequestException: + pass + time.sleep(interval) + raise ProvisionError(f"Container health check did not succeed at {url}") + + +def _write_remote_config( + config: List[Dict[str, object]], *, tool_name: str +) -> Path: + target_dir = _ensure_remote_dir() + path = target_dir / f"{tool_name}.json" + with path.open("w", encoding="utf-8") as handle: + json.dump(config, handle, indent=2) + return path + + +def provision_docker_llm( + image: str = DEFAULT_IMAGE, + *, + container_name: Optional[str] = None, + docker_cli: str = "docker", + host: str = "127.0.0.1", + host_port: int = 9000, + container_port: int = DEFAULT_INTERNAL_PORT, + env: Optional[Dict[str, str]] = None, + volumes: Optional[List[str]] = None, + extra_args: Optional[List[str]] = None, + tool_name: str = DEFAULT_TOOL_NAME, + tool_prefix: Optional[str] = None, + mcp_tool_name: str = DEFAULT_MCP_TOOL_NAME, + health_path: str = DEFAULT_HEALTH_PATH, + timeout_seconds: int = 120, + poll_interval: float = 2.0, + reuse_container: bool = True, + server_url: Optional[str] = None, + description: Optional[str] = None, +) -> ProvisionResult: + """ + Ensure a Docker-hosted LLM is running and registered with ToolUniverse. + """ + container_name = container_name or f"{DEFAULT_CONTAINER_BASENAME}-{int(time.time())}" + tool_prefix = tool_prefix or (tool_name.lower() + "_") + if not tool_prefix.endswith("_"): + tool_prefix += "_" + + # Verify Docker availability + try: + _run_docker(["version"], docker_cli=docker_cli, check=True) + except FileNotFoundError as exc: + raise ProvisionError("Docker CLI not found. Please install Docker Desktop.") from exc + except subprocess.CalledProcessError as exc: + raise ProvisionError(f"Docker is not available: {exc.stderr.strip()}") from exc + + exists = _container_exists(container_name, docker_cli) + running = _container_running(container_name, docker_cli) if exists else False + + if exists and not running and reuse_container: + _start_existing(container_name, docker_cli) + running = True + + if not exists or (exists and not running and not reuse_container): + if exists and not reuse_container: + _run_docker(["rm", "-f", container_name], docker_cli=docker_cli, check=True) + _run_new_container( + docker_cli=docker_cli, + image=image, + container_name=container_name, + host=host, + host_port=host_port, + container_port=container_port, + env=env, + volumes=volumes, + extra_args=extra_args, + ) + + base_url = server_url or f"http://{host}:{host_port}" + health_url = base_url.rstrip("/") + health_path + _wait_for_health(health_url, timeout=timeout_seconds, interval=poll_interval) + + config_description = ( + description + or "Interact with a locally hosted Docker LLM via MCP-compatible interface." 
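+        # This string is applied verbatim to the MCPClientTool entry built below.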
+ ) + + remote_config = [ + { + "name": f"{tool_prefix.rstrip('_')}_auto_loader", + "description": f"Automatically discover tools from the Docker-hosted LLM server at {base_url}.", + "type": "MCPAutoLoaderTool", + "server_url": f"{base_url.rstrip('/')}/mcp", + "tool_prefix": tool_prefix, + }, + { + "name": tool_name, + "description": config_description, + "type": "MCPClientTool", + "server_url": base_url, + "transport": "http", + "mcp_tool_name": mcp_tool_name, + "parameter": { + "type": "object", + "properties": { + "prompt": { + "type": "string", + "description": "Prompt text to send to the Docker-hosted language model.", + }, + "temperature": { + "type": "number", + "description": "Sampling temperature for the model.", + "default": 0.7, + }, + "max_tokens": { + "type": "integer", + "description": "Maximum tokens to generate in the response.", + "default": 512, + }, + }, + "required": ["prompt"], + "additionalProperties": True, + }, + }, + ] + + config_path = _write_remote_config(remote_config, tool_name=tool_name) + + return ProvisionResult( + container_name=container_name, + server_url=base_url.rstrip("/"), + config_path=config_path, + tool_name=tool_name, + ) diff --git a/src/tooluniverse/remote_tool.py b/src/tooluniverse/remote_tool.py index 9c656f6e..17f1219b 100644 --- a/src/tooluniverse/remote_tool.py +++ b/src/tooluniverse/remote_tool.py @@ -51,6 +51,14 @@ def run(self, arguments=None): server_type = self.remote_info.get("server_type", "Unknown") original_type = self.remote_info.get("original_type", "Unknown") + guidance = ( + "This tool is registered as a remote MCP endpoint and cannot run locally. " + "If you want to activate it, connect ToolUniverse to the hosting server (e.g. " + "call `ToolUniverse.load_mcp_tools([...])` with the server URL) or provision a local " + "container via the `DockerLLMProvisioner` compose tool / `scripts/provision_docker_llm.py`. " + "Custom MCP configs placed in ~/.tooluniverse/remote_tools/*.json are picked up automatically." + ) + return { "error": "Remote tool not available for local execution", "tool_name": ( @@ -61,7 +69,7 @@ def run(self, arguments=None): "tool_type": "RemoteTool", "original_type": original_type, "server_type": server_type, - "message": "This tool is hosted on an external MCP/SMCP server and cannot be executed locally. Please use the external server directly.", + "message": guidance, "remote_info": self.remote_info, } diff --git a/src/tooluniverse/tools/DockerLLMProvisioner.py b/src/tooluniverse/tools/DockerLLMProvisioner.py new file mode 100644 index 00000000..df8d8eda --- /dev/null +++ b/src/tooluniverse/tools/DockerLLMProvisioner.py @@ -0,0 +1,69 @@ +""" +DockerLLMProvisioner + +Compose wrapper that provisions a Docker-hosted LLM MCP server and registers +its ToolUniverse configurations. 
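+
+Example (a minimal sketch; the port is illustrative and a running Docker
+daemon is assumed):
+
+    from tooluniverse.tools import DockerLLMProvisioner
+
+    result = DockerLLMProvisioner(host_port=9010, reuse_container=True)
+    print(result)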
+""" + +from typing import Any, Dict, Optional, Callable +from ._shared_client import get_shared_client + + +def DockerLLMProvisioner( + *, + docker_image: Optional[str] = None, + container_name: Optional[str] = None, + docker_cli: str = "docker", + host: str = "127.0.0.1", + host_port: int = 9000, + container_port: int = 8000, + env: Optional[Dict[str, str]] = None, + volumes: Optional[list[str]] = None, + extra_args: Optional[list[str]] = None, + tool_name: str = "DockerLLMChat", + tool_prefix: Optional[str] = None, + mcp_tool_name: str = "docker_llm_chat", + health_path: str = "/health", + timeout_seconds: int = 120, + poll_interval: float = 2.0, + reuse_container: bool = True, + server_url: Optional[str] = None, + description: Optional[str] = None, + stream_callback: Optional[Callable[[str], None]] = None, + use_cache: bool = False, + validate: bool = True, +) -> Any: + """ + Provision a Docker-hosted LLM and register MCP configs with ToolUniverse. + """ + + arguments: Dict[str, Any] = { + "docker_image": docker_image, + "container_name": container_name, + "docker_cli": docker_cli, + "host": host, + "host_port": host_port, + "container_port": container_port, + "env": env, + "volumes": volumes, + "extra_args": extra_args, + "tool_name": tool_name, + "tool_prefix": tool_prefix, + "mcp_tool_name": mcp_tool_name, + "health_path": health_path, + "timeout_seconds": timeout_seconds, + "poll_interval": poll_interval, + "reuse_container": reuse_container, + "server_url": server_url, + "description": description, + } + + return get_shared_client().run_one_function( + {"name": "DockerLLMProvisioner", "arguments": arguments}, + stream_callback=stream_callback, + use_cache=use_cache, + validate=validate, + ) + + +__all__ = ["DockerLLMProvisioner"] diff --git a/src/tooluniverse/tools/HarvestAutoRegistrar.py b/src/tooluniverse/tools/HarvestAutoRegistrar.py new file mode 100644 index 00000000..706b5a74 --- /dev/null +++ b/src/tooluniverse/tools/HarvestAutoRegistrar.py @@ -0,0 +1,91 @@ +""" +HarvestAutoRegistrar + +Compose workflow that harvests candidate APIs, validates them, and registers a +new verified-source tool. Optionally executes the registered tool immediately. +""" + +from typing import Any, Dict, List, Optional, Callable +from ._shared_client import get_shared_client + + +def HarvestAutoRegistrar( + query: Optional[str] = None, + limit: int = 5, + *, + harvest: Optional[Dict[str, Any]] = None, + candidates: Optional[List[Dict[str, Any]]] = None, + tester: Optional[Dict[str, Any]] = None, + register: Optional[Dict[str, Any]] = None, + tool_name: Optional[str] = None, + force_register: bool = False, + force: bool = False, + skip_tests: bool = False, + auto_run: bool = False, + tool_arguments: Optional[Dict[str, Any]] = None, + stream_callback: Optional[Callable[[str], None]] = None, + use_cache: bool = False, + validate: bool = True, +) -> Any: + """ + Discover, validate, and register a verified-source tool in a single call. + + Parameters + ---------- + query : str, optional + Harvest query when candidates are not supplied directly. + limit : int, default 5 + Maximum number of harvest candidates to inspect. + harvest : dict, optional + Additional arguments forwarded to GenericHarvestTool. + candidates : list, optional + Precomputed candidate objects. Skips calling GenericHarvestTool when provided. + tester : dict, optional + Overrides forwarded to HarvestCandidateTesterTool. + register : dict, optional + Overrides forwarded to VerifiedSourceRegisterTool. 
+ tool_name : str, optional + Desired name for the registered tool. Auto-generated if omitted. + force_register : bool, default False + Register even when validation fails (mirrors VerifiedSourceRegisterTool.force). + force : bool, default False + Alias for force_register for convenience. + skip_tests : bool, default False + Bypass HarvestCandidateTesterTool and proceed straight to registration. + auto_run : bool, default False + Execute the registered tool immediately after a successful registration. + tool_arguments : dict, optional + Arguments forwarded to the registered tool when auto_run is True. + stream_callback : Callable, optional + Streaming callback handled by ToolUniverse shared client. + use_cache : bool, default False + Enable client-side caching. + validate : bool, default True + Validate payload before sending to ToolUniverse. + """ + payload = { + "name": "HarvestAutoRegistrar", + "arguments": { + "query": query, + "limit": limit, + "harvest": harvest, + "candidates": candidates, + "tester": tester, + "register": register, + "tool_name": tool_name, + "force_register": force_register or force, + "skip_tests": skip_tests, + "auto_run": auto_run, + "tool_arguments": tool_arguments or {}, + }, + } + + return get_shared_client().run_one_function( + payload, + stream_callback=stream_callback, + use_cache=use_cache, + validate=validate, + ) + + +__all__ = ["HarvestAutoRegistrar"] diff --git a/src/tooluniverse/tools/__init__.py b/src/tooluniverse/tools/__init__.py index 5d3e8b38..4d6e1e75 100644 --- a/src/tooluniverse/tools/__init__.py +++ b/src/tooluniverse/tools/__init__.py @@ -458,6 +458,8 @@ from .GO_search_terms import GO_search_terms from .GWAS_search_associations_by_gene import GWAS_search_associations_by_gene from .HAL_search_archive import HAL_search_archive +from .HarvestAutoRegistrar import HarvestAutoRegistrar +from .DockerLLMProvisioner import DockerLLMProvisioner from .HPA_get_biological_processes_by_gene import HPA_get_biological_processes_by_gene from .HPA_get_cancer_prognostics_by_gene import HPA_get_cancer_prognostics_by_gene from .HPA_get_comparative_expression_by_gene_and_cellline import ( @@ -1127,6 +1129,7 @@ "Crossref_search_works", "DBLP_search_publications", "DOAJ_search_articles", + "DockerLLMProvisioner", "DailyMed_get_spl_by_setid", "DailyMed_search_spls", "DataAnalysisValidityReviewer", @@ -1323,6 +1326,7 @@ "GO_search_terms", "GWAS_search_associations_by_gene", "HAL_search_archive", + "HarvestAutoRegistrar", "HPA_get_biological_processes_by_gene", "HPA_get_cancer_prognostics_by_gene", "HPA_get_comparative_expression_by_gene_and_cellline", diff --git a/tests/test_docker_llm_provision.py b/tests/test_docker_llm_provision.py new file mode 100644 index 00000000..0e313ac1 --- /dev/null +++ b/tests/test_docker_llm_provision.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import json +import subprocess +import tempfile +from pathlib import Path +from unittest import mock + +from tooluniverse.remote.docker_llm.provision import ( + ProvisionResult, + provision_docker_llm, +) +from tooluniverse.compose_scripts.docker_llm_provisioner import compose + + +def test_provision_creates_remote_config_and_runs_docker(): + temp_home = Path(tempfile.mkdtemp()) + commands = [] + + def fake_run(cmd, check, capture_output, text): + commands.append(cmd) + if cmd[0] == "docker" and cmd[1] == "version": + return subprocess.CompletedProcess(cmd, 0, "", "") + if cmd[0] == "docker" and cmd[1] == "ps": + return subprocess.CompletedProcess(cmd, 0, "", "") + if cmd[0] == 
"docker" and cmd[1] == "run": + return subprocess.CompletedProcess(cmd, 0, "", "") + raise AssertionError(f"Unexpected docker command: {cmd}") + + response = mock.Mock() + response.status_code = 200 + + with mock.patch("tooluniverse.remote.docker_llm.provision.Path.home", return_value=temp_home): + with mock.patch("tooluniverse.remote.docker_llm.provision.subprocess.run", side_effect=fake_run): + with mock.patch("tooluniverse.remote.docker_llm.provision.requests.get", return_value=response): + with mock.patch("tooluniverse.remote.docker_llm.provision.time.sleep"): + result = provision_docker_llm( + image="example/image:latest", + container_name="test-container", + host="127.0.0.1", + host_port=9100, + container_port=8000, + timeout_seconds=5, + poll_interval=0.01, + ) + + assert result.container_name == "test-container" + assert result.config_path.exists() + stored = json.loads(result.config_path.read_text(encoding="utf-8")) + assert isinstance(stored, list) + assert stored[1]["name"] == "DockerLLMChat" + assert ["docker", "run", "-d", "--name", "test-container", "-p", "127.0.0.1:9100:8000", "example/image:latest"] in commands + + +def test_compose_returns_payload_and_refreshes_tooluniverse(tmp_path): + config_path = tmp_path / "DockerLLMChat.json" + + def fake_provision(**kwargs): + config_path.write_text("[]", encoding="utf-8") + return ProvisionResult( + container_name="compose-container", + server_url="http://127.0.0.1:9000", + config_path=config_path, + tool_name="DockerLLMChat", + ) + + class DummyToolUniverse: + def __init__(self): + self.refreshed = False + + def load_tools(self): + self.refreshed = True + + dummy_tu = DummyToolUniverse() + + with mock.patch("tooluniverse.compose_scripts.docker_llm_provisioner.provision_docker_llm", side_effect=fake_provision): + result = compose({"host_port": 9005}, dummy_tu, call_tool=None) + + assert result["ok"] is True + assert result["container_name"] == "compose-container" + assert dummy_tu.refreshed is True diff --git a/tests/test_harvest_auto_registrar.py b/tests/test_harvest_auto_registrar.py new file mode 100644 index 00000000..d52a4b46 --- /dev/null +++ b/tests/test_harvest_auto_registrar.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tooluniverse.compose_scripts.harvest_auto_registrar import compose + + +class FakeToolUniverse: + def __init__(self) -> None: + self.invocations: list[Dict[str, Any]] = [] + + def run_one_function(self, payload: Dict[str, Any]) -> Dict[str, Any]: + self.invocations.append(payload) + return {"ok": True, "payload": payload} + + +def test_compose_registers_and_runs_single_candidate(): + tool_universe = FakeToolUniverse() + calls = {} + + def call_tool(name: str, payload: Dict[str, Any]): + calls.setdefault(name, []).append(payload) + if name == "HarvestCandidateTesterTool": + return {"ok": True, "test": {"status": 200}} + if name == "VerifiedSourceRegisterTool": + assert payload["tool_name"] == "my_registered_tool" + return {"registered": True, "name": payload["tool_name"], "config": {"endpoint": "https://example.com"}} + raise AssertionError(f"Unexpected tool call: {name}") + + candidate = {"name": "Example API", "host": "example.com", "endpoint": "https://example.com/api"} + + result = compose( + { + "candidates": [candidate], + "tool_name": "my_registered_tool", + "auto_run": True, + "tool_arguments": {"limit": 1}, + }, + tool_universe, + call_tool, + ) + + assert result["ok"] is True + assert result["registered_tool_name"] == "my_registered_tool" + assert 
result["registration"]["registered"] is True + assert tool_universe.invocations[0]["name"] == "my_registered_tool" + assert tool_universe.invocations[0]["arguments"] == {"limit": 1} + assert calls["HarvestCandidateTesterTool"][0]["candidate"] == candidate + + +def test_compose_generates_name_and_skips_failed_candidate(): + tool_universe = FakeToolUniverse() + register_tool_names = [] + + def call_tool(name: str, payload: Dict[str, Any]): + if name == "HarvestCandidateTesterTool": + ok = payload["candidate"]["host"] == "second.example.com" + return {"ok": ok, "test": {"status": 200 if ok else 500}} + if name == "VerifiedSourceRegisterTool": + register_tool_names.append(payload["tool_name"]) + return {"registered": True, "name": payload["tool_name"], "config": {}} + if name == "GenericHarvestTool": + return {"ok": True, "candidates": []} + raise AssertionError(f"Unexpected tool call: {name}") + + first = {"name": "Bad API", "host": "bad.example.com", "endpoint": "https://bad.example.com"} + second = {"name": "Good API", "host": "second.example.com", "endpoint": "https://second.example.com"} + + result = compose( + { + "candidates": [first, second], + "auto_run": False, + }, + tool_universe, + call_tool, + ) + + assert result["ok"] is True + assert result["registered_tool_name"] == register_tool_names[0] + assert register_tool_names[0].startswith("second_example_com_") + assert result["attempts"][0]["status"] == "tester_failed" + assert result["attempts"][1]["status"] == "registered" + assert tool_universe.invocations == [] From 30298f585e11119714f60928b89895b0516c141d Mon Sep 17 00:00:00 2001 From: SufianTA Date: Wed, 29 Oct 2025 21:07:20 -0700 Subject: [PATCH 6/8] Fix Tests --- tests/integration/test_coding_api_integration.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_coding_api_integration.py b/tests/integration/test_coding_api_integration.py index 09542387..d7087d5e 100644 --- a/tests/integration/test_coding_api_integration.py +++ b/tests/integration/test_coding_api_integration.py @@ -13,8 +13,11 @@ from pathlib import Path import pytest -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) +# Add repository src/ directory to path so we import the checked-in package +REPO_ROOT = Path(__file__).resolve().parents[2] +SRC_DIR = REPO_ROOT / "src" +if str(SRC_DIR) not in sys.path: + sys.path.insert(0, str(SRC_DIR)) from tooluniverse import ToolUniverse # noqa: E402 from tooluniverse.generate_tools import main as generate_tools # noqa: E402 From 266b235f09aefc3bcbd41167df94c3c7ad58c660 Mon Sep 17 00:00:00 2001 From: SufianTA Date: Wed, 29 Oct 2025 21:15:04 -0700 Subject: [PATCH 7/8] Update Testing --- tests/integration/test_coding_api_integration.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/integration/test_coding_api_integration.py b/tests/integration/test_coding_api_integration.py index d7087d5e..4dc11722 100644 --- a/tests/integration/test_coding_api_integration.py +++ b/tests/integration/test_coding_api_integration.py @@ -19,6 +19,11 @@ if str(SRC_DIR) not in sys.path: sys.path.insert(0, str(SRC_DIR)) +# Ensure we import the repo copy even if another version is already loaded +for module_name in list(sys.modules.keys()): + if module_name == "tooluniverse" or module_name.startswith("tooluniverse."): + del sys.modules[module_name] + from tooluniverse import ToolUniverse # noqa: E402 from tooluniverse.generate_tools import main as generate_tools # noqa: E402 From 
cdef93e24c4fec04945af2ac6ef82821e044cdae Mon Sep 17 00:00:00 2001 From: SufianTA Date: Wed, 29 Oct 2025 21:36:51 -0700 Subject: [PATCH 8/8] Testing update --- tests/integration/test_coding_api_integration.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_coding_api_integration.py b/tests/integration/test_coding_api_integration.py index 4dc11722..9bc50bb5 100644 --- a/tests/integration/test_coding_api_integration.py +++ b/tests/integration/test_coding_api_integration.py @@ -20,9 +20,13 @@ sys.path.insert(0, str(SRC_DIR)) # Ensure we import the repo copy even if another version is already loaded -for module_name in list(sys.modules.keys()): - if module_name == "tooluniverse" or module_name.startswith("tooluniverse."): - del sys.modules[module_name] +package = sys.modules.get("tooluniverse") +if package is not None: + module_path = Path(getattr(package, "__file__", "")).resolve() + if SRC_DIR not in module_path.parents: + for module_name in list(sys.modules.keys()): + if module_name == "tooluniverse" or module_name.startswith("tooluniverse."): + del sys.modules[module_name] from tooluniverse import ToolUniverse # noqa: E402 from tooluniverse.generate_tools import main as generate_tools # noqa: E402
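The three test patches above converge on one import-isolation pattern: put the
repository's src/ directory at the front of sys.path, then purge cached
tooluniverse modules only when the already-imported copy resolves outside that
directory. A minimal standalone sketch of the same pattern; the helper name
ensure_repo_package is illustrative and not part of the patch:

    import sys
    from pathlib import Path

    def ensure_repo_package(repo_root: Path, package: str = "tooluniverse") -> None:
        """Import `package` from repo_root/src, evicting any stale installed copy."""
        src_dir = (repo_root / "src").resolve()
        if str(src_dir) not in sys.path:
            sys.path.insert(0, str(src_dir))
        loaded = sys.modules.get(package)
        if loaded is not None:
            module_path = Path(getattr(loaded, "__file__", "")).resolve()
            # Purge only when the cached copy lives outside the repo checkout,
            # mirroring the guard introduced in PATCH 8/8.
            if src_dir not in module_path.parents:
                for name in list(sys.modules):
                    if name == package or name.startswith(package + "."):
                        del sys.modules[name]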