diff --git a/.env.example b/.env.example index d65e0b71..4c9ef597 100644 --- a/.env.example +++ b/.env.example @@ -4,6 +4,10 @@ # Required: Anthropic API Key for Claude models ANTHROPIC_API_KEY=your_anthropic_api_key_here +# Required if DISGENET API use: +DISGENET_API_KEY= +DISGENET_API_SERVER_HOST=https://api.disgenet.com + # Optional: OpenAI API Key (if using OpenAI models) OPENAI_API_KEY=your_openai_api_key_here diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 03c1697a..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,27 +0,0 @@ -# MANIFEST.in - -# Include all python files from the biomni package -recursive-include biomni *.py - -# Include the .pkl database files -recursive-include biomni/tool/schema_db *.pkl - -# Include specific files from biomni_env, but not the biomni_tools subdirectory -recursive-include biomni_env *.py *.sh *.yml *.yaml *.txt *.md *.json *.R - -# Exclude specific directories that are not part of the package -prune tutorials/data -prune biomni_env/biomni_tools - -# Exclude build artifacts and caches -global-exclude *.py[co] -global-exclude __pycache__ -global-exclude .ruff_cache -prune build -prune dist -prune *.egg-info - -# Include other necessary files -include README.md -include LICENSE -include pyproject.toml diff --git a/biomni/agent/a1.py b/biomni/agent/a1.py index 7a62e59f..dabcc060 100644 --- a/biomni/agent/a1.py +++ b/biomni/agent/a1.py @@ -2,6 +2,7 @@ import inspect import os import re +import sys from collections.abc import Generator from datetime import datetime from pathlib import Path @@ -9,6 +10,11 @@ import pandas as pd from dotenv import load_dotenv + +if os.path.exists(".env"): + load_dotenv(".env", override=False) + print("Loaded environment variables from .env") + from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage from langchain_core.prompts import ChatPromptTemplate from langgraph.checkpoint.memory import MemorySaver @@ -43,10 +49,6 @@ textify_api_dict, ) -if 
def _disgenet_api_key_check(self) -> bool:
    """Check once per process whether ``DISGENET_API_KEY`` is available.

    Behavior:

    - If the environment variable holds a non-blank value, return ``True``
      immediately.
    - Otherwise print a warning (only the first time in this process).
    - If stdin is an interactive terminal, prompt for the key (only the
      first time in this process); a blank answer skips the prompt.

    The method never blocks when stdin is missing, closed, or not a TTY
    (e.g. Flask/gunicorn workers, CI jobs, daemons).

    Returns
    -------
    bool
        ``True`` if a key is present in the environment after the check
        (either pre-existing or just entered), ``False`` otherwise.
    """
    existing = os.getenv("DISGENET_API_KEY")
    # A whitespace-only value is as unusable as an unset one.
    if existing and existing.strip():
        return True

    if not A1._DISGENET_KEY_WARNED:
        print(
            "⚠️ DISGENET_API_KEY is not set. Please provide the DISGENET API KEY in the environment in order to get full access to DISGENET tools."
        )
        A1._DISGENET_KEY_WARNED = True

    # Never block in non-interactive contexts (e.g., Flask/gunicorn, CI).
    # sys.stdin may be None (no console attached) or already closed, in which
    # case isatty() raises instead of returning False.
    stdin = sys.stdin
    try:
        interactive = stdin is not None and stdin.isatty()
    except (AttributeError, ValueError):
        interactive = False
    if not interactive:
        return False

    # Avoid repeatedly prompting if the agent is re-initialized in a loop.
    if A1._DISGENET_KEY_PROMPTED:
        return False
    A1._DISGENET_KEY_PROMPTED = True

    try:
        from getpass import getpass

        entered = getpass("Enter DISGENET_API_KEY (leave blank to skip): ").strip()
    except Exception:
        # Deliberately best-effort: if prompting fails for any reason
        # (EOF, terminal errors, ...), keep the warning-only behavior.
        return False

    if not entered:
        return False

    os.environ["DISGENET_API_KEY"] = entered
    print("✅ DISGENET_API_KEY set for this process.")
    return True
SMILES annotations.", "evebio_assay_table.csv": "Assay metadata with one row per assay from EveBio pharmome mapping.", diff --git a/biomni/env_desc_cm.py b/biomni/env_desc_cm.py index 177920da..f031f9b7 100644 --- a/biomni/env_desc_cm.py +++ b/biomni/env_desc_cm.py @@ -19,7 +19,6 @@ # "ddinter_hormonal.csv": "Drug-drug interactions for systemic hormonal preparations from DDInter 2.0 database.", # CC BY-NC-SA 4.0 - Non-commercial only # "ddinter_respiratory.csv": "Drug-drug interactions for respiratory system drugs from DDInter 2.0 database.", # CC BY-NC-SA 4.0 - Non-commercial only # "ddinter_various.csv": "Drug-drug interactions for various drugs from DDInter 2.0 database.", # CC BY-NC-SA 4.0 - Non-commercial only - # "DisGeNET.parquet": "Gene-disease associations from multiple sources.", # CC BY-NC-SA 4.0 - Non-commercial only "dosage_growth_defect.parquet": "Gene dosage changes affecting growth.", # "enamine_cloud_library_smiles.pkl": "Compounds from Enamine REAL library with SMILES annotations.", # Proprietary - Requires license # "evebio_assay_table.csv": "Assay metadata with one row per assay from EveBio pharmome mapping.", # Proprietary - Requires permission diff --git a/biomni/llm.py b/biomni/llm.py index 1ca4e6a4..d92ab9fa 100644 --- a/biomni/llm.py +++ b/biomni/llm.py @@ -35,7 +35,7 @@ def get_llm( # Use config values for any unspecified parameters if config is not None: if model is None: - model = config.llm_model + model = config.llm if temperature is None: temperature = config.temperature if source is None: diff --git a/biomni/tool/database.py b/biomni/tool/database.py index fa55b1c6..4f7b0dcc 100644 --- a/biomni/tool/database.py +++ b/biomni/tool/database.py @@ -55,6 +55,7 @@ def _query_llm_for_api(prompt, schema, system_template): model = default_config.llm api_key = default_config.api_key + temperature = default_config.temperature except ImportError: model = "claude-3-5-haiku-20241022" api_key = None @@ -63,7 +64,8 @@ def _query_llm_for_api(prompt, 
schema, system_template): # Format the system prompt with schema if provided if schema is not None: schema_json = json.dumps(schema, indent=2) - system_prompt = system_template.format(schema=schema_json) + # Use string replacement instead of .format() to avoid issues with curly braces in JSON + system_prompt = system_template.replace("{schema}", schema_json) else: system_prompt = system_template @@ -71,9 +73,9 @@ def _query_llm_for_api(prompt, schema, system_template): try: from biomni.config import default_config - llm = get_llm(model=model, temperature=0.0, api_key=api_key, config=default_config) + llm = get_llm(model=model, temperature=temperature, api_key=api_key, config=default_config) except ImportError: - llm = get_llm(model=model, temperature=0.0, api_key=api_key or "EMPTY") + llm = get_llm(model=model, temperature=temperature, api_key=api_key or "EMPTY") # Compose messages messages = [ @@ -83,7 +85,23 @@ def _query_llm_for_api(prompt, schema, system_template): # Query the LLM response = llm.invoke(messages) - llm_text = response.content.strip() + content = response.content + + # Handle response.content - based on actual runtime type + # Test output shows: [{'type': 'text', 'text': '...', 'annotations': []}] + if isinstance(content, list): + # List of content blocks (OpenAI Responses API format) + text_parts = [] + for block in content: + if isinstance(block, dict) and "text" in block: + text_parts.append(block["text"]) + llm_text = " ".join(text_parts).strip() + elif isinstance(content, str): + # Direct string content (legacy format) + llm_text = content.strip() + else: + # Fallback: convert to string + llm_text = str(content).strip() # Find JSON boundaries (in case LLM adds explanations) json_start = llm_text.find("{") @@ -103,6 +121,9 @@ def _query_llm_for_api(prompt, schema, system_template): "success": False, "error": f"Failed to parse LLM response: {str(e)}", "raw_response": llm_text if "llm_text" in locals() else "No content found", + 
"raw_response_preview": llm_text[:500] + if "llm_text" in locals() and len(llm_text) > 500 + else (llm_text if "llm_text" in locals() else "No content found"), } except Exception as e: return {"success": False, "error": f"Error querying LLM: {str(e)}"} @@ -160,6 +181,7 @@ def _query_rest_api(endpoint, method="GET", params=None, headers=None, json_data "endpoint": endpoint, "method": method, "description": description, + "full_url": response.url, # QUITAR ANTES DE PEDIR MERGE A BIOMNI TEAM }, "result": result, } @@ -4972,3 +4994,419 @@ def query_encode( api_result["result"] = _format_query_results(api_result["result"]) return api_result + + +def query_disgenet_api( + prompt: str, + verbose: bool = False, +): + """Query the DISGENET API using natural language. + + This function provides intelligent access to the DISGENET database through a three-step process: + 1. LLM-based API resolution: Translates natural language to base API URL + 2. Entity normalization: Normalizes disease and gene names to DISGENET identifiers + 3. Final API call: Executes the query with normalized parameters + + NOTE: This function returns only ONE page of results (page_number=0, 100 items). To get all results, increment the page_number parameter until no more results are returned. + Example query: Find diseases associated with BRAF V157A... + + ================================================================================ + PROMPT: Find diseases associated with BRAF V157A; page_number=0 + ================================================================================ + + Page 0: fetched 100 results (cumulative: 100/613) + + If the user wants more results, increment the page_number parameter for 100 more results. 
+ For example, to get the next 100 results, query: + + ================================================================================ + PROMPT: Find diseases associated with BRAF V157A; page_number=1 + ================================================================================ + + Page 1: fetched 100 results (cumulative: 200/613) + + IMPORTANT: specify the order in which results are to be returned, ordered_by: DISGENET score, DSI, DPI, pLI, or publication year (pmYear) and additionally by polyphen, sift, odds_ratio, beta in case of VDA. + + Parameters + ---------- + prompt : str + Natural language query about diseases, genes, variants, GDA, VDA, DDA, or entity information queries. + Use a single, detailed query per call. Supports ordering results by score, DSI, DPI, pLI, or publication year (pmYear). Supports disease class queries. Supports filtering by a vast number of parameters to assess the strength, relevance, and confidence of gene-disease associations (GDAs) and variant-disease associations (VDAs). + Have in mind there are many parameters to filter by and order by. Ask the user to specify the parameters and the order by. + Examples: + - "Find genes associated with breast cancer, order by DISGENET score" + - "What diseases is BRCA1 associated with? 
order by publication date" + - "Show me variants related to Alzheimer's disease" + - "Get disease-disease associations for diabetes, order by DPI" + - "Get information on how to download the database" + verbose : bool, optional + Whether to return detailed results including normalization steps (default: False) + + Returns + ------- + dict + Dictionary containing keys: ['success', 'query_info', 'result', 'normalization', 'original_params', 'normalized_params'] + + - success: bool indicating if query succeeded + - query_info: Dictionary containing the API endpoint, method, and parameters + - result: Query results from DISGENET (single page=100 results), print results to see keys before parsing the results) + - normalization: Details about entity normalization + - original_params: Original parameters passed to the API + - normalized_params: Normalized parameters passed to the API + - error: Error message if failed + + IMPORTANT: Inspect result payload if verbose=True following the example below: + >>> query_result = query_disgenet_api( + ... "Find genes associated with Alzheimer's disease, order them by pmYear", verbose=True + ... ) + >>> result = query_result.get("result") if isinstance(query_result, dict) else None + >>> payload = result.get("payload") if isinstance(result, dict) else None + >>> Search the swagger documentation for the DTO (Data Transfer Object) and the field names to access the data. + """ + # Constants + DISGENET_API_BASE = os.getenv("DISGENET_API_SERVER_HOST", "https://api.disgenet.com") + SWAGGER_URL = f"{DISGENET_API_BASE}/v2/api-docs" + + # Validate inputs + if not prompt: + return {"error": "A natural language prompt is required"} + + # Validate API key + api_key = os.getenv("DISGENET_API_KEY") + if not api_key: + return {"error": "DISGENET_API_KEY environment variable not set. 
API key is required for DISGENET API access."} + + # STEP 1: API Resolution - Generate base URL from natural language prompt + # Fetch Swagger documentation + try: + swagger_response = requests.get(SWAGGER_URL, timeout=10) + swagger_response.raise_for_status() + disgenet_schema = swagger_response.json() + except Exception as e: + return {"error": f"Failed to fetch DISGENET Swagger documentation: {str(e)}"} + + # Create system prompt template + system_template = """ + You are an expert in translating natural language requests into DISGENET REST API calls. + + Here is the complete DISGENET API Swagger documentation: + {schema} + + Based on the user's natural language request, generate the appropriate API endpoint and parameters. + + IMPORTANT GUIDELINES: + 1. Read the DISGENET API Swagger documentation to understand the endpoints and parameters available. + 2. Main endpoints: + 2.1 gda: Get gene disease associations. + 2.1.1 /api/v1/gda/evidence: Get evidences that support gene disease associations. + 2.1.2 /api/v1/gda/summary: Get gene disease associations. + 2.2 vda: Get variant disease associations. + 2.2.1 /api/v1/vda/evidence: Get evidences that support variant disease associations. + 2.2.2 /api/v1/vda/summary: Get variant disease associations. + 2.3 dda: Get disease disease associations. + 2.4 entity: Genes, diseases, variants and publications + 2.4.1 /api/v1/entity/chemical: Get properties of chemicals(s). + 2.4.2 /api/v1/entity/disease: Get properties of disease(s). + 2.4.3 /api/v1/entity/gene: Get properties of gene(s). + 2.4.4 /api/v1/entity/publication: Get properties of publication(s). + 2.4.5 /api/v1/entity/variant: Get properties of variant(s). + 2.5 enrichment: Get DISGENET gene and variant enrichment analysis. + 2.5.1 /api/v1/enrichment/gene: Get gene enrichment analysis. + 2.5.2 /api/v1/enrichment/variant: Get variant enrichment analysis. + 2.6 embeddings: Get DISGENET embeddings for genes, diseases, variants and publications. 
+ 2.6.1 /api/v1/embeddings/normalize: Get normalization of diseases, genes and chemicals(s), by free text search, one or more, separated by '|', up to 100. Use minimum_similarity_threshold=0.8 + 2.7 management: Get DISGENET REST API management information. + 2.7.1 /api/v1/public/last_release: Get the statistical summaries for current version of the DISGENET database. + 2.7.2 /api/v1/public/version: Get the current version of DISGENET data. + + 3. Extract gene names and disease names and chemical names as they appear in natural language + 4. DO NOT convert names to IDs yet - return them as plain text (e.g., "BRCA1", "breast cancer"). If CUI id provided always write it with the UMLS_ prefix (e.g., "UMLS_C0006142"). + 5. Common parameters: gene, disease, dis_class_list, chemical_id, order_by, page_number + 6. Default returns page_number=0 if not specified by user (first 100 results). + 7. Return a JSON object with: + - "endpoint": The API path (e.g., "/api/v1/gda/summary") + - "method": HTTP method ("GET" or "POST") + - "params": Dictionary of query parameters with natural language values + 8. Read the DISGENET API Swagger documentation again to ensure the endpoint and parameters are correct. + + EXAMPLES: + Q: "genes associated with breast cancer, order by DPI" + A: {{"endpoint": "/api/v1/gda/summary", "method": "GET", "params": {{"disease": "breast cancer", "order_by": "dpi", "page_number": 0}}}} + Q: "associations between APP and cancer" + A: {{"endpoint": "/api/v1/gda/summary", "method": "GET", "params": {{"gene": "APP", "dis_class_list": "CO4", "order_by": "dpi", "page_number": 0}}}} + Q: "associations between TARDBP and neurodegenerative disorders" + A: {{"endpoint": "/api/v1/gda/summary", "method": "GET", "params": {{"gene": "TARDBP","dis_class_list": "F03, C10", "order_by": "dpi", "page_number": 0}}}} + Q: "clinical evidence linking TDP-43 and ALS?" 
+ A: {{"endpoint": "/api/v1/gda/summary", "method": "GET", "params": {{"gene": "TDP-43", "disease": "Amyotrophic lateral sclerosis", "page_number": 0}}}} + Q: "variants of FOXC2 gene" + A: {{"endpoint": "/api/v1/vda/summary", "method": "GET", "params": {{"gene": "FOXC2", "page_number": 0}}}} + Q: "more information about FOXC2 gene" + A: {{"endpoint": "/api/v1/vda/evidence", "method": "GET", "params": {{"gene": "FOXC2", "page_number": 0}}}} + Q: "normalize diseases schizophrenia, bipolar disorder, major depressive disorder" + A: {{"endpoint": "/api/v1/embeddings/normalize", "method": "GET", "params": {{"entity_type": "disease", "term_list": "schizophrenia|bipolar disorder|major depressive disorder"}}}} + Q: "more information about the database" + A: {{"endpoint": "/api/v1/public/last_release", "method": "GET", "params": {}}} + Return ONLY the JSON object with no additional text or explanations. + """ + + # Query LLM to generate the API call (passing None for schema since we already inserted it) + llm_result = _query_llm_for_api( + prompt=prompt, + schema=disgenet_schema, # Schema already inserted into template + system_template=system_template, + ) + + if not llm_result["success"]: + return llm_result + + # Get the endpoint and parameters from LLM's response + query_info = llm_result["data"] + endpoint = query_info.get("endpoint", "") + method = query_info.get("method", "GET") + params = query_info.get("params", {}) + + if not endpoint: + return { + "error": "Failed to generate a valid API endpoint from the prompt", + "llm_response": llm_result.get("raw_response", "No response"), + } + + # STEP 2: Entity Normalization - Normalize disease and gene names to DISGENET IDs + normalized_params = {} + normalization_log = {} + + for param_key, param_value in params.items(): + # Check if this parameter needs normalization + if param_key in ["disease"] and param_value: + # Normalize disease name to UMLS CUI + normalized_id = _normalize_disgenet_entity(param_value, "disease", 
DISGENET_API_BASE) + if normalized_id: + normalized_params[param_key] = normalized_id + normalization_log[param_key] = {"original": param_value, "normalized": normalized_id} + else: + # Fallback to original value + normalized_params[param_key] = param_value + normalization_log[param_key] = { + "original": param_value, + "normalized": None, + "note": "normalization failed, using original", + } + + elif param_key in ["gene"] and param_value: + # Normalize gene name to NCBI ID + normalized_id = _normalize_disgenet_entity(param_value, "gene", DISGENET_API_BASE) + if normalized_id: + # For genes, use the gene_ncbi_id parameter + normalized_params["gene_ncbi_id"] = normalized_id + normalization_log["gene"] = {"original": param_value, "normalized": f"gene_ncbi_id={normalized_id}"} + else: + # Fallback: use gene symbol + normalized_params["gene"] = param_value + normalization_log["gene"] = { + "original": param_value, + "normalized": None, + "note": "normalization failed, using original", + } + + elif param_key in ["variant"] and param_value: + # Variants typically don't need normalization + normalized_params[param_key] = param_value + normalization_log[param_key] = { + "original": param_value, + "normalized": param_value, + "note": "no normalization needed", + } + else: + # Pass through other parameters unchanged + normalized_params[param_key] = param_value + + # STEP 3: Execute the Final API Call with normalized parameters + full_url = f"{DISGENET_API_BASE}{endpoint}" + + # Print the generated endpoint for debugging + print(f"\n{'=' * 80}") + print(f"PROMPT: {prompt}") + print(f"API ENDPOINT: {endpoint}") + print(f"METHOD: {method}") + print(f"PARAMS: {params}") + print(f"NORMALIZATION: {normalization_log}") + + api_result = _query_rest_api( + endpoint=full_url, + method=method, + params=normalized_params, + headers={"Authorization": f"Bearer {api_key}"}, + description="DISGENET API query", + ) + + # Add metadata about normalization + if api_result.get("success"): + 
api_result["normalization"] = normalization_log + api_result["original_params"] = params + api_result["normalized_params"] = normalized_params + + # Format results if not verbose + if not verbose and "result" in api_result: + api_result["result"] = _format_query_results(api_result["result"]) + + # Print full URL for debugging (only if query_info exists) + if "query_info" in api_result: + full_url = api_result["query_info"].get("full_url", "API call failed") + print(f"FULL API URL: {full_url}") + print(f"{'=' * 80}\n") + + return api_result + + +def _normalize_disgenet_entity(term_list: str, entity_type: str, api_base: str) -> str | None: + """Normalize entity names to DISGENET identifiers using the DISGENET API embeddings/normalize endpoint. + + Parameters + ---------- + term_list : str + Expression to match (free text search, one or more, separated by '|', up to 100). + Examples: "breast cancer", "BRCA1", "rs123456", "term1|term2|term3" + entity_type : str + The type of entity ("disease", "gene", "chemical") + api_base : str + The DISGENET API base URL + + Returns + ------- + str | None + Normalized entity identifier(s): + - Single term: "UMLS_C0006142" or "672" or "chemical_id" + - Multiple terms: "UMLS_C0006142,UMLS_C0005586,UMLS_C0004352" (comma-separated) + - disease: "UMLS_C0006142" (extracted from normalizedId field) + - gene: "672" (NCBI ID) + - chemical: "chemical_id" (chemical ID) + Returns None if normalization fails + + Notes + ----- + - Requires DISGENET_API_KEY environment variable to be set + - For disease entities, ensures UMLS_ prefix is added if not present + - term_list supports multiple terms separated by '|' (pipe character), up to 100 terms + - When multiple terms are provided, all normalized IDs are returned as a comma-separated string + - Common separators (commas, semicolons) are automatically converted to pipes before API call + - The API returns multiple matches per term (similarity > 0.8), but only the first/best match is kept + - 
Example: 5 input terms may return 17 API results, but only 5 IDs are returned (one per term) + """ + # Validate API key + api_key = os.getenv("DISGENET_API_KEY") + if not api_key: + print("ERROR: DISGENET_API_KEY environment variable not set. API key is required for DISGENET API access.") + return None + + # Validate term_list format and count + if not term_list or not term_list.strip(): + print("ERROR: term_list cannot be empty") + return None + + # Normalize separators: convert commas and semicolons to pipes + # This allows users to pass terms in formats like "term1, term2" or "term1; term2" + term_list = term_list.replace(",", "|").replace(";", "|") + + # Clean up: remove extra whitespace and collapse multiple pipes + terms = [t.strip() for t in term_list.split("|") if t.strip()] + term_list = "|".join(terms) + + # Count terms and validate maximum + if len(terms) > 100: + print(f"ERROR: term_list contains {len(terms)} terms, but maximum is 100") + return None + + # Map entity types to API endpoints and response fields + endpoint = "/api/v1/embeddings/normalize" + + # Construct the full URL by combining api_base with endpoint + # Remove trailing slash from api_base if present, and ensure endpoint starts with / + api_base_clean = api_base.rstrip("/") + endpoint_clean = endpoint if endpoint.startswith("/") else f"/{endpoint}" + full_url = f"{api_base_clean}{endpoint_clean}" + + # Prepare query parameters including entity_type + # term_list is passed as-is (pipe-separated format) + query_params = {"entity_type": entity_type, "term_list": term_list} + + # Print normalization call information + print(f"\n{'~' * 80}") + print(f"NORMALIZING {entity_type.upper()}: '{term_list}'") + print(f"NORMALIZATION ENDPOINT: {endpoint}") + print(f"NORMALIZATION PARAMS: {query_params}") + + # Show the actual URL that will be called + from urllib.parse import quote, urlencode + + query_string = urlencode(query_params, quote_via=quote) + full_url_with_params = 
f"{full_url}?{query_string}" + print(f"FULL URL: {full_url_with_params}") + + try: + # Make request to entity endpoint + api_response = _query_rest_api( + endpoint=full_url, method="GET", params=query_params, headers={"Authorization": f"Bearer {api_key}"} + ) + + print(f"RESPONSE SUCCESS: {api_response.get('success', False)}") + + if api_response.get("success"): + # Extract the result data from the API response + data = api_response.get("result", {}) + + # The API returns a payload with results + if isinstance(data, dict) and "payload" in data: + payload = data["payload"] + print(f"PAYLOAD LENGTH: {len(payload) if payload else 0}") + + if payload and len(payload) > 0: + # Extract normalized IDs - keep only the first ID for each unique term + # The API returns multiple matches per term (similarity > 0.8), but we want only the best match + normalized_ids = [] + seen_terms = set() + + for _idx, result in enumerate(payload): + term = result.get("term", "") + similarity = result.get("similarity", 0.0) + extracted_value = result.get("normalizedId") + + # Only process the first occurrence of each term (highest similarity) + if term and term not in seen_terms: + seen_terms.add(term) + + if extracted_value: + # For disease, ensure UMLS_ prefix + if entity_type == "disease" and not str(extracted_value).startswith("UMLS_"): + normalized_value = f"UMLS_{extracted_value}" + else: + normalized_value = str(extracted_value) + + normalized_ids.append(normalized_value) + print(f"{entity_type}: {term}, Similarity={similarity}, normalizedId={extracted_value}") + else: + print(f" → WARNING: Could not extract value for term '{term}'") + + if normalized_ids: + # Join all normalized IDs with commas + final_result = ",".join(normalized_ids) + print(f"NORMALIZED VALUES ({len(normalized_ids)} total): {final_result}") + print(f"{'~' * 80}\n") + return final_result + else: + print(f"WARNING: Could not extract any values for entity type '{entity_type}'") + else: + print("WARNING: Empty 
payload") + else: + print("WARNING: Invalid response format - no payload") + print(f"RESPONSE DATA: {data}") + else: + error_msg = api_response.get("error", "Unknown error") + print(f"WARNING: API call failed - {error_msg}") + + print(f"{'~' * 80}\n") + return None + + except Exception as e: + print(f"ERROR: Failed to normalize {entity_type} '{term_list}': {str(e)}") + print(f"{'~' * 80}\n") + return None diff --git a/biomni/tool/literature.py b/biomni/tool/literature.py index e4b31b14..ab21dc17 100644 --- a/biomni/tool/literature.py +++ b/biomni/tool/literature.py @@ -401,3 +401,105 @@ def extract_pdf_content(url: str) -> str: return f"Error downloading PDF: {str(e)}" except Exception as e: return f"Error extracting text from PDF: {str(e)}" + + +def query_disgenet_evidence( + prompt: str, + verbose: bool = False, +): + """Query DISGENET for literature evidence supporting gene-disease or variant-disease associations. + + DISGENET is included in the literature tools category because it provides curated evidence from scientific literature sources that support associations between genes, variants, and diseases. + This function specifically targets the evidence endpoints of DISGENET which return detailed publication metadata, PubMed IDs or NCTIDs, and association types. + + WHY DISGENET IS A LITERATURE TOOL: + ---------------------------------- + DISGENET aggregates and curates data from multiple literature sources to establish gene-disease and variant-disease associations. Unlike typical databases that only store static data, DISGENET provides: + + 1. LITERATURE EVIDENCE: Each association is backed by evidence from scientific publications, + with PubMed IDs or NCTIDs, publication dates, and citation counts. + + 2. 
EVIDENCE SOURCES: Data is extracted from multiple literature sources including: + - PubMed abstracts (text mining), ClinicalTrials, ClinGen, Biobak, ClinVar, Curated, FinnGen, GenCC, GWASCat, HPO, Inferred, MGD_HUMAN, MGD_MOUSE, MODELS, ORPHANET, Phewascat, Psygenet, RGD_HUMAN, RGD_RAT, TEXTMINING_HUMAN, TEXTMINING_MODELS, UKBiobnk, UNIPORT + + 3. Association Types: Returns specific association types such as: + - Genetic variation associations from case-control studies + - Biomarker evidences from diagnostic studies + - Therapeutic evidences from clinical trials + - Altered expression evidences from genomic studies + + WHEN TO USE THIS FUNCTION: + ------------------------- + Use this function when you need to: + - Find published literature evidence for gene-disease associations + - Retrieve PubMed articles supporting specific genetic associations + - Get evidence details including sentence snippets from papers + - Explore the bibliographic basis of gene-disease or variant-disease links + - Order evidence by publication year (pmYear) to see latest research + - Access evidence metadata including source databases and confidence scores + + Examples of queries suitable for this function: + - "Find literature evidence linking BRCA1 to breast cancer" + - "Show me published studies about TP53 variants in cancer" + - "What publications support the association between APOE and Alzheimer's" + - "Get recent evidence papers for CFTR mutations in cystic fibrosis" + - "Find biomarker evidence for Parkinson's disease genes" + + The function automatically redirects to query_disgenet_api, which handles: + - Entity normalization (gene names → NCBI IDs, disease names → UMLS CUIs) + - API endpoint selection (/gda/evidence or /vda/evidence) + - Proper parameter formatting for evidence queries + + Parameters + ---------- + prompt : str + Use a single, detailed, natural language query per call about literature evidence for gene-disease or variant-disease associations. 
Supports ordering results by DISGENET score, DSI, DPI, pLI, or publication year (pmYear) and additionally by polyphen, sift, odds_ratio, beta in case of VDA. Supports disease class queries. Supports filtering by a vast number of parameters to assess the strength, relevance, and confidence of GDAs and VDAs. + The query should specify: + - The gene(s) or variant(s) of interest + - The disease(s) or phenotype(s) of interest + - Optional: ordering preference (e.g., "order by ...") + - Optional: parameters to filter by + + Examples: + - "Find evidence papers linking BRCA1 to ovarian cancer, order by pmYear" + - "Show clinical evidence for CFTR variants in cystic fibrosis" + - "Get biomarker evidence for APP gene in Alzheimer's disease" + + verbose : bool, optional + If True, returns detailed results including: + - Entity normalization steps + - Resolved API endpoint + - Full evidence metadata + Default is False for concise results. + + Returns + ------- + dict + Dictionary containing: + - success: bool indicating if query succeeded + - result: List of evidence entries with publication details including: + - PubMed IDs (pmid) or NCTIDs (nctId) + - Publication year (pmYear) + - Evidence sentence snippets + - Source databases + - Association scores + - Association type classifications + - normalization: Details about entity ID resolution (if verbose=True) + - error: Error message if query failed + + Notes + ----- + This function is a specialized wrapper around query_disgenet_api that emphasizes the literature/evidence aspects of DISGENET. It's designed to make DISGENET discoverable as a literature tool since its primary value is providing curated literature evidence for genetic associations. + + The DISGENET API returns paginated results (100 items per page). If you need more results, increment the page_number parameter in subsequent queries. 
+ + """ + from biomni.tool.database import query_disgenet_api + + # Enhance prompt to target evidence endpoints specifically + evidence_prompt = ( + f"{prompt}. Focus on retrieving evidence entries with publication details and PubMed IDs or NCTIDs or both." + ) + + # Call the main DISGENET API function + return query_disgenet_api(prompt=evidence_prompt, verbose=verbose) diff --git a/biomni/tool/tool_description/database.py b/biomni/tool/tool_description/database.py index 820eece7..f960390d 100644 --- a/biomni/tool/tool_description/database.py +++ b/biomni/tool/tool_description/database.py @@ -728,4 +728,38 @@ }, ], }, + { + "description": "Query the DISGENET biomedical knowledge graph and database using natural language. This tool provides comprehensive access to gene–disease, variant–disease, and disease–disease associations, enabling both evidence-based and mechanistic insights into genetic and disease relationships. It automatically handles entity normalization (disease → UMLS CUI, gene → NCBI Gene ID) and dynamically selects the appropriate DISGENET API endpoint based on the query intent. Typical use cases include identifying disease-associated genes, variants or diseases, exploring genetic pleiotropy and disease specificity, filter by parameters and scores to assess the consequences of a variant-disease association, assessing the mechanistic links between genes, variants, and disease phenotypes. The API supports ordering and filtering by multiple evidence and relevance metrics." 
+ " capabilities: " + "Gene–Disease Associations (GDA); " + "Variant–Disease Associations (VDA); " + "Disease–Disease Associations (DDA); " + "Entity and metadata information queries (gene, variant, or disease); " + "Mechanistic and causal inference support between genes and diseases; " + "Evidence-based ranking and prioritization of associations. " + "key_metrics (among others): " + "DISGENET Score: 0–1, overall evidence strength of an association; " + "Disease Specificity Index (DSI): 0–1, indicates how specific a gene/variant is to a single disease; " + "Disease Pleiotropy Index (DPI): 0–1, indicates how many diverse diseases a gene/variant is involved in; " + "Evidence Index (EI): 0–1, consensus level across publications (1 = full agreement); " + "pLI (Loss-of-function Intolerance): 0–1, gene intolerance to LoF mutations, relevant for drug safety. " + "data_outputs: Structured association tables containing gene, disease, and variant identifiers; evidence scores; PubMed references; and both DISGENET and literature-derived metrics. " + "query_guidelines: Use a single, detailed, natural language query per call. Supports ordering results by score, DSI, DPI, pLI, or publication year (pmYear). Supports disease class queries. 
Supports filtering by a vast number of parameters to assess the strength, relevance, and confidence of GDAs, VDAs and DDAs.", + "name": "query_disgenet_api", + "optional_parameters": [ + { + "name": "verbose", + "type": "bool", + "default": False, + "description": "If true, returns detailed results including normalization steps, resolved API endpoint, and full metadata.", + } + ], + "required_parameters": [ + { + "name": "prompt", + "type": "str", + "description": "Natural language query about genes, variants, or diseases (e.g., 'genes linked to Parkinson’s disease', 'Musculoskeletal Diseases (disease class) associated with TP53', 'mechanistic insights between APOE and Alzheimer’s', 'List high-confidence gene–disease pairs ordered by evidence index', 'Show variants implicated in Alzheimer’s disease').", + } + ], + }, ] diff --git a/biomni/tool/tool_description/literature.py b/biomni/tool/tool_description/literature.py index 39daa7c5..4d9d3d18 100644 --- a/biomni/tool/tool_description/literature.py +++ b/biomni/tool/tool_description/literature.py @@ -158,4 +158,39 @@ } ], }, + { + "description": "Query DISGENET for literature evidence supporting gene-disease or variant-disease associations. " + "DISGENET is included as a literature tool because it aggregates curated evidence from multiple literature sources " + "such as PubMed abstracts (text mining), ClinGen, Biobank, ClinicalTrials, ClinVar, Curated, FinnGen, GenCC, GWASCat, HPO, Inferred, MGD_HUMAN, MGD_MOUSE, MODELS, ORPHANET, Phewascat, Psygenet, RGD_HUMAN, RGD_RAT, TEXTMINING_HUMAN, TEXTMINING_MODELS, UKBiobank, UNIPROT. " + "DISGENET provides literature-backed evidence including PubMed IDs or NCTIDs, publication years, sentence snippets from papers, " + "and association type classifications. 
" + "Use this when you need: " + "(1) Published literature evidence for gene/variant-disease associations, " + "(2) Evidence details with publication metadata, " + "(3) Filter evidence by source database, association type, publication year, scores, etc. " + "(4) Order evidence by publication year, scores, etc. " + "This function specifically targets DISGENET's evidence endpoints (/gda/evidence, /vda/evidence) which return " + "detailed publication metadata rather than just association summaries. " + "The function automatically handles entity normalization (gene names to NCBI IDs, disease names to UMLS CUIs).", + "name": "query_disgenet_evidence", + "optional_parameters": [ + { + "default": False, + "description": "If True, returns detailed results including entity normalization steps, resolved API endpoint, and full evidence metadata.", + "name": "verbose", + "type": "bool", + } + ], + "required_parameters": [ + { + "default": None, + "description": "Use a single, detailed, natural language query per call about literature evidence for gene-disease or variant-disease associations. Supports ordering results by score, DSI, DPI, pLI, or publication year (pmYear). Supports disease class queries. Supports filtering by a vast number of parameters to assess the strength, relevance, and confidence of GDAs and VDAs. " + "Examples: 'Find evidence papers linking BRCA1 to ovarian cancer, order by pmYear', " + "'Show clinical evidence for CFTR variants in cystic fibrosis', " + "'Get biomarker evidence for APP gene in Alzheimer's disease'", + "name": "prompt", + "type": "str", + } + ], + }, ] diff --git a/license_info.md b/license_info.md index a1854628..4a9ba1cd 100644 --- a/license_info.md +++ b/license_info.md @@ -12,7 +12,6 @@ A significant portion of the data used in Biomni requires a commercial license f | **BindingDB** | `BindingDB_All_202409.tsv` | Custom, non-commercial use granted. Commercial use requires a license. | Yes, with a commercial license. 
| [BindingDB](https://www.bindingdb.org) | | **Broad Repurposing Hub** | `broad_repurposing_hub_*.parquet` | CC BY 4.0 | Yes | [Broad Institute](https://www.broadinstitute.org/drug-repurposing-hub) | | **DDInter** | `ddinter_*.csv` | CC BY-NC-SA 4.0 | No, non-commercial use only. | [DDInter](http://ddinter.scbdd.com/) | -| **DisGeNET** | `DisGeNET.parquet` | CC BY-NC-SA 4.0 | No, non-commercial use only. | [DisGeNET](https://www.disgenet.org/) | | **Enamine** | `enamine_cloud_library_smiles.pkl` | Proprietary. Requires license for screening. | Yes, with a valid license. | [Enamine](https://enamine.net/) | | **EveBio** | `evebio_*.csv` | Appears to be proprietary data from EveBio. | Requires permission from EveBio. | EveBio | | **Gene Ontology (GO)** | `go-plus.json` | CC BY 4.0 | Yes | [Gene Ontology Consortium](http://geneontology.org/) |