From fed44e71fe16aba06055929ae47ed69e65535c9f Mon Sep 17 00:00:00 2001 From: PizzaSober Date: Tue, 27 Jan 2026 19:16:04 +0800 Subject: [PATCH] sync: paper-tool] --- service/app/mcp/literature.py | 463 ------------------ service/app/tools/builtin/__init__.py | 4 + service/app/tools/builtin/literature.py | 390 +++++++++++++++ service/app/tools/capabilities.py | 1 + service/app/tools/prepare.py | 6 + service/app/tools/registry.py | 14 + .../components/ChatToolbar/ToolSelector.tsx | 33 +- web/src/core/agent/toolConfig.ts | 19 + web/src/i18n/locales/en/app.json | 2 + web/src/i18n/locales/zh/app.json | 2 + 10 files changed, 461 insertions(+), 473 deletions(-) delete mode 100644 service/app/mcp/literature.py create mode 100644 service/app/tools/builtin/literature.py diff --git a/service/app/mcp/literature.py b/service/app/mcp/literature.py deleted file mode 100644 index 5c5e55ca..00000000 --- a/service/app/mcp/literature.py +++ /dev/null @@ -1,463 +0,0 @@ -""" -Literature MCP Server - Multi-source academic literature search - -Provides tools for searching academic literature from multiple data sources -(OpenAlex, Semantic Scholar, PubMed, etc.) with unified interface. 
-""" - -import json -import logging -from datetime import datetime -from typing import Any - -import httpx -from fastmcp import FastMCP - -from app.utils.literature import SearchRequest, WorkDistributor - -logger = logging.getLogger(__name__) - -TRUE_VALUES = frozenset({"true", "1", "yes"}) -FALSE_VALUES = frozenset({"false", "0", "no"}) - -# Create FastMCP instance -mcp = FastMCP("literature") - -# Metadata for MCP server -__mcp_metadata__ = { - "name": "Literature Search", - "description": "Search academic literature from multiple sources with advanced filtering", - "version": "1.0.0", -} - - -@mcp.tool() -async def search_literature( - query: str, - mailto: str | None = None, - author: str | None = None, - institution: str | None = None, - source: str | None = None, - year_from: str | None = None, - year_to: str | None = None, - is_oa: str | None = None, - work_type: str | None = None, - language: str | None = None, - is_retracted: str | None = None, - has_abstract: str | None = None, - has_fulltext: str | None = None, - sort_by: str = "relevance", - max_results: str | int = 50, - data_sources: list[str] | None = None, - include_abstract: str | bool = False, -) -> str: - """ - Search academic literature from multiple data sources (OpenAlex, Semantic Scholar, PubMed, etc.) - - ๐Ÿ”‘ STRONGLY RECOMMENDED: Always provide a valid email address (mailto parameter) - โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• - - ๐Ÿ“Š Performance Difference: - - WITH email (mailto): 10 requests/second (fast, ideal for large searches) - - WITHOUT email (mailto): 1 request/second (slow, sequential processing) - - โš ๏ธ Impact: Omitting email can cause 10x slowdown or timeouts for large result sets. - Production research should ALWAYS include email. 
Example: "researcher@university.edu" - - Response Format Overview - โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• - The tool returns TWO sections automatically: - - 1๏ธโƒฃ EXECUTIVE SUMMARY - - Key statistics (total found, unique count, sources) - - Average citations and open access rate - - Publication year range - - Warning/issue resolution status - - 2๏ธโƒฃ DETAILED RESULTS (Complete JSON with URLs) - - Each paper includes: - โ€ข โœ… Valid URLs (access_url; doi is a raw identifier) - โ€ข Title, Authors (first 5), Publication Year - โ€ข Citation Count, Journal, Open Access Status - โ€ข Abstract (only if include_abstract=True) - - Format: JSON array for easy parsing/import - - All URLs are validated and functional - - Args: - query: Search keywords (e.g., "machine learning", "CRISPR", "cancer immunotherapy") - [REQUIRED] Most important parameter for accurate results - - mailto: Email address to enable fast API pool at OpenAlex - [โญ STRONGLY RECOMMENDED - includes your email] - Examples: "researcher@mit.edu", "student@university.edu", "name@company.com" - Impact: 10x faster searches. Production users MUST provide this. - Note: Email is private, only used for API identification. 
- - author: OPTIONAL - Filter by author name (e.g., "Albert Einstein", "Jennifer Doudna") - Will auto-correct common misspellings if not found exactly - - institution: OPTIONAL - Filter by affiliation (e.g., "MIT", "Harvard", "Stanford University") - Partial name matching supported - - source: OPTIONAL - Filter by journal/venue (e.g., "Nature", "Science", "JAMA") - Matches both journal names and abbreviated titles - - year_from: OPTIONAL - Start year (e.g., "2020" or 2020) - Accepts string or integer, will auto-clamp to valid range (1700-2026) - - year_to: OPTIONAL - End year (e.g., "2024" or 2024) - Accepts string or integer, will auto-clamp to valid range (1700-2026) - If year_from > year_to, they will be automatically swapped - - is_oa: OPTIONAL - Open access filter ("true"/"false"/"yes"/"no") - "true" returns ONLY open access papers with direct links - - work_type: OPTIONAL - Filter by publication type - Options: "article", "review", "preprint", "book", "dissertation", "dataset", etc. - - language: OPTIONAL - Filter by publication language (e.g., "en", "zh", "ja", "fr", "de") - "en" = English only, "zh" = Chinese only, etc. - - is_retracted: OPTIONAL - Retracted paper filter ("true"/"false") - "false" excludes retracted works (recommended for research) - "true" shows ONLY retracted papers (for auditing) - - has_abstract: OPTIONAL - Require abstract ("true"/"false") - "true" returns only papers with abstracts - - has_fulltext: OPTIONAL - Require full text access ("true"/"false") - "true" returns only papers with available full text - - sort_by: Sort results - "relevance" (default), "cited_by_count", "publication_date" - "cited_by_count" useful for influential papers - "publication_date" shows most recent first - - max_results: Result limit (default: 50, range: 1-1000, accepts string or int) - More results = slower query. 
Recommended: 50-200 for research - - data_sources: Advanced - Sources to query (default: ["openalex"]) - Can include: ["openalex", "semantic_scholar", "pubmed"] - - include_abstract: Include full abstracts in JSON output? (default: False) - True = include full abstracts for detailed review - False = save token budget by excluding abstracts - - Returns: - Markdown report with two sections: - - ๐Ÿ“‹ Section 1: EXECUTIVE SUMMARY - โ””โ”€ Search conditions recap - โ””โ”€ Total results found & unique count - โ””โ”€ Statistics: avg citations, OA rate, year range - โ””โ”€ โš ๏ธ Any warnings/filter issues & resolutions - - ๐Ÿ“Š Section 2: COMPLETE RESULTS (JSON Array) - โ””โ”€ Each paper object contains: - โ€ข "doi": Raw DOI string (not a URL) - โ€ข "title": Paper title - โ€ข "authors": Author names [first 5 only to save tokens] - โ€ข "publication_year": Publication date - โ€ข "cited_by_count": Citation impact metric - โ€ข "journal": Journal/venue name - โ€ข "description": Short description about the paper - โ””โ”€ access_url is validated and immediately accessible - โ””โ”€ Copy JSON directly into spreadsheet, database, or reference manager - - Usage Tips (READ THIS!) - โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• - โœ… DO: - - Always provide mailto (10x faster searches) - - Start simple: query + mailto first - - Review results before refining search - - Use filters incrementally to narrow down - - Set include_abstract=True only for final review (saves API calls) - - โŒ DON'T: - - Make multiple searches without reviewing first results - - Use vague keywords like "research" or "analysis" - - Search without mailto unless doing quick test - - Ignore the "Next Steps Guide" section - - Omit email for production/important research - """ - try: - # Validate query early to avoid accidental broad searches - if not query or not str(query).strip(): - return "โŒ Invalid input: query cannot be empty." 
- if len(str(query).strip()) < 3: - return "โŒ Invalid input: query is too short (minimum 3 characters)." - - # Convert string parameters to proper types - year_from_int = int(year_from) if year_from and str(year_from).strip() else None - year_to_int = int(year_to) if year_to and str(year_to).strip() else None - - # Clamp year ranges (warn but don't block search) - max_year = datetime.now().year + 1 - year_warning = "" - if year_from_int is not None and year_from_int > max_year: - year_warning += f"year_from {year_from_int}โ†’{max_year}. " - year_from_int = max_year - if year_to_int is not None and year_to_int < 1700: - year_warning += f"year_to {year_to_int}โ†’1700. " - year_to_int = 1700 - - # Ensure year_from <= year_to when both are set - if year_from_int is not None and year_to_int is not None and year_from_int > year_to_int: - year_warning += f"year_from {year_from_int} and year_to {year_to_int} swapped to maintain a valid range. " - year_from_int, year_to_int = year_to_int, year_from_int - - # Convert is_oa to boolean - bool_warning_parts: list[str] = [] - - def _parse_bool_field(raw: str | bool | None, field_name: str) -> bool | None: - if raw is None: - return None - if isinstance(raw, bool): - return raw - val = str(raw).strip().lower() - if val in TRUE_VALUES: - return True - if val in FALSE_VALUES: - return False - bool_warning_parts.append(f"{field_name}={raw!r} not recognized; ignoring this filter.") - return None - - # Convert bool-like fields - is_oa_bool = _parse_bool_field(is_oa, "is_oa") - is_retracted_bool = _parse_bool_field(is_retracted, "is_retracted") - has_abstract_bool = _parse_bool_field(has_abstract, "has_abstract") - has_fulltext_bool = _parse_bool_field(has_fulltext, "has_fulltext") - - # Convert max_results to int with early clamping - max_results_warning = "" - try: - max_results_int = int(max_results) if max_results else 50 - except (TypeError, ValueError): - max_results_warning = "โš ๏ธ max_results is not a valid integer; using 
default 50. " - max_results_int = 50 - - if max_results_int < 1: - max_results_warning += f"max_results {max_results_int}โ†’50 (minimum is 1). " - max_results_int = 50 - elif max_results_int > 1000: - max_results_warning += f"max_results {max_results_int}โ†’1000 (maximum is 1000). " - max_results_int = 1000 - - # Convert include_abstract to bool - include_abstract_bool = str(include_abstract).lower() in {"true", "1", "yes"} if include_abstract else False - - openalex_email = mailto.strip() if mailto and str(mailto).strip() else None - - logger.info( - "Literature search requested: query=%r, mailto=%s, max_results=%d", - query, - "" if openalex_email else None, - max_results_int, - ) - - # Create search request with converted types - request = SearchRequest( - query=query, - author=author, - institution=institution, - source=source, - year_from=year_from_int, - year_to=year_to_int, - is_oa=is_oa_bool, - work_type=work_type, - language=language, - is_retracted=is_retracted_bool, - has_abstract=has_abstract_bool, - has_fulltext=has_fulltext_bool, - sort_by=sort_by, - max_results=max_results_int, - data_sources=data_sources, - ) - - # Execute search - async with WorkDistributor(openalex_email=openalex_email) as distributor: - result = await distributor.search(request) - - if year_warning: - result.setdefault("warnings", []).append(f"โš ๏ธ Year adjusted: {year_warning.strip()}") - if bool_warning_parts: - result.setdefault("warnings", []).append("โš ๏ธ Boolean filter issues: " + " ".join(bool_warning_parts)) - if max_results_warning: - result.setdefault("warnings", []).append(max_results_warning.strip()) - - # Format output - return _format_search_result(request, result, include_abstract_bool) - - except ValueError as e: - logger.warning(f"Literature search validation error: {e}") - return f"โŒ Invalid input: {str(e)}" - except httpx.HTTPError as e: - logger.error(f"Literature search network error: {e}", exc_info=True) - return "โŒ Network error while contacting 
literature sources. Please try again later." - except Exception as e: - logger.error(f"Literature search failed: {e}", exc_info=True) - return "โŒ Unexpected error during search. Please retry or contact support." - - -def _format_search_result(request: SearchRequest, result: dict[str, Any], include_abstract: bool = False) -> str: - """ - Format search results into human-readable report + JSON data - - Args: - request: Original search request - result: Search result from WorkDistributor - include_abstract: Whether to include abstracts in JSON (default: False to save tokens) - - Returns: - Formatted markdown report with embedded JSON - """ - works = result["works"] - - # Build report sections - sections: list[str] = ["# Literature Search Report\n"] - - # Warnings and resolution status (if any) - if warnings := result.get("warnings", []): - sections.extend(["## โš ๏ธ Warnings and Resolution Status\n", *warnings, ""]) - - # Search conditions - conditions: list[str] = [ - f"- **Query**: {request.query}", - *([f"- **Author**: {request.author}"] if request.author else []), - *([f"- **Institution**: {request.institution}"] if request.institution else []), - *([f"- **Source**: {request.source}"] if request.source else []), - *( - [f"- **Year Range**: {request.year_from or '...'} - {request.year_to or '...'}"] - if request.year_from or request.year_to - else [] - ), - *([f"- **Open Access Only**: {'Yes' if request.is_oa else 'No'}"] if request.is_oa is not None else []), - *([f"- **Work Type**: {request.work_type}"] if request.work_type else []), - *([f"- **Language**: {request.language}"] if request.language else []), - *( - [f"- **Exclude Retracted**: {'No' if request.is_retracted else 'Yes'}"] - if request.is_retracted is not None - else [] - ), - *( - [f"- **Require Abstract**: {'Yes' if request.has_abstract else 'No'}"] - if request.has_abstract is not None - else [] - ), - *( - [f"- **Require Full Text**: {'Yes' if request.has_fulltext else 'No'}"] - if 
request.has_fulltext is not None - else [] - ), - f"- **Sort By**: {request.sort_by}", - f"- **Max Results**: {request.max_results}", - ] - sections.extend(["## Search Conditions\n", "\n".join(conditions), ""]) - - # Check if no results - if not works: - sections.extend(["## โŒ No Results Found\n", "**Suggestions to improve your search:**\n"]) - suggestions: list[str] = [ - "1. **Simplify keywords**: Try broader or different terms", - *(["2. **Remove author filter**: Author name may not be recognized"] if request.author else []), - *(["3. **Remove institution filter**: Try without institution constraint"] if request.institution else []), - *(["4. **Remove source filter**: Try without journal constraint"] if request.source else []), - *( - ["5. **Expand year range**: Current range may be too narrow"] - if request.year_from or request.year_to - else [] - ), - *(["6. **Remove open access filter**: Include non-OA papers"] if request.is_oa else []), - "7. **Check spelling**: Verify all terms are spelled correctly", - ] - sections.extend(["\n".join(suggestions), ""]) - return "\n".join(sections) - - # Statistics and overall insights - total_count = result["total_count"] - unique_count = result["unique_count"] - sources = result["sources"] - - stats: list[str] = [ - f"- **Total Found**: {total_count} works", - f"- **After Deduplication**: {unique_count} works", - ] - source_info = ", ".join(f"{name}: {count}" for name, count in sources.items()) - stats.append(f"- **Data Sources**: {source_info}") - - # Add insights - avg_citations = sum(w.cited_by_count for w in works) / len(works) - stats.append(f"- **Average Citations**: {avg_citations:.1f}") - - oa_count = sum(w.is_oa for w in works) - oa_ratio = (oa_count / len(works)) * 100 - stats.append(f"- **Open Access Rate**: {oa_ratio:.1f}% ({oa_count}/{len(works)})") - - if years := [w.publication_year for w in works if w.publication_year]: - stats.append(f"- **Year Range**: {min(years)} - {max(years)}") - - 
sections.extend(["## Search Statistics\n", "\n".join(stats), ""]) - - # Complete JSON list - sections.extend( - [ - "## Complete Works List (JSON)\n", - "The following JSON contains all works with full abstracts:\n" - if include_abstract - else "The following JSON contains all works (abstracts excluded to save tokens):\n", - "```json", - ] - ) - - # Convert works to dict for JSON serialization - works_dict = [] - for work in works: - work_data = { - "id": work.id, - "doi": work.doi, - "title": work.title, - "authors": work.authors[:5], # Limit to first 5 authors - "publication_year": work.publication_year, - "cited_by_count": work.cited_by_count, - "journal": work.journal, - "primary_institution": work.primary_institution, - "is_oa": work.is_oa, - "access_url": work.access_url, - "source": work.source, - } - # Only include abstract if requested - if include_abstract and work.abstract: - work_data["abstract"] = work.abstract - works_dict.append(work_data) - - sections.extend([json.dumps(works_dict, indent=2, ensure_ascii=False), "```", ""]) - - # Next steps guidance - prevent infinite loops - sections.extend(["---", "## ๐ŸŽฏ Next Steps Guide\n", "**Before making another search, consider:**\n"]) - next_steps: list[str] = [ - *(["โœ“ **Results found** - Review the JSON data above for your analysis"] if unique_count > 0 else []), - *( - [ - f"โš ๏ธ **Result limit reached** ({request.max_results}) - " - "Consider narrowing filters (author, year, journal) for more targeted results" - ] - if unique_count >= request.max_results - else [] - ), - *( - ["๐Ÿ’ก **Few results** - Consider broadening your search by removing some filters"] - if 0 < unique_count < 10 - else [] - ), - "", - "**To refine your search:**", - "- If too many results โ†’ Add more specific filters (author, institution, journal, year)", - "- If too few results โ†’ Remove filters or use broader keywords", - "- If wrong results โ†’ Check filter spelling and try variations", - "", - "โš ๏ธ **Important**: 
Avoid making multiple similar searches without reviewing results first!", - "Each search consumes API quota and context window. Make targeted, deliberate queries.", - ] - - sections.append("\n".join(next_steps)) - - return "\n".join(sections) diff --git a/service/app/tools/builtin/__init__.py b/service/app/tools/builtin/__init__.py index 0b2c48e0..2ccc256e 100644 --- a/service/app/tools/builtin/__init__.py +++ b/service/app/tools/builtin/__init__.py @@ -11,11 +11,13 @@ - image: Image generation and analysis - memory: Conversation history search (disabled) - research: Deep research workflow tools (component-internal, not exported here) +- literature: Literature search and normalization """ from app.tools.builtin.fetch import create_web_fetch_tool from app.tools.builtin.image import create_image_tools, create_image_tools_for_agent from app.tools.builtin.knowledge import create_knowledge_tools, create_knowledge_tools_for_agent +from app.tools.builtin.literature import create_literature_search_tool from app.tools.builtin.memory import create_memory_tools, create_memory_tools_for_agent from app.tools.builtin.search import create_web_search_tool @@ -24,6 +26,8 @@ "create_web_search_tool", # Fetch "create_web_fetch_tool", + # Literature + "create_literature_search_tool", # Knowledge "create_knowledge_tools", "create_knowledge_tools_for_agent", diff --git a/service/app/tools/builtin/literature.py b/service/app/tools/builtin/literature.py new file mode 100644 index 00000000..fdb0ee27 --- /dev/null +++ b/service/app/tools/builtin/literature.py @@ -0,0 +1,390 @@ +""" +Literature Search Tool + +LangChain tool for searching academic literature from multiple data sources +(OpenAlex, Semantic Scholar, PubMed, etc.) with unified interface. 
+""" + +from __future__ import annotations + +import json +import logging +from datetime import datetime +from typing import Any, Literal + +import httpx +from langchain_core.tools import BaseTool, StructuredTool +from pydantic import BaseModel, Field + +from app.utils.literature import SearchRequest, WorkDistributor + +logger = logging.getLogger(__name__) + +TRUE_VALUES = frozenset({"true", "1", "yes"}) +FALSE_VALUES = frozenset({"false", "0", "no"}) + + +class LiteratureSearchInput(BaseModel): + """Input schema for literature search tool.""" + + query: str = Field( + description="Search keywords (e.g., 'machine learning', 'CRISPR', 'cancer immunotherapy'). " + "Most important parameter for accurate results." + ) + mailto: str | None = Field( + default=None, + description="Email address to enable fast API pool at OpenAlex. " + "STRONGLY RECOMMENDED - provides 10x faster searches. " + "Example: 'researcher@university.edu'", + ) + author: str | None = Field( + default=None, + description="Filter by author name (e.g., 'Albert Einstein', 'Jennifer Doudna'). " + "Will auto-correct common misspellings.", + ) + institution: str | None = Field( + default=None, + description="Filter by affiliation (e.g., 'MIT', 'Harvard', 'Stanford University'). " + "Partial name matching supported.", + ) + source: str | None = Field( + default=None, + description="Filter by journal/venue (e.g., 'Nature', 'Science', 'JAMA'). " + "Matches both journal names and abbreviated titles.", + ) + year_from: int | None = Field( + default=None, + description="Start year (e.g., 2020). Will auto-clamp to valid range (1700-current).", + ) + year_to: int | None = Field( + default=None, + description="End year (e.g., 2024). Will auto-clamp to valid range (1700-current).", + ) + is_oa: bool | None = Field( + default=None, + description="Open access filter. 
True returns ONLY open access papers with direct links.", + ) + work_type: str | None = Field( + default=None, + description="Filter by publication type. " + "Options: 'article', 'review', 'preprint', 'book', 'dissertation', 'dataset', etc.", + ) + language: str | None = Field( + default=None, + description="Filter by publication language (e.g., 'en', 'zh', 'ja', 'fr', 'de').", + ) + is_retracted: bool | None = Field( + default=None, + description="Retracted paper filter. False excludes retracted works (recommended). " + "True shows ONLY retracted papers (for auditing).", + ) + has_abstract: bool | None = Field( + default=None, + description="Require abstract. True returns only papers with abstracts.", + ) + has_fulltext: bool | None = Field( + default=None, + description="Require full text access. True returns only papers with available full text.", + ) + sort_by: Literal["relevance", "cited_by_count", "publication_date"] = Field( + default="relevance", + description="Sort results. 'cited_by_count' for influential papers, 'publication_date' for most recent first.", + ) + data_sources: list[str] | None = Field( + default=None, + description="Data sources to query. Options: ['openalex', 'semantic_scholar', 'pubmed']. " + "Default: ['openalex'].", + ) + + +async def _search_literature( + query: str, + mailto: str | None = None, + author: str | None = None, + institution: str | None = None, + source: str | None = None, + year_from: int | None = None, + year_to: int | None = None, + is_oa: bool | None = None, + work_type: str | None = None, + language: str | None = None, + is_retracted: bool | None = None, + has_abstract: bool | None = None, + has_fulltext: bool | None = None, + sort_by: str = "relevance", + data_sources: list[str] | None = None, +) -> str: + """ + Search academic literature from multiple data sources. + + Returns a markdown report with executive summary and JSON results. 
+ """ + # Hard-coded: abstracts excluded to save tokens + include_abstract = False + + try: + # Validate query + if not query or not str(query).strip(): + return "Invalid input: query cannot be empty." + if len(str(query).strip()) < 3: + return "Invalid input: query is too short (minimum 3 characters)." + + # Clamp year ranges + max_year = datetime.now().year + 1 + year_warning = "" + year_from_clamped = year_from + year_to_clamped = year_to + + if year_from_clamped is not None and year_from_clamped > max_year: + year_warning += f"year_from {year_from_clamped} clamped to {max_year}. " + year_from_clamped = max_year + if year_to_clamped is not None and year_to_clamped < 1700: + year_warning += f"year_to {year_to_clamped} clamped to 1700. " + year_to_clamped = 1700 + + # Ensure year_from <= year_to when both are set + if year_from_clamped is not None and year_to_clamped is not None and year_from_clamped > year_to_clamped: + year_warning += f"year_from {year_from_clamped} and year_to {year_to_clamped} swapped. 
" + year_from_clamped, year_to_clamped = year_to_clamped, year_from_clamped + + # Hard-coded max_results + max_results = 10 + + openalex_email = mailto.strip() if mailto and str(mailto).strip() else None + + logger.info( + "Literature search requested: query=%r, mailto=%s, max_results=%d", + query, + "" if openalex_email else None, + max_results, + ) + + # Create search request + request = SearchRequest( + query=query, + author=author, + institution=institution, + source=source, + year_from=year_from_clamped, + year_to=year_to_clamped, + is_oa=is_oa, + work_type=work_type, + language=language, + is_retracted=is_retracted, + has_abstract=has_abstract, + has_fulltext=has_fulltext, + sort_by=sort_by, + max_results=max_results, + data_sources=data_sources, + ) + + # Execute search + async with WorkDistributor(openalex_email=openalex_email) as distributor: + result = await distributor.search(request) + + if year_warning: + result.setdefault("warnings", []).append(f"Year adjusted: {year_warning.strip()}") + + # Format output + return _format_search_result(request, result, include_abstract) + + except ValueError as e: + logger.warning(f"Literature search validation error: {e}") + return f"Invalid input: {e!s}" + except httpx.HTTPError as e: + logger.error(f"Literature search network error: {e}", exc_info=True) + return "Network error while contacting literature sources. Please try again later." + except Exception as e: + logger.error(f"Literature search failed: {e}", exc_info=True) + return "Unexpected error during search. Please retry or contact support." + + +def _format_search_result(request: SearchRequest, result: dict[str, Any], include_abstract: bool = False) -> str: + """ + Format search results into human-readable report + JSON data. 
+ + Args: + request: Original search request + result: Search result from WorkDistributor + include_abstract: Whether to include abstracts in JSON (default: False to save tokens) + + Returns: + Formatted markdown report with embedded JSON + """ + works = result["works"] + + # Build report sections + sections: list[str] = ["# Literature Search Report\n"] + + # Warnings and resolution status (if any) + if warnings := result.get("warnings", []): + sections.extend(["## Warnings and Resolution Status\n", *warnings, ""]) + + # Search conditions + conditions: list[str] = [ + f"- **Query**: {request.query}", + *([f"- **Author**: {request.author}"] if request.author else []), + *([f"- **Institution**: {request.institution}"] if request.institution else []), + *([f"- **Source**: {request.source}"] if request.source else []), + *( + [f"- **Year Range**: {request.year_from or '...'} - {request.year_to or '...'}"] + if request.year_from or request.year_to + else [] + ), + *([f"- **Open Access Only**: {'Yes' if request.is_oa else 'No'}"] if request.is_oa is not None else []), + *([f"- **Work Type**: {request.work_type}"] if request.work_type else []), + *([f"- **Language**: {request.language}"] if request.language else []), + *( + [f"- **Exclude Retracted**: {'No' if request.is_retracted else 'Yes'}"] + if request.is_retracted is not None + else [] + ), + *( + [f"- **Require Abstract**: {'Yes' if request.has_abstract else 'No'}"] + if request.has_abstract is not None + else [] + ), + *( + [f"- **Require Full Text**: {'Yes' if request.has_fulltext else 'No'}"] + if request.has_fulltext is not None + else [] + ), + f"- **Sort By**: {request.sort_by}", + f"- **Max Results**: {request.max_results}", + ] + sections.extend(["## Search Conditions\n", "\n".join(conditions), ""]) + + # Check if no results + if not works: + sections.extend(["## No Results Found\n", "**Suggestions to improve your search:**\n"]) + suggestions: list[str] = [ + "1. 
**Simplify keywords**: Try broader or different terms", + *(["2. **Remove author filter**: Author name may not be recognized"] if request.author else []), + *(["3. **Remove institution filter**: Try without institution constraint"] if request.institution else []), + *(["4. **Remove source filter**: Try without journal constraint"] if request.source else []), + *( + ["5. **Expand year range**: Current range may be too narrow"] + if request.year_from or request.year_to + else [] + ), + *(["6. **Remove open access filter**: Include non-OA papers"] if request.is_oa else []), + "7. **Check spelling**: Verify all terms are spelled correctly", + ] + sections.extend(["\n".join(suggestions), ""]) + return "\n".join(sections) + + # Statistics and overall insights + total_count = result["total_count"] + unique_count = result["unique_count"] + sources = result["sources"] + + stats: list[str] = [ + f"- **Total Found**: {total_count} works", + f"- **After Deduplication**: {unique_count} works", + ] + source_info = ", ".join(f"{name}: {count}" for name, count in sources.items()) + stats.append(f"- **Data Sources**: {source_info}") + + # Add insights + avg_citations = sum(w.cited_by_count for w in works) / len(works) + stats.append(f"- **Average Citations**: {avg_citations:.1f}") + + oa_count = sum(w.is_oa for w in works) + oa_ratio = (oa_count / len(works)) * 100 + stats.append(f"- **Open Access Rate**: {oa_ratio:.1f}% ({oa_count}/{len(works)})") + + if years := [w.publication_year for w in works if w.publication_year]: + stats.append(f"- **Year Range**: {min(years)} - {max(years)}") + + sections.extend(["## Search Statistics\n", "\n".join(stats), ""]) + + # Complete JSON list + sections.extend( + [ + "## Complete Works List (JSON)\n", + "The following JSON contains all works with full abstracts:\n" + if include_abstract + else "The following JSON contains all works (abstracts excluded to save tokens):\n", + "```json", + ] + ) + + # Convert works to dict for JSON serialization + 
works_dict = [] + for work in works: + work_data = { + "id": work.id, + "doi": work.doi, + "title": work.title, + "authors": work.authors, + "publication_year": work.publication_year, + "cited_by_count": work.cited_by_count, + "journal": work.journal, + "primary_institution": work.primary_institution, + "is_oa": work.is_oa, + "access_url": work.access_url, + "source": work.source, + } + # Only include abstract if requested + if include_abstract and work.abstract: + work_data["abstract"] = work.abstract + works_dict.append(work_data) + + sections.extend([json.dumps(works_dict, indent=2, ensure_ascii=False), "```", ""]) + + # Next steps guidance + sections.extend(["---", "## Next Steps Guide\n", "**Before making another search, consider:**\n"]) + next_steps: list[str] = [ + *(["- **Results found** - Review the JSON data above for your analysis"] if unique_count > 0 else []), + *( + [ + f"- **Result limit reached** ({request.max_results}) - " + "Consider narrowing filters (author, year, journal) for more targeted results" + ] + if unique_count >= request.max_results + else [] + ), + *( + ["- **Few results** - Consider broadening your search by removing some filters"] + if 0 < unique_count < 10 + else [] + ), + "", + "**To refine your search:**", + "- If too many results: Add more specific filters (author, institution, journal, year)", + "- If too few results: Remove filters or use broader keywords", + "- If wrong results: Check filter spelling and try variations", + "", + "**Important**: Avoid making multiple similar searches without reviewing results first!", + "Each search consumes API quota and context window. Make targeted, deliberate queries.", + ] + + sections.append("\n".join(next_steps)) + + return "\n".join(sections) + + +def create_literature_search_tool() -> BaseTool: + """ + Create the literature search tool. + + Returns: + StructuredTool for literature search. 
+ """ + return StructuredTool( + name="literature_search", + description=( + "Search academic literature from multiple data sources (OpenAlex, Semantic Scholar, PubMed). " + "Returns up to 10 papers with detailed information. " + "IMPORTANT: When presenting results to users, always include: " + "1) Paper title, 2) Authors, 3) Publication year, 4) Journal name, " + "5) Citation count, 6) access_url (clickable link to read the paper). " + "The access_url is critical - users need it to access the full paper. " + "Supports filtering by author, institution, journal, year range, open access status, and more." + ), + args_schema=LiteratureSearchInput, + coroutine=_search_literature, + ) + + +__all__ = ["create_literature_search_tool", "LiteratureSearchInput"] diff --git a/service/app/tools/capabilities.py b/service/app/tools/capabilities.py index 1b4deecf..c471c3fe 100644 --- a/service/app/tools/capabilities.py +++ b/service/app/tools/capabilities.py @@ -54,6 +54,7 @@ class ToolCapability(StrEnum): "bing_search": [ToolCapability.WEB_SEARCH], "tavily_search": [ToolCapability.WEB_SEARCH], "web_fetch": [ToolCapability.WEB_SEARCH], + "literature_search": [ToolCapability.WEB_SEARCH], # Knowledge tools "knowledge_list": [ToolCapability.KNOWLEDGE_RETRIEVAL], "knowledge_read": [ToolCapability.KNOWLEDGE_RETRIEVAL, ToolCapability.FILE_OPERATIONS], diff --git a/service/app/tools/prepare.py b/service/app/tools/prepare.py index 81b4a6df..a3dedabb 100644 --- a/service/app/tools/prepare.py +++ b/service/app/tools/prepare.py @@ -82,6 +82,7 @@ def _load_all_builtin_tools( Load all available builtin tools. 
- Web search + fetch: loaded if SearXNG is enabled + - Literature search: always loaded - Knowledge tools: loaded if effective knowledge_set_id exists and user_id is available - Image tools: loaded if image generation is enabled and user_id is available - Memory tools: loaded if agent and user_id are available (currently disabled) @@ -110,6 +111,11 @@ def _load_all_builtin_tools( if web_fetch: tools.append(web_fetch) + # Load literature search tool if available + literature_search = BuiltinToolRegistry.get("literature_search") + if literature_search: + tools.append(literature_search) + # Determine effective knowledge_set_id # Priority: session override > agent config effective_knowledge_set_id = session_knowledge_set_id or (agent.knowledge_set_id if agent else None) diff --git a/service/app/tools/registry.py b/service/app/tools/registry.py index 66d27420..8e95f4c3 100644 --- a/service/app/tools/registry.py +++ b/service/app/tools/registry.py @@ -175,6 +175,7 @@ def register_builtin_tools() -> None: """ from app.tools.builtin.fetch import create_web_fetch_tool from app.tools.builtin.knowledge import create_knowledge_tools + from app.tools.builtin.literature import create_literature_search_tool from app.tools.builtin.search import create_web_search_tool # Register web search tool @@ -204,6 +205,19 @@ def register_builtin_tools() -> None: cost=ToolCostConfig(base_cost=1), ) + # Register literature search tool + literature_tool = create_literature_search_tool() + BuiltinToolRegistry.register( + tool_id="literature_search", + tool=literature_tool, + category="search", + display_name="Literature Search", + ui_toggleable=True, + default_enabled=False, + requires_context=[], + cost=ToolCostConfig(base_cost=1), + ) + # Tool cost configs for knowledge tools knowledge_tool_costs = { "knowledge_list": ToolCostConfig(), # Free diff --git a/web/src/components/layouts/components/ChatToolbar/ToolSelector.tsx b/web/src/components/layouts/components/ChatToolbar/ToolSelector.tsx 
index 285d00b0..6312bac3 100644 --- a/web/src/components/layouts/components/ChatToolbar/ToolSelector.tsx +++ b/web/src/components/layouts/components/ChatToolbar/ToolSelector.tsx @@ -9,9 +9,11 @@ import { import { isImageEnabled, isKnowledgeEnabled, + isLiteratureSearchEnabled, isWebSearchEnabled, updateImageEnabled, updateKnowledgeEnabled, + updateLiteratureSearchEnabled, // updateMemoryEnabled, updateWebSearchEnabled, } from "@/core/agent/toolConfig"; @@ -22,6 +24,7 @@ import { } from "@/service/knowledgeSetService"; import type { Agent } from "@/types/agents"; import { + AcademicCapIcon, BookOpenIcon, CheckIcon, ChevronDownIcon, @@ -59,6 +62,7 @@ export function ToolSelector({ const webSearchEnabled = isWebSearchEnabled(agent); const knowledgeEnabled = isKnowledgeEnabled(agent); const imageEnabled = isImageEnabled(agent); + const literatureSearchEnabled = isLiteratureSearchEnabled(agent); // const memoryEnabled = isMemoryEnabled(agent); // Disabled: pending RAG/pgvector implementation // const memoryEnabled = false; // Hardcoded off until RAG is implemented @@ -70,6 +74,7 @@ export function ToolSelector({ webSearchEnabled, effectiveKnowledgeSetId && knowledgeEnabled, imageEnabled, + literatureSearchEnabled, // memoryEnabled, // Disabled: pending RAG/pgvector implementation ].filter(Boolean).length; @@ -115,6 +120,12 @@ export function ToolSelector({ await onUpdateAgent({ ...agent, graph_config: newGraphConfig }); }; + const handleToggleLiteratureSearch = async () => { + if (!agent) return; + const newGraphConfig = updateLiteratureSearchEnabled(agent, !literatureSearchEnabled); + await onUpdateAgent({ ...agent, graph_config: newGraphConfig }); + }; + // const handleToggleMemory = async () => { // if (!agent) return; // const newGraphConfig = updateMemoryEnabled(agent, !memoryEnabled); @@ -328,33 +339,35 @@ export function ToolSelector({ {imageEnabled && } - {/* Memory Search - Disabled: pending RAG/pgvector implementation */} - {/* */} + {literatureSearchEnabled 
&& } + + + {/* Memory Search - Disabled: pending RAG/pgvector implementation */} diff --git a/web/src/core/agent/toolConfig.ts b/web/src/core/agent/toolConfig.ts index d36e1da9..1adef4df 100644 --- a/web/src/core/agent/toolConfig.ts +++ b/web/src/core/agent/toolConfig.ts @@ -20,6 +20,7 @@ export const BUILTIN_TOOLS = { GENERATE_IMAGE: "generate_image", READ_IMAGE: "read_image", MEMORY_SEARCH: "memory_search", + LITERATURE_SEARCH: "literature_search", } as const; // Web search tools as a group (search + fetch always together) @@ -43,6 +44,7 @@ export const ALL_BUILTIN_TOOL_IDS = [ BUILTIN_TOOLS.GENERATE_IMAGE, BUILTIN_TOOLS.READ_IMAGE, BUILTIN_TOOLS.MEMORY_SEARCH, + BUILTIN_TOOLS.LITERATURE_SEARCH, ]; // Image tools as a group @@ -310,3 +312,20 @@ export function updateMemoryEnabled( ): Record { return updateToolFilter(agent, BUILTIN_TOOLS.MEMORY_SEARCH, enabled); } + +/** + * Check if literature search is enabled + */ +export function isLiteratureSearchEnabled(agent: Agent | null): boolean { + return isToolEnabled(agent, BUILTIN_TOOLS.LITERATURE_SEARCH); +} + +/** + * Enable/disable literature search + */ +export function updateLiteratureSearchEnabled( + agent: Agent, + enabled: boolean, +): Record { + return updateToolFilter(agent, BUILTIN_TOOLS.LITERATURE_SEARCH, enabled); +} diff --git a/web/src/i18n/locales/en/app.json b/web/src/i18n/locales/en/app.json index beea8f01..84e27e08 100644 --- a/web/src/i18n/locales/en/app.json +++ b/web/src/i18n/locales/en/app.json @@ -60,6 +60,8 @@ "noKnowledge": "None", "image": "Image", "imageDesc": "Generate and read images", + "literatureSearch": "Literature", + "literatureSearchDesc": "Search academic papers", "memory": "Memory", "memoryDesc": "Search conversation history" }, diff --git a/web/src/i18n/locales/zh/app.json b/web/src/i18n/locales/zh/app.json index 9e689381..b1fa8987 100644 --- a/web/src/i18n/locales/zh/app.json +++ b/web/src/i18n/locales/zh/app.json @@ -60,6 +60,8 @@ "noKnowledge": "无", "image": "图片", 
"imageDesc": "生成和读取图片", + "literatureSearch": "文献搜索", + "literatureSearchDesc": "搜索学术文献和论文", "memory": "记忆", "memoryDesc": "搜索对话历史" },