Skip to content
Closed
376 changes: 376 additions & 0 deletions service/app/mcp/literature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,376 @@
"""
Literature MCP Server - Multi-source academic literature search

Provides tools for searching academic literature from multiple data sources
(OpenAlex, Semantic Scholar, PubMed, etc.) with unified interface.
"""

import json
import logging
from datetime import datetime
from typing import Any

import httpx
from fastmcp import FastMCP

from app.utils.literature import SearchRequest, WorkDistributor

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# FastMCP server instance named "literature"; the tool below is registered
# on it through the ``@mcp.tool()`` decorator.
mcp = FastMCP("literature")

# Descriptive metadata (display name, description, version) for this MCP server.
# NOTE(review): presumably read by whatever discovers/loads MCP modules -
# the consumer is not visible in this file; confirm before renaming keys.
__mcp_metadata__ = {
    "name": "Literature Search",
    "description": "Search academic literature from multiple sources with advanced filtering",
    "version": "1.0.0",
}


# Bounds for ``max_results``; they mirror the range promised in the tool docstring.
_MIN_RESULTS = 1
_MAX_RESULTS = 200
_DEFAULT_MAX_RESULTS = 50

# Accepted spellings for boolean-like tool arguments (compared case-insensitively).
_TRUE_STRINGS = frozenset({"true", "1", "yes"})
_FALSE_STRINGS = frozenset({"false", "0", "no"})


def _parse_optional_bool(value: str | bool | None, name: str) -> bool | None:
    """Convert a boolean-like tool argument to ``bool``; ``None``/blank means "not set"."""
    if value is None:
        return None
    text = str(value).strip().lower()
    if not text:
        # A blank string must not silently turn into an active ``False`` filter.
        return None
    if text in _TRUE_STRINGS:
        return True
    if text in _FALSE_STRINGS:
        return False
    raise ValueError(f"{name} must be 'true' or 'false', got {value!r}")


def _parse_optional_int(value: str | int | None, name: str) -> int | None:
    """Convert an integer-like tool argument to ``int``; falsy or blank input means "not set"."""
    text = str(value).strip() if value else ""
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        raise ValueError(f"{name} must be an integer, got {value!r}") from None


@mcp.tool()
async def search_literature(
    query: str,
    mailto: str | None = None,
    author: str | None = None,
    institution: str | None = None,
    source: str | None = None,
    year_from: str | None = None,
    year_to: str | None = None,
    is_oa: str | None = None,
    work_type: str | None = None,
    language: str | None = None,
    is_retracted: str | None = None,
    has_abstract: str | None = None,
    has_fulltext: str | None = None,
    sort_by: str = "relevance",
    max_results: str | int = 50,
    data_sources: list[str] | None = None,
    include_abstract: str | bool = False,
) -> str:
    """
    Search academic literature from multiple data sources (OpenAlex, etc.)

    ⚠️ IMPORTANT: A valid email address (mailto parameter) enables the OpenAlex polite pool
    (10 req/s). If omitted, the default pool is used (1 req/s, sequential). Production
    usage should provide an email.

    Basic usage: Provide query keywords and user's email. Returns a Markdown report
    with statistics and JSON list of papers.

    Args:
        query: Search keywords (e.g., "machine learning", "CRISPR")
        mailto: OPTIONAL - User's email (e.g., "user@example.com")
        author: OPTIONAL - Author name (e.g., "Albert Einstein")
        institution: OPTIONAL - Institution (e.g., "MIT", "Harvard University")
        source: OPTIONAL - Journal (e.g., "Nature", "Science")
        year_from: OPTIONAL - Start year (e.g., "2020" or 2020)
        year_to: OPTIONAL - End year (e.g., "2024" or 2024)
        is_oa: OPTIONAL - Open access only ("true"/"false")
        work_type: OPTIONAL - Work type: "article", "review", "preprint", "book", "dissertation", etc.
        language: OPTIONAL - Language code (e.g., "en" for English, "zh" for Chinese, "fr" for French)
        is_retracted: OPTIONAL - Filter retracted works ("true" to include only retracted, "false" to exclude)
        has_abstract: OPTIONAL - Require abstract ("true" to include only works with abstracts)
        has_fulltext: OPTIONAL - Require full text ("true" to include only works with full text)
        sort_by: Sort: "relevance" (default), "cited_by_count", "publication_date"
        max_results: Max papers (default: 50, range: 1-200, accepts string or int;
            values outside the range are clamped and reported as a warning)
        data_sources: Sources to search (default: ["openalex"])
        include_abstract: Include abstracts (default: False, accepts string or bool)

    Boolean-like arguments accept "true"/"1"/"yes" and "false"/"0"/"no"
    (case-insensitive). Blank strings are treated as "not set".

    Returns:
        Markdown report with:
        - Warnings if filters fail or inputs were adjusted
        - Statistics (citations, open access rate)
        - JSON list of papers (title, authors, DOI, etc.)
        - Next steps guidance
        On failure, a single-line message starting with "❌" instead of a report.

    Usage tips:
        - START SIMPLE: just query + mailto
        - Tool will suggest corrections if author/institution not found
        - Review "Next Steps Guide" before searching again

    Examples:
        # Minimal (recommended)
        search_literature("machine learning", mailto="user@example.com")

        # With filters (accepts both strings and integers)
        search_literature(
            query="CRISPR",
            mailto="user@example.com",
            author="Jennifer Doudna",
            year_from="2020",
            year_to="2024"
        )

        # Recent reviews (past 5 years, English only)
        search_literature(
            query="cancer immunotherapy",
            mailto="user@example.com",
            work_type="review",
            language="en",
            year_from="2020",
            sort_by="cited_by_count"
        )

        # Research articles with abstracts (exclude retracted)
        search_literature(
            query="CRISPR gene editing",
            mailto="user@example.com",
            work_type="article",
            has_abstract="true",
            is_retracted="false"
        )
    """
    try:
        # Adjustments made to the caller's input; surfaced in the report's warning section.
        input_warnings: list[str] = []

        year_from_int = _parse_optional_int(year_from, "year_from")
        year_to_int = _parse_optional_int(year_to, "year_to")

        # Clamp year bounds that could never match anything (warn but don't block search).
        max_year = datetime.now().year + 1
        year_notes: list[str] = []
        if year_from_int is not None and year_from_int > max_year:
            year_notes.append(f"year_from {year_from_int}→{max_year}.")
            year_from_int = max_year
        if year_to_int is not None and year_to_int < 1700:
            year_notes.append(f"year_to {year_to_int}→1700.")
            year_to_int = 1700
        if year_notes:
            input_warnings.append(f"⚠️ Year adjusted: {' '.join(year_notes)}")

        # Tri-state filters: None means "do not filter on this attribute".
        is_oa_bool = _parse_optional_bool(is_oa, "is_oa")
        is_retracted_bool = _parse_optional_bool(is_retracted, "is_retracted")
        has_abstract_bool = _parse_optional_bool(has_abstract, "has_abstract")
        has_fulltext_bool = _parse_optional_bool(has_fulltext, "has_fulltext")
        include_abstract_bool = _parse_optional_bool(include_abstract, "include_abstract") or False

        # Enforce the documented 1-200 range instead of forwarding arbitrary sizes.
        requested_max = _parse_optional_int(max_results, "max_results")
        if requested_max is None:
            max_results_int = _DEFAULT_MAX_RESULTS
        else:
            max_results_int = min(max(requested_max, _MIN_RESULTS), _MAX_RESULTS)
            if max_results_int != requested_max:
                input_warnings.append(
                    f"⚠️ max_results adjusted: {requested_max}→{max_results_int} "
                    f"(allowed range {_MIN_RESULTS}-{_MAX_RESULTS})."
                )

        openalex_email = mailto.strip() if mailto and str(mailto).strip() else None

        logger.info(
            "Literature search requested: query='%s', mailto=%s, max_results=%s",
            query,
            openalex_email,
            max_results_int,
        )

        # Create search request with converted types
        request = SearchRequest(
            query=query,
            author=author,
            institution=institution,
            source=source,
            year_from=year_from_int,
            year_to=year_to_int,
            is_oa=is_oa_bool,
            work_type=work_type,
            language=language,
            is_retracted=is_retracted_bool,
            has_abstract=has_abstract_bool,
            has_fulltext=has_fulltext_bool,
            sort_by=sort_by,
            max_results=max_results_int,
            data_sources=data_sources,
        )

        # Execute search
        async with WorkDistributor(openalex_email=openalex_email) as distributor:
            result = await distributor.search(request)

        if input_warnings:
            result.setdefault("warnings", []).extend(input_warnings)

        # Format output
        return _format_search_result(request, result, include_abstract_bool)

    except ValueError as e:
        logger.warning("Literature search validation error: %s", e)
        return f"❌ Invalid input: {str(e)}"
    except httpx.HTTPError as e:
        logger.error("Literature search network error: %s", e, exc_info=True)
        return "❌ Network error while contacting literature sources. Please try again later."
    except Exception as e:
        # Top-level tool boundary: log the traceback, return a generic message to the caller.
        logger.error("Literature search failed: %s", e, exc_info=True)
        return "❌ Unexpected error during search. Please retry or contact support."


def _format_search_result(request: SearchRequest, result: dict[str, Any], include_abstract: bool = False) -> str:
    """
    Format search results into human-readable report + JSON data

    Args:
        request: Original search request; its filter fields are echoed in the
            "Search Conditions" section.
        result: Search result from WorkDistributor. Must contain the keys
            "works", "total_count", "unique_count" and "sources"; "warnings"
            is optional.
        include_abstract: Whether to include abstracts in JSON (default: False to save tokens)

    Returns:
        Formatted markdown report with embedded JSON. When there are no works,
        the report ends after a list of suggestions for broadening the search.

    Raises:
        KeyError: If a required key is missing from ``result``.
    """
    works = result["works"]
    total_count = result["total_count"]
    unique_count = result["unique_count"]
    sources = result["sources"]
    warnings = result.get("warnings", [])

    # Build report sections
    sections: list[str] = ["# Literature Search Report\n"]

    # Warnings and resolution status (if any)
    if warnings:
        sections.append("## ⚠️ Warnings and Resolution Status\n")
        sections.extend(f"{warning}" for warning in warnings)
        sections.append("")

    # Search conditions
    sections.append("## Search Conditions\n")
    conditions: list[str] = [f"- **Query**: {request.query}"]
    if request.author:
        conditions.append(f"- **Author**: {request.author}")
    if request.institution:
        conditions.append(f"- **Institution**: {request.institution}")
    if request.source:
        conditions.append(f"- **Source**: {request.source}")
    if request.year_from or request.year_to:
        year_range = f"{request.year_from or '...'} - {request.year_to or '...'}"
        conditions.append(f"- **Year Range**: {year_range}")
    if request.is_oa is not None:
        conditions.append(f"- **Open Access Only**: {'Yes' if request.is_oa else 'No'}")
    if request.work_type:
        conditions.append(f"- **Work Type**: {request.work_type}")
    if request.language:
        conditions.append(f"- **Language**: {request.language}")
    if request.is_retracted is not None:
        # Name the filter mode directly; an "Exclude: Yes/No" flag reads inverted
        # relative to is_retracted (True = only retracted, False = exclude them).
        retracted_mode = "Only retracted" if request.is_retracted else "Exclude retracted"
        conditions.append(f"- **Retracted Filter**: {retracted_mode}")
    if request.has_abstract is not None:
        conditions.append(f"- **Require Abstract**: {'Yes' if request.has_abstract else 'No'}")
    if request.has_fulltext is not None:
        conditions.append(f"- **Require Full Text**: {'Yes' if request.has_fulltext else 'No'}")
    conditions.append(f"- **Sort By**: {request.sort_by}")
    conditions.append(f"- **Max Results**: {request.max_results}")
    sections.append("\n".join(conditions))
    sections.append("")

    # Check if no results
    if not works:
        sections.append("## ❌ No Results Found\n")
        sections.append("**Suggestions to improve your search:**\n")
        suggestions: list[str] = ["**Simplify keywords**: Try broader or different terms"]
        if request.author:
            suggestions.append("**Remove author filter**: Author name may not be recognized")
        if request.institution:
            suggestions.append("**Remove institution filter**: Try without institution constraint")
        if request.source:
            suggestions.append("**Remove source filter**: Try without journal constraint")
        if request.year_from or request.year_to:
            suggestions.append("**Expand year range**: Current range may be too narrow")
        if request.is_oa:
            suggestions.append("**Remove open access filter**: Include non-OA papers")
        suggestions.append("**Check spelling**: Verify all terms are spelled correctly")
        # Number at render time so the list stays consecutive whichever filters were set.
        sections.append("\n".join(f"{index}. {text}" for index, text in enumerate(suggestions, start=1)))
        sections.append("")
        return "\n".join(sections)

    # Statistics and overall insights (works is non-empty from here on)
    sections.append("## Search Statistics\n")
    stats: list[str] = [
        f"- **Total Found**: {total_count} works",
        f"- **After Deduplication**: {unique_count} works",
    ]
    source_info = ", ".join(f"{name}: {count}" for name, count in sources.items())
    stats.append(f"- **Data Sources**: {source_info}")

    avg_citations = sum(w.cited_by_count for w in works) / len(works)
    stats.append(f"- **Average Citations**: {avg_citations:.1f}")

    oa_count = sum(1 for w in works if w.is_oa)
    oa_ratio = (oa_count / len(works)) * 100
    stats.append(f"- **Open Access Rate**: {oa_ratio:.1f}% ({oa_count}/{len(works)})")

    years = [w.publication_year for w in works if w.publication_year]
    if years:
        stats.append(f"- **Year Range**: {min(years)} - {max(years)}")

    sections.append("\n".join(stats))
    sections.append("")

    # Complete JSON list
    sections.append("## Complete Works List (JSON)\n")
    if include_abstract:
        sections.append("The following JSON contains all works with full abstracts:\n")
    else:
        sections.append("The following JSON contains all works (abstracts excluded to save tokens):\n")
    sections.append("```json")

    # Convert works to dict for JSON serialization
    works_dict = []
    for work in works:
        work_data = {
            "id": work.id,
            "doi": work.doi,
            "title": work.title,
            "authors": work.authors[:5],  # Limit to first 5 authors
            "publication_year": work.publication_year,
            "cited_by_count": work.cited_by_count,
            "journal": work.journal,
            "is_oa": work.is_oa,
            "oa_url": work.oa_url,
            "source": work.source,
        }
        # Only include abstract if requested
        if include_abstract and work.abstract:
            work_data["abstract"] = work.abstract
        works_dict.append(work_data)

    sections.append(json.dumps(works_dict, indent=2, ensure_ascii=False))
    sections.append("```")
    sections.append("")

    # Next steps guidance - prevent infinite loops
    sections.append("---")
    sections.append("## 🎯 Next Steps Guide\n")
    sections.append("**Before making another search, consider:**\n")
    next_steps: list[str] = []

    if unique_count > 0:
        next_steps.append("✓ **Results found** - Review the JSON data above for your analysis")
        if unique_count >= request.max_results:
            next_steps.append(
                f"⚠️ **Result limit reached** ({request.max_results}) - "
                "Consider narrowing filters (author, year, journal) for more targeted results"
            )
        if unique_count < 10:
            next_steps.append("💡 **Few results** - Consider broadening your search by removing some filters")

    next_steps.append("")
    next_steps.append("**To refine your search:**")
    next_steps.append("- If too many results → Add more specific filters (author, institution, journal, year)")
    next_steps.append("- If too few results → Remove filters or use broader keywords")
    next_steps.append("- If wrong results → Check filter spelling and try variations")
    next_steps.append("")
    next_steps.append("⚠️ **Important**: Avoid making multiple similar searches without reviewing results first!")
    next_steps.append("Each search consumes API quota and context window. Make targeted, deliberate queries.")

    sections.append("\n".join(next_steps))

    return "\n".join(sections)
17 changes: 17 additions & 0 deletions service/app/utils/literature/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""
Literature search utilities for multi-source academic literature retrieval
"""

from .base_client import BaseLiteratureClient
from .doi_cleaner import deduplicate_by_doi, normalize_doi
from .models import LiteratureWork, SearchRequest
from .work_distributor import WorkDistributor

# Public API of the package: the names re-exported from the submodules
# imported above.
__all__ = [
    "BaseLiteratureClient",
    "normalize_doi",
    "deduplicate_by_doi",
    "SearchRequest",
    "LiteratureWork",
    "WorkDistributor",
]
Loading
Loading