400 changes: 400 additions & 0 deletions service/app/mcp/literature.py

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions service/app/utils/literature/__init__.py
@@ -0,0 +1,17 @@
"""
Literature search utilities for multi-source academic literature retrieval
"""

from .base_client import BaseLiteratureClient
from .doi_cleaner import deduplicate_by_doi, normalize_doi
from .models import LiteratureWork, SearchRequest
from .work_distributor import WorkDistributor

__all__ = [
"BaseLiteratureClient",
"normalize_doi",
"deduplicate_by_doi",
"SearchRequest",
"LiteratureWork",
"WorkDistributor",
]
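
For illustration, downstream code would pull this public API from the package root. A minimal sketch, assuming the service's import root is `app` (not confirmed by this diff):

# Assumed import root ("app"); adjust to how the service package is actually installed.
from app.utils.literature import (
    BaseLiteratureClient,
    LiteratureWork,
    SearchRequest,
    WorkDistributor,
    deduplicate_by_doi,
    normalize_doi,
)
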
32 changes: 32 additions & 0 deletions service/app/utils/literature/base_client.py
@@ -0,0 +1,32 @@
"""
Abstract base class for literature data source clients
"""

from abc import ABC, abstractmethod

from .models import LiteratureWork, SearchRequest


class BaseLiteratureClient(ABC):
"""
Base class for literature data source clients

All data source implementations (OpenAlex, Semantic Scholar, PubMed, etc.)
should inherit from this class and implement the required methods.
"""

@abstractmethod
async def search(self, request: SearchRequest) -> tuple[list[LiteratureWork], list[str]]:
"""
Execute search and return results in standard format

Args:
request: Standardized search request

Returns:
Tuple of (works, warnings) where warnings is a list of messages for LLM feedback

Raises:
Exception: If search fails after retries
"""
pass
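
To make the contract concrete, here is a minimal sketch of a conforming subclass. The `StubClient` name, the canned payload shape, and its field names are illustrative assumptions, not code from this PR; a real client would call an HTTP API and apply retries before mapping results:

# Minimal illustrative subclass; the raw payload shape and key names are
# made-up assumptions, not the schema of any real data source in this PR.
from .base_client import BaseLiteratureClient
from .models import LiteratureWork, SearchRequest


class StubClient(BaseLiteratureClient):
    """Maps a canned payload into the standard format; real clients would query an API."""

    def __init__(self, payload: list[dict]) -> None:
        self._payload = payload

    async def search(self, request: SearchRequest) -> tuple[list[LiteratureWork], list[str]]:
        warnings: list[str] = []
        works = [
            LiteratureWork(
                id=item.get("id", ""),
                doi=item.get("doi"),
                title=item.get("title", ""),
                authors=[{"name": a.get("name"), "id": a.get("id")} for a in item.get("authors", [])],
                publication_year=item.get("year"),
                cited_by_count=item.get("citations", 0),
                abstract=item.get("abstract"),
                journal=item.get("journal"),
                is_oa=bool(item.get("is_oa", False)),
                oa_url=item.get("oa_url"),
                source="stub",
                raw_data=item,
            )
            for item in self._payload[: request.max_results]
        ]
        if not works:
            warnings.append(f"No results for query: {request.query!r}")
        return works, warnings
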
116 changes: 116 additions & 0 deletions service/app/utils/literature/doi_cleaner.py
@@ -0,0 +1,116 @@
"""
DOI normalization and deduplication utilities
"""

import re
from typing import Protocol, TypeVar


class WorkWithDOI(Protocol):
"""Protocol for objects with DOI and citation information"""

doi: str | None
cited_by_count: int
publication_year: int | None


T = TypeVar("T", bound=WorkWithDOI)


def normalize_doi(doi: str | None) -> str | None:
"""
Normalize DOI format to standard form

Removes common prefixes, validates format, and converts to lowercase.
DOI specification (ISO 26324) defines DOI matching as case-insensitive,
so lowercase conversion is safe and improves consistency.

Args:
doi: DOI string in any common format

Returns:
Normalized DOI (e.g., "10.1038/nature12345") or None if invalid

Examples:
>>> normalize_doi("https://doi.org/10.1038/nature12345")
'10.1038/nature12345'
>>> normalize_doi("DOI: 10.1038/nature12345")
'10.1038/nature12345'
>>> normalize_doi("doi:10.1038/nature12345")
'10.1038/nature12345'
"""
if not doi:
return None

doi = doi.strip().lower()

# Remove common prefixes
doi = re.sub(r"^(https?://)?(dx\.)?doi\.org/", "", doi)
doi = re.sub(r"^doi:\s*", "", doi)

# Validate format (10.xxxx/yyyy)
return doi if re.match(r"^10\.\d+/.+", doi) else None


def deduplicate_by_doi(works: list[T]) -> list[T]:
"""
Deduplicate works by DOI, keeping the highest priority version

Priority rules:
1. Works with DOI take priority over those without
2. For same DOI, keep the one with higher citation count
3. If citation count is equal, keep the most recently published

Args:
works: List of works exposing doi, cited_by_count, and publication_year (any WorkWithDOI)

Returns:
Deduplicated list of works

Examples:
>>> works = [
... LiteratureWork(doi="10.1038/1", cited_by_count=100, ...),
... LiteratureWork(doi="10.1038/1", cited_by_count=50, ...),
... LiteratureWork(doi=None, ...),
... ]
>>> unique = deduplicate_by_doi(works)
>>> len(unique)
2
>>> unique[0].cited_by_count
100
"""
# Group by: with DOI vs without DOI
with_doi: dict[str, T] = {}
without_doi: list[T] = []

for work in works:
# Works without a DOI value go straight to the non-DOI group
if not work.doi:
without_doi.append(work)
continue

doi = normalize_doi(work.doi)
if not doi:
without_doi.append(work)
continue

# If DOI already exists, compare priority
if doi in with_doi:
existing = with_doi[doi]

# Higher citation count?
if work.cited_by_count > existing.cited_by_count:
with_doi[doi] = work
# Same citation count, more recent publication?
elif (
work.cited_by_count == existing.cited_by_count
and work.publication_year
and existing.publication_year
and work.publication_year > existing.publication_year
):
with_doi[doi] = work
else:
with_doi[doi] = work

# Combine results: DOI works first, then non-DOI works
return list(with_doi.values()) + without_doi
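
A quick usage sketch (assuming the package imports under `app.utils.literature`). Any object exposing the three protocol attributes satisfies WorkWithDOI, so a small stand-in dataclass is enough to exercise the priority rules:

# Usage sketch with sample records; the Record dataclass is a stand-in
# for demonstration, not a type defined in this PR.
from dataclasses import dataclass

from app.utils.literature import deduplicate_by_doi, normalize_doi


@dataclass
class Record:
    doi: str | None
    cited_by_count: int
    publication_year: int | None


records = [
    Record(doi="https://doi.org/10.1038/nature12345", cited_by_count=50, publication_year=2013),
    Record(doi="DOI: 10.1038/nature12345", cited_by_count=100, publication_year=2014),
    Record(doi=None, cited_by_count=3, publication_year=2020),
]

assert normalize_doi(records[0].doi) == "10.1038/nature12345"

unique = deduplicate_by_doi(records)
assert len(unique) == 2                 # the duplicate DOI collapses to one entry
assert unique[0].cited_by_count == 100  # higher citation count wins for the same DOI
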
80 changes: 80 additions & 0 deletions service/app/utils/literature/models.py
@@ -0,0 +1,80 @@
"""
Shared data models for literature utilities
"""

from dataclasses import dataclass, field
from typing import Any


@dataclass
class SearchRequest:
"""
Standardized search request format for all data sources

Attributes:
query: Search keywords (searches title, abstract, full text)
author: Author name (will be converted to author ID)
institution: Institution name (will be converted to institution ID)
source: Journal or conference name
year_from: Start year (inclusive)
year_to: End year (inclusive)
is_oa: Filter for open access only
work_type: Work type filter ("article", "review", "preprint", etc.)
language: Language code filter (e.g., "en", "zh", "fr")
is_retracted: Filter for retracted works (True to include only retracted, False to exclude)
has_abstract: Filter for works with abstracts
has_fulltext: Filter for works with full text available
sort_by: Sort method - "relevance", "cited_by_count", "publication_date"
max_results: Maximum number of results to return
data_sources: List of data sources to query (None falls back to ["openalex"])
"""

query: str
author: str | None = None
institution: str | None = None
source: str | None = None
year_from: int | None = None
year_to: int | None = None
is_oa: bool | None = None
work_type: str | None = None
language: str | None = None
is_retracted: bool | None = None
has_abstract: bool | None = None
has_fulltext: bool | None = None
sort_by: str = "relevance"
max_results: int = 50
data_sources: list[str] | None = None


@dataclass
class LiteratureWork:
"""
Standardized literature work format across all data sources

Attributes:
id: Internal ID from the data source
doi: Digital Object Identifier (normalized format)
title: Work title
authors: List of author information [{"name": "...", "id": "..."}]
publication_year: Year of publication
cited_by_count: Number of citations
abstract: Abstract text
journal: Journal or venue name
is_oa: Whether open access
oa_url: URL to open access version
source: Data source name ("openalex", "semantic_scholar", etc.)
raw_data: Original data from the source (for debugging)
"""

id: str
doi: str | None
title: str
authors: list[dict[str, str | None]]
publication_year: int | None
cited_by_count: int
abstract: str | None
journal: str | None
is_oa: bool
oa_url: str | None
source: str
raw_data: dict[str, Any] = field(default_factory=dict)
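
A short construction example using made-up sample values (and an assumed `app.utils.literature` import path); it shows which fields are required and which fall back to defaults:

# Illustrative construction of the shared models; field values are sample data only.
from app.utils.literature import LiteratureWork, SearchRequest

request = SearchRequest(
    query="graph neural networks",
    year_from=2020,
    year_to=2024,
    is_oa=True,
    sort_by="cited_by_count",
    max_results=25,
)
assert request.language is None      # optional filters default to None
assert request.data_sources is None  # callers fall back to ["openalex"] per the docstring

work = LiteratureWork(
    id="W123456789",
    doi="10.1038/nature12345",
    title="An example work",
    authors=[{"name": "Ada Lovelace", "id": "A1"}],
    publication_year=2021,
    cited_by_count=42,
    abstract=None,
    journal="Nature",
    is_oa=True,
    oa_url="https://example.org/pdf",
    source="openalex",
)
assert work.raw_data == {}  # only raw_data has a default (empty dict via default_factory)
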