400 changes: 400 additions & 0 deletions service/app/mcp/literature.py

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions service/app/utils/literature/__init__.py
@@ -0,0 +1,17 @@
"""
Literature search utilities for multi-source academic literature retrieval
"""

from .base_client import BaseLiteratureClient
from .doi_cleaner import deduplicate_by_doi, normalize_doi
from .models import LiteratureWork, SearchRequest
from .work_distributor import WorkDistributor

__all__ = [
"BaseLiteratureClient",
"normalize_doi",
"deduplicate_by_doi",
"SearchRequest",
"LiteratureWork",
"WorkDistributor",
]
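
For illustration, downstream code would pull this public API from the package root. A minimal sketch, assuming the service's import root is `app` (not confirmed by this diff):

# Assumed import root ("app"); adjust to how the service package is actually installed.
from app.utils.literature import (
    BaseLiteratureClient,
    LiteratureWork,
    SearchRequest,
    WorkDistributor,
    deduplicate_by_doi,
    normalize_doi,
)
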
32 changes: 32 additions & 0 deletions service/app/utils/literature/base_client.py
@@ -0,0 +1,32 @@
"""
Abstract base class for literature data source clients
"""

from abc import ABC, abstractmethod

from .models import LiteratureWork, SearchRequest


class BaseLiteratureClient(ABC):
"""
Base class for literature data source clients

All data source implementations (OpenAlex, Semantic Scholar, PubMed, etc.)
should inherit from this class and implement the required methods.
"""

@abstractmethod
async def search(self, request: SearchRequest) -> tuple[list[LiteratureWork], list[str]]:
"""
Execute search and return results in standard format

Args:
request: Standardized search request

Returns:
Tuple of (works, warnings) where warnings is a list of messages for LLM feedback

Raises:
Exception: If search fails after retries
"""
pass
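
To make the contract concrete, here is a minimal sketch of a conforming subclass. The `StubClient` name, the canned payload shape, and its field names are illustrative assumptions, not code from this PR; a real client would call an HTTP API and apply retries before mapping results:

# Minimal illustrative subclass; the raw payload shape and key names are
# made-up assumptions, not the schema of any real data source in this PR.
from .base_client import BaseLiteratureClient
from .models import LiteratureWork, SearchRequest


class StubClient(BaseLiteratureClient):
    """Maps a canned payload into the standard format; real clients would query an API."""

    def __init__(self, payload: list[dict]) -> None:
        self._payload = payload

    async def search(self, request: SearchRequest) -> tuple[list[LiteratureWork], list[str]]:
        warnings: list[str] = []
        works = [
            LiteratureWork(
                id=item.get("id", ""),
                doi=item.get("doi"),
                title=item.get("title", ""),
                authors=[{"name": a.get("name"), "id": a.get("id")} for a in item.get("authors", [])],
                publication_year=item.get("year"),
                cited_by_count=item.get("citations", 0),
                abstract=item.get("abstract"),
                journal=item.get("journal"),
                is_oa=bool(item.get("is_oa", False)),
                oa_url=item.get("oa_url"),
                source="stub",
                raw_data=item,
            )
            for item in self._payload[: request.max_results]
        ]
        if not works:
            warnings.append(f"No results for query: {request.query!r}")
        return works, warnings
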
116 changes: 116 additions & 0 deletions service/app/utils/literature/doi_cleaner.py
@@ -0,0 +1,116 @@
"""
DOI normalization and deduplication utilities
"""

import re
from typing import Protocol, TypeVar


class WorkWithDOI(Protocol):
"""Protocol for objects with DOI and citation information"""

doi: str | None
cited_by_count: int
publication_year: int | None


T = TypeVar("T", bound=WorkWithDOI)


def normalize_doi(doi: str | None) -> str | None:
"""
Normalize DOI format to standard form

Removes common prefixes, validates format, and converts to lowercase.
DOI specification (ISO 26324) defines DOI matching as case-insensitive,
so lowercase conversion is safe and improves consistency.

Args:
doi: DOI string in any common format

Returns:
Normalized DOI (e.g., "10.1038/nature12345") or None if invalid

Examples:
>>> normalize_doi("https://doi.org/10.1038/nature12345")
'10.1038/nature12345'
>>> normalize_doi("DOI: 10.1038/nature12345")
'10.1038/nature12345'
>>> normalize_doi("doi:10.1038/nature12345")
'10.1038/nature12345'
"""
if not doi:
return None

doi = doi.strip().lower()

# Remove common prefixes
doi = re.sub(r"^(https?://)?(dx\.)?doi\.org/", "", doi)
doi = re.sub(r"^doi:\s*", "", doi)

# Validate format (10.xxxx/yyyy)
return doi if re.match(r"^10\.\d+/.+", doi) else None


def deduplicate_by_doi(works: list[T]) -> list[T]:
"""
Deduplicate works by DOI, keeping the highest priority version

Priority rules:
1. Works with DOI take priority over those without
2. For same DOI, keep the one with higher citation count
3. If citation count is equal, keep the most recently published

Args:
works: List of works exposing doi, cited_by_count, and publication_year (any WorkWithDOI)

Returns:
Deduplicated list of works

Examples:
>>> works = [
... LiteratureWork(doi="10.1038/1", cited_by_count=100, ...),
... LiteratureWork(doi="10.1038/1", cited_by_count=50, ...),
... LiteratureWork(doi=None, ...),
... ]
>>> unique = deduplicate_by_doi(works)
>>> len(unique)
2
>>> unique[0].cited_by_count
100
"""
# Group by: with DOI vs without DOI
with_doi: dict[str, T] = {}
without_doi: list[T] = []

for work in works:
# Works without a DOI value go straight to the non-DOI group
if not work.doi:
without_doi.append(work)
continue

doi = normalize_doi(work.doi)
if not doi:
without_doi.append(work)
continue

# If DOI already exists, compare priority
if doi in with_doi:
existing = with_doi[doi]

# Higher citation count?
if work.cited_by_count > existing.cited_by_count:
with_doi[doi] = work
# Same citation count, more recent publication?
elif (
work.cited_by_count == existing.cited_by_count
and work.publication_year
and existing.publication_year
and work.publication_year > existing.publication_year
):
with_doi[doi] = work
else:
with_doi[doi] = work

# Combine results: DOI works first, then non-DOI works
return list(with_doi.values()) + without_doi
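
A quick usage sketch (assuming the package imports under `app.utils.literature`). Any object exposing the three protocol attributes satisfies WorkWithDOI, so a small stand-in dataclass is enough to exercise the priority rules:

# Usage sketch with sample records; the Record dataclass is a stand-in
# for demonstration, not a type defined in this PR.
from dataclasses import dataclass

from app.utils.literature import deduplicate_by_doi, normalize_doi


@dataclass
class Record:
    doi: str | None
    cited_by_count: int
    publication_year: int | None


records = [
    Record(doi="https://doi.org/10.1038/nature12345", cited_by_count=50, publication_year=2013),
    Record(doi="DOI: 10.1038/nature12345", cited_by_count=100, publication_year=2014),
    Record(doi=None, cited_by_count=3, publication_year=2020),
]

assert normalize_doi(records[0].doi) == "10.1038/nature12345"

unique = deduplicate_by_doi(records)
assert len(unique) == 2                 # the duplicate DOI collapses to one entry
assert unique[0].cited_by_count == 100  # higher citation count wins for the same DOI
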
80 changes: 80 additions & 0 deletions service/app/utils/literature/models.py
@@ -0,0 +1,80 @@
"""
Shared data models for literature utilities
"""

from dataclasses import dataclass, field
from typing import Any


@dataclass
class SearchRequest:
"""
Standardized search request format for all data sources

Attributes:
query: Search keywords (searches title, abstract, full text)
author: Author name (will be converted to author ID)
institution: Institution name (will be converted to institution ID)
source: Journal or conference name
year_from: Start year (inclusive)
year_to: End year (inclusive)
is_oa: Filter for open access only
work_type: Work type filter ("article", "review", "preprint", etc.)
language: Language code filter (e.g., "en", "zh", "fr")
is_retracted: Filter for retracted works (True to include only retracted, False to exclude)
has_abstract: Filter for works with abstracts
has_fulltext: Filter for works with full text available
sort_by: Sort method - "relevance", "cited_by_count", "publication_date"
max_results: Maximum number of results to return
data_sources: List of data sources to query (None falls back to ["openalex"])
"""

query: str
author: str | None = None
institution: str | None = None
source: str | None = None
year_from: int | None = None
year_to: int | None = None
is_oa: bool | None = None
work_type: str | None = None
language: str | None = None
is_retracted: bool | None = None
has_abstract: bool | None = None
has_fulltext: bool | None = None
sort_by: str = "relevance"
max_results: int = 50
data_sources: list[str] | None = None


@dataclass
class LiteratureWork:
"""
Standardized literature work format across all data sources

Attributes:
id: Internal ID from the data source
doi: Digital Object Identifier (normalized format)
title: Work title
authors: List of author information [{"name": "...", "id": "..."}]
publication_year: Year of publication
cited_by_count: Number of citations
abstract: Abstract text
journal: Journal or venue name
is_oa: Whether open access
oa_url: URL to open access version
source: Data source name ("openalex", "semantic_scholar", etc.)
raw_data: Original data from the source (for debugging)
"""

id: str
doi: str | None
title: str
authors: list[dict[str, str | None]]
publication_year: int | None
cited_by_count: int
abstract: str | None
journal: str | None
is_oa: bool
oa_url: str | None
source: str
raw_data: dict[str, Any] = field(default_factory=dict)
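
A short construction example using made-up sample values (and an assumed `app.utils.literature` import path); it shows which fields are required and which fall back to defaults:

# Illustrative construction of the shared models; field values are sample data only.
from app.utils.literature import LiteratureWork, SearchRequest

request = SearchRequest(
    query="graph neural networks",
    year_from=2020,
    year_to=2024,
    is_oa=True,
    sort_by="cited_by_count",
    max_results=25,
)
assert request.language is None      # optional filters default to None
assert request.data_sources is None  # callers fall back to ["openalex"] per the docstring

work = LiteratureWork(
    id="W123456789",
    doi="10.1038/nature12345",
    title="An example work",
    authors=[{"name": "Ada Lovelace", "id": "A1"}],
    publication_year=2021,
    cited_by_count=42,
    abstract=None,
    journal="Nature",
    is_oa=True,
    oa_url="https://example.org/pdf",
    source="openalex",
)
assert work.raw_data == {}  # only raw_data has a default (empty dict via default_factory)
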