Adds retraction status #314

Merged: 11 commits, Sep 11, 2024
2 changes: 2 additions & 0 deletions paperqa/clients/__init__.py
@@ -13,6 +13,7 @@
from .client_models import MetadataPostProcessor, MetadataProvider
from .crossref import CrossrefProvider
from .journal_quality import JournalQualityPostProcessor
from .retractions import RetrationDataPostProcessor
from .semantic_scholar import SemanticScholarProvider
from .unpaywall import UnpaywallProvider

@@ -32,6 +33,7 @@
    | Sequence[Collection[type[MetadataPostProcessor | MetadataProvider]]]
) = DEFAULT_CLIENTS | {  # type: ignore[operator]
    UnpaywallProvider,
    RetrationDataPostProcessor,
}
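With this change, ALL_CLIENTS includes the retraction post-processor, so a DocMetadataClient built from it will also populate is_retracted. A minimal usage sketch, not part of the diff, assuming DocMetadataClient and ALL_CLIENTS are importable from paperqa.clients as in the tests below; the DOI is a placeholder:

```python
# Sketch of using the expanded ALL_CLIENTS set; the DOI is a placeholder.
import asyncio

import aiohttp

from paperqa.clients import ALL_CLIENTS, DocMetadataClient


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        client = DocMetadataClient(session, clients=ALL_CLIENTS)
        details = await client.query(doi="10.1000/placeholder")  # placeholder DOI
        if details is not None:
            # Set by RetrationDataPostProcessor during post-processing
            print(details.is_retracted)


asyncio.run(main())
```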


121 changes: 121 additions & 0 deletions paperqa/clients/retractions.py
@@ -0,0 +1,121 @@
from __future__ import annotations

import csv
import datetime
import logging
import os

import aiohttp
from anyio import open_file
from pydantic import ValidationError
from tenacity import retry, stop_after_attempt, wait_exponential

from paperqa.types import DocDetails

from .client_models import DOIQuery, MetadataPostProcessor

logger = logging.getLogger(__name__)


class RetrationDataPostProcessor(MetadataPostProcessor[DOIQuery]):
    def __init__(self, retraction_data_path: os.PathLike | str | None = None) -> None:

        if retraction_data_path is None:
            # Construct the path relative to module
            self.retraction_data_path = str(
                os.path.join(
                    os.path.dirname(__file__), "client_data", "retractions.csv"
                )
            )
        else:
            self.retraction_data_path = str(retraction_data_path)

        self.retraction_filter: str = "Retraction"
        self.doi_set: set[str] = set()
        self.columns: list[str] = [
            "RetractionDOI",
            "OriginalPaperDOI",
            "RetractionNature",
        ]

    def _has_cache_expired(self) -> bool:
        creation_time = os.path.getctime(self.retraction_data_path)
        file_creation_date = datetime.datetime.fromtimestamp(creation_time).replace(
            tzinfo=datetime.UTC
        )

        current_time = datetime.datetime.now(datetime.UTC)
        time_difference = current_time - file_creation_date

        return time_difference > datetime.timedelta(days=30)

    def _is_csv_cached(self) -> bool:
        return os.path.exists(self.retraction_data_path)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=5, min=5),
        reraise=True,
    )
    async def _download_retracted_dataset(self) -> None:

        if not (CROSSREF_MAILTO := os.getenv("CROSSREF_MAILTO")):
            CROSSREF_MAILTO = "[email protected]"
        url = f"https://api.labs.crossref.org/data/retractionwatch?{CROSSREF_MAILTO}"

        async with (
            aiohttp.ClientSession() as session,
            session.get(
                url,
                timeout=aiohttp.ClientTimeout(total=300),
            ) as response,
        ):
            response.raise_for_status()

            logger.info(
                f"Retraction data was not cached. Downloading retraction data from {url}..."
            )

            async with await open_file(self.retraction_data_path, "wb") as f:
                while True:
                    chunk = await response.content.read(1024)
                    if not chunk:
                        break
                    await f.write(chunk)

            if os.path.getsize(self.retraction_data_path) == 0:
                raise RuntimeError("Retraction data is empty")
    def _filter_dois(self) -> None:
        with open(self.retraction_data_path, newline="", encoding="utf-8") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                if row[self.columns[2]] == self.retraction_filter:
                    self.doi_set.add(row[self.columns[0]])
                    self.doi_set.add(row[self.columns[1]])

    async def load_data(self) -> None:
        if not self._is_csv_cached() or self._has_cache_expired():
            await self._download_retracted_dataset()

        self._filter_dois()

        if not self.doi_set:
            raise RuntimeError("Retraction data was not found.")

    async def _process(self, query: DOIQuery, doc_details: DocDetails) -> DocDetails:
        if not self.doi_set:
            await self.load_data()

        return doc_details + DocDetails(  # type: ignore[call-arg]
            is_retracted=query.doi in self.doi_set
        )

    def query_creator(self, doc_details: DocDetails, **kwargs) -> DOIQuery | None:
        try:
            return DOIQuery(doi=doc_details.doi, **kwargs)
        except ValidationError:
            logger.debug(
                f"Must have a valid DOI to query retraction data: {doc_details.doi}"
            )
            return None
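For reference, a small standalone sketch of how this post-processor could be exercised directly, not part of the PR. The CSV path is a hypothetical local cache; load_data downloads the Retraction Watch dump from the Crossref Labs endpoint above when no fresh cache exists:

```python
# Standalone sketch; "retractions.csv" is a hypothetical cache path, and the
# DOI checked at the end is a placeholder.
import asyncio

from paperqa.clients.retractions import RetrationDataPostProcessor


async def main() -> None:
    processor = RetrationDataPostProcessor("retractions.csv")
    await processor.load_data()  # downloads if needed, then filters retracted DOIs
    print(f"{len(processor.doi_set)} DOIs flagged as retracted")
    print("10.1000/placeholder" in processor.doi_set)  # simple membership check


asyncio.run(main())
```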
8 changes: 8 additions & 0 deletions paperqa/types.py
@@ -320,6 +320,9 @@ class DocDetails(Doc):
        " We use None as a sentinel for unset values (like for determining hydration) "
        " So, we use -1 means unknown quality and None means it needs to be hydrated.",
    )
    is_retracted: bool | None = Field(
        default=None, description="Flag for whether the paper is retracted."
    )
    doi: str | None = None
    doi_url: str | None = None
    doc_id: str | None = None
@@ -538,6 +541,10 @@ def __getitem__(self, item: str):

    @property
    def formatted_citation(self) -> str:

        if self.is_retracted:
            return f"**RETRACTED ARTICLE** Citation: {self.citation} Retrieved from http://retractiondatabase.org/."

        if (
            self.citation is None  # type: ignore[redundant-expr]
            or self.citation_count is None
@@ -551,6 +558,7 @@ def formatted_citation(self) -> str:
            if self.source_quality >= 0
            else None
        )

        if quality is None:
            return f"{self.citation} This article has {self.citation_count} citations."
        return (
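The effect of the formatted_citation change, roughly. This is a sketch with made-up field values; it assumes DocDetails can be constructed with only these fields set and that its validators fill in the remaining required Doc fields:

```python
# Rough illustration with made-up values; constructibility with only these
# fields is an assumption, mirroring the type: ignore used in the PR itself.
from paperqa.types import DocDetails

details = DocDetails(citation="Doe J. (2020). Example paper.", is_retracted=True)  # type: ignore[call-arg]
print(details.formatted_citation)
# Expected output:
# **RETRACTED ARTICLE** Citation: Doe J. (2020). Example paper. Retrieved from http://retractiondatabase.org/.
```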
47 changes: 45 additions & 2 deletions tests/test_clients.py
@@ -16,6 +16,7 @@
)
from paperqa.clients.client_models import MetadataPostProcessor, MetadataProvider
from paperqa.clients.journal_quality import JournalQualityPostProcessor
from paperqa.clients.retractions import RetrationDataPostProcessor


@pytest.mark.vcr
@@ -94,7 +95,17 @@
@pytest.mark.asyncio
async def test_title_search(paper_attributes: dict[str, str]):
    async with aiohttp.ClientSession() as session:
        client = DocMetadataClient(session, clients=ALL_CLIENTS)
        client_list = list(ALL_CLIENTS)
        client_list.remove(RetrationDataPostProcessor)
        client = DocMetadataClient(
            session,
            clients=cast(
                Collection[
                    type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
                ],
                client_list,
            ),
        )
        details = await client.query(title=paper_attributes["title"])
        assert set(details.other["client_source"]) == set(  # type: ignore[union-attr]
            paper_attributes["source"]
@@ -180,7 +191,17 @@ async def test_title_search(paper_attributes: dict[str, str]):
@pytest.mark.asyncio
async def test_doi_search(paper_attributes: dict[str, str]):
    async with aiohttp.ClientSession() as session:
        client = DocMetadataClient(session, clients=ALL_CLIENTS)
        client_list = list(ALL_CLIENTS)
        client_list.remove(RetrationDataPostProcessor)
        client = DocMetadataClient(
            session,
            clients=cast(
                Collection[
                    type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
                ],
                client_list,
            ),
        )
        details = await client.query(doi=paper_attributes["doi"])
        assert set(details.other["client_source"]) == set(  # type: ignore[union-attr]
            paper_attributes["source"]
@@ -505,3 +526,25 @@ async def test_ensure_sequential_run_early_stop(
        record_indices["semantic_scholar"] != -1
    ), "Semantic Scholar should be found"
    assert record_indices["early_stop"] != -1, "We should stop early."


@pytest.mark.asyncio
async def test_crossref_retraction_status():
    async with aiohttp.ClientSession() as session:
        crossref_client = DocMetadataClient(
            session,
            clients=cast(
                Collection[
                    type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
                ],
                [CrossrefProvider, RetrationDataPostProcessor],
            ),
        )
        crossref_details = await crossref_client.query(
            title="The Dilemma and Countermeasures of Music Education under the Background of Big Data",
            fields=["title", "doi", "authors", "journal"],
        )

        assert "**RETRACTED ARTICLE** Citation: Jiaye Han." in crossref_details.formatted_citation  # type: ignore[union-attr]

        assert crossref_details.is_retracted is True, "Should be retracted"  # type: ignore[union-attr]
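Outside the test harness, the same two-client setup might look like the sketch below. The email address and DOI are placeholders; CROSSREF_MAILTO is read by the downloader and falls back to a default when unset, and the clients are passed as a plain list here for brevity where the test uses a cast for type checking:

```python
# Sketch mirroring the test above; email and DOI are placeholders, not values
# from this PR.
import asyncio
import os

import aiohttp

from paperqa.clients import DocMetadataClient
from paperqa.clients.crossref import CrossrefProvider
from paperqa.clients.retractions import RetrationDataPostProcessor


async def main() -> None:
    os.environ.setdefault("CROSSREF_MAILTO", "you@example.org")  # placeholder email
    async with aiohttp.ClientSession() as session:
        client = DocMetadataClient(
            session, clients=[CrossrefProvider, RetrationDataPostProcessor]
        )
        details = await client.query(doi="10.1000/placeholder")  # placeholder DOI
        if details is not None and details.is_retracted:
            print(details.formatted_citation)


asyncio.run(main())
```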