Adds retraction status #314

Merged: 11 commits, Sep 11, 2024
2 changes: 2 additions & 0 deletions paperqa/clients/__init__.py
@@ -13,6 +13,7 @@
from .client_models import MetadataPostProcessor, MetadataProvider
from .crossref import CrossrefProvider
from .journal_quality import JournalQualityPostProcessor
from .retractions import RetrationDataPostProcessor
from .semantic_scholar import SemanticScholarProvider
from .unpaywall import UnpaywallProvider

@@ -32,6 +33,7 @@
    | Sequence[Collection[type[MetadataPostProcessor | MetadataProvider]]]
) = DEFAULT_CLIENTS | {  # type: ignore[operator]
    UnpaywallProvider,
    RetrationDataPostProcessor,
}
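With this change, ALL_CLIENTS includes the retraction post-processor, so a DocMetadataClient built from it will also populate is_retracted. A minimal usage sketch, not part of the diff, assuming DocMetadataClient and ALL_CLIENTS are importable from paperqa.clients as in the tests below; the DOI is a placeholder:

```python
# Sketch of using the expanded ALL_CLIENTS set; the DOI is a placeholder.
import asyncio

import aiohttp

from paperqa.clients import ALL_CLIENTS, DocMetadataClient


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        client = DocMetadataClient(session, clients=ALL_CLIENTS)
        details = await client.query(doi="10.1000/placeholder")  # placeholder DOI
        if details is not None:
            # Set by RetrationDataPostProcessor during post-processing
            print(details.is_retracted)


asyncio.run(main())
```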


121 changes: 121 additions & 0 deletions paperqa/clients/retractions.py
@@ -0,0 +1,121 @@
from __future__ import annotations

import csv
import datetime
import logging
import os

import aiohttp
from anyio import open_file
from pydantic import ValidationError
from tenacity import retry, stop_after_attempt, wait_exponential

from paperqa.types import DocDetails

from .client_models import DOIQuery, MetadataPostProcessor

logger = logging.getLogger(__name__)


class RetrationDataPostProcessor(MetadataPostProcessor[DOIQuery]):
    def __init__(self, retraction_data_path: os.PathLike | str | None = None) -> None:

        if retraction_data_path is None:
            # Construct the path relative to module
            self.retraction_data_path = str(
                os.path.join(
                    os.path.dirname(__file__), "client_data", "retractions.csv"
                )
            )
        else:
            self.retraction_data_path = str(retraction_data_path)

        self.retraction_filter: str = "Retraction"
        self.doi_set: set[str] = set()
        self.columns: list[str] = [
            "RetractionDOI",
            "OriginalPaperDOI",
            "RetractionNature",
        ]

    def _has_cache_expired(self) -> bool:
        creation_time = os.path.getctime(self.retraction_data_path)
        file_creation_date = datetime.datetime.fromtimestamp(creation_time).replace(
            tzinfo=datetime.UTC
        )

        current_time = datetime.datetime.now(datetime.UTC)
        time_difference = current_time - file_creation_date

        return time_difference > datetime.timedelta(days=30)

    def _is_csv_cached(self) -> bool:
        return os.path.exists(self.retraction_data_path)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=5, min=5),
        reraise=True,
    )
    async def _download_retracted_dataset(self) -> None:

        if not (CROSSREF_MAILTO := os.getenv("CROSSREF_MAILTO")):
            CROSSREF_MAILTO = "[email protected]"
        url = f"https://api.labs.crossref.org/data/retractionwatch?{CROSSREF_MAILTO}"

        async with (
            aiohttp.ClientSession() as session,
            session.get(
                url,
                timeout=aiohttp.ClientTimeout(total=300),
            ) as response,
        ):
            response.raise_for_status()

            logger.info(
                f"Retraction data was not cached. Downloading retraction data from {url}..."
            )

            async with await open_file(self.retraction_data_path, "wb") as f:
                while True:
                    chunk = await response.content.read(1024)
                    if not chunk:
                        break
                    await f.write(chunk)

            if os.path.getsize(self.retraction_data_path) == 0:
                raise RuntimeError("Retraction data is empty")
    def _filter_dois(self) -> None:
        with open(self.retraction_data_path, newline="", encoding="utf-8") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                if row[self.columns[2]] == self.retraction_filter:
                    self.doi_set.add(row[self.columns[0]])
                    self.doi_set.add(row[self.columns[1]])

    async def load_data(self) -> None:
        if not self._is_csv_cached() or self._has_cache_expired():
            await self._download_retracted_dataset()

        self._filter_dois()

        if not self.doi_set:
            raise RuntimeError("Retraction data was not found.")

    async def _process(self, query: DOIQuery, doc_details: DocDetails) -> DocDetails:
        if not self.doi_set:
            await self.load_data()

        return doc_details + DocDetails(  # type: ignore[call-arg]
            is_retracted=query.doi in self.doi_set
        )

    def query_creator(self, doc_details: DocDetails, **kwargs) -> DOIQuery | None:
        try:
            return DOIQuery(doi=doc_details.doi, **kwargs)
        except ValidationError:
            logger.debug(
                f"Must have a valid DOI to query retraction data: {doc_details.doi}"
            )
            return None
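For reference, a small standalone sketch of how this post-processor could be exercised directly, not part of the PR. The CSV path is a hypothetical local cache; load_data downloads the Retraction Watch dump from the Crossref Labs endpoint above when no fresh cache exists:

```python
# Standalone sketch; "retractions.csv" is a hypothetical cache path, and the
# DOI checked at the end is a placeholder.
import asyncio

from paperqa.clients.retractions import RetrationDataPostProcessor


async def main() -> None:
    processor = RetrationDataPostProcessor("retractions.csv")
    await processor.load_data()  # downloads if needed, then filters retracted DOIs
    print(f"{len(processor.doi_set)} DOIs flagged as retracted")
    print("10.1000/placeholder" in processor.doi_set)  # simple membership check


asyncio.run(main())
```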
8 changes: 8 additions & 0 deletions paperqa/types.py
@@ -320,6 +320,9 @@ class DocDetails(Doc):
        " We use None as a sentinel for unset values (like for determining hydration) "
        " So, we use -1 means unknown quality and None means it needs to be hydrated.",
    )
    is_retracted: bool | None = Field(
        default=None, description="Flag for whether the paper is retracted."
    )
    doi: str | None = None
    doi_url: str | None = None
    doc_id: str | None = None
@@ -538,6 +541,10 @@ def __getitem__(self, item: str):

    @property
    def formatted_citation(self) -> str:

        if self.is_retracted:
            return f"**RETRACTED ARTICLE** Citation: {self.citation} Retrieved from http://retractiondatabase.org/."

        if (
            self.citation is None  # type: ignore[redundant-expr]
            or self.citation_count is None
@@ -551,6 +558,7 @@ def formatted_citation(self) -> str:
            if self.source_quality >= 0
            else None
        )

        if quality is None:
            return f"{self.citation} This article has {self.citation_count} citations."
        return (
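The effect of the formatted_citation change, roughly. This is a sketch with made-up field values; it assumes DocDetails can be constructed with only these fields set and that its validators fill in the remaining required Doc fields:

```python
# Rough illustration with made-up values; constructibility with only these
# fields is an assumption, mirroring the type: ignore used in the PR itself.
from paperqa.types import DocDetails

details = DocDetails(citation="Doe J. (2020). Example paper.", is_retracted=True)  # type: ignore[call-arg]
print(details.formatted_citation)
# Expected output:
# **RETRACTED ARTICLE** Citation: Doe J. (2020). Example paper. Retrieved from http://retractiondatabase.org/.
```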
47 changes: 45 additions & 2 deletions tests/test_clients.py
@@ -16,6 +16,7 @@
)
from paperqa.clients.client_models import MetadataPostProcessor, MetadataProvider
from paperqa.clients.journal_quality import JournalQualityPostProcessor
from paperqa.clients.retractions import RetrationDataPostProcessor


@pytest.mark.vcr
@@ -94,7 +95,17 @@
@pytest.mark.asyncio
async def test_title_search(paper_attributes: dict[str, str]):
    async with aiohttp.ClientSession() as session:
        client = DocMetadataClient(session, clients=ALL_CLIENTS)
        client_list = list(ALL_CLIENTS)
        client_list.remove(RetrationDataPostProcessor)
        client = DocMetadataClient(
            session,
            clients=cast(
                Collection[
                    type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
                ],
                client_list,
            ),
        )
        details = await client.query(title=paper_attributes["title"])
        assert set(details.other["client_source"]) == set(  # type: ignore[union-attr]
            paper_attributes["source"]
@@ -180,7 +191,17 @@ async def test_title_search(paper_attributes: dict[str, str]):
@pytest.mark.asyncio
async def test_doi_search(paper_attributes: dict[str, str]):
    async with aiohttp.ClientSession() as session:
        client = DocMetadataClient(session, clients=ALL_CLIENTS)
        client_list = list(ALL_CLIENTS)
        client_list.remove(RetrationDataPostProcessor)
        client = DocMetadataClient(
            session,
            clients=cast(
                Collection[
                    type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
                ],
                client_list,
            ),
        )
        details = await client.query(doi=paper_attributes["doi"])
        assert set(details.other["client_source"]) == set(  # type: ignore[union-attr]
            paper_attributes["source"]
@@ -505,3 +526,25 @@ async def test_ensure_sequential_run_early_stop(
        record_indices["semantic_scholar"] != -1
    ), "Semantic Scholar should be found"
    assert record_indices["early_stop"] != -1, "We should stop early."


@pytest.mark.asyncio
async def test_crossref_retraction_status():
    async with aiohttp.ClientSession() as session:
        crossref_client = DocMetadataClient(
            session,
            clients=cast(
                Collection[
                    type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
                ],
                [CrossrefProvider, RetrationDataPostProcessor],
            ),
        )
        crossref_details = await crossref_client.query(
            title="The Dilemma and Countermeasures of Music Education under the Background of Big Data",
            fields=["title", "doi", "authors", "journal"],
        )

        assert "**RETRACTED ARTICLE** Citation: Jiaye Han." in crossref_details.formatted_citation  # type: ignore[union-attr]

        assert crossref_details.is_retracted is True, "Should be retracted"  # type: ignore[union-attr]
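Outside the test harness, the same two-client setup might look like the sketch below. The email address and DOI are placeholders; CROSSREF_MAILTO is read by the downloader and falls back to a default when unset, and the clients are passed as a plain list here for brevity where the test uses a cast for type checking:

```python
# Sketch mirroring the test above; email and DOI are placeholders, not values
# from this PR.
import asyncio
import os

import aiohttp

from paperqa.clients import DocMetadataClient
from paperqa.clients.crossref import CrossrefProvider
from paperqa.clients.retractions import RetrationDataPostProcessor


async def main() -> None:
    os.environ.setdefault("CROSSREF_MAILTO", "you@example.org")  # placeholder email
    async with aiohttp.ClientSession() as session:
        client = DocMetadataClient(
            session, clients=[CrossrefProvider, RetrationDataPostProcessor]
        )
        details = await client.query(doi="10.1000/placeholder")  # placeholder DOI
        if details is not None and details.is_retracted:
            print(details.formatted_citation)


asyncio.run(main())
```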