Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds retraction status #314

Merged
merged 11 commits into from
Sep 11, 2024
2 changes: 2 additions & 0 deletions paperqa/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .client_models import MetadataPostProcessor, MetadataProvider
from .crossref import CrossrefProvider
from .journal_quality import JournalQualityPostProcessor
from .retractions import RetrationDataPostProcessor
from .semantic_scholar import SemanticScholarProvider
from .unpaywall import UnpaywallProvider

Expand All @@ -32,6 +33,7 @@
| Sequence[Collection[type[MetadataPostProcessor | MetadataProvider]]]
) = DEFAULT_CLIENTS | { # type: ignore[operator]
UnpaywallProvider,
RetrationDataPostProcessor,
}


Expand Down
122 changes: 122 additions & 0 deletions paperqa/clients/retractions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from __future__ import annotations

import csv
import datetime
import logging
import os

import aiohttp
from anyio import open_file
from pydantic import ValidationError
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm.asyncio import tqdm
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tqdm is not one of our dependencies - will have to drop that.


from paperqa.types import DocDetails

from .client_models import DOIQuery, MetadataPostProcessor

logger = logging.getLogger(__name__)


class RetrationDataPostProcessor(MetadataPostProcessor[DOIQuery]):
def __init__(self, retraction_data_path: os.PathLike | str | None = None) -> None:
    """Initialize the post-processor, resolving where the retraction CSV lives.

    Args:
        retraction_data_path: Optional path to a cached Retraction Watch CSV.
            Defaults to ``client_data/retractions.csv`` next to this module.
    """
    if retraction_data_path is None:
        # Fall back to the CSV bundled alongside this module.
        default_path = os.path.join(
            os.path.dirname(__file__), "client_data", "retractions.csv"
        )
        self.retraction_data_path = str(default_path)
    else:
        self.retraction_data_path = str(retraction_data_path)

    # Only rows whose RetractionNature equals this value count as retractions.
    self.retraction_filter: str = "Retraction"
    # DOIs of retraction notices and retracted papers; filled lazily.
    self.doi_set: set[str] = set()
    # Columns consumed from the Retraction Watch CSV, in the order used below.
    self.columns: list[str] = [
        "RetractionDOI",
        "OriginalPaperDOI",
        "RetractionNature",
    ]

def _has_cache_expired(self) -> bool:
    """Return True when the cached retraction CSV is older than 30 days.

    Uses the file's ctime as a proxy for when the dataset was downloaded.
    """
    creation_time = os.path.getctime(self.retraction_data_path)
    # BUG FIX: fromtimestamp() without a tz returns naive *local* time;
    # the previous .replace(tzinfo=datetime.UTC) merely relabeled that
    # local time as UTC, skewing the age check by the local UTC offset.
    # Passing tz= converts the timestamp to UTC correctly.
    file_creation_date = datetime.datetime.fromtimestamp(
        creation_time, tz=datetime.UTC
    )

    current_time = datetime.datetime.now(datetime.UTC)
    time_difference = current_time - file_creation_date

    return time_difference > datetime.timedelta(days=30)

def _is_csv_cached(self) -> bool:
    """Return True if the retraction CSV already exists on disk."""
    return os.path.exists(self.retraction_data_path)

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=5, min=5),
    reraise=True,
)
async def _download_retracted_dataset(self) -> None:
    """Download the Retraction Watch dataset CSV from Crossref Labs.

    Streams the response body to ``self.retraction_data_path`` in 1 KiB
    chunks. Progress is reported via ``logging`` rather than tqdm, since
    tqdm is not a project dependency. Retries up to 3 times with
    exponential backoff on failure.

    Raises:
        RuntimeError: If the downloaded file is empty.
    """
    if not (CROSSREF_MAILTO := os.getenv("CROSSREF_MAILTO")):
        # NOTE(review): this default contact address looks scrubbed by the
        # page extraction — confirm the real fallback email.
        CROSSREF_MAILTO = "[email protected]"
    url = f"https://api.labs.crossref.org/data/retractionwatch?{CROSSREF_MAILTO}"

    async with (
        aiohttp.ClientSession() as session,
        session.get(
            url,
            timeout=aiohttp.ClientTimeout(total=300),
        ) as response,
    ):
        response.raise_for_status()

        logger.info(
            "Downloading retraction data to %s.", self.retraction_data_path
        )
        bytes_downloaded = 0
        async with await open_file(self.retraction_data_path, "wb") as f:
            while chunk := await response.content.read(1024):
                await f.write(chunk)
                bytes_downloaded += len(chunk)
        logger.info("Downloaded %d bytes of retraction data.", bytes_downloaded)

    if os.path.getsize(self.retraction_data_path) == 0:
        raise RuntimeError("Retraction data is empty")

def _filter_dois(self) -> None:
    """Collect DOIs of retracted papers (and their notices) into ``self.doi_set``."""
    retraction_doi_col, original_doi_col, nature_col = self.columns
    with open(self.retraction_data_path, newline="", encoding="utf-8") as csvfile:
        for row in csv.DictReader(csvfile):
            # Keep only genuine retractions (not corrections/concerns).
            if row[nature_col] == self.retraction_filter:
                self.doi_set.update((row[retraction_doi_col], row[original_doi_col]))

async def load_data(self) -> None:
    """Ensure the retraction CSV is present and fresh, then index its DOIs.

    Raises:
        RuntimeError: If no retraction DOIs were found after loading.
    """
    needs_download = not self._is_csv_cached() or self._has_cache_expired()
    if needs_download:
        await self._download_retracted_dataset()

    self._filter_dois()

    if not self.doi_set:
        raise RuntimeError("Retraction data was not found.")

async def _process(self, query: DOIQuery, doc_details: DocDetails) -> DocDetails:
    """Merge a retraction flag for the queried DOI into the document details."""
    if not self.doi_set:
        # Lazily load the retraction dataset on first use.
        await self.load_data()

    retracted = query.doi in self.doi_set
    return doc_details + DocDetails(is_retracted=retracted)  # type: ignore[call-arg]

def query_creator(self, doc_details: DocDetails, **kwargs) -> DOIQuery | None:
    """Build a DOIQuery from the document's DOI, or None if it is invalid.

    Args:
        doc_details: Document whose ``doi`` seeds the query.
        **kwargs: Extra fields forwarded to ``DOIQuery``.

    Returns:
        A ``DOIQuery``, or ``None`` when validation fails (no usable DOI).
    """
    try:
        return DOIQuery(doi=doc_details.doi, **kwargs)
    except ValidationError:
        # Lazy %-style args: the message is only formatted if DEBUG is
        # enabled (the original eagerly built an f-string and had a
        # missing/stray space around the DOI).
        logger.debug(
            "Must have a valid DOI to query retraction data: %s", doc_details.doi
        )
        return None
8 changes: 8 additions & 0 deletions paperqa/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,9 @@ class DocDetails(Doc):
" We use None as a sentinel for unset values (like for determining hydration) "
" So, we use -1 means unknown quality and None means it needs to be hydrated.",
)
is_retracted: bool | None = Field(
geemi725 marked this conversation as resolved.
Show resolved Hide resolved
default=None, description="Flag for whether the paper is retracted."
)
doi: str | None = None
doi_url: str | None = None
doc_id: str | None = None
Expand Down Expand Up @@ -538,6 +541,10 @@ def __getitem__(self, item: str):

@property
def formatted_citation(self) -> str:

if self.is_retracted:
return f"RETRACTED ARTICLE! Original doi: {self.doi}. Retrieved from http://retractiondatabase.org/."

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should keep the original citation though - so that you can see title, author, etc.

if (
self.citation is None # type: ignore[redundant-expr]
or self.citation_count is None
Expand All @@ -551,6 +558,7 @@ def formatted_citation(self) -> str:
if self.source_quality >= 0
else None
)

if quality is None:
return f"{self.citation} This article has {self.citation_count} citations."
return (
Expand Down
49 changes: 47 additions & 2 deletions tests/test_clients.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)
from paperqa.clients.client_models import MetadataPostProcessor, MetadataProvider
from paperqa.clients.journal_quality import JournalQualityPostProcessor
from paperqa.clients.retractions import RetrationDataPostProcessor


@pytest.mark.vcr
Expand Down Expand Up @@ -94,7 +95,17 @@
@pytest.mark.asyncio
async def test_title_search(paper_attributes: dict[str, str]):
async with aiohttp.ClientSession() as session:
client = DocMetadataClient(session, clients=ALL_CLIENTS)
client_list = list(ALL_CLIENTS)
client_list.remove(RetrationDataPostProcessor)
client = DocMetadataClient(
session,
clients=cast(
Collection[
type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
],
client_list,
),
)
details = await client.query(title=paper_attributes["title"])
assert set(details.other["client_source"]) == set( # type: ignore[union-attr]
paper_attributes["source"]
Expand Down Expand Up @@ -180,7 +191,17 @@ async def test_title_search(paper_attributes: dict[str, str]):
@pytest.mark.asyncio
async def test_doi_search(paper_attributes: dict[str, str]):
async with aiohttp.ClientSession() as session:
client = DocMetadataClient(session, clients=ALL_CLIENTS)
client_list = list(ALL_CLIENTS)
client_list.remove(RetrationDataPostProcessor)
client = DocMetadataClient(
session,
clients=cast(
Collection[
type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
],
client_list,
),
)
details = await client.query(doi=paper_attributes["doi"])
assert set(details.other["client_source"]) == set( # type: ignore[union-attr]
paper_attributes["source"]
Expand Down Expand Up @@ -505,3 +526,27 @@ async def test_ensure_sequential_run_early_stop(
record_indices["semantic_scholar"] != -1
), "Semantic Scholar should be found"
assert record_indices["early_stop"] != -1, "We should stop early."


@pytest.mark.asyncio
async def test_crossref_retraction_status():
    """A known retracted paper should be flagged by the retraction post-processor."""
    async with aiohttp.ClientSession() as session:
        crossref_client = DocMetadataClient(
            session,
            clients=cast(
                Collection[
                    type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
                ],
                [CrossrefProvider, RetrationDataPostProcessor],
            ),
        )
        crossref_details = await crossref_client.query(
            title="The Dilemma and Countermeasures of Music Education under the Background of Big Data",
            fields=["title", "doi", "authors", "journal"],
        )

        assert (
            crossref_details.formatted_citation  # type: ignore[union-attr]
            == "RETRACTED ARTICLE! Original doi: 10.1155/2022/8341966. Retrieved from http://retractiondatabase.org/."
        ), "Should indicate retraction status"  # fixed typo: "inticate"
        assert crossref_details.is_retracted is True, "Should be retracted"  # type: ignore[union-attr]
Loading