Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds retraction status #314

Merged
merged 11 commits into from
Sep 11, 2024
2 changes: 2 additions & 0 deletions paperqa/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .client_models import MetadataPostProcessor, MetadataProvider
from .crossref import CrossrefProvider
from .journal_quality import JournalQualityPostProcessor
from .retractions import RetrationDataPostProcessor
from .semantic_scholar import SemanticScholarProvider
from .unpaywall import UnpaywallProvider

Expand All @@ -24,6 +25,7 @@
CrossrefProvider,
SemanticScholarProvider,
JournalQualityPostProcessor,
RetrationDataPostProcessor,
}

ALL_CLIENTS: (
Expand Down
117 changes: 117 additions & 0 deletions paperqa/clients/retractions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from __future__ import annotations

import asyncio
import csv
import datetime
import logging
import os

import aiohttp
from anyio import open_file
from pydantic import ValidationError
from tqdm.asyncio import tqdm
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tqdm is not one of our dependencies - will have to drop that.


from ..types import DocDetails
from .client_models import DOIQuery, MetadataPostProcessor

logger = logging.getLogger(__name__)


class RetrationDataPostProcessor(MetadataPostProcessor[DOIQuery]):
    """Flag `DocDetails` whose DOI appears in the Retraction Watch dataset.

    Downloads (and caches for 30 days) the Retraction Watch CSV from the
    Crossref Labs endpoint, builds an in-memory set of retracted DOIs, and
    stamps `is_retracted` onto processed documents.

    NOTE(review): the class name keeps the original (misspelled) public
    identifier "RetrationDataPostProcessor" for backward compatibility with
    existing importers.
    """

    def __init__(self, retraction_data_path: os.PathLike | str | None = None) -> None:
        """Initialize with an optional cache path for the retraction CSV.

        Args:
            retraction_data_path: Location of the cached CSV. Defaults to
                `<this package>/client_data/retractions.csv`.
        """
        if retraction_data_path is None:
            # Construct the default cache path relative to this module.
            self.retraction_data_path = str(
                os.path.join(
                    os.path.dirname(__file__), "client_data", "retractions.csv"
                )
            )
        else:
            self.retraction_data_path = str(retraction_data_path)

        # Only rows whose RetractionNature equals this value count as retractions.
        self.retraction_filter: str = "Retraction"
        self.doi_set: set[str] = set()
        # Columns consumed from the Retraction Watch CSV dump.
        self.columns: list[str] = [
            "RetractionDOI",
            "OriginalPaperDOI",
            "RetractionNature",
        ]

    def _has_cache_expired(self) -> bool:
        """Return True if the cached CSV is older than 30 days.

        Uses `os.path.getctime` (metadata-change time on POSIX, creation time
        on Windows) as a proxy for download time — close enough for a 30-day
        expiry window.
        """
        creation_time = os.path.getctime(self.retraction_data_path)
        # BUG FIX: build the timestamp as UTC directly. The previous
        # fromtimestamp(...).replace(tzinfo=UTC) mislabeled *local* time as
        # UTC, skewing the age by the local UTC offset.
        file_creation_date = datetime.datetime.fromtimestamp(
            creation_time, tz=datetime.UTC
        )

        current_time = datetime.datetime.now(datetime.UTC)
        time_difference = current_time - file_creation_date

        return time_difference > datetime.timedelta(days=30)

    def _is_csv_cached(self) -> bool:
        """Return True if a (possibly stale) retraction CSV exists on disk."""
        return os.path.exists(self.retraction_data_path)

    async def _download_raw_retracted(self) -> None:
        """Download the Retraction Watch CSV to the cache path, with retries.

        Retries up to 3 times with exponential backoff (5s, 10s, ...).

        Raises:
            RuntimeError: if all download attempts fail.
        """
        retries = 3
        delay = 5
        url = "https://api.labs.crossref.org/data/retractionwatch"
        for attempt in range(retries):
            try:
                async with aiohttp.ClientSession() as session, session.get(
                    url, timeout=aiohttp.ClientTimeout(total=300)
                ) as response:
                    response.raise_for_status()
                    # Use tqdm as a context manager so the bar is closed even
                    # if an exception is raised while streaming (previously a
                    # manual .close() was skipped on error).
                    with tqdm(
                        unit="iB", unit_scale=True, desc=self.retraction_data_path
                    ) as progress_bar:
                        async with await open_file(
                            self.retraction_data_path, "wb"
                        ) as f:
                            while chunk := await response.content.read(1024):
                                await f.write(chunk)
                                progress_bar.update(len(chunk))
                # BUG FIX: stop after a successful download. Without this
                # return, the retry loop fell through and re-downloaded the
                # file `retries` times.
                return
            except (TimeoutError, aiohttp.ClientError) as e:
                if attempt < retries - 1:
                    await asyncio.sleep(delay)
                    delay *= 2  # exponential backoff
                else:
                    raise RuntimeError(
                        f"Failed to download retracted data after {retries} attempts: {e}"
                    ) from e

    def _filter_dois(self) -> None:
        """Populate `self.doi_set` with retraction and original-paper DOIs.

        Only rows whose RetractionNature matches `self.retraction_filter`
        contribute; both the retraction DOI and the original DOI are kept so
        either form of the identifier matches later lookups.
        """
        with open(self.retraction_data_path, newline="", encoding="utf-8") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                if row[self.columns[2]] == self.retraction_filter:
                    self.doi_set.add(row[self.columns[0]])
                    self.doi_set.add(row[self.columns[1]])

    async def load_data(self) -> None:
        """Ensure a fresh local CSV exists and load its DOIs into memory.

        Raises:
            RuntimeError: if no retraction DOIs could be loaded from the CSV.
        """
        if not self._is_csv_cached() or self._has_cache_expired():
            await self._download_raw_retracted()

        self._filter_dois()

        if not self.doi_set:
            raise RuntimeError("Retraction data was not found.")

    async def _process(self, query: DOIQuery, doc_details: DocDetails) -> DocDetails:
        """Merge an `is_retracted` flag into `doc_details`, lazily loading data."""
        if not self.doi_set:
            await self.load_data()

        return doc_details + DocDetails(  # type: ignore[call-arg]
            is_retracted=query.doi in self.doi_set
        )

    def query_creator(self, doc_details: DocDetails, **kwargs) -> DOIQuery | None:
        """Build a `DOIQuery` from `doc_details`, or None without a valid DOI."""
        try:
            return DOIQuery(doi=doc_details.doi, **kwargs)
        except ValidationError:
            # Include the offending DOI so the skip is diagnosable (per review).
            logger.debug(
                f"Must have a valid doi to query retraction data, got doi={doc_details.doi!r}."
            )
            return None
3 changes: 3 additions & 0 deletions paperqa/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,9 @@ class DocDetails(Doc):
" We use None as a sentinel for unset values (like for determining hydration) "
" So, we use -1 means unknown quality and None means it needs to be hydrated.",
)
is_retracted: bool | None = Field(
geemi725 marked this conversation as resolved.
Show resolved Hide resolved
default=None, description="Flag for whether the paper is retracted."
)
doi: str | None = None
doi_url: str | None = None
doc_id: str | None = None
Expand Down
22 changes: 22 additions & 0 deletions tests/test_clients.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
)
from paperqa.clients.client_models import MetadataPostProcessor, MetadataProvider
from paperqa.clients.journal_quality import JournalQualityPostProcessor
from paperqa.clients.retractions import RetrationDataPostProcessor


@pytest.mark.vcr
Expand Down Expand Up @@ -328,6 +329,27 @@ async def test_crossref_journalquality_fields_filtering():
), "Citation should be populated"


@pytest.mark.vcr
@pytest.mark.asyncio
async def test_crossref_retraction_status():
    """A known-retracted paper should come back with is_retracted set."""
    async with aiohttp.ClientSession() as session:
        client_types = cast(
            Collection[type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]],
            [CrossrefProvider, RetrationDataPostProcessor],
        )
        crossref_client = DocMetadataClient(session, clients=client_types)
        crossref_details = await crossref_client.query(
            title="The Dilemma and Countermeasures of Music Education under the Background of Big Data",
            fields=["title", "doi", "authors", "journal"],
        )

    assert crossref_details.is_retracted is True, "Should be retracted"  # type: ignore[union-attr]


@pytest.mark.vcr
@pytest.mark.asyncio
async def test_author_matching():
Expand Down
Loading