-
Notifications
You must be signed in to change notification settings - Fork 553
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds retraction status #314
Changes from 4 commits
0c8ebca
bd24406
9381296
26cdad5
f3ee3d1
e7b1441
d9edda0
71a1623
5fb3b71
ccb984f
3b38f25
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from __future__ import annotations | ||
|
||
import asyncio | ||
import csv | ||
import datetime | ||
import logging | ||
import os | ||
|
||
import aiohttp | ||
from anyio import open_file | ||
from pydantic import ValidationError | ||
from tqdm.asyncio import tqdm | ||
|
||
from ..types import DocDetails | ||
from .client_models import DOIQuery, MetadataPostProcessor | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class RetrationDataPostProcessor(MetadataPostProcessor[DOIQuery]):
    """Post-processor that marks a document as retracted based on its DOI.

    The retraction dataset is a CSV downloaded from the Crossref Labs
    Retraction Watch endpoint and cached on disk.  Rows whose
    ``RetractionNature`` column equals ``retraction_filter`` contribute both
    their ``RetractionDOI`` and ``OriginalPaperDOI`` values to ``doi_set``;
    a queried DOI found in that set yields ``is_retracted=True``.
    """

    def __init__(self, retraction_data_path: os.PathLike | str | None = None) -> None:
        """Set up the processor without touching the network or the disk.

        Args:
            retraction_data_path: Location of the cached retraction CSV.  When
                ``None``, ``client_data/retractions.csv`` next to this module
                is used.
        """
        if retraction_data_path is None:
            # Construct the path relative to this module.
            self.retraction_data_path = os.path.join(
                os.path.dirname(__file__), "client_data", "retractions.csv"
            )
        else:
            self.retraction_data_path = str(retraction_data_path)

        # Only rows whose nature column equals this value are kept.
        self.retraction_filter: str = "Retraction"
        # Normalized (stripped, lower-cased) DOIs considered retracted.
        self.doi_set: set[str] = set()
        # CSV columns read by _filter_dois, in the order
        # (retraction DOI, original paper DOI, retraction nature).
        self.columns: list[str] = [
            "RetractionDOI",
            "OriginalPaperDOI",
            "RetractionNature",
        ]

    @staticmethod
    def _normalize_doi(doi: str) -> str:
        """Return *doi* stripped and lower-cased so comparisons ignore case."""
        return doi.strip().lower()

    def _has_cache_expired(self) -> bool:
        """Return True when the cached CSV was last written more than 30 days ago.

        Raises:
            OSError: If the cache file does not exist.
        """
        # Use the modification time: ``getctime`` is the creation time on
        # Windows and is not refreshed when the file is rewritten.  Build an
        # aware UTC datetime directly instead of relabeling a naive local one.
        last_written = datetime.datetime.fromtimestamp(
            os.path.getmtime(self.retraction_data_path), tz=datetime.UTC
        )
        age = datetime.datetime.now(datetime.UTC) - last_written
        return age > datetime.timedelta(days=30)

    def _is_csv_cached(self) -> bool:
        """Return True when the retraction CSV exists on disk."""
        return os.path.exists(self.retraction_data_path)

    async def _download_raw_retracted(self) -> None:
        """Download the retraction CSV to ``retraction_data_path``.

        Up to three attempts are made, sleeping 5 s and then 10 s between
        them.  The payload is streamed to a sibling ``.part`` file and moved
        into place only after a complete transfer, so a failed attempt never
        leaves a truncated file where the cache is expected.

        Raises:
            RuntimeError: If every attempt fails with a timeout or an aiohttp
                client error.
        """
        # TODO(review): a reviewer asked for this download logic to be moved
        # into a free function in another module (target not visible here).
        retries = 3
        delay = 5
        url = "https://api.labs.crossref.org/data/retractionwatch"
        partial_path = f"{self.retraction_data_path}.part"

        for attempt in range(retries):
            try:
                async with aiohttp.ClientSession() as session, session.get(
                    url, timeout=aiohttp.ClientTimeout(total=300)
                ) as response:
                    response.raise_for_status()
                    logger.info(f"Downloading retraction data from {url}.")
                    async with await open_file(partial_path, "wb") as f:
                        while chunk := await response.content.read(1024):
                            await f.write(chunk)
                # Publish the complete file over any previous cache.
                os.replace(partial_path, self.retraction_data_path)
            except (TimeoutError, aiohttp.ClientError) as e:
                # Discard whatever was written by the failed attempt.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                if attempt < retries - 1:
                    await asyncio.sleep(delay)
                    delay *= 2
                else:
                    raise RuntimeError(
                        f"Failed to download retracted data after {retries} attempts: {e}"
                    ) from e
            else:
                # Success: stop here instead of downloading again on the
                # remaining loop iterations.
                return

    def _filter_dois(self) -> None:
        """Populate ``doi_set`` from the cached CSV.

        Rows whose nature column equals ``retraction_filter`` add both their
        retraction DOI and original-paper DOI.  Empty cells are skipped and
        DOIs are normalized with ``_normalize_doi``.

        Raises:
            OSError: If the CSV cannot be opened.
            KeyError: If an expected column is missing from the CSV header.
        """
        retraction_doi_col, original_doi_col, nature_col = self.columns[:3]
        with open(self.retraction_data_path, newline="", encoding="utf-8") as csvfile:
            for row in csv.DictReader(csvfile):
                if row[nature_col] != self.retraction_filter:
                    continue
                for column in (retraction_doi_col, original_doi_col):
                    # A short row yields None for missing trailing cells.
                    doi = self._normalize_doi(row[column] or "")
                    if doi:
                        self.doi_set.add(doi)

    async def load_data(self) -> None:
        """Ensure a fresh CSV is on disk and load its DOIs into ``doi_set``.

        The dataset is downloaded when no cached file exists or the cached
        file has expired.

        Raises:
            RuntimeError: If the download fails or no DOI could be loaded.
        """
        if not self._is_csv_cached() or self._has_cache_expired():
            await self._download_raw_retracted()

        self._filter_dois()

        if not self.doi_set:
            raise RuntimeError("Retraction data was not found.")

    async def _process(self, query: DOIQuery, doc_details: DocDetails) -> DocDetails:
        """Return *doc_details* combined with the retraction status of ``query.doi``.

        The retraction data is loaded lazily on the first call.
        """
        if not self.doi_set:
            await self.load_data()

        return doc_details + DocDetails(  # type: ignore[call-arg]
            is_retracted=self._normalize_doi(query.doi) in self.doi_set
        )

    def query_creator(self, doc_details: DocDetails, **kwargs) -> DOIQuery | None:
        """Build a ``DOIQuery`` from *doc_details*.

        Returns:
            The query, or ``None`` when ``DOIQuery`` rejects the DOI with a
            ``ValidationError``.
        """
        try:
            return DOIQuery(doi=doc_details.doi, **kwargs)
        except ValidationError:
            logger.debug(
                f"Must have a valid DOI to query retraction data, got {doc_details.doi!r}."
            )
        return None
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
) | ||
from paperqa.clients.client_models import MetadataPostProcessor, MetadataProvider | ||
from paperqa.clients.journal_quality import JournalQualityPostProcessor | ||
from paperqa.clients.retractions import RetrationDataPostProcessor | ||
|
||
|
||
@pytest.mark.vcr | ||
|
@@ -328,6 +329,27 @@ async def test_crossref_journalquality_fields_filtering(): | |
), "Citation should be populated" | ||
|
||
|
||
@pytest.mark.vcr
@pytest.mark.asyncio
async def test_crossref_retraction_status():
    """Query Crossref plus the retraction post-processor and expect a retracted hit."""
    # The cast only satisfies the type checker; the list is passed through as-is.
    client_types = cast(
        Collection[type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]],
        [CrossrefProvider, RetrationDataPostProcessor],
    )
    async with aiohttp.ClientSession() as http_session:
        metadata_client = DocMetadataClient(http_session, clients=client_types)
        details = await metadata_client.query(
            title="The Dilemma and Countermeasures of Music Education under the Background of Big Data",
            fields=["title", "doi", "authors", "journal"],
        )

        assert details.is_retracted is True, "Should be retracted"  # type: ignore[union-attr]
|
||
|
||
@pytest.mark.vcr | ||
@pytest.mark.asyncio | ||
async def test_author_matching(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
tqdm is not one of our dependencies - will have to drop that.