-
Notifications
You must be signed in to change notification settings - Fork 553
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds retraction status #314
Changes from 10 commits
0c8ebca
bd24406
9381296
26cdad5
f3ee3d1
e7b1441
d9edda0
71a1623
5fb3b71
ccb984f
3b38f25
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
from __future__ import annotations | ||
|
||
import csv | ||
import datetime | ||
import logging | ||
import os | ||
|
||
import aiohttp | ||
from anyio import open_file | ||
from pydantic import ValidationError | ||
from tenacity import retry, stop_after_attempt, wait_exponential | ||
from tqdm.asyncio import tqdm | ||
|
||
from paperqa.types import DocDetails | ||
|
||
from .client_models import DOIQuery, MetadataPostProcessor | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class RetrationDataPostProcessor(MetadataPostProcessor[DOIQuery]):
    """Post-processor that flags documents whose DOI is listed as retracted.

    The retraction list is a CSV downloaded from the Crossref Labs
    ``retractionwatch`` endpoint and cached on disk. Rows whose
    ``RetractionNature`` column equals ``retraction_filter`` contribute both
    their ``RetractionDOI`` and ``OriginalPaperDOI`` to ``doi_set``.

    DOIs are compared case-insensitively, so every entry of ``doi_set`` is
    stored stripped and lower-cased.
    """

    # A cached CSV older than this is downloaded again.
    _CACHE_MAX_AGE = datetime.timedelta(days=30)
    # Bytes requested from the response stream per read.
    _CHUNK_SIZE = 64 * 1024
    # Emit a progress log line roughly every this many downloaded bytes.
    _LOG_EVERY_BYTES = 10 * 1024 * 1024

    def __init__(self, retraction_data_path: os.PathLike | str | None = None) -> None:
        """Configure where the retraction CSV is cached.

        Args:
            retraction_data_path: Location of the cached CSV. When None, the
                file ``client_data/retractions.csv`` next to this module is used.
        """
        if retraction_data_path is None:
            # Construct the path relative to module
            self.retraction_data_path = os.path.join(
                os.path.dirname(__file__), "client_data", "retractions.csv"
            )
        else:
            self.retraction_data_path = str(retraction_data_path)

        self.retraction_filter: str = "Retraction"
        self.doi_set: set[str] = set()
        self.columns: list[str] = [
            "RetractionDOI",
            "OriginalPaperDOI",
            "RetractionNature",
        ]

    @staticmethod
    def _normalize_doi(doi: str | None) -> str:
        """Return ``doi`` stripped and lower-cased ('' for a missing value)."""
        return (doi or "").strip().lower()

    def _has_cache_expired(self) -> bool:
        """Return True when the cached CSV was last written over 30 days ago.

        Raises:
            OSError: If the cache file does not exist.
        """
        # Use the modification time: it is refreshed on every re-download on
        # all platforms, whereas getctime is the (never refreshed) creation
        # time on Windows. Passing tz= converts the epoch timestamp to an
        # aware UTC datetime instead of relabelling local time as UTC.
        last_written = datetime.datetime.fromtimestamp(
            os.path.getmtime(self.retraction_data_path), tz=datetime.UTC
        )
        age = datetime.datetime.now(datetime.UTC) - last_written
        return age > self._CACHE_MAX_AGE

    def _is_csv_cached(self) -> bool:
        """Return True when a file exists at the cache path."""
        return os.path.exists(self.retraction_data_path)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=5, min=5),
        reraise=True,
    )
    async def _download_retracted_dataset(self) -> None:
        """Download the retraction CSV into the cache path.

        The body is streamed into a sibling ``.part`` file and only moved over
        the cache path once the download completed with a non-empty payload,
        so a failed attempt never leaves a truncated file that would later be
        mistaken for a valid cache. The call is retried up to three times with
        exponential back-off.

        Raises:
            aiohttp.ClientResponseError: If the server answers with an error status.
            RuntimeError: If the downloaded payload is empty.
        """
        # Placeholder contact address used when CROSSREF_MAILTO is not configured.
        if not (CROSSREF_MAILTO := os.getenv("CROSSREF_MAILTO")):
            CROSSREF_MAILTO = "test@example.com"
        url = f"https://api.labs.crossref.org/data/retractionwatch?{CROSSREF_MAILTO}"

        partial_path = f"{self.retraction_data_path}.part"
        downloaded = 0
        next_log_at = self._LOG_EVERY_BYTES
        logger.info("Downloading retraction data to %s", self.retraction_data_path)
        try:
            async with (
                aiohttp.ClientSession() as session,
                session.get(
                    url,
                    timeout=aiohttp.ClientTimeout(total=300),
                ) as response,
            ):
                response.raise_for_status()

                async with await open_file(partial_path, "wb") as f:
                    # read() yields b"" at end of stream, which ends the loop.
                    while chunk := await response.content.read(self._CHUNK_SIZE):
                        await f.write(chunk)
                        downloaded += len(chunk)
                        if downloaded >= next_log_at:
                            logger.info(
                                "Retraction data download progress: %d bytes",
                                downloaded,
                            )
                            next_log_at += self._LOG_EVERY_BYTES

            if downloaded == 0:
                raise RuntimeError("Retraction data is empty")

            # Swap the finished file into place in a single step.
            os.replace(partial_path, self.retraction_data_path)
            logger.info("Finished downloading retraction data (%d bytes)", downloaded)
        finally:
            # Only present when the attempt failed before the swap above.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def _filter_dois(self) -> None:
        """Populate ``doi_set`` from the cached CSV.

        Only rows whose nature column equals ``retraction_filter`` are used.
        Both DOI columns of such a row are added, normalized; blank cells are
        skipped.
        """
        retraction_doi_col, original_doi_col, nature_col = self.columns[:3]
        with open(self.retraction_data_path, newline="", encoding="utf-8") as csvfile:
            for row in csv.DictReader(csvfile):
                if row[nature_col] != self.retraction_filter:
                    continue
                for column in (retraction_doi_col, original_doi_col):
                    if doi := self._normalize_doi(row[column]):
                        self.doi_set.add(doi)

    async def load_data(self) -> None:
        """Ensure a fresh CSV is cached, then load its DOIs into ``doi_set``.

        Raises:
            RuntimeError: If no retracted DOI could be loaded.
        """
        if not self._is_csv_cached() or self._has_cache_expired():
            await self._download_retracted_dataset()

        self._filter_dois()

        if not self.doi_set:
            raise RuntimeError("Retraction data was not found.")

    async def _process(self, query: DOIQuery, doc_details: DocDetails) -> DocDetails:
        """Return ``doc_details`` combined with an ``is_retracted`` flag.

        The retraction data is loaded lazily on the first call.
        """
        if not self.doi_set:
            await self.load_data()

        return doc_details + DocDetails(  # type: ignore[call-arg]
            is_retracted=self._normalize_doi(query.doi) in self.doi_set
        )

    def query_creator(self, doc_details: DocDetails, **kwargs) -> DOIQuery | None:
        """Build a DOI query for ``doc_details``, or None if its DOI is invalid."""
        try:
            return DOIQuery(doi=doc_details.doi, **kwargs)
        except ValidationError:
            logger.debug(
                "Must have a valid DOI to query retraction data:%s ",
                doc_details.doi,
            )
        return None
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -320,6 +320,9 @@ class DocDetails(Doc): | |
" We use None as a sentinel for unset values (like for determining hydration) " | ||
" So, we use -1 means unknown quality and None means it needs to be hydrated.", | ||
) | ||
is_retracted: bool | None = Field( | ||
geemi725 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
default=None, description="Flag for whether the paper is retracted." | ||
) | ||
doi: str | None = None | ||
doi_url: str | None = None | ||
doc_id: str | None = None | ||
|
@@ -538,6 +541,10 @@ def __getitem__(self, item: str): | |
|
||
@property | ||
def formatted_citation(self) -> str: | ||
|
||
if self.is_retracted: | ||
return f"RETRACTED ARTICLE! Original doi: {self.doi}. Retrieved from http://retractiondatabase.org/." | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should keep the original citation, though, so that you can still see the title, authors, etc. |
||
if ( | ||
self.citation is None # type: ignore[redundant-expr] | ||
or self.citation_count is None | ||
|
@@ -551,6 +558,7 @@ def formatted_citation(self) -> str: | |
if self.source_quality >= 0 | ||
else None | ||
) | ||
|
||
if quality is None: | ||
return f"{self.citation} This article has {self.citation_count} citations." | ||
return ( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
) | ||
from paperqa.clients.client_models import MetadataPostProcessor, MetadataProvider | ||
from paperqa.clients.journal_quality import JournalQualityPostProcessor | ||
from paperqa.clients.retractions import RetrationDataPostProcessor | ||
|
||
|
||
@pytest.mark.vcr | ||
|
@@ -94,7 +95,17 @@ | |
@pytest.mark.asyncio | ||
async def test_title_search(paper_attributes: dict[str, str]): | ||
async with aiohttp.ClientSession() as session: | ||
client = DocMetadataClient(session, clients=ALL_CLIENTS) | ||
client_list = list(ALL_CLIENTS) | ||
client_list.remove(RetrationDataPostProcessor) | ||
client = DocMetadataClient( | ||
session, | ||
clients=cast( | ||
Collection[ | ||
type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]] | ||
], | ||
client_list, | ||
), | ||
) | ||
details = await client.query(title=paper_attributes["title"]) | ||
assert set(details.other["client_source"]) == set( # type: ignore[union-attr] | ||
paper_attributes["source"] | ||
|
@@ -180,7 +191,17 @@ async def test_title_search(paper_attributes: dict[str, str]): | |
@pytest.mark.asyncio | ||
async def test_doi_search(paper_attributes: dict[str, str]): | ||
async with aiohttp.ClientSession() as session: | ||
client = DocMetadataClient(session, clients=ALL_CLIENTS) | ||
client_list = list(ALL_CLIENTS) | ||
client_list.remove(RetrationDataPostProcessor) | ||
client = DocMetadataClient( | ||
session, | ||
clients=cast( | ||
Collection[ | ||
type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]] | ||
], | ||
client_list, | ||
), | ||
) | ||
details = await client.query(doi=paper_attributes["doi"]) | ||
assert set(details.other["client_source"]) == set( # type: ignore[union-attr] | ||
paper_attributes["source"] | ||
|
@@ -505,3 +526,27 @@ async def test_ensure_sequential_run_early_stop( | |
record_indices["semantic_scholar"] != -1 | ||
), "Semantic Scholar should be found" | ||
assert record_indices["early_stop"] != -1, "We should stop early." | ||
|
||
|
||
@pytest.mark.asyncio
async def test_crossref_retraction_status():
    """A known-retracted paper is flagged via Crossref plus the retraction data."""
    async with aiohttp.ClientSession() as session:
        crossref_client = DocMetadataClient(
            session,
            clients=cast(
                Collection[
                    type[MetadataPostProcessor[Any]] | type[MetadataProvider[Any]]
                ],
                [CrossrefProvider, RetrationDataPostProcessor],
            ),
        )
        crossref_details = await crossref_client.query(
            title="The Dilemma and Countermeasures of Music Education under the Background of Big Data",
            fields=["title", "doi", "authors", "journal"],
        )

        # Fail with a clear message (and narrow the optional result) instead
        # of hitting an AttributeError on None in the checks below.
        assert crossref_details is not None, "Should find the paper"
        assert (
            crossref_details.formatted_citation
            == "RETRACTED ARTICLE! Original doi: 10.1155/2022/8341966. Retrieved from http://retractiondatabase.org/."
        ), "Should indicate retraction status"
        assert crossref_details.is_retracted is True, "Should be retracted"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
tqdm is not one of our dependencies - will have to drop that.