From a1cc3dde04362b25630577fad33d474b27431f6e Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Wed, 15 Jan 2025 15:20:31 +0100
Subject: [PATCH 01/33] added auth for memgraph access

---
 src/research_index_backend/config.py  | 29 ++++++++++-----------------
 src/research_index_backend/session.py |  4 +++-
 2 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/research_index_backend/config.py b/src/research_index_backend/config.py
index cafcfa2..9cabfb0 100644
--- a/src/research_index_backend/config.py
+++ b/src/research_index_backend/config.py
@@ -10,10 +10,12 @@ class Config:
 
     def __init__(self):
         load_dotenv()
-
+
         self.mg_host: str = os.getenv("MG_HOST", "127.0.0.1")
         self.mg_port: int = int(os.getenv("MG_PORT", 7687))
         self.mg_port_alt: int = int(os.getenv("MG_PORT_ALT", 7444))
+        self.mg_user: str = os.getenv("MG_USER")
+        self.mg_pass: str = os.getenv("MG_PASS")
 
         self.orcid_name_similarity_threshold: float = float(
             os.getenv("ORCID_NAME_SIMILARITY_THRESHOLD", 0.8)
@@ -30,23 +32,15 @@ def __init__(self):
         )
         self.openaire_token_endpoint = f"{self.openaire_service}/uoa-user-management/api/users/getAccessToken"
 
-        self.refresh_token: str = ""
-        self.token = None
-
-    @property
-    def refresh_token(self):
-        return os.getenv("REFRESH_TOKEN", None)
-
-    @property
-    def token(self):
-        if self.token:
-            return self.token
-        else:
-            self.token = self._get_personal_token()
-            return self.token
-
-        self._validate()
+        self._validate()
+
+    @property
+    def refresh_token(self):
+        return os.getenv("REFRESH_TOKEN")
+    @property
+    def token(self):
+        return self._get_personal_token()
 
     def _validate(self):
         if not 0 <= self.orcid_name_similarity_threshold <= 1:
             raise ValueError(
@@ -71,12 +65,11 @@ def _get_personal_token(self) -> str:
         except requests.JSONDecodeError as e:
             logger.error(f"Error decoding JSON response: {e}")
             raise ValueError(
-                "Failed to obtain personal token due to JSON decode error"
+                "Failed to obtain personal token due to JSON decode error. Check if the refresh token is correct or has not expired."
             )
         else:
             raise ValueError(
                 "No refresh token found, could not obtain personal token"
             )
-
 config = Config()
diff --git a/src/research_index_backend/session.py b/src/research_index_backend/session.py
index 4c9557b..beee34e 100644
--- a/src/research_index_backend/session.py
+++ b/src/research_index_backend/session.py
@@ -10,6 +10,8 @@
 
 MG_HOST = config.mg_host
 MG_PORT = config.mg_port
+MG_USER = config.mg_user
+MG_PASS = config.mg_pass
 
 
 def connect_to_db(f):
@@ -18,7 +20,7 @@ def with_connection_(*args, **kwargs):
 
         try:
             URI = f"bolt://{MG_HOST}:{MG_PORT}"
-            AUTH = ("", "")
+            AUTH = (MG_USER, MG_PASS)
             with GraphDatabase.driver(URI, auth=AUTH) as db:
                 db.verify_connectivity()
                 return f(*args, db, **kwargs)

From e9fec9f1ea6e34a976996728f2742f96857e196d Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Wed, 15 Jan 2025 15:29:49 +0100
Subject: [PATCH 02/33] updated readme to include information on database auth
 env variables

---
 readme.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/readme.md b/readme.md
index e5228a7..bac6a77 100644
--- a/readme.md
+++ b/readme.md
@@ -10,6 +10,8 @@ The package is not yet deployed to PyPI. Only an editable (development) install
 ```MG_HOST=
 MG_PORT=
 MG_PORT_ALT=
+MG_USER=
+MG_PASS=
 ORCID_NAME_SIMILARITY_THRESHOLD=
 NAME_SIMILARITY_THRESHOLD=
 OPENAIRE_API="https://api.openaire.eu"
From 3084d94e23356b7fd9375210310308472aee565c Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Fri, 24 Jan 2025 10:53:50 +0100
Subject: [PATCH 03/33] implemented a metadata fetcher class and updated the
 tests

---
 .../create_graph_from_doi.py               |  35 +----
 src/research_index_backend/get_metadata.py | 138 ++++++++----------
 tests/test_metadata.py                     | 112 +++++++-------
 3 files changed, 129 insertions(+), 156 deletions(-)

diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py
index f1423b3..2dd1904 100644
--- a/src/research_index_backend/create_graph_from_doi.py
+++ b/src/research_index_backend/create_graph_from_doi.py
@@ -28,10 +28,7 @@
 
 from .config import config
 from .create_graph import load_initial_data
-from .get_metadata import (
-    get_metadata_from_openaire,
-    get_metadata_from_openalex,
-)
+from .get_metadata import MetadataFetcher
 from .models import AnonymousArticle, Article, Author
 from .parser import parse_metadata
 from .session import connect_to_db
@@ -80,26 +77,6 @@ def validate_dois(list_of_dois: List) -> Dict[str, List]:
     return dois
 
 
-def get_output_metadata(
-    session: requests_cache.CachedSession, doi: str, source: str = "OpenAire"
-) -> Dict:
-    """Request metadata from OpenAire Graph
-
-    Arguments
-    ---------
-    session: CachedSession
-    doi: str
-    source: str, default='OpenAire'
-        The API to connect to
-    """
-    if source == "OpenAire":
-        return get_metadata_from_openaire(session, doi, TOKEN)
-    elif source == "OpenAlex":
-        return get_metadata_from_openalex(session, doi)
-    else:
-        raise ValueError("Incorrect argument for output metadata source")
-
-
 @connect_to_db
 def match_author_name(db: Driver, author: Dict) -> List:
     name = f"{author['first_name'][0]} {author['last_name']}"
@@ -244,16 +221,16 @@ def upload_article_to_memgraph(output: AnonymousArticle) -> bool:
     return True
 
 
 def main(list_of_dois) -> bool:
     """ """
-
     dois = validate_dois(list_of_dois)
     valid_dois = dois["valid"]
 
     session = requests_cache.CachedSession("doi_cache", expire_after=30)
+    metadata_fetcher = MetadataFetcher(session)  # Initialize fetcher
 
     for valid_doi in tqdm(valid_dois):
         try:
-            openalex_metadata = get_output_metadata(
-                session, valid_doi, "OpenAlex"
+            openalex_metadata = metadata_fetcher.get_output_metadata(
+                valid_doi, source="OpenAlex"
             )
         except ValueError as ex:
             logger.error(
@@ -261,7 +238,9 @@ def main(list_of_dois) -> bool:
             )
             openalex_metadata = {"id": None}
         try:
-            metadata = get_output_metadata(session, valid_doi, "OpenAire")
+            metadata = metadata_fetcher.get_output_metadata(
+                valid_doi, source="OpenAire"
+            )
         except ValueError as ex:
             logger.error(
                 f"No OpenAire metadata found for doi {valid_doi}: {ex}"
             )
diff --git a/src/research_index_backend/get_metadata.py b/src/research_index_backend/get_metadata.py
index e25e97b..f167323 100644
--- a/src/research_index_backend/get_metadata.py
+++ b/src/research_index_backend/get_metadata.py
@@ -1,91 +1,81 @@
 from json import JSONDecodeError, dump
 from logging import DEBUG, basicConfig, getLogger
 from os import makedirs
+from typing import Dict
 
 import requests
 import requests_cache
 
 from .config import config
 
-logger = getLogger(__name__)
-basicConfig(
-    filename="research_index_backend.log",
-    filemode="w",
-    encoding="utf-8",
-    level=DEBUG,
-)
-
-
-def get_metadata_from_openaire(
-    session: requests_cache.CachedSession, doi: str, token
-):
-    """Gets metadata from OpenAire
+class MetadataFetcher:
+    def __init__(self, session: requests_cache.CachedSession, token: str = None):
+        self.session = session
+        self.token = token or config.token
+        self.logger = getLogger(__name__)
+        basicConfig(
+            filename="research_index_backend.log",
+            filemode="w",
+            encoding="utf-8",
+            level=DEBUG,
+        )
+
+    def _save_json_response(self, response, directory: str, doi: str) -> None:
+        """Helper method to save JSON responses"""
+        clean_doi = doi.replace("/", "")
+        makedirs(directory, exist_ok=True)
+
+        with open(f"{directory}/{clean_doi}.json", "w") as json_file:
+            try:
+                dump(response.json(), json_file)
+            except JSONDecodeError as ex:
+                self.logger.error(str(ex))
 
-    Arguments
-    ---------
-    session: CachedSession
-    doi: str
+    def get_metadata_from_openaire(self, doi: str) -> Dict:
+        """Gets metadata from OpenAire"""
+        query = f"?format=json&doi={doi}"
+        headers = {"Authorization": f"Bearer {self.token}"}
+        api_url = f"{config.openaire_api}/search/researchProducts"
 
-    Returns
-    -------
-    """
-    query = f"?format=json&doi={doi}"
-    headers = {"Authorization": f"Bearer {token}"}
-    api_url = f"{config.openaire_api}/search/researchProducts"
+        response = self.session.get(api_url + query, headers=headers)
 
-    response = session.get(api_url + query, headers=headers)
+        self.logger.debug(f"Response code: {response.status_code}")
+        response.raise_for_status()
 
-    logger.debug(f"Response code: {response.status_code}")
-    response.raise_for_status()
+        if error := response.json().get("error"):
+            raise ValueError(error)
 
-    if error := response.json().get("error"):
-        raise ValueError(error)
+        self._save_json_response(response, "data/json/openaire", doi)
 
-    clean_doi = doi.replace("/", "")
-    directory = "data/json/openaire"
-    makedirs(directory, exist_ok=True)
+        if response.json()["response"]["results"]:
+            return response.json()
+        else:
+            raise ValueError(f"DOI {doi} returned no results")
 
-    with open(f"data/json/openaire/{clean_doi}.json", "w") as json_file:
-        try:
-            dump(response.json(), json_file)
-        except JSONDecodeError as ex:
-            logger.error(str(ex))
-    if response.json()["response"]["results"]:
-        return response.json()
-    else:
-        raise ValueError(f"DOI {doi} returned no results")
-
-
-def get_metadata_from_openalex(session, doi):
-    """Gets metadata from OpenAlex
-
-    Arguments
-    ---------
-    session: CachedSession
-    doi: str
-
-    Returns
-    -------
-    """
-
-    logger.info(f"Requesting {doi} from OpenAlex")
-    query = f"doi:{doi}?mailto=wusher@kth.se"
-    api_url = "https://api.openalex.org/works/"
-    response = session.get(api_url + query)
-    directory = "data/json/openalex"
-    makedirs(directory, exist_ok=True)
-    try:
-        response.raise_for_status()
-        clean_doi = doi.replace("/", "")
-        with open(f"data/json/openalex/{clean_doi}.json", "w") as json_file:
-            try:
-                dump(response.json(), json_file)
-            except JSONDecodeError as ex:
-                logger.error(str(ex))
-    except requests.exceptions.HTTPError as err:
-        logger.error(str(err))
-
-    if response.json():
-        return response.json()
-    else:
-        raise ValueError(f"DOI {doi} returned no results")
+    def get_metadata_from_openalex(self, doi: str) -> Dict:
+        """Gets metadata from OpenAlex"""
+        self.logger.info(f"Requesting {doi} from OpenAlex")
+        query = f"doi:{doi}?mailto=wusher@kth.se"
+        api_url = "https://api.openalex.org/works/"
+
+        response = self.session.get(api_url + query)
+
+        try:
+            response.raise_for_status()
+            self._save_json_response(response, "data/json/openalex", doi)
+        except requests.exceptions.HTTPError as err:
+            self.logger.error(str(err))
+
+        if response.json():
+            return response.json()
+        else:
+            raise ValueError(f"DOI {doi} returned no results")
+
+    def get_output_metadata(self, doi: str, source: str = "OpenAire") -> Dict:
+        """Request metadata from specified source"""
+        if source == "OpenAire":
+            return self.get_metadata_from_openaire(doi)
+        elif source == "OpenAlex":
+            return self.get_metadata_from_openalex(doi)
+        else:
+            raise ValueError("Incorrect argument for output metadata source")
\ No newline at end of file
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index bca9f76..b5131bf 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -8,86 +8,90 @@
 import pytest
 from requests_cache import CachedSession
 
-from research_index_backend.create_graph_from_doi import (
-    get_output_metadata,
-    score_name_similarity,
-)
+from research_index_backend.get_metadata import MetadataFetcher
+from research_index_backend.create_graph_from_doi import score_name_similarity
 
+@pytest.fixture
+def session():
+    return CachedSession()
 
-@pytest.mark.skip(reason="Requires access to OpenAire Graph API")
-def test_broken_doi():
-    """An incorrect DOI should raise an error"""
-    s = CachedSession()
+@pytest.fixture
+def fetcher(session):
+    return MetadataFetcher(session)
 
-    broken_doi = "10.1dd016/j.envsoft.2021"
-    with pytest.raises(ValueError) as ex:
-        get_output_metadata(s, broken_doi)
-    expected = "DOI 10.1dd016/j.envsoft.2021 returned no results"
-    assert str(ex.value) == expected
+class TestMetadataFetcher:
+    @pytest.mark.skip(reason="Requires access to OpenAire Graph API")
+    def test_broken_doi(self, fetcher):
+        """An incorrect DOI should raise an error"""
+        broken_doi = "10.1dd016/j.envsoft.2021"
+        with pytest.raises(ValueError) as ex:
+            fetcher.get_output_metadata(broken_doi)
+        expected = "DOI 10.1dd016/j.envsoft.2021 returned no results"
+        assert str(ex.value) == expected
 
+class TestNameScoring:
+    def test_score_names_same(self):
 
-def test_score_names_same():
+        name1 = "Will Usher"
+        name2 = "Will Usher"
+        assert score_name_similarity(name1, name2) == 1.0
 
-    name1 = "Will Usher"
-    name2 = "Will Usher"
-    assert score_name_similarity(name1, name2) == 1.0
+    def test_score_names_different(self):
 
-def test_score_names_different():
+        name1 = "Will Usher"
+        name2 = "1298139487(*&^)"
+        assert score_name_similarity(name1, name2) == 0.0
 
-    name1 = "Will Usher"
-    name2 = "1298139487(*&^)"
-    assert score_name_similarity(name1, name2) == 0.0
+    def test_score_names_truncated(self):
 
-def test_score_names_truncated():
+        name1 = "Vignesh Sridha"
+        name2 = "Vignesh Sridharan"
+        assert score_name_similarity(name1, name2) > 0.8
 
-    name1 = "Vignesh Sridha"
-    name2 = "Vignesh Sridharan"
-    assert score_name_similarity(name1, name2) > 0.8
+    def test_score_names_reversed(self):
 
-def test_score_names_reversed():
+        name1 = "Sridharan Vignesh"
+        name2 = "Vignesh Sridharan"
+        assert score_name_similarity(name1, name2) == 1.0
 
-    name1 = "Sridharan Vignesh"
-    name2 = "Vignesh Sridharan"
-    assert score_name_similarity(name1, name2) == 1.0
+    def test_score_names_ignore_case(self):
 
-def test_score_names_ignore_case():
+        name1 = "Sridharan Vignesh"
+        name2 = "VIGNESH Sridharan"
+        assert score_name_similarity(name1, name2) == 1.0
 
-    name1 = "Sridharan Vignesh"
-    name2 = "VIGNESH Sridharan"
-    assert score_name_similarity(name1, name2) == 1.0
+    def test_score_names_similar_but_different(self):
 
-def test_score_names_similar_but_different():
+        name1 = "James Sridharan"
+        name2 = "Vignesh Sridharan"
+        assert score_name_similarity(name1, name2) == 0.65625
 
-    name1 = "James Sridharan"
-    name2 = "Vignesh Sridharan"
-    assert score_name_similarity(name1, name2) == 0.65625
+    def test_score_names_similar_fernandos_1(self):
 
-def test_score_names_similar_fernandos_1():
+        name1 = "Fernando Antonio Plazas"
+        name2 = "Fernando Plazas-Nino"
+        assert score_name_similarity(name1, name2) < 0.8
 
-    name1 = "Fernando Antonio Plazas"
-    name2 = "Fernando Plazas-Nino"
-    assert score_name_similarity(name1, name2) < 0.8
+    def test_score_names_similar_fernandos_2(self):
+        name1 = "Fernando Plazas-Niño"
+        name2 = "Fernando Antonio Plazas-Niño"
+        assert score_name_similarity(name1, name2) > 0.8
 
-def test_score_names_similar_fernandos_2():
-    name1 = "Fernando Plazas-Niño"
-    name2 = "Fernando Antonio Plazas-Niño"
-    assert score_name_similarity(name1, name2) > 0.8
+    def test_score_names_similar_fernandos_3(self):
+        name1 = "Fernando Plazas-Niño"
+        name2 = "Fernando Plazas-Nino"
+        assert score_name_similarity(name1, name2) > 0.8
 
-def test_score_names_similar_fernandos_3():
-    name1 = "Fernando Plazas-Niño"
-    name2 = "Fernando Plazas-Nino"
-    assert score_name_similarity(name1, name2) > 0.8
-
-def test_score_names_similar_fernandos_4():
-    name1 = "Fernando ANtonio Plazas"
-    name2 = "Fernando Antonio Plazas Nino"
-    assert score_name_similarity(name1, name2) > 0.8
+    def test_score_names_similar_fernandos_4(self):
+        name1 = "Fernando ANtonio Plazas"
+        name2 = "Fernando Antonio Plazas Nino"
+        assert score_name_similarity(name1, name2) > 0.8
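Note: a minimal sketch of how the new `MetadataFetcher` is driven (the cache name `doi_cache` mirrors the one used in create_graph_from_doi.py, and the example DOI is illustrative; the class falls back to `config.token` when no token is passed):

```python
import requests_cache

from research_index_backend.get_metadata import MetadataFetcher

# A cached session keeps repeated DOI lookups cheap
session = requests_cache.CachedSession("doi_cache", expire_after=30)
fetcher = MetadataFetcher(session)

try:
    # source defaults to "OpenAire"; "OpenAlex" selects the other API
    metadata = fetcher.get_output_metadata("10.5281/zenodo.8140241", source="OpenAlex")
except ValueError as ex:
    # Raised when the DOI returns no results or the source name is unrecognised
    print(f"No metadata: {ex}")
```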
From fb327671f7340394eab154b0dea3931966d0aabe Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Fri, 24 Jan 2025 11:36:24 +0100
Subject: [PATCH 04/33] removed unused optimus module

---
 src/research_index_backend/optimus.py | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 src/research_index_backend/optimus.py

diff --git a/src/research_index_backend/optimus.py b/src/research_index_backend/optimus.py
deleted file mode 100644
index 875374e..0000000
--- a/src/research_index_backend/optimus.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from transformers import AutoTokenizer
-from transformers import AutoTokenizer, DistilBertForQuestionAnswering
-import torch
-import pandas as pd
-corpus = pd.read_csv('corpus.csv', usecols=['abstract'])['abstract'].to_list()
-
-question, text = "What is the objective?", corpus[0]
-
-from transformers import pipeline
-
-question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')
-
-QUESTIONS = ["What is the aim?",
-    "What are the aims of the paper?",
-    "What problem is solved?",
-    "What are the objectives of the article?"]
-
-for context in corpus[0:3]:
-    results = []
-    for question in QUESTIONS:
-        result = question_answerer(question=question, context=context)
-        print(f"Question: {question}")
-        print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
-        results.append(result)
-

From a682abbc668f39bfa01b0422f849770990a36c69 Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Wed, 29 Jan 2025 08:14:05 +0100
Subject: [PATCH 05/33] added a doi manager

---
 src/research_index_backend/doi.py | 181 ++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 src/research_index_backend/doi.py

diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py
new file mode 100644
index 0000000..eac7e0e
--- /dev/null
+++ b/src/research_index_backend/doi.py
@@ -0,0 +1,181 @@
+"""DOI (Digital Object Identifier) validation and management module.
+
+This module handles:
+1. DOI pattern validation
+2. Database existence checks
+3. Metadata validation
+4. Batch processing with limits: TODO
+"""
+
+from pydantic import BaseModel
+from logging import getLogger
+from re import IGNORECASE, compile
+import time
+from typing import Dict, List
+from neo4j import Driver
+
+from .session import connect_to_db
+
+logger = getLogger(__name__)
+
+DOI_PATTERN = "10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$"
+
+
+class DOI(BaseModel):
+    doi: str
+    valid_pattern: bool = False
+    already_exists: bool = False
+    openalex_metadata: bool = False
+    openaire_metadata: bool = False
+    ingestion_success: bool = False
+
+
+class DOITracker(DOI):
+    doi_tracker: Dict[str, DOI]
+
+
+class DOIManager:
+    def __init__(
+        self, list_of_dois: List[str], limit: int, update_metadata=True
+    ) -> None:
+
+        if not list_of_dois:
+            raise ValueError("DOI list cannot be empty")
+        if (limit <= 0) or (limit > len(list_of_dois)):
+            raise ValueError(
+                "Limit must be positive and less than the number of DOIs"
+            )
+
+        self.list_of_dois = [
+            doi.strip()
+            .rstrip(".")
+            .replace("doi.org/", "")
+            .replace("https://doi.org/", "")
+            for doi in list_of_dois
+        ]
+        self.limit = limit
+        self.update_metadata = update_metadata
+        self.doi_tracker = {
+            doi: DOI(doi=doi) for doi in self.list_of_dois[: self.limit]
+        }
+        self.PATTERN = compile(DOI_PATTERN, IGNORECASE)
+
+    def start_ingestion(self):
+        self.start_time = time.time()
+
+    def end_ingestion(self):
+        self.end_time = time.time()
+
+    def pattern_check(self):
+        try:
+            for doi in self.doi_tracker:
+                if search := self.PATTERN.search(doi):
+                    logger.debug(f"Valid DOI pattern: {search.group()}")
+                    self.doi_tracker[doi].valid_pattern = True
+                else:
+                    logger.warning(f"Invalid DOI pattern: {doi.doi}")
+        except Exception as e:
+            logger.error(f"Error whilst checking DOI pattern: {e}")
+            raise
+
+    @connect_to_db
+    def search_dois(self, db: Driver):
+        valid_dois = [
+            doi.doi for doi in self.doi_tracker.values() if doi.valid_pattern
+        ]
+
+        self.num_valid_pattern_dois = len(valid_dois)
+        self.num_invalid_pattern_dois = (
+            len(self.doi_tracker) - self.num_valid_pattern_dois
+        )
+
+        if not valid_dois:
+            msg = "No DOIs have passed the pattern check and make sure to run pattern check first."
+            logger.warning(msg)
+            raise ValueError(msg)
+        query = """
+        UNWIND $dois as doi
+        OPTIONAL MATCH (o:Output {doi: doi})
+        RETURN doi, COUNT(o) > 0 as exists"""
+        try:
+            results, _, _ = db.execute_query(query, dois=valid_dois)
+            existing_dois = [
+                record["doi"] for record in results if record["exists"]
+            ]
+            for doi in self.doi_tracker:
+                if doi in existing_dois:
+                    self.doi_tracker[doi].already_exists = True
+
+            self.num_existing_dois = len(existing_dois)
+            self.num_new_dois = (
+                self.num_valid_pattern_dois - self.num_existing_dois
+            )
+
+        except Exception as e:
+            logger.error(f"Error whilst searching for DOIs: {e}")
+            raise
+
+    def validate_dois(self) -> Dict[str, List[str]]:
+        try:
+            self.pattern_check()
+            self.search_dois()
+            return self.doi_tracker
+        except Exception as e:
+            logger.error(f"DOI validation failed: {e}")
+            raise
+
+    def ingestion_metrics(self) -> Dict[str, int]:
+        total_time = self.end_time - self.start_time
+        metadata_failure = 0
+
+        if self.update_metadata:
+            metadata_failure = self.num_valid_pattern_dois - len(
+                [
+                    doi
+                    for doi in self.doi_tracker
+                    if self.doi_tracker[doi].ingestion_success
+                ]
+            )
+        else:
+            for doi in self.doi_tracker:
+                if (
+                    not self.doi_tracker[doi].ingestion_success
+                    and not self.doi_tracker[doi].already_exists
+                ):
+                    metadata_failure += 1
+
+        num_ingested_dois = len(
+            [
+                doi
+                for doi in self.doi_tracker
+                if self.doi_tracker[doi].ingestion_success
+            ]
+        )
+
+        openalex_success = len(
+            [
+                doi
+                for doi in self.doi_tracker
+                if self.doi_tracker[doi].openalex_metadata
+            ]
+        )
+        openaire_success = len(
+            [
+                doi
+                for doi in self.doi_tracker
+                if self.doi_tracker[doi].openaire_metadata
+            ]
+        )
+
+        return {
+            "submitted_dois": len(self.list_of_dois),
+            "new_dois": self.num_new_dois,
+            "existing_dois": self.num_existing_dois,
+            "ingested_dois": num_ingested_dois,
+            "metadata_failure": metadata_failure,
+            "valid_pattern_dois": self.num_valid_pattern_dois,
+            "invalid_pattern_dois": self.num_invalid_pattern_dois,
+            "openalex_success": openalex_success,
+            "openaire_success": openaire_success,
+            "total_time_seconds": round(total_time, 3),
+        }
From f54b711faf34e681ee82f9375b3052207561d06a Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Wed, 29 Jan 2025 08:15:04 +0100
Subject: [PATCH 06/33] updated ingestion pipeline to incorporate doi tracker

---
 .../create_graph_from_doi.py | 97 ++++++++++++-------
 1 file changed, 64 insertions(+), 33 deletions(-)

diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py
index 2dd1904..e3ddd4e 100644
--- a/src/research_index_backend/create_graph_from_doi.py
+++ b/src/research_index_backend/create_graph_from_doi.py
@@ -32,6 +32,7 @@
 from .models import AnonymousArticle, Article, Author
 from .parser import parse_metadata
 from .session import connect_to_db
+from .doi import DOIManager
 
 TOKEN = config.token
 
@@ -172,7 +173,7 @@ def check_upload_author(db: Driver, author: Dict) -> Author:
         name_author = author["first_name"] + " " + author["last_name"]
         score = score_name_similarity(name_results, name_author)
         if score < ORCID_NAME_SIMILARITY_THRESHOLD:
-            logger.warning(error_message + f". Ratio: {score}")
+            logger.warning(f"{error_message}. Ratio: {score}")
             results = match_author_name(author)
     else:
         # Try a match on full name, or create new node
@@ -219,59 +220,92 @@ def upload_article_to_memgraph(output: AnonymousArticle) -> bool:
     return True
 
 
-def main(list_of_dois) -> bool:
-    """ """
-    dois = validate_dois(list_of_dois)
-    valid_dois = dois["valid"]
+def main(list_of_dois: list,limit: int, update_metadata: bool):
+    try:
+        doi_manager = DOIManager(list_of_dois, limit=limit, update_metadata=update_metadata)
+
+        doi_manager.start_ingestion()
+        doi_manager.validate_dois()
+        if not doi_manager.update_metadata and not doi_manager.num_new_dois:
+            logger.warning("No new DOIs to process or valid existing DOIs to update.")
+            doi_manager.end_ingestion()
+            return doi_manager
+    except Exception as e:
+        logger.error(f"Error validating DOIs: {e}")
+        raise e
 
     session = requests_cache.CachedSession("doi_cache", expire_after=30)
-    metadata_fetcher = MetadataFetcher(session)  # Initialize fetcher
-
-    for valid_doi in tqdm(valid_dois):
+    metadata_fetcher = MetadataFetcher(session)
+
+    for doi in tqdm(doi_manager.doi_tracker):
+        if doi_manager.doi_tracker[doi].already_exists and not doi_manager.update_metadata:
+            logger.info(f"DOI {doi} already exists in the database.")
+            continue
         try:
             openalex_metadata = metadata_fetcher.get_output_metadata(
-                valid_doi, source="OpenAlex"
+                doi, source="OpenAlex"
             )
+            doi_manager.doi_tracker[doi].openalex_metadata = True
         except ValueError as ex:
             logger.error(
-                f"No OpenAlex metadata found for doi {valid_doi}: {ex}"
+                f"No OpenAlex metadata found for doi {doi}: {ex}"
             )
-            openalex_metadata = {"id": None}
+            openalex_metadata = {"id": None}
         try:
             metadata = metadata_fetcher.get_output_metadata(
-                valid_doi, source="OpenAire"
+                doi, source="OpenAire"
            )
+            doi_manager.doi_tracker[doi].openaire_metadata = True
         except ValueError as ex:
             logger.error(
-                f"No OpenAire metadata found for doi {valid_doi}: {ex}"
+                f"No OpenAire metadata found for doi {doi}: {ex}"
             )
         else:
             outputs_metadata = parse_metadata(
-                metadata, valid_doi, openalex_metadata
+                metadata, doi, openalex_metadata
             )
             for output in outputs_metadata:
                 try:
                     result = upload_article_to_memgraph(output)
+                    doi_manager.doi_tracker[doi].ingestion_success = True
                 except DatabaseError as ex:
                     logger.error(f"Error uploading {output.doi} to Memgraph")
                     logger.error(f"{ex}")
                     logger.debug(output)
                     raise ex
                 if result:
-                    logger.info(f"Upload {valid_doi} successful")
+                    logger.info(f"Upload {doi} successful")
                 else:
-                    logger.warning(f"Upload {valid_doi} failed")
-
-    return True
+                    logger.warning(f"Upload {doi} failed")
+    doi_manager.end_ingestion()
+    return doi_manager
 
 
 def argument_parser():
-
-    parser = argparse.ArgumentParser()
-    help = "Provide the path to CSV file containing a list of dois"
-    parser.add_argument("list_of_dois", help=help)
-    help = "Deletes any existing data and creates a new database"
-    parser.add_argument("--initialise", action="store_true", help=help)
+    parser = argparse.ArgumentParser(
+        description="Process DOIs and create/update a graph database"
+    )
+    parser.add_argument(
+        "list_of_dois",
+        help="Path to CSV file containing list of DOIs"
+    )
+    parser.add_argument(
+        "-i", "--initialise",
+        action="store_true",
+        help="Delete existing data and create new database"
+    )
+    parser.add_argument(
+        "-l", "--limit",
+        type=int,
+        default=50,
+        help="Limit number of DOIs to process (default: 50)"
+    )
+    parser.add_argument(
+        "-u", "--update-metadata",
+        action="store_true",
+        help="Update metadata for existing DOIs"
+    )
     return parser.parse_args()
 
 
@@ -317,7 +326,7 @@ def add_country_relations(db: Driver):
 
 @connect_to_db
 def entry_point(db: Driver):
     """This is the console entry point to the programme"""
-    args = argument_parser()
+    args = argument_parser()
     list_of_dois = []
     with open(args.list_of_dois, "r") as csv_file:
         for line in csv_file:
@@ -329,14 +338,11 @@ def entry_point(db: Driver):
         logger.info("Deleted graph")
         load_initial_data(join("data", "init"))
 
-    result = main(list_of_dois)
+    doi_manager = main(list_of_dois, limit=args.limit, update_metadata=args.update_metadata)
     add_country_relations()
-
-    if result:
-        print("Success")
-
+    report = doi_manager.ingestion_metrics()
+    logger.info(f"Report: {report}")
+    print(f"Report: {report}")
 
 if __name__ == "__main__":
-
-    if result := entry_point():
-        print("Success")
+    entry_point()
\ No newline at end of file

From 4239271e5ec6f5078e320a7add0142e676867f8b Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Wed, 29 Jan 2025 08:17:23 +0100
Subject: [PATCH 07/33] updated usage to include limit and update metadata
 flags

---
 readme.md | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/readme.md b/readme.md
index bac6a77..f93054d 100644
--- a/readme.md
+++ b/readme.md
@@ -4,9 +4,9 @@ The package is not yet deployed to PyPI. Only an editable (development) install
 
 1. Provide a list of DOIs in a CSV file format `list_of_dois.csv`
 2. Clone the repository `git clone https://github.com/ClimateCompatibleGrowth/research_index_backend.git`
-2. Change directory `cd research_index_backend`
-2. Install the package `pip install -e .` as an editable package (development install)
-3. Obtain an OpenAIRE Graph refresh token and create a .env file with the following parameters:
+3. Change directory `cd research_index_backend`
+4. Install the package `pip install -e .` as an editable package (development install)
+5. Obtain an OpenAIRE Graph refresh token and create a .env file with the following parameters:
 
 ```MG_HOST=
 MG_PORT=
@@ -19,27 +19,33 @@ The package is not yet deployed to PyPI. Only an editable (development) install
 REFRESH_TOKEN=
 ```
 
-4. Provision Memgraph graph database and set up environment variables
+6. Provision Memgraph graph database and set up environment variables
 
 Once the VM is up and running, SSH into the VM, download and install memgraph
 
     $ curl -O https://download.memgraph.com/memgraph/v2.14.1/ubuntu-20.04/memgraph_2.14.1-1_amd64.deb
     $ sudo dpkg -i ./memgraph_2.14.1-1_amd64.deb
 
-5. Run the backend:
-
-    $ research_index --help
-    usage: research_index [-h] [--initialise INITIALISE] list_of_dois
+7. Run the backend:
+
+    research_index --help
+    usage: research_index [-h] [-i] [-l LIMIT] [-u] list_of_dois
 
     positional arguments:
-      list_of_dois          Provide the path to CSV file containing a list of dois
+      list_of_dois          Path to CSV file containing list of DOIs
 
     options:
-      -h, --help            show this help message and exit
-      --initialise INITIALISE
-                            Deletes any existing data and creates a new database
+      -h, --help            Show this help message and exit
+      -i, --initialise      Delete existing data and create new database
+      -l, --limit N         Limit number of DOIs to process (default: 50)
+      -u, --update-metadata Update metadata for existing DOIs
+
+    Examples:
+    -> Process 10 DOIs from file:
+       $ research_index list_of_dois.csv -l 10
 
-    $ research_index list_of_dois.csv --initalise
+    -> Update metadata for existing DOIs:
+       $ research_index list_of_dois.csv --update-metadata
 
 
 # Development

From 9c9507130631362c59261c4a2d4f0ead007b8764 Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Sun, 2 Feb 2025 22:00:18 +0100
Subject: [PATCH 08/33] returning both ingestion metrics and doi statuses

---
 .../create_graph_from_doi.py |   7 +-
 src/research_index_backend/doi.py            | 141 ++++++++++--------
 2 files changed, 80 insertions(+), 68 deletions(-)

diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py
index e3ddd4e..b9b19a1 100644
--- a/src/research_index_backend/create_graph_from_doi.py
+++ b/src/research_index_backend/create_graph_from_doi.py
@@ -365,9 +365,10 @@ def entry_point(db: Driver):
     doi_manager = main(list_of_dois, limit=args.limit, update_metadata=args.update_metadata)
     add_country_relations()
-    report = doi_manager.ingestion_metrics()
-    logger.info(f"Report: {report}")
-    print(f"Report: {report}")
+    metrics, processed_dois = doi_manager.ingestion_metrics()
+    logger.info(f"Report: {metrics}, {processed_dois}")
+    print(f"Report: {metrics}")
+    return metrics, processed_dois
 
 if __name__ == "__main__":
     entry_point()
\ No newline at end of file
diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py
index eac7e0e..33afac4 100644
--- a/src/research_index_backend/doi.py
+++ b/src/research_index_backend/doi.py
@@ -7,20 +7,19 @@
 4. Batch processing with limits: TODO
 """
 
-from pydantic import BaseModel
+import time
 from logging import getLogger
 from re import IGNORECASE, compile
-import time
 from typing import Dict, List
+
 from neo4j import Driver
+from pydantic import BaseModel
 
 from .session import connect_to_db
 
 logger = getLogger(__name__)
 
 DOI_PATTERN = "10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$"
-
-
 class DOI(BaseModel):
     doi: str
     valid_pattern: bool = False
@@ -68,28 +67,26 @@ def end_ingestion(self):
 
     def pattern_check(self):
         try:
+            self.valid_pattern_dois = []
+            self.invalid_pattern_dois = []
+
             for doi in self.doi_tracker:
                 if search := self.PATTERN.search(doi):
                     logger.debug(f"Valid DOI pattern: {search.group()}")
                     self.doi_tracker[doi].valid_pattern = True
+                    self.valid_pattern_dois.append(doi)
                 else:
-                    logger.warning(f"Invalid DOI pattern: {doi.doi}")
+                    logger.warning(f"Invalid DOI pattern: {doi}")
+                    self.invalid_pattern_dois.append(doi)
+
+            self.num_valid_pattern_dois = len(self.valid_pattern_dois)
+            self.num_invalid_pattern_dois = len(self.invalid_pattern_dois)
         except Exception as e:
             logger.error(f"Error whilst checking DOI pattern: {e}")
             raise
 
     @connect_to_db
     def search_dois(self, db: Driver):
-        valid_dois = [
-            doi.doi for doi in self.doi_tracker.values() if doi.valid_pattern
-        ]
-
-        self.num_valid_pattern_dois = len(valid_dois)
-        self.num_invalid_pattern_dois = (
-            len(self.doi_tracker) - self.num_valid_pattern_dois
-        )
-
-        if not valid_dois:
+        if not self.valid_pattern_dois:
             msg = "No DOIs have passed the pattern check and make sure to run pattern check first."
             logger.warning(msg)
             raise ValueError(msg)
@@ -98,18 +95,21 @@ def search_dois(self, db: Driver):
         OPTIONAL MATCH (o:Output {doi: doi})
         RETURN doi, COUNT(o) > 0 as exists"""
         try:
-            results, _, _ = db.execute_query(query, dois=valid_dois)
-            existing_dois = [
-                record["doi"] for record in results if record["exists"]
-            ]
+            results, _, _ = db.execute_query(
+                query, dois=self.valid_pattern_dois
+            )
+            self.existing_dois = [
+                record["doi"] for record in results if record["exists"]
+            ]
+            self.new_dois = [
+                record["doi"] for record in results if not record["exists"]
+            ]
             for doi in self.doi_tracker:
-                if doi in existing_dois:
+                if doi in self.existing_dois:
                     self.doi_tracker[doi].already_exists = True
 
-            self.num_existing_dois = len(existing_dois)
-            self.num_new_dois = (
-                self.num_valid_pattern_dois - self.num_existing_dois
-            )
+            self.num_new_dois = len(self.new_dois)
+            self.num_existing_new_dois = len(self.existing_dois)
 
         except Exception as e:
             logger.error(f"Error whilst searching for DOIs: {e}")
             raise
@@ -126,56 +126,67 @@ def validate_dois(self) -> Dict[str, List[str]]:
 
     def ingestion_metrics(self) -> Dict[str, int]:
         total_time = self.end_time - self.start_time
-        metadata_failure = 0
-
-        if self.update_metadata:
-            metadata_failure = self.num_valid_pattern_dois - len(
-                [
-                    doi
-                    for doi in self.doi_tracker
-                    if self.doi_tracker[doi].ingestion_success
-                ]
-            )
-        else:
-            for doi in self.doi_tracker:
-                if (
-                    not self.doi_tracker[doi].ingestion_success
-                    and not self.doi_tracker[doi].already_exists
-                ):
-                    metadata_failure += 1
-
-        num_ingested_dois = len(
-            [
-                doi
-                for doi in self.doi_tracker
-                if self.doi_tracker[doi].ingestion_success
-            ]
-        )
 
-        openalex_success = len(
-            [
-                doi
-                for doi in self.doi_tracker
-                if self.doi_tracker[doi].openalex_metadata
-            ]
-        )
-        openaire_success = len(
-            [
-                doi
-                for doi in self.doi_tracker
-                if self.doi_tracker[doi].openaire_metadata
-            ]
-        )
+        processed_dois = (
+            self.valid_pattern_dois if self.update_metadata else self.new_dois
+        )
 
-        return {
+        metadata_pass = [
+            doi
+            for doi in self.doi_tracker
+            if self.doi_tracker[doi].ingestion_success
+            and doi in processed_dois
+        ]
+        metadata_failure = [
+            doi
+            for doi in self.doi_tracker
+            if not self.doi_tracker[doi].ingestion_success
+            and doi in processed_dois
+        ]
+
+        self.ingested_dois = [
+            doi
+            for doi in self.doi_tracker
+            if self.doi_tracker[doi].ingestion_success
+        ]
+
+        openalex_success = [
+            doi
+            for doi in processed_dois
+            if self.doi_tracker[doi].openalex_metadata
+        ]
+        openaire_success = [
+            doi
+            for doi in processed_dois
+            if self.doi_tracker[doi].openaire_metadata
+        ]
+
+        metrics = {
             "submitted_dois": len(self.list_of_dois),
+            "processed_dois": len(processed_dois),
             "new_dois": self.num_new_dois,
-            "existing_dois": self.num_existing_dois,
-            "ingested_dois": num_ingested_dois,
-            "metadata_failure": metadata_failure,
+            "existing_dois": self.num_existing_new_dois,
+            "ingested_dois": len(self.ingested_dois),
+            "metadata_pass": len(metadata_pass),
+            "metadata_failure": len(metadata_failure),
             "valid_pattern_dois": self.num_valid_pattern_dois,
             "invalid_pattern_dois": self.num_invalid_pattern_dois,
-            "openalex_success": openalex_success,
-            "openaire_success": openaire_success,
+            "openalex_success": len(openalex_success),
+            "openaire_success": len(openaire_success),
             "total_time_seconds": round(total_time, 3),
         }
+
+        doi_states = {
+            "submitted_dois": self.list_of_dois,
+            "processed_dois": processed_dois,
+            "new_dois": self.new_dois,
+            "existing_dois": self.existing_dois,
+            "ingested_dois": self.ingested_dois,
+            "metadata_pass": metadata_pass,
+            "metadata_failure": metadata_failure,
+            "openalex_success": openalex_success,
+            "openaire_success": openaire_success,
+            "valid_pattern_dois": self.valid_pattern_dois,
+            "invalid_pattern_dois": self.invalid_pattern_dois,
+        }
+        return metrics, doi_states

From be6deab3c2a67530d17913a4dcdab4b52cc9cd2f Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Sun, 2 Feb 2025 22:45:33 +0100
Subject: [PATCH 09/33] removed unused validation check

---
 .../create_graph_from_doi.py | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py
index b9b19a1..e0b9626 100644
--- a/src/research_index_backend/create_graph_from_doi.py
+++ b/src/research_index_backend/create_graph_from_doi.py
@@ -53,31 +53,6 @@
     level=DEBUG,
 )
 
-
-# Use regex pattern from
-# https://www.crossref.org/blog/dois-and-matching-regular-expressions/
-PATTERN = compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", IGNORECASE)
-
-
-def validate_dois(list_of_dois: List) -> Dict[str, List]:
-    """Validate DOIs"""
-
-    dois = defaultdict(list)
-    # Iterate over the list of possible DOIs and return valid, otherwise raise
-    # a warning
-    for doi in list_of_dois:
-        if not doi == "":
-            search = PATTERN.search(doi)
-            if search:
-                logger.info(f"{search.group()} is a valid DOI.")
-                dois["valid"].append(search.group())
-            else:
-                logger.warning(f"{doi} is not a DOI.")
-                dois["invalid"].append(doi)
-
-    return dois
-
-
 @connect_to_db
 def match_author_name(db: Driver, author: Dict) -> List:
     name = f"{author['first_name'][0]} {author['last_name']}"

From 175600ccbdefb2be958b1e5e1b90022532e17c54 Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Sun, 2 Feb 2025 22:47:23 +0100
Subject: [PATCH 10/33] previous pattern was accepting malformed dois

---
 src/research_index_backend/doi.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py
index 33afac4..ce72a55 100644
--- a/src/research_index_backend/doi.py
+++ b/src/research_index_backend/doi.py
@@ -19,7 +19,10 @@
 
 logger = getLogger(__name__)
 
-DOI_PATTERN = "10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$"
+# Use regex pattern from
+# https://www.crossref.org/blog/dois-and-matching-regular-expressions/
+DOI_PATTERN = r"10\.\d{4,9}/(?=.*\d)[-._;()/:A-Z0-9]+$"
+
 class DOI(BaseModel):
     doi: str
     valid_pattern: bool = False
@@ -48,8 +51,8 @@ def __init__(
         self.list_of_dois = [
             doi.strip()
             .rstrip(".")
-            .replace("doi.org/", "")
             .replace("https://doi.org/", "")
+            .replace("doi.org/", "")
             for doi in list_of_dois
         ]
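Note: the effect of the `(?=.*\d)` lookahead added above, shown against the old pattern (a quick check; both compiled with IGNORECASE as in doi.py):

```python
from re import IGNORECASE, compile

OLD = compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", IGNORECASE)
NEW = compile(r"10\.\d{4,9}/(?=.*\d)[-._;()/:A-Z0-9]+$", IGNORECASE)

assert OLD.search("10.5281/zenodo")          # malformed DOI slipped through
assert NEW.search("10.5281/zenodo") is None  # rejected: no digit after the slash
assert NEW.search("10.5281/zenodo.8140241")  # well-formed DOIs still match
```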
From 57218e044e689804407c5d861033d236f562cf75 Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Sun, 2 Feb 2025 22:49:08 +0100
Subject: [PATCH 11/33] updated tests for doi manager

---
 tests/test_dois.py         | 96 ++++++++++++++++++++++++++++++++++++++
 tests/test_validate_doi.py | 35 --------------
 2 files changed, 96 insertions(+), 35 deletions(-)
 create mode 100644 tests/test_dois.py
 delete mode 100644 tests/test_validate_doi.py

diff --git a/tests/test_dois.py b/tests/test_dois.py
new file mode 100644
index 0000000..b34c9a9
--- /dev/null
+++ b/tests/test_dois.py
@@ -0,0 +1,96 @@
+from research_index_backend.doi import DOIManager
+
+valid_dois = [
+    "10.5281/zenodo.8140241",
+    "10.5281/ZENODO.8140241",
+    "10.5281/zenodo.8141555",
+    "10.5281/zenodo.8140100",
+    "10.5281/zenodo.8140153",
+    "10.5281/zenodo.8139242",
+    "10.5281/zenodo.8140226",
+    "10.5281/zenodo.8140289",
+]
+
+invalid_dois = [
+    "",
+    "non_empty_string",
+    "10.5281zenodo.8140226",
+    "10.5281/zenodo",
+]
+
+raw_dois = [
+    "10.1371/journal.pclm.0000331",
+    "doi.org/10.5281/zenodo.11395843",
+    "doi.org/10.5281/zenodo.11396572",
+    "10.5281/zenodo.11396370",
+    "https://doi.org/10.5281/zenodo.11395518",
+    "10.5281/zenodo.11395518.",
+    " 10.5281/zenodo.11395519 ",
+]
+
+cleaned_dois = [
+    "10.1371/journal.pclm.0000331",
+    "10.5281/zenodo.11395843",
+    "10.5281/zenodo.11396572",
+    "10.5281/zenodo.11396370",
+    "10.5281/zenodo.11395518",
+    "10.5281/zenodo.11395518",
+    "10.5281/zenodo.11395519",
+]
+
+def test_valid_dois():
+    """Test that valid DOI patterns are correctly identified."""
+    doi_manager = DOIManager(valid_dois, limit=len(valid_dois), update_metadata=False)
+    doi_manager.pattern_check()
+    for doi in doi_manager.doi_tracker:
+        assert doi_manager.doi_tracker[doi].valid_pattern
+
+def test_invalid_dois():
+    """Test that invalid DOI patterns are correctly identified."""
+    doi_manager = DOIManager(invalid_dois, limit=len(invalid_dois), update_metadata=False)
+    doi_manager.pattern_check()
+    for doi in doi_manager.doi_tracker:
+        assert not doi_manager.doi_tracker[doi].valid_pattern
+
+def test_mixed_dois():
+    """Test processing of mixed valid and invalid DOIs."""
+    doi_manager = DOIManager(
+        valid_dois + invalid_dois,
+        limit=len(valid_dois + invalid_dois),
+        update_metadata=False,
+    )
+    doi_manager.pattern_check()
+    valid_count = sum(1 for doi in doi_manager.doi_tracker.values() if doi.valid_pattern)
+    invalid_count = sum(1 for doi in doi_manager.doi_tracker.values() if not doi.valid_pattern)
+
+    assert valid_count == len(valid_dois)
+    assert invalid_count == len(invalid_dois)
+
+def test_doi_objects():
+    """Test DOI object initialization and default values."""
+    doi_manager = DOIManager(valid_dois, limit=len(valid_dois), update_metadata=False)
+    doi_manager.pattern_check()
+
+    for doi in doi_manager.doi_tracker:
+        doi_obj = doi_manager.doi_tracker[doi]
+        assert doi_obj.doi == doi, "DOI string mismatch"
+        assert doi_obj.valid_pattern, "Pattern should be valid"
+        assert not doi_obj.already_exists, "Should not exist by default"
+        assert not doi_obj.openalex_metadata, "Should not have OpenAlex metadata"
+        assert not doi_obj.openaire_metadata, "Should not have OpenAire metadata"
+        assert not doi_obj.ingestion_success, "Should not be ingested"
+
+def test_pattern_cleaner():
+    """Test DOI pattern cleaning functionality."""
+    doi_manager = DOIManager(raw_dois, limit=len(raw_dois), update_metadata=False)
+    assert doi_manager.list_of_dois == cleaned_dois, "DOI cleaning failed"
+
+def test_case_insensitive_pattern():
+    """Test that DOI pattern matching is case insensitive."""
+    doi_manager = DOIManager(
+        ["10.5281/zenodo.8140241", "10.5281/ZENODO.8140241"],
+        limit=2,
+        update_metadata=False
+    )
+    doi_manager.pattern_check()
+    assert all(doi.valid_pattern for doi in doi_manager.doi_tracker.values())
\ No newline at end of file
diff --git a/tests/test_validate_doi.py b/tests/test_validate_doi.py
deleted file mode 100644
index 4c4e98f..0000000
--- a/tests/test_validate_doi.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from pytest import fixture
-
-from research_index_backend.create_graph_from_doi import validate_dois
-
-
-@fixture
-def invalid():
-    return [
-        "10.1371/journal.pclm.0000331",
-        "",
-        "doi.org/10.5281/zenodo.11395843",
-        "doi.org/10.5281/zenodo.11396572",
-        "10.5281/zenodo.11396370",
-        "https://doi.org/10.5281/zenodo.11395518",
-    ]
-
-
-@fixture
-def expected():
-    return [
-        "10.1371/journal.pclm.0000331",
-        "10.5281/zenodo.11395843",
-        "10.5281/zenodo.11396572",
-        "10.5281/zenodo.11396370",
-        "10.5281/zenodo.11395518",
-    ]
-
-
-class TestValidateDois:
-
-    def test_validate_dois(self, invalid, expected):
-
-        actual = validate_dois(invalid)
-        assert actual["valid"] == expected
-        assert actual["invalid"] == []

From 53e87fca9b8b1a91d100abb7640954bccc50f51d Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Mon, 3 Feb 2025 14:05:53 +0100
Subject: [PATCH 12/33] fix to doitracker

---
 src/research_index_backend/doi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py
index ce72a55..34257ef 100644
--- a/src/research_index_backend/doi.py
+++ b/src/research_index_backend/doi.py
@@ -32,7 +32,7 @@ class DOI(BaseModel):
     ingestion_success: bool = False
 
 
-class DOITracker(DOI):
+class DOITracker(BaseModel):
     doi_tracker: Dict[str, DOI]
 
 
@@ -57,7 +57,7 @@ def __init__(
         ]
         self.limit = limit
         self.update_metadata = update_metadata
-        self.doi_tracker = {
+        self.doi_tracker: DOITracker = {
            doi: DOI(doi=doi) for doi in self.list_of_dois[: self.limit]
         }
         self.PATTERN = compile(DOI_PATTERN, IGNORECASE)

From 9b0172ec55403617c66b507525bf4df5f2046358 Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Tue, 4 Feb 2025 10:13:24 +0100
Subject: [PATCH 13/33] Changed config so that token is only read on demand,
 rather than module load

---
 src/research_index_backend/config.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/research_index_backend/config.py b/src/research_index_backend/config.py
index 9cabfb0..f0d0704 100644
--- a/src/research_index_backend/config.py
+++ b/src/research_index_backend/config.py
@@ -10,7 +10,7 @@ class Config:
 
     def __init__(self):
         load_dotenv()
-
+
         self.mg_host: str = os.getenv("MG_HOST", "127.0.0.1")
         self.mg_port: int = int(os.getenv("MG_PORT", 7687))
         self.mg_port_alt: int = int(os.getenv("MG_PORT_ALT", 7444))
@@ -32,14 +32,22 @@ def __init__(self):
         )
         self.openaire_token_endpoint = f"{self.openaire_service}/uoa-user-management/api/users/getAccessToken"
 
-        self._validate()
+        self.refresh_token: str = ""
+        self.token = None
+
+    @property
+    def refresh_token(self):
+        return os.getenv("REFRESH_TOKEN", None)
 
-    @property
-    def refresh_token(self):
-        return os.getenv("REFRESH_TOKEN")
-    @property
-    def token(self):
-        return self._get_personal_token()
+    @property
+    def token(self):
+        if self.token:
+            return self.token
+        else:
+            self.token = self._get_personal_token()
+            return self.token
+
+        self._validate()
 
     def _validate(self):
         if not 0 <= self.orcid_name_similarity_threshold <= 1:
@@ -65,12 +73,12 @@ def _get_personal_token(self) -> str:
         except requests.JSONDecodeError as e:
             logger.error(f"Error decoding JSON response: {e}")
             raise ValueError(
-                "Failed to obtain personal token due to JSON decode error. Check if the refresh token is correct or has not expired."
+                "Failed to obtain personal token due to JSON decode error"
             )
         else:
             raise ValueError(
                 "No refresh token found, could not obtain personal token"
             )
+
 config = Config()

From e8cee175c8badcc138fae8b4c564c8b56f7c8741 Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Tue, 4 Feb 2025 10:30:39 +0100
Subject: [PATCH 14/33] Use properties to access read-only token variables

---
 src/research_index_backend/config.py | 32 +++++++++++++++++-----------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/research_index_backend/config.py b/src/research_index_backend/config.py
index f0d0704..d9b8b59 100644
--- a/src/research_index_backend/config.py
+++ b/src/research_index_backend/config.py
@@ -32,22 +32,28 @@ def __init__(self):
         )
         self.openaire_token_endpoint = f"{self.openaire_service}/uoa-user-management/api/users/getAccessToken"
 
-        self.refresh_token: str = ""
-        self.token = None
-
-    @property
-    def refresh_token(self):
-        return os.getenv("REFRESH_TOKEN", None)
+        self._refresh_token: str = ""
+        self._token = None
+        self._validate()
 
-    @property
-    def token(self):
-        if self.token:
-            return self.token
+    @property
+    def refresh_token(self):
+        if self._refresh_token:
+            return self._refresh_token
+        else:
+            self._refresh_token = os.getenv("REFRESH_TOKEN", None)
+        if self._refresh_token:
+            return self._refresh_token
         else:
-            self.token = self._get_personal_token()
-            return self.token
+            raise ValueError("No refresh token provided")
 
-        self._validate()
+    @property
+    def token(self):
+        if self._token:
+            return self._token
+        else:
+            self._token = self._get_personal_token()
+            return self._token
 
     def _validate(self):
         if not 0 <= self.orcid_name_similarity_threshold <= 1:

From c6941c777d28d9c4df1dc90270b44518cbe0a399 Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Tue, 4 Feb 2025 10:33:30 +0100
Subject: [PATCH 15/33] Remove unused global variable

---
 src/research_index_backend/config.py |  2 +-
 .../create_graph_from_doi.py         | 74 ++++++++++---------
 2 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/src/research_index_backend/config.py b/src/research_index_backend/config.py
index d9b8b59..266f51b 100644
--- a/src/research_index_backend/config.py
+++ b/src/research_index_backend/config.py
@@ -33,7 +33,7 @@ def __init__(self):
         self.openaire_token_endpoint = f"{self.openaire_service}/uoa-user-management/api/users/getAccessToken"
 
         self._refresh_token: str = ""
-        self._token = None
+        self._token: str = ""
         self._validate()
 
diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py
index e0b9626..d4b79f5 100644
--- a/src/research_index_backend/create_graph_from_doi.py
+++ b/src/research_index_backend/create_graph_from_doi.py
@@ -13,11 +13,9 @@
 """
 
 import argparse
-from collections import defaultdict
 from difflib import SequenceMatcher
 from logging import DEBUG, basicConfig, getLogger
 from os.path import join
-from re import IGNORECASE, compile
 from typing import Dict, List
 from uuid import uuid4
 
@@ -28,13 +26,11 @@
 
 from .config import config
 from .create_graph import load_initial_data
+from .doi import DOIManager
 from .get_metadata import MetadataFetcher
 from .models import AnonymousArticle, Article, Author
 from .parser import parse_metadata
 from .session import connect_to_db
-from .doi import DOIManager
-
-TOKEN = config.token
 
 MG_HOST = config.mg_host
 MG_PORT = config.mg_port
@@ -53,6 +49,7 @@
     level=DEBUG,
 )
 
+
 @connect_to_db
 def match_author_name(db: Driver, author: Dict) -> List:
     name = f"{author['first_name'][0]} {author['last_name']}"
@@ -195,51 +192,51 @@ def upload_article_to_memgraph(output: AnonymousArticle) -> bool:
     return True
 
 
-def main(list_of_dois: list,limit: int, update_metadata: bool):
-    try:
-        doi_manager = DOIManager(list_of_dois, limit=limit, update_metadata=update_metadata)
-
+def main(list_of_dois: list, limit: int, update_metadata: bool):
+    try:
+        doi_manager = DOIManager(
+            list_of_dois, limit=limit, update_metadata=update_metadata
+        )
+
         doi_manager.start_ingestion()
         doi_manager.validate_dois()
         if not doi_manager.update_metadata and not doi_manager.num_new_dois:
-            logger.warning("No new DOIs to process or valid existing DOIs to update.")
+            logger.warning(
+                "No new DOIs to process or valid existing DOIs to update."
+            )
             doi_manager.end_ingestion()
             return doi_manager
     except Exception as e:
         logger.error(f"Error validating DOIs: {e}")
         raise e
 
     session = requests_cache.CachedSession("doi_cache", expire_after=30)
     metadata_fetcher = MetadataFetcher(session)
-
+
     for doi in tqdm(doi_manager.doi_tracker):
-        if doi_manager.doi_tracker[doi].already_exists and not doi_manager.update_metadata:
+        if (
+            doi_manager.doi_tracker[doi].already_exists
+            and not doi_manager.update_metadata
+        ):
             logger.info(f"DOI {doi} already exists in the database.")
-            continue
+            continue
         try:
             openalex_metadata = metadata_fetcher.get_output_metadata(
                 doi, source="OpenAlex"
             )
             doi_manager.doi_tracker[doi].openalex_metadata = True
         except ValueError as ex:
-            logger.error(
-                f"No OpenAlex metadata found for doi {doi}: {ex}"
-            )
-            openalex_metadata = {"id": None}
+            logger.error(f"No OpenAlex metadata found for doi {doi}: {ex}")
+            openalex_metadata = {"id": None}
         try:
             metadata = metadata_fetcher.get_output_metadata(
                 doi, source="OpenAire"
             )
             doi_manager.doi_tracker[doi].openaire_metadata = True
         except ValueError as ex:
-            logger.error(f"No OpenAire metadata found for doi {doi}: {ex}")
+            logger.error(f"No OpenAire metadata found for doi {doi}: {ex}")
         else:
-            outputs_metadata = parse_metadata(
-                metadata, doi, openalex_metadata
-            )
+            outputs_metadata = parse_metadata(metadata, doi, openalex_metadata)
             for output in outputs_metadata:
                 try:
                     result = upload_article_to_memgraph(output)
@@ -262,24 +259,26 @@ def argument_parser():
         description="Process DOIs and create/update a graph database"
     )
     parser.add_argument(
-        "list_of_dois",
-        help="Path to CSV file containing list of DOIs"
+        "list_of_dois", help="Path to CSV file containing list of DOIs"
     )
     parser.add_argument(
-        "-i", "--initialise",
+        "-i",
+        "--initialise",
         action="store_true",
-        help="Delete existing data and create new database"
+        help="Delete existing data and create new database",
     )
     parser.add_argument(
-        "-l", "--limit",
+        "-l",
+        "--limit",
         type=int,
         default=50,
-        help="Limit number of DOIs to process (default: 50)"
+        help="Limit number of DOIs to process (default: 50)",
    )
     parser.add_argument(
-        "-u", "--update-metadata",
+        "-u",
+        "--update-metadata",
         action="store_true",
-        help="Update metadata for existing DOIs"
+        help="Update metadata for existing DOIs",
     )
     return parser.parse_args()
 
@@ -326,7 +325,7 @@ def add_country_relations(db: Driver):
 
 @connect_to_db
 def entry_point(db: Driver):
     """This is the console entry point to the programme"""
-    args = argument_parser()
+    args = argument_parser()
     list_of_dois = []
     with open(args.list_of_dois, "r") as csv_file:
         for line in csv_file:
@@ -338,12 +337,15 @@ def entry_point(db: Driver):
         logger.info("Deleted graph")
         load_initial_data(join("data", "init"))
 
-    doi_manager = main(list_of_dois, limit=args.limit, update_metadata=args.update_metadata)
+    doi_manager = main(
+        list_of_dois, limit=args.limit, update_metadata=args.update_metadata
+    )
     add_country_relations()
     metrics, processed_dois = doi_manager.ingestion_metrics()
     logger.info(f"Report: {metrics}, {processed_dois}")
-    print(f"Report: {metrics}")
+    print(f"Report: {metrics}")
     return metrics, processed_dois
 
 
 if __name__ == "__main__":
-    entry_point()
\ No newline at end of file
+    entry_point()

From 29dc51ccf6cf277ac0b74ad01cf68683f71a7e2f Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Tue, 4 Feb 2025 11:53:32 +0100
Subject: [PATCH 16/33] tabular print out for metrics and entry point should
 return None

---
 src/research_index_backend/create_graph_from_doi.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py
index d4b79f5..cc8becd 100644
--- a/src/research_index_backend/create_graph_from_doi.py
+++ b/src/research_index_backend/create_graph_from_doi.py
@@ -322,7 +322,7 @@ def add_country_relations(db: Driver):
 
 
 @connect_to_db
-def entry_point(db: Driver):
+def entry_point(db: Driver) -> None:
     """This is the console entry point to the programme"""
 
     args = argument_parser()
@@ -342,9 +342,14 @@ def entry_point(db: Driver) -> None:
     )
     add_country_relations()
     metrics, processed_dois = doi_manager.ingestion_metrics()
+
     logger.info(f"Report: {metrics}, {processed_dois}")
-    print(f"Report: {metrics}")
-    return metrics, processed_dois
+
+    max_key_length = max(len(key) for key in metrics.keys())
+    print(f"{'Metric'.ljust(max_key_length)} | Value")
+    print("-" * (max_key_length + 9))
+    for key, value in metrics.items():
+        print(f"{key.ljust(max_key_length)} | {value}")
 
 
 if __name__ == "__main__":

From 813db561f80c49209c9369ed87223b5592ea89d8 Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Tue, 4 Feb 2025 14:45:56 +0100
Subject: [PATCH 17/33] added docstring for doi manager and the limit logic

---
 src/research_index_backend/doi.py | 56 +++++++++++++++++++++++++++----
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py
index 34257ef..27adb72 100644
--- a/src/research_index_backend/doi.py
+++ b/src/research_index_backend/doi.py
@@ -37,17 +37,61 @@ class DOITracker(BaseModel):
 
 
 class DOIManager:
+    """Manages the validation and ingestion tracking of Digital Object Identifiers (DOIs).
+
+    This class handles DOI validation, database existence checks, and metadata tracking.
+    It processes DOIs up to a specified limit and can optionally update metadata
+    for existing entries.
+
+    Parameters
+    ----------
+    list_of_dois : List[str]
+        List of DOI strings to process
+    limit : int
+        Maximum number of DOIs to process from the list
+    update_metadata : bool, optional
+        Whether to update metadata for existing DOIs (default is True)
+
+    Attributes
+    ----------
+    doi_tracker : Dict[str, DOI]
+        Dictionary tracking the state of each processed DOI
+    valid_pattern_dois : List[str]
+        DOIs that match the valid pattern
+    invalid_pattern_dois : List[str]
+        DOIs that don't match the valid pattern
+    existing_dois : List[str]
+        DOIs that already exist in the database
+    new_dois : List[str]
+        DOIs that are not yet in the database
+
+    Methods
+    -------
+    validate_dois()
+        Performs pattern validation and database existence checks
+    ingestion_metrics()
+        Returns metrics about the ingestion process
+    pattern_check()
+        Validates DOI patterns against the standard format
+    search_dois()
+        Checks database for existing DOIs
+
+    Raises
+    ------
+    ValueError
+        If DOI list is empty or limit is invalid
+    """
     def __init__(
         self, list_of_dois: List[str], limit: int, update_metadata=True
     ) -> None:
 
         if not list_of_dois:
             raise ValueError("DOI list cannot be empty")
-        if (limit <= 0) or (limit > len(list_of_dois)):
+        if limit <= 0:
             raise ValueError(
-                "Limit must be positive and less than the number of DOIs"
+                "Limit must be positive and less than or equal to the number of DOIs."
             )
 
         self.list_of_dois = [
             doi.strip()
             .rstrip(".")
             .replace("https://doi.org/", "")
             .replace("doi.org/", "")
             for doi in list_of_dois
         ]
-        self.limit = limit
+        self.limit = limit if limit < len(self.list_of_dois) else len(self.list_of_dois)
         self.update_metadata = update_metadata
@@ -132,9 +176,9 @@ def pattern_check(self):
             raise
 
     @connect_to_db
-    def search_dois(self, db: Driver):
+    def search_dois(self, db: Driver) -> None:
         if not self.valid_pattern_dois:
-            msg = "No DOIs have passed the pattern check and make sure to run pattern check first."
+            msg = "None of the provided DOIs match the valid pattern."
             logger.warning(msg)
             raise ValueError(msg)

From 55d5dca32db06e13b33eb32c330c3b8e1179c209 Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Wed, 5 Feb 2025 08:14:33 +0100
Subject: [PATCH 18/33] Fix parser where no date, clean titles

---
 src/research_index_backend/parser.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/research_index_backend/parser.py b/src/research_index_backend/parser.py
index 29ad565..b4a0018 100644
--- a/src/research_index_backend/parser.py
+++ b/src/research_index_backend/parser.py
@@ -112,12 +112,12 @@ def parse_metadata(
         for x in title_meta:
             count += 1
             if x["@classid"] == "main title":
-                title = x["$"]
+                title = clean_html(x["$"])
                 break
             else:
                 pass
     else:
-        title = title_meta["$"]
+        title = clean_html(title_meta["$"])
 
     logger.info(f"Parsing output {title}")
     publisher = entity.get("publisher", None)
@@ -178,6 +178,9 @@ def parse_metadata(
     issue = None
     volume = None
 
+    publication_year = None
+    publication_month = None
+    publication_day = None
     # Get the acceptance date:
     date_of_acceptance = entity.get("dateofacceptance", None)

From 6d2b228988d5894844dca18bf5404507d0642d2b Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Wed, 5 Feb 2025 08:21:04 +0100
Subject: [PATCH 19/33] Remove line breaks in parsed material

---
 src/research_index_backend/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/research_index_backend/utils.py b/src/research_index_backend/utils.py
index 7692327..b64549e 100644
--- a/src/research_index_backend/utils.py
+++ b/src/research_index_backend/utils.py
@@ -17,6 +17,8 @@ def clean_html(raw_html):
         .replace("  ", " ")
         .replace("  ", " ")
         .replace("  ", " ")
+        .replace("\n", " ")
+        .strip()
     )
     return unescape(normalize("NFC", cleantext))

From 9e13ac2a021469f48386ee6be39690923a4141e9 Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Wed, 5 Feb 2025 08:22:28 +0100
Subject: [PATCH 20/33] Add test for cleaning line breaks

---
 tests/test_utilities.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/test_utilities.py b/tests/test_utilities.py
index c180221..72aa6a4 100644
--- a/tests/test_utilities.py
+++ b/tests/test_utilities.py
@@ -16,3 +16,11 @@ def test_clean_utf():
     actual = clean_html(text)
 
     assert actual == expected
+
+
+def test_clean_line_breaks():
+    text = " \n bla\nbla"
+    expected = "bla bla"
+    actual = clean_html(text)
+
+    assert actual == expected

From ab7df3cc42f6e1a96e98b345e3276eaa37704187 Mon Sep 17 00:00:00 2001
From: Will Usher
Date: Wed, 5 Feb 2025 08:28:34 +0100
Subject: [PATCH 21/33] Updated test and function to remove double spaces
 after line breaks

---
 src/research_index_backend/utils.py | 2 +-
 tests/test_utilities.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/research_index_backend/utils.py b/src/research_index_backend/utils.py
index b64549e..6ee48f0 100644
--- a/src/research_index_backend/utils.py
+++ b/src/research_index_backend/utils.py
@@ -9,6 +9,7 @@ def clean_html(raw_html):
     """Remove HTML markup from a string and normalize UTF8"""
     cleantext = (
         sub(CLEANR, "", raw_html)
+        .replace("\n", " ")
         .replace("\xa0", " ")
         .replace("\u00ad", " ")
         .replace("�", " ")
@@ -17,7 +18,6 @@ def clean_html(raw_html):
         .replace("  ", " ")
         .replace("  ", " ")
         .replace("  ", " ")
-        .replace("\n", " ")
         .strip()
     )
     return unescape(normalize("NFC", cleantext))
diff --git a/tests/test_utilities.py b/tests/test_utilities.py
index 72aa6a4..a8085dc 100644
--- a/tests/test_utilities.py
+++ b/tests/test_utilities.py
@@ -19,7 +19,7 @@ def test_clean_utf():
 
 
 def test_clean_line_breaks():
-    text = " \n bla\nbla"
+    text = " \n bla\n bla"
     expected = "bla bla"
     actual = clean_html(text)
 
     assert actual == expected

From d3c952de9bc2998eef0f20e306c0163fd702a944 Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Wed, 5 Feb 2025 15:24:51 +0100
Subject: [PATCH 22/33] updated error handling for search_dois method

---
 src/research_index_backend/doi.py | 42 ++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py
index 27adb72..8c5fa06 100644
--- a/src/research_index_backend/doi.py
+++ b/src/research_index_backend/doi.py
@@ -13,6 +13,7 @@
 from typing import Dict, List
 
 from neo4j import Driver
+from neo4j.exceptions import ServiceUnavailable, Neo4jError  # https://neo4j.com/docs/api/python-driver/current/api.html#errors
 from pydantic import BaseModel
 
 from .session import connect_to_db
@@ -142,26 +143,33 @@ def search_dois(self, db: Driver) -> None:
         OPTIONAL MATCH (o:Output {doi: doi})
         RETURN doi, COUNT(o) > 0 as exists"""
         try:
-            results, _, _ = db.execute_query(
-                query, dois=self.valid_pattern_dois
-            )
-            self.existing_dois = [
-                record["doi"] for record in results if record["exists"]
-            ]
-            self.new_dois = [
-                record["doi"] for record in results if not record["exists"]
-            ]
-            for doi in self.doi_tracker:
-                if doi in self.existing_dois:
-                    self.doi_tracker[doi].already_exists = True
-
-            self.num_new_dois = len(self.new_dois)
-            self.num_existing_new_dois = len(self.existing_dois)
-
+            results, _, _ = db.execute_query(query, dois=self.valid_pattern_dois)
+        except ServiceUnavailable as e:
+            logger.error(f"Neo4j service unavailable: {e}")
+            raise
+        except Neo4jError as e:
+            logger.error(f"Neo4j error occurred during query execution: {e}")
+            raise
         except Exception as e:
-            logger.error(f"Error whilst searching for DOIs: {e}")
+            logger.error(f"Unexpected error whilst searching for DOIs: {e}")
             raise
+
+        self.existing_dois = [
+            record["doi"] for record in results if record["exists"]
+        ]
+        self.new_dois = [
+            record["doi"] for record in results if not record["exists"]
+        ]
+        for doi in self.doi_tracker:
+            if doi in self.existing_dois:
+                self.doi_tracker[doi].already_exists = True
+
+        self.num_new_dois = len(self.new_dois)
+        self.num_existing_new_dois = len(self.existing_dois)
+
+        logger.info(f"Found {self.num_existing_dois} existing and {self.num_new_dois} new DOIs")

From 53e50964a6b821e7676d9be2476069a3e396a7f0 Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Wed, 5 Feb 2025 15:54:45 +0100
Subject: [PATCH 23/33] fix naming error

---
 src/research_index_backend/doi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py
index 8c5fa06..ef60d2e 100644
--- a/src/research_index_backend/doi.py
+++ b/src/research_index_backend/doi.py
@@ -165,7 +165,7 @@ def search_dois(self, db: Driver) -> None:
                 self.doi_tracker[doi].already_exists = True
 
         self.num_new_dois = len(self.new_dois)
-        self.num_existing_new_dois = len(self.existing_dois)
+        self.num_existing_dois = len(self.existing_dois)
 
         logger.info(f"Found {self.num_existing_dois} existing and {self.num_new_dois} new DOIs")
@@ -220,7 +220,7 @@ def ingestion_metrics(self) -> Dict[str, int]:
             "submitted_dois": len(self.list_of_dois),
             "processed_dois": len(processed_dois),
             "new_dois": self.num_new_dois,
-            "existing_dois": self.num_existing_new_dois,
+            "existing_dois": self.num_existing_dois,
"ingested_dois": len(self.ingested_dois), "metadata_pass": len(metadata_pass), "metadata_failure": len(metadata_failure), From 911c57a7a65163379a5aff0a524842ed8fcc0add Mon Sep 17 00:00:00 2001 From: Francis Tembo Date: Wed, 5 Feb 2025 18:00:12 +0100 Subject: [PATCH 24/33] added input validation for doi manager --- src/research_index_backend/doi.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py index ef60d2e..e585da6 100644 --- a/src/research_index_backend/doi.py +++ b/src/research_index_backend/doi.py @@ -83,16 +83,10 @@ class DOIManager: If DOI list is empty or limit is invalid """ def __init__( - self, list_of_dois: List[str], limit: int, update_metadata=True + self, list_of_dois: List[str], limit: int, update_metadata: bool = True ) -> None: - if not list_of_dois: - raise ValueError("DOI list cannot be empty") - if limit <= 0: - raise ValueError( - "Limit must be positive and less than or equal to the number of DOIs." - ) - + self._validate_inputs(list_of_dois, limit, update_metadata) self.list_of_dois = [ doi.strip() .rstrip(".") @@ -107,12 +101,22 @@ def __init__( } self.PATTERN = compile(DOI_PATTERN, IGNORECASE) + def _validate_inputs(self, dois: List[str], limit: int, update_metadata: bool) -> None: + if not isinstance(dois, list): + raise TypeError("DOIs must be provided as a list") + if not dois: + raise ValueError("DOI list cannot be empty") + if not isinstance(limit, int) or limit <= 0: + raise ValueError("Limit must be a positive integer") + if not isinstance(update_metadata, bool): + raise TypeError("update_metadata must be a boolean") + def start_ingestion(self): self.start_time = time.time() def end_ingestion(self): self.end_time = time.time() - + def pattern_check(self): try: self.valid_pattern_dois = [] @@ -215,7 +219,6 @@ def ingestion_metrics(self) -> Dict[str, int]: for doi in processed_dois if self.doi_tracker[doi].openaire_metadata ] - metrics = { "submitted_dois": len(self.list_of_dois), "processed_dois": len(processed_dois), From 766c69a1e3459e416455e9ea6c7b60cce084afac Mon Sep 17 00:00:00 2001 From: Francis Tembo Date: Wed, 5 Feb 2025 18:05:44 +0100 Subject: [PATCH 25/33] updated doi tests --- tests/test_dois.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/test_dois.py b/tests/test_dois.py index b34c9a9..6f163ab 100644 --- a/tests/test_dois.py +++ b/tests/test_dois.py @@ -1,3 +1,4 @@ +import pytest from research_index_backend.doi import DOIManager valid_dois = [ @@ -93,4 +94,25 @@ def test_case_insensitive_pattern(): update_metadata=False ) doi_manager.pattern_check() - assert all(doi.valid_pattern for doi in doi_manager.doi_tracker.values()) \ No newline at end of file + assert all(doi.valid_pattern for doi in doi_manager.doi_tracker.values()) + +def test_invalid_limit(): + """Test that providing an invalid (negative) limit raises a ValueError.""" + with pytest.raises(ValueError): + # Expect DOIManager to raise an error upon invalid limit input. + doi_manager = DOIManager(["10.5281/zenodo.8140241"], limit=-5, update_metadata=False) + doi_manager.validate_dois() + +def test_wrong_type_for_doi_list(): + """Test that providing a wrong type (non-iterable) for DOI list raises a TypeError.""" + with pytest.raises(TypeError): + # Passing a single string instead of a list should raise a TypeError. 
+        DOIManager("10.5281/zenodo.8140241", limit=1, update_metadata=False)
+
+def test_wrong_type_for_update_metadata():
+    """Test that providing a wrong type for update_metadata raises a TypeError."""
+    with pytest.raises(TypeError):
+        # Passing a string instead of a boolean should raise a TypeError.
+        DOIManager(["10.5281/zenodo.8140241"], limit=1, update_metadata="False")
+
+# TODO: should the elements of the list of DOIs be checked for type, or is this handled in the entry point?
\ No newline at end of file

From fb098aadcffbc5325b226aeaa053ceb90922727b Mon Sep 17 00:00:00 2001
From: Francis Tembo
Date: Wed, 5 Feb 2025 18:20:12 +0100
Subject: [PATCH 26/33] including updated dois to report and minor formatting

---
 src/research_index_backend/doi.py | 37 ++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py
index e585da6..d9e0740 100644
--- a/src/research_index_backend/doi.py
+++ b/src/research_index_backend/doi.py
@@ -13,7 +13,10 @@
 from typing import Dict, List
 
 from neo4j import Driver
-from neo4j.exceptions import ServiceUnavailable, Neo4jError # https://neo4j.com/docs/api/python-driver/current/api.html#errors
+from neo4j.exceptions import (
+    ServiceUnavailable,
+    Neo4jError,
+)  # https://neo4j.com/docs/api/python-driver/current/api.html#errors
 from pydantic import BaseModel
 
 from .session import connect_to_db
@@ -24,6 +27,7 @@
 # https://www.crossref.org/blog/dois-and-matching-regular-expressions/
 DOI_PATTERN = r"10\.\d{4,9}/(?=.*\d)[-._;()/:A-Z0-9]+$"
 
+
 class DOI(BaseModel):
     doi: str
     valid_pattern: bool = False
@@ -82,6 +86,7 @@ class DOIManager:
     ValueError
         If DOI list is empty or limit is invalid
     """
+
     def __init__(
         self, list_of_dois: List[str], limit: int, update_metadata: bool = True
     ) -> None:
@@ -94,14 +99,18 @@ def __init__(
             .replace("doi.org/", "")
             for doi in list_of_dois
         ]
-        self.limit = limit if limit < len(self.list_of_dois) else len(self.list_of_dois)
+        self.limit = (
+            limit if limit < len(self.list_of_dois) else len(self.list_of_dois)
+        )
         self.update_metadata = update_metadata
         self.doi_tracker: DOITracker = {
             doi: DOI(doi=doi) for doi in self.list_of_dois[: self.limit]
         }
         self.PATTERN = compile(DOI_PATTERN, IGNORECASE)
 
-    def _validate_inputs(self, dois: List[str], limit: int, update_metadata: bool) -> None:
+    def _validate_inputs(
+        self, dois: List[str], limit: int, update_metadata: bool
+    ) -> None:
         if not isinstance(dois, list):
             raise TypeError("DOIs must be provided as a list")
         if not dois:
@@ -109,14 +118,14 @@ def __init__(
             raise ValueError("DOI list cannot be empty")
         if not isinstance(limit, int) or limit <= 0:
             raise ValueError("Limit must be a positive integer")
         if not isinstance(update_metadata, bool):
-            raise TypeError("update_metadata must be a boolean")
+            raise TypeError("update_metadata must be a boolean")
 
     def start_ingestion(self):
         self.start_time = time.time()
 
     def end_ingestion(self):
         self.end_time = time.time()
-
+
     def pattern_check(self):
         try:
             self.valid_pattern_dois = []
@@ -147,7 +156,9 @@ def search_dois(self, db: Driver) -> None:
         OPTIONAL MATCH (o:Output {doi: doi})
         RETURN doi, COUNT(o) > 0 as exists"""
         try:
-            results, _, _ = db.execute_query(query, dois=self.valid_pattern_dois)
+            results, _, _ = db.execute_query(
+                query, dois=self.valid_pattern_dois
+            )
         except ServiceUnavailable as e:
             logger.error(f"Neo4j service unavailable: {e}")
             raise
@@ -170,9 +181,10 @@ def search_dois(self, db: Driver) -> None:
             self.doi_tracker[doi].already_exists = True
 
         self.num_new_dois =
len(self.new_dois) self.num_existing_dois = len(self.existing_dois) - - logger.info(f"Found {self.num_existing_dois} existing and {self.num_new_dois} new DOIs") + logger.info( + f"Found {self.num_existing_dois} existing and {self.num_new_dois} new DOIs" + ) def validate_dois(self) -> Dict[str, List[str]]: try: @@ -203,6 +215,13 @@ def ingestion_metrics(self) -> Dict[str, int]: and doi in processed_dois ] + updated_existing_dois = [ + doi + for doi in processed_dois + if self.doi_tracker[doi].ingestion_success + and doi in self.existing_dois + ] + self.ingested_dois = [ doi for doi in self.doi_tracker @@ -224,6 +243,7 @@ def ingestion_metrics(self) -> Dict[str, int]: "processed_dois": len(processed_dois), "new_dois": self.num_new_dois, "existing_dois": self.num_existing_dois, + "updated_existing_dois": len(updated_existing_dois), "ingested_dois": len(self.ingested_dois), "metadata_pass": len(metadata_pass), "metadata_failure": len(metadata_failure), @@ -239,6 +259,7 @@ def ingestion_metrics(self) -> Dict[str, int]: "processed_dois": processed_dois, "new_dois": self.new_dois, "existing_dois": self.existing_dois, + "updated_existing_dois": updated_existing_dois, "ingested_dois": self.ingested_dois, "metadata_pass": metadata_pass, "metadata_failure": metadata_failure, From 0bb08dcfc7b8f77c5f0afc2a1de98ea5814c0587 Mon Sep 17 00:00:00 2001 From: Francis Tembo Date: Wed, 5 Feb 2025 19:28:48 +0100 Subject: [PATCH 27/33] added choice to save fetched metadata to disk --- readme.md | 5 +++-- .../create_graph_from_doi.py | 15 ++++++++++----- src/research_index_backend/get_metadata.py | 9 ++++++--- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/readme.md b/readme.md index f93054d..ccfa8c6 100644 --- a/readme.md +++ b/readme.md @@ -39,13 +39,14 @@ The package is not yet deployed to PyPI. 
Only an editable (development) install -i, --initialise Delete existing data and create new database -l, --limit N Limit number of DOIs to process (default: 50) -u, --update-metadata Update metadata for existing DOIs + -w, --write-metadata Save JSON responses to disk Examples: -> Process 10 DOIs from file: $ research_index list_of_dois.csv -l 10 # Process 10 DOIs from file - -> Update metadata for existing DOIs - $ research_index list_of_dois.csv --update-metadata + -> Update metadata for existing DOIs and save metadata + $ research_index list_of_dois.csv --update-metadata --write-metadata # Development diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py index cc8becd..4275c17 100644 --- a/src/research_index_backend/create_graph_from_doi.py +++ b/src/research_index_backend/create_graph_from_doi.py @@ -192,7 +192,7 @@ def upload_article_to_memgraph(output: AnonymousArticle) -> bool: return True -def main(list_of_dois: list, limit: int, update_metadata: bool): +def main(list_of_dois: list, limit: int, update_metadata: bool, write_metadata: bool): try: doi_manager = DOIManager( list_of_dois, limit=limit, update_metadata=update_metadata @@ -211,7 +211,7 @@ def main(list_of_dois: list, limit: int, update_metadata: bool): raise e session = requests_cache.CachedSession("doi_cache", expire_after=30) - metadata_fetcher = MetadataFetcher(session) + metadata_fetcher = MetadataFetcher(session, save_json=write_metadata) for doi in tqdm(doi_manager.doi_tracker): if ( @@ -280,6 +280,12 @@ def argument_parser(): action="store_true", help="Update metadata for existing DOIs", ) + parser.add_argument( + "-w", + "--write-metadata", + action="store_true", + help="Store metadata in JSON files", + ) return parser.parse_args() @@ -337,9 +343,8 @@ def entry_point(db: Driver) -> None: logger.info("Deleted graph") load_initial_data(join("data", "init")) - doi_manager = main( - list_of_dois, limit=args.limit, update_metadata=args.update_metadata - ) + doi_manager = main(list_of_dois, limit=args.limit, update_metadata=args.update_metadata, + write_metadata=args.write_metadata) add_country_relations() metrics, processed_dois = doi_manager.ingestion_metrics() diff --git a/src/research_index_backend/get_metadata.py b/src/research_index_backend/get_metadata.py index f167323..f230ece 100644 --- a/src/research_index_backend/get_metadata.py +++ b/src/research_index_backend/get_metadata.py @@ -9,9 +9,10 @@ from .config import config class MetadataFetcher: - def __init__(self, session: requests_cache.CachedSession, token: str = None): + def __init__(self, session: requests_cache.CachedSession, token: str = None, save_json: bool = False): self.session = session self.token = token or config.token + self.save_json = save_json self.logger = getLogger(__name__) basicConfig( filename="research_index_backend.log", @@ -45,7 +46,8 @@ def get_metadata_from_openaire(self, doi: str) -> Dict: if error := response.json().get("error"): raise ValueError(error) - self._save_json_response(response, "data/json/openaire", doi) + if self.save_json: + self._save_json_response(response, "data/json/openaire", doi) if response.json()["response"]["results"]: return response.json() @@ -62,7 +64,8 @@ def get_metadata_from_openalex(self, doi: str) -> Dict: try: response.raise_for_status() - self._save_json_response(response, "data/json/openalex", doi) + if self.save_json: + self._save_json_response(response, "data/json/openalex", doi) except requests.exceptions.HTTPError as err: 
self.logger.error(str(err)) From ff02627094e15a3766e6958f35bfc7b0c04712c8 Mon Sep 17 00:00:00 2001 From: Francis Tembo Date: Thu, 6 Feb 2025 08:09:41 +0100 Subject: [PATCH 28/33] rough fix for OpenAire 403 and tests --- src/research_index_backend/get_metadata.py | 29 ++++++++++++--------- tests/test_metadata.py | 30 +++++++++++++++------- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/research_index_backend/get_metadata.py b/src/research_index_backend/get_metadata.py index f230ece..639052d 100644 --- a/src/research_index_backend/get_metadata.py +++ b/src/research_index_backend/get_metadata.py @@ -38,21 +38,26 @@ def get_metadata_from_openaire(self, doi: str) -> Dict: headers = {"Authorization": f"Bearer {self.token}"} api_url = f"{config.openaire_api}/search/researchProducts" - response = self.session.get(api_url + query, headers=headers) - - self.logger.debug(f"Response code: {response.status_code}") - response.raise_for_status() + try: + response = self.session.get(api_url + query, headers=headers) + self.logger.debug(f"Response code: {response.status_code}") + response.raise_for_status() - if error := response.json().get("error"): - raise ValueError(error) + if error := response.json().get("error"): + raise ValueError(error) - if self.save_json: - self._save_json_response(response, "data/json/openaire", doi) + if self.save_json: + self._save_json_response(response, "data/json/openaire", doi) - if response.json()["response"]["results"]: - return response.json() - else: - raise ValueError(f"DOI {doi} returned no results") + if response.json()["response"]["results"]: + return response.json() + else: + raise ValueError(f"DOI {doi} returned no results") + except requests.exceptions.HTTPError as e: + if e.response.status_code == 403: + raise ValueError("OpenAire API token is invalid or expired. Please update the token and try again.") from e + else: + raise def get_metadata_from_openalex(self, doi: str) -> Dict: """Gets metadata from OpenAlex""" diff --git a/tests/test_metadata.py b/tests/test_metadata.py index b5131bf..a8847d3 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -7,6 +7,7 @@ import pytest from requests_cache import CachedSession +from requests.exceptions import HTTPError from research_index_backend.get_metadata import MetadataFetcher from research_index_backend.create_graph_from_doi import score_name_similarity @@ -19,15 +20,26 @@ def session(): def fetcher(session): return MetadataFetcher(session) -class TestMetadataFetcher: - @pytest.mark.skip(reason="Requires access to OpenAire Graph API") - def test_broken_doi(self, fetcher): - """An incorrect DOI should raise an error""" - broken_doi = "10.1dd016/j.envsoft.2021" - with pytest.raises(ValueError) as ex: - fetcher.get_output_metadata(broken_doi) - expected = "DOI 10.1dd016/j.envsoft.2021 returned no results" - assert str(ex.value) == expected +def dummy_get_403(url, headers): + class DummyResponse: + status_code = 403 + def json(self): + return {} + def raise_for_status(self): + http_err = HTTPError("403 Client Error: Forbidden for url: " + url) + http_err.response = self + raise http_err + return DummyResponse() + +class TestMetadataFetcher403: + def test_api_403_response(self, session, monkeypatch): + monkeypatch.setattr(session, "get", dummy_get_403) + fetcher = MetadataFetcher(session=session) + doi = "10.1016/j.apenergy.2023.121219" + with pytest.raises(ValueError) as e: + fetcher.get_output_metadata(doi) + expected = "OpenAire API token is invalid or expired. 
Please update the token and try again." + assert str(e.value) == expected class TestNameScoring: def test_score_names_same(self): From ff8ded7b6f80a40fb1801f3883107cd0a1e5b008 Mon Sep 17 00:00:00 2001 From: Francis Tembo Date: Thu, 6 Feb 2025 08:19:59 +0100 Subject: [PATCH 29/33] metadata update default should be False --- src/research_index_backend/doi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py index d9e0740..e0a5ecd 100644 --- a/src/research_index_backend/doi.py +++ b/src/research_index_backend/doi.py @@ -88,7 +88,7 @@ class DOIManager: """ def __init__( - self, list_of_dois: List[str], limit: int, update_metadata: bool = True + self, list_of_dois: List[str], limit: int, update_metadata: bool = False ) -> None: self._validate_inputs(list_of_dois, limit, update_metadata) From bed31e1fa8f687534f2de3a4a6eedf21dddc1a1d Mon Sep 17 00:00:00 2001 From: Francis Tembo Date: Thu, 6 Feb 2025 15:35:44 +0100 Subject: [PATCH 30/33] printout invalid pattern and metadata failure dois --- src/research_index_backend/create_graph_from_doi.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py index 4275c17..3b8accc 100644 --- a/src/research_index_backend/create_graph_from_doi.py +++ b/src/research_index_backend/create_graph_from_doi.py @@ -356,6 +356,14 @@ def entry_point(db: Driver) -> None: for key, value in metrics.items(): print(f"{key.ljust(max_key_length)} | {value}") - + print("\nProcessing Results:") + print(f"• Failed metadata DOIs ({metrics['metadata_failure']}):") + for doi in processed_dois['metadata_failure']: + print(f" - {doi}") + + print(f"\n• Invalid pattern DOIs ({metrics['invalid_pattern_dois']}):") + for doi in processed_dois['invalid_pattern_dois']: + print(f" - {doi}") + if __name__ == "__main__": entry_point() From a0eda747b3c6adc52e10885d2f7886833f3d426b Mon Sep 17 00:00:00 2001 From: Francis Tembo Date: Thu, 6 Feb 2025 16:20:21 +0100 Subject: [PATCH 31/33] reporting duplicated submissions --- src/research_index_backend/create_graph_from_doi.py | 6 +++++- src/research_index_backend/doi.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py index 3b8accc..4bcd29c 100644 --- a/src/research_index_backend/create_graph_from_doi.py +++ b/src/research_index_backend/create_graph_from_doi.py @@ -357,7 +357,7 @@ def entry_point(db: Driver) -> None: print(f"{key.ljust(max_key_length)} | {value}") print("\nProcessing Results:") - print(f"• Failed metadata DOIs ({metrics['metadata_failure']}):") + print(f"\n• Failed metadata DOIs ({metrics['metadata_failure']}):") for doi in processed_dois['metadata_failure']: print(f" - {doi}") @@ -365,5 +365,9 @@ def entry_point(db: Driver) -> None: for doi in processed_dois['invalid_pattern_dois']: print(f" - {doi}") + print(f"\n• Duplicated Submissions ({metrics['duplicated_submissions']}):") + for doi in processed_dois['duplicated_submissions']: + print(f" - {doi}") + if __name__ == "__main__": entry_point() diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py index e0a5ecd..dd4375a 100644 --- a/src/research_index_backend/doi.py +++ b/src/research_index_backend/doi.py @@ -8,6 +8,7 @@ """ import time +from collections import Counter from logging import getLogger from re import 
IGNORECASE, compile from typing import Dict, List @@ -201,6 +202,8 @@ def ingestion_metrics(self) -> Dict[str, int]: processed_dois = ( self.valid_pattern_dois if self.update_metadata else self.new_dois ) + + duplicated_submissions = [doi for doi, count in Counter(self.list_of_dois).items() if count > 1] metadata_pass = [ doi @@ -240,6 +243,7 @@ def ingestion_metrics(self) -> Dict[str, int]: ] metrics = { "submitted_dois": len(self.list_of_dois), + "duplicated_submissions": len(duplicated_submissions), "processed_dois": len(processed_dois), "new_dois": self.num_new_dois, "existing_dois": self.num_existing_dois, @@ -256,6 +260,7 @@ def ingestion_metrics(self) -> Dict[str, int]: doi_states = { "submitted_dois": self.list_of_dois, + "duplicated_submissions": duplicated_submissions, "processed_dois": processed_dois, "new_dois": self.new_dois, "existing_dois": self.existing_dois, From 92e791927ebca29ba0fb70c7d4716a35e56428be Mon Sep 17 00:00:00 2001 From: Will Usher Date: Thu, 6 Feb 2025 17:46:55 +0100 Subject: [PATCH 32/33] Raise error when refresh token is garbled or missing --- src/research_index_backend/config.py | 36 ++++++++++++++++------ src/research_index_backend/get_metadata.py | 20 ++++++++---- tests/test_metadata.py | 27 ++++++++-------- 3 files changed, 52 insertions(+), 31 deletions(-) diff --git a/src/research_index_backend/config.py b/src/research_index_backend/config.py index 266f51b..3dab3f1 100644 --- a/src/research_index_backend/config.py +++ b/src/research_index_backend/config.py @@ -70,17 +70,33 @@ def _get_personal_token(self) -> str: if refresh_token := os.getenv("REFRESH_TOKEN"): logger.info("Found refresh token. Obtaining personal token.") query = f"?refreshToken={refresh_token}" - response = requests.get(self.openaire_token_endpoint + query) - logger.info(f"Status code: {response.status_code}") try: - response_json = response.json() - logger.debug(response_json) - return response_json["access_token"] - except requests.JSONDecodeError as e: - logger.error(f"Error decoding JSON response: {e}") - raise ValueError( - "Failed to obtain personal token due to JSON decode error" - ) + response = requests.get(self.openaire_token_endpoint + query) + logger.info(f"Status code: {response.status_code}") + response.raise_for_status() + except requests.exceptions.HTTPError: + if 400 <= response.status_code < 500: + raise ValueError( + "OpenAire refresh token is invalid or expired. Please update token and try again." 
+ ) + elif 500 <= response.status_code < 600: + raise + else: + raise + else: + try: + response_json = response.json() + logger.debug(response_json) + return response_json["access_token"] + except requests.JSONDecodeError as e: + logger.error(f"Error decoding JSON response: {e}") + raise ValueError( + "Failed to obtain personal token due to JSON decode error" + ) + except Exception as e: + msg = str(e) + logger.error(f"{msg}") + raise else: raise ValueError( "No refresh token found, could not obtain personal token" diff --git a/src/research_index_backend/get_metadata.py b/src/research_index_backend/get_metadata.py index 639052d..75316fd 100644 --- a/src/research_index_backend/get_metadata.py +++ b/src/research_index_backend/get_metadata.py @@ -8,8 +8,14 @@ from .config import config + class MetadataFetcher: - def __init__(self, session: requests_cache.CachedSession, token: str = None, save_json: bool = False): + def __init__( + self, + session: requests_cache.CachedSession, + token: str = "", + save_json: bool = False, + ): self.session = session self.token = token or config.token self.save_json = save_json @@ -25,7 +31,7 @@ def _save_json_response(self, response, directory: str, doi: str) -> None: """Helper method to save JSON responses""" clean_doi = doi.replace("/", "") makedirs(directory, exist_ok=True) - + with open(f"{directory}/{clean_doi}.json", "w") as json_file: try: dump(response.json(), json_file) @@ -55,7 +61,9 @@ def get_metadata_from_openaire(self, doi: str) -> Dict: raise ValueError(f"DOI {doi} returned no results") except requests.exceptions.HTTPError as e: if e.response.status_code == 403: - raise ValueError("OpenAire API token is invalid or expired. Please update the token and try again.") from e + raise ValueError( + "OpenAire API token is invalid or expired. Please update the token and try again." 
+ ) from e else: raise @@ -64,9 +72,9 @@ def get_metadata_from_openalex(self, doi: str) -> Dict: self.logger.info(f"Requesting {doi} from OpenAlex") query = f"doi:{doi}?mailto=wusher@kth.se" api_url = "https://api.openalex.org/works/" - + response = self.session.get(api_url + query) - + try: response.raise_for_status() if self.save_json: @@ -86,4 +94,4 @@ def get_output_metadata(self, doi: str, source: str = "OpenAire") -> Dict: elif source == "OpenAlex": return self.get_metadata_from_openalex(doi) else: - raise ValueError("Incorrect argument for output metadata source") \ No newline at end of file + raise ValueError("Incorrect argument for output metadata source") diff --git a/tests/test_metadata.py b/tests/test_metadata.py index a8847d3..d0c339d 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -6,41 +6,47 @@ """ import pytest -from requests_cache import CachedSession from requests.exceptions import HTTPError +from requests_cache import CachedSession -from research_index_backend.get_metadata import MetadataFetcher from research_index_backend.create_graph_from_doi import score_name_similarity +from research_index_backend.get_metadata import MetadataFetcher + @pytest.fixture def session(): return CachedSession() + @pytest.fixture def fetcher(session): return MetadataFetcher(session) + def dummy_get_403(url, headers): class DummyResponse: status_code = 403 + def json(self): return {} + def raise_for_status(self): http_err = HTTPError("403 Client Error: Forbidden for url: " + url) http_err.response = self raise http_err + return DummyResponse() + class TestMetadataFetcher403: def test_api_403_response(self, session, monkeypatch): monkeypatch.setattr(session, "get", dummy_get_403) - fetcher = MetadataFetcher(session=session) - doi = "10.1016/j.apenergy.2023.121219" with pytest.raises(ValueError) as e: - fetcher.get_output_metadata(doi) - expected = "OpenAire API token is invalid or expired. Please update the token and try again." + MetadataFetcher(session=session) + expected = "OpenAire refresh token is invalid or expired. Please update token and try again." 
assert str(e.value) == expected + class TestNameScoring: def test_score_names_same(self): @@ -48,61 +54,52 @@ def test_score_names_same(self): name2 = "Will Usher" assert score_name_similarity(name1, name2) == 1.0 - def test_score_names_different(self): name1 = "Will Usher" name2 = "1298139487(*&^)" assert score_name_similarity(name1, name2) == 0.0 - def test_score_names_truncated(self): name1 = "Vignesh Sridha" name2 = "Vignesh Sridharan" assert score_name_similarity(name1, name2) > 0.8 - def test_score_names_reversed(self): name1 = "Sridharan Vignesh" name2 = "Vignesh Sridharan" assert score_name_similarity(name1, name2) == 1.0 - def test_score_names_ignore_case(self): name1 = "Sridharan Vignesh" name2 = "VIGNESH Sridharan" assert score_name_similarity(name1, name2) == 1.0 - def test_score_names_similar_but_different(self): name1 = "James Sridharan" name2 = "Vignesh Sridharan" assert score_name_similarity(name1, name2) == 0.65625 - def test_score_names_similar_fernandos_1(self): name1 = "Fernando Antonio Plazas" name2 = "Fernando Plazas-Nino" assert score_name_similarity(name1, name2) < 0.8 - def test_score_names_similar_fernandos_2(self): name1 = "Fernando Plazas-Niño" name2 = "Fernando Antonio Plazas-Niño" assert score_name_similarity(name1, name2) > 0.8 - def test_score_names_similar_fernandos_3(self): name1 = "Fernando Plazas-Niño" name2 = "Fernando Plazas-Nino" assert score_name_similarity(name1, name2) > 0.8 - def test_score_names_similar_fernandos_4(self): name1 = "Fernando ANtonio Plazas" name2 = "Fernando Antonio Plazas Nino" From 1e812710ae0eb83d6f1050e4edd98b912c5eea7d Mon Sep 17 00:00:00 2001 From: Will Usher Date: Fri, 7 Feb 2025 08:43:29 +0100 Subject: [PATCH 33/33] Fix test --- src/research_index_backend/get_metadata.py | 5 ++--- tests/test_metadata.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/research_index_backend/get_metadata.py b/src/research_index_backend/get_metadata.py index 75316fd..24cc9c6 100644 --- a/src/research_index_backend/get_metadata.py +++ b/src/research_index_backend/get_metadata.py @@ -17,7 +17,6 @@ def __init__( save_json: bool = False, ): self.session = session - self.token = token or config.token self.save_json = save_json self.logger = getLogger(__name__) basicConfig( @@ -41,7 +40,7 @@ def _save_json_response(self, response, directory: str, doi: str) -> None: def get_metadata_from_openaire(self, doi: str) -> Dict: """Gets metadata from OpenAire""" query = f"?format=json&doi={doi}" - headers = {"Authorization": f"Bearer {self.token}"} + headers = {"Authorization": f"Bearer {config.token}"} api_url = f"{config.openaire_api}/search/researchProducts" try: @@ -62,7 +61,7 @@ def get_metadata_from_openaire(self, doi: str) -> Dict: except requests.exceptions.HTTPError as e: if e.response.status_code == 403: raise ValueError( - "OpenAire API token is invalid or expired. Please update the token and try again." + "OpenAire refresh token is invalid or expired. Please update token and try again." ) from e else: raise diff --git a/tests/test_metadata.py b/tests/test_metadata.py index d0c339d..13eae86 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -42,7 +42,7 @@ class TestMetadataFetcher403: def test_api_403_response(self, session, monkeypatch): monkeypatch.setattr(session, "get", dummy_get_403) with pytest.raises(ValueError) as e: - MetadataFetcher(session=session) + MetadataFetcher(session=session).get_metadata_from_openaire("doi") expected = "OpenAire refresh token is invalid or expired. 
Please update token and try again." assert str(e.value) == expected
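
Taken together, the series leaves the backend with a small, composable ingestion API: DOIManager validates and tracks DOIs (PATCH 17, 24, 26, 31), MetadataFetcher wraps the OpenAire and OpenAlex requests (PATCH 27, 28, 32, 33), and the entry point prints a tabular report (PATCH 16). The sketch below is illustrative only and not part of any patch; it assumes the post-series state of the code, a reachable Memgraph instance with database credentials in the environment, and a valid REFRESH_TOKEN for OpenAire, since both the existence check and the metadata request hit live services.

```python
import requests_cache

from research_index_backend.doi import DOIManager
from research_index_backend.get_metadata import MetadataFetcher

# The second entry normalises to the same DOI as the first, so it is
# reported under duplicated_submissions (PATCH 31).
dois = ["10.5281/zenodo.8140241", "https://doi.org/10.5281/zenodo.8140241"]

# update_metadata defaults to False as of PATCH 29.
manager = DOIManager(dois, limit=2)
manager.start_ingestion()
manager.validate_dois()  # pattern check, then Memgraph existence lookup

# save_json=True would mirror the --write-metadata flag from PATCH 27.
session = requests_cache.CachedSession("doi_cache", expire_after=30)
fetcher = MetadataFetcher(session, save_json=False)
for doi in manager.valid_pattern_dois:
    try:
        metadata = fetcher.get_output_metadata(doi, source="OpenAire")
    except ValueError as err:
        # Covers empty result sets and the token errors from PATCH 28/32.
        print(f"{doi}: {err}")
manager.end_ingestion()

# Reproduce the tabular report introduced in PATCH 16.
metrics, doi_states = manager.ingestion_metrics()
width = max(len(key) for key in metrics)
print(f"{'Metric'.ljust(width)} | Value")
print("-" * (width + 9))
for key, value in metrics.items():
    print(f"{key.ljust(width)} | {value}")
```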