From c3844779f9b7833f5b5a503cf67854e628c91810 Mon Sep 17 00:00:00 2001
From: Colin Smith
Date: Wed, 28 Aug 2024 16:05:56 -0700
Subject: [PATCH] feat: implement automatic workbook annotation

Add functionality to automatically annotate workbooks using an annotator.

Note: Annotation quality may be limited. Consider these as recommendations
rather than definitive annotations.
---
 src/spinneret/annotator.py | 109 +++++++++++++++++++++++++++++++++++++
 tests/test_annotator.py    |  50 ++++++++++++++++-
 2 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/src/spinneret/annotator.py b/src/spinneret/annotator.py
index 7bba879..9dc1de5 100644
--- a/src/spinneret/annotator.py
+++ b/src/spinneret/annotator.py
@@ -1,7 +1,9 @@
 """The annotator module"""
 
+import os
 from typing import Union
 from requests import get, exceptions
+import pandas as pd
 
 
 # pylint: disable=too-many-locals
@@ -103,3 +105,110 @@ def get_bioportal_annotation(
         label = r.json().get("prefLabel", None)
         annotations.append({"label": label, "uri": uri})
     return annotations
+
+
+# Standard predicate and BioPortal ontology for each annotatable element name
+_ELEMENT_ANNOTATION_SETTINGS = {
+    "dataset": {
+        "predicate": "is about",
+        "predicate_id": "http://purl.obolibrary.org/obo/IAO_0000136",
+        "ontologies": "ENVO",  # ENVO provides environmental terms
+    },
+    "attribute": {
+        "predicate": "contains measurements of type",
+        "predicate_id": (
+            "http://ecoinformatics.org/oboe/oboe.1.2/oboe-core.owl"
+            "#containsMeasurementsOfType"
+        ),
+        "ontologies": "ECSO",  # ECSO provides measurement terms
+    },
+}
+
+# Workbook columns written by the annotator
+_ANNOTATION_COLUMNS = (
+    "predicate",
+    "predicate_id",
+    "object",
+    "object_id",
+    "author",
+    "date",
+)
+
+
+def annotate_workbook(workbook_path: str, output_path: str) -> None:
+    """Annotate a workbook with automated annotation
+
+    :param workbook_path: The path to the tab-separated workbook to be
+        annotated.
+    :param output_path: The path to write the annotated workbook.
+    :returns: None
+    :raises KeyError: If a row needs annotating and the
+        `BIOPORTAL_API_KEY` environment variable is not set.
+    :notes: Rows with a `dataset` element are annotated with ENVO terms and
+        rows with an `attribute` element with ECSO terms, both matched on
+        the row description. Other rows are left unchanged. The first
+        annotation found for a row is written to that row and each further
+        annotation is written to a copy of the row appended to the end of
+        the workbook. The workbook at `workbook_path` is not modified unless
+        `output_path` is the same path.
+    """
+    print(f"Annotating workbook {workbook_path}")
+
+    # Load the workbook for processing
+    wb = pd.read_csv(workbook_path, sep="\t", encoding="utf-8")
+
+    # Columns that are empty in the file load as float64. Cast the columns to
+    # be annotated to object, because setting text into a float64 column is
+    # deprecated in pandas.
+    wb = wb.astype(
+        {col: "object" for col in _ANNOTATION_COLUMNS if col in wb.columns}
+    )
+
+    # Iterate over workbook rows and annotate
+    additional_rows = []
+    for index, row in wb.iterrows():
+        settings = _ELEMENT_ANNOTATION_SETTINGS.get(row["element"])
+        if settings is None:
+            continue  # Not an element this annotator handles
+
+        # Add the standard predicate for the subject element name
+        wb.loc[index, "predicate"] = settings["predicate"]
+        wb.loc[index, "predicate_id"] = settings["predicate_id"]
+
+        # A missing description loads as NaN and offers no text to annotate
+        if pd.isna(row["description"]):
+            continue
+
+        # Get annotations for the element's descriptive text
+        annotation = get_bioportal_annotation(
+            text=row["description"],
+            api_key=os.environ["BIOPORTAL_API_KEY"],
+            ontologies=settings["ontologies"],
+            exclude_synonyms="true",
+        )
+        if not annotation:
+            continue
+
+        # Add the first annotation to the row, then the remainder to copies
+        # of the row that are appended to the workbook at the end.
+        first, *remainder = annotation
+        wb.loc[index, "object"] = first["label"]
+        wb.loc[index, "object_id"] = first["uri"]
+        wb.loc[index, "author"] = "BioPortal Annotator"
+        wb.loc[index, "date"] = pd.Timestamp.now()
+        for item in remainder:
+            # Copy the row so that edits never write through to the workbook
+            new_row = wb.loc[index].copy()
+            new_row["object"] = item["label"]
+            new_row["object_id"] = item["uri"]
+            new_row["author"] = "BioPortal Annotator"
+            new_row["date"] = pd.Timestamp.now()
+            additional_rows.append(new_row)
+
+    # Append additional rows to the workbook. Skipped when there are none,
+    # which avoids pandas' deprecated handling of empty frames in concat.
+    if additional_rows:
+        wb = pd.concat([wb, pd.DataFrame(additional_rows)], ignore_index=True)
+
+    # Write the annotated workbook to the output path
+    wb.to_csv(output_path, sep="\t", index=False, encoding="utf-8")
diff --git a/tests/test_annotator.py b/tests/test_annotator.py
index 42abbeb..69b3a47 100644
--- a/tests/test_annotator.py
+++ b/tests/test_annotator.py
@@ -1,8 +1,10 @@
 """Test annotator code"""
 
 import os
+from shutil import copyfile
 import pytest
-from spinneret.annotator import get_bioportal_annotation
+import pandas as pd
+from spinneret.annotator import get_bioportal_annotation, annotate_workbook
 from spinneret.utilities import load_configuration
 
 
@@ -36,3 +38,49 @@ def test_get_bioportal_annotation():
         assert isinstance(item["uri"], str)
         assert item["label"] != ""
         assert item["uri"] != ""
+
+
+# pylint: disable=duplicate-code
+def test_annotate_workbook(tmp_path):
+    """Test annotate_workbook"""
+    # Load the API key in the configuration file
+    if not os.path.exists("config.json"):
+        pytest.skip("Skipping test due to missing config.json file in package root.")
+    load_configuration("config.json")
+
+    # Copy the workbook to tmp_path for editing
+    wb_path = "tests/edi.3.9_annotation_workbook.tsv"
+    wb_path_copy = str(tmp_path / "edi.3.9_annotation_workbook.tsv")
+    copyfile(wb_path, wb_path_copy)
+    wb_path_annotated = str(tmp_path / "edi.3.9_annotation_workbook_annotated.tsv")
+
+    # Check features of the unannotated workbook
+    assert os.path.exists(wb_path_copy)
+    wb = pd.read_csv(wb_path_copy, sep="\t", encoding="utf-8")
+    # The columns to be annotated should be empty
+    cols_to_annotate = [
+        "predicate",
+        "predicate_id",
+        "object",
+        "object_id",
+        "author",
+        "date",
+    ]
+    for col in cols_to_annotate:
+        assert wb[col].isnull().all()
+
+    # Annotate the workbook copy
+    annotate_workbook(
+        workbook_path=wb_path_copy,
+        output_path=wb_path_annotated,
+    )
+
+    # Check the workbook was annotated
+    assert os.path.exists(wb_path_annotated)
+    wb_annotated = pd.read_csv(wb_path_annotated, sep="\t", encoding="utf-8")
+    # Rows are only ever added, and no columns are added or removed
+    assert len(wb_annotated) >= len(wb)
+    assert list(wb_annotated.columns) == list(wb.columns)
+    # The columns to be annotated should no longer be empty
+    for col in cols_to_annotate:
+        assert not wb_annotated[col].isnull().all()