Skip to content

Commit

Permalink
feat: add element descriptions to workbook
Browse files Browse the repository at this point in the history
Add the corresponding element description directly within the workbook
to streamline the annotation process and reduce potential errors. This
eliminates the need for manual navigation to the data package landing
page to verify element details.
  • Loading branch information
clnsmth authored Aug 27, 2024
1 parent b6e7c92 commit 9d6da08
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 20 deletions.
33 changes: 33 additions & 0 deletions src/spinneret/workbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def create(
str(uuid4()),
eml.getpath(e),
subcon["context"],
get_description(e),
subcon["subject"],
"", # predicate
"", # predicate id
Expand All @@ -84,6 +85,7 @@ def create(
"element_id",
"element_xpath",
"context",
"description",
"subject",
"predicate",
"predicate_id",
Expand Down Expand Up @@ -140,3 +142,34 @@ def get_subject_and_context(element: etree._Element) -> dict:
context = None
res = {"subject": subject, "context": context}
return res


def get_description(element: etree._Element) -> str:
    """Get the description of an element

    The description depends on the kind of element:

    * ``dataset``: the abstract text followed by the text of each keyword,
      joined by single spaces (both describe the entire dataset).
    * data entities (``dataTable``, ``otherEntity``, ``spatialVector``,
      ``spatialRaster``, ``storedProcedure``, ``view``): the text of the
      first ``entityName`` descendant.
    * ``attribute``: the text of the first ``attributeDefinition``
      descendant.

    :param element: The EML element to be annotated.
    :returns: The description of the element, or ``None`` when the element
        is not one of the kinds listed above or has no descriptive content.
    """
    entities = (
        "dataTable",
        "otherEntity",
        "spatialVector",
        "spatialRaster",
        "storedProcedure",
        "view",
    )
    tag = element.tag
    # Tags are compared by equality. A substring test such as
    # ``tag in "dataset"`` would also accept unrelated tags like "data" or
    # "set" and send them down the wrong branch.
    if tag == "dataset":
        # Add abstract and keywords, they are descriptive of the entire dataset
        parts = []
        abstract = element.xpath("./abstract")
        if abstract:  # guard: indexing an empty result would raise IndexError
            text = etree.tostring(abstract[0], encoding="utf-8", method="text")
            text = text.decode("utf-8").strip()
            if text:
                parts.append(text)
        # Keywords without text have ``.text`` set to None, which str.join
        # rejects, so they are skipped.
        parts.extend(k.text for k in element.xpath(".//keyword") if k.text)
        description = " ".join(parts) or None
    elif tag in entities:
        description = element.findtext(".//entityName")
    elif tag == "attribute":
        description = element.findtext(".//attributeDefinition")
    else:
        description = None
    return description
117 changes: 97 additions & 20 deletions tests/edi.3.9_annotation_workbook.tsv
Original file line number Diff line number Diff line change
@@ -1,20 +1,97 @@
package_id url element element_id element_xpath context subject predicate predicate_id object object_id author date comment
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataset b762aade-ffeb-4195-bdb3-7bedfc238f4f /eml:eml/dataset edi.3.9 dataset
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataTable 0af1b0c1-42d3-4a95-acf9-739dde6dec1f /eml:eml/dataset/dataTable dataset SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 10139c2c-cd82-48c1-9e08-62d2d5b1bb11 /eml:eml/dataset/dataTable/attributeList/attribute[1] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv data_source
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute c1722e3c-a7e0-4d57-bb18-f1d8ddb6f9a8 /eml:eml/dataset/dataTable/attributeList/attribute[2] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv sample_method
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 0bca748f-5a11-40e0-9bbc-b06926ceb67f /eml:eml/dataset/dataTable/attributeList/attribute[3] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv date
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute be405a5b-a3f2-4962-9aaa-e8fceff32d69 /eml:eml/dataset/dataTable/attributeList/attribute[4] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv site_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute fb9a9e58-beee-43da-8fe6-15e530841691 /eml:eml/dataset/dataTable/attributeList/attribute[5] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv subsite_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute c70c3c84-5bc7-498a-a7ff-cd9e8476434f /eml:eml/dataset/dataTable/attributeList/attribute[6] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv transect_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute e30f49bb-0d67-494b-9853-610cf837f27d /eml:eml/dataset/dataTable/attributeList/attribute[7] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv replicate_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 645692d8-e253-4771-87be-ee3a4264486d /eml:eml/dataset/dataTable/attributeList/attribute[8] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv proj_taxon_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 5ed9e722-b4e7-4283-9462-e128e9aaf6e1 /eml:eml/dataset/dataTable/attributeList/attribute[9] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv points
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d2902fdf-74e8-454a-9b8c-0666d426aacd /eml:eml/dataset/dataTable/attributeList/attribute[10] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv count
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute f8c106fb-fcba-415f-b985-363bf952bf5f /eml:eml/dataset/dataTable/attributeList/attribute[11] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv auth_taxon_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 8303d233-73d1-495d-974c-33de9c706abf /eml:eml/dataset/dataTable/attributeList/attribute[12] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv auth_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute b01bf32e-c67b-4eb1-bf8b-6212ee68b31d /eml:eml/dataset/dataTable/attributeList/attribute[13] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv taxon_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute c68a2d03-5685-4336-bc45-a552114119d3 /eml:eml/dataset/dataTable/attributeList/attribute[14] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv site_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 0a89d25e-47a7-4931-8074-b510a1bb1864 /eml:eml/dataset/dataTable/attributeList/attribute[15] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv subsite_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d68e1ef4-8553-4a94-88de-146988ae76ef /eml:eml/dataset/dataTable/attributeList/attribute[16] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv latitude
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 8a570431-09e8-49e5-8e35-1dfa9c8cd6cb /eml:eml/dataset/dataTable/attributeList/attribute[17] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv longitude
package_id url element element_id element_xpath context description subject predicate predicate_id object object_id author date comment
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataset ccbc2d88-fff7-471c-a123-4f30f533b707 /eml:eml/dataset edi.3.9 "The Santa Barbara Channel Marine Biodiversity Observation Network
(SBCMBON) tracks long-term patterns in species abundance and
diversity. This dataset contains cover of kelp forest sessile
invertebrates, understory macroalgae, and substrate types by
integrating data from four contributing projects working in the kelp
forests of the Santa Barbara Channel, USA. Divers collect data on
using either uniform point contact (UPC) or random point contact (RPC)
methods.


The four contributing projects are two research projects: The Santa
Barbara Coastal LTER (SBC LTER) and the Partnership for
Interdisciplinary Studies of Coastal Oceans (PISCO), the kelp forest
monitoring program of the Santa Barbara Channel National Park, and the
San Nicolas Island monitoring program supported by USGS. Together,
these projects have recorded data for more than 200 species at
approximately 100 sites on both the mainland coast and on the Santa
Barbara Channel Islands. Sampling began in 1982 and is ongoing. Data
were collected by human observation (divers using SCUBA) during
regular surveys.


Percent cover is recorded for taxa where individuals cannot be
counted. Cover can be calculated from the data here as the fraction of
total points at which the taxon was present x 100. With UPC and RPC
methods, multiple species can be recorded at any given point. The
total percent cover of all species combined using this method can
exceed 100%; however, the percent cover of any single species cannot
exceed 100%. See Methods for information on integration and data
processing.


MBON is funded by National Aeronautics and Space Administration
(NASA), Bureau of Ocean Energy Management (BOEM), and National Oceanic
and Atmospheric Administration (NOAA).


For users who are interested in using all or part of this integrated
datasets, please contact data owners to discuss your research
interests, data-related issues or any other questions. A recommended
citation for the data package is available from the download page. In
addition, any manuscript generated using this dataset is expected to
be sent to the data owners before publication so we can be sure the
data is used in the proper context and methods are reported
accurately:


Santa Barbara Coastal LTER (LTER):


Dan Reed [email protected]


Robert Miller [email protected]


Partnership for Interdisciplinary Studies of Coastal Oceans (PISCO):


Jenn Caselle [email protected]


Kelp forest monitoring (KFM):


David Kushner [email protected]


Joshua Sprague [email protected]


San Nicolas Island monitoring (SNI):


Kevin Lafferty [email protected]


Mike Kenner [email protected] Population Abundance BasisofRecord: HumanObservation Occurrence: OrganismQuantity Taxon: ScientificName algae invertebrate random point contact Santa Barbara Channel Marine BON uniform point contact" dataset
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataTable 74397dfa-151e-4eb8-818b-c9d63ead8272 /eml:eml/dataset/dataTable dataset SBCMBON kelp forest integrated benthic cover biological survey SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 6a2d4c50-e984-4dc0-91b5-4dd5a5ab989f /eml:eml/dataset/dataTable/attributeList/attribute[1] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Source project for this data data_source
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 874b9da5-7494-44ef-ba52-149220062520 /eml:eml/dataset/dataTable/attributeList/attribute[2] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Sampling method sample_method
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d2e4f6af-b556-4a0a-97e8-67a551590a92 /eml:eml/dataset/dataTable/attributeList/attribute[3] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Date of survey date
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute f5304e0a-caa5-4f1d-b686-039722d3cda3 /eml:eml/dataset/dataTable/attributeList/attribute[4] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv ID of a site, assigned by each project site_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute b534c98f-3d20-46ba-8d2c-4c188418d48f /eml:eml/dataset/dataTable/attributeList/attribute[5] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the subsite,one level below site subsite_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 28d7b009-3d4b-4535-943b-647cfdaae22d /eml:eml/dataset/dataTable/attributeList/attribute[6] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the transect transect_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 5daece56-6088-44c1-8434-ec8dba727b80 /eml:eml/dataset/dataTable/attributeList/attribute[7] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the replicate replicate_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 790a95e1-22f2-4dac-8b36-7e0114f26bc4 /eml:eml/dataset/dataTable/attributeList/attribute[8] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Code assigned by SBC MBON for this taxon from this data source (project) proj_taxon_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 1370ae52-41da-49d1-92dd-d5077500cfd9 /eml:eml/dataset/dataTable/attributeList/attribute[9] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Number of total points counted on a UPC or RPC survey points
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d9ac0022-c157-416d-9f93-509f629273a1 /eml:eml/dataset/dataTable/attributeList/attribute[10] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Number of organisms counted count
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 7de66f26-b649-459f-ad93-122d7eda3b1b /eml:eml/dataset/dataTable/attributeList/attribute[11] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Taxon code assigned by an authoritative source auth_taxon_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute f8eb86e2-f02a-4ff9-bd9a-c0b810c312db /eml:eml/dataset/dataTable/attributeList/attribute[12] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Name of the athority or registry assigning the Authoritative Taxon Code auth_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 686a51da-c990-45fe-a8bb-94fc5e52d41b /eml:eml/dataset/dataTable/attributeList/attribute[13] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Taxon name, usually species binomial or other taxon name taxon_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 65cbc590-943a-4656-824b-e449b28cca01 /eml:eml/dataset/dataTable/attributeList/attribute[14] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv The site, as named by each project site_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 93fa3786-fde3-453e-8dd8-85f892ce6191 /eml:eml/dataset/dataTable/attributeList/attribute[15] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Survey region within a site subsite_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 3bc6f4fc-40c1-47c5-96c7-5a6b2b6d649b /eml:eml/dataset/dataTable/attributeList/attribute[16] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Site latitude latitude
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 18301dfd-766b-43d8-ae27-31ae32cd4828 /eml:eml/dataset/dataTable/attributeList/attribute[17] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Site longitude longitude
21 changes: 21 additions & 0 deletions tests/test_workbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import os
import tempfile
import pandas as pd
from lxml import etree
from spinneret import workbook
from spinneret import datasets
from spinneret.workbook import get_description


def test_create():
Expand Down Expand Up @@ -33,3 +35,22 @@ def test_create():
for c in cols:
if c != "element_id": # new UUIDs won't match the fixture
assert sorted(wb[c].unique()) == sorted(wbf[c].unique())


def test_get_description():
    """Check that get_description yields a non-empty string for a dataset,
    a data entity and an attribute"""
    # Parse the example EML document
    eml_path = datasets.get_example_eml_dir() + "/" + "edi.3.9.xml"
    tree = etree.parse(eml_path)

    # dataTable stands in for all data entity types
    for tag in ("dataset", "dataTable", "attribute"):
        # Only the first matching element of each kind is checked
        node = tree.xpath(".//" + tag)[0]
        result = get_description(node)
        assert isinstance(result, str)
        assert len(result) > 0

0 comments on commit 9d6da08

Please sign in to comment.