Skip to content

Commit

Permalink
fix: make get_description more resilient to missing elements
Browse files Browse the repository at this point in the history
Modify `workbook.get_description` to gracefully handle missing optional
elements (abstract, keywordSet), preventing unnecessary failures.
  • Loading branch information
clnsmth committed Aug 31, 2024
1 parent 281c098 commit 3640da3
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 24 deletions.
14 changes: 10 additions & 4 deletions src/spinneret/workbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,17 @@ def get_description(element: etree._Element) -> str:
if element.tag in "dataset":
# Add abstract and keywords, they are descriptive of the entire dataset
abstract = element.xpath("./abstract")
abstract = etree.tostring(abstract[0], encoding="utf-8", method="text")
abstract = abstract.decode("utf-8").strip()
if len(abstract) != 0: # abstract is optional
abstract = etree.tostring(abstract[0], encoding="utf-8", method="text")
abstract = abstract.decode("utf-8").strip()
else:
abstract = ""
keywords = element.xpath(".//keyword")
keywords = [k.text for k in keywords]
description = abstract + " " + " ".join(keywords)
if len(keywords) != 0: # keywords are optional
keywords = [k.text for k in keywords]
else:
keywords = ""
description = abstract + " ".join(keywords)
elif element.tag in entities:
description = element.findtext(".//entityName")
elif element.tag in "attribute":
Expand Down
40 changes: 20 additions & 20 deletions tests/edi.3.9_annotation_workbook.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
package_id url element element_id element_xpath context description subject predicate predicate_id object object_id author date comment
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataset ccbc2d88-fff7-471c-a123-4f30f533b707 /eml:eml/dataset edi.3.9 "The Santa Barbara Channel Marine Biodiversity Observation Network
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataset 87feede4-13bc-412c-9d81-40545eb54e22 /eml:eml/dataset edi.3.9 "The Santa Barbara Channel Marine Biodiversity Observation Network
(SBCMBON) tracks long-term patterns in species abundance and
diversity. This dataset contains cover of kelp forest sessile
invertebrates, understory macroalgae, and substrate types by
Expand Down Expand Up @@ -76,22 +76,22 @@ edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 da
Kevin Lafferty [email protected]


Mike Kenner mkenner@ucsc.edu Population Abundance BasisofRecord: HumanObservation Occurrence: OrganismQuantity Taxon: ScientificName algae invertebrate random point contact Santa Barbara Channel Marine BON uniform point contact" dataset
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataTable 74397dfa-151e-4eb8-818b-c9d63ead8272 /eml:eml/dataset/dataTable dataset SBCMBON kelp forest integrated benthic cover biological survey SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 6a2d4c50-e984-4dc0-91b5-4dd5a5ab989f /eml:eml/dataset/dataTable/attributeList/attribute[1] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Source project for this data data_source
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 874b9da5-7494-44ef-ba52-149220062520 /eml:eml/dataset/dataTable/attributeList/attribute[2] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Sampling method sample_method
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d2e4f6af-b556-4a0a-97e8-67a551590a92 /eml:eml/dataset/dataTable/attributeList/attribute[3] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Date of survey date
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute f5304e0a-caa5-4f1d-b686-039722d3cda3 /eml:eml/dataset/dataTable/attributeList/attribute[4] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv ID of a site, assigned by each project site_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute b534c98f-3d20-46ba-8d2c-4c188418d48f /eml:eml/dataset/dataTable/attributeList/attribute[5] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the subsite,one level below site subsite_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 28d7b009-3d4b-4535-943b-647cfdaae22d /eml:eml/dataset/dataTable/attributeList/attribute[6] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the transect transect_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 5daece56-6088-44c1-8434-ec8dba727b80 /eml:eml/dataset/dataTable/attributeList/attribute[7] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the replicate replicate_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 790a95e1-22f2-4dac-8b36-7e0114f26bc4 /eml:eml/dataset/dataTable/attributeList/attribute[8] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Code assigned by SBC MBON for this taxon from this data source (project) proj_taxon_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 1370ae52-41da-49d1-92dd-d5077500cfd9 /eml:eml/dataset/dataTable/attributeList/attribute[9] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Number of total points counted on a UPC or RPC survey points
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d9ac0022-c157-416d-9f93-509f629273a1 /eml:eml/dataset/dataTable/attributeList/attribute[10] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Number of organisms counted count
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 7de66f26-b649-459f-ad93-122d7eda3b1b /eml:eml/dataset/dataTable/attributeList/attribute[11] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Taxon code assigned by an authoritative source auth_taxon_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute f8eb86e2-f02a-4ff9-bd9a-c0b810c312db /eml:eml/dataset/dataTable/attributeList/attribute[12] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Name of the athority or registry assigning the Authoritative Taxon Code auth_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 686a51da-c990-45fe-a8bb-94fc5e52d41b /eml:eml/dataset/dataTable/attributeList/attribute[13] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Taxon name, usually species binomial or other taxon name taxon_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 65cbc590-943a-4656-824b-e449b28cca01 /eml:eml/dataset/dataTable/attributeList/attribute[14] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv The site, as named by each project site_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 93fa3786-fde3-453e-8dd8-85f892ce6191 /eml:eml/dataset/dataTable/attributeList/attribute[15] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Survey region within a site subsite_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 3bc6f4fc-40c1-47c5-96c7-5a6b2b6d649b /eml:eml/dataset/dataTable/attributeList/attribute[16] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Site latitude latitude
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 18301dfd-766b-43d8-ae27-31ae32cd4828 /eml:eml/dataset/dataTable/attributeList/attribute[17] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Site longitude longitude
Mike Kenner mkenner@ucsc.eduPopulation Abundance BasisofRecord: HumanObservation Occurrence: OrganismQuantity Taxon: ScientificName algae invertebrate random point contact Santa Barbara Channel Marine BON uniform point contact" dataset
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataTable 8b1d8a06-a062-485f-95aa-e0e08a0dcaf2 /eml:eml/dataset/dataTable dataset SBCMBON kelp forest integrated benthic cover biological survey SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d81ab758-0831-411a-984f-a76d6d6f5582 /eml:eml/dataset/dataTable/attributeList/attribute[1] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Source project for this data data_source
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 66c97223-4317-4c91-8177-bbd83ed0f847 /eml:eml/dataset/dataTable/attributeList/attribute[2] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Sampling method sample_method
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 42df9151-9914-486b-9f4f-694e84aeaf72 /eml:eml/dataset/dataTable/attributeList/attribute[3] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Date of survey date
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 552f7665-87ab-4d16-857d-00325b25d39e /eml:eml/dataset/dataTable/attributeList/attribute[4] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv ID of a site, assigned by each project site_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 38abf50a-1f6e-494f-9537-c3f3b60f5269 /eml:eml/dataset/dataTable/attributeList/attribute[5] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the subsite,one level below site subsite_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute a9b7494f-4185-4779-8543-a55917fa6841 /eml:eml/dataset/dataTable/attributeList/attribute[6] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the transect transect_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 9c4664f8-e21c-4208-a6e2-2eb44deb594e /eml:eml/dataset/dataTable/attributeList/attribute[7] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the replicate replicate_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 69fcc003-992a-4c37-9132-44608515bda7 /eml:eml/dataset/dataTable/attributeList/attribute[8] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Code assigned by SBC MBON for this taxon from this data source (project) proj_taxon_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 1dabc36c-12be-4ac1-9ccd-7eb72d1643a0 /eml:eml/dataset/dataTable/attributeList/attribute[9] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Number of total points counted on a UPC or RPC survey points
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute b6561c2f-b615-4bea-b81f-d1bb35f02601 /eml:eml/dataset/dataTable/attributeList/attribute[10] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Number of organisms counted count
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute db6d969e-bb57-4cd5-a94d-e6c45234741b /eml:eml/dataset/dataTable/attributeList/attribute[11] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Taxon code assigned by an authoritative source auth_taxon_id
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 2fde667d-f9a1-499c-b4a8-425e9cda9663 /eml:eml/dataset/dataTable/attributeList/attribute[12] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Name of the athority or registry assigning the Authoritative Taxon Code auth_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 2a204613-bb44-4229-90a5-e027739a3c9f /eml:eml/dataset/dataTable/attributeList/attribute[13] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Taxon name, usually species binomial or other taxon name taxon_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 856bb74d-6331-43cd-908b-2e12666b9a11 /eml:eml/dataset/dataTable/attributeList/attribute[14] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv The site, as named by each project site_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 51b73b52-a8b0-4a54-8f1f-306e67e9e8e3 /eml:eml/dataset/dataTable/attributeList/attribute[15] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Survey region within a site subsite_name
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute c505ba1b-531e-401c-a19e-84cdfa76a4f3 /eml:eml/dataset/dataTable/attributeList/attribute[16] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Site latitude latitude
edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 9c3341b5-2dd8-4fb6-863e-dce86c05128b /eml:eml/dataset/dataTable/attributeList/attribute[17] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Site longitude longitude
19 changes: 19 additions & 0 deletions tests/test_workbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,22 @@ def test_get_description():
description = get_description(element)
assert isinstance(description, str)
assert len(description) > 0


def test_get_description_handles_missing_element():
    """Test that get_description returns an empty string for a dataset
    element whose optional abstract and keywordSet elements are missing."""

    # Read test file
    eml_file = datasets.get_example_eml_dir() + "/" + "edi.3.9.xml"
    eml = etree.parse(eml_file)
    element = eml.xpath(".//dataset")[0]

    # Remove abstract and keywordSet elements from dataset. Each node is
    # removed through its own parent: ".//keywordSet" is a descendant search,
    # and lxml's remove() raises ValueError when the node passed in is not a
    # direct child of the element it is called on.
    for abstract in element.findall("abstract"):
        element.remove(abstract)
    for keyword_set in element.findall(".//keywordSet"):
        keyword_set.getparent().remove(keyword_set)

    # Confirm the precondition, so a failure below points at get_description
    # rather than at an incompletely stripped fixture.
    assert element.find("abstract") is None
    assert not element.findall(".//keyword")

    # Test element with missing abstract and keywords
    description = get_description(element)
    assert description == ""

0 comments on commit 3640da3

Please sign in to comment.