def get_description(element: etree._Element) -> str:
    """Get the description of an element

    The description depends on the element's tag:

    * ``dataset``: the text content of the ``abstract`` child followed by the
      text of every descendant ``keyword``, all separated by single spaces.
    * data entities (``dataTable``, ``otherEntity``, ``spatialVector``,
      ``spatialRaster``, ``storedProcedure``, ``view``): the text of the
      first descendant ``entityName``.
    * ``attribute``: the text of the first descendant
      ``attributeDefinition``.

    :param element: The EML element to be annotated.
    :returns: The description of the element, or ``None`` when the element
        is of an unsupported type or carries no descriptive content.
    """
    entities = (
        "dataTable",
        "otherEntity",
        "spatialVector",
        "spatialRaster",
        "storedProcedure",
        "view",
    )
    tag = element.tag

    # Compare tags by equality. ``tag in "dataset"`` is a substring test on
    # the string literal, so tags such as "data" or "set" would match, and a
    # non-string tag would raise TypeError.
    if tag == "dataset":
        # Abstract and keywords are descriptive of the entire dataset
        parts = []
        abstract = element.xpath("./abstract")
        if abstract:  # tolerate datasets without an abstract
            text = etree.tostring(abstract[0], encoding="utf-8", method="text")
            parts.append(text.decode("utf-8").strip())
        # Keywords without text have ``.text`` of None, which would break
        # the join below, so they are skipped.
        parts.extend(
            k.text for k in element.xpath(".//keyword") if k.text is not None
        )
        if not parts:
            return None
        return " ".join(parts)
    if tag in entities:
        return element.findtext(".//entityName")
    if tag == "attribute":
        return element.findtext(".//attributeDefinition")
    return None
b762aade-ffeb-4195-bdb3-7bedfc238f4f /eml:eml/dataset edi.3.9 dataset -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataTable 0af1b0c1-42d3-4a95-acf9-739dde6dec1f /eml:eml/dataset/dataTable dataset SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 10139c2c-cd82-48c1-9e08-62d2d5b1bb11 /eml:eml/dataset/dataTable/attributeList/attribute[1] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv data_source -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute c1722e3c-a7e0-4d57-bb18-f1d8ddb6f9a8 /eml:eml/dataset/dataTable/attributeList/attribute[2] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv sample_method -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 0bca748f-5a11-40e0-9bbc-b06926ceb67f /eml:eml/dataset/dataTable/attributeList/attribute[3] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv date -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute be405a5b-a3f2-4962-9aaa-e8fceff32d69 /eml:eml/dataset/dataTable/attributeList/attribute[4] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv site_id -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute fb9a9e58-beee-43da-8fe6-15e530841691 /eml:eml/dataset/dataTable/attributeList/attribute[5] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv subsite_id -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute c70c3c84-5bc7-498a-a7ff-cd9e8476434f /eml:eml/dataset/dataTable/attributeList/attribute[6] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv transect_id -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute e30f49bb-0d67-494b-9853-610cf837f27d /eml:eml/dataset/dataTable/attributeList/attribute[7] 
SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv replicate_id -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 645692d8-e253-4771-87be-ee3a4264486d /eml:eml/dataset/dataTable/attributeList/attribute[8] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv proj_taxon_id -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 5ed9e722-b4e7-4283-9462-e128e9aaf6e1 /eml:eml/dataset/dataTable/attributeList/attribute[9] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv points -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d2902fdf-74e8-454a-9b8c-0666d426aacd /eml:eml/dataset/dataTable/attributeList/attribute[10] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv count -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute f8c106fb-fcba-415f-b985-363bf952bf5f /eml:eml/dataset/dataTable/attributeList/attribute[11] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv auth_taxon_id -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 8303d233-73d1-495d-974c-33de9c706abf /eml:eml/dataset/dataTable/attributeList/attribute[12] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv auth_name -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute b01bf32e-c67b-4eb1-bf8b-6212ee68b31d /eml:eml/dataset/dataTable/attributeList/attribute[13] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv taxon_name -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute c68a2d03-5685-4336-bc45-a552114119d3 /eml:eml/dataset/dataTable/attributeList/attribute[14] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv site_name -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 0a89d25e-47a7-4931-8074-b510a1bb1864 /eml:eml/dataset/dataTable/attributeList/attribute[15] 
SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv subsite_name -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d68e1ef4-8553-4a94-88de-146988ae76ef /eml:eml/dataset/dataTable/attributeList/attribute[16] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv latitude -edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 8a570431-09e8-49e5-8e35-1dfa9c8cd6cb /eml:eml/dataset/dataTable/attributeList/attribute[17] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv longitude +package_id url element element_id element_xpath context description subject predicate predicate_id object object_id author date comment +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataset ccbc2d88-fff7-471c-a123-4f30f533b707 /eml:eml/dataset edi.3.9 "The Santa Barbara Channel Marine Biodiversity Observation Network + (SBCMBON) tracks long-term patterns in species abundance and + diversity. This dataset contains cover of kelp forest sessile + invertebrates, understory macroalgae, and substrate types by + integrating data from four contributing projects working in the kelp + forests of the Santa Barbara Channel, USA. Divers collect data on + using either uniform point contact (UPC) or random point contact (RPC) + methods. + + + The four contributing projects are two research projects: The Santa + Barbara Coastal LTER (SBC LTER) and the Partnership for + Interdisciplinary Studies of Coastal Oceans (PISCO), the kelp forest + monitoring program of the Santa Barbara Channel National Park, and the + San Nicolas Island monitoring program supported by USGS. Together, + these projects have recorded data for more than 200 species at + approximately 100 sites on both the mainland coast and on the Santa + Barbara Channel Islands. Sampling began in 1982 and is ongoing. Data + were collected by human observation (divers using SCUBA) during + regular surveys. 
+ + + Percent cover is recorded for taxa where individuals cannot be + counted. Cover can be calculated from the data here as the fraction of + total points at which the taxon was present x 100. With UPC and RPC + methods, multiple species can be recorded at any given point. The + total percent cover of all species combined using this method can + exceed 100%; however, the percent cover of any single species cannot + exceed 100%. See Methods for information on integration and data + processing. + + + MBON is funded by National Aeronautics and Space Administration + (NASA), Bureau of Ocean Energy Management (BOEM), and National Oceanic + and Atmospheric Administration (NOAA). + + + For users who are interested in using all or part of this integrated + datasets, please contact data owners to discuss your research + interests, data-related issues or any other questions. A recommended + citation for the data package is available from the download page. In + addition, any manuscript generated using this dataset is expected to + be sent to the data owners before publication so we can be sure the + data is used in the proper context and methods are reported + accurately: + + + Santa Barbara Coastal LTER (LTER): + + + Dan Reed dan.reed@lifesci.ucsb.edu + + + Robert Miller miller@msi.ucsb.edu + + + Partnership for Interdisciplinary Studies of Coastal Oceans (PISCO): + + + Jenn Caselle caselle@ucsb.edu + + + Kelp forest monitoring (KFM): + + + David Kushner david_kushner@nps.gov + + + Joshua Sprague joshua_sprague@nps.gov + + + San Nicolas Island monitoring (SNI): + + + Kevin Lafferty Klafferty@usgs.gov + + + Mike Kenner mkenner@ucsc.edu Population Abundance BasisofRecord: HumanObservation Occurrence: OrganismQuantity Taxon: ScientificName algae invertebrate random point contact Santa Barbara Channel Marine BON uniform point contact" dataset +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 dataTable 74397dfa-151e-4eb8-818b-c9d63ead8272 
/eml:eml/dataset/dataTable dataset SBCMBON kelp forest integrated benthic cover biological survey SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 6a2d4c50-e984-4dc0-91b5-4dd5a5ab989f /eml:eml/dataset/dataTable/attributeList/attribute[1] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Source project for this data data_source +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 874b9da5-7494-44ef-ba52-149220062520 /eml:eml/dataset/dataTable/attributeList/attribute[2] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Sampling method sample_method +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d2e4f6af-b556-4a0a-97e8-67a551590a92 /eml:eml/dataset/dataTable/attributeList/attribute[3] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Date of survey date +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute f5304e0a-caa5-4f1d-b686-039722d3cda3 /eml:eml/dataset/dataTable/attributeList/attribute[4] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv ID of a site, assigned by each project site_id +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute b534c98f-3d20-46ba-8d2c-4c188418d48f /eml:eml/dataset/dataTable/attributeList/attribute[5] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the subsite,one level below site subsite_id +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 28d7b009-3d4b-4535-943b-647cfdaae22d /eml:eml/dataset/dataTable/attributeList/attribute[6] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the transect transect_id +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 5daece56-6088-44c1-8434-ec8dba727b80 
/eml:eml/dataset/dataTable/attributeList/attribute[7] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Identifier for the replicate replicate_id +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 790a95e1-22f2-4dac-8b36-7e0114f26bc4 /eml:eml/dataset/dataTable/attributeList/attribute[8] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Code assigned by SBC MBON for this taxon from this data source (project) proj_taxon_id +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 1370ae52-41da-49d1-92dd-d5077500cfd9 /eml:eml/dataset/dataTable/attributeList/attribute[9] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Number of total points counted on a UPC or RPC survey points +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute d9ac0022-c157-416d-9f93-509f629273a1 /eml:eml/dataset/dataTable/attributeList/attribute[10] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Number of organisms counted count +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 7de66f26-b649-459f-ad93-122d7eda3b1b /eml:eml/dataset/dataTable/attributeList/attribute[11] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Taxon code assigned by an authoritative source auth_taxon_id +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute f8eb86e2-f02a-4ff9-bd9a-c0b810c312db /eml:eml/dataset/dataTable/attributeList/attribute[12] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Name of the athority or registry assigning the Authoritative Taxon Code auth_name +edi.3.9 https://portal.edirepository.org/nis/metadataviewer?packageid=edi.3.9 attribute 686a51da-c990-45fe-a8bb-94fc5e52d41b /eml:eml/dataset/dataTable/attributeList/attribute[13] SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv Taxon name, usually species binomial or other taxon name taxon_name +edi.3.9 
def test_get_description():
    """Check that get_description yields a non-empty string for a dataset,
    a data entity and an attribute taken from the example EML file"""
    # Locate and parse the example EML document
    eml_path = "/".join([datasets.get_example_eml_dir(), "edi.3.9.xml"])
    tree = etree.parse(eml_path)

    # dataTable stands in for data entities in general
    for tag in ("dataset", "dataTable", "attribute"):
        first_match = tree.xpath(".//" + tag)[0]
        result = get_description(first_match)
        assert isinstance(result, str)
        assert len(result) > 0