[#8] initial adapter for raster aggregation
pkdash committed Mar 21, 2024
1 parent 100b66d commit d676590
Showing 3 changed files with 196 additions and 66 deletions.
114 changes: 107 additions & 7 deletions hsextract/adapters/hydroshare.py
@@ -296,16 +296,9 @@ class _NetCDFAggregationMetadata(BaseModel):
variables: List[Variable]
spatial_coverage: Optional[Union[SpatialCoverageBox, SpatialCoveragePoint]]
period_coverage: Optional[TemporalCoverage]
content_files: Optional[List[ContentFile]]
# the extracted file (media object) metadata is already in schema.MediaObject format
associatedMedia: Optional[List[schema.MediaObject]]

def to_aggregation_associated_media(self):
media_objects = []
for content_file in self.content_files:
media_objects.append(content_file.to_dataset_media_object())
return media_objects

def to_aggregation_spatial_coverage(self):
if self.spatial_coverage:
return self.spatial_coverage.to_dataset_spatial_coverage()
@@ -332,3 +325,110 @@ def to_catalog_dataset(self):
aggregation_metadata.additionalProperty = []
aggregation_metadata.associatedMedia = self.associatedMedia
return aggregation_metadata


class BandInformation(BaseModel):
name: str
maximum_value: float
minimum_value: float
no_data_value: float

def to_aggregation_band_as_additional_property(self):
band = schema.PropertyValue.construct()
band.name = self.name
band.maxValue = self.maximum_value
band.minValue = self.minimum_value
band.value = schema.PropertyValue.construct()
band.value.name = "no_data_value"
band.value.value = self.no_data_value
return band

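For orientation, a quick sketch of what the band conversion produces. The band values below are hypothetical, not taken from any real raster:

```python
# Hypothetical band values, for illustration only.
band = BandInformation(
    name="Band_1",
    maximum_value=3031.4,
    minimum_value=1358.3,
    no_data_value=-3.4e38,
)
prop = band.to_aggregation_band_as_additional_property()
# prop.name == "Band_1"; prop.minValue and prop.maxValue carry the band range,
# and prop.value is a nested PropertyValue named "no_data_value".
```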

class SpatialReference(BaseModel):
projection_string: str
projection: str
datum: str
eastlimit: float
northlimit: float
units: str

def to_aggregation_spatial_reference_as_additional_property(self):
spatial_reference = schema.PropertyValue.construct()
spatial_reference.name = "spatial_reference"
spatial_reference.value = []
proj_str = schema.PropertyValue.construct()
proj_str.name = "projection_string"
proj_str.value = self.projection_string
spatial_reference.value.append(proj_str)

projection = schema.PropertyValue.construct()
projection.name = "projection"
projection.value = self.projection
spatial_reference.value.append(projection)

east_limit = schema.PropertyValue.construct()
east_limit.name = "eastlimit"
east_limit.value = self.eastlimit
spatial_reference.value.append(east_limit)

north_limit = schema.PropertyValue.construct()
north_limit.name = "northlimit"
north_limit.value = self.northlimit
spatial_reference.value.append(north_limit)

return spatial_reference


class _RasterAggregationMetadata(BaseModel):
title: Optional[str]
spatial_coverage: Optional[Union[SpatialCoverageBox, SpatialCoveragePoint]]
period_coverage: Optional[TemporalCoverage]
# the extracted file (media object) metadata is already in schema.MediaObject format
associatedMedia: Optional[List[schema.MediaObject]]
band_information: BandInformation
cell_information: dict
spatial_reference: SpatialReference

def to_aggregation_cell_info_as_additional_properties(self):
additional_properties = []
if self.cell_information:
for key, value in self.cell_information.items():
prop = schema.PropertyValue.construct()
prop.name = key
prop.value = value
additional_properties.append(prop)
return additional_properties

def to_aggregation_spatial_coverage(self):
if self.spatial_coverage:
aggr_spatial_coverage = self.spatial_coverage.to_dataset_spatial_coverage()
if aggr_spatial_coverage:
aggr_spatial_coverage.additionalProperty = [
self.spatial_reference.to_aggregation_spatial_reference_as_additional_property()]

return aggr_spatial_coverage
return None

def to_aggregation_period_coverage(self):
if self.period_coverage:
return self.period_coverage.to_dataset_temporal_coverage()
return None

def to_catalog_dataset(self):
aggregation_metadata = schema.RasterAggregationMetadata.construct()
aggregation_metadata.name = self.title
aggregation_metadata.spatialCoverage = self.to_aggregation_spatial_coverage()
aggregation_metadata.temporalCoverage = self.to_aggregation_period_coverage()
aggregation_metadata.additionalProperty = self.to_aggregation_cell_info_as_additional_properties()
aggregation_metadata.additionalProperty.append(
self.band_information.to_aggregation_band_as_additional_property())
aggregation_metadata.associatedMedia = self.associatedMedia
return aggregation_metadata


class RasterAggregationMetadataAdapter:
@staticmethod
def to_catalog_record(aggr_metadata: dict):
"""Converts extracted raster aggregation metadata to a catalog dataset record"""
aggr_model = _RasterAggregationMetadata(**aggr_metadata)
return aggr_model.to_catalog_dataset()
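
A minimal end-to-end sketch of the new adapter. All field values here are hypothetical; in practice the dict comes from the raster extractor (`extract_from_tif_file`):

```python
# Hypothetical extracted-metadata dict; keys mirror _RasterAggregationMetadata.
extracted = {
    "title": "logan1.tif",
    "band_information": {
        "name": "Band_1",
        "maximum_value": 3031.4,
        "minimum_value": 1358.3,
        "no_data_value": -3.4e38,
    },
    "cell_information": {"rows": 230, "columns": 220},
    "spatial_reference": {
        "projection_string": "PROJCS[...]",  # placeholder WKT
        "projection": "NAD83 / UTM zone 12N",
        "datum": "North_American_Datum_1983",
        "eastlimit": 452144.0,
        "northlimit": 4662977.0,
        "units": "meter",
    },
}
record = RasterAggregationMetadataAdapter.to_catalog_record(extracted)
# record is a schema.RasterAggregationMetadata instance; the band, cell, and
# spatial-reference details all land in record.additionalProperty.
```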
110 changes: 62 additions & 48 deletions hsextract/models/schema.py
@@ -301,13 +301,65 @@ def validate_box(cls, v):
return v


class PropertyValueBase(SchemaBaseModel):
type: str = Field(
alias="@type",
default="PropertyValue",
const="PropertyValue",
description="A property-value pair.",
)
propertyID: Optional[str] = Field(
title="Property ID", description="The ID of the property."
)
name: str = Field(description="The name of the property.")
value: str = Field(description="The value of the property.")
unitCode: Optional[str] = Field(
title="Measurement unit", description="The unit of measurement for the value."
)
description: Optional[str] = Field(description="A description of the property.")
minValue: Optional[float] = Field(
title="Minimum value", description="The minimum allowed value for the property."
)
maxValue: Optional[float] = Field(
title="Maximum value", description="The maximum allowed value for the property."
)
measurementTechnique: Optional[str] = Field(
title="Measurement technique", description="A technique or technology used in a measurement."
)

class Config:
title = "PropertyValue"

@root_validator
def validate_min_max_values(cls, values):
min_value = values.get("minValue", None)
max_value = values.get("maxValue", None)
if min_value is not None and max_value is not None:
if min_value > max_value:
raise ValueError("Minimum value must be less than or equal to maximum value")

return values


class PropertyValue(PropertyValueBase):
# using PropertyValueBase model instead of PropertyValue model as one of the types for the value field
# in order for the schema generation (schema.json) to work. Self referencing nested models leads to
# infinite loop in our custom schema generation code when trying to replace dict with key '$ref'
value: Union[str, PropertyValueBase, List[PropertyValueBase]] = Field(description="The value of the property.")

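The practical effect is that nesting is expressed one level at a time: a `PropertyValue` whose `value` is a single `PropertyValueBase` or a list of them, which is exactly the shape the raster adapter builds. A sketch with illustrative values:

```python
# Illustrative values; mirrors the nested structure built by the adapters.
inner = PropertyValue.construct(name="no_data_value", value="-3.4e38")
outer = PropertyValue.construct(
    name="Band_1",
    value=inner,  # a single nested property...
    minValue=1358.3,
    maxValue=3031.4,
)
# ...or a list of nested properties, as in the "spatial_reference" case.
```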

class Place(SchemaBaseModel):
type: str = Field(alias="@type", default="Place", description="Represents the focus area of the record's content.")
name: Optional[str] = Field(description="Name of the place.")
geo: Optional[Union[GeoCoordinates, GeoShape]] = Field(
description="Specifies the geographic coordinates of the place in the form of a point location, line, "
"or area coverage extent."
)
additionalProperty: Optional[List[PropertyValue]] = Field(
title="Additional properties",
default=[],
description="Additional properties of the location/place."
)

@root_validator
def validate_geo_or_name_required(cls, values):
@@ -324,7 +376,7 @@ class MediaObject(SchemaBaseModel):
title="Content URL",
description="The direct URL link to access or download the actual content of the media object.",
)
encodingFormat: str = Field(
encodingFormat: Optional[str] = Field(
title="Encoding format", description="Represents the specific file format in which the media is encoded."
) # TODO enum for encoding formats
contentSize: str = Field(
@@ -363,53 +415,6 @@ def validate_content_size(cls, v):
return v


class PropertyValueBase(SchemaBaseModel):
type: str = Field(
alias="@type",
default="PropertyValue",
const="PropertyValue",
description="A property-value pair.",
)
propertyID: Optional[str] = Field(
title="Property ID", description="The ID of the property."
)
name: str = Field(description="The name of the property.")
value: str = Field(description="The value of the property.")
unitCode: Optional[str] = Field(
title="Measurement unit", description="The unit of measurement for the value."
)
description: Optional[str] = Field(description="A description of the property.")
minValue: Optional[float] = Field(
title="Minimum value", description="The minimum allowed value for the property."
)
maxValue: Optional[float] = Field(
title="Maximum value", description="The maximum allowed value for the property."
)
measurementTechnique: Optional[str] = Field(
title="Measurement technique", description="A technique or technology used in a measurement."
)

class Config:
title = "PropertyValue"

@root_validator
def validate_min_max_values(cls, values):
min_value = values.get("minValue", None)
max_value = values.get("maxValue", None)
if min_value is not None and max_value is not None:
if min_value > max_value:
raise ValueError("Minimum value must be less than or equal to maximum value")

return values


class PropertyValue(PropertyValueBase):
# using PropertyValueBase model instead of PropertyValue model as one of the types for the value field
# in order for the schema generation (schema.json) to work. Self referencing nested models leads to
# infinite loop in our custom schema generation code when trying to replace dict with key '$ref'
value: Union[str, PropertyValueBase, List[PropertyValueBase]] = Field(description="The value of the property.")


class CoreMetadata(SchemaBaseModel):
context: HttpUrl = Field(
alias='@context',
@@ -599,3 +604,12 @@ class NetCDFAggregationMetadata(BaseAggregationMetadata):
variableMeasured: Optional[List[Union[str, PropertyValue]]] = Field(
title="Variables measured", description="Measured variables."
)


class RasterAggregationMetadata(BaseAggregationMetadata):
type: str = Field(
alias="@type",
default="Geo Raster Dataset",
const=True,
description="Type of aggregation."
)
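
Since `const=True` pins the field to its default, every instance carries the same `@type`. A small sketch (note that `BaseAggregationMetadata` is defined earlier in schema.py and is not part of this diff):

```python
# construct() fills in field defaults without running validation.
meta = RasterAggregationMetadata.construct()
assert meta.type == "Geo Raster Dataset"
```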
38 changes: 27 additions & 11 deletions hsextract/utils.py
@@ -5,7 +5,11 @@

from pathlib import Path

from hsextract.adapters.hydroshare import HydroshareMetadataAdapter, NetCDFAggregationMetadataAdapter
from hsextract.adapters.hydroshare import (
HydroshareMetadataAdapter,
NetCDFAggregationMetadataAdapter,
RasterAggregationMetadataAdapter,
)
from hsextract.listing.utils import prepare_files
from hsextract.models.schema import CoreMetadataDOC
from hsextract.raster.utils import extract_from_tif_file
@@ -23,8 +27,8 @@ def _to_metadata_path(filepath: str, user_metadata_filename: str):
return os.path.join(".hs", dirname, "dataset_metadata.json")


def extract_metadata_with_file_path(type: str, filepath: str, user_metadata_filename: str):
extracted_metadata = extract_metadata(type, filepath)
def extract_metadata_with_file_path(type: str, filepath: str, user_metadata_filename: str, use_adapter=True):
extracted_metadata = extract_metadata(type, filepath, use_adapter)
if extracted_metadata:
filepath = _to_metadata_path(filepath, user_metadata_filename)
os.makedirs(os.path.dirname(filepath), exist_ok=True)
@@ -33,7 +37,9 @@ def extract_metadata_with_file_path(type: str, filepath: str, user_metadata_filename: str, use_adapter=True):
return filepath, extracted_metadata is not None


def extract_metadata(type: str, filepath):
def extract_metadata(type: str, filepath, use_adapter=True):
# use_adapter controls whether the extracted metadata is converted to a catalog record.
# It is set to False in tests that check the raw extracted metadata.
try:
extracted_metadata = _extract_metadata(type, filepath)
except Exception as e:
@@ -47,20 +53,29 @@ def extract_metadata(type: str, filepath):
del extracted_metadata["content_files"]
if type == "user_meta":
extracted_metadata["associatedMedia"] = all_file_metadata
# user specified metadata is not extracted - so always use the core metadata adapter
return json.loads(CoreMetadataDOC.construct(**extracted_metadata).json())
else:
extracted_metadata["associatedMedia"] = all_file_metadata
if type == "netcdf":
adapter = NetCDFAggregationMetadataAdapter()
catalog_record = json.loads(adapter.to_catalog_record(extracted_metadata).json())
return catalog_record
if use_adapter:
adapter = None
if type == "raster":
adapter = RasterAggregationMetadataAdapter()
elif type == "netcdf":
adapter = NetCDFAggregationMetadataAdapter()
# only convert when an adapter exists for this aggregation type
if adapter is not None:
catalog_record = json.loads(adapter.to_catalog_record(extracted_metadata).json())
return catalog_record
return extracted_metadata

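The effect of the flag, sketched with a hypothetical file path:

```python
# Hypothetical path; both calls return dicts, but of different shapes.
raw = extract_metadata("raster", "logan1.tif", use_adapter=False)  # raw extractor output
record = extract_metadata("raster", "logan1.tif")  # catalog record (default)
```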

def _extract_metadata(type: str, filepath):
extension = os.path.splitext(filepath)[1]
metadata = None
if type == "raster":
metadata = extract_from_tif_file(filepath)
# convert ordered dict to dict
metadata["cell_information"] = dict(metadata["cell_information"])
elif type == "feature":
metadata = extract_metadata_and_files(filepath)
elif type == "netcdf":
@@ -87,7 +102,7 @@ def _extract_metadata(type: str, filepath):
return metadata


async def list_and_extract(path: str, user_metadata_filename: str, base_url: str):
async def list_and_extract(path: str, user_metadata_filename: str, base_url: str, use_adapter=True):
current_directory = os.getcwd()
try:
os.chdir(path)
@@ -103,7 +118,7 @@ async def list_and_extract(path: str, user_metadata_filename: str, base_url: str, use_adapter=True):
for file in files:
tasks.append(
asyncio.get_running_loop().run_in_executor(
None, extract_metadata_with_file_path, category, file, user_metadata_filename
None, extract_metadata_with_file_path, category, file, user_metadata_filename,
use_adapter
)
)

@@ -116,7 +132,7 @@ async def list_and_extract(path: str, user_metadata_filename: str, base_url: str, use_adapter=True):

# The netcdf library does not seem to be thread safe, so these files are processed in the current thread
for file in netcdf_files:
results.append(extract_metadata_with_file_path("netcdf", file, user_metadata_filename))
results.append(extract_metadata_with_file_path("netcdf", file, user_metadata_filename, use_adapter))

metadata_manifest = [
{file_path: f"{file_path}.json"}
