[#8] initial adapter for raster aggregation
pkdash committed Mar 21, 2024
1 parent 100b66d commit d676590
Showing 3 changed files with 196 additions and 66 deletions.
114 changes: 107 additions & 7 deletions hsextract/adapters/hydroshare.py
@@ -296,16 +296,9 @@ class _NetCDFAggregationMetadata(BaseModel):
variables: List[Variable]
spatial_coverage: Optional[Union[SpatialCoverageBox, SpatialCoveragePoint]]
period_coverage: Optional[TemporalCoverage]
content_files: Optional[List[ContentFile]]
# the extracted file (media object) metadata is already in schema.MediaObject format
associatedMedia: Optional[List[schema.MediaObject]]

def to_aggregation_associated_media(self):
media_objects = []
for content_file in self.content_files:
media_objects.append(content_file.to_dataset_media_object())
return media_objects

def to_aggregation_spatial_coverage(self):
if self.spatial_coverage:
return self.spatial_coverage.to_dataset_spatial_coverage()
@@ -332,3 +325,110 @@ def to_catalog_dataset(self):
aggregation_metadata.additionalProperty = []
aggregation_metadata.associatedMedia = self.associatedMedia
return aggregation_metadata


class BandInformation(BaseModel):
name: str
maximum_value: float
minimum_value: float
no_data_value: float

def to_aggregation_band_as_additional_property(self):
band = schema.PropertyValue.construct()
band.name = self.name
band.maxValue = self.maximum_value
band.minValue = self.minimum_value
band.value = schema.PropertyValue.construct()
band.value.name = "no_data_value"
band.value.value = self.no_data_value
return band

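For orientation, a quick sketch of what the band conversion produces. The band values below are hypothetical, not taken from any real raster:

```python
# Hypothetical band values, for illustration only.
band = BandInformation(
    name="Band_1",
    maximum_value=3031.4,
    minimum_value=1358.3,
    no_data_value=-3.4e38,
)
prop = band.to_aggregation_band_as_additional_property()
# prop.name == "Band_1"; prop.minValue and prop.maxValue carry the band range,
# and prop.value is a nested PropertyValue named "no_data_value".
```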

class SpatialReference(BaseModel):
projection_string: str
projection: str
datum: str
eastlimit: float
northlimit: float
units: str

def to_aggregation_spatial_reference_as_additional_property(self):
spatial_reference = schema.PropertyValue.construct()
spatial_reference.name = "spatial_reference"
spatial_reference.value = []
proj_str = schema.PropertyValue.construct()
proj_str.name = "projection_string"
proj_str.value = self.projection_string
spatial_reference.value.append(proj_str)

projection = schema.PropertyValue.construct()
projection.name = "projection"
projection.value = self.projection
spatial_reference.value.append(projection)

east_limit = schema.PropertyValue.construct()
east_limit.name = "eastlimit"
east_limit.value = self.eastlimit
spatial_reference.value.append(east_limit)

north_limit = schema.PropertyValue.construct()
north_limit.name = "northlimit"
north_limit.value = self.northlimit
spatial_reference.value.append(north_limit)

return spatial_reference


class _RasterAggregationMetadata(BaseModel):
title: Optional[str]
spatial_coverage: Optional[Union[SpatialCoverageBox, SpatialCoveragePoint]]
period_coverage: Optional[TemporalCoverage]
# the extracted file (media object) metadata is already in schema.MediaObject format
associatedMedia: Optional[List[schema.MediaObject]]
band_information: BandInformation
cell_information: dict
spatial_reference: SpatialReference

def to_aggregation_cell_info_as_additional_properties(self):
additional_properties = []
if self.cell_information:
for key, value in self.cell_information.items():
prop = schema.PropertyValue.construct()
prop.name = key
prop.value = value
additional_properties.append(prop)
return additional_properties

def to_aggregation_spatial_coverage(self):
if self.spatial_coverage:
aggr_spatial_coverage = self.spatial_coverage.to_dataset_spatial_coverage()
if aggr_spatial_coverage:
aggr_spatial_coverage.additionalProperty = [
self.spatial_reference.to_aggregation_spatial_reference_as_additional_property()]

return aggr_spatial_coverage
return None

def to_aggregation_period_coverage(self):
if self.period_coverage:
return self.period_coverage.to_dataset_temporal_coverage()
return None

def to_catalog_dataset(self):
aggregation_metadata = schema.RasterAggregationMetadata.construct()
aggregation_metadata.name = self.title
aggregation_metadata.spatialCoverage = self.to_aggregation_spatial_coverage()
aggregation_metadata.temporalCoverage = self.to_aggregation_period_coverage()
aggregation_metadata.additionalProperty = self.to_aggregation_cell_info_as_additional_properties()
aggregation_metadata.additionalProperty.append(
self.band_information.to_aggregation_band_as_additional_property())
aggregation_metadata.associatedMedia = self.associatedMedia
return aggregation_metadata


class RasterAggregationMetadataAdapter:
@staticmethod
def to_catalog_record(aggr_metadata: dict):
"""Converts extracted raster aggregation metadata to a catalog dataset record"""
aggr_model = _RasterAggregationMetadata(**aggr_metadata)
return aggr_model.to_catalog_dataset()
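
A minimal end-to-end sketch of the new adapter. All field values here are hypothetical; in practice the dict comes from the raster extractor (`extract_from_tif_file`):

```python
# Hypothetical extracted-metadata dict; keys mirror _RasterAggregationMetadata.
extracted = {
    "title": "logan1.tif",
    "band_information": {
        "name": "Band_1",
        "maximum_value": 3031.4,
        "minimum_value": 1358.3,
        "no_data_value": -3.4e38,
    },
    "cell_information": {"rows": 230, "columns": 220},
    "spatial_reference": {
        "projection_string": "PROJCS[...]",  # placeholder WKT
        "projection": "NAD83 / UTM zone 12N",
        "datum": "North_American_Datum_1983",
        "eastlimit": 452144.0,
        "northlimit": 4662977.0,
        "units": "meter",
    },
}
record = RasterAggregationMetadataAdapter.to_catalog_record(extracted)
# record is a schema.RasterAggregationMetadata instance; the band, cell, and
# spatial-reference details all land in record.additionalProperty.
```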
110 changes: 62 additions & 48 deletions hsextract/models/schema.py
@@ -301,13 +301,65 @@ def validate_box(cls, v):
return v


class PropertyValueBase(SchemaBaseModel):
type: str = Field(
alias="@type",
default="PropertyValue",
const="PropertyValue",
description="A property-value pair.",
)
propertyID: Optional[str] = Field(
title="Property ID", description="The ID of the property."
)
name: str = Field(description="The name of the property.")
value: str = Field(description="The value of the property.")
unitCode: Optional[str] = Field(
title="Measurement unit", description="The unit of measurement for the value."
)
description: Optional[str] = Field(description="A description of the property.")
minValue: Optional[float] = Field(
title="Minimum value", description="The minimum allowed value for the property."
)
maxValue: Optional[float] = Field(
title="Maximum value", description="The maximum allowed value for the property."
)
measurementTechnique: Optional[str] = Field(
title="Measurement technique", description="A technique or technology used in a measurement."
)

class Config:
title = "PropertyValue"

@root_validator
def validate_min_max_values(cls, values):
min_value = values.get("minValue", None)
max_value = values.get("maxValue", None)
if min_value is not None and max_value is not None:
if min_value > max_value:
raise ValueError("Minimum value must be less than or equal to maximum value")

return values


class PropertyValue(PropertyValueBase):
# using PropertyValueBase model instead of PropertyValue model as one of the types for the value field
# in order for the schema generation (schema.json) to work. Self referencing nested models leads to
# infinite loop in our custom schema generation code when trying to replace dict with key '$ref'
value: Union[str, PropertyValueBase, List[PropertyValueBase]] = Field(description="The value of the property.")

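The practical effect is that nesting is expressed one level at a time: a `PropertyValue` whose `value` is a single `PropertyValueBase` or a list of them, which is exactly the shape the raster adapter builds. A sketch with illustrative values:

```python
# Illustrative values; mirrors the nested structure built by the adapters.
inner = PropertyValue.construct(name="no_data_value", value="-3.4e38")
outer = PropertyValue.construct(
    name="Band_1",
    value=inner,  # a single nested property...
    minValue=1358.3,
    maxValue=3031.4,
)
# ...or a list of nested properties, as in the "spatial_reference" case.
```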

class Place(SchemaBaseModel):
type: str = Field(alias="@type", default="Place", description="Represents the focus area of the record's content.")
name: Optional[str] = Field(description="Name of the place.")
geo: Optional[Union[GeoCoordinates, GeoShape]] = Field(
description="Specifies the geographic coordinates of the place in the form of a point location, line, "
"or area coverage extent."
)
additionalProperty: Optional[List[PropertyValue]] = Field(
title="Additional properties",
default=[],
description="Additional properties of the location/place."
)

@root_validator
def validate_geo_or_name_required(cls, values):
@@ -324,7 +376,7 @@ class MediaObject(SchemaBaseModel):
title="Content URL",
description="The direct URL link to access or download the actual content of the media object.",
)
encodingFormat: str = Field(
encodingFormat: Optional[str] = Field(
title="Encoding format", description="Represents the specific file format in which the media is encoded."
) # TODO enum for encoding formats
contentSize: str = Field(
@@ -363,53 +415,6 @@ def validate_content_size(cls, v):
return v


class PropertyValueBase(SchemaBaseModel):
type: str = Field(
alias="@type",
default="PropertyValue",
const="PropertyValue",
description="A property-value pair.",
)
propertyID: Optional[str] = Field(
title="Property ID", description="The ID of the property."
)
name: str = Field(description="The name of the property.")
value: str = Field(description="The value of the property.")
unitCode: Optional[str] = Field(
title="Measurement unit", description="The unit of measurement for the value."
)
description: Optional[str] = Field(description="A description of the property.")
minValue: Optional[float] = Field(
title="Minimum value", description="The minimum allowed value for the property."
)
maxValue: Optional[float] = Field(
title="Maximum value", description="The maximum allowed value for the property."
)
measurementTechnique: Optional[str] = Field(
title="Measurement technique", description="A technique or technology used in a measurement."
)

class Config:
title = "PropertyValue"

@root_validator
def validate_min_max_values(cls, values):
min_value = values.get("minValue", None)
max_value = values.get("maxValue", None)
if min_value is not None and max_value is not None:
if min_value > max_value:
raise ValueError("Minimum value must be less than or equal to maximum value")

return values


class PropertyValue(PropertyValueBase):
# using PropertyValueBase model instead of PropertyValue model as one of the types for the value field
# in order for the schema generation (schema.json) to work. Self referencing nested models leads to
# infinite loop in our custom schema generation code when trying to replace dict with key '$ref'
value: Union[str, PropertyValueBase, List[PropertyValueBase]] = Field(description="The value of the property.")


class CoreMetadata(SchemaBaseModel):
context: HttpUrl = Field(
alias='@context',
@@ -599,3 +604,12 @@ class NetCDFAggregationMetadata(BaseAggregationMetadata):
variableMeasured: Optional[List[Union[str, PropertyValue]]] = Field(
title="Variables measured", description="Measured variables."
)


class RasterAggregationMetadata(BaseAggregationMetadata):
type: str = Field(
alias="@type",
default="Geo Raster Dataset",
const=True,
description="Type of aggregation."
)
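
Since `const=True` pins the field to its default, every instance carries the same `@type`. A small sketch (note that `BaseAggregationMetadata` is defined earlier in schema.py and is not part of this diff):

```python
# construct() fills in field defaults without running validation.
meta = RasterAggregationMetadata.construct()
assert meta.type == "Geo Raster Dataset"
```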
38 changes: 27 additions & 11 deletions hsextract/utils.py
@@ -5,7 +5,11 @@

from pathlib import Path

from hsextract.adapters.hydroshare import HydroshareMetadataAdapter, NetCDFAggregationMetadataAdapter
from hsextract.adapters.hydroshare import (
HydroshareMetadataAdapter,
NetCDFAggregationMetadataAdapter,
RasterAggregationMetadataAdapter,
)
from hsextract.listing.utils import prepare_files
from hsextract.models.schema import CoreMetadataDOC
from hsextract.raster.utils import extract_from_tif_file
@@ -23,8 +27,8 @@ def _to_metadata_path(filepath: str, user_metadata_filename: str):
return os.path.join(".hs", dirname, "dataset_metadata.json")


def extract_metadata_with_file_path(type: str, filepath: str, user_metadata_filename: str):
extracted_metadata = extract_metadata(type, filepath)
def extract_metadata_with_file_path(type: str, filepath: str, user_metadata_filename: str, use_adapter=True):
extracted_metadata = extract_metadata(type, filepath, use_adapter)
if extracted_metadata:
filepath = _to_metadata_path(filepath, user_metadata_filename)
os.makedirs(os.path.dirname(filepath), exist_ok=True)
@@ -33,7 +37,9 @@ def extract_metadata_with_file_path(type: str, filepath: str, user_metadata_filename: str, use_adapter=True):
return filepath, extracted_metadata is not None


def extract_metadata(type: str, filepath):
def extract_metadata(type: str, filepath, use_adapter=True):
# use_adapter controls whether the extracted metadata is converted to a catalog record.
# It is set to False in tests that check the raw extracted metadata.
try:
extracted_metadata = _extract_metadata(type, filepath)
except Exception as e:
@@ -47,20 +53,29 @@ def extract_metadata(type: str, filepath):
del extracted_metadata["content_files"]
if type == "user_meta":
extracted_metadata["associatedMedia"] = all_file_metadata
# user specified metadata is not extracted - so always use the core metadata adapter
return json.loads(CoreMetadataDOC.construct(**extracted_metadata).json())
else:
extracted_metadata["associatedMedia"] = all_file_metadata
if type == "netcdf":
adapter = NetCDFAggregationMetadataAdapter()
catalog_record = json.loads(adapter.to_catalog_record(extracted_metadata).json())
return catalog_record
if use_adapter:
adapter = None
if type == "raster":
adapter = RasterAggregationMetadataAdapter()
elif type == "netcdf":
adapter = NetCDFAggregationMetadataAdapter()
# only convert when an adapter exists for this aggregation type
if adapter is not None:
catalog_record = json.loads(adapter.to_catalog_record(extracted_metadata).json())
return catalog_record
return extracted_metadata

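The effect of the flag, sketched with a hypothetical file path:

```python
# Hypothetical path; both calls return dicts, but of different shapes.
raw = extract_metadata("raster", "logan1.tif", use_adapter=False)  # raw extractor output
record = extract_metadata("raster", "logan1.tif")  # catalog record (default)
```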

def _extract_metadata(type: str, filepath):
extension = os.path.splitext(filepath)[1]
metadata = None
if type == "raster":
metadata = extract_from_tif_file(filepath)
# convert ordered dict to dict
metadata["cell_information"] = dict(metadata["cell_information"])
elif type == "feature":
metadata = extract_metadata_and_files(filepath)
elif type == "netcdf":
@@ -87,7 +102,7 @@ def _extract_metadata(type: str, filepath):
return metadata


async def list_and_extract(path: str, user_metadata_filename: str, base_url: str):
async def list_and_extract(path: str, user_metadata_filename: str, base_url: str, use_adapter=True):
current_directory = os.getcwd()
try:
os.chdir(path)
@@ -103,7 +118,7 @@ async def list_and_extract(path: str, user_metadata_filename: str, base_url: str, use_adapter=True):
for file in files:
tasks.append(
asyncio.get_running_loop().run_in_executor(
None, extract_metadata_with_file_path, category, file, user_metadata_filename
None, extract_metadata_with_file_path, category, file, user_metadata_filename,
use_adapter
)
)

@@ -116,7 +132,7 @@ async def list_and_extract(path: str, user_metadata_filename: str, base_url: str, use_adapter=True):

# The netcdf library does not seem to be thread safe, so these files are processed in the current thread
for file in netcdf_files:
results.append(extract_metadata_with_file_path("netcdf", file, user_metadata_filename))
results.append(extract_metadata_with_file_path("netcdf", file, user_metadata_filename, use_adapter))

metadata_manifest = [
{file_path: f"{file_path}.json"}
