From a0ce4359ec32edf93e4f64b3251c2ce284e1f26a Mon Sep 17 00:00:00 2001 From: pkdash Date: Mon, 27 Feb 2023 15:13:27 -0500 Subject: [PATCH 01/23] [#44] initial work - loading aggregation data to data processing object --- hsclient/hydroshare.py | 116 +++++++++++++++++++++++++++++++++++++---- requirements.txt | 4 ++ 2 files changed, 110 insertions(+), 10 deletions(-) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index 4c843e8..0a92c59 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -1,6 +1,8 @@ import getpass import os +import pathlib import pickle +import shutil import sqlite3 import tempfile import time @@ -13,8 +15,11 @@ from urllib.parse import quote, unquote, urlparse from zipfile import ZipFile +import fiona import pandas +import rasterio import requests +import xarray from hsmodels.schemas import load_rdf, rdf_string from hsmodels.schemas.base_models import BaseMetadata from hsmodels.schemas.enums import AggregationType @@ -108,6 +113,7 @@ def __init__(self, map_path, hs_session, checksums=None): self._parsed_files = None self._parsed_aggregations = None self._parsed_checksums = checksums + self._data_object = None def __str__(self): return self._map_path @@ -232,6 +238,10 @@ def main_file_path(self) -> str: return self.files()[0].folder return self.files()[0].path + @property + def data_object(self) -> Union[pandas.Series, fiona.Collection, rasterio.DatasetReader, xarray.Dataset, None]: + return self._data_object + @refresh def save(self) -> None: """ @@ -281,10 +291,10 @@ def aggregations(self, **kwargs) -> List[BaseMetadata]: aggregations = self._aggregations for key, value in kwargs.items(): if key.startswith('file__'): - file_args = {key[len('file__') :]: value} + file_args = {key[len('file__'):]: value} aggregations = [agg for agg in aggregations if agg.files(**file_args)] elif key.startswith('files__'): - file_args = {key[len('files__') :]: value} + file_args = {key[len('files__'):]: value} aggregations = [agg for agg in aggregations if agg.files(**file_args)] else: aggregations = filter(lambda agg: attribute_filter(agg.metadata, key, value), aggregations) @@ -314,14 +324,15 @@ def refresh(self) -> None: self._parsed_files = None self._parsed_aggregations = None self._parsed_checksums = None + self._data_object = None - def as_series(self, series_id: str, agg_path: str = None) -> Dict[int, pandas.Series]: + def as_series(self, series_id: str, agg_path: str = None) -> pandas.DataFrame: """ - Creates a pandas Series object out of an aggregation of type TimeSeries. - :param series_id: The series_id of the timeseries result to be converted to a Series object. + Creates a pandas DataFrame object out of an aggregation of type TimeSeries. + :param series_id: The series_id of the timeseries result to be converted to a Dataframe object. :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have it downloaded locally. - :return: A pandas.Series object + :return: A pandas.DataFrame object """ def to_series(timeseries_file: str): @@ -332,13 +343,98 @@ def to_series(timeseries_file: str): con, ).squeeze() + return self._get_data_object(agg_path=agg_path, func=to_series) + + def as_multi_dimensional_dataset(self, agg_path: str = None) -> xarray.Dataset: + """ + Creates a xarray Dataset object out of an aggregation of type NetCDF. + :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have + it downloaded locally. 
+ :return: A xarray.Dataset object + """ + if self.metadata.type != AggregationType.MultidimensionalAggregation: + raise Exception("Aggregation is not of type NetCDF") + + return self._get_data_object(agg_path=agg_path, func=xarray.open_dataset) + + def as_feature_collection(self, agg_path: str = None) -> fiona.Collection: + """ + Creates a fiona Collection object out of an aggregation of type GeoFeature. + :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have + it downloaded locally at aggr_path. + :return: A fiona.Collection object + Note: The caller is responsible for closing the fiona.Collection object to free up aggregation files used to + create this object. + """ + if self.metadata.type != AggregationType.GeographicFeatureAggregation: + raise Exception("Aggregation is not of type GeoFeature") + + return self._get_data_object(agg_path=agg_path, func=fiona.open) + + def as_raster_dataset(self, agg_path: str = None) -> rasterio.DatasetReader: + """ + Creates a rasterio DatasetReader object out of an aggregation of type GeoRaster + :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have + it downloaded locally at aggr_path. + :return: A rasterio.DatasetReader object + Note: The caller is responsible for closing the rasterio.DatasetReader object to free up aggregation files + used to create this object. + """ + if self.metadata.type != AggregationType.GeographicRasterAggregation: + raise Exception("Aggregation is not of type GeoRaster") + + return self._get_data_object(agg_path=agg_path, func=rasterio.open) + + def as_data_object(self, series_id: str = None, agg_path: str = None) -> \ + Union[pandas.DataFrame, fiona.Collection, rasterio.DatasetReader, xarray.Dataset, None]: + """Load aggregation data to a relevant data object tyoe""" + + if self.metadata.type == AggregationType.TimeSeriesAggregation: + if not series_id: + raise Exception("Please specify series_id for which the timeseries data object is needed.") + return self.as_series(series_id=series_id, agg_path=agg_path) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + return self.as_multi_dimensional_dataset(agg_path=agg_path) + if self.metadata.type == AggregationType.GeographicFeatureAggregation: + return self.as_feature_collection(agg_path=agg_path) + if self.metadata.type == AggregationType.GeographicRasterAggregation: + return self.as_raster_dataset(agg_path=agg_path) + + raise Exception(f"Data object is not supported for '{self.metadata.type}' aggregation type") + + def _get_data_object(self, agg_path, func): + if self._data_object is not None and self.metadata.type != AggregationType.TimeSeriesAggregation: + return self._data_object + + main_file_ext = pathlib.Path(self.main_file_path).suffix if agg_path is None: - with tempfile.TemporaryDirectory() as td: + td = tempfile.mkdtemp() + try: self._download(unzip_to=td) # zip extracted to folder with main file name - file_name = self.file(extension=".sqlite").name - return to_series(urljoin(td, file_name, file_name)) - return to_series(urljoin(agg_path, self.file(extension=".sqlite").name)) + file_name = self.file(extension=main_file_ext).name + file_path = urljoin(td, file_name, file_name) + data_object = func(file_path) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + data_object.close() + finally: + # we can delete the temporary directory for the data object created + # for these 2 aggregation types only. 
For other aggregation types, the generated data object + # needs to have access to the aggregation files in the temporary directory - so it's the caller's + # responsibility to delete the temporary directory + if self.metadata.type in (AggregationType.TimeSeriesAggregation, + AggregationType.MultidimensionalAggregation): + shutil.rmtree(td) + else: + file_path = urljoin(agg_path, self.file(extension=main_file_ext).name) + data_object = func(file_path) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + data_object.close() + + # cache the object for the aggregation + self._data_object = data_object + + return data_object class Resource(Aggregation): diff --git a/requirements.txt b/requirements.txt index 93b1196..a6d02eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,10 @@ pytest == 6.0.2 requests == 2.24.0 email-validator pandas +netCDF4 +xarray +rasterio +fiona isort black pytest-xdist From 96734941b039c48e840be349fdbeb5692eede51e Mon Sep 17 00:00:00 2001 From: pkdash Date: Thu, 2 Mar 2023 10:25:38 -0500 Subject: [PATCH 02/23] [#44] initial work for editing data for netcdf and timeseries aggregations --- hsclient/hydroshare.py | 295 +++++++++++++++++++++++++++++++++-------- 1 file changed, 242 insertions(+), 53 deletions(-) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index 0a92c59..69e355e 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -6,20 +6,41 @@ import sqlite3 import tempfile import time +from contextlib import closing import urllib.parse from datetime import datetime from functools import wraps from posixpath import basename, dirname, join as urljoin, splitext from pprint import pformat -from typing import Dict, List, Union +from typing import Dict, List, Union, TYPE_CHECKING from urllib.parse import quote, unquote, urlparse from zipfile import ZipFile -import fiona -import pandas -import rasterio +if TYPE_CHECKING: + import fiona + import pandas + import rasterio + import xarray +else: + try: + import fiona + except ImportError: + fiona = None + try: + import pandas + except ImportError: + pandas = None + try: + import rasterio + except ImportError: + rasterio = None + try: + import xarray + except ImportError: + xarray = None + import requests -import xarray + from hsmodels.schemas import load_rdf, rdf_string from hsmodels.schemas.base_models import BaseMetadata from hsmodels.schemas.enums import AggregationType @@ -211,6 +232,124 @@ def _download(self, save_path: str = "", unzip_to: str = None) -> str: return unzip_to return downloaded_zip + def _get_data_object(self, agg_path, func): + if self._data_object is not None and self.metadata.type != AggregationType.TimeSeriesAggregation: + return self._data_object + + main_file_ext = pathlib.Path(self.main_file_path).suffix + if agg_path is None: + td = tempfile.mkdtemp() + try: + self._download(unzip_to=td) + # zip extracted to folder with main file name + file_name = self.file(extension=main_file_ext).name + file_path = urljoin(td, file_name, file_name) + data_object = func(file_path) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + data_object.load() + data_object.close() + finally: + # we can delete the temporary directory for the data object created + # for these 2 aggregation types only. 
For other aggregation types, the generated data object + # needs to have access to the aggregation files in the temporary directory - so it's the caller's + # responsibility to delete the temporary directory + if self.metadata.type in (AggregationType.TimeSeriesAggregation, + AggregationType.MultidimensionalAggregation): + shutil.rmtree(td) + else: + file_path = urljoin(agg_path, self.file(extension=main_file_ext).name) + data_object = func(file_path) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + data_object.close() + + # cache the object for the aggregation + self._data_object = data_object + + return data_object + + def _save_data_object(self, resource, agg_path: str = "", as_new_aggr=False, destination_path=""): + if self._data_object is None: + raise Exception("No data object exists for this aggregation.") + + main_file_ext = pathlib.Path(self.main_file_path).suffix + temp_dir = None + if not agg_path: + temp_dir = tempfile.mkdtemp() + try: + self._download(unzip_to=temp_dir) + # zip extracted to folder with main file name + file_name = self.file(extension=main_file_ext).name + file_path = urljoin(temp_dir, file_name, file_name) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + self._data_object.to_netcdf(file_path, format="NETCDF4") + except Exception: + shutil.rmtree(temp_dir) + raise + else: + file_path = urljoin(agg_path, self.file(extension=main_file_ext).name) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + self._data_object.to_netcdf(file_path, format="NETCDF4") + + if self.metadata.type == AggregationType.TimeSeriesAggregation: + with closing(sqlite3.connect(file_path)) as conn: + # write the dataframe to a temp table + self._data_object.to_sql('temp', conn, if_exists='replace', index=False) + # delete the matching records from the TimeSeriesResultValues table + conn.execute("DELETE FROM TimeSeriesResultValues WHERE ResultID IN (SELECT ResultID FROM temp)") + conn.execute("INSERT INTO TimeSeriesResultValues SELECT * FROM temp") + # delete the temp table + conn.execute("DROP TABLE temp") + conn.commit() + + aggr_path = self.main_file_path + data_object = self._data_object + aggr_type = self.metadata.type + if not as_new_aggr: + # cache some of the metadata fields of the original aggregation to update the metadata of the + # updated aggregation + # TODO: There may be additional metadata fields that we need to consider to use for the updated aggregation + keywords = self.metadata.subjects + additional_meta = self.metadata.additional_metadata + if aggr_type == AggregationType.TimeSeriesAggregation: + title = self.metadata.title + abstract = self.metadata.abstract + + # delete this aggregation from Hydroshare + # TODO: If the creation of the replacement aggregation fails for some reason, then with the following + # delete action we will lose this aggregation from HydroShare. Need to keep a copy of the + # original aggregation locally so that we can upload that to HydroShare. 
+ self.delete() + + # upload the updated data file to the same location as the aggregation it's replacing - this should + # create a new aggregation of the same type + resource.file_upload(file_path) + + # retrieve the updated aggregation + aggr = resource.aggregation(file__path=aggr_path) + + # update metadata + for kw in keywords: + if kw not in aggr.metadata.subjects: + aggr.metadata.subjects.append(kw) + aggr.metadata.additional_metadata = additional_meta + if aggr_type == AggregationType.TimeSeriesAggregation: + aggr.metadata.title = title + aggr.metadata.abstract = abstract + aggr.save() + else: + # upload the data file to the path as specified by 'destination_path' to create a + # new aggregation of the same type + resource.file_upload(file_path, destination_path=destination_path) + + # retrieve the new aggregation + aggr_path = urljoin(destination_path, os.path.basename(aggr_path)) + aggr = resource.aggregation(file__path=aggr_path) + + aggr._data_object = data_object + if temp_dir is not None: + shutil.rmtree(temp_dir) + return aggr + @property def metadata_file(self): """The path to the metadata file""" @@ -239,7 +378,8 @@ def main_file_path(self) -> str: return self.files()[0].path @property - def data_object(self) -> Union[pandas.Series, fiona.Collection, rasterio.DatasetReader, xarray.Dataset, None]: + def data_object(self) -> \ + Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset', None]: return self._data_object @refresh @@ -326,7 +466,19 @@ def refresh(self) -> None: self._parsed_checksums = None self._data_object = None - def as_series(self, series_id: str, agg_path: str = None) -> pandas.DataFrame: + def delete(self) -> None: + """Deletes this aggregation from HydroShare""" + path = urljoin( + self._hsapi_path, + "functions", + "delete-file-type", + self.metadata.type.value + "LogicalFile", + self.main_file_path, + ) + self._hs_session.delete(path, status_code=200) + self.refresh() + + def as_series(self, series_id: str, agg_path: str = None) -> 'pandas.DataFrame': """ Creates a pandas DataFrame object out of an aggregation of type TimeSeries. :param series_id: The series_id of the timeseries result to be converted to a Dataframe object. @@ -334,6 +486,11 @@ def as_series(self, series_id: str, agg_path: str = None) -> pandas.DataFrame: it downloaded locally. :return: A pandas.DataFrame object """ + # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then + # make this method as a private method. + + if pandas is None: + raise Exception("pandas package not found") def to_series(timeseries_file: str): con = sqlite3.connect(timeseries_file) @@ -345,19 +502,24 @@ def to_series(timeseries_file: str): return self._get_data_object(agg_path=agg_path, func=to_series) - def as_multi_dimensional_dataset(self, agg_path: str = None) -> xarray.Dataset: + def as_multi_dimensional_dataset(self, agg_path: str = None) -> 'xarray.Dataset': """ Creates a xarray Dataset object out of an aggregation of type NetCDF. :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have it downloaded locally. :return: A xarray.Dataset object """ + # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then + # make this method as a private method. 
+ if self.metadata.type != AggregationType.MultidimensionalAggregation: raise Exception("Aggregation is not of type NetCDF") + if xarray is None: + raise Exception("xarray package not found") return self._get_data_object(agg_path=agg_path, func=xarray.open_dataset) - def as_feature_collection(self, agg_path: str = None) -> fiona.Collection: + def as_feature_collection(self, agg_path: str = None) -> 'fiona.Collection': """ Creates a fiona Collection object out of an aggregation of type GeoFeature. :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have @@ -366,12 +528,16 @@ def as_feature_collection(self, agg_path: str = None) -> fiona.Collection: Note: The caller is responsible for closing the fiona.Collection object to free up aggregation files used to create this object. """ + # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then + # make this method as a private method. + if self.metadata.type != AggregationType.GeographicFeatureAggregation: raise Exception("Aggregation is not of type GeoFeature") - + if fiona is None: + raise Exception("fiona package not found") return self._get_data_object(agg_path=agg_path, func=fiona.open) - def as_raster_dataset(self, agg_path: str = None) -> rasterio.DatasetReader: + def as_raster_dataset(self, agg_path: str = None) -> 'rasterio.DatasetReader': """ Creates a rasterio DatasetReader object out of an aggregation of type GeoRaster :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have @@ -380,14 +546,19 @@ def as_raster_dataset(self, agg_path: str = None) -> rasterio.DatasetReader: Note: The caller is responsible for closing the rasterio.DatasetReader object to free up aggregation files used to create this object. """ + # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then + # make this method as a private method. + if self.metadata.type != AggregationType.GeographicRasterAggregation: raise Exception("Aggregation is not of type GeoRaster") + if rasterio is None: + raise Exception("rasterio package not found") return self._get_data_object(agg_path=agg_path, func=rasterio.open) def as_data_object(self, series_id: str = None, agg_path: str = None) -> \ - Union[pandas.DataFrame, fiona.Collection, rasterio.DatasetReader, xarray.Dataset, None]: - """Load aggregation data to a relevant data object tyoe""" + Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset', None]: + """Load aggregation data to a relevant data object type""" if self.metadata.type == AggregationType.TimeSeriesAggregation: if not series_id: @@ -402,39 +573,65 @@ def as_data_object(self, series_id: str = None, agg_path: str = None) -> \ raise Exception(f"Data object is not supported for '{self.metadata.type}' aggregation type") - def _get_data_object(self, agg_path, func): - if self._data_object is not None and self.metadata.type != AggregationType.TimeSeriesAggregation: - return self._data_object + def update_netcdf_data(self, resource, agg_path: str = "", as_new_aggr=False, destination_path="") -> 'Aggregation': + """ + Updates the netcdf file associated with this aggregation. Then uploads the updated netcdf file + to create a new aggregation that replaces the original aggregation. + :param resource: The resource object to which this aggregation belongs. + :param agg_path: Not required. 
Include this parameter to avoid downloading the aggregation if you already have + it downloaded locally at aggr_path. + :param as_new_aggr: If True a new aggregation will be created, otherwise this aggregation will be + updated/replaced. + :param destination_path: The destination folder path where the new aggregation will be created. This folder + path must already exist in resource. This parameter is used only when 'as_new_aggr' is True. + :return: The updated netcdf aggregation or a new netcdf aggregation (an instance of Aggregation) + """ - main_file_ext = pathlib.Path(self.main_file_path).suffix - if agg_path is None: - td = tempfile.mkdtemp() - try: - self._download(unzip_to=td) - # zip extracted to folder with main file name - file_name = self.file(extension=main_file_ext).name - file_path = urljoin(td, file_name, file_name) - data_object = func(file_path) - if self.metadata.type == AggregationType.MultidimensionalAggregation: - data_object.close() - finally: - # we can delete the temporary directory for the data object created - # for these 2 aggregation types only. For other aggregation types, the generated data object - # needs to have access to the aggregation files in the temporary directory - so it's the caller's - # responsibility to delete the temporary directory - if self.metadata.type in (AggregationType.TimeSeriesAggregation, - AggregationType.MultidimensionalAggregation): - shutil.rmtree(td) - else: - file_path = urljoin(agg_path, self.file(extension=main_file_ext).name) - data_object = func(file_path) - if self.metadata.type == AggregationType.MultidimensionalAggregation: - data_object.close() + # TODO: if we decide that the user will prefer to use `save_data_object` rather than this method, then + # make this method as a private method. - # cache the object for the aggregation - self._data_object = data_object + if self.metadata.type != AggregationType.MultidimensionalAggregation: + raise Exception("Not a NetCDF aggregation") - return data_object + return self._save_data_object(resource, agg_path, as_new_aggr, destination_path) + + def update_timeseries_data(self, resource, agg_path: str = "", as_new_aggr=False, + destination_path="") -> 'Aggregation': + """ + Updates the sqlite file associated with this aggregation. Then uploads the updated sqlite file + to create a new aggregation that replaces the original aggregation. + :param resource: The resource object to which this aggregation belongs. + :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have + it downloaded locally at aggr_path. + :param as_new_aggr: If True a new aggregation will be created, otherwise this aggregation will be + updated/replaced. + :param destination_path: The destination folder path where the new aggregation will be created. This folder + path must already exist in resource. This parameter is used only when 'as_new_aggr' is True. + :return: The updated timeseries aggregation or a new timeseries aggregation (an instance of Aggregation) + """ + + # TODO: if we decide that the user will prefer to use `save_data_object` rather than this method, then + # make this method as a private method. 
+ + if self.metadata.type != AggregationType.TimeSeriesAggregation: + raise Exception("Not a timeseries aggregation") + + return self._save_data_object(resource, agg_path, as_new_aggr, destination_path) + + def save_data_object(self, resource, agg_path: str = "", as_new_aggr=False, destination_path="") -> 'Aggregation': + """ + Updates the data file(s) of this aggregation using the associated data processing object + and either updates this aggregation or creates a new aggregation using the updated data files. + """ + if self.metadata.type != AggregationType.MultidimensionalAggregation: + return self.update_netcdf_data(resource, agg_path, as_new_aggr, destination_path) + + if self.metadata.type != AggregationType.TimeSeriesAggregation: + return self.update_timeseries_data(resource, agg_path, as_new_aggr, destination_path) + + # TODO: Implement this functionality for Raster and GeoFeature aggregations + + raise Exception("Saving of data object is not supported for this aggregation type") class Resource(Aggregation): @@ -767,20 +964,12 @@ def aggregation_delete(self, aggregation: Aggregation) -> None: :param aggregation: The aggregation object to delete :return: None """ - path = urljoin( - aggregation._hsapi_path, - "functions", - "delete-file-type", - aggregation.metadata.type.value + "LogicalFile", - aggregation.main_file_path, - ) - aggregation._hs_session.delete(path, status_code=200) - aggregation.refresh() + aggregation.delete() def aggregation_download(self, aggregation: Aggregation, save_path: str = "", unzip_to: str = None) -> str: """ Download an aggregation from HydroShare - :param aggregation: The aggreation to download + :param aggregation: The aggregation to download :param save_path: The local path to save the aggregation to, defaults to the current directory :param unzip_to: If set, the resulting download will be unzipped to the specified path :return: None From 61d9ba3622c2b0ef87a2b3a844f645223f30ff48 Mon Sep 17 00:00:00 2001 From: pkdash Date: Thu, 2 Mar 2023 10:28:54 -0500 Subject: [PATCH 03/23] [#44] making the data processing package installation optional --- setup.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 179a5ed..c0913ab 100644 --- a/setup.py +++ b/setup.py @@ -12,14 +12,20 @@ 'hsmodels>=0.5.1', 'requests', 'requests_oauthlib', - 'pandas' ], + extras_require={ + "pandas": ["pandas"], + "xarray": ["netCDF4", "xarray"], + "rasterio": ["rasterio"], + "fiona": ["fiona"], + "all": ["pandas", "netCDF4", "xarray", "rasterio", "fiona"], + }, url='https://github.com/hydroshare/hsclient', license='MIT', author='Scott Black', author_email='scott.black@usu.edu', description='A python client for managing HydroShare resources', - python_requires='>=3.6', + python_requires='>=3.9', long_description=README, long_description_content_type="text/markdown", classifiers=[ From 7315cad0b4073531d2f42acf164155f9731d733b Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 3 Mar 2023 15:57:13 -0500 Subject: [PATCH 04/23] [#44] data processing objects to work only with downloaded aggregations --- hsclient/hydroshare.py | 111 ++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 73 deletions(-) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index 69e355e..28939b4 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -2,7 +2,6 @@ import os import pathlib import pickle -import shutil import sqlite3 import tempfile import time @@ -232,63 +231,37 @@ def _download(self, save_path: str = "", 
unzip_to: str = None) -> str: return unzip_to return downloaded_zip + def _validate_aggregation_path(self, agg_path: str): + main_file_ext = pathlib.Path(self.main_file_path).suffix + file_name = self.file(extension=main_file_ext).name + file_path = urljoin(agg_path, file_name) + if not os.path.exists(file_path) or not os.path.isfile(file_path): + file_path = urljoin(file_path, file_name) + if not os.path.exists(file_path): + raise Exception(f"Aggregation was not found at: {agg_path}") + return file_path + def _get_data_object(self, agg_path, func): if self._data_object is not None and self.metadata.type != AggregationType.TimeSeriesAggregation: return self._data_object - main_file_ext = pathlib.Path(self.main_file_path).suffix - if agg_path is None: - td = tempfile.mkdtemp() - try: - self._download(unzip_to=td) - # zip extracted to folder with main file name - file_name = self.file(extension=main_file_ext).name - file_path = urljoin(td, file_name, file_name) - data_object = func(file_path) - if self.metadata.type == AggregationType.MultidimensionalAggregation: - data_object.load() - data_object.close() - finally: - # we can delete the temporary directory for the data object created - # for these 2 aggregation types only. For other aggregation types, the generated data object - # needs to have access to the aggregation files in the temporary directory - so it's the caller's - # responsibility to delete the temporary directory - if self.metadata.type in (AggregationType.TimeSeriesAggregation, - AggregationType.MultidimensionalAggregation): - shutil.rmtree(td) - else: - file_path = urljoin(agg_path, self.file(extension=main_file_ext).name) - data_object = func(file_path) - if self.metadata.type == AggregationType.MultidimensionalAggregation: - data_object.close() + file_path = self._validate_aggregation_path(agg_path) + data_object = func(file_path) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + data_object.load() + data_object.close() # cache the object for the aggregation self._data_object = data_object - return data_object - def _save_data_object(self, resource, agg_path: str = "", as_new_aggr=False, destination_path=""): + def _save_data_object(self, resource, agg_path: str, as_new_aggr=False, destination_path=""): if self._data_object is None: raise Exception("No data object exists for this aggregation.") - main_file_ext = pathlib.Path(self.main_file_path).suffix - temp_dir = None - if not agg_path: - temp_dir = tempfile.mkdtemp() - try: - self._download(unzip_to=temp_dir) - # zip extracted to folder with main file name - file_name = self.file(extension=main_file_ext).name - file_path = urljoin(temp_dir, file_name, file_name) - if self.metadata.type == AggregationType.MultidimensionalAggregation: - self._data_object.to_netcdf(file_path, format="NETCDF4") - except Exception: - shutil.rmtree(temp_dir) - raise - else: - file_path = urljoin(agg_path, self.file(extension=main_file_ext).name) - if self.metadata.type == AggregationType.MultidimensionalAggregation: - self._data_object.to_netcdf(file_path, format="NETCDF4") + file_path = self._validate_aggregation_path(agg_path) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + self._data_object.to_netcdf(file_path, format="NETCDF4") if self.metadata.type == AggregationType.TimeSeriesAggregation: with closing(sqlite3.connect(file_path)) as conn: @@ -301,7 +274,7 @@ def _save_data_object(self, resource, agg_path: str = "", as_new_aggr=False, des conn.execute("DROP TABLE temp") conn.commit() - 
aggr_path = self.main_file_path + aggr_file_path = self.main_file_path data_object = self._data_object aggr_type = self.metadata.type if not as_new_aggr: @@ -325,7 +298,7 @@ def _save_data_object(self, resource, agg_path: str = "", as_new_aggr=False, des resource.file_upload(file_path) # retrieve the updated aggregation - aggr = resource.aggregation(file__path=aggr_path) + aggr = resource.aggregation(file__path=aggr_file_path) # update metadata for kw in keywords: @@ -342,12 +315,10 @@ def _save_data_object(self, resource, agg_path: str = "", as_new_aggr=False, des resource.file_upload(file_path, destination_path=destination_path) # retrieve the new aggregation - aggr_path = urljoin(destination_path, os.path.basename(aggr_path)) + aggr_path = urljoin(destination_path, os.path.basename(aggr_file_path)) aggr = resource.aggregation(file__path=aggr_path) aggr._data_object = data_object - if temp_dir is not None: - shutil.rmtree(temp_dir) return aggr @property @@ -478,12 +449,11 @@ def delete(self) -> None: self._hs_session.delete(path, status_code=200) self.refresh() - def as_series(self, series_id: str, agg_path: str = None) -> 'pandas.DataFrame': + def as_series(self, series_id: str, agg_path: str) -> 'pandas.DataFrame': """ Creates a pandas DataFrame object out of an aggregation of type TimeSeries. :param series_id: The series_id of the timeseries result to be converted to a Dataframe object. - :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have - it downloaded locally. + :param agg_path: The local path where this aggregation has been downloaded previously. :return: A pandas.DataFrame object """ # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then @@ -502,11 +472,10 @@ def to_series(timeseries_file: str): return self._get_data_object(agg_path=agg_path, func=to_series) - def as_multi_dimensional_dataset(self, agg_path: str = None) -> 'xarray.Dataset': + def as_multi_dimensional_dataset(self, agg_path: str) -> 'xarray.Dataset': """ Creates a xarray Dataset object out of an aggregation of type NetCDF. - :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have - it downloaded locally. + :param agg_path: The local path where this aggregation has been downloaded previously. :return: A xarray.Dataset object """ # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then @@ -519,11 +488,10 @@ def as_multi_dimensional_dataset(self, agg_path: str = None) -> 'xarray.Dataset' return self._get_data_object(agg_path=agg_path, func=xarray.open_dataset) - def as_feature_collection(self, agg_path: str = None) -> 'fiona.Collection': + def as_feature_collection(self, agg_path: str) -> 'fiona.Collection': """ Creates a fiona Collection object out of an aggregation of type GeoFeature. - :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have - it downloaded locally at aggr_path. + :param agg_path: The local path where this aggregation has been downloaded previously. :return: A fiona.Collection object Note: The caller is responsible for closing the fiona.Collection object to free up aggregation files used to create this object. 
@@ -537,11 +505,10 @@ def as_feature_collection(self, agg_path: str = None) -> 'fiona.Collection': raise Exception("fiona package not found") return self._get_data_object(agg_path=agg_path, func=fiona.open) - def as_raster_dataset(self, agg_path: str = None) -> 'rasterio.DatasetReader': + def as_raster_dataset(self, agg_path: str) -> 'rasterio.DatasetReader': """ Creates a rasterio DatasetReader object out of an aggregation of type GeoRaster - :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have - it downloaded locally at aggr_path. + :param agg_path: The local path where this aggregation has been downloaded previously. :return: A rasterio.DatasetReader object Note: The caller is responsible for closing the rasterio.DatasetReader object to free up aggregation files used to create this object. @@ -556,7 +523,7 @@ def as_raster_dataset(self, agg_path: str = None) -> 'rasterio.DatasetReader': return self._get_data_object(agg_path=agg_path, func=rasterio.open) - def as_data_object(self, series_id: str = None, agg_path: str = None) -> \ + def as_data_object(self, agg_path: str, series_id: str = "") -> \ Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset', None]: """Load aggregation data to a relevant data object type""" @@ -573,13 +540,12 @@ def as_data_object(self, series_id: str = None, agg_path: str = None) -> \ raise Exception(f"Data object is not supported for '{self.metadata.type}' aggregation type") - def update_netcdf_data(self, resource, agg_path: str = "", as_new_aggr=False, destination_path="") -> 'Aggregation': + def update_netcdf_data(self, resource, agg_path: str, as_new_aggr=False, destination_path="") -> 'Aggregation': """ Updates the netcdf file associated with this aggregation. Then uploads the updated netcdf file to create a new aggregation that replaces the original aggregation. :param resource: The resource object to which this aggregation belongs. - :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have - it downloaded locally at aggr_path. + :param agg_path: The local path where this aggregation has been downloaded previously. :param as_new_aggr: If True a new aggregation will be created, otherwise this aggregation will be updated/replaced. :param destination_path: The destination folder path where the new aggregation will be created. This folder @@ -595,14 +561,13 @@ def update_netcdf_data(self, resource, agg_path: str = "", as_new_aggr=False, de return self._save_data_object(resource, agg_path, as_new_aggr, destination_path) - def update_timeseries_data(self, resource, agg_path: str = "", as_new_aggr=False, + def update_timeseries_data(self, resource, agg_path: str, as_new_aggr=False, destination_path="") -> 'Aggregation': """ Updates the sqlite file associated with this aggregation. Then uploads the updated sqlite file to create a new aggregation that replaces the original aggregation. :param resource: The resource object to which this aggregation belongs. - :param agg_path: Not required. Include this parameter to avoid downloading the aggregation if you already have - it downloaded locally at aggr_path. + :param agg_path: The local path where this aggregation has been downloaded previously. :param as_new_aggr: If True a new aggregation will be created, otherwise this aggregation will be updated/replaced. :param destination_path: The destination folder path where the new aggregation will be created. 
This folder @@ -618,15 +583,15 @@ def update_timeseries_data(self, resource, agg_path: str = "", as_new_aggr=False return self._save_data_object(resource, agg_path, as_new_aggr, destination_path) - def save_data_object(self, resource, agg_path: str = "", as_new_aggr=False, destination_path="") -> 'Aggregation': + def save_data_object(self, resource, agg_path: str, as_new_aggr=False, destination_path="") -> 'Aggregation': """ Updates the data file(s) of this aggregation using the associated data processing object and either updates this aggregation or creates a new aggregation using the updated data files. """ - if self.metadata.type != AggregationType.MultidimensionalAggregation: + if self.metadata.type == AggregationType.MultidimensionalAggregation: return self.update_netcdf_data(resource, agg_path, as_new_aggr, destination_path) - if self.metadata.type != AggregationType.TimeSeriesAggregation: + if self.metadata.type == AggregationType.TimeSeriesAggregation: return self.update_timeseries_data(resource, agg_path, as_new_aggr, destination_path) # TODO: Implement this functionality for Raster and GeoFeature aggregations From 22ee6834d6a6bc433df75a153140eac0fc750043 Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 3 Mar 2023 17:52:17 -0500 Subject: [PATCH 05/23] [#44] fixing tests - marked some tests to skip for bugs in hydroshare --- tests/test_functional.py | 75 ++++++++++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index 69ba820..6710e02 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -17,7 +17,7 @@ def change_test_dir(request): @pytest.fixture() def hydroshare(change_test_dir): - hs = HydroShare(os.getenv("HYDRO_USERNAME"), os.getenv("HYDRO_PASSWORD")) + hs = HydroShare(os.getenv("HYDRO_USERNAME"), os.getenv("HYDRO_PASSWORD"), host="beta.hydroshare.org") return hs @@ -51,6 +51,24 @@ def timeseries_resource(new_resource): return new_resource +@pytest.fixture() +def resource_with_netcdf_aggr(new_resource): + files = [ + "SWE_time.nc", + "SWE_time_header_info.txt", + "SWE_time_resmap.xml", + "SWE_time_meta.xml", + ] + root_path = "data/test_resource_metadata_files/" + new_resource.file_upload(*[os.path.join(root_path, file) for file in files], refresh=False) + return new_resource + + +@pytest.fixture() +def resource_with_raster_aggr(resource): + return resource + + def test_absolute_path_multiple_file_upload(new_resource): files = [ "other.txt", @@ -185,7 +203,7 @@ def test_resource_metadata_updating(new_resource): new_resource.metadata.title = "resource test" new_resource.metadata.additional_metadata = {"key1": "value1", "key2": "value2", "key3": "value3"} new_resource.metadata.abstract = "world’s" - new_resource.metadata.relations = [Relation(type=RelationType.isCopiedFrom, value="is hosted by value")] + new_resource.metadata.relations = [Relation(type=RelationType.isReferencedBy, value="is hosted by value")] new_resource.save() @@ -198,7 +216,7 @@ def test_resource_metadata_updating(new_resource): assert new_resource.metadata.additional_metadata["key3"] == "value3" assert new_resource.metadata.abstract == "world’s" - assert new_resource.metadata.relations == [Relation(type=RelationType.isCopiedFrom, value="is hosted by value")] + assert new_resource.metadata.relations == [Relation(type=RelationType.isReferencedBy, value="is hosted by value")] def test_system_metadata(new_resource): @@ -290,6 +308,7 @@ def test_aggregation_delete(resource): assert 
len(resource.files()) == 1 +@pytest.mark.skip(reason="this test fails due to a bug (#4995) in hydroshare") def test_aggregation_remove(resource): resource.refresh() assert len(resource.aggregations()) == 1 @@ -388,6 +407,7 @@ def test_empty_creator(new_resource): assert "creators list must have at least one creator" in str(e) +@pytest.mark.skip(reason="this test fails due to a bug (#4995) in hydroshare") @pytest.mark.parametrize( "files", [ @@ -416,21 +436,36 @@ def test_empty_creator(new_resource): def test_aggregations(new_resource, files): root_path = "data/test_resource_metadata_files/" file_count = len(files) - 2 # exclude rdf/xml file + aggr_file_count = file_count new_resource.file_upload(*[os.path.join(root_path, file) for file in files]) assert len(new_resource.aggregations()) == 1 assert len(new_resource.files()) == 0 agg = new_resource.aggregations()[0] agg_type = agg.metadata.type - assert len(agg.files()) == file_count + assert len(agg.files()) == aggr_file_count new_resource.aggregation_remove(agg) assert len(new_resource.aggregations()) == 0 + if agg_type == "GeoRaster": + # TODO: Due to a bug (#4995) in hydroshare, the vrt file of the aggregation gets deleted when the aggregation + # is removed + file_count = file_count - 1 + elif agg_type == "NetCDF": + # the txt file of the aggregation gets deleted when the netcdf aggregation is removed. + file_count = file_count - 1 + assert len(new_resource.files()) == file_count + if agg_type == "GeoRaster": + # TODO: Due to a bug (#4995) in hydroshare, the vrt file of the aggregation gets deleted when the aggregation + # is removed -so we need to upload that vrt file again for now + new_resource.file_upload(os.path.join(root_path, files[2])) + assert len(new_resource.files()) == file_count + 1 + main_file = next(f for f in new_resource.files() if f.path.endswith(files[0])) assert main_file agg = new_resource.file_aggregate(main_file, agg_type) assert len(new_resource.aggregations()) == 1 assert len(new_resource.files()) == 0 - assert len(agg.files()) == file_count + assert len(agg.files()) == aggr_file_count with tempfile.TemporaryDirectory() as tmp: new_resource.aggregation_download(agg, tmp) files = os.listdir(tmp) @@ -440,6 +475,7 @@ def test_aggregations(new_resource, files): assert len(new_resource.files()) == 0 +@pytest.mark.skip(reason="there is a bug (#4998) in hydroshare that causes this test to fail") @pytest.mark.parametrize( "files", [ @@ -478,7 +514,7 @@ def test_aggregation_fileset(new_resource, files): assert len(new_resource.files()) == 0 -def test_pandas_series_local(timeseries_resource): +def test_pandas_series(timeseries_resource): timeseries_resource.refresh() timeseries = timeseries_resource.aggregation(type=AggregationType.TimeSeriesAggregation) series_result = next( @@ -488,14 +524,22 @@ def test_pandas_series_local(timeseries_resource): assert len(series) == 1333 -def test_pandas_series_remote(timeseries_resource): - timeseries_resource.refresh() - timeseries = timeseries_resource.aggregation(type=AggregationType.TimeSeriesAggregation) - series_result = next( - r for r in timeseries.metadata.time_series_results if r.series_id == "3b9037f8-1ebc-11e6-a304-f45c8999816f" - ) - series_map = timeseries.as_series(series_result.series_id) - assert len(series_map) == 1440 +def test_raster_as_data_object(resource_with_raster_aggr): + resource_with_raster_aggr.refresh() + raster_aggr = resource_with_raster_aggr.aggregation(type=AggregationType.GeographicRasterAggregation) + dataset = 
raster_aggr.as_data_object(agg_path="data/test_resource_metadata_files") + assert dataset.__class__.__name__ == "DatasetReader" + # raster should have 1 band + assert dataset.count == 1 + + +def test_netcdf_as_data_object(resource_with_netcdf_aggr): + resource_with_netcdf_aggr.refresh() + nc_aggr = resource_with_netcdf_aggr.aggregation(type=AggregationType.MultidimensionalAggregation) + dataset = nc_aggr.as_data_object(agg_path="data/test_resource_metadata_files") + assert dataset.__class__.__name__ == "Dataset" + # netcdf dimensions + assert dataset.dims['time'] == 2184 def test_folder_zip(new_resource): @@ -556,6 +600,8 @@ def test_filename_spaces(hydroshare): filename = res.file_download(file, save_path=td) assert os.path.basename(filename) == "with spaces file.txt" + res.delete() + def test_copy(new_resource): try: @@ -587,5 +633,6 @@ def test_resource_public(resource): resource.set_sharing_status(public=False) assert resource.system_metadata()['public'] is False + def test_instantiate_hydroshare_object_without_args(): HydroShare() From ee1e0090754811c33b33aac3053583ea93220dd2 Mon Sep 17 00:00:00 2001 From: pkdash Date: Tue, 7 Mar 2023 15:14:51 -0500 Subject: [PATCH 06/23] [#44] initial work for editing data for geo-feature aggregation --- hsclient/hydroshare.py | 315 ++++++++++++++++++++++++++++++----------- 1 file changed, 231 insertions(+), 84 deletions(-) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index 28939b4..18a490e 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -2,6 +2,7 @@ import os import pathlib import pickle +import shutil import sqlite3 import tempfile import time @@ -11,7 +12,7 @@ from functools import wraps from posixpath import basename, dirname, join as urljoin, splitext from pprint import pformat -from typing import Dict, List, Union, TYPE_CHECKING +from typing import Dict, List, Union, TYPE_CHECKING, Callable from urllib.parse import quote, unquote, urlparse from zipfile import ZipFile @@ -231,7 +232,7 @@ def _download(self, save_path: str = "", unzip_to: str = None) -> str: return unzip_to return downloaded_zip - def _validate_aggregation_path(self, agg_path: str): + def _validate_aggregation_path(self, agg_path: str, for_save_data: bool = False) -> str: main_file_ext = pathlib.Path(self.main_file_path).suffix file_name = self.file(extension=main_file_ext).name file_path = urljoin(agg_path, file_name) @@ -239,9 +240,26 @@ def _validate_aggregation_path(self, agg_path: str): file_path = urljoin(file_path, file_name) if not os.path.exists(file_path): raise Exception(f"Aggregation was not found at: {agg_path}") + + if for_save_data: + if self.metadata.type == AggregationType.GeographicFeatureAggregation: + if file_path == self._data_object.path: + raise Exception(f"Aggregation path '{agg_path}' is not a valid path. This should be a path where " + f"you have the updated shape files") + else: + for aggr_file in self.files(): + aggr_file = basename(aggr_file) + if aggr_file.endswith(".shp.xml") or aggr_file.endswith(".sbn") or aggr_file.endswith(".sbx"): + # these are optional files for geo feature aggregation + continue + if not os.path.exists(os.path.join(agg_path, aggr_file)): + raise Exception(f"Aggregation path '{agg_path}' is not a valid path. 
" + f"Missing file '{aggr_file}'") return file_path - def _get_data_object(self, agg_path, func): + def _get_data_object(self, agg_path: str, func: Callable) -> \ + Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset']: + if self._data_object is not None and self.metadata.type != AggregationType.TimeSeriesAggregation: return self._data_object @@ -251,75 +269,21 @@ def _get_data_object(self, agg_path, func): data_object.load() data_object.close() - # cache the object for the aggregation + # cache the data object for the aggregation self._data_object = data_object return data_object - def _save_data_object(self, resource, agg_path: str, as_new_aggr=False, destination_path=""): + def _validate_aggregation_for_update(self, resource: 'Resource', agg_type: AggregationType) -> None: + if self.metadata.type != agg_type: + raise Exception(f"Not a {agg_type.value} aggregation") + if self._data_object is None: raise Exception("No data object exists for this aggregation.") - file_path = self._validate_aggregation_path(agg_path) - if self.metadata.type == AggregationType.MultidimensionalAggregation: - self._data_object.to_netcdf(file_path, format="NETCDF4") - - if self.metadata.type == AggregationType.TimeSeriesAggregation: - with closing(sqlite3.connect(file_path)) as conn: - # write the dataframe to a temp table - self._data_object.to_sql('temp', conn, if_exists='replace', index=False) - # delete the matching records from the TimeSeriesResultValues table - conn.execute("DELETE FROM TimeSeriesResultValues WHERE ResultID IN (SELECT ResultID FROM temp)") - conn.execute("INSERT INTO TimeSeriesResultValues SELECT * FROM temp") - # delete the temp table - conn.execute("DROP TABLE temp") - conn.commit() - - aggr_file_path = self.main_file_path - data_object = self._data_object - aggr_type = self.metadata.type - if not as_new_aggr: - # cache some of the metadata fields of the original aggregation to update the metadata of the - # updated aggregation - # TODO: There may be additional metadata fields that we need to consider to use for the updated aggregation - keywords = self.metadata.subjects - additional_meta = self.metadata.additional_metadata - if aggr_type == AggregationType.TimeSeriesAggregation: - title = self.metadata.title - abstract = self.metadata.abstract - - # delete this aggregation from Hydroshare - # TODO: If the creation of the replacement aggregation fails for some reason, then with the following - # delete action we will lose this aggregation from HydroShare. Need to keep a copy of the - # original aggregation locally so that we can upload that to HydroShare. 
- self.delete() - - # upload the updated data file to the same location as the aggregation it's replacing - this should - # create a new aggregation of the same type - resource.file_upload(file_path) - - # retrieve the updated aggregation - aggr = resource.aggregation(file__path=aggr_file_path) - - # update metadata - for kw in keywords: - if kw not in aggr.metadata.subjects: - aggr.metadata.subjects.append(kw) - aggr.metadata.additional_metadata = additional_meta - if aggr_type == AggregationType.TimeSeriesAggregation: - aggr.metadata.title = title - aggr.metadata.abstract = abstract - aggr.save() - else: - # upload the data file to the path as specified by 'destination_path' to create a - # new aggregation of the same type - resource.file_upload(file_path, destination_path=destination_path) - - # retrieve the new aggregation - aggr_path = urljoin(destination_path, os.path.basename(aggr_file_path)) - aggr = resource.aggregation(file__path=aggr_path) - - aggr._data_object = data_object - return aggr + # check this aggregation is part of the specified resource + aggr = resource.aggregation(file__path=self.main_file_path) + if aggr is None: + raise Exception("This aggregation is not part of the specified resource.") @property def metadata_file(self): @@ -484,7 +448,7 @@ def as_multi_dimensional_dataset(self, agg_path: str) -> 'xarray.Dataset': if self.metadata.type != AggregationType.MultidimensionalAggregation: raise Exception("Aggregation is not of type NetCDF") if xarray is None: - raise Exception("xarray package not found") + raise Exception("xarray package was not found") return self._get_data_object(agg_path=agg_path, func=xarray.open_dataset) @@ -502,7 +466,7 @@ def as_feature_collection(self, agg_path: str) -> 'fiona.Collection': if self.metadata.type != AggregationType.GeographicFeatureAggregation: raise Exception("Aggregation is not of type GeoFeature") if fiona is None: - raise Exception("fiona package not found") + raise Exception("fiona package was not found") return self._get_data_object(agg_path=agg_path, func=fiona.open) def as_raster_dataset(self, agg_path: str) -> 'rasterio.DatasetReader': @@ -519,17 +483,25 @@ def as_raster_dataset(self, agg_path: str) -> 'rasterio.DatasetReader': if self.metadata.type != AggregationType.GeographicRasterAggregation: raise Exception("Aggregation is not of type GeoRaster") if rasterio is None: - raise Exception("rasterio package not found") + raise Exception("rasterio package was not found") return self._get_data_object(agg_path=agg_path, func=rasterio.open) def as_data_object(self, agg_path: str, series_id: str = "") -> \ - Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset', None]: - """Load aggregation data to a relevant data object type""" + Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset']: + """ + Loads aggregation data to a relevant data object type. Data for a timeseries aggregation is loaded as pandas + DataFrame, data for a geo feature aggregation os loaded as a fiona Collection object, data for a raster + aggregation is loaded as rasterio DatasetReader object, and data for a netcdf aggregation is loaded as xarray + Dataset object. + :param agg_path: The local path where this aggregation has been downloaded previously. + :param series_id: The series_id of the timeseries result to be converted to a Dataframe object. A value for this + parameter is required only for a timeseries aggregation. 
+ """ if self.metadata.type == AggregationType.TimeSeriesAggregation: if not series_id: - raise Exception("Please specify series_id for which the timeseries data object is needed.") + raise Exception("Provide the series_id for which the timeseries data object is needed.") return self.as_series(series_id=series_id, agg_path=agg_path) if self.metadata.type == AggregationType.MultidimensionalAggregation: return self.as_multi_dimensional_dataset(agg_path=agg_path) @@ -540,7 +512,8 @@ def as_data_object(self, agg_path: str, series_id: str = "") -> \ raise Exception(f"Data object is not supported for '{self.metadata.type}' aggregation type") - def update_netcdf_data(self, resource, agg_path: str, as_new_aggr=False, destination_path="") -> 'Aggregation': + def update_netcdf_data(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, + destination_path: str = "") -> 'Aggregation': """ Updates the netcdf file associated with this aggregation. Then uploads the updated netcdf file to create a new aggregation that replaces the original aggregation. @@ -556,16 +529,50 @@ def update_netcdf_data(self, resource, agg_path: str, as_new_aggr=False, destina # TODO: if we decide that the user will prefer to use `save_data_object` rather than this method, then # make this method as a private method. - if self.metadata.type != AggregationType.MultidimensionalAggregation: - raise Exception("Not a NetCDF aggregation") + self._validate_aggregation_for_update(resource, AggregationType.MultidimensionalAggregation) + file_path = self._validate_aggregation_path(agg_path, for_save_data=True) + self._data_object.to_netcdf(file_path, format="NETCDF4") + aggr_main_file_path = self.main_file_path + data_object = self._data_object + if not as_new_aggr: + destination_path = dirname(self.main_file_path) + + # cache some of the metadata fields of the original aggregation to update the metadata of the + # updated aggregation + keywords = self.metadata.subjects + additional_meta = self.metadata.additional_metadata + + # TODO: keep a local backup copy of the aggregation before deleting it + self.delete() + resource.file_upload(file_path, destination_path=destination_path) + + # retrieve the updated aggregation + aggr = resource.aggregation(file__path=aggr_main_file_path) - return self._save_data_object(resource, agg_path, as_new_aggr, destination_path) + # update metadata + for kw in keywords: + if kw not in aggr.metadata.subjects: + aggr.metadata.subjects.append(kw) + aggr.metadata.additional_metadata = additional_meta + aggr.save() + else: + # creating a new aggregation by uploading the updated data files + resource.file_upload(file_path, destination_path=destination_path) - def update_timeseries_data(self, resource, agg_path: str, as_new_aggr=False, - destination_path="") -> 'Aggregation': + # retrieve the new aggregation + agg_path = urljoin(destination_path, os.path.basename(aggr_main_file_path)) + aggr = resource.aggregation(file__path=agg_path) + data_object = None + + aggr._data_object = data_object + return aggr + + def update_timeseries_data(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, + destination_path: str = "") -> 'Aggregation': """ Updates the sqlite file associated with this aggregation. Then uploads the updated sqlite file - to create a new aggregation that replaces the original aggregation. + to create a new aggregation that either replaces the original aggregation or adds as a new + aggregation. :param resource: The resource object to which this aggregation belongs. 
:param agg_path: The local path where this aggregation has been downloaded previously. :param as_new_aggr: If True a new aggregation will be created, otherwise this aggregation will be @@ -578,12 +585,149 @@ def update_timeseries_data(self, resource, agg_path: str, as_new_aggr=False, # TODO: if we decide that the user will prefer to use `save_data_object` rather than this method, then # make this method as a private method. - if self.metadata.type != AggregationType.TimeSeriesAggregation: - raise Exception("Not a timeseries aggregation") + self._validate_aggregation_for_update(resource, AggregationType.TimeSeriesAggregation) + file_path = self._validate_aggregation_path(agg_path, for_save_data=True) + with closing(sqlite3.connect(file_path)) as conn: + # write the dataframe to a temp table + self._data_object.to_sql('temp', conn, if_exists='replace', index=False) + # delete the matching records from the TimeSeriesResultValues table + conn.execute("DELETE FROM TimeSeriesResultValues WHERE ResultID IN (SELECT ResultID FROM temp)") + conn.execute("INSERT INTO TimeSeriesResultValues SELECT * FROM temp") + # delete the temp table + conn.execute("DROP TABLE temp") + conn.commit() + + aggr_main_file_path = self.main_file_path + data_object = self._data_object + if not as_new_aggr: + destination_path = dirname(self.main_file_path) - return self._save_data_object(resource, agg_path, as_new_aggr, destination_path) + # cache some of the metadata fields of the original aggregation to update the metadata of the + # updated aggregation + keywords = self.metadata.subjects + additional_meta = self.metadata.additional_metadata + title = self.metadata.title + abstract = self.metadata.abstract + + # TODO: If the creation of the replacement aggregation fails for some reason, then with the following + # delete action we will lose this aggregation from HydroShare. Need to keep a copy of the + # original aggregation locally so that we can upload that to HydroShare if needed. + self.delete() + resource.file_upload(file_path, destination_path=destination_path) + # retrieve the updated aggregation + aggr = resource.aggregation(file__path=aggr_main_file_path) + + # update metadata + for kw in keywords: + if kw not in aggr.metadata.subjects: + aggr.metadata.subjects.append(kw) + aggr.metadata.additional_metadata = additional_meta + aggr.metadata.title = title + aggr.metadata.abstract = abstract + aggr.save() + else: + # creating a new aggregation by uploading the updated data files + resource.file_upload(file_path, destination_path=destination_path) + + # retrieve the new aggregation + agg_path = urljoin(destination_path, os.path.basename(aggr_main_file_path)) + aggr = resource.aggregation(file__path=agg_path) + data_object = None - def save_data_object(self, resource, agg_path: str, as_new_aggr=False, destination_path="") -> 'Aggregation': + aggr._data_object = data_object + return aggr + + def update_geo_feature_data(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, + destination_path: str = "") -> 'Aggregation': + """ + Updates the shape files associated with this aggregation. Then uploads all files associated with this + aggregation to create a new aggregation that either replaces the original aggregation or adds as a new + aggregation. + :param resource: The resource object to which this aggregation belongs. + :param agg_path: The local path where this aggregation has been downloaded previously. 
+ :param as_new_aggr: If True a new aggregation will be created, otherwise this aggregation will be + updated/replaced. + :param destination_path: The destination folder path where the new aggregation will be created. This folder + path must already exist in resource. This parameter is used only when 'as_new_aggr' is True. + :return: The updated geo-feature aggregation or a new geo-feature aggregation (an instance of Aggregation) + """ + + # TODO: if we decide that the user will prefer to use `save_data_object` rather than this method, then + # make this method as a private method. + + def upload_shape_files(main_file_path, dst_path=""): + shp_file_dir_path = os.path.dirname(main_file_path) + filename_starts_with = f"{pathlib.Path(main_file_path).stem}." + shape_files = [] + for item in os.listdir(shp_file_dir_path): + if item.startswith(filename_starts_with): + file_full_path = os.path.join(shp_file_dir_path, item) + shape_files.append(file_full_path) + resource.file_upload(*shape_files, destination_path=dst_path) + + self._validate_aggregation_for_update(resource, AggregationType.GeographicFeatureAggregation) + file_path = self._validate_aggregation_path(agg_path, for_save_data=True) + aggr_main_file_path = self.main_file_path + data_object = self._data_object + if not as_new_aggr: + destination_path = dirname(self.main_file_path) + + # cache some of the metadata fields of the original aggregation to update the metadata of the + # updated aggregation + keywords = self.metadata.subjects + additional_meta = self.metadata.additional_metadata + + # TODO: keep a local backup copy of the aggregation before deleting it + self.delete() + # copy the updated shape files to the original shape file location where the user downloaded the + # aggregation previously + src_shp_file_dir_path = os.path.dirname(file_path) + tgt_shp_file_dir_path = os.path.dirname(data_object.path) + agg_path = tgt_shp_file_dir_path + filename_starts_with = f"{pathlib.Path(file_path).stem}." 
+ + # need to close the fiona.Collection object to free up access to all the original shape files + data_object.close() + + for item in os.listdir(src_shp_file_dir_path): + if item.startswith(filename_starts_with): + src_file_full_path = os.path.join(src_shp_file_dir_path, item) + tgt_file_full_path = os.path.join(tgt_shp_file_dir_path, item) + shutil.copyfile(src_file_full_path, tgt_file_full_path) + + # upload the updated shape files to replace this aggregation + upload_shape_files(main_file_path=data_object.path, dst_path=destination_path) + + # retrieve the updated aggregation + aggr = resource.aggregation(file__path=aggr_main_file_path) + + # update aggregation metadata + for kw in keywords: + if kw not in aggr.metadata.subjects: + aggr.metadata.subjects.append(kw) + aggr.metadata.additional_metadata = additional_meta + aggr.save() + + # load aggregation data to fiona Collection object + data_object = aggr.as_data_object(agg_path=agg_path) + else: + # creating a new aggregation + # close the original fiona Collection object + data_object.close() + + # upload the updated shape files to create a new geo feature aggregation + upload_shape_files(main_file_path=file_path, dst_path=destination_path) + + # retrieve the new aggregation + agg_path = urljoin(destination_path, os.path.basename(aggr_main_file_path)) + aggr = resource.aggregation(file__path=agg_path) + data_object = None + + aggr._data_object = data_object + return aggr + + def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, + destination_path: str = "") -> 'Aggregation': """ Updates the data file(s) of this aggregation using the associated data processing object and either updates this aggregation or creates a new aggregation using the updated data files. @@ -594,7 +738,10 @@ def save_data_object(self, resource, agg_path: str, as_new_aggr=False, destinati if self.metadata.type == AggregationType.TimeSeriesAggregation: return self.update_timeseries_data(resource, agg_path, as_new_aggr, destination_path) - # TODO: Implement this functionality for Raster and GeoFeature aggregations + if self.metadata.type == AggregationType.GeographicFeatureAggregation: + return self.update_geo_feature_data(resource, agg_path, as_new_aggr, destination_path) + + # TODO: Implement this functionality for Raster aggregation raise Exception("Saving of data object is not supported for this aggregation type") From 07d8a24eecc5079b3267c1ac4bb42b5900efb28f Mon Sep 17 00:00:00 2001 From: Jeff Horsburgh Date: Tue, 21 Mar 2023 20:52:52 -0600 Subject: [PATCH 07/23] Update example notebooks Updated example Jupyter notebooks with changes based on testing. Also minor edits to text to change "hs rdf" to "hsclient". 
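
For context, the data object workflow these example notebooks walk through looks roughly like the
sketch below (a minimal, illustrative sequence only; the credentials, resource identifier, series id,
local download folder, and column name are hypothetical placeholders, not values from this change set):

    from hsclient import HydroShare

    hs = HydroShare(username='my_username', password='my_password')  # placeholder credentials
    res = hs.resource('<resource_id>')                               # hypothetical resource identifier

    # Locate the time series aggregation and load its data into a pandas DataFrame.
    # local_path points at a folder where this aggregation was downloaded and unzipped earlier.
    agg = res.aggregation(type='TimeSeries')
    local_path = '<local_download_folder>'
    df = agg.as_data_object(agg_path=local_path, series_id='<series_id>')

    # Edit the cached DataFrame in place, then write it back to the aggregation's sqlite file and
    # replace the aggregation in HydroShare with the updated copy.
    df['DataValue'] = df['DataValue'] * 2   # 'DataValue' is the ODM2 value column in the example data
    agg = agg.save_data_object(res, agg_path=local_path)
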
--- docs/examples/Aggregation_Operations.ipynb | 107 +++++++------- docs/examples/Basic_Operations.ipynb | 63 +++++---- docs/examples/File_Operations.ipynb | 105 +++++++------- docs/examples/Metadata_Operations.ipynb | 155 +++++++++------------ 4 files changed, 201 insertions(+), 229 deletions(-) diff --git a/docs/examples/Aggregation_Operations.ipynb b/docs/examples/Aggregation_Operations.ipynb index 0ae9948..a08a08e 100644 --- a/docs/examples/Aggregation_Operations.ipynb +++ b/docs/examples/Aggregation_Operations.ipynb @@ -6,13 +6,13 @@ "id": "HHsuQMMJyms4" }, "source": [ - "# HS RDF HydroShare Python Client Resource Aggregation Operation Examples \n", + "# hsclient HydroShare Python Client Resource Aggregation Operation Examples\n", "\n", "\n", "---\n", "\n", "\n", - "The following code snippets show examples for how to use the HS RDF HydroShare Python Client to manipulate aggregations of known content types in HydroShare. HydroShare's content type aggregations include individual file, fileset, time series, geographic feature, geographic raster, and multidimensional NetCDF." + "The following code snippets show examples for how to use the hsclient HydroShare Python Client to manipulate aggregations of known content types in HydroShare. HydroShare's content type aggregations include individual file, fileset, time series, geographic feature, geographic raster, and multidimensional NetCDF." ] }, { @@ -21,9 +21,9 @@ "id": "b_Tj5gJx0fRj" }, "source": [ - "## Install the HS RDF Python Client\n", + "## Install the hsclient Python Client\n", "\n", - "The HS RDF Python Client for HydroShare won't be installed by default, so it has to be installed first before you can work with it. Use the following command to install the Python Client from the GitHub repository. Eventually we will distribute this package via the Python Package Index (PyPi) so that it can be installed via pip from PyPi." + "The hsclient Python Client for HydroShare may not be installed by default in your Python environment, so it has to be installed first before you can work with it. Use the following command to install hsclient via the Python Package Index (PyPi)." ] }, { @@ -87,11 +87,11 @@ "new_resource = hs.create()\n", "\n", "# Get the HydroShare identifier for the new resource\n", - "resIdentifier = new_resource.resource_id\n", - "print('The HydroShare Identifier for your new resource is: ' + resIdentifier)\n", + "res_identifier = new_resource.resource_id\n", + "print(f'The HydroShare Identifier for your new resource is: {res_identifier}')\n", "\n", "# Construct a hyperlink for the new resource\n", - "print('Your new resource is available at: ' + new_resource.metadata.url)" + "print(f'Your new resource is available at: {new_resource.metadata.url}')" ] }, { @@ -136,19 +136,19 @@ "# Import the aggregation types\n", "from hsmodels.schemas.enums import AggregationType\n", "\n", - "# Upload a single content file to the resource. This is a generic sample comma separated \n", + "# Upload a single content file to the resource. 
This is a generic sample comma separated\n", "# values (CSV) data file with some tabular data\n", "new_resource.file_upload('Example_Files/Data_File1.csv')\n", "\n", "# Specify the file you want to add the aggregation to\n", - "file = new_resource.file(path=\"Data_File1.csv\")\n", + "file = new_resource.file(path='Data_File1.csv')\n", "\n", "# Create a single file aggregation on the file and refresh the resource\n", "agg = new_resource.file_aggregate(file, AggregationType.SingleFileAggregation)\n", "\n", "# Print the title for the aggregation that was added to the resource\n", - "print('The following aggregation was added to the resource: ' + agg.metadata.title)\n", - "print('Aggregation type: ' + agg.metadata.type)" + "print(f'The following aggregation was added to the resource: {agg.metadata.title}')\n", + "print(f'Aggregation type: {agg.metadata.type}')" ] }, { @@ -178,14 +178,14 @@ "outputs": [], "source": [ "# Set the title and subject keywords for the aggregation\n", - "agg.metadata.title = \"A CSV Data File Single File Aggregation\"\n", - "agg.metadata.subjects = ['CSV','Aggregation', 'Single File','Data']\n", + "agg.metadata.title = 'A CSV Data File Single File Aggregation'\n", + "agg.metadata.subjects = ['CSV', 'Aggregation', 'Single File', 'Data']\n", "\n", "# Print the title and keywords for the aggregation\n", - "print('Aggregation Title: ' + agg.metadata.title)\n", - "print('Aggregation Keywords: ' + ', '.join(agg.metadata.subjects))\n", + "print(f'Aggregation Title: {agg.metadata.title}')\n", + "print(f'Aggregation Keywords: {\", \".join(agg.metadata.subjects)}')\n", "\n", - "# Save the aggregation to write all of the metadata to HydroShare\n", + "# Save the aggregation to write the metadata to HydroShare\n", "agg.save()" ] }, @@ -204,27 +204,27 @@ "metadata": {}, "outputs": [], "source": [ - "# Add an extended metadata element to the aggregation as a key-value pair \n", + "# Add an extended metadata element to the aggregation as a key-value pair\n", "agg.metadata.additional_metadata['New Element Key'] = 'Text value of new element.'\n", "\n", "# Remove an individual key-value pair from the aggregation using its key\n", "del agg.metadata.additional_metadata['New Element Key']\n", "\n", - "# Or, you can clear out all of the extended metadata elements that might exist\n", + "# Or, you can clear out all extended metadata elements that might exist\n", "agg.metadata.additional_metadata.clear()\n", "\n", "# Add multiple key-value pairs to the aggregation at once using a Python dictionary\n", "agg.metadata.additional_metadata = {\n", " 'Observed Variable': 'Water use',\n", - " 'Site Location': 'Valley View Tower Dormatory on Utah State University\\'s Campus in Logan, UT'\n", + " 'Site Location': 'Valley View Tower Dormitory on Utah State University\\'s Campus in Logan, UT'\n", "}\n", "\n", "# Print the extended metadata elements\n", "print('The extended metadata elements for the aggregation include:')\n", "for key, value in agg.metadata.additional_metadata.items():\n", - " print (key + ':', value)\n", - " \n", - "# Save the aggregation to write all of the metadata to HydroShare\n", + " print(key + ':', value)\n", + "\n", + "# Save the aggregation to write the metadata to HydroShare\n", "agg.save()" ] }, @@ -276,14 +276,14 @@ "agg.metadata.period_coverage = PeriodCoverage(start=beginDate, end=endDate)\n", "\n", "# Print the temporal coverage information\n", - "print('Temporal Coverage')\n", + "print('Temporal Coverage:')\n", "print(agg.metadata.period_coverage)\n", "\n", "# Print the 
spatial coverage information\n", - "print('\\nSpatial Coverage')\n", + "print('\\nSpatial Coverage:')\n", "print(agg.metadata.spatial_coverage)\n", "\n", - "# Save the aggregation to write all of the metadata to HydroShare\n", + "# Save the aggregation to write the metadata to HydroShare\n", "agg.save()" ] }, @@ -309,10 +309,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Create a list of the files that make up the shapefile to be uploaded \n", - "file_list = ['Example_Files/watersheds.cpg', 'Example_Files/watersheds.dbf', \n", + "# Create a list of the files that make up the shapefile to be uploaded\n", + "file_list = ['Example_Files/watersheds.cpg', 'Example_Files/watersheds.dbf',\n", " 'Example_Files/watersheds.prj', 'Example_Files/watersheds.sbn',\n", - " 'Example_Files/watersheds.sbx', 'Example_Files/watersheds.shp', \n", + " 'Example_Files/watersheds.sbx', 'Example_Files/watersheds.shp',\n", " 'Example_Files/watersheds.shx', 'Example_Files/watersheds.shp.xml']\n", "\n", "# Upload the files to the resource all at the same time\n", @@ -327,7 +327,7 @@ "source": [ "If you upload all of the files of a shapefile together as shown above, HydroShare automatically recognizes the files as a shapefile and auto-aggregates the files into a geographic feature aggregation for you. So, you then just need to get the aggregation that was created if you want to further operate on it - e.g., to modify the aggregation-level metadata.\n", "\n", - "Metadata for a geographic feature aggregation includes a title, subject keywords, extended key-value pairs, temporal coverage, spatial coverage, geometry information, spatial reference, and field information. When HydroShare creates the aggregation on the shapefile, the spatial coverage, geometry information, spatial reference, and attribute field information metadata will be automatically set for you. You can then set all of the other metadata elements as shown above for the single file aggregation if you need to." + "Metadata for a geographic feature aggregation includes a title, subject keywords, extended key-value pairs, temporal coverage, spatial coverage, geometry information, spatial reference, and attribute field information. When HydroShare creates the aggregation on the shapefile, the spatial coverage, geometry information, spatial reference, and attribute field information metadata will be automatically set for you. You can then set all of the other metadata elements as shown above for the single file aggregation if you need to." 
] }, { @@ -339,15 +339,15 @@ "# Get the aggregation that was just created\n", "\n", "# You can get the aggregation by searching for a file that is inside of it\n", - "agg = new_resource.aggregation(file__name=\"watersheds.shp\")\n", + "agg = new_resource.aggregation(file__name='watersheds.shp')\n", "\n", "# Or, you can get the aggregation by searching for its title, which is initially\n", "# set to the name of the shapefile\n", - "agg = new_resource.aggregation(title=\"watersheds\")\n", + "agg = new_resource.aggregation(title='watersheds')\n", "\n", "# Print the title for the aggregation that was added to the resource\n", - "print('The following aggregation was added to the resource: ' + agg.metadata.title)\n", - "print('Aggregation type: ' + agg.metadata.type)" + "print(f'The following aggregation was added to the resource: {agg.metadata.title}')\n", + "print(f'Aggregation type: {agg.metadata.type}')" ] }, { @@ -373,11 +373,11 @@ "\n", "# Get the aggregation that was just created - initially the title will be \"logan1\"\n", "# based on the name of the first .tif file that appears in the .vrt file\n", - "agg = new_resource.aggregation(title=\"logan1\")\n", + "agg = new_resource.aggregation(title='logan1')\n", "\n", "# Print the title for the aggregation that was added to the resource\n", - "print('The following aggregation was added to the resource: ' + agg.metadata.title)\n", - "print('Aggregation type: ' + agg.metadata.type)" + "print(f'The following aggregation was added to the resource: {agg.metadata.title}')\n", + "print(f'Aggregation type: {agg.metadata.type}')" ] }, { @@ -399,11 +399,11 @@ "new_resource.file_upload('Example_Files/SWE_time.nc')\n", "\n", "# Get the aggregation by searching for the NetCDF file that is inside of it\n", - "agg = new_resource.aggregation(file__name=\"SWE_time.nc\")\n", + "agg = new_resource.aggregation(file__name='SWE_time.nc')\n", "\n", "# Print the title for the aggregation that was added to the resource\n", - "print('The following aggregation was added to the resource: ' + agg.metadata.title)\n", - "print('Aggregation type: ' + agg.metadata.type)" + "print(f'The following aggregation was added to the resource: {agg.metadata.title}')\n", + "print(f'Aggregation type: {agg.metadata.type}')" ] }, { @@ -425,11 +425,11 @@ "new_resource.file_upload('Example_Files/ODM2.sqlite')\n", "\n", "# Get the aggregation by searching for the SQLite file that is inside of it\n", - "agg = new_resource.aggregation(file__name=\"ODM2.sqlite\")\n", + "agg = new_resource.aggregation(file__name='ODM2.sqlite')\n", "\n", "# Print the title for the aggregation that was added to the resource\n", - "print('The following aggregation was added to the resource: ' + agg.metadata.title)\n", - "print('Aggregation type: ' + agg.metadata.type)" + "print(f'The following aggregation was added to the resource: {agg.metadata.title}')\n", + "print(f'Aggregation type: {agg.metadata.type}')" ] }, { @@ -452,7 +452,7 @@ "\n", "# Add some files to the folder\n", "new_resource.file_upload('Example_Files/Data_File1.csv', 'Example_Files/Data_File2.csv',\n", - " destination_path='Fileset_Aggregation')\n", + " destination_path='Fileset_Aggregation')\n", "\n", "# TODO: How to set a fileset aggregation on a folder containing files?" 
] @@ -473,14 +473,14 @@ "outputs": [], "source": [ "# Get the time series aggregation that was created above\n", - "agg = new_resource.aggregation(type=\"TimeSeries\")\n", + "agg = new_resource.aggregation(type='TimeSeries')\n", "\n", "# Print the metadata associated with the aggregation\n", - "print('Aggregation Title: ' + agg.metadata.title)\n", - "print('Aggregation Type: ' + agg.metadata.type)\n", - "print('Aggregation Keywords: ' + ', '.join(agg.metadata.subjects))\n", - "print('Aggregation Temporal Coverage: ' + str(agg.metadata.period_coverage))\n", - "print('Aggregation Spatial Coverage: ' + str(agg.metadata.spatial_coverage))\n", + "print(f'Aggregation Title: {agg.metadata.title}')\n", + "print(f'Aggregation Type: {agg.metadata.type}')\n", + "print(f'Aggregation Keywords: {\", \".join(agg.metadata.subjects)}')\n", + "print(f'Aggregation Temporal Coverage: {agg.metadata.period_coverage}')\n", + "print(f'Aggregation Spatial Coverage: {agg.metadata.spatial_coverage}')\n", "\n", "# Print the list of files in the aggregation\n", "file_list = agg.files()\n", @@ -646,13 +646,6 @@ "# Delete the aggregation and metadata along with files within aggregation\n", "new_resource.aggregation_delete(agg)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -682,4 +675,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/docs/examples/Basic_Operations.ipynb b/docs/examples/Basic_Operations.ipynb index 3ac41bb..d28ac8b 100644 --- a/docs/examples/Basic_Operations.ipynb +++ b/docs/examples/Basic_Operations.ipynb @@ -6,13 +6,13 @@ "id": "HHsuQMMJyms4" }, "source": [ - "# HS RDF HydroShare Python Client Basic Resource Operation Examples \n", + "# hsclient HydroShare Python Client Basic Resource Operation Examples\n", "\n", "\n", "---\n", "\n", "\n", - "The following code snippets show examples for how to use the HS RDF HydroShare Python Client for performing basic resource operations. " + "The following code snippets show examples for how to use the hsclient HydroShare Python Client for performing basic resource operations." ] }, { @@ -21,9 +21,9 @@ "id": "b_Tj5gJx0fRj" }, "source": [ - "## Install the HS RDF Python Client\n", + "## Install the hsclient Python Client\n", "\n", - "The HS RDF Python Client for HydroShare won't be installed by default, so it has to be installed first before you can work with it. Use the following command to install the Python Client from the GitHub repository. Eventually we will distribute this package via the Python Package Index (PyPi) so that it can be installed via pip from PyPi." + "The hsclient Python Client for HydroShare may not be installed by default in your Python environment, so it has to be installed first before you can work with it. Use the following command to install hsclient via the Python Package Index (PyPi)." 
] }, { @@ -52,7 +52,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "3njsiY73m7_V" + "id": "3njsiY73m7_V", + "pycharm": { + "is_executing": true + } }, "outputs": [], "source": [ @@ -73,7 +76,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": true + } + }, "outputs": [], "source": [ "from hsclient import HydroShare\n", @@ -108,7 +115,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "W9azvJ_Co87w" + "id": "W9azvJ_Co87w", + "pycharm": { + "is_executing": true + } }, "outputs": [], "source": [ @@ -116,11 +126,11 @@ "new_resource = hs.create()\n", "\n", "# Get the HydroShare identifier for the new resource\n", - "resIdentifier = new_resource.resource_id\n", - "print('The HydroShare Identifier for your new resource is: ' + resIdentifier)\n", + "res_identifier = new_resource.resource_id\n", + "print(f'The HydroShare Identifier for your new resources is: {res_identifier}')\n", "\n", - "# Construct a hyperlink for the new resource\n", - "print('Your new resource is available at: ' + new_resource.metadata.url)" + "# Construct a hyperlink to access the HydroShare landing page for the new resource\n", + "print(f'Your new resource is available at: {new_resource.metadata.url}')" ] }, { @@ -131,7 +141,7 @@ "source": [ "### Retrieving an Existing Resource\n", "\n", - "If you want to work on an existing resource rather than creating a new one, you can retrieve an existing resource using its HydroShare Identifier. The resource identifier is passed as a string. The resource's metadata is retrieved and loaded into memory." + "If you want to work on an existing resource rather than creating a new one, you can retrieve an existing resource using its HydroShare identifier. The resource identifier is passed as a string. The resource's metadata is retrieved and loaded into memory." ] }, { @@ -143,9 +153,8 @@ "outputs": [], "source": [ "# Get an existing resource using its identifier\n", - "existing_resource = hs.resource(resIdentifier)\n", - "\n", - "print('Just retrieved the resource with ID: ' + resIdentifier)" + "existing_resource = hs.resource(res_identifier)\n", + "print(f'Just retrieved the resource with ID: {res_identifier}')" ] }, { @@ -188,8 +197,8 @@ "outputs": [], "source": [ "# Delete the resource using its identifier\n", - "hs.resource(resIdentifier).delete()\n", - "print('Deleted resource: ' + resIdentifier)" + "hs.resource(res_identifier).delete()\n", + "print(f'Deleted resource with ID: {res_identifier}')" ] }, { @@ -214,19 +223,13 @@ "outputs": [], "source": [ "# Get the resource you want to download using its identifier\n", - "res = hs.resource(resIdentifier)\n", + "res_identifier = '7561aa12fd824ebb8edbee05af19b910'\n", + "res = hs.resource(res_identifier)\n", "\n", - "# Download the resource as a zipped file. 
Pass in a file path as a string if you\n", - "# want to download to a particular location.\n", + "# Download the resource as a zipped Bagit file\n", + "# Pass in a file path if you want to download to a particular location\n", "res.download()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -237,7 +240,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -251,9 +254,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/docs/examples/File_Operations.ipynb b/docs/examples/File_Operations.ipynb index a455859..9bac287 100644 --- a/docs/examples/File_Operations.ipynb +++ b/docs/examples/File_Operations.ipynb @@ -6,13 +6,13 @@ "id": "HHsuQMMJyms4" }, "source": [ - "# HS RDF HydroShare Python Client Resource File Operation Examples \n", + "# hsclient HydroShare Python Client Resource File Operation Examples\n", "\n", "\n", "---\n", "\n", "\n", - "The following code snippets show examples for how to use the HS RDF HydroShare Python Client to manipulate files within a HydroShare Resource. " + "The following code snippets show examples for how to use the hsclient HydroShare Python Client to manipulate files within a HydroShare Resource." ] }, { @@ -21,9 +21,9 @@ "id": "b_Tj5gJx0fRj" }, "source": [ - "## Install the HS RDF Python Client\n", + "## Install the hsclient Python Client\n", "\n", - "The HS RDF Python Client for HydroShare won't be installed by default, so it has to be installed first before you can work with it. Use the following command to install the Python Client from the GitHub repository. Eventually we will distribute this package via the Python Package Index (PyPi) so that it can be installed via pip from PyPi." + "The hsclient Python Client for HydroShare may not be installed by default in your Python environment, so it has to be installed first before you can work with it. Use the following command to install hsclient via the Python Package Index (PyPi)." ] }, { @@ -85,11 +85,11 @@ "new_resource = hs.create()\n", "\n", "# Get the HydroShare identifier for the new resource\n", - "resIdentifier = new_resource.resource_id\n", - "print('The HydroShare Identifier for your new resource is: ' + resIdentifier)\n", + "res_identifier = new_resource.resource_id\n", + "print(f'The HydroShare Identifier for your new resource is: {res_identifier}')\n", "\n", "# Construct a hyperlink for the new resource\n", - "print('Your new resource is available at: ' + new_resource.metadata.url)" + "print(f'Your new resource is available at: {new_resource.metadata.url}')" ] }, { @@ -114,10 +114,10 @@ "outputs": [], "source": [ "# Print the title of the resource and the list of files it contains\n", - "print('Working on: ' + new_resource.metadata.title)\n", + "print(f'Working on: {new_resource.metadata.title}')\n", "print('File list:')\n", - "for file in new_resource.files(search_aggregations=True): \n", - " print(file.name)" + "for file in new_resource.files(search_aggregations=True):\n", + " print(file.name)" ] }, { @@ -128,7 +128,7 @@ "source": [ "### Adding Files to a Resource\n", "\n", - "You may need to add content files to your resource. 
The examples here upload files from the `Example_Files` folder that is included with the HydroShare resource that contains these Jupyter Notebook examples. If you are running in your own local Python environment and want to load files from your local machine, you would specify the path to the file(s) on your hard drive. If you want to upload multiple files at once, you can pass multiple file paths separated by commas to the `upload()` function.\n", + "You may need to add content files to your resource. The examples here upload files from the `Example_Files` folder that is included in the same folder that contains these Jupyter Notebook examples. If you are running in your own local Python environment and want to load files from your local machine, you would specify the path to the file(s) on your hard drive. If you want to upload multiple files at once, you can pass multiple file paths separated by commas to the `upload()` function.\n", "\n", "Note that if you upload files that already exist, those files will be overwritten." ] @@ -141,13 +141,13 @@ }, "outputs": [], "source": [ - "# Upload one or more files to your resource \n", + "# Upload one or more files to your resource\n", "new_resource.file_upload('Example_Files/Data_File1.csv', 'Example_Files/Data_File2.csv')\n", "\n", "# Print the names of the files in the resource\n", "print('Updated file list after adding a file: ')\n", - "for file in new_resource.files(search_aggregations=True): \n", - " print(file.path)" + "for file in new_resource.files(search_aggregations=True):\n", + " print(file.path)" ] }, { @@ -171,8 +171,8 @@ "\n", "# Print the names of the files in the resource\n", "print('Updated file list after adding a file: ')\n", - "for file in new_resource.files(search_aggregations=True): \n", - " print(file.path)" + "for file in new_resource.files(search_aggregations=True):\n", + " print(file.path)" ] }, { @@ -199,37 +199,37 @@ "metadata": {}, "outputs": [], "source": [ - "# Get a list of all of the files in the resource that are not part of an aggregation\n", + "# Get a list of the files in the resource that are not part of an aggregation\n", "file_list = new_resource.files()\n", "print('All files that are not part of an aggregation:')\n", "print(*file_list, sep='\\n')\n", "print('\\n')\n", - " \n", - "# Get a list of all of the files in the resource inclusive of files that are inside \n", + "\n", + "# Get a list of the files in the resource inclusive of files that are inside\n", "# content type aggregations\n", "file_list = new_resource.files(search_aggregations=True)\n", "print('All files in the resource:')\n", "print(*file_list, sep='\\n')\n", "print('\\n')\n", "\n", - "# Get a list of all of the files within a folder in the resource\n", + "# Get a list of the files within a folder in the resource\n", "# Note that you have to pass the full relative path to the folder you are searching\n", "# because there may be multiple folders within a resource with the same name.\n", "# To get files in the root folder, pass an empty string (folder=\"\")\n", - "file_list = new_resource.files(folder=\"New_Folder\")\n", + "file_list = new_resource.files(folder='New_Folder')\n", "print('All files within a specific folder:')\n", "print(*file_list, sep='\\n')\n", "print('\\n')\n", - " \n", + "\n", "# Get a list of all files that have a specific extension. 
This searches all folders\n", - "file_list = new_resource.files(extension=\".csv\")\n", + "file_list = new_resource.files(extension='.csv')\n", "print('All files with a .csv file extension:')\n", "print(*file_list, sep='\\n')\n", "print('\\n')\n", "\n", "# Filters can be combined\n", "# Get a list of all files in a particular folder that have a specific extension\n", - "file_list = new_resource.files(folder=\"New_Folder\", extension=\".csv\")\n", + "file_list = new_resource.files(folder='New_Folder', extension='.csv')\n", "print('All files with a .csv file extension in a particular folder:')\n", "print(*file_list, sep='\\n')" ] @@ -250,16 +250,16 @@ "outputs": [], "source": [ "# Get a single file using its path relative to the resource content directory\n", - "file = new_resource.file(path=\"New_Folder/Data_File2.csv\")\n", + "file = new_resource.file(path='New_Folder/Data_File2.csv')\n", "print('File retrieved using path:')\n", "print(file)\n", "print('\\n')\n", - " \n", + "\n", "# Get a single file using its name\n", "# Note that if you have multiple files in your resource with the same name, but in different\n", "# folders, you should search for a particular file using the path parameter to ensure that\n", "# you get the right file\n", - "file = new_resource.file(name=\"Data_File2.csv\")\n", + "file = new_resource.file(name='Data_File2.csv')\n", "print('File retrieved using name:')\n", "print(file)" ] @@ -280,17 +280,15 @@ "outputs": [], "source": [ "# Search for a file within a resource\n", - "file = new_resource.file(path=\"New_Folder/Data_File2.csv\")\n", + "file = new_resource.file(path='New_Folder/Data_File2.csv')\n", "\n", "# Print the properties of the file\n", - "print('File name: ' + file.name)\n", - "print('File extension:' + file.extension)\n", - "print('File folder name: ' + file.folder)\n", - "print('File path: ' + file.path)\n", - "print('File url_path: ' + file.url)\n", - "#print('File checksum:' + file.checksum)\n", - "\n", - "# TODO: The checksum property is not implemented yet" + "print(f'File name: {file.name}')\n", + "print(f'File extension: {file.extension}')\n", + "print(f'File folder name: {file.folder}')\n", + "print(f'File path: {file.path}')\n", + "print(f'File url_path: {file.url}')\n", + "print(f'File checksum: {file.checksum}')" ] }, { @@ -309,15 +307,15 @@ "outputs": [], "source": [ "# Get a file to rename - use the relative path to the file to make sure you have the right one\n", - "file = new_resource.file(path=\"Data_File2.csv\")\n", + "file = new_resource.file(path='Data_File2.csv')\n", "\n", "# Rename the file to whatever you want\n", "new_resource.file_rename(file, 'Data_File2_Renamed.csv')\n", "\n", "# Print the names of the files in the resource\n", - "print('Updated file list after adding a file: ')\n", - "for file in new_resource.files(search_aggregations=True): \n", - " print(file.path)" + "print('Updated file list after renaming a file: ')\n", + "for file in new_resource.files(search_aggregations=True):\n", + " print(file.path)" ] }, { @@ -334,15 +332,15 @@ "outputs": [], "source": [ "# Get a file to move\n", - "file = new_resource.file(path=\"Data_File1.csv\")\n", + "file = new_resource.file(path='Data_File1.csv')\n", "\n", "# Move the file to a different folder\n", "new_resource.file_rename(file, 'New_Folder/Data_File1.csv')\n", "\n", "# Print the names of the files in the resource\n", "print('Updated file list after adding a file: ')\n", - "for file in new_resource.files(search_aggregations=True): \n", - " print(file.path)" + "for file in 
new_resource.files(search_aggregations=True):\n", + " print(file.path)" ] }, { @@ -413,14 +411,14 @@ "outputs": [], "source": [ "# Specify the file you want to delete\n", - "file = new_resource.file(path=\"New_Folder/Data_File2.csv\")\n", + "file = new_resource.file(path='New_Folder/Data_File2.csv')\n", "\n", "new_resource.file_delete(file)\n", "\n", "# Print the names of the files in the resource\n", - "print(\"Updated file list after removing file: \")\n", - "for file in new_resource.files(search_aggregations=True): \n", - " print(file.path)" + "print('Updated file list after removing file: ')\n", + "for file in new_resource.files(search_aggregations=True):\n", + " print(file.path)" ] }, { @@ -428,12 +426,11 @@ "metadata": {}, "source": [ "### TODO: The following items are being worked on\n", - "\n", - "* Delete a folder and all of the files within it.\n", - "* Moving a folder.\n", - "* Zip a file or a folder.\n", - "* Rename a folder.\n", - "* Download a folder as a zipped file." + "* Delete a folder and all of the files inside it\n", + "* Moving a folder\n", + "* Zip a file or folder\n", + "* Rename a folder\n", + "* Download a folder as a zipped file" ] } ], @@ -464,4 +461,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/docs/examples/Metadata_Operations.ipynb b/docs/examples/Metadata_Operations.ipynb index f94c578..4fcb707 100644 --- a/docs/examples/Metadata_Operations.ipynb +++ b/docs/examples/Metadata_Operations.ipynb @@ -6,13 +6,13 @@ "id": "HHsuQMMJyms4" }, "source": [ - "# HS RDF HydroShare Python Client Resource Metadata Editing Examples \n", + "# hsclient HydroShare Python Client Resource Metadata Editing Examples\n", "\n", "\n", "---\n", "\n", "\n", - "The following code snippets show examples for how to use the HS RDF HydroShare Python Client for creating and editing resource level metadata for a HydroShare resource. " + "The following code snippets show examples for how to use the hsclient HydroShare Python Client for creating and editing resource level metadata for a HydroShare resource." ] }, { @@ -21,9 +21,9 @@ "id": "b_Tj5gJx0fRj" }, "source": [ - "## Install the HS RDF HydroShare Python Client\n", + "## Install the hsclient HydroShare Python Client\n", "\n", - "The HS RDF Python Client for HydroShare won't be installed by default, so it has to be installed first before you can work with it. Use the following command to install the Python Client from the GitHub repository. Eventually we will distribute this package via the Python Package Index (PyPi) so that it can be installed via pip from PyPi." + "The hsclient Python Client for HydroShare may not be installed by default in your Python environment, so it has to be installed first before you can work with it. Use the following command to install hsclient via the Python Package Index (PyPi)." 
] }, { @@ -87,11 +87,11 @@ "new_resource = hs.create()\n", "\n", "# Get the HydroShare identifier for the new resource\n", - "resIdentifier = new_resource.resource_id\n", - "print('The HydroShare Identifier for your new resource is: ' + resIdentifier)\n", + "res_identifier = new_resource.resource_id\n", + "print(f'The HydroShare Identifier for your new resource is: {res_identifier}')\n", "\n", - "# Construct a hyperlink for the new resource\n", - "print('Your new resource is available at: ' + new_resource.metadata.url)" + "# Construct a hyperlink to access the HydroShare landing page for the new resource\n", + "print(f'Your new resource is available at: {new_resource.metadata.url}')" ] }, { @@ -127,11 +127,11 @@ "outputs": [], "source": [ "# Set the Title for the resource\n", - "new_resource.metadata.title = 'Resource for Testing the HS RDF HydroShare Python Client'\n", + "new_resource.metadata.title = 'Resource for Testing the hsclient HydroShare Python Client'\n", "\n", "# Set the Abstract text for the resource\n", "new_resource.metadata.abstract = (\n", - " 'This resource was created as a demonstration of using the HS RDF ' \n", + " 'This resource was created as a demonstration of using the hsclient '\n", " 'Python Client for HydroShare. Once you have completed all of the '\n", " 'steps in this notebook, you will have a fully populated HydroShare '\n", " 'Resource.'\n", @@ -141,8 +141,8 @@ "new_resource.save()\n", "\n", "# Print the title just added to the resource\n", - "print('Title: ' + new_resource.metadata.title)\n", - "print('Abstract: ' + new_resource.metadata.abstract)" + "print(f'Title: {new_resource.metadata.title}')\n", + "print(f'Abstract: {new_resource.metadata.abstract}')" ] }, { @@ -165,7 +165,7 @@ "outputs": [], "source": [ "# Create subject keywords for the resource using a list of strings\n", - "new_resource.metadata.subjects = ['HS RDF', 'Python', 'HydroShare', 'Another Keyword']\n", + "new_resource.metadata.subjects = ['hsclient', 'Python', 'HydroShare', 'Another Keyword']\n", "\n", "# New keywords can be appended to the existing list\n", "new_resource.metadata.subjects.append('Hydroinformatics')\n", @@ -179,7 +179,7 @@ "# Print the keywords for the resource\n", "print('The list of keywords for the resource includes:')\n", "for keyword in new_resource.metadata.subjects:\n", - " print(keyword)" + " print(keyword)" ] }, { @@ -238,11 +238,11 @@ "new_resource.save()\n", "\n", "# Print the temporal coverage information\n", - "print('Temporal Coverage')\n", + "print('Temporal Coverage:')\n", "print(new_resource.metadata.period_coverage)\n", "\n", "# Print the spatial coverage information\n", - "print('\\nSpatial Coverage')\n", + "print('\\nSpatial Coverage:')\n", "print(new_resource.metadata.spatial_coverage)" ] }, @@ -271,7 +271,7 @@ "# Remove an individual key-value pair using its key\n", "del new_resource.metadata.additional_metadata['New Element Key']\n", "\n", - "# Or, you can clear out all of the additional metadata elements that might exist\n", + "# Or, you can clear out all additional metadata elements that might exist\n", "new_resource.metadata.additional_metadata.clear()\n", "\n", "# Add multiple key-value pairs at once using a Python dictionary\n", @@ -287,49 +287,7 @@ "# Print the extended metadata elements for the resource\n", "print('The extended metadata elements for the resource include:')\n", "for key, value in new_resource.metadata.additional_metadata.items():\n", - " print (key + ':', value)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - 
"id": "F9mbqyDJ20lp" - }, - "source": [ - "### Sources\n", - "\n", - "Sources are stored as a list of strings. Sources can be added or removed by adding or removing source strings from the list." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xZsp02SEbGVC" - }, - "outputs": [], - "source": [ - "# If you have existing Sources in your resource, you can remove all of them\n", - "# by clearing the local list and then saving the resource \n", - "new_resource.metadata.sources.clear()\n", - "new_resource.save()\n", - "\n", - "# Add a Source to the resource\n", - "sourceString = (\n", - " 'Mihalevich, B. A., Horsburgh, J. S., Melcher, A. A. (2017). High-frequency '\n", - " 'measurements reveal spatial and temporal patterns of dissolved organic '\n", - " 'matter in an urban water conveyance, Environmental Monitoring and '\n", - " 'Assessment, http://dx.doi.org/10.1007/s10661-017-6310-y.'\n", - ")\n", - "new_resource.metadata.sources.append(sourceString)\n", - "\n", - "# Save the changes to the resource in HydroShare\n", - "new_resource.save()\n", - "\n", - "# Print the Source metadata\n", - "print('The list of Sources includes:')\n", - "for source in new_resource.metadata.sources:\n", - " print(source)" + " print(f'{key}: {value}')" ] }, { @@ -361,16 +319,25 @@ "new_resource.save()\n", "\n", "# Create a new relation object\n", - "newRelation = Relation(type=RelationType.isDataFor, \n", - " value=('Bastidas Pacheco, C. J., Horsburgh, J. S., Tracy, ' \n", - " 'R. J. (2020). A low-cost, open source monitoring ' \n", - " 'system for collecting high-resolution water use '\n", - " 'data on magnetically-driven residential water ' \n", - " 'meters, Sensors, 20(13), 3655, '\n", - " 'https://doi.org/10.3390/s20133655.'))\n", + "new_relation = Relation(type=RelationType.isReferencedBy,\n", + " value=('Bastidas Pacheco, C. J., Horsburgh, J. S., Tracy, '\n", + " 'R. J. (2020). A low-cost, open source monitoring '\n", + " 'system for collecting high-resolution water use '\n", + " 'data on magnetically-driven residential water '\n", + " 'meters, Sensors, 20(13), 3655, '\n", + " 'https://doi.org/10.3390/s20133655.'))\n", "\n", "# Append the new Related Resource to the list of Related Resources\n", - "new_resource.metadata.relations.append(newRelation)\n", + "new_resource.metadata.relations.append(new_relation)\n", + "\n", + "# Add another related resource with a different relationship type\n", + "new_relation = Relation(type=RelationType.references,\n", + " value=('Mihalevich, B. A., Horsburgh, J. S., Melcher, A. A. (2017). 
'\n", + " 'High-frequency measurements reveal spatial and temporal patterns '\n", + " 'of dissolved organic matter in an urban water conveyance, '\n", + " 'Environmental Monitoring and Assessment, '\n", + " 'https://doi.org/10.1007/s10661-017-6310-y.'))\n", + "new_resource.metadata.relations.append(new_relation)\n", "\n", "# Save the changes to the resource in HydroShare\n", "new_resource.save()\n", @@ -378,7 +345,7 @@ "# Print the list of Related Resources\n", "print('The list of Related Resources includes:')\n", "for relatedResource in new_resource.metadata.relations:\n", - " print(relatedResource.type.value + ': ' + relatedResource.value)" + " print(f'{relatedResource.type.value}: {relatedResource.value}')" ] }, { @@ -424,7 +391,7 @@ "# Print the AwardInfo\n", "print('Funding sources added: ')\n", "for award in new_resource.metadata.awards:\n", - " print('Award Title: ' + award.title)" + " print(f'Award Title: {award.title}')" ] }, { @@ -460,10 +427,10 @@ "\n", "# Instantiate a new Creator object for a Creator that is not a HydroShare user\n", "newCreator2 = Creator(name='Doe, John A.',\n", - " organization='Utah Water Research Laboratory',\n", - " email='john.doe@usu.edu',\n", - " address='8200 Old Main Hill, Logan, UT 84322-8200',\n", - " phone='123-456-7890')\n", + " organization='Utah Water Research Laboratory',\n", + " email='john.doe@usu.edu',\n", + " address='8200 Old Main Hill, Logan, UT 84322-8200',\n", + " phone='123-456-7890')\n", "\n", "# Append the new Creator to the resource's list of Creators\n", "new_resource.metadata.creators.append(newCreator2)\n", @@ -493,10 +460,10 @@ "# Print the Creator names\n", "print('The list of Creators includes: ')\n", "for creator in new_resource.metadata.creators:\n", - " if creator.name is None:\n", - " print(creator.organization)\n", - " else:\n", - " print(creator.name)" + " if creator.name is None:\n", + " print(creator.organization)\n", + " else:\n", + " print(creator.name)" ] }, { @@ -526,10 +493,10 @@ "# Print the modified order of the Creator names\n", "print('The list of Creators includes: ')\n", "for creator in new_resource.metadata.creators:\n", - " if creator.name is None:\n", - " print(creator.organization)\n", - " else:\n", - " print(creator.name)" + " if creator.name is None:\n", + " print(creator.organization)\n", + " else:\n", + " print(creator.name)" ] }, { @@ -553,7 +520,7 @@ "del new_resource.metadata.creators[1:]\n", "new_resource.save()\n", "\n", - "print('Number of remaining creators: ' + str(len(new_resource.metadata.creators)))" + "print(f'Number of remaining creators: {len(new_resource.metadata.creators)}')" ] }, { @@ -615,7 +582,7 @@ "# Print the Contributor names\n", "print('The list of Contributors includes: ')\n", "for Contributor in new_resource.metadata.contributors:\n", - " print(Contributor.name)" + " print(Contributor.name)" ] }, { @@ -639,7 +606,7 @@ "new_resource.metadata.contributors.clear()\n", "new_resource.save()\n", "\n", - "print('Number of remaining Contributors: ' + str(len(new_resource.metadata.contributors)))" + "print(f'Number of remaining Contributors: {len(new_resource.metadata.contributors)}')" ] }, { @@ -667,9 +634,10 @@ "# Set the rights statement and the URL that points to its description\n", "new_resource.metadata.rights.statement = (\n", " 'This resource is shared under the Creative Commons '\n", - " 'Attribution-NoCommercial-NoDerivs CC BY-NC-ND.'\n", + " 'Attribution-NonCommercial-NoDerivatives 4.0 International'\n", + " '(CC BY-NC-ND 4.0).'\n", ")\n", - 
"new_resource.metadata.rights.url = 'http://creativecommons.org/licenses/by-nc-nd/4.0/'\n", + "new_resource.metadata.rights.url = 'https://creativecommons.org/licenses/by-nc-nd/4.0/'\n", "\n", "# Save the changes to the resource in HydroShare\n", "new_resource.save()\n", @@ -678,7 +646,7 @@ "print(new_resource.metadata.rights.statement)\n", "print(new_resource.metadata.rights.url)\n", "\n", - "# You can also use one of the available, pre-generated Rights Statements \n", + "# You can also use one of the available, pre-generated Rights Statements\n", "# available in HydroShare\n", "new_resource.metadata.rights = Rights.Creative_Commons_Attribution_CC_BY()\n", "new_resource.save()\n", @@ -687,6 +655,17 @@ "print(new_resource.metadata.rights.statement)\n", "print(new_resource.metadata.rights.url)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# TODO: Related geospatial features is not implemented yet" + ], + "metadata": { + "collapsed": false + } } ], "metadata": { From 069631ed6dd165f8a8f6304d68dbf84d57bc040f Mon Sep 17 00:00:00 2001 From: pkdash Date: Thu, 23 Mar 2023 22:18:20 -0400 Subject: [PATCH 08/23] [#44] adding aggregation type classes for data object support --- hsclient/hydroshare.py | 480 +++++++++++++++++++++++------------------ 1 file changed, 270 insertions(+), 210 deletions(-) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index 18a490e..e562400 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -6,13 +6,14 @@ import sqlite3 import tempfile import time -from contextlib import closing import urllib.parse +from concurrent.futures import ThreadPoolExecutor +from contextlib import closing from datetime import datetime from functools import wraps from posixpath import basename, dirname, join as urljoin, splitext from pprint import pformat -from typing import Dict, List, Union, TYPE_CHECKING, Callable +from typing import Callable, Dict, List, TYPE_CHECKING, Union from urllib.parse import quote, unquote, urlparse from zipfile import ZipFile @@ -134,7 +135,7 @@ def __init__(self, map_path, hs_session, checksums=None): self._parsed_files = None self._parsed_aggregations = None self._parsed_checksums = checksums - self._data_object = None + self._main_file_path = None def __str__(self): return self._map_path @@ -177,11 +178,40 @@ def _files(self): @property def _aggregations(self): + + def populate_files(_aggr): + _aggr._files + + def populate_metadata(_aggr): + _aggr._metadata + if not self._parsed_aggregations: self._parsed_aggregations = [] for file in self._map.describes.files: if is_aggregation(str(file)): self._parsed_aggregations.append(Aggregation(unquote(file.path), self._hs_session, self._checksums)) + + # load files (instances of File) and metadata for all aggregations + with ThreadPoolExecutor() as executor: + executor.map(populate_files, self._parsed_aggregations) + executor.map(populate_metadata, self._parsed_aggregations) + + # convert aggregations to aggregation type supporting data object + aggregations_copy = self._parsed_aggregations[:] + typed_aggregation_classes = {AggregationType.MultidimensionalAggregation: NetCDFAggregation, + AggregationType.TimeSeriesAggregation: TimeseriesAggregation, + AggregationType.GeographicRasterAggregation: GeoRasterAggregation, + AggregationType.GeographicFeatureAggregation: GeoFeatureAggregation, + } + for aggr in aggregations_copy: + typed_aggr = None + typed_aggr_cls = typed_aggregation_classes.get(aggr.metadata.type, None) + if typed_aggr_cls: + 
typed_aggr = typed_aggr_cls.create(base_aggr=aggr) + if typed_aggr: + self._parsed_aggregations.remove(aggr) + self._parsed_aggregations.append(typed_aggr) + return self._parsed_aggregations @property @@ -232,59 +262,6 @@ def _download(self, save_path: str = "", unzip_to: str = None) -> str: return unzip_to return downloaded_zip - def _validate_aggregation_path(self, agg_path: str, for_save_data: bool = False) -> str: - main_file_ext = pathlib.Path(self.main_file_path).suffix - file_name = self.file(extension=main_file_ext).name - file_path = urljoin(agg_path, file_name) - if not os.path.exists(file_path) or not os.path.isfile(file_path): - file_path = urljoin(file_path, file_name) - if not os.path.exists(file_path): - raise Exception(f"Aggregation was not found at: {agg_path}") - - if for_save_data: - if self.metadata.type == AggregationType.GeographicFeatureAggregation: - if file_path == self._data_object.path: - raise Exception(f"Aggregation path '{agg_path}' is not a valid path. This should be a path where " - f"you have the updated shape files") - else: - for aggr_file in self.files(): - aggr_file = basename(aggr_file) - if aggr_file.endswith(".shp.xml") or aggr_file.endswith(".sbn") or aggr_file.endswith(".sbx"): - # these are optional files for geo feature aggregation - continue - if not os.path.exists(os.path.join(agg_path, aggr_file)): - raise Exception(f"Aggregation path '{agg_path}' is not a valid path. " - f"Missing file '{aggr_file}'") - return file_path - - def _get_data_object(self, agg_path: str, func: Callable) -> \ - Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset']: - - if self._data_object is not None and self.metadata.type != AggregationType.TimeSeriesAggregation: - return self._data_object - - file_path = self._validate_aggregation_path(agg_path) - data_object = func(file_path) - if self.metadata.type == AggregationType.MultidimensionalAggregation: - data_object.load() - data_object.close() - - # cache the data object for the aggregation - self._data_object = data_object - return data_object - - def _validate_aggregation_for_update(self, resource: 'Resource', agg_type: AggregationType) -> None: - if self.metadata.type != agg_type: - raise Exception(f"Not a {agg_type.value} aggregation") - - if self._data_object is None: - raise Exception("No data object exists for this aggregation.") - - # check this aggregation is part of the specified resource - aggr = resource.aggregation(file__path=self.main_file_path) - if aggr is None: - raise Exception("This aggregation is not part of the specified resource.") - @property def metadata_file(self): """The path to the metadata file""" @@ -303,19 +280,19 @@ def metadata_path(self) -> str: @property def main_file_path(self) -> str: """The path to the main file in the aggregation""" + if self._main_file_path is not None: + return self._main_file_path mft = main_file_type(self.metadata.type) if mft: for file in self.files(): if str(file).endswith(mft): - return file.path + self._main_file_path = file.path + return self._main_file_path if self.metadata.type == AggregationType.FileSetAggregation: - return self.files()[0].folder - return self.files()[0].path - - @property - def data_object(self) -> \ - Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset', None]: - return self._data_object + self._main_file_path = self.files()[0].folder + return self._main_file_path + self._main_file_path = self.files()[0].path + return self._main_file_path @refresh def save(self) 
-> None: @@ -364,6 +341,17 @@ def aggregations(self, **kwargs) -> List[BaseMetadata]: :return: a List of Aggregation objects matching the filter parameters """ aggregations = self._aggregations + + # when searching using 'file__path' or files__path' as the key, there can be only one matching aggregation + file_path = kwargs.get("file__path", "") + if not file_path: + file_path = kwargs.get("files__path", "") + if file_path: + for agg in aggregations: + if agg.files(path=file_path): + return [agg] + return [] + for key, value in kwargs.items(): if key.startswith('file__'): file_args = {key[len('file__'):]: value} @@ -399,7 +387,7 @@ def refresh(self) -> None: self._parsed_files = None self._parsed_aggregations = None self._parsed_checksums = None - self._data_object = None + self._main_file_path = None def delete(self) -> None: """Deletes this aggregation from HydroShare""" @@ -413,121 +401,89 @@ def delete(self) -> None: self._hs_session.delete(path, status_code=200) self.refresh() - def as_series(self, series_id: str, agg_path: str) -> 'pandas.DataFrame': - """ - Creates a pandas DataFrame object out of an aggregation of type TimeSeries. - :param series_id: The series_id of the timeseries result to be converted to a Dataframe object. - :param agg_path: The local path where this aggregation has been downloaded previously. - :return: A pandas.DataFrame object - """ - # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then - # make this method as a private method. - if pandas is None: - raise Exception("pandas package not found") +class DataObjectSupportingAggregation(Aggregation): + """Base class for any aggregation supporting aggregation type specific data analysis object (e.g. pandas)""" - def to_series(timeseries_file: str): - con = sqlite3.connect(timeseries_file) - return pandas.read_sql( - f'SELECT * FROM TimeSeriesResultValues WHERE ResultID IN ' - f'(SELECT ResultID FROM Results WHERE ResultUUID = "{series_id}");', - con, - ).squeeze() + @staticmethod + def create(aggr_cls, base_aggr): + """creates a type specific aggregation object from an instance of Aggregation""" + aggr = aggr_cls(base_aggr._map_path, base_aggr._hs_session, base_aggr._parsed_checksums) + aggr._retrieved_map = base_aggr._retrieved_map + aggr._retrieved_metadata = base_aggr._retrieved_metadata + aggr._parsed_files = base_aggr._parsed_files + aggr._parsed_aggregations = base_aggr._parsed_aggregations + aggr._main_file_path = base_aggr._main_file_path + aggr._data_object = None + return aggr - return self._get_data_object(agg_path=agg_path, func=to_series) + def refresh(self) -> None: + super().refresh() + self._data_object = None - def as_multi_dimensional_dataset(self, agg_path: str) -> 'xarray.Dataset': - """ - Creates a xarray Dataset object out of an aggregation of type NetCDF. - :param agg_path: The local path where this aggregation has been downloaded previously. - :return: A xarray.Dataset object - """ - # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then - # make this method as a private method. 
+ @property + def data_object(self) -> \ + Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset', None]: + return self._data_object - if self.metadata.type != AggregationType.MultidimensionalAggregation: - raise Exception("Aggregation is not of type NetCDF") - if xarray is None: - raise Exception("xarray package was not found") + def _get_file_path(self, agg_path): + main_file_ext = pathlib.Path(self.main_file_path).suffix + file_name = self.file(extension=main_file_ext).name + file_path = urljoin(agg_path, file_name) + if not os.path.exists(file_path) or not os.path.isfile(file_path): + file_path = urljoin(file_path, file_name) + if not os.path.exists(file_path): + raise Exception(f"Aggregation was not found at: {agg_path}") + return file_path - return self._get_data_object(agg_path=agg_path, func=xarray.open_dataset) + def _validate_aggregation_path(self, agg_path: str, for_save_data: bool = False) -> str: + return self._get_file_path(agg_path) - def as_feature_collection(self, agg_path: str) -> 'fiona.Collection': - """ - Creates a fiona Collection object out of an aggregation of type GeoFeature. - :param agg_path: The local path where this aggregation has been downloaded previously. - :return: A fiona.Collection object - Note: The caller is responsible for closing the fiona.Collection object to free up aggregation files used to - create this object. - """ - # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then - # make this method as a private method. + def _get_data_object(self, agg_path: str, func: Callable) -> \ + Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset']: - if self.metadata.type != AggregationType.GeographicFeatureAggregation: - raise Exception("Aggregation is not of type GeoFeature") - if fiona is None: - raise Exception("fiona package was not found") - return self._get_data_object(agg_path=agg_path, func=fiona.open) + if self._data_object is not None and self.metadata.type != AggregationType.TimeSeriesAggregation: + return self._data_object - def as_raster_dataset(self, agg_path: str) -> 'rasterio.DatasetReader': - """ - Creates a rasterio DatasetReader object out of an aggregation of type GeoRaster - :param agg_path: The local path where this aggregation has been downloaded previously. - :return: A rasterio.DatasetReader object - Note: The caller is responsible for closing the rasterio.DatasetReader object to free up aggregation files - used to create this object. - """ - # TODO: if we decide that the user will prefer to use `as_data_object` method rather than this method, then - # make this method as a private method. 
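The loading path is easiest to follow end to end: download the aggregation to a local folder, then hand that folder to as_data_object, which routes through _get_data_object above and caches the result on the aggregation. A sketch assuming a resource that already holds a geographic raster aggregation (credentials and resource id are placeholders; the calls mirror the accompanying tests):

import os
import tempfile

from hsmodels.schemas.enums import AggregationType
from hsclient import HydroShare

hs = HydroShare("my_user", "my_password")              # placeholder credentials
res = hs.resource("0123456789abcdef0123456789abcdef")  # placeholder resource id
aggr = res.aggregation(type=AggregationType.GeographicRasterAggregation)

with tempfile.TemporaryDirectory() as tmp:
    unzip_to = os.path.join(tmp, "unzipped_aggr")
    os.makedirs(unzip_to)
    # download the zipped aggregation and unzip it next to its metadata files
    agg_path = res.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to)
    dataset = aggr.as_data_object(agg_path=agg_path)   # rasterio.DatasetReader
    print(dataset.count)
    # raster and feature readers keep files open; the caller closes them
    dataset.close()

Because the object is cached on the aggregation, a second as_data_object call returns the same reader for every aggregation type except timeseries, which re-reads for the requested series.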
+ file_path = self._validate_aggregation_path(agg_path) + data_object = func(file_path) + if self.metadata.type == AggregationType.MultidimensionalAggregation: + data_object.load() + data_object.close() - if self.metadata.type != AggregationType.GeographicRasterAggregation: - raise Exception("Aggregation is not of type GeoRaster") - if rasterio is None: - raise Exception("rasterio package was not found") + # cache the data object for the aggregation + self._data_object = data_object + return data_object - return self._get_data_object(agg_path=agg_path, func=rasterio.open) + def _validate_aggregation_for_update(self, resource: 'Resource', agg_type: AggregationType) -> None: + if self.metadata.type != agg_type: + raise Exception(f"Not a {agg_type.value} aggregation") - def as_data_object(self, agg_path: str, series_id: str = "") -> \ - Union['pandas.DataFrame', 'fiona.Collection', 'rasterio.DatasetReader', 'xarray.Dataset']: - """ - Loads aggregation data to a relevant data object type. Data for a timeseries aggregation is loaded as pandas - DataFrame, data for a geo feature aggregation os loaded as a fiona Collection object, data for a raster - aggregation is loaded as rasterio DatasetReader object, and data for a netcdf aggregation is loaded as xarray - Dataset object. - :param agg_path: The local path where this aggregation has been downloaded previously. - :param series_id: The series_id of the timeseries result to be converted to a Dataframe object. A value for this - parameter is required only for a timeseries aggregation. - """ + if self._data_object is None: + raise Exception("No data object exists for this aggregation.") - if self.metadata.type == AggregationType.TimeSeriesAggregation: - if not series_id: - raise Exception("Provide the series_id for which the timeseries data object is needed.") - return self.as_series(series_id=series_id, agg_path=agg_path) - if self.metadata.type == AggregationType.MultidimensionalAggregation: - return self.as_multi_dimensional_dataset(agg_path=agg_path) - if self.metadata.type == AggregationType.GeographicFeatureAggregation: - return self.as_feature_collection(agg_path=agg_path) - if self.metadata.type == AggregationType.GeographicRasterAggregation: - return self.as_raster_dataset(agg_path=agg_path) + # check this aggregation is part of the specified resource + aggr = resource.aggregation(file__path=self.main_file_path) + if aggr is None: + raise Exception("This aggregation is not part of the specified resource.") - raise Exception(f"Data object is not supported for '{self.metadata.type}' aggregation type") - def update_netcdf_data(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, - destination_path: str = "") -> 'Aggregation': - """ - Updates the netcdf file associated with this aggregation. Then uploads the updated netcdf file - to create a new aggregation that replaces the original aggregation. - :param resource: The resource object to which this aggregation belongs. - :param agg_path: The local path where this aggregation has been downloaded previously. - :param as_new_aggr: If True a new aggregation will be created, otherwise this aggregation will be - updated/replaced. - :param destination_path: The destination folder path where the new aggregation will be created. This folder - path must already exist in resource. This parameter is used only when 'as_new_aggr' is True. 
- :return: The updated netcdf aggregation or a new netcdf aggregation (an instance of Aggregation) - """ +class NetCDFAggregation(DataObjectSupportingAggregation): + + @classmethod + def create(cls, base_aggr): + return super().create(aggr_cls=cls, base_aggr=base_aggr) + + def as_data_object(self, agg_path: str) -> 'xarray.Dataset': + if self.metadata.type != AggregationType.MultidimensionalAggregation: + raise Exception("Aggregation is not of type NetCDF") + if xarray is None: + raise Exception("xarray package was not found") - # TODO: if we decide that the user will prefer to use `save_data_object` rather than this method, then - # make this method as a private method. + return self._get_data_object(agg_path=agg_path, func=xarray.open_dataset) + + def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, + destination_path: str = "") -> 'Aggregation': self._validate_aggregation_for_update(resource, AggregationType.MultidimensionalAggregation) file_path = self._validate_aggregation_path(agg_path, for_save_data=True) @@ -567,23 +523,29 @@ def update_netcdf_data(self, resource: 'Resource', agg_path: str, as_new_aggr: b aggr._data_object = data_object return aggr - def update_timeseries_data(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, - destination_path: str = "") -> 'Aggregation': - """ - Updates the sqlite file associated with this aggregation. Then uploads the updated sqlite file - to create a new aggregation that either replaces the original aggregation or adds as a new - aggregation. - :param resource: The resource object to which this aggregation belongs. - :param agg_path: The local path where this aggregation has been downloaded previously. - :param as_new_aggr: If True a new aggregation will be created, otherwise this aggregation will be - updated/replaced. - :param destination_path: The destination folder path where the new aggregation will be created. This folder - path must already exist in resource. This parameter is used only when 'as_new_aggr' is True. - :return: The updated timeseries aggregation or a new timeseries aggregation (an instance of Aggregation) - """ - # TODO: if we decide that the user will prefer to use `save_data_object` rather than this method, then - # make this method as a private method. 
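A round-trip sketch for the NetCDF case: load the xarray.Dataset, edit it in memory, and write it back with save_data_object, which replaces the aggregation in place when as_new_aggr is False. Credentials, resource id and the edited attribute value are placeholders; the flow mirrors the accompanying tests:

import os
import tempfile

from hsmodels.schemas.enums import AggregationType
from hsclient import HydroShare

hs = HydroShare("my_user", "my_password")              # placeholder credentials
res = hs.resource("0123456789abcdef0123456789abcdef")  # placeholder resource id
aggr = res.aggregation(type=AggregationType.MultidimensionalAggregation)

with tempfile.TemporaryDirectory() as tmp:
    unzip_to = os.path.join(tmp, "unzipped_aggr")
    os.makedirs(unzip_to)
    agg_path = res.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to)
    ds = aggr.as_data_object(agg_path=agg_path)        # xarray.Dataset (loaded, then closed)
    ds.attrs["title"] = "Edited through hsclient"      # edit the in-memory copy
    # writes the updated netCDF file and replaces the aggregation on HydroShare
    aggr = aggr.save_data_object(resource=res, agg_path=agg_path, as_new_aggr=False)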
+class TimeseriesAggregation(DataObjectSupportingAggregation): + + @classmethod + def create(cls, base_aggr): + return super().create(aggr_cls=cls, base_aggr=base_aggr) + + def as_data_object(self, agg_path: str, series_id: str = "") -> 'pandas.DataFrame': + if pandas is None: + raise Exception("pandas package not found") + + def to_series(timeseries_file: str): + con = sqlite3.connect(timeseries_file) + return pandas.read_sql( + f'SELECT * FROM TimeSeriesResultValues WHERE ResultID IN ' + f'(SELECT ResultID FROM Results WHERE ResultUUID = "{series_id}");', + con, + ).squeeze() + + return self._get_data_object(agg_path=agg_path, func=to_series) + + def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, + destination_path: str = "") -> 'Aggregation': self._validate_aggregation_for_update(resource, AggregationType.TimeSeriesAggregation) file_path = self._validate_aggregation_path(agg_path, for_save_data=True) @@ -637,24 +599,33 @@ def update_timeseries_data(self, resource: 'Resource', agg_path: str, as_new_agg aggr._data_object = data_object return aggr - def update_geo_feature_data(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, - destination_path: str = "") -> 'Aggregation': - """ - Updates the shape files associated with this aggregation. Then uploads all files associated with this - aggregation to create a new aggregation that either replaces the original aggregation or adds as a new - aggregation. - :param resource: The resource object to which this aggregation belongs. - :param agg_path: The local path where this aggregation has been downloaded previously. - :param as_new_aggr: If True a new aggregation will be created, otherwise this aggregation will be - updated/replaced. - :param destination_path: The destination folder path where the new aggregation will be created. This folder - path must already exist in resource. This parameter is used only when 'as_new_aggr' is True. - :return: The updated geo-feature aggregation or a new geo-feature aggregation (an instance of Aggregation) - """ - # TODO: if we decide that the user will prefer to use `save_data_object` rather than this method, then - # make this method as a private method. +class GeoFeatureAggregation(DataObjectSupportingAggregation): + + @classmethod + def create(cls, base_aggr): + return super().create(aggr_cls=cls, base_aggr=base_aggr) + + def _validate_aggregation_path(self, agg_path: str, for_save_data: bool = False) -> str: + if for_save_data: + for aggr_file in self.files(): + aggr_file = basename(aggr_file) + if aggr_file.endswith(".shp.xml") or aggr_file.endswith(".sbn") or aggr_file.endswith(".sbx"): + # these are optional files for geo feature aggregation + continue + if not os.path.exists(os.path.join(agg_path, aggr_file)): + raise Exception(f"Aggregation path '{agg_path}' is not a valid path. " + f"Missing file '{aggr_file}'") + file_path = self._get_file_path(agg_path) + return file_path + + def as_data_object(self, agg_path: str) -> 'fiona.Collection': + if fiona is None: + raise Exception("fiona package was not found") + return self._get_data_object(agg_path=agg_path, func=fiona.open) + def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, + destination_path: str = "") -> 'Aggregation': def upload_shape_files(main_file_path, dst_path=""): shp_file_dir_path = os.path.dirname(main_file_path) filename_starts_with = f"{pathlib.Path(main_file_path).stem}." 
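The timeseries variant follows the same pattern, with the extra series_id argument selecting which result series to load into the pandas DataFrame; save_data_object then pushes the edited values back into the aggregation's sqlite file. A sketch with placeholder credentials and resource id, mirroring the accompanying tests:

import os
import tempfile

from hsmodels.schemas.enums import AggregationType
from hsclient import HydroShare

hs = HydroShare("my_user", "my_password")              # placeholder credentials
res = hs.resource("0123456789abcdef0123456789abcdef")  # placeholder resource id
aggr = res.aggregation(type=AggregationType.TimeSeriesAggregation)
# pick one of the series described in the aggregation metadata
series_id = aggr.metadata.time_series_results[0].series_id

with tempfile.TemporaryDirectory() as tmp:
    unzip_to = os.path.join(tmp, "unzipped_aggr")
    os.makedirs(unzip_to)
    agg_path = res.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to)
    df = aggr.as_data_object(agg_path=agg_path, series_id=series_id)   # pandas.DataFrame
    df.drop(df.index[0:10], axis=0, inplace=True)                      # edit the values
    # write the edited values back to the sqlite file and replace the aggregation
    aggr = aggr.save_data_object(resource=res, agg_path=agg_path, as_new_aggr=False)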
@@ -726,24 +697,115 @@ def upload_shape_files(main_file_path, dst_path=""): aggr._data_object = data_object return aggr + +class GeoRasterAggregation(DataObjectSupportingAggregation): + + @classmethod + def create(cls, base_aggr): + return super().create(aggr_cls=cls, base_aggr=base_aggr) + + def _validate_aggregation_path(self, agg_path: str, for_save_data: bool = False) -> str: + if for_save_data: + tif_file_count = 0 + vrt_file_count = 0 + tif_file_path = "" + vrt_file_path = "" + for item in os.listdir(agg_path): + item_full_path = os.path.join(agg_path, item) + if os.path.isfile(item_full_path): + file_ext = pathlib.Path(item_full_path).suffix.lower() + if file_ext in (".tif", ".tiff"): + tif_file_count += 1 + tif_file_path = item_full_path + elif file_ext == '.vrt': + vrt_file_path = item_full_path + vrt_file_count += 1 + if vrt_file_count > 1: + raise Exception(f"Aggregation path '{agg_path}' is not a valid path. " + f"More than one vrt was file found") + else: + raise Exception(f"Aggregation path '{agg_path}' is not a valid path. " + f"There are files that are not of raster file types") + if tif_file_count == 0: + raise Exception(f"Aggregation path '{agg_path}' is not a valid path. " + f"No tif file was found") + if tif_file_count > 1 and vrt_file_count == 0: + raise Exception(f"Aggregation path '{agg_path}' is not a valid path. " + f"Missing a vrt file") + if vrt_file_path: + file_path = vrt_file_path + else: + file_path = tif_file_path + else: + file_path = self._get_file_path(agg_path) + + return file_path + + def as_data_object(self, agg_path: str) -> 'rasterio.DatasetReader': + if rasterio is None: + raise Exception("rasterio package was not found") + + return self._get_data_object(agg_path=agg_path, func=rasterio.open) + def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, destination_path: str = "") -> 'Aggregation': - """ - Updates the data file(s) of this aggregation using the associated data processing object - and either updates this aggregation or creates a new aggregation using the updated data files. 
- """ - if self.metadata.type == AggregationType.MultidimensionalAggregation: - return self.update_netcdf_data(resource, agg_path, as_new_aggr, destination_path) + def upload_raster_files(dst_path=""): + raster_files = [] + for item in os.listdir(agg_path): + item_full_path = os.path.join(agg_path, item) + if os.path.isfile(item_full_path): + raster_files.append(item_full_path) + resource.file_upload(*raster_files, destination_path=dst_path) + + def get_main_file_path(): + main_file_name = os.path.basename(file_path) + if not main_file_name.lower().endswith('.vrt'): + main_file_name = pathlib.Path(main_file_name).stem + ".vrt" + if destination_path: + aggr_main_file_path = os.path.join(destination_path, main_file_name) + else: + aggr_main_file_path = main_file_name + return aggr_main_file_path + + self._validate_aggregation_for_update(resource, AggregationType.GeographicRasterAggregation) + file_path = self._validate_aggregation_path(agg_path, for_save_data=True) + # aggr_main_file_path = self.main_file_path + # data_object = self._data_object + if not as_new_aggr: + destination_path = dirname(self.main_file_path) + + # cache some of the metadata fields of the original aggregation to update the metadata of the + # updated aggregation + keywords = self.metadata.subjects + additional_meta = self.metadata.additional_metadata - if self.metadata.type == AggregationType.TimeSeriesAggregation: - return self.update_timeseries_data(resource, agg_path, as_new_aggr, destination_path) + # TODO: keep a local backup copy of the aggregation before deleting it + self.delete() + upload_raster_files(dst_path=destination_path) - if self.metadata.type == AggregationType.GeographicFeatureAggregation: - return self.update_geo_feature_data(resource, agg_path, as_new_aggr, destination_path) + # retrieve the updated aggregation + # compute the main file name + aggr_main_file_path = get_main_file_path() + aggr = resource.aggregation(file__path=aggr_main_file_path) + + # update metadata + for kw in keywords: + if kw not in aggr.metadata.subjects: + aggr.metadata.subjects.append(kw) + aggr.metadata.additional_metadata = additional_meta + aggr.save() + else: + # creating a new aggregation by uploading the updated data files + upload_raster_files(dst_path=destination_path) - # TODO: Implement this functionality for Raster aggregation + # retrieve the new aggregation + aggr_main_file_path = get_main_file_path() + agg_path = urljoin(destination_path, os.path.basename(aggr_main_file_path)) + aggr = resource.aggregation(file__path=agg_path) - raise Exception("Saving of data object is not supported for this aggregation type") + data_object = None + aggr._data_object = data_object + return aggr class Resource(Aggregation): @@ -1149,7 +1211,6 @@ def retrieve_string(self, path): def retrieve_file(self, path, save_path=""): file = self.get(path, status_code=200, allow_redirects=True) - cd = file.headers['content-disposition'] filename = urllib.parse.unquote(cd.split("filename=")[1].strip('"')) downloaded_file = os.path.join(save_path, filename) @@ -1173,7 +1234,6 @@ def retrieve_zip(self, path, save_path="", params=None): if params is None: params = {} file = self.get(path, status_code=200, allow_redirects=True, params=params) - json_response = file.json() task_id = json_response['task_id'] download_path = json_response['download_path'] From 6f036f1f4718c4b1d1f1e931154329cc524d2dcb Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 29 Mar 2023 18:19:42 -0400 Subject: [PATCH 09/23] [#44] adding new method to move aggregation --- 
hsclient/hydroshare.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index e562400..cf94cbe 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -1131,6 +1131,31 @@ def aggregation_remove(self, aggregation: Aggregation) -> None: aggregation._hs_session.post(path, status_code=200) aggregation.refresh() + @refresh + def aggregation_move(self, aggregation: Aggregation, dst_path: str = "") -> None: + """ + Moves an aggregation from its current location to another folder in HydroShare. + :param aggregation: The aggregation object to move + :param dst_path: The target file path to move the aggregation to + :return: None + """ + path = urljoin( + aggregation._hsapi_path, + aggregation.metadata.type.value + "LogicalFile", + aggregation.main_file_path, + "functions", + "move-file-type", + dst_path, + ) + response = aggregation._hs_session.post(path, status_code=200) + json_response = response.json() + task_id = json_response['id'] + status = json_response['status'] + if status in ("Not ready", "progress"): + while aggregation._hs_session.check_task(task_id) != 'true': + time.sleep(1) + aggregation.refresh() + @refresh def aggregation_delete(self, aggregation: Aggregation) -> None: """ From 2e52e0ec2f832370f607917a807ab386dd950e29 Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 29 Mar 2023 18:30:40 -0400 Subject: [PATCH 10/23] [#44] using aggregation resmap filename to filter aggregation by file path search --- hsclient/__init__.py | 11 ++++++++++- hsclient/hydroshare.py | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/hsclient/__init__.py b/hsclient/__init__.py index a951514..285d4ec 100644 --- a/hsclient/__init__.py +++ b/hsclient/__init__.py @@ -1,2 +1,11 @@ -from hsclient.hydroshare import Aggregation, File, HydroShare, Resource +from hsclient.hydroshare import ( + Aggregation, + File, + HydroShare, + Resource, + NetCDFAggregation, + TimeseriesAggregation, + GeoRasterAggregation, + GeoFeatureAggregation, +) from hsclient.oauth2_model import Token diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index cf94cbe..40b4849 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -347,9 +347,18 @@ def aggregations(self, **kwargs) -> List[BaseMetadata]: if not file_path: file_path = kwargs.get("files__path", "") if file_path: - for agg in aggregations: - if agg.files(path=file_path): - return [agg] + dir_path = os.path.dirname(file_path) + file_name = pathlib.Path(file_path).stem + if dir_path: + aggr_map_path = urljoin(dir_path, file_name) + else: + aggr_map_path = file_name + + aggr_map_path = f"{aggr_map_path}_resmap.xml" + for aggr in self._parsed_aggregations: + aggr_map_full_path = f"/{aggr._resource_path}/data/contents/{aggr_map_path}" + if aggr._map_path == aggr_map_full_path: + return [aggr] return [] for key, value in kwargs.items(): From 469c4e2945bc9870d63a2ed5b9e0c886028d063f Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 29 Mar 2023 18:34:59 -0400 Subject: [PATCH 11/23] [#44] preventing aggregation delete as part of aggregation update via data object --- hsclient/hydroshare.py | 99 ++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index 40b4849..edfe9ad 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -15,6 +15,7 @@ from pprint import pformat from typing import Callable, Dict, List, TYPE_CHECKING, Union from 
urllib.parse import quote, unquote, urlparse +from uuid import uuid4 from zipfile import ZipFile if TYPE_CHECKING: @@ -179,9 +180,6 @@ def _files(self): @property def _aggregations(self): - def populate_files(_aggr): - _aggr._files - def populate_metadata(_aggr): _aggr._metadata @@ -191,9 +189,8 @@ def populate_metadata(_aggr): if is_aggregation(str(file)): self._parsed_aggregations.append(Aggregation(unquote(file.path), self._hs_session, self._checksums)) - # load files (instances of File) and metadata for all aggregations + # load metadata for all aggregations (metadata is needed to create a typed aggregation) with ThreadPoolExecutor() as executor: - executor.map(populate_files, self._parsed_aggregations) executor.map(populate_metadata, self._parsed_aggregations) # convert aggregations to aggregation type supporting data object @@ -204,11 +201,10 @@ def populate_metadata(_aggr): AggregationType.GeographicFeatureAggregation: GeoFeatureAggregation, } for aggr in aggregations_copy: - typed_aggr = None typed_aggr_cls = typed_aggregation_classes.get(aggr.metadata.type, None) if typed_aggr_cls: typed_aggr = typed_aggr_cls.create(base_aggr=aggr) - if typed_aggr: + # swapping the generic aggregation with the typed aggregation in the aggregation list self._parsed_aggregations.remove(aggr) self._parsed_aggregations.append(typed_aggr) @@ -476,6 +472,24 @@ def _validate_aggregation_for_update(self, resource: 'Resource', agg_type: Aggre if aggr is None: raise Exception("This aggregation is not part of the specified resource.") + def _update_aggregation(self, resource, *files): + temp_folder = uuid4().hex + resource.folder_create(temp_folder) + resource.file_upload(*files, destination_path=temp_folder) + # check aggregation got created in the temp folder + file_path = os.path.join(temp_folder, os.path.basename(self.main_file_path)) + original_aggr_dir_path = dirname(self.main_file_path) + aggr = resource.aggregation(file__path=file_path) + if aggr is not None: + # delete this aggregation which will be replaced with the updated aggregation + self.delete() + # move the aggregation from the temp folder to the location of the deleted aggregation + resource.aggregation_move(aggr, dst_path=original_aggr_dir_path) + + resource.folder_delete(temp_folder) + if aggr is None: + raise Exception("Failed to update aggregation") + class NetCDFAggregation(DataObjectSupportingAggregation): @@ -484,11 +498,8 @@ def create(cls, base_aggr): return super().create(aggr_cls=cls, base_aggr=base_aggr) def as_data_object(self, agg_path: str) -> 'xarray.Dataset': - if self.metadata.type != AggregationType.MultidimensionalAggregation: - raise Exception("Aggregation is not of type NetCDF") if xarray is None: raise Exception("xarray package was not found") - return self._get_data_object(agg_path=agg_path, func=xarray.open_dataset) def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, @@ -498,18 +509,14 @@ def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: boo file_path = self._validate_aggregation_path(agg_path, for_save_data=True) self._data_object.to_netcdf(file_path, format="NETCDF4") aggr_main_file_path = self.main_file_path - data_object = self._data_object if not as_new_aggr: - destination_path = dirname(self.main_file_path) - # cache some of the metadata fields of the original aggregation to update the metadata of the # updated aggregation keywords = self.metadata.subjects additional_meta = self.metadata.additional_metadata - # TODO: keep a local backup copy of 
the aggregation before deleting it - self.delete() - resource.file_upload(file_path, destination_path=destination_path) + # upload the updated aggregation files + self._update_aggregation(resource, file_path) # retrieve the updated aggregation aggr = resource.aggregation(file__path=aggr_main_file_path) @@ -521,15 +528,14 @@ def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: boo aggr.metadata.additional_metadata = additional_meta aggr.save() else: - # creating a new aggregation by uploading the updated data files + # creating a new aggregation resource.file_upload(file_path, destination_path=destination_path) # retrieve the new aggregation agg_path = urljoin(destination_path, os.path.basename(aggr_main_file_path)) aggr = resource.aggregation(file__path=agg_path) - data_object = None - aggr._data_object = data_object + aggr._data_object = None return aggr @@ -571,8 +577,6 @@ def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: boo aggr_main_file_path = self.main_file_path data_object = self._data_object if not as_new_aggr: - destination_path = dirname(self.main_file_path) - # cache some of the metadata fields of the original aggregation to update the metadata of the # updated aggregation keywords = self.metadata.subjects @@ -580,11 +584,8 @@ def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: boo title = self.metadata.title abstract = self.metadata.abstract - # TODO: If the creation of the replacement aggregation fails for some reason, then with the following - # delete action we will lose this aggregation from HydroShare. Need to keep a copy of the - # original aggregation locally so that we can upload that to HydroShare if needed. - self.delete() - resource.file_upload(file_path, destination_path=destination_path) + # upload the updated aggregation files to the temp folder - to create the updated aggregation + self._update_aggregation(resource, file_path) # retrieve the updated aggregation aggr = resource.aggregation(file__path=aggr_main_file_path) @@ -643,32 +644,30 @@ def upload_shape_files(main_file_path, dst_path=""): if item.startswith(filename_starts_with): file_full_path = os.path.join(shp_file_dir_path, item) shape_files.append(file_full_path) - resource.file_upload(*shape_files, destination_path=dst_path) + + if not dst_path: + self._update_aggregation(resource, *shape_files) + else: + resource.file_upload(*shape_files, destination_path=dst_path) self._validate_aggregation_for_update(resource, AggregationType.GeographicFeatureAggregation) file_path = self._validate_aggregation_path(agg_path, for_save_data=True) aggr_main_file_path = self.main_file_path data_object = self._data_object + # need to close the fiona.Collection object to free up access to all the original shape files + data_object.close() if not as_new_aggr: - destination_path = dirname(self.main_file_path) - # cache some of the metadata fields of the original aggregation to update the metadata of the # updated aggregation keywords = self.metadata.subjects additional_meta = self.metadata.additional_metadata - # TODO: keep a local backup copy of the aggregation before deleting it - self.delete() # copy the updated shape files to the original shape file location where the user downloaded the # aggregation previously src_shp_file_dir_path = os.path.dirname(file_path) tgt_shp_file_dir_path = os.path.dirname(data_object.path) - agg_path = tgt_shp_file_dir_path filename_starts_with = f"{pathlib.Path(file_path).stem}." 
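For shapefile-backed aggregations the same pattern applies, except the edited shapefile parts are written to a separate local folder and that folder is handed to save_data_object, which uploads the related files and replaces the aggregation. A sketch assuming placeholder credentials and resource id, mirroring the accompanying tests:

import os
import tempfile

import fiona
from hsmodels.schemas.enums import AggregationType
from hsclient import HydroShare

hs = HydroShare("my_user", "my_password")              # placeholder credentials
res = hs.resource("0123456789abcdef0123456789abcdef")  # placeholder resource id
aggr = res.aggregation(type=AggregationType.GeographicFeatureAggregation)

with tempfile.TemporaryDirectory() as tmp:
    unzip_to = os.path.join(tmp, "unzipped_aggr")
    os.makedirs(unzip_to)
    agg_path = res.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to)
    collection = aggr.as_data_object(agg_path=agg_path)    # fiona.Collection
    updated_dir = os.path.join(tmp, "updated_aggr")
    os.makedirs(updated_dir)
    out_path = os.path.join(updated_dir, os.path.basename(collection.path))
    # write a copy of the shapefile; real code would filter or edit features here
    with fiona.open(out_path, "w", schema=collection.schema,
                    driver=collection.driver, crs=collection.crs) as out_shp:
        for feature in collection:
            out_shp.write(feature)
    # uploads the shapefile parts from updated_dir and replaces the aggregation
    aggr = aggr.save_data_object(resource=res, agg_path=updated_dir, as_new_aggr=False)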
- # need to close the fiona.Collection object to free up access to all the original shape files - data_object.close() - for item in os.listdir(src_shp_file_dir_path): if item.startswith(filename_starts_with): src_file_full_path = os.path.join(src_shp_file_dir_path, item) @@ -676,7 +675,7 @@ def upload_shape_files(main_file_path, dst_path=""): shutil.copyfile(src_file_full_path, tgt_file_full_path) # upload the updated shape files to replace this aggregation - upload_shape_files(main_file_path=data_object.path, dst_path=destination_path) + upload_shape_files(main_file_path=data_object.path) # retrieve the updated aggregation aggr = resource.aggregation(file__path=aggr_main_file_path) @@ -687,23 +686,15 @@ def upload_shape_files(main_file_path, dst_path=""): aggr.metadata.subjects.append(kw) aggr.metadata.additional_metadata = additional_meta aggr.save() - - # load aggregation data to fiona Collection object - data_object = aggr.as_data_object(agg_path=agg_path) else: - # creating a new aggregation - # close the original fiona Collection object - data_object.close() - # upload the updated shape files to create a new geo feature aggregation upload_shape_files(main_file_path=file_path, dst_path=destination_path) # retrieve the new aggregation agg_path = urljoin(destination_path, os.path.basename(aggr_main_file_path)) aggr = resource.aggregation(file__path=agg_path) - data_object = None - aggr._data_object = data_object + aggr._data_object = None return aggr @@ -753,7 +744,6 @@ def _validate_aggregation_path(self, agg_path: str, for_save_data: bool = False) def as_data_object(self, agg_path: str) -> 'rasterio.DatasetReader': if rasterio is None: raise Exception("rasterio package was not found") - return self._get_data_object(agg_path=agg_path, func=rasterio.open) def save_data_object(self, resource: 'Resource', agg_path: str, as_new_aggr: bool = False, @@ -764,7 +754,11 @@ def upload_raster_files(dst_path=""): item_full_path = os.path.join(agg_path, item) if os.path.isfile(item_full_path): raster_files.append(item_full_path) - resource.file_upload(*raster_files, destination_path=dst_path) + + if not dst_path: + self._update_aggregation(resource, *raster_files) + else: + resource.file_upload(*raster_files, destination_path=dst_path) def get_main_file_path(): main_file_name = os.path.basename(file_path) @@ -778,8 +772,6 @@ def get_main_file_path(): self._validate_aggregation_for_update(resource, AggregationType.GeographicRasterAggregation) file_path = self._validate_aggregation_path(agg_path, for_save_data=True) - # aggr_main_file_path = self.main_file_path - # data_object = self._data_object if not as_new_aggr: destination_path = dirname(self.main_file_path) @@ -787,14 +779,10 @@ def get_main_file_path(): # updated aggregation keywords = self.metadata.subjects additional_meta = self.metadata.additional_metadata - - # TODO: keep a local backup copy of the aggregation before deleting it - self.delete() upload_raster_files(dst_path=destination_path) - # retrieve the updated aggregation - # compute the main file name aggr_main_file_path = get_main_file_path() + # retrieve the updated aggregation aggr = resource.aggregation(file__path=aggr_main_file_path) # update metadata @@ -812,8 +800,7 @@ def get_main_file_path(): agg_path = urljoin(destination_path, os.path.basename(aggr_main_file_path)) aggr = resource.aggregation(file__path=agg_path) - data_object = None - aggr._data_object = data_object + aggr._data_object = None return aggr From 72f36fef9fe23b46ba102f066e4e2cad7db659a5 Mon Sep 17 00:00:00 
2001 From: pkdash Date: Wed, 29 Mar 2023 18:39:23 -0400 Subject: [PATCH 12/23] [#44] adding tests for aggregation data object --- tests/conftest.py | 82 ++++++++++ tests/test_data_objects.py | 317 +++++++++++++++++++++++++++++++++++++ tests/test_functional.py | 113 +++---------- 3 files changed, 423 insertions(+), 89 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/test_data_objects.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..3ff5d13 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,82 @@ +import os +import pytest +from hsclient import HydroShare + + +@pytest.fixture(scope="function") +def change_test_dir(request): + os.chdir(request.fspath.dirname) + yield + os.chdir(request.config.invocation_dir) + + +@pytest.fixture() +def hydroshare(change_test_dir): + hs = HydroShare(os.getenv("HYDRO_USERNAME"), os.getenv("HYDRO_PASSWORD"), host="beta.hydroshare.org") + return hs + + +@pytest.fixture() +def new_resource(hydroshare): + new_resource = hydroshare.create() + yield new_resource + try: + new_resource.delete() + except: + # resource already deleted + pass + + +@pytest.fixture() +def resource(new_resource): + new_resource.file_upload("data/georaster_composite.zip", refresh=False) + new_resource.file_unzip("georaster_composite.zip", refresh=False) + return new_resource + + +@pytest.fixture() +def timeseries_resource(new_resource): + files = [ + "ODM2_Multi_Site_One_Variable.sqlite", + "ODM2_Multi_Site_One_Variable_resmap.xml", + "ODM2_Multi_Site_One_Variable_meta.xml", + ] + root_path = "data/test_resource_metadata_files/" + new_resource.file_upload(*[os.path.join(root_path, file) for file in files], refresh=False) + return new_resource + + +@pytest.fixture() +def resource_with_netcdf_aggr(new_resource): + files = [ + "SWE_time.nc", + "SWE_time_header_info.txt", + "SWE_time_resmap.xml", + "SWE_time_meta.xml", + ] + root_path = "data/test_resource_metadata_files/" + new_resource.file_upload(*[os.path.join(root_path, file) for file in files], refresh=False) + return new_resource + + +@pytest.fixture() +def resource_with_geofeature_aggr(new_resource): + files = [ + "watersheds.shp", + "watersheds.cpg", + "watersheds.dbf", + "watersheds.prj", + "watersheds.sbn", + "watersheds.sbx", + "watersheds.shx", + "watersheds_resmap.xml", + "watersheds_meta.xml", + ] + root_path = "data/test_resource_metadata_files/" + new_resource.file_upload(*[os.path.join(root_path, file) for file in files], refresh=False) + return new_resource + + +@pytest.fixture() +def resource_with_raster_aggr(resource): + return resource diff --git a/tests/test_data_objects.py b/tests/test_data_objects.py new file mode 100644 index 0000000..25c0e8c --- /dev/null +++ b/tests/test_data_objects.py @@ -0,0 +1,317 @@ +import os +import tempfile + +import fiona +import pytest +import rasterio +from fiona.model import to_dict +from hsmodels.schemas.enums import AggregationType +from rasterio.windows import Window + +from hsclient import GeoFeatureAggregation, GeoRasterAggregation, NetCDFAggregation, TimeseriesAggregation + + +@pytest.mark.parametrize("search_by", ["type", "file_path"]) +def test_timeseries_as_data_object(timeseries_resource, search_by): + timeseries_resource.refresh() + if search_by == 'type': + aggr = timeseries_resource.aggregation(type=AggregationType.TimeSeriesAggregation) + else: + file_path = "ODM2_Multi_Site_One_Variable.sqlite" + aggr = timeseries_resource.aggregation(file__path=file_path) + + assert type(aggr) is TimeseriesAggregation + 
series_result = next( + r for r in aggr.metadata.time_series_results if r.series_id == "2837b7d9-1ebc-11e6-a16e-f45c8999816f" + ) + + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + agg_path = timeseries_resource.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to) + # load timseries data for the specified series to pandas DataFrame + pd_dataframe = aggr.as_data_object(agg_path=agg_path, series_id=series_result.series_id) + assert len(pd_dataframe) == 1333 + + +@pytest.mark.parametrize("as_new_aggr", [False, True]) +def test_timeseries_save_data_object(timeseries_resource, as_new_aggr): + timeseries_resource.refresh() + file_path = "ODM2_Multi_Site_One_Variable.sqlite" + aggr = timeseries_resource.aggregation(file__path=file_path) + assert aggr is not None + assert type(aggr) is TimeseriesAggregation + series_id = '4a6f095c-1ebc-11e6-8a10-f45c8999816f' + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + agg_path = timeseries_resource.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to) + pd_dataframe = aggr.as_data_object(agg_path=agg_path, series_id=series_id) + assert pd_dataframe.__class__.__name__ == "DataFrame" + rows, columns = pd_dataframe.shape + # delete 10 rows + pd_dataframe.drop(pd_dataframe.index[0:10], axis=0, inplace=True) + dst_path = "" + if as_new_aggr: + dst_path = "raster_aggr_folder" + timeseries_resource.folder_create(dst_path) + + aggr = aggr.save_data_object(resource=timeseries_resource, agg_path=agg_path, as_new_aggr=as_new_aggr, + destination_path=dst_path) + assert type(aggr) is TimeseriesAggregation + + # check the updated/new timeseries aggregation + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + if as_new_aggr: + file_path = f"{dst_path}/{file_path}" + + aggr = timeseries_resource.aggregation(file__path=file_path) + assert aggr is not None + assert type(aggr) is TimeseriesAggregation + + agg_path = timeseries_resource.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to) + pd_dataframe = aggr.as_data_object(agg_path=agg_path, series_id=series_id) + updated_rows, update_columns = pd_dataframe.shape + assert rows == updated_rows + 10 + assert columns == update_columns + + +@pytest.mark.parametrize("search_by", ["type", "file_path"]) +def test_raster_as_data_object(resource_with_raster_aggr, search_by): + resource_with_raster_aggr.refresh() + if search_by == "type": + aggr = resource_with_raster_aggr.aggregation(type=AggregationType.GeographicRasterAggregation) + else: + file_path = "logan.vrt" + aggr = resource_with_raster_aggr.aggregation(file__path=file_path) + + assert type(aggr) is GeoRasterAggregation + + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + agg_path = resource_with_raster_aggr.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to) + + dataset = aggr.as_data_object(agg_path=agg_path) + assert dataset.__class__.__name__ == "DatasetReader" + # raster should have 1 band + assert dataset.count == 1 + + +@pytest.mark.parametrize("as_new_aggr", [False, True]) +def test_raster_save_data_object(resource_with_raster_aggr, as_new_aggr): + resource_with_raster_aggr.refresh() + file_path = "logan.vrt" + aggr = 
resource_with_raster_aggr.aggregation(file__path=file_path) + assert aggr is not None + assert type(aggr) is GeoRasterAggregation + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + agg_path = resource_with_raster_aggr.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to) + rasterio_reader = aggr.as_data_object(agg_path=agg_path) + assert rasterio_reader.__class__.__name__ == "DatasetReader" + # edit raster data - sub-setting the dataset + new_width = rasterio_reader.width - 9 + new_height = rasterio_reader.height - 10 + updated_width = new_width + updated_height = new_height + subset_window = Window(0, 0, new_width, new_height) + subset_band = rasterio_reader.read(1, window=subset_window) + output_raster_dir_path = os.path.join(tmp, "updated_aggr") + os.makedirs(output_raster_dir_path) + output_raster_file_path = os.path.join(output_raster_dir_path, "logan.tif") + profile = rasterio_reader.profile + rasterio_reader.close() + profile['driver'] = "GTiff" + profile['width'] = new_width + profile['height'] = new_height + + with rasterio.open(output_raster_file_path, "w", **profile) as dst: + dst.write(subset_band, 1) + + dst_path = "" + if as_new_aggr: + dst_path = "raster_aggr_folder" + resource_with_raster_aggr.folder_create(dst_path) + + # save the new tif file to update the aggregation or create a new aggregation + aggr = aggr.save_data_object(resource=resource_with_raster_aggr, agg_path=output_raster_dir_path, + as_new_aggr=as_new_aggr, destination_path=dst_path) + assert aggr is not None + assert type(aggr) is GeoRasterAggregation + + # check the updated raster aggregation + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + if as_new_aggr: + file_path = f"{dst_path}/{file_path}" + + aggr = resource_with_raster_aggr.aggregation(file__path=file_path) + assert aggr is not None + assert type(aggr) is GeoRasterAggregation + + agg_path = resource_with_raster_aggr.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to) + rasterio_reader = aggr.as_data_object(agg_path=agg_path) + assert updated_height == rasterio_reader.height + assert updated_width == rasterio_reader.width + + +@pytest.mark.parametrize("search_by", ["type", "file_path"]) +def test_netcdf_as_data_object(resource_with_netcdf_aggr, search_by): + resource_with_netcdf_aggr.refresh() + if search_by == 'type': + aggr = resource_with_netcdf_aggr.aggregation(type=AggregationType.MultidimensionalAggregation) + else: + file_path = "SWE_time.nc" + aggr = resource_with_netcdf_aggr.aggregation(file__path=file_path) + + assert type(aggr) is NetCDFAggregation + + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + agg_path = resource_with_netcdf_aggr.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to) + dataset = aggr.as_data_object(agg_path=agg_path) + assert dataset.__class__.__name__ == "Dataset" + # netcdf dimensions + assert dataset.dims['time'] == 2184 + + +@pytest.mark.parametrize("as_new_aggr", [False, True]) +def test_netcdf_save_data_object(resource_with_netcdf_aggr, as_new_aggr): + resource_with_netcdf_aggr.refresh() + file_path = "SWE_time.nc" + aggr = resource_with_netcdf_aggr.aggregation(file__path=file_path) + assert aggr is not None + assert type(aggr) is NetCDFAggregation + with 
tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + agg_path = resource_with_netcdf_aggr.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to) + xr_dataset = aggr.as_data_object(agg_path=agg_path) + assert xr_dataset.__class__.__name__ == "Dataset" + agg_title = "This is a modified title for this aggregation by hsclient" + xr_dataset.attrs["title"] = agg_title + dst_path = "" + if as_new_aggr: + dst_path = "netcdf_aggr_folder" + resource_with_netcdf_aggr.folder_create(dst_path) + + aggr = aggr.save_data_object(resource=resource_with_netcdf_aggr, agg_path=agg_path, as_new_aggr=as_new_aggr, + destination_path=dst_path) + + assert type(aggr) is NetCDFAggregation + xr_dataset = aggr.as_data_object(agg_path=agg_path) + assert xr_dataset.attrs["title"] == agg_title + + # check the updated/new netcdf aggregation + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + if as_new_aggr: + file_path = f"{dst_path}/{file_path}" + + aggr = resource_with_netcdf_aggr.aggregation(file__path=file_path) + assert aggr is not None + assert type(aggr) is NetCDFAggregation + + agg_path = resource_with_netcdf_aggr.aggregation_download(aggregation=aggr, save_path=tmp, unzip_to=unzip_to) + xr_dataset = aggr.as_data_object(agg_path=agg_path) + assert xr_dataset.attrs["title"] == agg_title + + +@pytest.mark.parametrize("search_by", ["type", "file_path"]) +def test_geofeature_as_data_object(resource_with_geofeature_aggr, search_by): + resource_with_geofeature_aggr.refresh() + if search_by == "type": + aggr = resource_with_geofeature_aggr.aggregation(type=AggregationType.GeographicFeatureAggregation) + else: + file_path = "watersheds.shp" + aggr = resource_with_geofeature_aggr.aggregation(file__path=file_path) + + assert aggr is not None + assert type(aggr) is GeoFeatureAggregation + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + agg_path = resource_with_geofeature_aggr.aggregation_download(aggregation=aggr, save_path=tmp, + unzip_to=unzip_to) + fn_collection = aggr.as_data_object(agg_path=agg_path) + assert fn_collection.__class__.__name__ == "Collection" + # check projection type + assert str(fn_collection.crs) == "EPSG:26912" + # close the fiona collection object so that the temp dir can be cleaned up. 
+ fn_collection.close() + + +@pytest.mark.parametrize("as_new_aggr", [False, True]) +def test_geofeature_save_data_object(resource_with_geofeature_aggr, as_new_aggr): + resource_with_geofeature_aggr.refresh() + file_path = "watersheds.shp" + aggr = resource_with_geofeature_aggr.aggregation(file__path=file_path) + assert aggr is not None + assert type(aggr) is GeoFeatureAggregation + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + agg_path = resource_with_geofeature_aggr.aggregation_download(aggregation=aggr, save_path=tmp, + unzip_to=unzip_to) + fn_collection = aggr.as_data_object(agg_path=agg_path) + assert fn_collection.__class__.__name__ == "Collection" + original_shp_filename = os.path.basename(fn_collection.path) + updated_shp_file_dir = os.path.join(tmp, "updated_aggr") + os.makedirs(updated_shp_file_dir) + output_shp_file_path = os.path.join(updated_shp_file_dir, original_shp_filename) + with fiona.open(output_shp_file_path, 'w', schema=fn_collection.schema, driver=fn_collection.driver, + crs=fn_collection.crs) as out_shp_file: + for feature in fn_collection: + ft_dict = to_dict(feature) + if ft_dict['properties']['Id'] < 5: + out_shp_file.write(feature) + + dst_path = "" + if as_new_aggr: + dst_path = "geo_aggr_folder" + resource_with_geofeature_aggr.folder_create(dst_path) + + aggr = aggr.save_data_object(resource=resource_with_geofeature_aggr, agg_path=updated_shp_file_dir, + as_new_aggr=as_new_aggr, destination_path=dst_path) + assert aggr is not None + assert type(aggr) is GeoFeatureAggregation + assert aggr.data_object is None + + # check the updated geo-feature aggregation + with tempfile.TemporaryDirectory() as tmp: + # download aggregation + unzip_to = os.path.join(tmp, "unzipped_aggr") + os.makedirs(unzip_to) + if as_new_aggr: + file_path = f"{dst_path}/{file_path}" + + aggr = resource_with_geofeature_aggr.aggregation(file__path=file_path) + assert aggr is not None + assert type(aggr) is GeoFeatureAggregation + + agg_path = resource_with_geofeature_aggr.aggregation_download(aggregation=aggr, save_path=tmp, + unzip_to=unzip_to) + fn_collection = aggr.as_data_object(agg_path=agg_path) + for feature in fn_collection: + ft_dict = to_dict(feature) + assert ft_dict['properties']['Id'] < 5 + # need to close the data object so that the tmp directory can be cleaned up + fn_collection.close() diff --git a/tests/test_functional.py b/tests/test_functional.py index 6710e02..08b2020 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -8,67 +8,6 @@ from hsclient import HydroShare -@pytest.fixture(scope="function") -def change_test_dir(request): - os.chdir(request.fspath.dirname) - yield - os.chdir(request.config.invocation_dir) - - -@pytest.fixture() -def hydroshare(change_test_dir): - hs = HydroShare(os.getenv("HYDRO_USERNAME"), os.getenv("HYDRO_PASSWORD"), host="beta.hydroshare.org") - return hs - - -@pytest.fixture() -def new_resource(hydroshare): - new_resource = hydroshare.create() - yield new_resource - try: - new_resource.delete() - except: - # resource already deleted - pass - - -@pytest.fixture() -def resource(new_resource): - new_resource.file_upload("data/georaster_composite.zip", refresh=False) - new_resource.file_unzip("georaster_composite.zip", refresh=False) - return new_resource - - -@pytest.fixture() -def timeseries_resource(new_resource): - files = [ - "ODM2_Multi_Site_One_Variable.sqlite", - "ODM2_Multi_Site_One_Variable_resmap.xml", - 
"ODM2_Multi_Site_One_Variable_meta.xml", - ] - root_path = "data/test_resource_metadata_files/" - new_resource.file_upload(*[os.path.join(root_path, file) for file in files], refresh=False) - return new_resource - - -@pytest.fixture() -def resource_with_netcdf_aggr(new_resource): - files = [ - "SWE_time.nc", - "SWE_time_header_info.txt", - "SWE_time_resmap.xml", - "SWE_time_meta.xml", - ] - root_path = "data/test_resource_metadata_files/" - new_resource.file_upload(*[os.path.join(root_path, file) for file in files], refresh=False) - return new_resource - - -@pytest.fixture() -def resource_with_raster_aggr(resource): - return resource - - def test_absolute_path_multiple_file_upload(new_resource): files = [ "other.txt", @@ -319,6 +258,30 @@ def test_aggregation_remove(resource): assert len(resource.files()) == 4 +def test_move_aggregation(resource_with_netcdf_aggr): + resource_with_netcdf_aggr.refresh() + assert len(resource_with_netcdf_aggr.aggregations()) == 1 + agg = resource_with_netcdf_aggr.aggregations()[0] + main_file = agg.main_file_path + # create a folder to move the aggregation to + folder = "netcdf-aggregation" + resource_with_netcdf_aggr.folder_create(folder) + resource_with_netcdf_aggr.aggregation_move(agg, dst_path=folder) + assert len(resource_with_netcdf_aggr.aggregations()) == 1 + file_path = f"{folder}/{main_file}" + agg = resource_with_netcdf_aggr.aggregation(file__path=file_path) + assert agg is not None + # now move back the aggregation to the root of the resource + resource_with_netcdf_aggr.aggregation_move(agg, dst_path="") + file_path = main_file + agg = resource_with_netcdf_aggr.aggregation(file__path=file_path) + assert agg is not None + # check there is no aggregation in the folder + file_path = f"{folder}/{main_file}" + agg = resource_with_netcdf_aggr.aggregation(file__path=file_path) + assert agg is None + + def test_file_upload_and_rename(new_resource): assert len(new_resource.files()) == 0 new_resource.file_upload("data/other.txt", refresh=False) @@ -514,34 +477,6 @@ def test_aggregation_fileset(new_resource, files): assert len(new_resource.files()) == 0 -def test_pandas_series(timeseries_resource): - timeseries_resource.refresh() - timeseries = timeseries_resource.aggregation(type=AggregationType.TimeSeriesAggregation) - series_result = next( - r for r in timeseries.metadata.time_series_results if r.series_id == "2837b7d9-1ebc-11e6-a16e-f45c8999816f" - ) - series = timeseries.as_series(series_result.series_id, "data/test_resource_metadata_files") - assert len(series) == 1333 - - -def test_raster_as_data_object(resource_with_raster_aggr): - resource_with_raster_aggr.refresh() - raster_aggr = resource_with_raster_aggr.aggregation(type=AggregationType.GeographicRasterAggregation) - dataset = raster_aggr.as_data_object(agg_path="data/test_resource_metadata_files") - assert dataset.__class__.__name__ == "DatasetReader" - # raster should have 1 band - assert dataset.count == 1 - - -def test_netcdf_as_data_object(resource_with_netcdf_aggr): - resource_with_netcdf_aggr.refresh() - nc_aggr = resource_with_netcdf_aggr.aggregation(type=AggregationType.MultidimensionalAggregation) - dataset = nc_aggr.as_data_object(agg_path="data/test_resource_metadata_files") - assert dataset.__class__.__name__ == "Dataset" - # netcdf dimensions - assert dataset.dims['time'] == 2184 - - def test_folder_zip(new_resource): new_resource.folder_create("test_folder", refresh=False) new_resource.file_upload("data/other.txt", destination_path="test_folder", refresh=False) From 
c0fd80fbf7f2ca6a883a0ff1daf1f96cb535ec8c Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 19 Apr 2023 14:13:31 -0400 Subject: [PATCH 13/23] [#44] enabling skipped tests --- tests/test_functional.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index 08b2020..5977a87 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -247,7 +247,6 @@ def test_aggregation_delete(resource): assert len(resource.files()) == 1 -@pytest.mark.skip(reason="this test fails due to a bug (#4995) in hydroshare") def test_aggregation_remove(resource): resource.refresh() assert len(resource.aggregations()) == 1 @@ -370,7 +369,6 @@ def test_empty_creator(new_resource): assert "creators list must have at least one creator" in str(e) -@pytest.mark.skip(reason="this test fails due to a bug (#4995) in hydroshare") @pytest.mark.parametrize( "files", [ @@ -408,21 +406,10 @@ def test_aggregations(new_resource, files): assert len(agg.files()) == aggr_file_count new_resource.aggregation_remove(agg) assert len(new_resource.aggregations()) == 0 - if agg_type == "GeoRaster": - # TODO: Due to a bug (#4995) in hydroshare, the vrt file of the aggregation gets deleted when the aggregation - # is removed - file_count = file_count - 1 - elif agg_type == "NetCDF": + if agg_type == "NetCDF": # the txt file of the aggregation gets deleted when the netcdf aggregation is removed. file_count = file_count - 1 - assert len(new_resource.files()) == file_count - if agg_type == "GeoRaster": - # TODO: Due to a bug (#4995) in hydroshare, the vrt file of the aggregation gets deleted when the aggregation - # is removed -so we need to upload that vrt file again for now - new_resource.file_upload(os.path.join(root_path, files[2])) - assert len(new_resource.files()) == file_count + 1 - main_file = next(f for f in new_resource.files() if f.path.endswith(files[0])) assert main_file agg = new_resource.file_aggregate(main_file, agg_type) @@ -438,7 +425,6 @@ def test_aggregations(new_resource, files): assert len(new_resource.files()) == 0 -@pytest.mark.skip(reason="there is a bug (#4998) in hydroshare that causes this test to fail") @pytest.mark.parametrize( "files", [ @@ -524,7 +510,6 @@ def test_folder_download(new_resource): assert os.path.basename(downloaded_folder) == "test_folder.zip" -# @pytest.mark.skip("Requires hydroshare update to url encode resourcemap urls") def test_filename_spaces(hydroshare): res = hydroshare.create() res.folder_create("with spaces", refresh=False) From 6f2747c8d5319205c67159793adf8ce2085fc9a4 Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 21 Apr 2023 15:07:08 -0400 Subject: [PATCH 14/23] [#44] adding a flag to use file path to find aggregation --- hsclient/hydroshare.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index edfe9ad..20ee11b 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -189,7 +189,7 @@ def populate_metadata(_aggr): if is_aggregation(str(file)): self._parsed_aggregations.append(Aggregation(unquote(file.path), self._hs_session, self._checksums)) - # load metadata for all aggregations (metadata is needed to create a typed aggregation) + # load metadata for all aggregations (metadata is needed to create any typed aggregation) with ThreadPoolExecutor() as executor: executor.map(populate_metadata, self._parsed_aggregations) @@ -339,10 +339,11 @@ def aggregations(self, **kwargs) -> List[BaseMetadata]: 
aggregations = self._aggregations # when searching using 'file__path' or files__path' as the key, there can be only one matching aggregation + file_path_priority = kwargs.pop("file_path_priority", True) file_path = kwargs.get("file__path", "") if not file_path: file_path = kwargs.get("files__path", "") - if file_path: + if file_path and file_path_priority: dir_path = os.path.dirname(file_path) file_name = pathlib.Path(file_path).stem if dir_path: @@ -408,7 +409,7 @@ def delete(self) -> None: class DataObjectSupportingAggregation(Aggregation): - """Base class for any aggregation supporting aggregation type specific data analysis object (e.g. pandas)""" + """Base class for any aggregation supporting aggregation type specific data manipulation object (e.g. pandas)""" @staticmethod def create(aggr_cls, base_aggr): @@ -488,7 +489,8 @@ def _update_aggregation(self, resource, *files): resource.folder_delete(temp_folder) if aggr is None: - raise Exception("Failed to update aggregation") + err_msg = f"Failed to update aggregation. Aggregation was not found at: {file_path}" + raise Exception(err_msg) class NetCDFAggregation(DataObjectSupportingAggregation): @@ -1081,6 +1083,10 @@ def file_aggregate(self, path: str, agg_type: AggregationType, refresh: bool = T if refresh: # Only return the newly created aggregation if a refresh is requested self.refresh() + if agg_type == AggregationType.GeographicRasterAggregation and not path.endswith(".vrt") \ + or agg_type == AggregationType.FileSetAggregation: + # search all files of the aggregation to find a matching aggregation + return self.aggregation(file__path=path, file_path_priority=False) return self.aggregation(file__path=path) @refresh @@ -1132,7 +1138,7 @@ def aggregation_move(self, aggregation: Aggregation, dst_path: str = "") -> None """ Moves an aggregation from its current location to another folder in HydroShare. 
:param aggregation: The aggregation object to move - :param dst_path: The target file path to move the aggregation to + :param dst_path: The target file path to move the aggregation to - target folder must exist :return: None """ path = urljoin( From 5efe36c29c7111dae4b554a820443155738ff866 Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 21 Apr 2023 15:08:09 -0400 Subject: [PATCH 15/23] [#44] compute aggregation path for updated aggregation --- hsclient/hydroshare.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index 20ee11b..62d6202 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -473,12 +473,16 @@ def _validate_aggregation_for_update(self, resource: 'Resource', agg_type: Aggre if aggr is None: raise Exception("This aggregation is not part of the specified resource.") + def _compute_updated_aggregation_path(self, temp_folder, *files) -> str: + file_path = os.path.join(temp_folder, os.path.basename(self.main_file_path)) + return file_path + def _update_aggregation(self, resource, *files): temp_folder = uuid4().hex resource.folder_create(temp_folder) resource.file_upload(*files, destination_path=temp_folder) # check aggregation got created in the temp folder - file_path = os.path.join(temp_folder, os.path.basename(self.main_file_path)) + file_path = self._compute_updated_aggregation_path(temp_folder, *files) original_aggr_dir_path = dirname(self.main_file_path) aggr = resource.aggregation(file__path=file_path) if aggr is not None: @@ -706,6 +710,19 @@ class GeoRasterAggregation(DataObjectSupportingAggregation): def create(cls, base_aggr): return super().create(aggr_cls=cls, base_aggr=base_aggr) + def _compute_updated_aggregation_path(self, temp_folder, *files) -> str: + file_path = "" + for _file in files: + filename = os.path.basename(_file) + if filename.endswith(".vrt"): + file_path = os.path.join(temp_folder, filename) + break + else: + filename = pathlib.Path(filename).stem + ".vrt" + file_path = os.path.join(temp_folder, filename) + break + return file_path + def _validate_aggregation_path(self, agg_path: str, for_save_data: bool = False) -> str: if for_save_data: tif_file_count = 0 From feb1e84e24f506c837130d331604cc60eaca45b0 Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 21 Apr 2023 15:09:30 -0400 Subject: [PATCH 16/23] [#44] fixing tests for aggregation data objects --- tests/test_data_objects.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_data_objects.py b/tests/test_data_objects.py index 25c0e8c..49af289 100644 --- a/tests/test_data_objects.py +++ b/tests/test_data_objects.py @@ -1,4 +1,5 @@ import os +import pathlib import tempfile import fiona @@ -127,7 +128,8 @@ def test_raster_save_data_object(resource_with_raster_aggr, as_new_aggr): subset_band = rasterio_reader.read(1, window=subset_window) output_raster_dir_path = os.path.join(tmp, "updated_aggr") os.makedirs(output_raster_dir_path) - output_raster_file_path = os.path.join(output_raster_dir_path, "logan.tif") + update_raster_filename = "updated_logan.tif" + output_raster_file_path = os.path.join(output_raster_dir_path, update_raster_filename) profile = rasterio_reader.profile rasterio_reader.close() profile['driver'] = "GTiff" @@ -153,6 +155,7 @@ def test_raster_save_data_object(resource_with_raster_aggr, as_new_aggr): # download aggregation unzip_to = os.path.join(tmp, "unzipped_aggr") os.makedirs(unzip_to) + file_path = pathlib.Path(update_raster_filename).stem + ".vrt" if 
as_new_aggr: file_path = f"{dst_path}/{file_path}" From 429bec20af803e1272bf271f097ff14c27461e78 Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 2 Jun 2023 17:52:04 -0400 Subject: [PATCH 17/23] [#44] adding example notebook for aggregation data object operations --- .../Aggregation_Data_Object_Operations.ipynb | 929 ++++++++++++++++++ docs/examples/Aggregation_Operations.ipynb | 2 +- mkdocs.yml | 1 + 3 files changed, 931 insertions(+), 1 deletion(-) create mode 100644 docs/examples/Aggregation_Data_Object_Operations.ipynb diff --git a/docs/examples/Aggregation_Data_Object_Operations.ipynb b/docs/examples/Aggregation_Data_Object_Operations.ipynb new file mode 100644 index 0000000..a1f2111 --- /dev/null +++ b/docs/examples/Aggregation_Data_Object_Operations.ipynb @@ -0,0 +1,929 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# hsclient HydroShare Python Client Resource Aggregation Data Object Operation Examples\n", + "\n", + "\n", + "---\n", + "\n", + "\n", + "The following code snippets show examples for how to use the hsclient HydroShare Python Client to load certain aggregation data types to relevant data processing objects to view data properties as well as be able to modify the data. The aggregation data object feature is available for the following HydroShare's content type aggregations:\n", + " * Time series\n", + " * Geographic feature\n", + " * Geographic raster\n", + " * Multidimensional NetCDF" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Install the hsclient Python Client\n", + "\n", + "The hsclient Python Client for HydroShare may not be installed by default in your Python environment, so it has to be installed first before you can work with it. Use the following command to install hsclient via the Python Package Index (PyPi)." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "!pip install hsclient[all]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Authenticating with HydroShare\n", + "\n", + "Before you start interacting with resources in HydroShare you will need to authenticate." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import os\n", + "from hsclient import HydroShare\n", + "\n", + "hs = HydroShare()\n", + "hs.sign_in()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Loading Resource Aggregation Data to Relevant Python Data Analysis Modules\n", + "\n", + "The python data analysis module used for each of the supported aggregation types is shown below:\n", + "\n", + "* Time series : pandas.DataFrame\n", + "* Geographic feature : fiona.Collection\n", + "* Geographic raster : rasterio.DatasetReader\n", + "* Multidimensional NetCDF : xarray.Dataset\n", + "\n", + "In the following code examples, we are assuming that we have a resource in HydroShare that contains the above four aggregation types. All these aggregations are at the root of the resource. The resource id used in the following code examples is \"a0e0c2e2e5e84e1e9b6b2b2b2b2b2b2b\". 
You will need to change this resource id to the id of your resource in HydroShare.\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# first we need to get the resource object from HydroShare using id of the resource\n", + "resource_id = 'a0e0c2e2e5e84e1e9b6b2b2b2b2b2b2b'\n", + "resource = hs.resource(resource_id)\n", + "# show resource identifier\n", + "print(f\"Resource ID:{resource.resource_id}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Loading Time Series Data to pandas.DataFrame\n", + "Here we are assuming the time series aggregation contains a sqlite file with name \"sample.sqlite\"" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# retrieve the time series aggregation\n", + "file_path = \"sample.sqlite\"\n", + "ts_aggr = resource.aggregation(file__path=file_path)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show the aggregation type\n", + "print(f\"Aggregation Type:{ts_aggr.metadata.type}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# display the time series results metadata to see the all available series\n", + "# later we will use one of the series ids to retrieve the time series data\n", + "print(ts_aggr.metadata.time_series_results)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# download the time series aggregation - these directory paths must exist for hsclient to download and unzip the aggregation zip file\n", + "# Note: These directory paths need to be changed based on where you want to download the aggregation\n", + "\n", + "download_to = r\"D:\\Temp\\TimeSeries_Testing\"\n", + "unzip_to = rf\"{download_to}\\aggr_unzipped\"\n", + "aggr_path = resource.aggregation_download(aggregation=ts_aggr, save_path=download_to, unzip_to=unzip_to)\n", + "print(f\"Downloaded aggregation to:{aggr_path}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# load a given time series of the aggregation as pandas.DataFrame from the downloaded location (aggr_path)\n", + "# Note: Here we are assuming the series id used below is one of the ids we found when we printed the\n", + "# time series results in the earlier coding step\n", + "series_id = '51e31687-1ebc-11e6-aa6c-f45c8999816f'\n", + "pd_dataframe = ts_aggr.as_data_object(series_id=series_id, agg_path=aggr_path)\n", + "print(f\"Type of data processing object:{type(pd_dataframe)}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# now we can use the pandas.DataFrame to do some data analysis\n", + "\n", + "# show time series column headings\n", + "print(pd_dataframe.columns)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show time series data summary\n", + "print(pd_dataframe.info)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show number of data 
points in time series\n", + "print(pd_dataframe.size)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show first 5 records in time series\n", + "print(pd_dataframe.head(5))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# editing time series aggregation data using the pandas.DataFrame\n", + "print(f\"Data frame size before edit:{pd_dataframe.size}\")\n", + "rows, columns = pd_dataframe.shape\n", + "print(f\"Number of rows:{rows}\")\n", + "print(f\"Number of columns:{columns}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# delete 10 rows from the dataframe. This will result in deleting 10 records from the 'TimeSeriesResultValues' table when we save the dataframe.\n", + "pd_dataframe.drop(pd_dataframe.index[0:10], axis=0, inplace=True)\n", + "rows, columns = pd_dataframe.shape\n", + "print(f\"Number of rows in dataframe after delete:{rows}\")\n", + "print(f\"Number of columns in dataframe after delete:{columns}\")\n", + "print(f\"Data frame size after delete:{pd_dataframe.size}\")\n", + "expected_row_count = rows" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# save the updated dataframe object to the time series aggregation in HydroShare\n", + "# Note this will update the data for the existing time series aggregation in HydroShare\n", + "ts_aggr = ts_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=False)\n", + "print(f\"Updated time series aggregation ...\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# we can also create a new time series aggregation in HydroShare using the updated dataframe object\n", + "# we will first create a new folder in which the new aggregation will be created\n", + "aggr_folder = \"ts_folder\"\n", + "resource.folder_create(folder=aggr_folder)\n", + "ts_aggr = ts_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=True,\n", + " destination_path=aggr_folder)\n", + "print(f\"Created a new time series aggregation ...\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# retrieve the updated time series aggregation to verify the data was updated\n", + "# reload the new timeseries as pandas.DataFrame\n", + "# need to first download this new aggregation\n", + "\n", + "aggr_path = resource.aggregation_download(aggregation=ts_aggr, save_path=download_to, unzip_to=unzip_to)\n", + "print(f\"Downloaded aggregation to:{aggr_path}\")\n", + "pd_dataframe = ts_aggr.as_data_object(series_id=series_id, agg_path=aggr_path)\n", + "rows, columns = pd_dataframe.shape\n", + "print(f\"Number of rows in the updated timeseries:{rows}\")\n", + "print(f\"Number of columns in the updated timeseries:{columns}\")\n", + "assert rows == expected_row_count" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Loading Geographic Feature Data to fiona.Collection\n", + "Here we are assuming the geographic feature aggregation contains a shapefile with name \"sample.shp\"" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + 
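"execution_count": null,
+   "outputs": [],
+   "source": [
+    "# optionally, list all aggregations in the resource to confirm the geographic feature aggregation is present\n",
+    "# Note: this extra check is only a sketch and relies solely on the 'resource' object retrieved earlier in this notebook\n",
+    "for agg in resource.aggregations():\n",
+    "    print(agg.metadata.type, agg.main_file_path)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+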
"execution_count": null, + "outputs": [], + "source": [ + "# retrieve the geographic feature aggregation\n", + "file_path = \"sample.shp\"\n", + "gf_aggr = resource.aggregation(file__path=file_path)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show the aggregation type\n", + "print(f\"Aggregation Type:{gf_aggr.metadata.type}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# download the geographic feature aggregation - these directory paths must exist for hsclient to download and unzip the aggregation zip file\n", + "# Note: These directory paths need to be changed based on where you want to download the aggregation\n", + "download_to = r\"D:\\Temp\\GeoFeature_Testing\"\n", + "unzip_to = rf\"{download_to}\\aggr_unzipped\"\n", + "aggr_path = resource.aggregation_download(aggregation=gf_aggr, save_path=download_to, unzip_to=unzip_to)\n", + "print(f\"Downloaded aggregation to:{aggr_path}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# load the downloaded geo-feature aggregation as a fiona Collection object\n", + "fiona_coll = gf_aggr.as_data_object(agg_path=aggr_path)\n", + "print(f\"Type of data processing object:{type(fiona_coll)}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# now we can use the fiona.Collection object to do some data analysis\n", + "\n", + "# show driver used to open the vector file\n", + "print(fiona_coll.driver)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show feature collection coordinate reference system\n", + "print(fiona_coll.crs)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show feature collection spatial coverage\n", + "print(fiona_coll.bounds)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show number of features/bands\n", + "print(len(list(fiona_coll)))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show feature field information\n", + "print(fiona_coll.schema)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show data for a single feature in feature collection\n", + "from fiona.model import to_dict\n", + "\n", + "feature = fiona_coll[1]\n", + "to_dict(feature)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# editing geographic feature aggregation data using the fiona.Collection object\n", + "import fiona\n", + "\n", + "# location of the new output shp file\n", + "# Note: The output shapefile directory path must exist.\n", + "output_shp_file_dir_path = os.path.join(download_to, \"updated_aggr\")\n", + "\n", + "# name the output shape file same as the original shape file\n", + "orig_shp_file_name = os.path.basename(gf_aggr.main_file_path)\n", + "output_shp_file_path = os.path.join(output_shp_file_dir_path, 
orig_shp_file_name)\n", + "\n", + "# here we will remove one of the bands (where the state name is Alaska) and then write the updated data to a new shp file\n", + "# Note: You have to use a different criteria for selecting bands depending on your feature dataset\n", + "with fiona.open(output_shp_file_path, 'w', schema=fiona_coll.schema, driver=fiona_coll.driver,\n", + " crs=fiona_coll.crs) as out_shp_file:\n", + " for feature in fiona_coll:\n", + " ft_dict = to_dict(feature)\n", + " if ft_dict['properties']['STATE_NAME'] != \"Alaska\":\n", + " out_shp_file.write(feature)\n", + " else:\n", + " print(\">> Skipping feature for Alaska\")\n", + "\n", + "print(\"Done updating the shp file ...\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# we can now update the geographic feature aggregation in HydroShare using the updated shp file\n", + "gf_aggr = gf_aggr.save_data_object(resource=resource, agg_path=output_shp_file_dir_path, as_new_aggr=False)\n", + "print(\"Aggregation updated ...\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# we can also create a new geographic feature aggregation in HydroShare using the updated shp file\n", + "\n", + "# we will first create a new folder in which the new aggregation will be created in HydroShare\n", + "aggr_folder = \"gf_folder\"\n", + "resource.folder_create(folder=aggr_folder)\n", + "gf_aggr = gf_aggr.save_data_object(resource=resource, agg_path=output_shp_file_dir_path, as_new_aggr=True,\n", + " destination_path=aggr_folder)\n", + "print(\"New aggregation created ...\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# retrieve the updated geographic feature aggregation to verify the data was updated\n", + "# need to first download this updated/new aggregation\n", + "aggr_path = resource.aggregation_download(aggregation=gf_aggr, save_path=download_to, unzip_to=unzip_to)\n", + "fiona_coll = gf_aggr.as_data_object(agg_path=aggr_path)\n", + "# check the number of bands in the updated aggregation\n", + "print(len(list(fiona_coll)))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Loading Multidimensional Data to xarray.Dataset\n", + "Here we are assuming the multidimensional aggregation contains a netcdf file with name \"sample.nc\"\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# retrieve the multidimensional aggregation\n", + "file_path = \"sample.nc\"\n", + "md_aggr = resource.aggregation(file__path=file_path)\n", + "print(f\"Aggregation Type:{md_aggr.metadata.type}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# download the multidimensional aggregation - these directory paths must exist for hsclient to download and unzip the aggregation zip file\n", + "# Note: These directory paths need to be changed based on where you want to download the aggregation\n", + "download_to = r\"D:\\Temp\\MultiDim_Testing\"\n", + "unzip_to = rf\"{download_to}\\aggr_unzipped\"\n", + "aggr_path = resource.aggregation_download(aggregation=md_aggr, save_path=download_to, unzip_to=unzip_to)\n", + "print(f\"Downloaded aggregation to:{aggr_path}\")" + ], + 
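"metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# optionally, list the files in the unzipped aggregation folder to confirm the netcdf file was downloaded\n",
+    "# Note: this extra check is only a sketch and relies solely on the 'aggr_path' value returned by aggregation_download in the previous step\n",
+    "print(os.listdir(aggr_path))"
+   ],
+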
"metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# load the downloaded multidimensional aggregation as a xarray.Dataset object\n", + "xarray_ds = md_aggr.as_data_object(agg_path=aggr_path)\n", + "print(f\"Type of data processing object:{type(xarray_ds)}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# now we can use the xarray.Dataset object to do some data analysis\n", + "\n", + "# show netcdf global attributes\n", + "print(xarray_ds.attrs)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show netcdf dimensions\n", + "print(xarray_ds.dims)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show coordinate variables of the netcdf dataset\n", + "print(xarray_ds.coords)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# editing multidimensional aggregation data using the xarray.Dataset object\n", + "\n", + "# here we will only change the title attribute of the dataset\n", + "aggr_title = \"This is a modified title for this aggregation modified using hsclient\"\n", + "xarray_ds.attrs[\"title\"] = aggr_title" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# we can update the multidimensional aggregation in HydroShare using the updated xarray.Dataset object\n", + "md_aggr = md_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=False)\n", + "print(\"Aggregation updated ...\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# we can also create a new multidimensional aggregation in HydroShare using the updated xarray.Dataset object\n", + "\n", + "# we will first create a new folder in which the new aggregation will be created\n", + "aggr_folder = \"md_folder\"\n", + "resource.folder_create(folder=aggr_folder)\n", + "md_aggr = md_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=True,\n", + " destination_path=aggr_folder)\n", + "print(\"New aggregation created ...\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# retrieve the updated multidimensional aggregation to verify the data was updated\n", + "\n", + "# need to first download this updated/new aggregation\n", + "aggr_path = resource.aggregation_download(aggregation=md_aggr, save_path=download_to, unzip_to=unzip_to)\n", + "xarray_ds = md_aggr.as_data_object(agg_path=aggr_path)\n", + "# check the title attribute of the updated aggregation\n", + "assert xarray_ds.attrs[\"title\"] == aggr_title" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Loading Geo Raster Data to rasterio.DatasetReader\n", + "Here we are assuming the georaster aggregation contains a geotiff file with name \"sample.tif\"" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# retrieve the georaster aggregation\n", + "file_path = \"sample.tif\"\n", + "gr_aggr = 
resource.aggregation(file__path=file_path)\n", + "print(f\"Aggregation Type:{gr_aggr.metadata.type}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# download the georaster aggregation - these directory paths must exist for hsclient to download and unzip the aggregation zip file\n", + "# Note: These directory paths need to be changed based on where you want to download the aggregation\n", + "download_to = r\"D:\\Temp\\GeoRaster_Testing\"\n", + "unzip_to = rf\"{download_to}\\aggr_unzipped\"\n", + "aggr_path = resource.aggregation_download(aggregation=gr_aggr, save_path=download_to, unzip_to=unzip_to)\n", + "print(f\"Downloaded aggregation to:{aggr_path}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# load the downloaded georaster aggregation as a rasterio.DatasetReader object\n", + "rasterio_ds = gr_aggr.as_data_object(agg_path=aggr_path)\n", + "print(f\"Type of data processing object:{type(rasterio_ds)}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# now we can use the rasterio.DatasetReader object to do some data analysis\n", + "\n", + "# show raster band count\n", + "print(rasterio_ds.count)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show raster band dimensions\n", + "print(rasterio_ds.width, rasterio_ds.height)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show raster coordinate reference system\n", + "print(rasterio_ds.crs)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show raster bounds\n", + "print(rasterio_ds.bounds)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# show raster data\n", + "data = rasterio_ds.read()\n", + "print(data)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# editing georaster aggregation data using the rasterio.DatasetReader object\n", + "from rasterio.windows import Window\n", + "import rasterio\n", + "\n", + "# here we will subset the raster data to a smaller extent\n", + "print(\"raster dimensions before editing:\")\n", + "print(f\"raster width :{rasterio_ds.width}\")\n", + "print(f\"raster height:{rasterio_ds.height}\")\n", + "new_width = rasterio_ds.width - 9\n", + "new_height = rasterio_ds.height - 10\n", + "subset_window = Window(0, 0, new_width, new_height)\n", + "subset_band = rasterio_ds.read(1, window=subset_window)\n", + "print(subset_band)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# write the subset data to a new tif file\n", + "\n", + "output_raster_dir_path = r\"D:\\Temp\\GeoRaster_Testing\\updated_aggr\"\n", + "output_raster_filename = \"out_sample.tif\"\n", + "output_raster_file_path = os.path.join(output_raster_dir_path, output_raster_filename)\n", + "profile = rasterio_ds.profile\n", + "rasterio_ds.close()\n", + "profile['driver'] = \"GTiff\"\n", + "profile['width'] = new_width\n", + 
"profile['height'] = new_height\n", + "\n", + "with rasterio.open(output_raster_file_path, \"w\", **profile) as dst:\n", + " dst.write(subset_band, 1)\n", + "\n", + "print(f\"Saved subset raster to:{output_raster_file_path}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# we can update the georaster aggregation in HydroShare using the updated rasterio.DatasetReader object\n", + "gr_aggr = gr_aggr.save_data_object(resource=resource, agg_path=output_raster_dir_path, as_new_aggr=False)\n", + "print(\"Aggregation updated ...\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# we can also create a new georaster aggregation in HydroShare using the updated rasterio.DatasetReader object\n", + "\n", + "# we will first create a new folder in which the new aggregation will be created\n", + "aggr_folder = \"gr_folder\"\n", + "resource.folder_create(folder=aggr_folder)\n", + "gr_aggr = gr_aggr.save_data_object(resource=resource, agg_path=output_raster_dir_path, as_new_aggr=True,\n", + " destination_path=aggr_folder)\n", + "print(\"New aggregation created ...\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# retrieve the updated georaster aggregation to verify the data was updated\n", + "\n", + "# need to first download this updated/new aggregation\n", + "aggr_path = resource.aggregation_download(aggregation=gr_aggr, save_path=download_to, unzip_to=unzip_to)\n", + "rasterio_ds = gr_aggr.as_data_object(agg_path=aggr_path)\n", + "# check the raster dimensions of the updated aggregation\n", + "print(\"raster dimensions after editing:\")\n", + "print(f\"raster width :{rasterio_ds.width}\")\n", + "print(f\"raster height:{rasterio_ds.height}\")" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/examples/Aggregation_Operations.ipynb b/docs/examples/Aggregation_Operations.ipynb index a08a08e..d4629f9 100644 --- a/docs/examples/Aggregation_Operations.ipynb +++ b/docs/examples/Aggregation_Operations.ipynb @@ -72,7 +72,7 @@ "\n", "A \"resource\" is a container for your content in HydroShare. Think of it as a \"working directory\" into which you are going to organize the code and/or data you are using and want to share. The following code can be used to create a new, empty resource within which you can create content and metadata.\n", "\n", - "This code creates a new resource in HydroShare. It also creates an in-memory object representation of that resource in your local environmment that you can then manipulate with further code." + "This code creates a new resource in HydroShare. It also creates an in-memory object representation of that resource in your local environment that you can then manipulate with further code." 
] }, { diff --git a/mkdocs.yml b/mkdocs.yml index be7df82..6d9cacd 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,6 +25,7 @@ nav: Aggregation Operations: examples/Aggregation_Operations.ipynb File Operations: examples/File_Operations.ipynb Metadata Operations: examples/Metadata_Operations.ipynb + Aggregation Data Object Operations: examples/Aggregation_Data_Object_Operations.ipynb - Developer Documentation: - Models: Single File: metadata/SingleFile.md From 14e9d7ed66869d6592b88bb9ddd9e139f02162cd Mon Sep 17 00:00:00 2001 From: pkdash Date: Fri, 2 Jun 2023 23:26:07 -0400 Subject: [PATCH 18/23] [#44] updating the github action yml file to install all optional python packages --- .github/workflows/python-package.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e8b47c2..793f059 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -29,7 +29,7 @@ jobs: run: | python -m pip install --upgrade pip make install - pip install . + pip install .[all] - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/setup.py b/setup.py index 4f63168..8338a40 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='hsclient', - version='0.3.3', + version='0.3.4', packages=find_packages(include=['hsclient', 'hsclient.*'], exclude=("tests",)), install_requires=[ From cf29bcec1fac7abc89c53760a296c72790848e8e Mon Sep 17 00:00:00 2001 From: pkdash Date: Sat, 3 Jun 2023 23:19:00 -0400 Subject: [PATCH 19/23] [#44] reverting packaging yml change for optional dependencies --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 793f059..e8b47c2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -29,7 +29,7 @@ jobs: run: | python -m pip install --upgrade pip make install - pip install .[all] + pip install . 
- name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names From 94909d62abbcdfea2aab19fcf887a365fd9bc54f Mon Sep 17 00:00:00 2001 From: pkdash Date: Mon, 5 Jun 2023 18:25:53 -0400 Subject: [PATCH 20/23] [#44] run github workflow job using PR source branch --- .github/workflows/python-package.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e8b47c2..127549b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -21,9 +21,17 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version }} for pull requests + if: ${{ github.event_name == 'pull_request_target' }} uses: actions/setup-python@v3 with: + ref: ${{ github.head_ref }} + python-version: ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version }} for push + if: ${{ github.event_name == 'push' }} + uses: actions/setup-python@v3 + with: + ref: ${{ github.ref }} python-version: ${{ matrix.python-version }} - name: Install dependencies run: | From 2e0ed3939fc248801fa01c08c215f8a5dd526ce4 Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 7 Jun 2023 22:31:57 -0400 Subject: [PATCH 21/23] [#44] removing priority search on aggregation path --- hsclient/hydroshare.py | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/hsclient/hydroshare.py b/hsclient/hydroshare.py index 62d6202..15335b6 100644 --- a/hsclient/hydroshare.py +++ b/hsclient/hydroshare.py @@ -338,26 +338,6 @@ def aggregations(self, **kwargs) -> List[BaseMetadata]: """ aggregations = self._aggregations - # when searching using 'file__path' or files__path' as the key, there can be only one matching aggregation - file_path_priority = kwargs.pop("file_path_priority", True) - file_path = kwargs.get("file__path", "") - if not file_path: - file_path = kwargs.get("files__path", "") - if file_path and file_path_priority: - dir_path = os.path.dirname(file_path) - file_name = pathlib.Path(file_path).stem - if dir_path: - aggr_map_path = urljoin(dir_path, file_name) - else: - aggr_map_path = file_name - - aggr_map_path = f"{aggr_map_path}_resmap.xml" - for aggr in self._parsed_aggregations: - aggr_map_full_path = f"/{aggr._resource_path}/data/contents/{aggr_map_path}" - if aggr._map_path == aggr_map_full_path: - return [aggr] - return [] - for key, value in kwargs.items(): if key.startswith('file__'): file_args = {key[len('file__'):]: value} @@ -474,7 +454,7 @@ def _validate_aggregation_for_update(self, resource: 'Resource', agg_type: Aggre raise Exception("This aggregation is not part of the specified resource.") def _compute_updated_aggregation_path(self, temp_folder, *files) -> str: - file_path = os.path.join(temp_folder, os.path.basename(self.main_file_path)) + file_path = urljoin(temp_folder, os.path.basename(self.main_file_path)) return file_path def _update_aggregation(self, resource, *files): @@ -715,11 +695,11 @@ def _compute_updated_aggregation_path(self, temp_folder, *files) -> str: for _file in files: filename = os.path.basename(_file) if filename.endswith(".vrt"): - file_path = os.path.join(temp_folder, filename) + file_path = urljoin(temp_folder, filename) break else: filename = pathlib.Path(filename).stem + ".vrt" - file_path = os.path.join(temp_folder, filename) + file_path = urljoin(temp_folder, filename) break return file_path @@ 
-1100,10 +1080,6 @@ def file_aggregate(self, path: str, agg_type: AggregationType, refresh: bool = T if refresh: # Only return the newly created aggregation if a refresh is requested self.refresh() - if agg_type == AggregationType.GeographicRasterAggregation and not path.endswith(".vrt") \ - or agg_type == AggregationType.FileSetAggregation: - # search all files of the aggregation to find a matching aggregation - return self.aggregation(file__path=path, file_path_priority=False) return self.aggregation(file__path=path) @refresh From f3fa653f341e69a2e55ccf47b491914302e074a7 Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 7 Jun 2023 22:33:32 -0400 Subject: [PATCH 22/23] [#44] rolling back changes to github action workflow file --- .github/workflows/python-package.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 127549b..e8b47c2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -21,17 +21,9 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} for pull requests - if: ${{ github.event_name == 'pull_request_target' }} + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: - ref: ${{ github.head_ref }} - python-version: ${{ matrix.python-version }} - - name: Set up Python ${{ matrix.python-version }} for push - if: ${{ github.event_name == 'push' }} - uses: actions/setup-python@v3 - with: - ref: ${{ github.ref }} python-version: ${{ matrix.python-version }} - name: Install dependencies run: | From 2c14b5aa163f271d05189bdc813764831ccafc30 Mon Sep 17 00:00:00 2001 From: pkdash Date: Wed, 7 Jun 2023 22:35:24 -0400 Subject: [PATCH 23/23] [#44] removing some pycharm specific blocks from notebook file --- docs/examples/Basic_Operations.ipynb | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/docs/examples/Basic_Operations.ipynb b/docs/examples/Basic_Operations.ipynb index d28ac8b..c83dfce 100644 --- a/docs/examples/Basic_Operations.ipynb +++ b/docs/examples/Basic_Operations.ipynb @@ -52,10 +52,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "3njsiY73m7_V", - "pycharm": { - "is_executing": true - } + "id": "3njsiY73m7_V" }, "outputs": [], "source": [ @@ -76,11 +73,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true - } - }, + "metadata": {}, "outputs": [], "source": [ "from hsclient import HydroShare\n", @@ -100,25 +93,22 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "TH3UUihSojIb" - }, "source": [ "### Create a New Empty Resource\n", "\n", "A \"resource\" is a container for your content in HydroShare. Think of it as a \"working directory\" into which you are going to organize the code and/or data you are using and want to share. The following code can be used to create a new, empty resource within which you can create content and metadata.\n", "\n", "This code creates a new resource in HydroShare. It also creates an in-memory object representation of that resource in your local environmment that you can then manipulate with further code." - ] + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "W9azvJ_Co87w", - "pycharm": { - "is_executing": true - } + "id": "W9azvJ_Co87w" }, "outputs": [], "source": [