Skip to content

Commit

Permalink
Merge pull request #168 from nsidc/documentation
Browse files Browse the repository at this point in the history
Preparing for v0.4.1
  • Loading branch information
betolink authored Oct 31, 2022
2 parents 4fd276e + ebb1e25 commit ea2522b
Show file tree
Hide file tree
Showing 7 changed files with 1,374 additions and 488 deletions.
2 changes: 1 addition & 1 deletion earthdata/daac.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"homepage": "https://lpdaac.usgs.gov",
"cloud-providers": ["LPCLOUD"],
"on-prem-providers": ["LPDAAC_ECS"],
"s3-credentials": "https://data.lpdaac.prod.earthdatacloud.nasa.gov/s3credentials",
"s3-credentials": "https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials",
},
{
"short-name": "GESDISC",
Expand Down
34 changes: 28 additions & 6 deletions earthdata/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,22 +271,44 @@ def _derive_s3_link(self, links: List[str]) -> List[str]:
s3_links.append(f's3://{links[0].split("nasa.gov/")[1]}')
return s3_links

def data_links(self, access: str = None, in_region: bool = False) -> List[str]:
    """Returns the data links from a granule.

    Parameters:
        access: "direct" or "external"; "direct" means in-region S3 access for
            cloud-hosted collections, anything else falls back to HTTPS links.
        in_region: True if we are running in us-west-2, meant for the Store
            class, default is False.
    Returns:
        the data links for the requested access type
    """
    https_links = self._filter_related_links("GET DATA")
    s3_links = self._filter_related_links("GET DATA VIA DIRECT ACCESS")
    if in_region:
        # we are in us-west-2
        if self.cloud_hosted and access in (None, "direct"):
            # cloud collection and the user either asked for direct access or
            # didn't specify the access type: default to S3 links.
            # NOTE(review): the previous code only honored access is None here,
            # so an explicit access="direct" in region wrongly got HTTPS links.
            if len(s3_links) == 0 and len(https_links) > 0:
                # guess the S3 links for cloud collections that for some
                # reason only offered HTTPS links
                return self._derive_s3_link(https_links)
            # we have the S3 links so we return those
            return s3_links
        # Even though we are in us-west-2 the user wants the HTTPS links;
        # used in region they are S3 signed links from TEA
        # https://github.com/asfadmin/thin-egress-app
        return https_links
    # we are not in us-west-2
    if access == "direct":
        # maybe the user wants to collect S3 links and use them later
        # from the cloud
        return s3_links
    # outside the region even cloud collections have HTTPS links
    return https_links

def dataviz_links(self) -> List[str]:
"""
Expand Down
7 changes: 4 additions & 3 deletions earthdata/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(self, auth: Auth = None, *args: Any, **kwargs: Any) -> None:
for queries that need authentication e.g. restricted datasets
"""
super().__init__(*args, **kwargs)
self.session = session()
if auth is not None and auth.authenticated:
# To search we need the new bearer tokens from NASA Earthdata
self.session = auth.get_session(bearer_token=True)
Expand Down Expand Up @@ -286,11 +287,10 @@ class DataGranules(GranuleQuery):
def __init__(self, auth: Any = None, *args: Any, **kwargs: Any) -> None:
"""Base class for Granule and Collection CMR queries."""
super().__init__(*args, **kwargs)
self.session = session()
if auth is not None and auth.authenticated:
# To search we need the new bearer tokens from NASA Earthdata
self.session = auth.get_session(bearer_token=True)
else:
self.session = session()

self._debug = False

Expand All @@ -306,7 +306,8 @@ def orbit_number(self, orbit1: int, orbit2: int) -> Type[GranuleQuery]:
return self

def cloud_hosted(self, cloud_hosted: bool = True) -> Type[CollectionQuery]:
"""Only match granules that are hosted in the cloud. This is valid for public collections and if we are using the short_name parameter. Concept-Id is unambiguous.
"""Only match granules that are hosted in the cloud. This is valid for public
collections and if we are using the short_name parameter. Concept-Id is unambiguous.
???+ Tip
Cloud hosted collections can be public or restricted.
Expand Down
76 changes: 66 additions & 10 deletions earthdata/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ def get_https_session(
session = fsspec.filesystem("https", client_kwargs=client_kwargs)
return session

@singledispatchmethod
def open(
self,
granules: Union[List[str], List[DataGranule]],
Expand All @@ -192,14 +191,33 @@ def open(
"""Returns a list of fsspec file-like objects that can be used to access files
hosted on S3 or HTTPS by third party libraries like xarray.
Parameters:
granules (List): a list of granules(DataGranule) instances or list of URLs, e.g. s3://some-granule
Returns:
a list of s3fs "file pointers" to s3 files.
"""
if len(granules):
return self._open(granules, provider)
print("The granules list is empty, moving on...")
return None

@singledispatchmethod
def _open(
    self,
    granules: Union[List[str], List[DataGranule]],
    provider: str = None,
) -> Union[List[Any], None]:
    """Dispatch base for ``_open``; never runs for supported input types.

    The concrete behavior lives in the implementations registered with
    ``@_open.register`` (one for lists of DataGranule instances, one for
    lists of URL strings). This base only fires when ``granules`` is of
    an unregistered type.
    Parameters:
        granules (List): a list of granules(DataGranule) instances or list of URLs, e.g. s3://some-granule
        provider: NASA DAAC provider id, forwarded to the registered implementations
    Raises:
        NotImplementedError: always, for unsupported ``granules`` types.
    """
    raise NotImplementedError("granules should be a list of DataGranule or URLs")

@open.register
@_open.register
def _open_granules(
self,
granules: List[DataGranule],
Expand All @@ -217,7 +235,8 @@ def _open_granules(
provider = granules[0]["meta"]["provider-id"]
data_links = list(
chain.from_iterable(
granule.data_links(access=access_method) for granule in granules
granule.data_links(access=access_method, in_region=self.running_in_aws)
for granule in granules
)
)
total_size = round(sum([granule.size() for granule in granules]) / 1024, 2)
Expand Down Expand Up @@ -266,7 +285,7 @@ def multi_thread_open(url: str) -> Any:
return None
return fileset

@open.register
@_open.register
def _open_urls(
self,
granules: List[str],
Expand Down Expand Up @@ -319,7 +338,6 @@ def _open_urls(
return None
return fileset

@singledispatchmethod
def get(
self,
granules: Union[List[DataGranule], List[str]],
Expand All @@ -345,14 +363,47 @@ def get(
Returns:
None
"""
if len(granules):
self._get(granules, local_path, access, provider, threads)

print("List of URLs or DataGranule isntances expected")
return None

@get.register
@singledispatchmethod
def _get(
    self,
    granules: Union[List[DataGranule], List[str]],
    local_path: str = None,
    access: str = None,
    provider: str = None,
    threads: int = 8,
) -> None:
    """Dispatch base for ``_get``; fallback for unsupported ``granules`` types.

    The actual retrieval is done by the implementations registered with
    ``@_get.register`` (one for lists of DataGranule instances, one for lists
    of URL strings). This base only runs when ``granules`` is neither, and it
    simply reports the problem.
    Parameters:
        granules: a list of granules(DataGranule) instances or a list of granule links (HTTP)
        local_path: local directory to store the remote data granules
        access: direct or on_prem, if set it will use it for the access method. only for granules list from search
        provider: NASA DAAC provider id
        threads: parallel number of threads to use to download the files, adjust as necessary, default = 8
    Returns:
        None
    """
    # typo fix: "isntances" -> "instances" in the user-facing message
    print("List of URLs or DataGranule instances expected")
    return None

@_get.register
def _get_urls(
self,
granules: List[str],
local_path: str = None,
access: str = None,
provider: str = None,
threads: int = 8,
) -> None:
Expand All @@ -367,28 +418,33 @@ def _get_urls(
print(f"Retrieved: {file} to {local_path}")
else:
# if the data is cloud based bu we are not in AWS it will be downloaded as if it was on prem
if access is None:
pass
self._download_onprem_granules(data_links, local_path, threads)
return None

@get.register
@_get.register
def _get_granules(
self,
granules: List[DataGranule],
local_path: str = None,
access: str = None,
provider: str = None,
threads: int = 8,
) -> None:

data_links: List = []
provider = granules[0]["meta"]["provider-id"]
cloud_hosted = granules[0].cloud_hosted
access = "on_prem"
if cloud_hosted and self.running_in_aws:
if cloud_hosted and self.running_in_aws and access is None:
# TODO: benchmark this
print("direct???")
access = "direct"
data_links = list(
# we are not in region
chain.from_iterable(
granule.data_links(access=access) for granule in granules
granule.data_links(access=access, in_region=self.running_in_aws)
for granule in granules
)
)
total_size = round(sum([granule.size() for granule in granules]) / 1024, 2)
Expand Down
Loading

0 comments on commit ea2522b

Please sign in to comment.