Skip to content

Commit

Permalink
Merge pull request #168 from nsidc/documentation
Browse files Browse the repository at this point in the history
Preparing for v0.4.1
  • Loading branch information
betolink authored Oct 31, 2022
2 parents 4fd276e + ebb1e25 commit ea2522b
Show file tree
Hide file tree
Showing 7 changed files with 1,374 additions and 488 deletions.
2 changes: 1 addition & 1 deletion earthdata/daac.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"homepage": "https://lpdaac.usgs.gov",
"cloud-providers": ["LPCLOUD"],
"on-prem-providers": ["LPDAAC_ECS"],
"s3-credentials": "https://data.lpdaac.prod.earthdatacloud.nasa.gov/s3credentials",
"s3-credentials": "https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials",
},
{
"short-name": "GESDISC",
Expand Down
34 changes: 28 additions & 6 deletions earthdata/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,22 +271,44 @@ def _derive_s3_link(self, links: List[str]) -> List[str]:
s3_links.append(f's3://{links[0].split("nasa.gov/")[1]}')
return s3_links

def data_links(self, access: str = None, in_region: bool = False) -> List[str]:
    """Returns the data links from a granule.

    Parameters:
        access: "direct" or "external"; "direct" means in-region S3 access for
            cloud-hosted collections, anything else falls back to HTTPS links.
        in_region: True if we are running in us-west-2, meant for the Store
            class, default is False.
    Returns:
        the data links for the requested access type
    """
    https_links = self._filter_related_links("GET DATA")
    s3_links = self._filter_related_links("GET DATA VIA DIRECT ACCESS")
    if in_region:
        # we are in us-west-2
        if self.cloud_hosted and access in (None, "direct"):
            # cloud collection and the user either asked for direct access or
            # didn't specify the access type: default to S3 links.
            # NOTE(review): the previous code only honored access is None here,
            # so an explicit access="direct" in region wrongly got HTTPS links.
            if len(s3_links) == 0 and len(https_links) > 0:
                # guess the S3 links for cloud collections that for some
                # reason only offered HTTPS links
                return self._derive_s3_link(https_links)
            # we have the S3 links so we return those
            return s3_links
        # Even though we are in us-west-2 the user wants the HTTPS links;
        # used in region they are S3 signed links from TEA
        # https://github.com/asfadmin/thin-egress-app
        return https_links
    # we are not in us-west-2
    if access == "direct":
        # maybe the user wants to collect S3 links and use them later
        # from the cloud
        return s3_links
    # outside the region even cloud collections have HTTPS links
    return https_links

def dataviz_links(self) -> List[str]:
"""
Expand Down
7 changes: 4 additions & 3 deletions earthdata/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(self, auth: Auth = None, *args: Any, **kwargs: Any) -> None:
for queries that need authentication e.g. restricted datasets
"""
super().__init__(*args, **kwargs)
self.session = session()
if auth is not None and auth.authenticated:
# To search we need the new bearer tokens from NASA Earthdata
self.session = auth.get_session(bearer_token=True)
Expand Down Expand Up @@ -286,11 +287,10 @@ class DataGranules(GranuleQuery):
def __init__(self, auth: Any = None, *args: Any, **kwargs: Any) -> None:
"""Base class for Granule and Collection CMR queries."""
super().__init__(*args, **kwargs)
self.session = session()
if auth is not None and auth.authenticated:
# To search we need the new bearer tokens from NASA Earthdata
self.session = auth.get_session(bearer_token=True)
else:
self.session = session()

self._debug = False

Expand All @@ -306,7 +306,8 @@ def orbit_number(self, orbit1: int, orbit2: int) -> Type[GranuleQuery]:
return self

def cloud_hosted(self, cloud_hosted: bool = True) -> Type[CollectionQuery]:
"""Only match granules that are hosted in the cloud. This is valid for public collections and if we are using the short_name parameter. Concept-Id is unambiguous.
"""Only match granules that are hosted in the cloud. This is valid for public
collections and if we are using the short_name parameter. Concept-Id is unambiguous.
???+ Tip
Cloud hosted collections can be public or restricted.
Expand Down
76 changes: 66 additions & 10 deletions earthdata/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ def get_https_session(
session = fsspec.filesystem("https", client_kwargs=client_kwargs)
return session

@singledispatchmethod
def open(
self,
granules: Union[List[str], List[DataGranule]],
Expand All @@ -192,14 +191,33 @@ def open(
"""Returns a list of fsspec file-like objects that can be used to access files
hosted on S3 or HTTPS by third party libraries like xarray.
Parameters:
granules (List): a list of granules(DataGranule) instances or list of URLs, e.g. s3://some-granule
Returns:
a list of s3fs "file pointers" to s3 files.
"""
if len(granules):
return self._open(granules, provider)
print("The granules list is empty, moving on...")
return None

@singledispatchmethod
def _open(
    self,
    granules: Union[List[str], List[DataGranule]],
    provider: str = None,
) -> Union[List[Any], None]:
    """Dispatch base for ``_open``; never runs for supported input types.

    The concrete behavior lives in the implementations registered with
    ``@_open.register`` (one for lists of DataGranule instances, one for
    lists of URL strings). This base only fires when ``granules`` is of
    an unregistered type.
    Parameters:
        granules (List): a list of granules(DataGranule) instances or list of URLs, e.g. s3://some-granule
        provider: NASA DAAC provider id, forwarded to the registered implementations
    Raises:
        NotImplementedError: always, for unsupported ``granules`` types.
    """
    raise NotImplementedError("granules should be a list of DataGranule or URLs")

@open.register
@_open.register
def _open_granules(
self,
granules: List[DataGranule],
Expand All @@ -217,7 +235,8 @@ def _open_granules(
provider = granules[0]["meta"]["provider-id"]
data_links = list(
chain.from_iterable(
granule.data_links(access=access_method) for granule in granules
granule.data_links(access=access_method, in_region=self.running_in_aws)
for granule in granules
)
)
total_size = round(sum([granule.size() for granule in granules]) / 1024, 2)
Expand Down Expand Up @@ -266,7 +285,7 @@ def multi_thread_open(url: str) -> Any:
return None
return fileset

@open.register
@_open.register
def _open_urls(
self,
granules: List[str],
Expand Down Expand Up @@ -319,7 +338,6 @@ def _open_urls(
return None
return fileset

@singledispatchmethod
def get(
self,
granules: Union[List[DataGranule], List[str]],
Expand All @@ -345,14 +363,47 @@ def get(
Returns:
None
"""
if len(granules):
self._get(granules, local_path, access, provider, threads)

print("List of URLs or DataGranule isntances expected")
return None

@get.register
@singledispatchmethod
def _get(
    self,
    granules: Union[List[DataGranule], List[str]],
    local_path: str = None,
    access: str = None,
    provider: str = None,
    threads: int = 8,
) -> None:
    """Dispatch base for ``_get``; fallback for unsupported ``granules`` types.

    The actual retrieval is done by the implementations registered with
    ``@_get.register`` (one for lists of DataGranule instances, one for lists
    of URL strings). This base only runs when ``granules`` is neither, and it
    simply reports the problem.
    Parameters:
        granules: a list of granules(DataGranule) instances or a list of granule links (HTTP)
        local_path: local directory to store the remote data granules
        access: direct or on_prem, if set it will use it for the access method. only for granules list from search
        provider: NASA DAAC provider id
        threads: parallel number of threads to use to download the files, adjust as necessary, default = 8
    Returns:
        None
    """
    # typo fix: "isntances" -> "instances" in the user-facing message
    print("List of URLs or DataGranule instances expected")
    return None

@_get.register
def _get_urls(
self,
granules: List[str],
local_path: str = None,
access: str = None,
provider: str = None,
threads: int = 8,
) -> None:
Expand All @@ -367,28 +418,33 @@ def _get_urls(
print(f"Retrieved: {file} to {local_path}")
else:
# if the data is cloud based bu we are not in AWS it will be downloaded as if it was on prem
if access is None:
pass
self._download_onprem_granules(data_links, local_path, threads)
return None

@get.register
@_get.register
def _get_granules(
self,
granules: List[DataGranule],
local_path: str = None,
access: str = None,
provider: str = None,
threads: int = 8,
) -> None:

data_links: List = []
provider = granules[0]["meta"]["provider-id"]
cloud_hosted = granules[0].cloud_hosted
access = "on_prem"
if cloud_hosted and self.running_in_aws:
if cloud_hosted and self.running_in_aws and access is None:
# TODO: benchmark this
print("direct???")
access = "direct"
data_links = list(
# we are not in region
chain.from_iterable(
granule.data_links(access=access) for granule in granules
granule.data_links(access=access, in_region=self.running_in_aws)
for granule in granules
)
)
total_size = round(sum([granule.size() for granule in granules]) / 1024, 2)
Expand Down
Loading

0 comments on commit ea2522b

Please sign in to comment.