Commit 3e2f685

Author: Andrew Delman
Commit message: tutorial for ECCO downloads and variable lists
1 parent 760b3cc commit 3e2f685

File tree

7 files changed: +975 −140 lines changed

ECCO-ACCESS/Downloading_ECCO_datasets_from_PODAAC/Tutorial_Python3_Jupyter_Notebook_Downloading_ECCO_Datasets_from_PODAAC.ipynb

Lines changed: 181 additions & 139 deletions
Large diffs are not rendered by default.
Lines changed: 220 additions & 0 deletions
@@ -0,0 +1,220 @@
def ecco_podaac_download(ShortName,StartDate,EndDate,download_root_dir=None,n_workers=6,force_redownload=False):
    """
    This routine downloads ECCO datasets from PO.DAAC. It is adapted from the Jupyter notebooks created by Jack McNelis and Ian Fenty (https://github.com/ECCO-GROUP/ECCO-ACCESS/blob/master/PODAAC/Downloading_ECCO_datasets_from_PODAAC/README.md) and modified by Andrew Delman (https://ecco-v4-python-tutorial.readthedocs.io).

    Parameters
    ----------
    ShortName: the ShortName of the dataset (can be identified from https://search.earthdata.nasa.gov/search?fpj=ECCO; select a dataset's "i" information button and the ShortName appears in a gray box in the upper-left corner)

    StartDate: the start of the time range to be downloaded, expressed in the format "YYYY-MM-DD"

    EndDate: the end of the time range to be downloaded, expressed in the format "YYYY-MM-DD"

    download_root_dir: path of the parent directory where ECCO files will be downloaded

    n_workers: number of workers to use in concurrent downloads

    force_redownload: if True, existing files will be re-downloaded and replaced; if False, existing files will not be replaced
    """

    ## Initialize Python libraries
    import numpy as np
    import pandas as pd
    import requests
    import shutil
    import time as time

    # for concurrent simultaneous downloads
    from concurrent.futures import ThreadPoolExecutor
    from getpass import getpass
    from http.cookiejar import CookieJar
    from io import StringIO
    from itertools import repeat
    from pathlib import Path
    from platform import system
    from netrc import netrc
    from os.path import basename, isfile, isdir, join, expanduser
    # progress bar
    from tqdm import tqdm
    # library to download files
    from urllib import request

    # if no download directory specified, set directory under user's home directory
    if download_root_dir is None:
        user_home_dir = expanduser('~')
        download_root_dir = Path(user_home_dir + '/Downloads/ECCO_V4r4_PODAAC')
    else:
        download_root_dir = Path(download_root_dir)

    # predict the path of the netrc file depending on os/platform type
    _netrc = join(expanduser('~'), "_netrc" if system()=="Windows" else ".netrc")

    ## Define helper subroutines

    ### Helper subroutine to log into NASA Earthdata

    # not pretty but it works
    def setup_earthdata_login_auth(url: str='urs.earthdata.nasa.gov'):
        # look for the netrc file and use the login/password it contains
        try:
            username, _, password = netrc(file=_netrc).authenticators(url)

        # if the file is not found, prompt the user for the login/password
        except (FileNotFoundError, TypeError):
            print('Please provide Earthdata Login credentials for access.')
            username, password = input('Username: '), getpass('Password: ')

        manager = request.HTTPPasswordMgrWithDefaultRealm()
        manager.add_password(None, url, username, password)
        auth = request.HTTPBasicAuthHandler(manager)
        jar = CookieJar()
        processor = request.HTTPCookieProcessor(jar)
        opener = request.build_opener(auth, processor)
        request.install_opener(opener)
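
    # For reference, a netrc file lets this routine authenticate without
    # prompting (a minimal sketch; substitute your own Earthdata credentials).
    # It lives at ~/.netrc (~/_netrc on Windows) and looks like:
    #     machine urs.earthdata.nasa.gov
    #         login your_earthdata_username
    #         password your_earthdata_password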

    ### Helper subroutines to make the API calls to search CMR and parse the response
    def set_params(params: dict):
        params.update({'scroll': "true", 'page_size': 2000})
        return {par: val for par, val in params.items() if val is not None}

    def get_results(params: dict, headers: dict=None):
        response = requests.get(url="https://cmr.earthdata.nasa.gov/search/granules.csv",
                                params=set_params(params),
                                headers=headers)
        return response, response.headers

    def get_granules(params: dict):
        response, headers = get_results(params=params)
        scroll = headers['CMR-Scroll-Id']
        hits = int(headers['CMR-Hits'])
        if hits == 0:
            raise Exception("No granules matched your input parameters.")
        df = pd.read_csv(StringIO(response.text))
        # CMR returns at most page_size granules per request; keep passing the
        # scroll id back until all matching granules have been retrieved
        while hits > df.index.size:
            response, _ = get_results(params=params, headers={'CMR-Scroll-Id': scroll})
            data = pd.read_csv(StringIO(response.text))
            df = pd.concat([df, data])
        return df
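
    # For reference, the first request issued by get_granules is equivalent to
    # a GET request of the form (the date range shown is illustrative only):
    # https://cmr.earthdata.nasa.gov/search/granules.csv?ShortName=<ShortName>&temporal=2000-01-01,2000-12-31&scroll=true&page_size=2000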

    ### Helper subroutine to gracefully download single files, avoiding re-downloads if the file already exists.
    # To force re-download of the file, pass True to the boolean argument force (default False)
    def download_file(url: str, output_dir: str, force: bool=False):
        """url (str): the HTTPS url from which the file will download
        output_dir (str): the local path into which the file will download
        force (bool): download even if the file exists locally already
        """
        if not isdir(output_dir):
            raise Exception(f"Output directory doesn't exist! ({output_dir})")

        target_file = join(output_dir, basename(url))

        # if the file has already been downloaded, skip it
        if isfile(target_file) and force is False:
            print(f'\n{basename(url)} already exists, and force=False, not re-downloading')
            return 0

        with requests.get(url) as r:
            if not r.status_code // 100 == 2:
                raise Exception(r.text)
            else:
                with open(target_file, 'wb') as f:
                    total_size_in_bytes = int(r.headers.get('content-length', 0))
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)

        return total_size_in_bytes

    ### Helper subroutine to download all urls in the list `dls`
    def download_files_concurrently(dls, download_dir, force=False):
        start_time = time.time()

        # use n_workers threads for concurrent downloads
        with ThreadPoolExecutor(max_workers=n_workers) as executor:

            # tqdm makes a cool progress bar
            results = list(tqdm(executor.map(download_file, dls, repeat(download_dir), repeat(force)), total=len(dls)))

        # add up the total downloaded file sizes
        total_download_size_in_bytes = np.sum(np.array(results))
        # calculate the total time spent in the download
        total_time = time.time() - start_time

        print('\n=====================================')
        print(f'total downloaded: {np.round(total_download_size_in_bytes/1e6,2)} MB')
        print(f'avg download speed: {np.round(total_download_size_in_bytes/1e6/total_time,2)} MB/s')

    # define the directory where the downloaded files will be saved
    # (a subdirectory of download_root_dir named after the dataset ShortName)
    download_dir = download_root_dir / ShortName

    # create the download directory
    download_dir.mkdir(exist_ok=True, parents=True)

    print(f'created download directory {download_dir}')

    ## Log into Earthdata using your username and password

    # actually log in with this command:
    setup_earthdata_login_auth()

    # Query the NASA Common Metadata Repository (CMR) to find the URL of every granule
    # associated with the desired ECCO dataset and date range of interest.

    # create a Python dictionary with our search criteria: `ShortName` and `temporal`
    input_search_params = {'ShortName': ShortName,
                           'temporal': ",".join([StartDate, EndDate])}

    print(input_search_params)
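
    # For reference, input_search_params will look something like the
    # following (the values shown are illustrative only):
    # {'ShortName': 'ECCO_L4_SSH_05DEG_MONTHLY_V4R4', 'temporal': '2000-01-01,2000-12-31'}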

    ### Query CMR for the desired ECCO dataset

    # grans means 'granules', PO.DAAC's term for individual files in a dataset
    grans = get_granules(input_search_params)

    # grans.info()

    num_grans = len(grans['Granule UR'])
    print(f'\nTotal number of matching granules: {num_grans}')

    ## Download the granules

    # convert the rows of the 'Online Access URLs' column to a Python list
    dls = grans['Online Access URLs'].tolist()

    try:
        ### Method 1: Concurrent downloads

        # Attempt concurrent downloads; if an error arises, switch to sequential downloads.
        # The number of concurrent downloads is set by n_workers (benefits typically taper off above 5-6).
        # Force re-download (or not) depending on the value of force_redownload.
        download_files_concurrently(dls, download_dir, force_redownload)

    except:
        ### Method 2: Sequential downloads

        # Download each URL sequentially in a for loop.
        total_download_size_in_bytes = 0
        start_time = time.time()

        # loop through all urls in dls
        for u in dls:
            u_name = u.split('/')[-1]
            print(f'downloading {u_name}')
            total_download_size_in_bytes += download_file(url=u, output_dir=download_dir, force=force_redownload)

        # calculate the total time spent in the download
        total_time = time.time() - start_time

        print('\n=====================================')
        print(f'total downloaded: {np.round(total_download_size_in_bytes/1e6,2)} MB')
        print(f'avg download speed: {np.round(total_download_size_in_bytes/1e6/total_time,2)} MB/s')
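
# Example usage (a minimal sketch; the ShortName and date range below are
# illustrative, and the call is left commented out so nothing downloads on import):
# ecco_podaac_download(ShortName='ECCO_L4_SSH_05DEG_MONTHLY_V4R4',
#                      StartDate='2000-01-01',EndDate='2000-12-31',
#                      n_workers=6,force_redownload=False)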

README.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 This repository contains a Python tutorial for using the [ECCO Central Production version 4](https://ecco.jpl.nasa.gov/) ocean and sea-ice state estimate. Directories within the repository include the [tutorial documentation](http://ecco-v4-python-tutorial.readthedocs.io/) and individual lessons from the tutorial as Jupyter notebooks ([Tutorials_as_Jupyter_Notebooks/](Tutorials_as_Jupyter_Notebooks/)) and Python files ([Tutorials_as_Python_Files/](Tutorials_as_Python_Files/)).
-The tutorials were written for ECCO version 4 release 3 but should be applicable to any ECCO v4 solution. If user support is needed, please contact <[email protected]>.
+The tutorials were written for ECCO version 4 release 3 but should be applicable to any ECCO v4 solution, and are currently being updated for version 4 release 4. If user support is needed, please contact <[email protected]>.
 [Estimating the Circulation and Climate of the Ocean]: http://ecco.jpl.nasa.gov, http://ecco-group.org/
