def ecco_podaac_download(ShortName,StartDate,EndDate,download_root_dir=None,n_workers=6,force_redownload=False):
    """
    This routine downloads ECCO datasets from PO.DAAC. It is adapted from the Jupyter notebooks created by Jack McNelis and Ian Fenty (https://github.com/ECCO-GROUP/ECCO-ACCESS/blob/master/PODAAC/Downloading_ECCO_datasets_from_PODAAC/README.md) and modified by Andrew Delman (https://ecco-v4-python-tutorial.readthedocs.io).

    Parameters
    ----------
    ShortName: the ShortName of the dataset (it can be identified from https://search.earthdata.nasa.gov/search?fpj=ECCO by selecting a dataset's "i" information button; the ShortName appears in a gray box in the upper-left corner)

    StartDate: the start of the time range to be downloaded, expressed in the format "YYYY-MM-DD"

    EndDate: the end of the time range to be downloaded, expressed in the format "YYYY-MM-DD"

    download_root_dir: path of the parent directory where ECCO files will be downloaded; if None, defaults to ~/Downloads/ECCO_V4r4_PODAAC

    n_workers: number of worker threads to use in concurrent downloads

    force_redownload: if True, existing files will be re-downloaded and replaced; if False, existing files will not be replaced

    Downloaded files are saved in a subdirectory of download_root_dir named after ShortName; an example call is sketched in the comment at the end of this module.
    """

    ## Initialize Python libraries
    import numpy as np
    import pandas as pd
    import requests
    import time

    # for concurrent (simultaneous) downloads
    from concurrent.futures import ThreadPoolExecutor
    from getpass import getpass
    from http.cookiejar import CookieJar
    from io import StringIO
    from itertools import repeat
    from pathlib import Path
    from platform import system
    from netrc import netrc
    from os.path import basename, expanduser, isfile, isdir, join
    # progress bar
    from tqdm import tqdm
    # library to download files
    from urllib import request

    # if no download directory was specified, default to a directory under the user's home directory
    if download_root_dir is None:
        user_home_dir = expanduser('~')
        download_root_dir = Path(user_home_dir + '/Downloads/ECCO_V4r4_PODAAC')
    else:
        download_root_dir = Path(download_root_dir)

    # Predict the path of the netrc file depending on os/platform type.
    _netrc = join(expanduser('~'), "_netrc" if system()=="Windows" else ".netrc")
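    # The netrc lookup below expects a standard Earthdata Login entry in that file,
    # for example:
    #     machine urs.earthdata.nasa.gov
    #         login YOUR_EARTHDATA_USERNAME
    #         password YOUR_EARTHDATA_PASSWORD
    # If no such entry is found, the helper prompts for credentials instead.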

    ## Define Helper Subroutines

    ### Helper subroutine to log into NASA Earthdata

    # not pretty but it works
    def setup_earthdata_login_auth(url: str='urs.earthdata.nasa.gov'):
        # look for the netrc file and use the login/password
        try:
            username, _, password = netrc(file=_netrc).authenticators(url)

        # if the file is not found, prompt the user for the login/password
        except (FileNotFoundError, TypeError):
            print('Please provide Earthdata Login credentials for access.')
            username, password = input('Username: '), getpass('Password: ')

        manager = request.HTTPPasswordMgrWithDefaultRealm()
        manager.add_password(None, url, username, password)
        auth = request.HTTPBasicAuthHandler(manager)
        jar = CookieJar()
        processor = request.HTTPCookieProcessor(jar)
        opener = request.build_opener(auth, processor)
        request.install_opener(opener)

    ### Helper subroutines to make the API calls to search CMR and parse response
    def set_params(params: dict):
        params.update({'scroll': "true", 'page_size': 2000})
        return {par: val for par, val in params.items() if val is not None}

    def get_results(params: dict, headers: dict=None):
        response = requests.get(url="https://cmr.earthdata.nasa.gov/search/granules.csv",
                                params=set_params(params),
                                headers=headers)
        return response, response.headers


    def get_granules(params: dict):
        response, headers = get_results(params=params)
        scroll = headers['CMR-Scroll-Id']
        hits = int(headers['CMR-Hits'])
        if hits==0:
            raise Exception("No granules matched your input parameters.")
        df = pd.read_csv(StringIO(response.text))
        while hits > df.index.size:
            response, _ = get_results(params=params, headers={'CMR-Scroll-Id': scroll})
            data = pd.read_csv(StringIO(response.text))
            df = pd.concat([df, data])
        return df

    ### Helper subroutine to download a single file gracefully, skipping the download if the file already exists.
    # To force redownload of the file, pass True to the boolean argument force (default False).
    def download_file(url: str, output_dir: str, force: bool=False):
        """url (str): the HTTPS url from which the file will download
        output_dir (str): the local path into which the file will download
        force (bool): download even if the file exists locally already
        """
        if not isdir(output_dir):
            raise Exception(f"Output directory doesn't exist! ({output_dir})")

        target_file = join(output_dir, basename(url))

        # if the file has already been downloaded, skip it
        if isfile(target_file) and force is False:
            print(f'\n{basename(url)} already exists, and force=False, not re-downloading')
            return 0

        # stream the file to disk in chunks rather than loading it all into memory
        with requests.get(url, stream=True) as r:
            if not r.status_code // 100 == 2:
                raise Exception(r.text)
            else:
                with open(target_file, 'wb') as f:
                    total_size_in_bytes = int(r.headers.get('content-length', 0))
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)

                return total_size_in_bytes

    ### Helper subroutine to download all urls in the list `dls`
    def download_files_concurrently(dls, download_dir, force=False):
        start_time = time.time()

        # use a thread pool with n_workers threads for concurrent downloads
        with ThreadPoolExecutor(max_workers=n_workers) as executor:

            # tqdm makes a cool progress bar
            results = list(tqdm(executor.map(download_file, dls, repeat(download_dir), repeat(force)), total=len(dls)))

        # add up the total downloaded file sizes
        total_download_size_in_bytes = np.sum(np.array(results))
        # calculate total time spent in the download
        total_time = time.time() - start_time

        print('\n=====================================')
        print(f'total downloaded: {np.round(total_download_size_in_bytes/1e6,2)} MB')
        print(f'avg download speed: {np.round(total_download_size_in_bytes/1e6/total_time,2)} MB/s')

    # define the directory where the downloaded files will be saved (a subdirectory of download_root_dir named after the dataset ShortName)
    download_dir = download_root_dir / ShortName

    # create the download directory
    download_dir.mkdir(exist_ok=True, parents=True)

    print(f'created download directory {download_dir}')

    ## Log into Earthdata using your username and password

    # actually log in with this command:
    setup_earthdata_login_auth()

    # Query the NASA Common Metadata Repository to find the URL of every granule associated with the desired ECCO Dataset and date range of interest.

    # create a Python dictionary with our search criteria: `ShortName` and `temporal`
    input_search_params = {'ShortName': ShortName,
                           'temporal': ",".join([StartDate, EndDate])}
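    # e.g., for StartDate='1994-01-01' and EndDate='1994-12-31' the 'temporal' value
    # becomes the comma-separated range string '1994-01-01,1994-12-31', which is the
    # format the CMR granule search accepts for a temporal filter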

    print(input_search_params)

    ### Query CMR for the desired ECCO Dataset

    # grans means 'granules', PO.DAAC's term for individual files in a dataset
    grans = get_granules(input_search_params)

    # grans.info()

    num_grans = len(grans['Granule UR'])
    print(f'\nTotal number of matching granules: {num_grans}')

    ## Download the granules

    # convert the rows of the 'Online Access URLs' column to a Python list
    dls = grans['Online Access URLs'].tolist()

    try:
        # Attempt concurrent downloads, but if an error arises switch to sequential downloads
        ### Method 1: Concurrent downloads

        # Use n_workers concurrent downloads (benefits typically taper off above 5-6 workers),
        # and force redownload (or not) depending on the value of force_redownload
        download_files_concurrently(dls, download_dir, force_redownload)

    except Exception:
        ### Method 2: Sequential downloads

        # Download each URL sequentially in a for loop.
        total_download_size_in_bytes = 0
        start_time = time.time()

        # loop through all urls in dls
        for u in dls:
            u_name = u.split('/')[-1]
            print(f'downloading {u_name}')
            total_download_size_in_bytes += download_file(url=u, output_dir=download_dir, force=force_redownload)

        # calculate total time spent in the download
        total_time = time.time() - start_time

        print('\n=====================================')
        print(f'total downloaded: {np.round(total_download_size_in_bytes/1e6,2)} MB')
        print(f'avg download speed: {np.round(total_download_size_in_bytes/1e6/total_time,2)} MB/s')
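
# Example usage: a minimal sketch, assuming a (free) NASA Earthdata Login account and
# network access to PO.DAAC; the ShortName and date range below are illustrative only.
#
#     ecco_podaac_download(ShortName='ECCO_L4_SSH_LLC0090GRID_MONTHLY_V4R4',
#                          StartDate='2000-01-01',
#                          EndDate='2000-12-31',
#                          download_root_dir=None,   # defaults to ~/Downloads/ECCO_V4r4_PODAAC
#                          n_workers=6,
#                          force_redownload=False)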