From 7ebe450d252bf57b24e119c9606cb43932f80ce1 Mon Sep 17 00:00:00 2001 From: georgette Date: Sun, 15 Feb 2015 18:32:34 -0800 Subject: [PATCH] removed files.py & process.py after moving relevant functions into mccSearch.py --- code/files.py | 696 ------------------------ code/mccSearch.py | 113 +++- code/process.py | 1284 --------------------------------------------- 3 files changed, 100 insertions(+), 1993 deletions(-) delete mode 100644 code/files.py delete mode 100644 code/process.py diff --git a/code/files.py b/code/files.py deleted file mode 100644 index 82b508a..0000000 --- a/code/files.py +++ /dev/null @@ -1,696 +0,0 @@ -""" -Module for handling data input files. Requires PyNIO and Numpy be -installed. - -This module can easily open NetCDF, HDF and Grib files. Search the PyNIO -documentation for a complete list of supported formats. -""" - -from os import path - -try: - import Nio -except ImportError: - import nio as Nio - -import numpy as np -import numpy.ma as ma -import sys -import process #KDW added -#from toolkit import process -#from utils import fortranfile -#from utils import misc - - -VARIABLE_NAMES = {'time': ['time', 'times', 'date', 'dates', 'julian'], - 'latitude': ['latitude', 'lat', 'lats', 'latitudes'], - 'longitude': ['longitude', 'lon', 'lons', 'longitudes'] - } - - -def findunique(seq): - keys = {} - for e in seq: - keys[e] = 1 - return keys.keys() - -def getVariableByType(filename, variableType): - """ - Function that will try to return the variable from a file based on a provided - parameter type. - - Input:: - filename - the file to inspect - variableType - time | latitude | longitude - - Output:: - variable name OR list of all variables in the file if a single variable - name match cannot be found. - """ - try: - f = Nio.open_file(filename) - except: - #print 'PyNio had an issue opening the filename (%s) you provided' % filename - print "NIOError:", sys.exc_info()[0] - raise - - variableKeys = f.variables.keys() - f.close() - variableKeys = [variable.lower() for variable in variableKeys] - variableMatch = VARIABLE_NAMES[variableType] - - commonVariables = list(set(variableKeys).intersection(variableMatch)) - - if len(commonVariables) == 1: - return str(commonVariables[0]) - - else: - return variableKeys - -def getVariableRange(filename, variableName): - """ - Function to return the min and max values of the given variable in - the supplied filename. - - Input:: - filename - absolute path to a file - variableName - variable whose min and max values should be returned - - Output:: - variableRange - tuple of order (variableMin, variableMax) - """ - try: - f = Nio.open_file(filename) - except: - #print 'PyNio had an issue opening the filename (%s) you provided' % filename - print "NIOError:", sys.exc_info()[0] - raise - - varArray = f.variables[variableName][:] - return (varArray.min(), varArray.max()) - - -def read_data_from_file_list(filelist, myvar, timeVarName, latVarName, lonVarName): - ''' - Read in data from a list of model files into a single data structure - - Input: - filelist - list of filenames (including path) - myvar - string containing name of variable to load in (as it appears in file) - Output: - lat, lon - 2D array of latitude and longitude values - timestore - list of times - t2store - numpy array containing data from all files - - NB. 
originally written specific for WRF netCDF output files - modified to make more general (Feb 2011) - - Peter Lean July 2010 - ''' - - filelist.sort() - filename = filelist[0] - # Crash nicely if 'filelist' is zero length - """TODO: Throw Error instead via try Except""" - if len(filelist) == 0: - print 'Error: no files have been passed to read_data_from_file_list()' - sys.exit() - - # Open the first file in the list to: - # i) read in lats, lons - # ii) find out how many timesteps in the file - # (assume same ntimes in each file in list) - # -allows you to create an empty array to store variable data for all times - tmp = Nio.open_file(filename) - latsraw = tmp.variables[latVarName][:] - lonsraw = tmp.variables[lonVarName][:] - lonsraw[lonsraw > 180] = lonsraw[lonsraw > 180] - 360. # convert to -180,180 if necessary - - """TODO: Guard against case where latsraw and lonsraw are not the same dim?""" - - if(latsraw.ndim == 1): - lon, lat = np.meshgrid(lonsraw, latsraw) - if(latsraw.ndim == 2): - lon = lonsraw - lat = latsraw - - timesraw = tmp.variables[timeVarName] - ntimes = len(timesraw) - - print 'Lats and lons read in for first file in filelist', ntimes - - # Create a single empty masked array to store model data from all files - t2store = ma.zeros((ntimes * len(filelist), len(lat[:, 0]), len(lon[0, :]))) - timestore = ma.zeros((ntimes * len(filelist))) - - # Now load in the data for real - # NB. no need to reload in the latitudes and longitudes -assume invariant - i = 0 - timesaccu = 0 # a counter for number of times stored so far in t2store - # NB. this method allows for missing times in data files - # as no assumption made that same number of times in each file... - - - for ifile in filelist: - - #print 'Loading data from file: ',filelist[i] - f = Nio.open_file(ifile) - t2raw = f.variables[myvar][:] - timesraw = f.variables[timeVarName] - time = timesraw[:] - ntimes = len(time) - print 'file= ', i, 'ntimes= ', ntimes, filelist[i] - print '\nt2raw shape: ', t2raw.shape - - # Flatten dimensions which needn't exist, i.e. level - # e.g. if for single level then often data have 4 dimensions, when 3 dimensions will do. - # Code requires data to have dimensions, (time,lat,lon) - # i.e. remove level dimensions - # Remove 1d axis from the t2raw array - # Example: t2raw.shape == (365, 180, 360 1) - # After the squeeze you will be left with (365, 180, 360) instead - t2tmp = t2raw.squeeze() - # Nb. if this happens to be data for a single time only, then we just flattened it by accident - # lets put it back... - if t2tmp.ndim == 2: - t2tmp = np.expand_dims(t2tmp, 0) - - print "\n timesaccu is: ", timesaccu, time,timesaccu + np.arange(ntimes) - - t2store[timesaccu + np.arange(ntimes), :, :] = t2tmp[:, :, :] - timestore[timesaccu + np.arange(ntimes)] = time - timesaccu = timesaccu + ntimes - print "\n ***timesaccu is: ", timesaccu - f.close() - i += 1 - - print '\nData read in successfully with dimensions: ', t2store.shape, timesaccu - - # TODO: search for duplicated entries (same time) and remove duplicates. - # Check to see if number of unique times == number of times, if so then no problem - - if(len(np.unique(timestore)) != len(np.where(timestore != 0)[0].view())): - print '\nWARNING: Possible duplicated times' - - # Decode model times into python datetime objects. 
Note: timestore becomes a list (no more an array) here - timestore, _ = process.getModelTimes(filename, timeVarName) - - data_dict = {} - data_dict['lats'] = lat - data_dict['lons'] = lon - data_dict['times'] = timestore - data_dict['data'] = t2store - #return lat, lon, timestore, t2store - return data_dict - -def select_var_from_file(myfile, fmt='not set'): - ''' - Routine to act as user interface to allow users to select variable of interest from a file. - - Input: - myfile - filename - fmt - (optional) specify fileformat for PyNIO if filename suffix is non-standard - - Output: - myvar - variable name in file - - Peter Lean September 2010 - ''' - - print fmt - - if fmt == 'not set': - f = Nio.open_file(myfile) - - if fmt != 'not set': - f = Nio.open_file(myfile, format=fmt) - - keylist = f.variables.keys() - - i = 0 - for v in keylist: - print '[', i, '] ', f.variables[v].long_name, ' (', v, ')' - i += 1 - - user_selection = raw_input('Please select variable : [0 -' + str(i - 1) + '] ') - - myvar = keylist[int(user_selection)] - - return myvar - -def select_var_from_wrf_file(myfile): - ''' - Routine to act as user interface to allow users to select variable of interest from a wrf netCDF file. - - Input: - myfile - filename - - Output: - mywrfvar - variable name in wrf file - - Peter Lean September 2010 - ''' - - f = Nio.open_file(myfile, format='nc') - - keylist = f.variables.keys() - - i = 0 - for v in keylist: - try: - print '[', i, '] ', f.variables[v].description, ' (', v, ')' - except: - print '' - - i += 1 - - user_selection = raw_input('Please select WRF variable : [0 -' + str(i - 1) + '] ') - - mywrfvar = keylist[int(user_selection)] - - return mywrfvar - -def read_lolaT_from_file(filename, latVarName, lonVarName, timeVarName, file_type): - """ - Function that will return lat, lon, and time arrays - - Input:: - filename - the file to inspect - latVarName - name of the Latitude Variable - lonVarName - name of the Longitude Variable - timeVarName - name of the Time Variable - fileType = type of file we are trying to parse - - Output:: - lat - MESH GRID of Latitude values with shape (nx, ny) - lon - MESH GRID of Longitude values with shape (nx, ny) - timestore - Python list of Datetime objects - - MESHGRID docs: http://docs.scipy.org/doc/numpy/reference/generated/numpy.meshgrid.html - - """ - - tmp = Nio.open_file(filename, format=file_type) - lonsraw = tmp.variables[lonVarName][:] - latsraw = tmp.variables[latVarName][:] - lonsraw[lonsraw > 180] = lonsraw[lonsraw > 180] - 360. # convert to -180,180 if necessary - if(latsraw.ndim == 1): - lon, lat = np.meshgrid(lonsraw, latsraw) - if(latsraw.ndim == 2): - lon = lonsraw - lat = latsraw - timestore, _ = process.getModelTimes(filename, timeVarName) - print ' read_lolaT_from_file: Lats, lons and times read in for the model domain' - return lat, lon, timestore - -def read_data_from_one_file(ifile, myvar, timeVarName, lat, file_type): - ################################################################################## - # Read in data from one file at a time - # Input: filelist - list of filenames (including path) - # myvar - string containing name of variable to load in (as it appears in file) - # Output: lat, lon - 2D array of latitude and longitude values - # times - list of times - # t2store - numpy array containing data from all files - # Modified from read_data_from_file_list to read data from multiple models into a 4-D array - # 1. The code now processes model data that completely covers the 20-yr period. 
Thus, - # all model data must have the same time levels (ntimes). Unlike in the oroginal, ntimes - # is fixed here. - # 2. Because one of the model data exceeds 240 mos (243 mos), the model data must be - # truncated to the 240 mons using the ntimes determined from the first file. - ################################################################################## - f = Nio.open_file(ifile) - try: - varUnit = f.variables[myvar].units.upper() - except: - varUnit = raw_input('Enter the model variable unit: \n> ').upper() - t2raw = f.variables[myvar][:] - t2tmp = t2raw.squeeze() - if t2tmp.ndim == 2: - t2tmp = np.expand_dims(t2tmp, 0) - t2tmp = t2tmp - f.close() - print ' success read_data_from_one_file: VarName=', myvar, ' Shape(Full)= ', t2tmp.shape, ' Unit= ', varUnit - timestore = process.decode_model_timesK(ifile, timeVarName, file_type) - return timestore, t2tmp, varUnit - -def findTimeVariable(filename): - """ - Function to find what the time variable is called in a model file. - Input:: - filename - file to crack open and check for a time variable - Output:: - timeName - name of the input file's time variable - variableNameList - list of variable names from the input filename - """ - try: - f = Nio.open_file(filename, mode='r') - except: - print("Unable to open '%s' to try and read the Time variable" % filename) - raise - - variableNameList = f.variables.keys() - # convert all variable names into lower case - varNameListLowerCase = [x.lower() for x in variableNameList] - - # Use "set" types for finding common variable name from in the file and from the list of possibilities - possibleTimeNames = set(['time', 'times', 'date', 'dates', 'julian']) - - # Use the sets to find the intersection where variable names are in possibleNames - timeNameSet = possibleTimeNames.intersection(varNameListLowerCase) - - if len(timeNameSet) == 0: - print("Unable to autodetect the Time Variable Name in the '%s'" % filename) - timeName = misc.askUserForVariableName(variableNameList, targetName ="Time") - - else: - timeName = timeNameSet.pop() - - return timeName, variableNameList - - -def findLatLonVarFromFile(filename): - """ - Function to find what the latitude and longitude variables are called in a model file. 
- - Input:: - -filename - Output:: - -latVarName - -lonVarName - -latMin - -latMax - -lonMin - -lonMax - """ - try: - f = Nio.open_file(filename, mode='r') - except: - print("Unable to open '%s' to try and read the Latitude and Longitude variables" % filename) - raise - - variableNameList = f.variables.keys() - # convert all variable names into lower case - varNameListLowerCase = [x.lower() for x in variableNameList] - - # Use "set" types for finding common variable name from in the file and from the list of possibilities - possibleLatNames = set(['latitude', 'lat', 'lats', 'latitudes']) - possibleLonNames = set(['longitude', 'lon', 'lons', 'longitudes']) - - # Use the sets to find the intersection where variable names are in possibleNames - latNameSet = possibleLatNames.intersection(varNameListLowerCase) - lonNameSet = possibleLonNames.intersection(varNameListLowerCase) - - if len(latNameSet) == 0 or len(lonNameSet) == 0: - print("Unable to autodetect Latitude and/or Longitude values in the file") - latName = misc.askUserForVariableName(variableNameList, targetName ="Latitude") - lonName = misc.askUserForVariableName(variableNameList, targetName ="Longitude") - - else: - latName = latNameSet.pop() - lonName = lonNameSet.pop() - - lats = np.array(f.variables[latName][:]) - lons = np.array(f.variables[lonName][:]) - - latMin = lats.min() - latMax = lats.max() - - # Convert the lons from 0:360 into -180:180 - lons[lons > 180] = lons[lons > 180] - 360. - lonMin = lons.min() - lonMax = lons.max() - - return latName, lonName, latMin, latMax, lonMin, lonMax - - -def read_data_from_file_list_K(filelist, myvar, timeVarName, latVarName, lonVarName, file_type): - ################################################################################## - # Read in data from a list of model files into a single data structure - # Input: filelist - list of filenames (including path) - # myvar - string containing name of variable to load in (as it appears in file) - # Output: lat, lon - 2D array of latitude and longitude values - # times - list of times - # t2store - numpy array containing data from all files - # Modified from read_data_from_file_list to read data from multiple models into a 4-D array - # 1. The code now processes model data that completely covers the 20-yr period. Thus, - # all model data must have the same time levels (ntimes). Unlike in the oroginal, ntimes - # is fixed here. - # 2. Because one of the model data exceeds 240 mos (243 mos), the model data must be - # truncated to the 240 mons using the ntimes determined from the first file. - ################################################################################## - filelist.sort() - nfiles = len(filelist) - # Crash nicely if 'filelist' is zero length - if nfiles == 0: - print 'Error: no files have been passed to read_data_from_file_list(): Exit' - sys.exit() - - # Open the first file in the list to: - # i) read in lats, lons - # ii) find out how many timesteps in the file (assume same ntimes in each file in list) - # -allows you to create an empty array to store variable data for all times - tmp = Nio.open_file(filelist[0], format=file_type) - latsraw = tmp.variables[latVarName][:] - lonsraw = tmp.variables[lonVarName][:] - lonsraw[lonsraw > 180] = lonsraw[lonsraw > 180] - 360. 
# convert to -180,180 if necessary - if(latsraw.ndim == 1): - lon, lat = np.meshgrid(lonsraw, latsraw) - if(latsraw.ndim == 2): - lon = lonsraw; lat = latsraw - - timesraw = tmp.variables[timeVarName] - ntimes = len(timesraw); nygrd = len(lat[:, 0]); nxgrd = len(lon[0, :]) - - print 'Lats and lons read in for first file in filelist' - - # Create a single empty masked array to store model data from all files - #t2store = ma.zeros((ntimes*nfiles,nygrd,nxgrd)) - t2store = ma.zeros((nfiles, ntimes, nygrd, nxgrd)) - #timestore=ma.zeros((ntimes*nfiles)) - - ## Now load in the data for real - ## NB. no need to reload in the latitudes and longitudes -assume invariant - #timesaccu=0 # a counter for number of times stored so far in t2store - # NB. this method allows for missing times in data files - # as no assumption made that same number of times in each file... - - i = 0 - for ifile in filelist: - #print 'Loading data from file: ',filelist[i] - f = Nio.open_file(ifile) - t2raw = f.variables[myvar][:] - timesraw = f.variables[timeVarName] - #time = timesraw[0:ntimes] - ntimes = len(timesraw) - #ntimes=len(time) - print 'file= ',i,' ntimes= ',ntimes,filelist[i] - ## Flatten dimensions which needn't exist, i.e. level - ## e.g. if for single level then often data have 4 dimensions, when 3 dimensions will do. - ## Code requires data to have dimensions, (time,lat,lon) - ## i.e. remove level dimensions - t2tmp = t2raw.squeeze() - ## Nb. if data happen to be for a single time, we flattened it by accident; lets put it back... - if t2tmp.ndim == 2: - t2tmp = np.expand_dims(t2tmp, 0) - #t2store[timesaccu+np.arange(ntimes),:,:]=t2tmp[0:ntimes,:,:] - t2store[i, 0:(ntimes-1), :, :] = t2tmp[0:(ntimes-1), :, :] - #timestore[timesaccu+np.arange(ntimes)]=time - #timesaccu=timesaccu+ntimes - f.close() - i += 1 - - print 'Data read in successfully with dimensions: ', t2store.shape - - # Decode model times into python datetime objects. Note: timestore becomes a list (no more an array) here - ifile = filelist[0] - timestore, _ = process.getModelTimes(ifile, timeVarName) - - return lat, lon, timestore, t2store - -def find_latlon_ranges(filelist, lat_var_name, lon_var_name): - # Function to return the latitude and longitude ranges of the data in a file, - # given the identifying variable names. - # - # Input: - # filelist - list of filenames (data is read in from first file only) - # lat_var_name - variable name of the 'latitude' variable - # lon_var_name - variable name of the 'longitude' variable - # - # Output: - # latMin, latMax, lonMin, lonMax - self explanatory - # - # Peter Lean March 2011 - - filename = filelist[0] - - try: - f = Nio.open_file(filename) - - lats = f.variables[lat_var_name][:] - latMin = lats.min() - latMax = lats.max() - - lons = f.variables[lon_var_name][:] - lons[lons > 180] = lons[lons > 180] - 360. - lonMin = lons.min() - lonMax = lons.max() - - return latMin, latMax, lonMin, lonMax - - except: - print 'Error: there was a problem with finding the latitude and longitude ranges in the file' - print ' Please check that you specified the filename, and variable names correctly.' 
- - sys.exit() - -def writeBN_lola(fileName, lons, lats): - # write a binary data file that include longitude (1-d) and latitude (1-d) values - - F = fortranfile.FortranFile(fileName, mode='w') - ngrdY = lons.shape[0]; ngrdX = lons.shape[1] - tmpDat = ma.zeros(ngrdX); tmpDat[:] = lons[0, :]; F.writeReals(tmpDat) - tmpDat = ma.zeros(ngrdY); tmpDat[:] = lats[:, 0]; F.writeReals(tmpDat) - # release temporary arrays - tmpDat = 0 - F.close() - -def writeBNdata(fileName, numOBSs, numMDLs, nT, ngrdX, ngrdY, numSubRgn, obsData, mdlData, obsRgnAvg, mdlRgnAvg): - #(fileName,maskOption,numOBSs,numMDLs,nT,ngrdX,ngrdY,numSubRgn,obsData,mdlData,obsRgnAvg,mdlRgnAvg): - # write spatially- and regionally regridded data into a binary data file - missing = -1.e26 - F = fortranfile.FortranFile(fileName, mode='w') - # construct a data array to replace mask flag with a missing value (missing=-1.e12) for printing - data = ma.zeros((nT, ngrdY, ngrdX)) - for m in np.arange(numOBSs): - data[:, :, :] = obsData[m, :, :, :]; msk = data.mask - for n in np.arange(nT): - for j in np.arange(ngrdY): - for i in np.arange(ngrdX): - if msk[n, j, i]: data[n, j, i] = missing - - # write observed data. allowed to write only one row at a time - tmpDat = ma.zeros(ngrdX) - for n in np.arange(nT): - for j in np.arange(ngrdY): - tmpDat[:] = data[n, j, :] - F.writeReals(tmpDat) - - # write model data (dep. on the number of models). - for m in np.arange(numMDLs): - data[:, :, :] = mdlData[m, :, :, :] - msk = data.mask - for n in np.arange(nT): - for j in np.arange(ngrdY): - for i in np.arange(ngrdX): - if msk[n, j, i]: - data[n, j, i] = missing - - for n in np.arange(nT): - for j in np.arange(ngrdY): - tmpDat[:] = data[n, j, :] - F.writeReals(tmpDat) - - data = 0 # release the array allocated for data - # write data in subregions - if numSubRgn > 0: - print 'Also included are the time series of the means over ', numSubRgn, ' areas from obs and model data' - tmpDat = ma.zeros(nT); print numSubRgn - for m in np.arange(numOBSs): - for n in np.arange(numSubRgn): - tmpDat[:] = obsRgnAvg[m, n, :] - F.writeReals(tmpDat) - for m in np.arange(numMDLs): - for n in np.arange(numSubRgn): - tmpDat[:] = mdlRgnAvg[m, n, :] - F.writeReals(tmpDat) - tmpDat = 0 # release the array allocated for tmpDat - F.close() - -def writeNCfile(fileName, numSubRgn, lons, lats, obsData, mdlData, obsRgnAvg, mdlRgnAvg, obsList, mdlList, subRegions): - # write an output file of variables up to 3 dimensions - # fileName: the name of the output data file - # numSubRgn : the number of subregions - # lons[ngrdX]: longitude - # lats[ngrdY]: latitudes - # obsData[nT,ngrdY,ngrdX]: the obs time series of the entire model domain - # mdlData[numMDLs,nT,ngrdY,ngrdX]: the mdltime series of the entire model domain - # obsRgnAvg[numSubRgn,nT]: the obs time series for the all subregions - # mdlRgnAvg[numMDLs,numSubRgn,nT]: the mdl time series for the all subregions - dimO = obsData.shape[0] # the number of obs data - dimM = mdlData.shape[0] # the number of mdl data - dimT = mdlData.shape[1] # the number of time levels - dimY = mdlData.shape[2] # y-dimension - dimX = mdlData.shape[3] # x-dimension - dimR = obsRgnAvg.shape[1] # the number of subregions - f = Nio.open_file(fileName, mode='w', format='nc') - print mdlRgnAvg.shape, dimM, dimR, dimT - #create global attributes - f.globalAttName = '' - # create dimensions - print 'Creating Dimensions within the NetCDF Object...' 
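    # NB. aside for readers without PyNIO (which is no longer easy to install):
    # the file layout created below could be reproduced with the netCDF4-python
    # package using roughly equivalent calls. This is a sketch under that
    # assumption only, with 'output.nc' as a placeholder filename; it is not
    # part of this module:
    #
    #     import netCDF4
    #     f = netCDF4.Dataset('output.nc', mode='w')
    #     f.createDimension('time', dimT)
    #     f.createDimension('west_east', dimX)
    #     f.createDimension('south_north', dimY)
    #     lon = f.createVariable('lon', 'f8', ('south_north', 'west_east'))
    #     lon[:, :] = lons
    #
    # The PyNIO calls that follow create the same dimensions and variables.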
-    f.create_dimension('unity', 1)
-    f.create_dimension('time', dimT)
-    f.create_dimension('west_east', dimX)
-    f.create_dimension('south_north', dimY)
-    f.create_dimension('obs', dimO)
-    f.create_dimension('models', dimM)
-
-    # create the variable (real*4) to be written in the file
-    print 'Creating Variables...'
-    f.create_variable('lon', 'd', ('south_north', 'west_east'))
-    f.create_variable('lat', 'd', ('south_north', 'west_east'))
-    f.create_variable('oDat', 'd', ('obs', 'time', 'south_north', 'west_east'))
-    f.create_variable('mDat', 'd', ('models', 'time', 'south_north', 'west_east'))
-
-    if subRegions:
-        f.create_dimension('regions', dimR)
-        f.create_variable('oRgn', 'd', ('obs', 'regions', 'time'))
-        f.create_variable('mRgn', 'd', ('models', 'regions', 'time'))
-        f.variables['oRgn'].varAttName = 'Observation time series: Subregions'
-        f.variables['mRgn'].varAttName = 'Model time series: Subregions'
-
-    loadDataIntoNetCDF(f, obsList, obsData, 'Observation')
-    print 'Loaded the Observations into the NetCDF'
-
-    loadDataIntoNetCDF(f, mdlList, mdlData, 'Model')
-
-    # create attributes and units for the variable
-    print 'Creating Attributes and Units...'
-    f.variables['lon'].varAttName = 'Longitudes'
-    f.variables['lon'].varUnit = 'degrees East'
-    f.variables['lat'].varAttName = 'Latitudes'
-    f.variables['lat'].varUnit = 'degrees North'
-    f.variables['oDat'].varAttName = 'Observation time series: entire domain'
-    f.variables['mDat'].varAttName = 'Model time series: entire domain'
-
-    # assign the values to the variable and write it
-    f.variables['lon'][:, :] = lons
-    f.variables['lat'][:, :] = lats
-    if subRegions:
-        f.variables['oRgn'][:, :, :] = obsRgnAvg
-        f.variables['mRgn'][:, :, :] = mdlRgnAvg
-
-    f.close()
-
-def loadDataIntoNetCDF(fileObject, datasets, dataArray, dataType):
-    """
-    Input::
-        fileObject - PyNIO file object data will be loaded into
-        datasets - List of dataset names
-        dataArray - Multi-dimensional array of data to be loaded into the NetCDF file
-        dataType - String with value of either 'Model' or 'Observation'
-    Output::
-        No return value. PyNIO file object is updated in place
-    """
-    datasetCount = 0
-    for dataset in datasets:
-        if dataType.lower() == 'observation':
-            datasetName = dataset.replace(' ', '')
-        elif dataType.lower() == 'model':
-            datasetName = path.splitext(path.basename(dataset))[0]
-        print "Creating variable %s" % datasetName
-        fileObject.create_variable(datasetName, 'd', ('time', 'south_north', 'west_east'))
-        fileObject.variables[datasetName].varAttName = 'Observation time series: entire domain'
-        print 'Loading values into %s' % datasetName
-        fileObject.variables[datasetName].assign_value(dataArray[datasetCount,:,:,:])
-        datasetCount += 1
diff --git a/code/mccSearch.py b/code/mccSearch.py
index 59ec44b..6e246f6 100644
--- a/code/mccSearch.py
+++ b/code/mccSearch.py
@@ -31,10 +31,8 @@ import matplotlib.cm as cm
 import matplotlib.colors as mcolors
 from matplotlib.ticker import FuncFormatter, FormatStrFormatter
+from scipy.ndimage import map_coordinates
 
-#existing modules in services
-import files
-import process
 #----------------------- GLOBAL VARIABLES --------------------------
 # --------------------- User defined variables ---------------------
 #FYI the lat lon values are not necessarily inclusive of the points given. These are the limits
@@ -374,7 +372,7 @@ def findCloudElements(mergImgs,timelist,TRMMdirName=None):
         #---------regrid the TRMM data to the MERG dataset ----------------------------------
         #regrid using the do_regrid stuff from the Apache OCW
         regriddedTRMM = ma.zeros((0, nygrd, nxgrd))
-        regriddedTRMM = process.do_regrid(precipRateMasked[0,:,:], LATTRMM, LONTRMM, LAT, LON, order=1, mdi= -999999999)
+        regriddedTRMM = do_regrid(precipRateMasked[0,:,:], LATTRMM, LONTRMM, LAT, LON, order=1, mdi= -999999999)
         #----------------------------------------------------------------------------------
         #
         #get the lat/lon info from cloudElement
@@ -666,7 +664,7 @@ def findPrecipRate(TRMMdirName, timelist):
         #---------regrid the TRMM data to the MERG dataset ----------------------------------
         #regrid using the do_regrid stuff from the Apache OCW
         regriddedTRMM = ma.zeros((0, nygrd, nxgrd))
-        regriddedTRMM = process.do_regrid(precipRateMasked[0,:,:], LATTRMM, LONTRMM, LAT, LON, order=1, mdi= -999999999)
+        regriddedTRMM = do_regrid(precipRateMasked[0,:,:], LATTRMM, LONTRMM, LAT, LON, order=1, mdi= -999999999)
         #----------------------------------------------------------------------------------
         #
         #get the lat/lon info from
@@ -2644,17 +2642,106 @@ def decodeTimeFromString(time_string):
     print 'Error decoding time string: string does not match a predefined time format'
     return 0
 
+# #******************************************************************
+# def do_regrid(q, lat, lon, lat2, lon2, order=1, mdi=-999999999):
+#     """
+#     This function has been moved to the ocw/dataset_processor module
+#     """
+#     from ocw import dataset_processor
+#     q2 = dataset_processor._rcmes_spatial_regrid(q, lat, lon, lat2, lon2, order=1)
+
+#     return q2
 #******************************************************************
-def do_regrid(q, lat, lon, lat2, lon2, order=1, mdi=-999999999):
-    """
-    This function has been moved to the ocw/dataset_processor module
-    """
-    from ocw import dataset_processor
-    q2 = dataset_processor._rcmes_spatial_regrid(q, lat, lon, lat2, lon2, order=1)
+def do_regrid(q, lat, lon, lat2, lon2, order=1, mdi= -999999999):
+    '''
+    Perform regridding from one set of lat,lon values onto a new set (lat2,lon2)
+
+    Input::
+        q - the variable to be regridded
+        lat,lon - original co-ordinates corresponding to q values
+        lat2,lon2 - new set of latitudes and longitudes that you want to regrid q onto
+        order - (optional) interpolation order 1=bi-linear, 3=cubic spline
+        mdi - (optional) fill value for missing data (used in creation of masked array)
+
+    Output::
+        q2 - q regridded onto the new set of lat2,lon2
+
+    '''
+
+    nlat = q.shape[0]
+    nlon = q.shape[1]
+
+    nlat2 = lat2.shape[0]
+    nlon2 = lon2.shape[1]
+
+    # To make our lives easier down the road, let's
+    # turn these into arrays of x & y coords
+    loni = lon2.ravel()
+    lati = lat2.ravel()
+
+    loni = loni.copy() # NB. it won't run unless you do this...
+    lati = lati.copy()
+
+    # Now, we'll set points outside the boundaries to lie along an edge
+    loni[loni > lon.max()] = lon.max()
+    loni[loni < lon.min()] = lon.min()
+
+    # To deal with the "hard" break, we'll have to treat y differently,
+    # so we're just setting the min here...
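    # NB. worked example of the clamp-and-index step here (illustrative values
    # only, not part of the committed code): suppose the source grid has
    # nlon = 36 longitudes running 0..350 in 10-degree steps. A target
    # longitude of 355 is first clamped to lon.max() = 350 and then mapped to
    # the fractional index
    #     (nlon - 1) * (350 - 0) / (350 - 0) = 35.0   (the last column),
    # while a target longitude of 180 maps to 35 * 180 / 350 = 18.0.
    # Fractional results are exactly what map_coordinates expects as
    # interpolation coordinates.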
+    lati[lati > lat.max()] = lat.max()
+    lati[lati < lat.min()] = lat.min()
+
+    # We need to convert these to (float) indices
+    # (xi should range from 0 to (nx - 1), etc)
+    loni = (nlon - 1) * (loni - lon.min()) / (lon.max() - lon.min())
+
+    # Deal with the "hard" break in the y-direction
+    lati = (nlat - 1) * (lati - lat.min()) / (lat.max() - lat.min())
+
+    # Notes on dealing with MDI when regridding data.
+    # Method adopted here:
+    # Use bilinear interpolation of data by default (but user can specify other order using order=... in call)
+    # Perform bilinear interpolation of data, and of mask.
+    # To be conservative, a new grid point which contained some missing data on the old grid is set to missing data.
+    # -this is achieved by looking for any non-zero interpolated mask values.
+    # To avoid issues with bilinear interpolation producing strong gradients leading into the MDI,
+    # set values at MDI points to the mean data value so little gradient is visible - not ideal, but acceptable for now.
+
+    # Set values at MDI points to something similar to their surroundings so they don't produce large gradients when interpolating.
+    # Preserve the MDI mask by only changing the data part of the masked array object.
+    for shift in (-1, 1):
+        for axis in (0, 1):
+            q_shifted = np.roll(q, shift=shift, axis=axis)
+            idx = ~q_shifted.mask * q.mask
+            q.data[idx] = q_shifted[idx]
+
+    # Now we actually interpolate.
+    # map_coordinates does cubic interpolation by default;
+    # use "order=1" to perform bilinear interpolation instead...
+    q2 = map_coordinates(q, [lati, loni], order=order)
+    q2 = q2.reshape([nlat2, nlon2])
+
+    # Set values to missing data outside of the original domain
+    q2 = ma.masked_array(q2, mask=np.logical_or(np.logical_or(lat2 >= lat.max(),
+                                                              lat2 <= lat.min()),
+                                                np.logical_or(lon2 <= lon.min(),
+                                                              lon2 >= lon.max())))
+
+    # Make a second map using nearest-neighbour interpolation - use this to determine locations with MDI and mask these
+    qmdi = np.zeros_like(q)
+    qmdi[q.mask == True] = 1.
+    qmdi[q.mask == False] = 0.
+    qmdi_r = map_coordinates(qmdi, [lati, loni], order=order)
+    qmdi_r = qmdi_r.reshape([nlat2, nlon2])
+    mdimask = (qmdi_r != 0.0)
+
+    # Combine the missing data mask with the outside-domain mask defined above.
+    q2.mask = np.logical_or(mdimask, q2.mask)
 
     return q2
-
-#******************************************************************
+
+#******************************************************************
 #
 # METRICS FUNCTIONS FOR MERG.PY
 # TODO: rewrite these metrics so that they use the data from the
diff --git a/code/process.py b/code/process.py
deleted file mode 100644
index 5a362ec..0000000
--- a/code/process.py
+++ /dev/null
@@ -1,1284 +0,0 @@
-"""Collection of functions that process numpy arrays of varying shapes.
-Functions can range from subsetting to regridding"""
-
-# Python Standard Library Imports
-import math
-import datetime
-import re
-import string
-import sys
-import time
-
-# 3rd Party Imports
-import Nio
-import numpy as np
-import numpy.ma as ma
-from scipy.ndimage import map_coordinates
-
-
-def extract_subregion_from_data_array(data, lats, lons, latmin, latmax, lonmin, lonmax):
-    '''
-    Extract a sub-region from a data array.
-
-    Example::
-        **e.g.** the user may load a global model file, but only want to examine data over North America
-        This function extracts a sub-domain from the original data.
-        The defined sub-region must be a regular lat/lon bounding box,
-        but the model data may be on a non-regular grid (e.g. rotated, or Gaussian grid layout).
-        Data are kept on the original model grid and a rectangular (in model-space) region
-        is extracted which contains the rectangular (in lat/lon space) user supplied region.
-
-    INPUT::
-        data - 3d masked data array
-        lats - 2d array of latitudes corresponding to data array
-        lons - 2d array of longitudes corresponding to data array
-        latmin, latmax, lonmin, lonmax - bounding box of required region to extract
-
-    OUTPUT::
-        data2 - subset of original data array containing only data from required subregion
-        lats2 - subset of original latitude data array
-        lons2 - subset of original longitude data array
-
-    '''
-
-    # Mask model data array to find grid points inside the supplied bounding box
-    whlat = (lats > latmin) & (lats < latmax)
-    whlon = (lons > lonmin) & (lons < lonmax)
-    wh = whlat & whlon
-
-    # Find required matrix indices describing the limits of the regular lat/lon bounding box
-    jmax = np.where(wh == True)[0].max()
-    jmin = np.where(wh == True)[0].min()
-    imax = np.where(wh == True)[1].max()
-    imin = np.where(wh == True)[1].min()
-
-    # Cut out the sub-region from the data array
-    data2 = ma.zeros((data.shape[0], jmax - jmin, imax - imin))
-    data2 = data[:, jmin:jmax, imin:imax]
-
-    # Cut out sub-region from lats,lons arrays
-    lats2 = lats[jmin:jmax, imin:imax]
-    lons2 = lons[jmin:jmax, imin:imax]
-
-    return data2, lats2, lons2
-
-def calc_area_mean(data, lats, lons, mymask='not set'):
-    '''
-    Calculate Area Average of data in a masked array
-
-    INPUT::
-        data: a masked array of data (NB. only data from one time expected to be passed at once)
-        lats: 2d array of regularly gridded latitudes
-        lons: 2d array of regularly gridded longitudes
-        mymask: (optional) defines spatial region to do averaging over
-
-    OUTPUT::
-        area_mean: a value for the mean inside the area
-
-    '''
-
-    # If no mask is passed in, then set the mask to cover the whole data domain
-    if(mymask == 'not set'):
-        mymask = np.empty(data.shape)
-        mymask[:] = False # NB. mask means (don't show), so False everywhere means use everything.
-
-    # Dimension check on lats, lons
-    # Sometimes these arrays are 3d, sometimes 2d, sometimes 1d
-    # This bit of code just converts to the required 2d array shape
-    if(len(lats.shape) == 3):
-        lats = lats[0, :, :]
-
-    if(len(lons.shape) == 3):
-        lons = lons[0, :, :]
-
-    if(np.logical_and(len(lats.shape) == 1, len(lons.shape) == 1)):
-        lons, lats = np.meshgrid(lons, lats)
-
-    # Calculate grid length (assuming regular lat/lon grid)
-    dlat = lats[1, 0] - lats[0, 0]
-    dlon = lons[0, 1] - lons[0, 0]
-
-    # Calculate weights for each grid box
-    myweights = calc_area_in_grid_box(lats, dlon, dlat)
-
-    # Create a new masked array covering just the user selected area (defined in mymask)
-    # NB. this preserves missing data points in the observations data
-    subdata = ma.masked_array(data, mask=mymask)
-
-    if(myweights.shape != subdata.shape):
-        myweights.resize(subdata.shape)
-        myweights[1:, :] = myweights[0, :]
-
-    # Calculate weighted mean using ma.average (which takes weights)
-    area_mean = ma.average(subdata, weights=myweights)
-
-    return area_mean
-
-def calc_area_in_grid_box(latitude, dlat, dlon):
-    '''
-    Calculate area of regular lat-lon grid box
-
-    INPUT::
-        latitude: latitude of grid box (degrees)
-        dlat: grid length in latitude direction (degrees)
-        dlon: grid length in longitude direction (degrees)
-
-    OUTPUT::
-        A: area of the grid box
-
-    '''
-
-    R = 6371000  # radius of Earth in metres
-    C = 2 * math.pi * R
-
-    latitude = np.radians(latitude)
-
-    A = (dlon * (C / 360.)) * (dlat * (C / 360.) * np.cos(latitude))
-
-    return A
-
-def do_regrid(q, lat, lon, lat2, lon2, order=1, mdi= -999999999):
-    '''
-    Perform regridding from one set of lat,lon values onto a new set (lat2,lon2)
-
-    Input::
-        q - the variable to be regridded
-        lat,lon - original co-ordinates corresponding to q values
-        lat2,lon2 - new set of latitudes and longitudes that you want to regrid q onto
-        order - (optional) interpolation order 1=bi-linear, 3=cubic spline
-        mdi - (optional) fill value for missing data (used in creation of masked array)
-
-    Output::
-        q2 - q regridded onto the new set of lat2,lon2
-
-    '''
-
-    nlat = q.shape[0]
-    nlon = q.shape[1]
-
-    nlat2 = lat2.shape[0]
-    nlon2 = lon2.shape[1]
-
-    # To make our lives easier down the road, let's
-    # turn these into arrays of x & y coords
-    loni = lon2.ravel()
-    lati = lat2.ravel()
-
-    loni = loni.copy() # NB. it won't run unless you do this...
-    lati = lati.copy()
-
-    # Now, we'll set points outside the boundaries to lie along an edge
-    loni[loni > lon.max()] = lon.max()
-    loni[loni < lon.min()] = lon.min()
-
-    # To deal with the "hard" break, we'll have to treat y differently,
-    # so we're just setting the min here...
-    lati[lati > lat.max()] = lat.max()
-    lati[lati < lat.min()] = lat.min()
-
-    # We need to convert these to (float) indices
-    # (xi should range from 0 to (nx - 1), etc)
-    loni = (nlon - 1) * (loni - lon.min()) / (lon.max() - lon.min())
-
-    # Deal with the "hard" break in the y-direction
-    lati = (nlat - 1) * (lati - lat.min()) / (lat.max() - lat.min())
-
-    # Notes on dealing with MDI when regridding data.
-    # Method adopted here:
-    # Use bilinear interpolation of data by default (but user can specify other order using order=... in call)
-    # Perform bilinear interpolation of data, and of mask.
-    # To be conservative, a new grid point which contained some missing data on the old grid is set to missing data.
-    # -this is achieved by looking for any non-zero interpolated mask values.
-    # To avoid issues with bilinear interpolation producing strong gradients leading into the MDI,
-    # set values at MDI points to the mean data value so little gradient is visible - not ideal, but acceptable for now.
-
-    # Set values at MDI points to something similar to their surroundings so they don't produce large gradients when interpolating.
-    # Preserve the MDI mask by only changing the data part of the masked array object.
-    for shift in (-1, 1):
-        for axis in (0, 1):
-            q_shifted = np.roll(q, shift=shift, axis=axis)
-            idx = ~q_shifted.mask * q.mask
-            q.data[idx] = q_shifted[idx]
-
-    # Now we actually interpolate.
-    # map_coordinates does cubic interpolation by default;
-    # use "order=1" to perform bilinear interpolation instead...
-    q2 = map_coordinates(q, [lati, loni], order=order)
-    q2 = q2.reshape([nlat2, nlon2])
-
-    # Set values to missing data outside of the original domain
-    q2 = ma.masked_array(q2, mask=np.logical_or(np.logical_or(lat2 >= lat.max(),
-                                                              lat2 <= lat.min()),
-                                                np.logical_or(lon2 <= lon.min(),
-                                                              lon2 >= lon.max())))
-
-    # Make a second map using nearest-neighbour interpolation - use this to determine locations with MDI and mask these
-    qmdi = np.zeros_like(q)
-    qmdi[q.mask == True] = 1.
-    qmdi[q.mask == False] = 0.
-    qmdi_r = map_coordinates(qmdi, [lati, loni], order=order)
-    qmdi_r = qmdi_r.reshape([nlat2, nlon2])
-    mdimask = (qmdi_r != 0.0)
-
-    # Combine the missing data mask with the outside-domain mask defined above.
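    # NB. tiny worked example of the mask combination happening here
    # (illustrative values only, not part of the committed code):
    #     mdimask = [[False, True ],     # nearest-neighbour MDI test
    #                [False, False]]
    #     q2.mask = [[False, False],     # outside-original-domain test
    #                [True,  False]]
    #     logical_or -> [[False, True ],
    #                    [True,  False]]
    # i.e. a regridded point stays masked if either test flags it.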
-    q2.mask = np.logical_or(mdimask, q2.mask)
-
-    return q2
-
-def create_mask_using_threshold(masked_array, threshold=0.5):
-    '''
-    Routine to create a mask, depending on the proportion of times with missing data.
-    For each pixel, calculate the proportion of times that are missing data,
-    **if** the proportion is above a specified threshold value,
-    then mark the pixel as missing data.
-
-    Input::
-        masked_array - a numpy masked array of data (assumes time on axis 0, and space on axes 1 and 2).
-        threshold - (optional) threshold proportion above which a pixel is marked as 'missing data'.
-        NB. default threshold = 50%
-
-    Output::
-        mymask - a numpy array describing the mask. NB. not a masked array, just the mask itself.
-
-    '''
-
-    # try, except used as some model files don't have a full mask, but a single bool
-    # the except catches this situation and deals with it appropriately.
-    try:
-        nT = masked_array.mask.shape[0]
-
-        # For each pixel, count how many times are masked.
-        nMasked = masked_array.mask[:, :, :].sum(axis=0)
-
-        # Define new mask as when a pixel has over a defined threshold ratio of masked data
-        # e.g. if the threshold is 75%, and there are 10 times,
-        # then a pixel will be masked if more than 5 times are masked.
-        mymask = nMasked > (nT * threshold)
-
-    except:
-        mymask = np.zeros_like(masked_array.data[0, :, :])
-
-    return mymask
-
-def calc_average_on_new_time_unit(data, dateList, unit='monthly'):
-    '''
-    Routine to calculate averages on longer time units than the data exists on.
-
-    Example::
-        If the data is 6-hourly, calculate daily, or monthly means.
-
-    Input::
-        data - data values
-        dateList - list of python datetime structures corresponding to data values
-        unit - string describing time unit to average onto.
-        *Valid values are: 'monthly', 'daily', 'pentad','annual','decadal'*
-
-    Output:
-        meanstorem - numpy masked array of data values averaged over the required time period
-        newTimesList - a list of python datetime objects representing the data in the new averaging period
-        *NB.* currently set to beginning of averaging period,
-        **i.e. mean Jan 1st - Jan 31st -> represented as Jan 1st, 00Z.**
-    '''
-
-    # First catch unknown values of time unit passed in by user
-    acceptable = (unit == 'full') | (unit == 'annual') | (unit == 'monthly') | (unit == 'daily') | (unit == 'pentad')
-
-    if not acceptable:
-        print 'Error: unknown unit type selected for time averaging'
-        print '       Please check your code.'
-        return
-
-    # Calculate arrays of years (2007,2007),
-    # yearsmonths (200701,200702),
-    # or yearmonthdays (20070101,20070102)
-    # -depending on user selected averaging period.
-
-    # Year list
-    if unit == 'annual':
-        print 'Calculating annual mean data'
-        timeunits = []
-
-        for i in np.arange(len(dateList)):
-            timeunits.append(str(dateList[i].year))
-
-        timeunits = np.array(timeunits, dtype=int)
-
-    # YearMonth format list
-    if unit == 'monthly':
-        print 'Calculating monthly mean data'
-        timeunits = []
-
-        for i in np.arange(len(dateList)):
-            timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month))
-
-        timeunits = np.array(timeunits, dtype=int)
-
-    # YearMonthDay format list
-    if unit == 'daily':
-        print 'Calculating daily mean data'
-        timeunits = []
-
-        for i in np.arange(len(dateList)):
-            timeunits.append(str(dateList[i].year) + str("%02d" % dateList[i].month) + \
-                             str("%02d" % dateList[i].day))
-
-        timeunits = np.array(timeunits, dtype=int)
-
-
-    # TODO: add pentad setting using Julian days?
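    # NB. one possible shape for the pentad case flagged in the TODO above --
    # an illustrative sketch only, not wired into the rest of this routine.
    # It labels each date by year plus the 5-day block of its Julian day,
    # giving 73 pentads per year (with a short 74th block in leap years):
    #
    #     if unit == 'pentad':
    #         print 'Calculating pentad mean data'
    #         timeunits = []
    #         for i in np.arange(len(dateList)):
    #             julianDay = dateList[i].timetuple().tm_yday   # 1..366
    #             pentad = (julianDay - 1) // 5                 # 0..73
    #             timeunits.append(str(dateList[i].year) + str("%02d" % pentad))
    #         timeunits = np.array(timeunits, dtype=int)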
- - - # Full list: a special case - if unit == 'full': - print 'Calculating means data over full time range' - timeunits = [] - - for i in np.arange(len(dateList)): - timeunits.append(999) # i.e. we just want the same value for all times. - - timeunits = np.array(timeunits, dtype=int) - - - - # empty list to store new times - newTimesList = [] - - # Decide whether or not you need to do any time averaging. - # i.e. if data are already on required time unit then just pass data through and - # calculate and return representative datetimes. - processing_required = True - if len(timeunits) == (len(np.unique(timeunits))): - processing_required = False - - # 1D data arrays, i.e. time series - if data.ndim == 1: - # Create array to store the resulting data - meanstore = np.zeros(len(np.unique(timeunits))) - - # Calculate the means across each unique time unit - i = 0 - for myunit in np.unique(timeunits): - if processing_required: - datam = ma.masked_array(data, timeunits != myunit) - meanstore[i] = ma.average(datam) - - # construct new times list - smyunit = str(myunit) - if len(smyunit) == 4: # YYYY - yyyy = int(myunit[0:4]) - mm = 1 - dd = 1 - if len(smyunit) == 6: # YYYYMM - yyyy = int(smyunit[0:4]) - mm = int(smyunit[4:6]) - dd = 1 - if len(smyunit) == 8: # YYYYMMDD - yyyy = int(smyunit[0:4]) - mm = int(smyunit[4:6]) - dd = int(smyunit[6:8]) - if len(smyunit) == 3: # Full time range - # Need to set an appropriate time representing the mid-point of the entire time span - dt = dateList[-1] - dateList[0] - halfway = dateList[0] + (dt / 2) - yyyy = int(halfway.year) - mm = int(halfway.month) - dd = int(halfway.day) - - newTimesList.append(datetime.datetime(yyyy, mm, dd, 0, 0, 0, 0)) - i = i + 1 - - # 3D data arrays - if data.ndim == 3: - - #datamask = create_mask_using_threshold(data,threshold=0.75) - - # Create array to store the resulting data - meanstore = np.zeros([len(np.unique(timeunits)), data.shape[1], data.shape[2]]) - - # Calculate the means across each unique time unit - i = 0 - datamask_store = [] - for myunit in np.unique(timeunits): - if processing_required: - - mask = np.zeros_like(data) - mask[timeunits != myunit, :, :] = 1.0 - - # Calculate missing data mask within each time unit... - datamask_at_this_timeunit = np.zeros_like(data) - datamask_at_this_timeunit[:] = create_mask_using_threshold(data[timeunits == myunit, :, :], threshold=0.75) - # Store results for masking later - datamask_store.append(datamask_at_this_timeunit[0]) - - # Calculate means for each pixel in this time unit, ignoring missing data (using masked array). 
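    # NB. small worked example of the masked average computed below
    # (illustrative values only, not part of the committed code): for one
    # pixel whose values across this time unit are [1.0, 2.0, --, 4.0],
    # with -- masked as missing,
    #     ma.average(...) = (1.0 + 2.0 + 4.0) / 3 = 2.33...
    # whereas a plain mean that treated the fill value as data would be badly
    # corrupted. The combined mask built here is what keeps those points out.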
- datam = ma.masked_array(data, np.logical_or(mask, datamask_at_this_timeunit)) - meanstore[i, :, :] = ma.average(datam, axis=0) - - # construct new times list - smyunit = str(myunit) - if len(smyunit) == 4: # YYYY - yyyy = int(smyunit[0:4]) - mm = 1 - dd = 1 - if len(smyunit) == 6: # YYYYMM - yyyy = int(smyunit[0:4]) - mm = int(smyunit[4:6]) - dd = 1 - if len(smyunit) == 8: # YYYYMMDD - yyyy = int(smyunit[0:4]) - mm = int(smyunit[4:6]) - dd = int(smyunit[6:8]) - if len(smyunit) == 3: # Full time range - # Need to set an appropriate time representing the mid-point of the entire time span - dt = dateList[-1] - dateList[0] - halfway = dateList[0] + (dt / 2) - yyyy = int(halfway.year) - mm = int(halfway.month) - dd = int(halfway.day) - - newTimesList.append(datetime.datetime(yyyy, mm, dd, 0, 0, 0, 0)) - - i += 1 - - if not processing_required: - meanstorem = data - - if processing_required: - # Create masked array (using missing data mask defined above) - datamask_store = np.array(datamask_store) - meanstorem = ma.masked_array(meanstore, datamask_store) - - return meanstorem, newTimesList - -def calc_running_accum_from_period_accum(data): - ''' - Routine to calculate running total accumulations from individual period accumulations. - :: - - e.g. 0,0,1,0,0,2,2,1,0,0 - -> 0,0,1,1,1,3,5,6,6,6 - - Input:: - data: numpy array with time in the first axis - - Output:: - running_acc: running accumulations - - ''' - - - running_acc = np.zeros_like(data) - - if(len(data.shape) == 1): - running_acc[0] = data[0] - - if(len(data.shape) > 1): - running_acc[0, :] = data[0, :] - - for i in np.arange(data.shape[0] - 1): - if(len(data.shape) == 1): - running_acc[i + 1] = running_acc[i] + data[i + 1] - - if(len(data.shape) > 1): - running_acc[i + 1, :] = running_acc[i, :] + data[i + 1, :] - - return running_acc - -def ignore_boundaries(data, rim=10): - ''' - Routine to mask the lateral boundary regions of model data to ignore them from calculations. 
- - Input:: - data - a masked array of model data - rim - (optional) number of grid points to ignore - - Output:: - data - data array with boundary region masked - - ''' - - nx = data.shape[1] - ny = data.shape[0] - - rimmask = np.zeros_like(data) - for j in np.arange(rim): - rimmask[j, 0:nx - 1] = 1.0 - - for j in ny - 1 - np.arange(rim): - rimmask[j, 0:nx - 1] = 1.0 - - for i in np.arange(rim): - rimmask[0:ny - 1, i] = 1.0 - - for i in nx - 1 - np.arange(rim): - rimmask[0:ny - 1, i] = 1.0 - - data = ma.masked_array(data, mask=rimmask) - - return data - -def normalizeDatetimes(datetimes, timestep): - """ - Input:: - datetimes - list of datetime objects that need to be normalized - timestep - string of value ('daily' | 'monthly') - Output:: - normalDatetimes - list of datetime objects that have been normalized - - Normalization Rules:: - Daily data will be forced to an hour value of 00:00:00 - Monthly data will be forced to the first of the month at midnight - """ - normalDatetimes = [] - if timestep.lower() == 'monthly': - for inputDatetime in datetimes: - if inputDatetime.day != 1: - # Clean the inputDatetime - inputDatetimeString = inputDatetime.strftime('%Y%m%d') - normalInputDatetimeString = inputDatetimeString[:6] + '01' - inputDatetime = datetime.datetime.strptime(normalInputDatetimeString, '%Y%m%d') - - normalDatetimes.append(inputDatetime) - - elif timestep.lower() == 'daily': - for inputDatetime in datetimes: - if inputDatetime.hour != 0 or inputDatetime.minute != 0 or inputDatetime.second != 0: - datetimeString = inputDatetime.strftime('%Y%m%d%H%M%S') - normalDatetimeString = datetimeString[:8] + '000000' - inputDatetime = datetime.datetime.strptime(normalDatetimeString, '%Y%m%d%H%M%S') - - normalDatetimes.append(inputDatetime) - - return normalDatetimes - -def getModelTimes(modelFile, timeVarName): - ''' - TODO: Do a better job handling dates here - Routine to convert from model times ('hours since 1900...', 'days since ...') - into a python datetime structure - - Input:: - modelFile - path to the model tile you want to extract the times list and modelTimeStep from - timeVarName - name of the time variable in the model file - - Output:: - times - list of python datetime objects describing model data times - modelTimeStep - 'hourly','daily','monthly','annual' - ''' - - f = Nio.open_file(modelFile) - xtimes = f.variables[timeVarName] - timeFormat = xtimes.attributes['units'] - - # search to check if 'since' appears in units - try: - sinceLoc = re.search('since', timeFormat).end() - - - #KDW the below block generates and error. 
But the print statement, indicates that sinceLoc found something - except AttributeError: - print 'Error decoding model times: time variable attributes do not contain "since"' - raise - - units = '' - TIME_UNITS = ('minutes', 'hours', 'days', 'months', 'years') - # search for 'seconds','minutes','hours', 'days', 'months', 'years' so know units - for unit in TIME_UNITS: - if re.search(unit, timeFormat): - units = unit - break - - # cut out base time (the bit following 'since') - base_time_string = string.lstrip(timeFormat[sinceLoc:]) - - # decode base time - base_time = decodeTimeFromString(base_time_string) - - times = [] - - - - for xtime in xtimes[:]: - - # Cast time as an int ***KDW remove this so fractional xtime can be read from MERG - xtime = int(xtime) - - if units == 'minutes': - dt = datetime.timedelta(minutes=xtime) - new_time = base_time + dt - elif units == 'hours': - dt = datetime.timedelta(hours=xtime) - new_time = base_time + dt - elif units == 'days': - dt = datetime.timedelta(days=xtime) - new_time = base_time + dt - elif units == 'months': - # NB. adding months in python is complicated as month length varies and hence ambiguous. - # Perform date arithmatic manually - # Assumption: the base_date will usually be the first of the month - # NB. this method will fail if the base time is on the 29th or higher day of month - # -as can't have, e.g. Feb 31st. - new_month = int(base_time.month + xtime % 12) - new_year = int(math.floor(base_time.year + xtime / 12.)) - new_time = datetime.datetime(new_year, new_month, base_time.day, base_time.hour, base_time.second, 0) - elif units == 'years': - dt = datetime.timedelta(years=xtime) - new_time = base_time + dt - - #print "xtime is:", xtime, "dt is: ", dt - - - - times.append(new_time) - - try: - timeStepLength = int(xtimes[1] - xtimes[0] + 1.e-12) - modelTimeStep = getModelTimeStep(units, timeStepLength) - - #KDW if timeStepLength is zero do not normalize times as this would create an empty list - if timeStepLength != 0: - times = normalizeDatetimes(times, modelTimeStep) - except: - raise - - return times, modelTimeStep - -def getModelTimeStep(units, stepSize): - # Time units are now determined. Determine the time intervals of input data (mdlTimeStep) - - if units == 'minutes': - if stepSize == 60: - modelTimeStep = 'hourly' - elif stepSize == 1440: - modelTimeStep = 'daily' - # 28 days through 31 days - elif 40320 <= stepSize <= 44640: - modelTimeStep = 'monthly' - # 365 days through 366 days - elif 525600 <= stepSize <= 527040: - modelTimeStep = 'annual' - else: - raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize) - - elif units == 'hours': - #need a check for fractional hrs and only one hr i.e. 
stepSize=0 - if stepSize == 0 or stepSize == 1: - modelTimeStep = 'hourly' - elif stepSize == 24: - modelTimeStep = 'daily' - elif 672 <= stepSize <= 744: - modelTimeStep = 'monthly' - elif 8760 <= stepSize <= 8784: - modelTimeStep = 'annual' - else: - raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize) - - elif units == 'days': - if stepSize == 1: - modelTimeStep = 'daily' - elif 28 <= stepSize <= 31: - modelTimeStep = 'monthly' - elif 365 <= stepSize <= 366: - modelTimeStep = 'annual' - else: - raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize) - - elif units == 'months': - if stepSize == 1: - modelTimeStep = 'monthly' - elif stepSize == 12: - modelTimeStep = 'annual' - else: - raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize) - - elif units == 'years': - if stepSize == 1: - modelTimeStep = 'annual' - else: - raise Exception('model data time step interval exceeds the max time interval (annual)', units, stepSize) - - else: - errorMessage = 'the time unit ', units, ' is not currently handled in this version.' - raise Exception(errorMessage) - - return modelTimeStep - -def decodeTimeFromString(time_string): - ''' - Decodes string into a python datetime object - *Method:* tries a bunch of different time format possibilities and hopefully one of them will hit. - :: - - **Input:** time_string - a string that represents a date/time - - **Output:** mytime - a python datetime object - ''' - - # This will deal with times that use decimal seconds - if '.' in time_string: - time_string = time_string.split('.')[0] + '0' - else: - pass - - try: - mytime = time.strptime(time_string, '%Y-%m-%d %H:%M:%S') - mytime = datetime.datetime(*mytime[0:6]) - return mytime - - except ValueError: - pass - - try: - mytime = time.strptime(time_string, '%Y/%m/%d %H:%M:%S') - mytime = datetime.datetime(*mytime[0:6]) - return mytime - - except ValueError: - pass - - try: - mytime = time.strptime(time_string, '%Y%m%d %H:%M:%S') - mytime = datetime.datetime(*mytime[0:6]) - return mytime - - except ValueError: - pass - - try: - mytime = time.strptime(time_string, '%Y:%m:%d %H:%M:%S') - mytime = datetime.datetime(*mytime[0:6]) - return mytime - - except ValueError: - pass - - try: - mytime = time.strptime(time_string, '%Y%m%d%H%M%S') - mytime = datetime.datetime(*mytime[0:6]) - return mytime - - except ValueError: - pass - - try: - mytime = time.strptime(time_string, '%Y-%m-%d %H:%M') - mytime = datetime.datetime(*mytime[0:6]) - return mytime - - except ValueError: - pass -#KDW added - try: - mytime = time.strptime(time_string, '%Y-%m-%d') - mytime = datetime.datetime(*mytime[0:6]) - return mytime - - except ValueError: - pass - - try: - mytime = time.strptime(time_string, '%Y-%m-%d %H') - mytime = datetime.datetime(*mytime[0:6]) - return mytime - - except ValueError: - pass - - - print 'Error decoding time string: string does not match a predefined time format' - return 0 - -def regrid_wrapper(regrid_choice, obsdata, obslats, obslons, wrfdata, wrflats, wrflons): - ''' - Wrapper routine for regridding. 
- - Either regrids model to obs grid, or obs to model grid, depending on user choice - - Inputs:: - regrid_choice - [0] = Regrid obs data onto model grid or - [1] = Regrid model data onto obs grid - - obsdata,wrfdata - data arrays - obslats,obslons - observation lat,lon arrays - wrflats,wrflons - model lat,lon arrays - - Output:: - rdata,rdata2 - regridded data - lats,lons - latitudes and longitudes of regridded data - - ''' - - # Regrid obs data onto model grid - if(regrid_choice == '0'): - - ndims = len(obsdata.shape) - if(ndims == 3): - newshape = [obsdata.shape[0], wrfdata.shape[1], wrfdata.shape[2]] - nT = obsdata.shape[0] - - if(ndims == 2): - newshape = [wrfdata.shape[0], wrfdata.shape[1]] - nT = 0 - - rdata = wrfdata - lats, lons = wrflats, wrflons - - # Create a new masked array with the required dimensions - tmp = np.zeros(newshape) - rdata2 = ma.MaskedArray(tmp, mask=(tmp == 0)) - tmp = 0 # free up some memory - - rdata2[:] = 0.0 - - if(nT > 0): - for t in np.arange(nT): - rdata2[t, :, :] = do_regrid(obsdata[t, :, :], obslats[:, :], obslons[:, :], wrflats[:, :], wrflons[:, :]) - - if(nT == 0): - rdata2[:, :] = do_regrid(obsdata[:, :], obslats[:, :], obslons[:, :], wrflats[:, :], wrflons[:, :]) - - - # Regrid model data onto obs grid - if(regrid_choice == '1'): - ndims = len(wrfdata.shape) - if(ndims == 3): - newshape = [wrfdata.shape[0], obsdata.shape[1], obsdata.shape[2]] - nT = obsdata.shape[0] - - if(ndims == 2): - newshape = [obsdata.shape[0], obsdata.shape[1]] - nT = 0 - - rdata2 = obsdata - lats, lons = obslats, obslons - - tmp = np.zeros(newshape) - rdata = ma.MaskedArray(tmp, mask=(tmp == 0)) - tmp = 0 # free up some memory - - rdata[:] = 0.0 - - if(nT > 0): - for t in np.arange(nT): - rdata[t, :, :] = do_regrid(wrfdata[t, :, :], wrflats[:, :], wrflons[:, :], obslats[:, :], obslons[:, :]) - - if(nT == 0): - rdata[:, :] = do_regrid(wrfdata[:, :], wrflats[:, :], wrflons[:, :], obslats[:, :], obslons[:, :]) - - - return rdata, rdata2, lats, lons - -def extract_sub_time_selection(allTimes, subTimes, data): - ''' - Routine to extract a sub-selection of times from a data array. - - Example:: - Suppose a data array has 30 time values for daily data for a whole month, - but you only want the data from the 5th-15th of the month. - - Input:: - allTimes - list of datetimes describing what times the data in the data array correspond to - subTimes - the times you want to extract data from - data - the data array - - Output:: - subdata - subselection of data array - - ''' - # Create new array to store the subselection - subdata = np.zeros([len(subTimes), data.shape[1], data.shape[2]]) - - # Loop over all Times and when it is a member of the required subselection, copy the data across - i = 0 # counter for allTimes - subi = 0 # counter for subTimes - for t in allTimes: - if(np.setmember1d(allTimes, subTimes)): - subdata[subi, :, :] = data[i, :, :] - subi += 1 - i += 1 - - return subdata - -def calc_average_on_new_time_unit_K(data, dateList, unit): - ''' - Routine to calculate averages on longer time units than the data exists on. - e.g. if the data is 6-hourly, calculate daily, or monthly means. - - Input: - data - data values - dateList - list of python datetime structures corresponding to data values - unit - string describing time unit to average onto - e.g. 
-def calc_average_on_new_time_unit_K(data, dateList, unit):
-    '''
-    Routine to calculate averages on longer time units than the data exists on,
-    e.g. if the data is 6-hourly, calculate daily or monthly means.
-
-    Input:
-        data - data values
-        dateList - list of python datetime structures corresponding to data values
-        unit - string describing time unit to average onto,
-               e.g. 'monthly', 'daily', 'pentad', 'annual', 'decadal'
-    Output:
-        meanstorem - numpy masked array of data values meaned over the required time period
-        newTimesList - a list of python datetime objects representing the data in the new averaging period
-        NB. currently set to the beginning of the averaging period,
-        i.e. mean Jan 1st - Jan 31st -> represented as Jan 1st, 00Z.
-    '''
-
-    # Check if the user-selected temporal grid is valid. If not, EXIT
-    acceptable = (unit=='full')|(unit=='annual')|(unit=='monthly')|(unit=='daily')|(unit=='pentad')
-    if not acceptable:
-        print 'Error: unknown unit type selected for time averaging: EXIT'
-        return -1, -1
-
-    # Calculate arrays of: annual timeseries: year (2007,2007),
-    #                      monthly time series: year-month (200701,200702),
-    #                      daily timeseries: year-month-day (20070101,20070102)
-    # depending on the user-selected averaging period.
-
-    # Year list
-    if unit=='annual':
-        timeunits = [str(d.year) for d in dateList]
-        timeunits = np.array(timeunits, dtype=int)
-
-    # YearMonth format list
-    if unit=='monthly':
-        timeunits = [str(d.year) + "%02d" % d.month for d in dateList]
-        timeunits = np.array(timeunits, dtype=int)
-
-    # YearMonthDay format list
-    if unit=='daily':
-        timeunits = [str(d.year) + "%02d" % d.month + "%02d" % d.day for d in dateList]
-        timeunits = np.array(timeunits, dtype=int)
-
-    # TODO: add pentad setting using Julian days?
-
-    # Full list: a special case
-    if unit == 'full':
-        # Calculating means over the entire time range, i.e. an annual-mean climatology,
-        # so we just want the same key (999) for all times.
-        timeunits = [999 for d in dateList]
-        timeunits = np.array(timeunits, dtype=int)
-
-    # empty list to store new times
-    newTimesList = []
-
-    # Decide whether or not you need to do any time averaging:
-    # if the data are already on the required time unit then just pass the data
-    # through and calculate and return representative datetimes.
-    processing_required = True
-    if len(timeunits)==(len(np.unique(timeunits))):
-        processing_required = False
-
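To make the grouping step concrete, here is a small sketch (dates invented) of the integer keys this produces for unit='monthly', and why np.unique gives one output slot per averaging period:

    import datetime
    import numpy as np

    dateList = [datetime.datetime(2007, 1, 15), datetime.datetime(2007, 1, 16),
                datetime.datetime(2007, 2, 1)]
    timeunits = np.array([str(d.year) + "%02d" % d.month for d in dateList], dtype=int)
    print timeunits              # -> [200701 200701 200702]
    print np.unique(timeunits)   # -> [200701 200702], one mean per month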
-    # 1D data arrays, i.e. time series
-    if data.ndim==1:
-        # Create array to store the resulting data
-        meanstore = np.zeros(len(np.unique(timeunits)))
-
-        # Calculate the means across each unique time unit
-        i=0
-        for myunit in np.unique(timeunits):
-            if processing_required:
-                datam = ma.masked_array(data, timeunits!=myunit)
-                meanstore[i] = ma.average(datam)
-
-            # construct new times list
-            smyunit = str(myunit)
-            if len(smyunit)==4:  # YYYY
-                yyyy = int(smyunit[0:4])
-                mm = 1
-                dd = 1
-            elif len(smyunit)==6:  # YYYYMM
-                yyyy = int(smyunit[0:4])
-                mm = int(smyunit[4:6])
-                dd = 1
-            elif len(smyunit)==8:  # YYYYMMDD
-                yyyy = int(smyunit[0:4])
-                mm = int(smyunit[4:6])
-                dd = int(smyunit[6:8])
-            elif len(smyunit)==3:  # Full time range
-                # Need to set an appropriate time representing the mid-point of the entire time span
-                dt = dateList[-1]-dateList[0]
-                halfway = dateList[0]+(dt/2)
-                yyyy = int(halfway.year)
-                mm = int(halfway.month)
-                dd = int(halfway.day)
-
-            newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
-            i += 1
-
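The 1D branch leans on numpy masked arrays: everything outside the current time unit is masked away before averaging. A minimal sketch with invented values:

    import numpy as np
    import numpy.ma as ma

    data = np.array([1.0, 2.0, 3.0, 4.0])
    timeunits = np.array([200701, 200701, 200702, 200702])

    # mask out everything that is NOT January 2007, then average what remains
    datam = ma.masked_array(data, timeunits != 200701)
    print ma.average(datam)   # -> 1.5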
-    # 3D data arrays
-    if data.ndim==3:
-        # datamask = create_mask_using_threshold(data,threshold=0.75)
-        # Create array to store the resulting data
-        meanstore = np.zeros([len(np.unique(timeunits)),data.shape[1],data.shape[2]])
-
-        # Calculate the means across each unique time unit
-        i=0
-        datamask_store = []
-        for myunit in np.unique(timeunits):
-            if processing_required:
-                mask = np.zeros_like(data)
-                mask[timeunits!=myunit,:,:] = 1.0
-                # Calculate missing data mask within each time unit...
-                datamask_at_this_timeunit = np.zeros_like(data)
-                datamask_at_this_timeunit[:] = create_mask_using_threshold(data[timeunits==myunit,:,:], threshold=0.75)
-                # Store results for masking later
-                datamask_store.append(datamask_at_this_timeunit[0])
-                # Calculate means for each pixel in this time unit, ignoring missing data (using masked arrays)
-                datam = ma.masked_array(data, np.logical_or(mask, datamask_at_this_timeunit))
-                meanstore[i,:,:] = ma.average(datam, axis=0)
-
-            # construct new times list
-            smyunit = str(myunit)
-            if len(smyunit)==4:  # YYYY
-                yyyy = int(smyunit[0:4])
-                mm = 1
-                dd = 1
-            elif len(smyunit)==6:  # YYYYMM
-                yyyy = int(smyunit[0:4])
-                mm = int(smyunit[4:6])
-                dd = 1
-            elif len(smyunit)==8:  # YYYYMMDD
-                yyyy = int(smyunit[0:4])
-                mm = int(smyunit[4:6])
-                dd = int(smyunit[6:8])
-            elif len(smyunit)==3:  # Full time range
-                # Need to set an appropriate time representing the mid-point of the entire time span
-                dt = dateList[-1]-dateList[0]
-                halfway = dateList[0]+(dt/2)
-                yyyy = int(halfway.year)
-                mm = int(halfway.month)
-                dd = int(halfway.day)
-            newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
-            i += 1
-
-    if not processing_required:
-        meanstorem = data
-    elif data.ndim == 3:
-        # Create masked array (using the missing data mask built above)
-        datamask_store = np.array(datamask_store)
-        meanstorem = ma.masked_array(meanstore, datamask_store)
-    else:
-        # 1D case: no per-pixel missing-data mask is built, so return the plain means
-        meanstorem = meanstore
-
-    return meanstorem, newTimesList
-
-def decode_model_timesK(ifile, timeVarName, file_type):
-    #################################################################################################
-    # Convert model times ('hours since 1900...', 'days since ...') into a python datetime structure
-    # Input:
-    #    ifile - a model file
-    #    timeVarName - name of the time variable in the model file
-    # Output:
-    #    times - list of python datetime objects describing model data times
-    # Peter Lean February 2011
-    #################################################################################################
-    f = Nio.open_file(ifile, mode='r', options=None, format=file_type)
-    xtimes = f.variables[timeVarName]
-    timeFormat = xtimes.attributes['units']
-    # e.g. timeFormat = "days since 1979-01-01 00:00:00"
-
-    # check that 'since' appears in the units
-    try:
-        sinceLoc = re.search('since', timeFormat).end()
-    except AttributeError:
-        print 'Error decoding model times: time var attributes do not contain "since": Exit'
-        sys.exit()
-
-    # search for 'minutes','hours','days','months','years' to determine the units
-    units = ''
-    for candidate in ['minutes', 'hours', 'days', 'months', 'years']:
-        if re.search(candidate, timeFormat):
-            units = candidate
-
-    # cut out the base time (the bit following 'since')
-    base_time_string = timeFormat[sinceLoc:].lstrip()
-    # decode the base time
-    base_time = decodeTimeFromString(base_time_string)
-
-    # 9/25/2012: datetime.timedelta has a problem with the argument because xtimes is a NioVariable.
-    # Soln (J. Kim): use a temp variable ttmp in the for loop, then convert it into an integer xtime.
-    times = []
-    for ttmp in xtimes[:]:
-        xtime = int(ttmp)
-        if(units=='minutes'):
-            new_time = base_time + datetime.timedelta(minutes=xtime)
-        if(units=='hours'):
-            new_time = base_time + datetime.timedelta(hours=xtime)
-        if(units=='days'):
-            new_time = base_time + datetime.timedelta(days=xtime)
-        if(units=='months'):
-            # NB. adding months in python is complicated, as month length varies and is hence ambiguous,
-            # so perform the date arithmetic manually.
-            # Assumption: the base date will usually be the first of the month.
-            # NB. this method will fail if the base time is on the 29th or a higher day of month,
-            # as e.g. Feb 31st does not exist.
-            new_month = int((base_time.month - 1 + xtime) % 12) + 1
-            new_year = int(base_time.year + (base_time.month - 1 + xtime) / 12)
-            new_time = datetime.datetime(new_year, new_month, base_time.day, base_time.hour, base_time.minute, base_time.second)
-        if(units=='years'):
-            # datetime.timedelta has no 'years' argument, so bump the year directly
-            new_time = datetime.datetime(base_time.year + xtime, base_time.month, base_time.day, base_time.hour, base_time.minute, base_time.second)
-        times.append(new_time)
-    return times
-
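Because the corrected month arithmetic above is easy to get wrong, here is a standalone sketch showing how 'months since' offsets map onto calendar dates; add_months is a hypothetical helper (not part of the module) and the dates are invented:

    import datetime

    def add_months(base_time, n):
        # a zero-based month index makes the wrap-around arithmetic simple
        idx = base_time.month - 1 + n
        return datetime.datetime(base_time.year + idx // 12, idx % 12 + 1, base_time.day)

    base = datetime.datetime(1979, 1, 1)
    print add_months(base, 11)   # -> 1979-12-01
    print add_months(base, 12)   # -> 1980-01-01
    print add_months(base, 23)   # -> 1980-12-01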
-def regrid_in_time(data, dateList, unit):
-    #################################################################################################
-    # Routine to calculate averages on longer time units than the data exists on,
-    # e.g. if the data is 6-hourly, calculate daily or monthly means.
-    # Input:
-    #    data - data values
-    #    dateList - list of python datetime structures corresponding to data values
-    #    unit - string describing time unit to average onto,
-    #           e.g. 'monthly', 'daily', 'pentad', 'annual', 'decadal'
-    # Output:
-    #    meanstorem - numpy masked array of data values meaned over the required time period
-    #    newTimesList - a list of python datetime objects representing the data in the new averaging period
-    #    NB. currently set to the beginning of the averaging period,
-    #    i.e. mean Jan 1st - Jan 31st -> represented as Jan 1st, 00Z.
-    # ..............................
-    # Jinwon Kim, Sept 30, 2012
-    # Created from calc_average_on_new_time_unit_K, Peter's original, with the modification below:
-    # instead of the masked arrays used by Peter, use the boolean index array wh to select the
-    # data within the averaging range.
-    #################################################################################################
-
-    print '*** Begin regrid_in_time ***'
-    # Check if the user-selected temporal grid is valid. If not, EXIT
-    acceptable = (unit=='full')|(unit=='annual')|(unit=='monthly')|(unit=='daily')|(unit=='pentad')
-    if not acceptable:
-        print 'Error: unknown unit type selected for time averaging: EXIT'
-        return -1, -1
-
-    # Calculate time arrays of: annual (yyyy, [2007]), monthly (yyyymm, [200701,200702]),
-    # daily (yyyymmdd, [20070101,20070102]).  "%02d" is similar to i2.2 in Fortran.
-    if unit=='annual':  # YYYY
-        timeunits = [str(d.year) for d in dateList]
-    elif unit=='monthly':  # YYYYMM
-        timeunits = [str(d.year) + "%02d" % d.month for d in dateList]
-    elif unit=='daily':  # YYYYMMDD
-        timeunits = [str(d.year) + "%02d" % d.month + "%02d" % d.day for d in dateList]
-    elif unit=='full':
-        # Full list: a special case. Calculating means over the entire time range,
-        # i.e. an annual-mean climatology, so we just want the same key for all times.
-        timeunits = [999 for d in dateList]
-    timeunits = np.array(timeunits, dtype=int)
-    print 'timeRegridOption= ', unit
-
-    # empty list to store time levels after temporal regridding
-    newTimesList = []
-
-    # Decide whether or not you need to do any time averaging:
-    # if the data are already on the required time unit then just pass the data
-    # through and calculate and return representative datetimes.
-    processing_required = True
-    if len(timeunits)==(len(np.unique(timeunits))):
-        processing_required = False
-    print 'processing_required= ', processing_required, ': input time steps= ', len(timeunits), ': regridded output time steps= ', len(np.unique(timeunits))
-    print 'data.ndim= ', data.ndim
-
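The change Jinwon Kim describes, selecting one averaging period with a boolean index array rather than masking the full array, can be seen in isolation in this sketch (shapes and keys invented):

    import numpy as np
    import numpy.ma as ma

    data = np.random.rand(6, 3, 4)               # (time, lat, lon)
    timeunits = np.array([200701]*3 + [200702]*3)

    wh = timeunits == 200701    # boolean index selecting one averaging period
    datam = data[wh, :, :]      # only the selected slab is carried forward
    print ma.average(datam, axis=0).shape        # -> (3, 4)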
-    if data.ndim==1:  # 1D data array, i.e. a time series
-        # Create array to store the temporally regridded data
-        meanstore = np.zeros(len(np.unique(timeunits)))
-        # Calculate the means across each unique time unit
-        i=0
-        for myunit in np.unique(timeunits):
-            if processing_required:
-                wh = timeunits==myunit
-                datam = data[wh]
-                meanstore[i] = ma.average(datam)
-            # construct new times list
-            smyunit = str(myunit)
-            if len(smyunit)==4:  # YYYY
-                yyyy = int(smyunit[0:4])
-                mm = 1
-                dd = 1
-            elif len(smyunit)==6:  # YYYYMM
-                yyyy = int(smyunit[0:4])
-                mm = int(smyunit[4:6])
-                dd = 1
-            elif len(smyunit)==8:  # YYYYMMDD
-                yyyy = int(smyunit[0:4])
-                mm = int(smyunit[4:6])
-                dd = int(smyunit[6:8])
-            elif len(smyunit)==3:  # Full time range
-                # Need to set an appropriate time representing the mid-point of the entire time span
-                dt = dateList[-1]-dateList[0]
-                halfway = dateList[0]+(dt/2)
-                yyyy = int(halfway.year)
-                mm = int(halfway.month)
-                dd = int(halfway.day)
-            newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
-            i += 1
-
-    elif data.ndim==3:  # 3D data array, i.e. a time series of 2D fields
-        # Create array to store the resulting data
-        meanstore = np.zeros([len(np.unique(timeunits)),data.shape[1],data.shape[2]])
-        # Calculate the means across each unique time unit
-        i=0
-        for myunit in np.unique(timeunits):
-            if processing_required:
-                wh = timeunits==myunit
-                datam = data[wh,:,:]
-                meanstore[i,:,:] = ma.average(datam, axis=0)
-            # construct new times list
-            smyunit = str(myunit)
-            if len(smyunit)==4:  # YYYY
-                yyyy = int(smyunit[0:4])
-                mm = 1
-                dd = 1
-            elif len(smyunit)==6:  # YYYYMM
-                yyyy = int(smyunit[0:4])
-                mm = int(smyunit[4:6])
-                dd = 1
-            elif len(smyunit)==8:  # YYYYMMDD
-                yyyy = int(smyunit[0:4])
-                mm = int(smyunit[4:6])
-                dd = int(smyunit[6:8])
-            elif len(smyunit)==3:  # Full time range
-                # Need to set an appropriate time representing the mid-point of the entire time span
-                dt = dateList[-1]-dateList[0]
-                halfway = dateList[0]+(dt/2)
-                yyyy = int(halfway.year)
-                mm = int(halfway.month)
-                dd = int(halfway.day)
-            newTimesList.append(datetime.datetime(yyyy,mm,dd,0,0,0,0))
-            i += 1
-
-    if not processing_required:
-        meanstorem = data
-    else:
-        meanstorem = meanstore
-
-    print '*** End regrid_in_time ***'
-    return meanstorem, newTimesList
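Putting the pieces together, a hypothetical end-to-end sketch of regrid_in_time averaging synthetic 6-hourly data onto a daily grid (the shapes and dates are illustrative only; no model file is read):

    import datetime
    import numpy as np

    # synthetic 6-hourly data for two days: 8 time steps of a 3x4 field
    dateList = [datetime.datetime(2007, 1, 1) + datetime.timedelta(hours=6*i) for i in range(8)]
    data = np.random.rand(8, 3, 4)

    dailyMeans, dailyTimes = regrid_in_time(data, dateList, 'daily')
    print dailyMeans.shape    # -> (2, 3, 4), one mean field per day
    print dailyTimes          # -> [2007-01-01 00:00, 2007-01-02 00:00]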