forked from david-edu-morales/TAAP_py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
taap_dataLoadQC.py
102 lines (89 loc) · 5.08 KB
/
taap_dataLoadQC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# %%
import pandas as pd
# %%
# Set up variables
keylist_mx = [26013, 26057, 26164] # create list of climate station keys
# %%
# *** MEXICAN CLIMATE STATIONS ***
# Read the files into a df and clean the data
# Create a dictionary of keys and filenames to call dataframes into another dictionary
filenameDict = {keylist_mx[key]: str(keylist_mx[key])+'_daily-record.txt' for key in range(len(keylist_mx))}
# Create a dictionary of keys and corresponding dataframes
dict_mx = {key: pd.read_fwf(filename,
skiprows=19,
skipfooter=1,
names=['date',
'precip',
'evap',
'tmax',
'tmin'])
for (key, filename) in filenameDict.items()}
# Edit data and prep for analysis
for key in keylist_mx:
dict_mx[key] = dict_mx[key].replace({'Nulo': None}, regex=True) # swap str to None type
dict_mx[key] = dict_mx[key].replace({'ul' : None}, regex=True)
dict_mx[key]['date'] = pd.to_datetime(dict_mx[key]['date'], # correct date format
infer_datetime_format=True,
dayfirst=True,
format='%Y-%m-%d')
dict_mx[key] = dict_mx[key].set_index('date') # set to datetimeIndex
dict_mx[key] = dict_mx[key].astype(float) # correct datatypes
# %%
# Melt the dataframe to arrange one observation per row
dictMelt = {key: pd.melt(frame=dict_mx[key],
value_vars=['precip','evap','tmax','tmin'],
value_name='measurement',
var_name='variable',
ignore_index=False)
for key in keylist_mx}
# Identify leap years contained within the data
leapYear = [1932, 1936, 1940, 1944, 1952, 1956, 1960, 1972, 1976,
1980, 1984, 1988, 1992, 2000, 2004, 2008, 2012, 2016]
# dictMelt[26057][(dictMelt[26057].index.day == 29) &
#(dictMelt[26057].month == 2)].index.year.unique().tolist()
# Create function that inputs the number of days in a month to be applied across rows
def days_of_month(row):
x = row['month']
if (row['year'] in leapYear):
dayList = [31,29,31,30,31,30,31,31,30,31,30,31]
else:
dayList = [31,28,31,30,31,30,31,31,30,31,30,31]
return dayList[x-1]
# Count the number of Nan values for each month.
for key in keylist_mx:
df = dictMelt[key] # Temporary rename of iterated df
df['month'] = df.index.month # Add month column
df['year'] = df.index.year # Add year column
df['key'] = key # Add key column
# Count the total number of null values by month/year/variable (for <= 10 values rule)
df['null_count'] = df.measurement.isnull().groupby([df['variable'],
df['month'],
df['year']]).transform('sum').astype(int)
# Count the number of valid observations by month/year/variable (for <= 10 values rule)
# Some months may have only 15 (for ex.) valid records with no null values.
# This would escape above filter.
df['days_of_record'] = df.month.notnull().groupby([df['variable'],
df['month'],
df['year']]).transform('sum').astype(int)
# Add the number of days for each month, sensitive to leap years.
df['days_of_month'] = df.apply(lambda row: days_of_month(row), axis=1)
#
df['min_days_reqd'] = df.days_of_month - 10
cons_null = df.measurement.isnull().astype(int).groupby(df.measurement.notnull().astype(int).cumsum()).transform('sum')
df['cons_null'] = cons_null
df['cons_null_max'] = df.cons_null.groupby([df['variable'],
df['month'],
df['year']]).transform('max')
# Remove months w/ > 10 missing values
dictMelt = {key: dictMelt[key][dictMelt[key]['null_count'] <= 10] for key in keylist_mx}
# Remove months w/ fewer days of record than required minimum
dictMelt = {key: dictMelt[key][dictMelt[key].days_of_record + dictMelt[key].null_count >
dictMelt[key].min_days_reqd] for key in keylist_mx}
# Remove months w/ > 5 consecutive missing values
dictMelt = {key: dictMelt[key][dictMelt[key].cons_null_max <= 5] for key in keylist_mx}
# Remove unneeded columns
dictMelt = {key: dictMelt[key][['variable','measurement','month','year','key']] for key in keylist_mx}
# %%
# Create cleaned-data csv files
for key in keylist_mx:
dictMelt[key].to_csv('data/'+str(key)+'_clean-data.csv')