-
Notifications
You must be signed in to change notification settings - Fork 4
/
resample.py
310 lines (256 loc) · 11.6 KB
/
resample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# Dependency install hints:
#   sudo pip install hungarian
#   sudo easy_install statsmodels
#   sudo apt-get install python-pandas
import pandas as pd
import numpy as np
import hungarian
import math
import argparse
from multiprocessing import Pool
from scipy.stats import ks_2samp
from scipy.stats import ttest_ind
from scipy.stats import chisquare
from sklearn import linear_model
import sys
# Module-level state shared by the helper functions in this file.
# Both dataframes start as None and are assigned by load_data().
df_full = None  # the input CSV
df_columns = None  # the column-definitions CSV; row 0 holds each column's type
# Per-column lookup tables keyed by column name, filled in by the __main__ block.
column_parameters = {}  # extra keyword arguments for the column's diff function
column_funs = {}  # the diff_* function used to compare two values of the column
column_weights = {}  # multiplier applied to the column's difference
def print_report():
    """Print a summary of the matching run to stdout.

    Reads the module-level ``column_weights``, ``df_columns``,
    ``df_sample_condition`` and ``df_matches`` and prints three sections:

    * the weight used for every column,
    * a two-sample Kolmogorov-Smirnov test per column (as computed by
      ``ks``); rows with p < 0.1 are prefixed with ``*``,
    * an independent two-sample t-test for every column whose type in the
      column definitions is "ignore". Columns for which the test cannot be
      computed (e.g. non-numeric data) are skipped silently.
    """
    print("Column weights used:")
    for attribute in sorted(column_weights):
        print("\t" + attribute.ljust(30) + "\t" + str(column_weights[attribute]))
    print("")

    print("Two-sample Kolmogorov-Smirnov:")
    ks_vals = ks(df_sample_condition, df_matches)
    for attribute in sorted(ks_vals):
        # Flag attributes whose p-value is below 0.1.
        marker = "* " if ks_vals[attribute][1] < 0.1 else ""
        print(marker + "\t" + attribute.ljust(30)
              + "\tD={:.4f}\tp={:.4f}".format(*ks_vals[attribute]))
    print("")

    print("Independent two-sample t-test:")
    # Row 0 of the column definitions holds each column's type.
    column_types = df_columns.iloc[0]
    for attribute in sorted(df_columns.columns):
        if column_types[attribute] != "ignore":
            continue
        try:
            t, p = ttest_ind(df_sample_condition[attribute], df_matches[attribute])
        except Exception:
            # Best effort: the column is probably not numeric, so no t-test
            # can be computed. Unlike a bare ``except`` this still lets
            # KeyboardInterrupt / SystemExit propagate.
            continue
        print("\t" + attribute.ljust(30) + "\tt={:.4f}\tp={:.4f}".format(t, p))
def ks(df_sample_condition, df_match):
    """Run a two-sample Kolmogorov-Smirnov test on every non-ignored column.

    For each column in the module-level ``df_columns`` whose type (row 0 of
    the column definitions) is not "ignore", compares that column of
    df_sample_condition against the same column of df_match using
    scipy.stats.ks_2samp.

    Returns a dict mapping column name to the ks_2samp result (statistic
    first, p-value second).
    """
    # Positional access to row 0; the old ``.ix`` indexer no longer exists in
    # current pandas releases.
    column_types = df_columns.iloc[0]
    return {
        column: ks_2samp(df_sample_condition[column], df_match[column])
        for column in df_columns
        if column_types[column] != "ignore"
    }
def write_output(output, df_sample_condition, df_matches, side_by_side=False):
    """Write the sample condition and its matches to a single CSV file.

    output is the path (or file-like object) handed to DataFrame.to_csv.
    If side_by_side is true, the sample condition is written to the left of
    the matches: the match columns are prefixed with "matched_" and joined to
    the sample condition on the dataframe index, so pairs share a row.
    Otherwise the sample condition is written above the matches, with the
    matched pairs in the same order.
    """
    if side_by_side:
        columns_to_rename = {
            column: "matched_" + str(column)
            for column in df_sample_condition.columns
        }
        df_output = df_matches.rename(columns=columns_to_rename)
        # rename the index we will join on back
        df_output = df_output.rename(columns={"_index": "index"})
        df_output = df_sample_condition.join(df_output)
    else:
        df_output = pd.concat([df_sample_condition, df_matches])
    # Write to the destination the caller asked for, not a module-level global.
    df_output.to_csv(output, encoding='utf-8')
def run_hungarian(matrix, df_population, df_sample_condition):
    """Solve the linear assignment problem and return the matched rows.

    matrix is the square cost matrix handed to hungarian.lap. The first
    len(df_sample_condition) entries of the solver's second return value are
    used as row positions into df_population (presumably the population row
    assigned to each sample column -- confirm against the hungarian package).

    Returns the selected rows of df_population as a new dataframe with the
    same columns, one row per row of df_sample_condition and in the same
    order.
    """
    _, col_assigns = hungarian.lap(matrix)
    matched_positions = list(col_assigns[:len(df_sample_condition)])
    # The solver works with positions in the matrix, so select positionally;
    # this also avoids the ``.ix`` indexer that current pandas no longer has.
    return df_population.iloc[matched_positions]
def discover_weightings(df_full):
    """Derive a weight for each usable column from a linear regression.

    Every column of the module-level df_columns that is not typed "ignore"
    and is not match_attribute is used as a predictor of match_attribute.
    Returns a dict mapping each predictor column name to its fitted
    coefficient. The predictor columns must not contain null/None/NaN values.
    TODO: Remove the requirement for no null/None/NaN values.
    """
    predictors = [
        name
        for name in df_columns.columns
        if df_columns[name][0] != "ignore" and name != match_attribute
    ]
    model = linear_model.LinearRegression()
    target = df_full[match_attribute]
    features = df_full[predictors]
    model.fit(features, target)
    # Coefficients come back in the same order as the predictor columns.
    return dict(zip(predictors, model.coef_))
def diff_two_rows(x, y):
    """Compare two dataframe rows column by column.

    Calls diff() for every column in the module-level df_columns and keeps
    the results that are not None. Returns a tuple:
    (mean of the kept differences, dict of column name -> difference).
    """
    diffs = {}
    for column_name in df_columns.columns:
        difference = diff(x[column_name], y[column_name], column_name)
        if difference is not None:
            diffs[column_name] = difference
    # Materialise the values: numpy cannot average a Python 3 dict view.
    return (np.mean(list(diffs.values())), diffs)
def diff_ignore(one, two):
    """Difference function for ignored columns: always returns None."""
    return None
def diff_nominal(one, two):
    """Difference for nominal values: 0 when they are equal, otherwise 1."""
    return 0 if one == two else 1
def diff_ordinal(one, two, sorted_range):
    """Normalised distance between two ordinal values.

    sorted_range is the ordered list of all possible values. The result is
    the distance between the positions of one and two in that list divided
    by the largest possible distance, i.e. a float between 0.0 and 1.0.
    Raises ValueError if either value is not in sorted_range.
    """
    pos1 = sorted_range.index(one)
    pos2 = sorted_range.index(two)
    span = len(sorted_range) - 1
    if span == 0:
        # Only one possible value: nothing can differ, and dividing by the
        # span would raise ZeroDivisionError.
        return 0.0
    return math.fabs(pos1 - pos2) / span
def diff_real(one, two, min, max):
    """Normalised absolute difference between two real values.

    min and max are the bounds of the column; both values are scaled by
    (max - min) before the absolute difference is taken. If either value is
    NaN the result is NaN. When max equals min the column has no spread, so
    two present values are reported as identical (0.0).

    The parameter names shadow the builtins on purpose: callers pass them as
    keyword arguments named "min" and "max".
    """
    top = float(max - min)
    one = float(one)
    two = float(two)  # was ``two - float(two)``, which discarded the conversion
    if top == 0:
        # Avoid dividing by zero while still letting missing values show up
        # as NaN.
        if math.isnan(one) or math.isnan(two):
            return float("nan")
        return 0.0
    return math.fabs((one / top) - (two / top))
def diff(one, two, column_name):
    """Compare two values of the named column.

    Looks up the column's comparison function in column_funs and its extra
    keyword arguments in column_parameters, and returns whatever that
    function returns.
    """
    compare = column_funs[column_name]
    extra_kwargs = column_parameters[column_name]
    return compare(one, two, **extra_kwargs)
def load_data():
    """Read the input and column-definition CSVs into the module globals.

    Sets df_full from file_input and df_columns from file_columns. Empty
    strings, a single space and "NULL" are read as missing values.
    """
    global df_full, df_columns

    def _read(path):
        return pd.read_csv(path, na_values=["", " ", "NULL"])

    df_full = _read(file_input)
    df_columns = _read(file_columns)
if __name__ == "__main__":
    # Command-line entry point: parse arguments, build the cost matrix between
    # every population row and every sample-condition row, solve the
    # assignment problem, then write the matches and print a report.
    parser = argparse.ArgumentParser(
        description='Subsamples a large population based on the characteristics of a small population.')
    parser.add_argument('-i', '--input', help='Input filename as a CSV')
    parser.add_argument('-o', '--output', help='Output filename for matches')
    parser.add_argument('-c', '--column_definitions', help='The CSV of column definitions')
    parser.add_argument('-m', '--match', help='The attribute to match one')
    parser.add_argument('-w', '--weights', help='Whether weights should be automatically discovered or evenly applied')
    args = parser.parse_args()

    # These are module-level names on purpose: load_data(),
    # discover_weightings() and print_report() read them as globals.
    file_columns = args.column_definitions
    file_input = args.input
    file_output = args.output
    match_attribute = args.match
    weights = args.weights

    # load datafiles into pandas dataframes
    load_data()

    # separate the general population for matching and the subsample of
    # interest based on the match_attribute (0 = population, 1 = sample)
    df_population = df_full[df_full[match_attribute] == 0].reset_index(drop=True)
    df_sample_condition = df_full[df_full[match_attribute] == 1].reset_index(drop=True)

    # Validate explicitly instead of with assert, which is stripped under -O.
    # make sure sample condition and population and column definitions are all the same size
    if not (len(df_population.columns) == len(df_columns.columns)
            == len(df_sample_condition.columns)):
        raise ValueError("All data files must have the same number of columns")
    # make sure that population file has at least one free choice in it
    if len(df_population) <= len(df_sample_condition):
        raise ValueError(
            "The population file must have more items in it than the sample condition file")

    column_weights = {}
    if weights == "auto":
        # fit a linear regression to discover the weightings for each column
        # NOTE(review): regression coefficients can be negative, which would
        # make a larger difference lower the cost -- confirm this is intended.
        column_weights = discover_weightings(df_full)
    else:
        # set all columns to 1 except those that will not be used
        for column_name in df_columns.columns:
            if df_columns[column_name][0] != "ignore":
                column_weights[column_name] = 1.0
            else:
                column_weights[column_name] = 0.0

    # go through and build shorthand variables for the columns based on the
    # data in the definitions file
    for column_name in df_columns:
        column_type = df_columns[column_name][0]
        if column_type == "ordinal":
            # every distinct value seen in either group, in sorted order
            distinct = set(df_sample_condition[column_name].unique().tolist())
            distinct.update(df_population[column_name].unique().tolist())
            column_parameters[column_name] = {"sorted_range": sorted(distinct)}
            column_funs[column_name] = diff_ordinal
        elif column_type == "real":
            # overall bounds across both groups
            mymin = min(np.min(df_sample_condition[column_name]),
                        np.min(df_population[column_name]))
            mymax = max(np.max(df_sample_condition[column_name]),
                        np.max(df_population[column_name]))
            column_parameters[column_name] = {"min": mymin, "max": mymax}
            column_funs[column_name] = diff_real
        elif column_type == "ignore":
            column_parameters[column_name] = {}
            column_funs[column_name] = diff_ignore
        elif column_type == "nominal":
            column_parameters[column_name] = {}
            column_funs[column_name] = diff_nominal
        else:
            # Fail here with a clear message rather than with a KeyError from
            # diff() once the matrix is being built.
            raise ValueError("Unknown column type %r for column %r"
                             % (column_type, column_name))
        # add a zero weight for anything that does not already have a weight
        column_weights.setdefault(column_name, 0)

    # create a matrix filled with ones (worst match value); it is square so
    # the columns beyond the sample size act as padding for the solver
    population_size = len(df_population)
    matrix = np.ones((population_size, population_size), dtype=np.float32)
    sys.stdout.write("Building difference tables matrix of size ("
                     + str(population_size) + "x" + str(population_size) + "):")
    # Materialise the sample rows once instead of once per population row.
    sample_rows = [row for _, row in df_sample_condition.iterrows()]
    for x_i, (_, population_row) in enumerate(df_population.iterrows()):
        for y_i, sample_row in enumerate(sample_rows):
            diffs = []
            for column_name in df_columns.columns:
                difference = diff(population_row[column_name],
                                  sample_row[column_name], column_name)
                if difference is None:
                    # ignored column
                    continue
                # todo: right now if a value is missing we maximize it, setting it to totally different at 1, is this reasonable?
                # todo: instead should we just ignore this? or should it be a sort of special value?
                if np.isnan(difference):
                    difference = 1
                # The maximised value must be counted too; previously it was
                # assigned and then dropped, so missing data cost nothing.
                diffs.append(difference * column_weights[column_name])
            matrix[x_i, y_i] = np.sum(diffs)
        # one progress dot per population row
        sys.stdout.write(" .")
        sys.stdout.flush()
    sys.stdout.write("\n")

    # run lap solver
    print("Running the LAP using the hungarian method.")
    df_matches = run_hungarian(matrix, df_population, df_sample_condition)
    df_matches = df_matches.reset_index(drop=True)

    # write output files
    write_output(file_output, df_sample_condition, df_matches)

    # write output report
    print_report()
# NOTE: the triple-quoted string below is a bare string literal, not executed
# code. It holds a disabled, brute-force alternative to the assignment solver
# that collects, for each sample row, every population row whose
# diff_two_rows() average is at or below a per-row threshold. It reads
# ``df_deltas``, which is not defined anywhere in the visible file.
"""
#Alternative matches
import time
before = int(round(time.time() * 1000))
print before
matches={}
for rowiter in df_deltas.iterrows():
diff_value=np.mean(rowiter[1])
curr_sample_val=rowiter[0]
candidates=[]
for popiter in df_population.iterrows():
difference_value=diff_two_rows(df_sample_condition.ix[curr_sample_val], df_population.ix[popiter[0]])[0]
if difference_value <= diff_value:
candidates.append( (popiter[0], difference_value) )
matches[curr_sample_val]=candidates
after = int(round(time.time() * 1000))
print after
print str(after-before)
"""
# Leftover, commented-out regression experiment.
#non_regression=df_john[["GENDER_CODE","BIRTH_YEAR","ABORIGINAL_ANCESTRY_IND","SELF_REPORTED_DISABILITY_IND","cu_year1","entrance_average"]]
#clf.fit(non_regression,regression_val)
#clf.coef_