# ALS.py
# Author: Matthew Smarte
# Version: 1.2.0
# Last Updated: 01/12/19
# This code is designed to be imported and run inside a Jupyter notebook using an IPython kernel.
'''
ALS.py DEPENDENCIES
This code was developed and tested using the following packages/versions.
Other versions may also work.
python 3.6.1
numpy 1.12.1
pandas 0.20.1
scipy 0.19.0
matplotlib 2.2.2
ipython 5.3.0
ALS.py MAJOR UPDATE HISTORY
1.0.0 - 07/10/18 - Initial release (__init__, fit, plot_data_model, plot_model, _time_axis, _model, and _conv_IRF methods).
1.1.0 - 07/25/18 - Added bootstrap method.
1.1.1 - 10/04/18 - Added conc_units kwarg for the fit, plot_data_model, and bootstrap methods.
1.1.2 - 01/09/19 - Added quiet kwarg to bootstrap method.
1.2.0 - 01/12/19 - Added preliminary monte_carlo_params method.
'''
# TODO:
# Add checks to make sure user inputs have the correct formatting, and throw errors if they do not (ex: the df_data err field can have no zeros)
# Adjust the random sampling methodology of the Monte Carlo parameter sampling to accommodate parameters with negative values and correlation between parameters
import numpy as np
import pandas as pd
from scipy.optimize import leastsq
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from IPython.display import display, clear_output
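
# --- Example user model (a hedged sketch, not part of the original API; see ex_notebook_1.ipynb
# for the authoritative documentation). KineticModel calls user_model(t, model_params) and, from
# the usage in _model and _conv_IRF below, expects a tuple (m, c): m maps each species name to its
# mass (amu), and c maps each species name to an ndarray of concentrations on the time axis t (ms),
# with photolysis at t = 0. The species name 'X' and rate constant 'k' here are illustrative
# assumptions; 'X0' (initial radical concentration) is the key required by _model.
def _example_user_model(t, model_params):
	X0 = model_params['X0']  # Initial radical concentration (molc/cm3) - required key
	k = model_params['k']    # Hypothetical first-order loss rate (1/ms) - assumption for this sketch
	# Zero pre-photolysis baseline, simple exponential decay after photolysis
	conc = np.where(t < 0, 0.0, X0*np.exp(-k*np.clip(t, 0, None)))
	m = {'X': 17.0}  # Assumed mass (amu) for the illustrative species 'X'
	c = {'X': conc}
	return m, c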
class KineticModel:

	_dt = 0.02  # Fundamental kinetic time step (ms)

	def __init__(self, user_model, err_weight=True, fit_pre_photo=False, apply_IRF=True, apply_PG=True, t_PG=1.0):
		'''
		Initializer for a KineticModel object.
		See ex_notebook_1.ipynb for API documentation.
		'''
		self._user_model = user_model
		self._err_weight = err_weight
		self._fit_pre_photo = fit_pre_photo
		self._apply_IRF = apply_IRF
		self._apply_PG = apply_PG
		self._t_PG = t_PG
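
	# Constructor keyword summary (inferred from the usage in the methods below; see
	# ex_notebook_1.ipynb for the authoritative API documentation):
	#   err_weight    - if True, residuals are divided by the per-point 'err' arrays in df_data
	#   fit_pre_photo - if True, pre-photolysis points (t < t0) are included in the cost function
	#   apply_IRF     - if True, modeled profiles are convolved with the instrument response function
	#   apply_PG      - if True, the photolysis gradient is applied during model integration
	#   t_PG          - integration step (ms) used when applying the photolysis gradient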
	def fit(self, t, tbin, df_data, df_model_params, df_ALS_params, delta_xtick=20.0, conc_units=False, save_fn=None, quiet=False, **kwargs):
		'''
		Method for fitting data and optimizing parameters.
		See ex_notebook_1.ipynb for API documentation.
		'''
		# Check fit t0 / fit_pre_photo
		if df_ALS_params.at['t0','fit'] and not self._fit_pre_photo:
			print('ERROR: Fit t0 is True but fit_pre_photo is False!')
			print('If we are fitting t0, then we must also fit the pre-photolysis data.')
			print('Otherwise, the cost function would be minimized by arbitrarily increasing t0.')
			return None, None, None, None, None, None

		# Determine the start time, end time, and range of times over which to fit
		t_start = float(t.min())
		t_end = float(t.max())
		idx_cost = np.full(t.shape, True) if self._fit_pre_photo else (t >= df_ALS_params.at['t0','val'])  # Boolean array of indices to use in cost calculation

		# Establish correspondence between the data and model time axes
		# Each entry in t has an exact match to an entry in t_model
		# We take the below approach (rather than ==) to prevent any problems with numerical roundoff
		t_model = self._time_axis(t_start, t_end, tbin)
		idx_model = [np.abs(t_model - t[_]).argmin() for _ in range(t.size)]  # Array of positional indices, maps model axis --> data axis

		# Organize the fitted species data into data_val and data_err frames
		# Columns of data_val and data_err are species and the rows correspond to times in the t array
		data_fit = df_data[df_data['fit']]
		species_names = list(data_fit.index)
		data_val = pd.DataFrame(list(data_fit['val']), index=species_names).T
		if self._err_weight:
			data_err = pd.DataFrame(list(data_fit['err']), index=species_names).T

		# Organize the fit parameters
		model_params_fit = df_model_params[df_model_params['fit']]
		ALS_params_fit = df_ALS_params[df_ALS_params['fit']]
		p_names = list(model_params_fit.index) + list(ALS_params_fit.index)
		p0 = np.concatenate((model_params_fit['val'], ALS_params_fit['val']))

		# Define the cost function to be optimized
		def calc_cost(p):
			# Organize the parameter values used for the current iteration of the fit into dictionaries
			model_params_p = {}
			for param in df_model_params.index:
				model_params_p[param] = p[p_names.index(param)] if param in p_names else df_model_params.at[param,'val']
			ALS_params_p = {}
			for param in df_ALS_params.index:
				ALS_params_p[param] = p[p_names.index(param)] if param in p_names else df_ALS_params.at[param,'val']

			# Run the model - we only need the concentrations dataframe
			_, c_model = self._model(t_start, t_end, tbin, model_params_p, ALS_params_p)

			# Calculate the weighted residual array across the points included in the cost computation
			res = []
			for species in species_names:
				obs = data_val[species]
				mod = ALS_params_p['S_'+species]*c_model[species]
				# We take the sqrt of the weight since leastsq will square the array later
				# Important to perform the .values conversion to array BEFORE we subtract obs and mod (pandas subtracts Series by index alignment, not position)
				species_res = np.sqrt(data_fit.at[species,'weight']) * (obs.values - mod.values[idx_model])[idx_cost]
				if self._err_weight:
					err = data_err[species]
					species_res = species_res / err.values[idx_cost]
				res.append(species_res)

			# leastsq will square then sum all entries in the returned array, and minimize this cost value
			return np.concatenate(res)

		if not quiet:
			print('Initial Cost Function Value: {:g}'.format((calc_cost(p0)**2).sum()))
			print()

		# Perform the fit
		# NOTE: The backend of leastsq will automatically rescale the fit parameters to the same order of magnitude if diag=None (default).
		p, cov_p, infodict, mesg, ier = leastsq(calc_cost, p0, full_output=True, **kwargs)

		# Calculate the minimized cost value
		cost = (infodict['fvec']**2).sum()

		# Prepare the covariance and correlation matrices
		if cov_p is not None:
			# Scale the covariance matrix according to the documentation for leastsq and the source code for curve_fit.
			# The scale factor is cost / (# of data points - # of fit parameters).
			M = infodict['fvec'].size
			N = p.size
			cov_p *= cost / (M-N)
			# Compute standard errors
			p_err = np.sqrt(np.diag(cov_p))
			# Compute the correlation matrix
			corr_p = np.array([[cov_p[i][j] / np.sqrt(cov_p[i][i]*cov_p[j][j]) for j in range(cov_p.shape[1])] for i in range(cov_p.shape[0])])
		else:
			p_err = np.full(p.shape, np.NaN)
			corr_p = None

		# Convert the fit results to dataframes
		df_p = pd.DataFrame(np.array((p,p_err)).T, index=p_names, columns=('val','err'))
		df_cov_p = pd.DataFrame(cov_p, index=p_names, columns=p_names)
		df_corr_p = pd.DataFrame(corr_p, index=p_names, columns=p_names)

		if not quiet:
			# Display results
			print('Optimization terminated successfully.' if ier in (1,2,3,4) else 'Optimization FAILED.')
			print('Exit Code = {:d}'.format(ier))
			print('Exit Message = {}'.format(mesg))
			print()
			print('Optimized Cost Function Value = {:g}'.format(cost))
			print()
			print('Optimized Parameters and Estimated Standard Errors:')
			display(df_p)
			print()
			print('Estimated Correlation Matrix:')
			display(df_corr_p)
			print()
			# Plot the fits
			df_model_params_p = df_model_params.copy()
			df_ALS_params_p = df_ALS_params.copy()
			for param in df_p.index:
				if param in df_model_params_p.index:
					df_model_params_p.at[param,'val'] = df_p.at[param,'val']
				else:
					df_ALS_params_p.at[param,'val'] = df_p.at[param,'val']
			self.plot_data_model(t, tbin, df_data, df_model_params_p, df_ALS_params_p, delta_xtick=delta_xtick, conc_units=conc_units, save_fn=save_fn, print_cost=False)

		return df_p, df_cov_p, df_corr_p, cost, mesg, ier
	def plot_data_model(self, t, tbin, df_data, df_model_params, df_ALS_params, delta_xtick=20.0, conc_units=False, save_fn=None, print_cost=True):
		'''
		Method for plotting the scaled model overlaid on the input species data. (No fit is performed.)
		Residuals are also plotted, and the value of the cost function with these parameters is optionally printed.
		Only species for which fit is True in df_data are shown and included in the cost calculation.
		See ex_notebook_1.ipynb for API documentation.
		'''
		# Run the model
		t_start = float(t.min())
		t_end = float(t.max())
		t_model, c_model = self._model(t_start, t_end, tbin, df_model_params['val'].to_dict(), df_ALS_params['val'].to_dict())

		# Only plot the species for which fit=True
		# Columns of data_val and data_err are species and the rows correspond to times in the t array
		data_fit = df_data[df_data['fit']]
		species_names = list(data_fit.index)
		nSpecies = len(species_names)
		data_val = pd.DataFrame(list(data_fit['val']), index=species_names).T
		if self._err_weight:
			data_err = pd.DataFrame(list(data_fit['err']), index=species_names).T

		# Setup for the residual and cost computations
		cost = 0
		idx_cost = np.full(t.shape, True) if self._fit_pre_photo else (t >= df_ALS_params.at['t0','val'])  # Boolean array of indices to use in cost calculation
		idx_model = [np.abs(t_model - t[_]).argmin() for _ in range(t.size)]  # Array of positional indices, maps model axis --> data axis

		# Set up the grid of subplots
		ncols = 3
		nrows = (nSpecies//ncols) if (nSpecies%ncols) == 0 else (nSpecies//ncols)+1
		dpi = 120
		plt.rc('font', size=9)
		plt.rc('axes.formatter', useoffset=False)
		f = plt.figure(figsize=(1000/dpi,450*nrows/dpi), dpi=dpi)
		gs = gridspec.GridSpec(nrows, ncols, figure=f, hspace=0.3, wspace=0.3, top=0.9)

		# Determine the x-axis ticks
		tick_low = (t_start//delta_xtick)*delta_xtick
		tick_high = t_end if t_end % delta_xtick == 0. else ((t_end//delta_xtick)+1)*delta_xtick
		ticks = np.linspace(tick_low, tick_high, num=round(((tick_high-tick_low)/delta_xtick)+1), endpoint=True)

		# Make the subplots
		c_data = []
		s_model = []
		for i, species in enumerate(species_names):
			obs = data_val[species]
			mod = df_ALS_params.at['S_'+species,'val']*c_model[species]
			s_model.append(mod)
			c_obs = data_val[species]/df_ALS_params.at['S_'+species,'val']
			c_mod = c_model[species]
			c_data.append(c_obs)

			# Compute this species' residual and cost contribution
			# Important to perform the .values conversion to array BEFORE we subtract obs and mod (pandas subtracts Series by index alignment, not position)
			res = obs.values - mod.values[idx_model]
			cost_i = np.sqrt(data_fit.at[species,'weight']) * res[idx_cost]
			if self._err_weight:
				err = data_err[species]
				cost_i = cost_i / err.values[idx_cost]
			cost += (cost_i**2).sum()
			c_res = c_obs.values - c_mod.values[idx_model]

			j = i // 3  # Row index
			k = i % 3   # Col index
			gs_jk = gridspec.GridSpecFromSubplotSpec(2, 1, hspace=0, height_ratios=[3,1], subplot_spec=gs[j,k])
			ax0 = plt.subplot(gs_jk[0])  # Data & Model
			ax1 = plt.subplot(gs_jk[1])  # Data - Model

			if conc_units:
				ax0.ticklabel_format(axis='y', style='sci', scilimits=(0,0))
				ax0.plot(t, c_obs, 'o')                # Plot the data
				ax0.plot(t_model, c_mod, linewidth=2)  # Plot the model
				# OOM calculation adapted from the matplotlib.ticker.ScalarFormatter._set_orderOfMagnitude source code
				oom = np.floor(np.log10(ax0.get_yticks().max()))
				ax1.ticklabel_format(axis='y', style='plain')
				ax1.plot(t, c_res/(10**oom), 'o')      # Plot the residual
			else:
				ax0.plot(t, obs, 'o')                  # Plot the data
				ax0.plot(t_model, mod, linewidth=2)    # Plot the model
				ax1.plot(t, res, 'o')                  # Plot the residual
			ax1.plot(t_model, np.zeros(t_model.shape))  # Plot the zero residual line

			# Manually set the x-axis ticks
			ax0.set_xticks(ticks)
			ax1.set_xticks(ticks)

			# Labels
			ax0.set_title(species, fontweight='bold')  # Make the title the species name
			ax0.set_xticklabels([])                    # Hide the x-axis tick labels for the top plot
			ax1.set_xlabel('Time (ms)')                # Set the x-axis label for the bottom plot
			if k == 0:                                 # Set the y-axis labels if the plot is in the first column
				ax0.set_ylabel('Data & Model')
				ax1.set_ylabel('Data - Model')

		plt.show()

		# Print the value of the cost function
		if print_cost:
			print()
			print('Cost Function Value = {:g}'.format(cost))

		# Save the data and scaled model traces
		if save_fn:
			append_data = (lambda _ : _ + '-data')
			append_model = (lambda _ : _ + '-model')
			if conc_units:
				df_data_out = pd.DataFrame(c_data).T.rename(columns=append_data)
				df_model_out = c_model.rename(columns=append_model)
			else:
				df_data_out = data_val.rename(columns=append_data)
				df_model_out = pd.DataFrame(s_model).T.rename(columns=append_model)
			df_data_out.insert(0,'t-data',t)
			df_model_out.insert(0,'t-model',t_model)
			df = pd.concat((df_data_out,df_model_out), axis='columns')
			df.to_csv(save_fn, index=False)
	def plot_model(self, t_start, t_end, tbin, df_model_params, df_ALS_params, delta_xtick=20.0, save_fn=None):
		'''
		Plots the model in concentration units (molc/cm3) without the data.
		All species returned by the user model function are shown.
		See ex_notebook_1.ipynb for API documentation.
		'''
		# Run the model
		t_model, c_model = self._model(t_start, t_end, tbin, df_model_params['val'].to_dict(), df_ALS_params['val'].to_dict())
		species_names = list(c_model.columns)
		nSpecies = len(species_names)

		# Set up the grid of subplots
		ncols = 3
		nrows = (nSpecies//ncols) if (nSpecies%ncols) == 0 else (nSpecies//ncols)+1
		dpi = 120
		plt.rc('font', size=9)
		plt.rc('axes.formatter', useoffset=False)
		f = plt.figure(figsize=(1000/dpi,325*nrows/dpi), dpi=dpi)
		gs = gridspec.GridSpec(nrows, ncols, figure=f, hspace=0.45, wspace=0.3, top=0.9, bottom=0.2)

		# Determine the x-axis ticks
		tick_low = (t_start//delta_xtick)*delta_xtick
		tick_high = t_end if t_end % delta_xtick == 0. else ((t_end//delta_xtick)+1)*delta_xtick
		ticks = np.linspace(tick_low, tick_high, num=round(((tick_high-tick_low)/delta_xtick)+1), endpoint=True)

		# Make the subplots
		s_model = []
		for i, species in enumerate(species_names):
			mod = c_model[species]
			s_model.append(mod)
			j = i // 3  # Row index
			k = i % 3   # Col index
			ax = plt.subplot(gs[j,k])
			ax.ticklabel_format(axis='y', style='sci', scilimits=(0,0))
			ax.plot(t_model, mod, linewidth=2)
			# Manually set the x-axis ticks
			ax.set_xticks(ticks)
			# Labels
			ax.set_title(species, fontweight='bold')  # Make the title the species name
			ax.set_xlabel('Time (ms)')                # Set the x-axis label
			if k == 0:                                # Set the y-axis label if the plot is in the first column
				ax.set_ylabel(r'Concentration ($\mathregular{molc/cm^{3}}$)')
		plt.show()

		# Save the model traces
		if save_fn:
			df = pd.DataFrame(s_model).T
			df.insert(0,'t',t_model)
			df.to_csv(save_fn, index=False)
	def bootstrap(self, t, tbin, df_data, df_model_params, df_ALS_params, N, delta_xtick=20.0, conc_units=False, save_fn=None, quiet=False, **kwargs):
		'''
		Performs a bootstrap simulation to estimate the covariance matrix of the fit parameters.
		See ex_notebook_1.ipynb for API documentation.
		'''
		# Check fit t0 / fit_pre_photo
		if df_ALS_params.at['t0','fit'] and not self._fit_pre_photo:
			print('ERROR: Fit t0 is True but fit_pre_photo is False!')
			print('If we are fitting t0, then we must also fit the pre-photolysis data.')
			print('Otherwise, the cost function would be minimized by arbitrarily increasing t0.')
			return None, None, None, None

		# Determine the range of data over which to generate bootstrap samples
		idx_cost = np.full(t.shape, True) if self._fit_pre_photo else (t >= df_ALS_params.at['t0','val'])
		M = sum(idx_cost)  # Length of each bootstrap sample

		dist_p = []
		N_success = 0
		N_fail = 0
		while N_success < N:
			if not quiet:
				print('Successful Iterations: {:d}'.format(N_success))
				print('Failed Iterations: {:d}'.format(N_fail))
				print()
				print('Current Iteration: {:d}'.format(N_success+N_fail+1))
				clear_output(wait=True)

			# Randomly generate indices (with replacement)
			idx_b = np.random.choice(M, M)

			# Create the bootstrap sample
			t_b = t[idx_cost][idx_b]
			df_data_b = df_data.copy()
			for species in df_data_b.index:
				df_data_b.at[species,'val'] = df_data_b.at[species,'val'][idx_cost][idx_b]
				df_data_b.at[species,'err'] = df_data_b.at[species,'err'][idx_cost][idx_b]

			# Fit the bootstrap sample
			df_p_b, _, _, cost_b, mesg_b, ier_b = self.fit(t_b, tbin, df_data_b, df_model_params, df_ALS_params, quiet=True, **kwargs)
			success = (ier_b in (1,2,3,4))

			# Save the output as the function runs so it may be accessed during a simulation
			# Note: We use pd.DataFrame.to_csv because pd.Series.to_csv orients as a column instead of a row
			if save_fn:
				mesg_b_adj = mesg_b.replace('\n','').replace(',','')  # Remove newlines and commas
				row_save = pd.DataFrame(df_p_b['val'].append(pd.Series((cost_b,success,ier_b,mesg_b_adj), index=('cost','success','ier','mesg')))).T
				if (N_success+N_fail) == 0:
					# First iteration - create a new file containing the first row and column headers
					row_save.to_csv(save_fn, index=False)
				else:
					# Subsequent iterations - append the row to the file without headers
					row_save.to_csv(save_fn, index=False, header=False, mode='a')

			# If the fit was successful, add the results to dist_p for calculating statistics after the loop is finished
			if success:
				dist_p.append(df_p_b['val'])
				N_success += 1
			else:
				N_fail += 1

		df_dist_p = pd.DataFrame(dist_p, index=pd.RangeIndex(len(dist_p)))

		# Summary statistics
		# The estimated standard error is the standard deviation of the bootstrap distribution
		df_p = pd.DataFrame([df_dist_p.mean(),df_dist_p.std()], index=('val','err')).T
		df_cov_p = df_dist_p.cov()
		df_corr_p = df_dist_p.corr()

		if not quiet:
			# Display results
			print('Bootstrap simulation completed.')
			print('Successful Iterations: {:d}'.format(N_success))
			print('Failed Iterations: {:d}'.format(N_fail))
			print()
			print('Returned variables and the summary below include only successful iterations.')
			if save_fn:
				print('Results saved to file include all iterations.')
			print()
			print('Average Parameter Values and Estimated Standard Errors:')
			display(df_p)
			print()
			print('Estimated Correlation Matrix:')
			display(df_corr_p)
			print()
			print('The plots and cost below use the average parameter values:')
			# Make plots
			df_model_params_p = df_model_params.copy()
			df_ALS_params_p = df_ALS_params.copy()
			for param in df_p.index:
				if param in df_model_params_p.index:
					df_model_params_p.at[param,'val'] = df_p.at[param,'val']
				else:
					df_ALS_params_p.at[param,'val'] = df_p.at[param,'val']
			self.plot_data_model(t, tbin, df_data, df_model_params_p, df_ALS_params_p, delta_xtick=delta_xtick, conc_units=conc_units, save_fn=None, print_cost=True)

		return df_p, df_cov_p, df_corr_p, df_dist_p
	def monte_carlo_params(self, t, tbin, df_data, df_model_params, df_ALS_params, N, delta_xtick=20.0, conc_units=False, save_fn=None, quiet=False, **kwargs):
		'''
		Performs a Monte Carlo simulation over the fixed parameters to assess how their uncertainties propagate into the uncertainties of the fitted parameters.
		See ex_notebook_1.ipynb for API documentation.
		The random sampling of the fixed parameters is preliminary (see below). Future updates to this method should:
		--- Allow fixed parameters for which negative values are physically meaningful to be included
		--- Take into account possible correlation between fixed parameters
		See the full discussion of these issues in ex_notebook_1.ipynb.
		'''
		# Check fit t0 / fit_pre_photo
		if df_ALS_params.at['t0','fit'] and not self._fit_pre_photo:
			print('ERROR: Fit t0 is True but fit_pre_photo is False!')
			print('If we are fitting t0, then we must also fit the pre-photolysis data.')
			print('Otherwise, the cost function would be minimized by arbitrarily increasing t0.')
			return None, None, None, None

		# Identify the parameters to vary - the fit field must be False and the err field must be nonzero
		model_params_vary = df_model_params[(df_model_params['fit'] == False) & (df_model_params['err'] != 0)]
		ALS_params_vary = df_ALS_params[(df_ALS_params['fit'] == False) & (df_ALS_params['err'] != 0)]

		# Parameter sampling is currently done using the truncated normal distribution:
		# --- Lower bound of the distribution (lb) is 0.
		# --- Upper bound of the distribution (ub) is 2 times the parameter value.
		# --- Mean of the distribution (mu) is the parameter value ('val' field).
		# --- Standard deviation of the distribution (sigma) is the parameter uncertainty ('err' field).
		# See ex_notebook_1.ipynb for a discussion of the benefits / limitations of this approach.
		# To sample from scipy.stats.truncnorm, use truncnorm.rvs((lb-mu)/sigma, (ub-mu)/sigma, mu, sigma).
		# For our choice of lb and ub, this reduces to the formula used below.
		def sample(mu, sigma):
			return truncnorm.rvs(-mu/sigma, mu/sigma, mu, sigma)
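
		# Worked check (a sketch, assuming mu > 0): with lb = 0 and ub = 2*mu, the standardized
		# bounds are (lb-mu)/sigma = -mu/sigma and (ub-mu)/sigma = +mu/sigma, which are exactly
		# the first two arguments passed to truncnorm.rvs above. For example, sample(10.0, 2.0)
		# draws from a normal(mean=10, sd=2) distribution truncated to the interval [0, 20].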
		dist_p = []
		N_success = 0
		N_fail = 0
		while N_success < N:
			if not quiet:
				print('Successful Iterations: {:d}'.format(N_success))
				print('Failed Iterations: {:d}'.format(N_fail))
				print()
				print('Current Iteration: {:d}'.format(N_success+N_fail+1))
				clear_output(wait=True)

			# Create the Monte Carlo parameter sample
			p_vary_vals = {}
			df_model_params_mc = df_model_params.copy()
			for p_row in model_params_vary.itertuples():
				df_model_params_mc.at[p_row.Index,'val'] = p_vary_vals[p_row.Index] = sample(p_row.val, p_row.err)
			df_ALS_params_mc = df_ALS_params.copy()
			for p_row in ALS_params_vary.itertuples():
				df_ALS_params_mc.at[p_row.Index,'val'] = p_vary_vals[p_row.Index] = sample(p_row.val, p_row.err)

			# Perform the fit using the parameter sample
			df_p_mc, _, _, cost_mc, mesg_mc, ier_mc = self.fit(t, tbin, df_data, df_model_params_mc, df_ALS_params_mc, quiet=True, **kwargs)
			success = (ier_mc in (1,2,3,4))

			# Save the output as the function runs so it may be accessed during a simulation
			# Note: We use pd.DataFrame.to_csv because pd.Series.to_csv orients as a column instead of a row
			if save_fn:
				mesg_mc_adj = mesg_mc.replace('\n','').replace(',','')  # Remove newlines and commas
				row_save = pd.Series(p_vary_vals)  # Create a Series with the values of the randomly sampled parameters
				row_save = row_save.append(pd.Series((cost_mc,success,ier_mc,mesg_mc_adj), index=('cost','success','ier','mesg')))  # Append fit result info
				row_save = row_save.append(df_p_mc['val'])  # Append the fitted parameter values
				row_save = pd.DataFrame(row_save).T  # Convert to a DataFrame for saving
				if (N_success+N_fail) == 0:
					# First iteration - create a new file containing the first row and column headers
					row_save.to_csv(save_fn, index=False)
				else:
					# Subsequent iterations - append the row to the file without headers
					row_save.to_csv(save_fn, index=False, header=False, mode='a')

			# If the fit was successful, add the results to dist_p for calculating statistics after the loop is finished
			if success:
				dist_p.append(df_p_mc['val'])
				N_success += 1
			else:
				N_fail += 1

		df_dist_p = pd.DataFrame(dist_p, index=pd.RangeIndex(len(dist_p)))

		# Summary statistics
		# The estimated standard error is the standard deviation of the Monte Carlo distribution
		df_p = pd.DataFrame([df_dist_p.mean(),df_dist_p.std()], index=('val','err')).T
		df_cov_p = df_dist_p.cov()
		df_corr_p = df_dist_p.corr()

		if not quiet:
			# Display results
			print('Monte Carlo simulation completed.')
			print('Successful Iterations: {:d}'.format(N_success))
			print('Failed Iterations: {:d}'.format(N_fail))
			print()
			print('Returned variables and the summary below include only successful iterations.')
			if save_fn:
				print('Results saved to file include all iterations.')
			print()
			print('Average Fitted Parameter Values and Estimated Standard Errors:')
			display(df_p)
			print()
			print('Estimated Correlation Matrix:')
			display(df_corr_p)
			print()
			print('The plots and cost below use the average parameter values:')
			# Make plots
			df_model_params_p = df_model_params.copy()
			df_ALS_params_p = df_ALS_params.copy()
			for param in df_p.index:
				if param in df_model_params_p.index:
					df_model_params_p.at[param,'val'] = df_p.at[param,'val']
				else:
					df_ALS_params_p.at[param,'val'] = df_p.at[param,'val']
			self.plot_data_model(t, tbin, df_data, df_model_params_p, df_ALS_params_p, delta_xtick=delta_xtick, conc_units=conc_units, save_fn=None, print_cost=True)

		return df_p, df_cov_p, df_corr_p, df_dist_p
	def _time_axis(self, t_start, t_end, tbin):
		'''
		Private method for computing time axes.
		Parameters:
		t_start = float: start time of the axis (inclusive), must be an integer multiple of tbin*dt
		t_end = float: end time of the axis (inclusive), must be an integer multiple of tbin*dt
		tbin = int: the axis will be evenly spaced by tbin*dt
		Returns:
		t = ndarray: the time axis
		'''
		return np.linspace(t_start, t_end, num=round(((t_end-t_start)/(tbin*self._dt))+1), endpoint=True)
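
	# Example (a sketch): with the class default _dt = 0.02 ms and tbin = 5, the axis spacing is
	# tbin*_dt = 0.1 ms, so _time_axis(0.0, 1.0, 5) is equivalent to np.linspace(0.0, 1.0, num=11).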
	def _model(self, t_start, t_end, tbin, model_params, ALS_params):
		'''
		Private method for integrating the user model.
		Implements the photolysis gradient, instrument response function, and photolysis offset.
		A copy is created for anything passed to the user model to prevent problems with any mutable objects.
		Procedure:
		1) A time axis is created for running the model, with photolysis occurring at 0 ms.
		   It is forced to have at least 20 ms of pre-photolysis baseline to ensure correct IRF convolution of points immediately after photolysis.
		2) The user model is integrated in steps (set by t_PG) with different initial radical concentrations (set by X0 and B).
		   For example, if t_PG = 1 ms, then:
		   - The 0-1 ms output is determined by integrating over 0-1 ms with initial radical concentration X0*(1+B*(0.5 ms)).
		   - The 1-2 ms output is determined by integrating over 0-2 ms with initial radical concentration X0*(1+B*(1.5 ms)), and taking the 1-2 ms portion.
		   - The 2-3 ms output is determined by integrating over 0-3 ms with initial radical concentration X0*(1+B*(2.5 ms)), and taking the 2-3 ms portion.
		   - Etc.
		3) The concentration profiles are convolved with the IRF (see _conv_IRF).
		4) The time axis is adjusted for the photolysis offset t0.
		5) The profiles are trimmed and averaged within time bins to achieve a step size of dt*tbin over [t_start, t_end].
		Parameters:
		t_start = float
			start time of the model output (ms), must be an integer multiple of tbin*dt, must be greater than or equal to -20 ms
		t_end = float
			end time of the model output (ms), must be an integer multiple of tbin*dt, must be greater than t_start
		tbin = int
			step size of the model output will be tbin*dt
		model_params = dict
			keys (str) are model param names and values (float) are the parameter values
			X0 is required
		ALS_params = dict
			keys (str) are ALS param names and values (float) are the parameter values
			t0 is required, A is required if apply_IRF = True, B is required if apply_PG = True
		Returns:
		t_model = ndarray
			Time axis of the model output (ms). Evenly spaced by tbin*dt over [t_start, t_end].
		c_model = DataFrame
			Columns correspond to the species returned by the user model.
			Rows correspond to the modeled concentrations at the times in t_model.
		'''
		# Create the time axis for running the model (before applying tbin and t0)
		t0 = float(ALS_params['t0'])
		if t0 == 0:
			t = self._time_axis(-20, t_end+((tbin-1)*self._dt), 1)
		elif t0 < 0:
			t = self._time_axis(-20, t_end+((tbin-1)*self._dt)-t0, 1)
		else:
			t = self._time_axis(-20-t0, t_end+((tbin-1)*self._dt), 1)

		# Model integration w/ photolysis gradient
		if self._apply_PG:
			X0 = model_params['X0']
			B = ALS_params['B']
			idx_zero = np.abs(t).argmin()
			# First populate the concentration matrix for t < 0
			m, c = self._user_model(t[:idx_zero].copy(), model_params.copy())
			# Then populate the concentration matrix for t >= 0
			idx_curr = idx_zero
			idx_step = round(self._t_PG/self._dt)
			while idx_curr < t.size:
				t_mid = t[idx_curr] + self._t_PG/2
				X_curr = X0*(1+B*t_mid)
				model_params_curr = model_params.copy()
				model_params_curr['X0'] = X_curr
				# If idx_curr+idx_step >= t.size, then t[:idx_curr+idx_step] is equivalent to t[:]
				c_tmp = self._user_model(t[:idx_curr+idx_step].copy(), model_params_curr)[1]
				for species in c:
					c[species] = np.append(c[species], c_tmp[species][idx_curr:])
				idx_curr += idx_step
		# Model integration w/o photolysis gradient
		else:
			m, c = self._user_model(t.copy(), model_params.copy())

		# Convolve with the IRF
		if self._apply_IRF:
			c = self._conv_IRF(m, c, t.size, ALS_params['A'])

		# Apply the photolysis offset (the sign of this is correct)
		t = t + t0

		# Trim the data to the desired range
		idx_start = (np.abs(t - t_start)).argmin()
		idx_end = (np.abs(t - (t_end+((tbin-1)*self._dt)))).argmin()
		t = t[idx_start:idx_end+1]
		for species in c:
			c[species] = c[species][idx_start:idx_end+1]

		# Bin the model output
		t_out = self._time_axis(t_start, t_end, tbin)
		c_out = {}
		for species in c:
			c_out[species] = np.zeros(t_out.size)
		# This is the same binning method used in the IGOR code (although they sum instead of average)
		for i in range(t_out.size):
			for species in c_out:
				c_out[species][i] = np.mean(c[species][i*tbin:(i+1)*tbin])

		# Columns of the dataframe are species, rows correspond to times in t_out
		return t_out, pd.DataFrame.from_dict(c_out)
	def _conv_IRF(self, m, y, N, A):
		'''
		Private method for convolving signals with the mass-dependent ALS instrument response function (IRF).
		At the ALS, molecules exit the pinhole with a Maxwell-Boltzmann distribution of velocities.
		The ion signal at a specific observation time therefore arises from molecules that exit the
		pinhole at a distribution of kinetic times that all come before the observation time. See
		more details here: https://doi.org/10.1002/kin.20262
		Each convolved point is thus calculated as a weighted average of all points in the model at
		earlier times, where the weighting is given by the mass-dependent IRF function.
		Note that this operation is not quite a "convolution" as one might usually think about it,
		since a convolved point only contains contributions from the modeled points at earlier times,
		and has zero contribution from modeled points at future times.
		Parameters:
		m = dictionary: keys are species (str), values are their masses (float - amu)
		y = dictionary: keys are species (str), values are arrays of their signals (ndarray)
		N = int: length of the signal arrays
		A = float: IRF parameter (ms^2/amu)
		Returns:
		y_conv = dictionary: keys are species (str), values are arrays of their signals convolved with the IRF (ndarray)
		'''
		# Define the convolution functions (h)
		t = np.arange(N)*self._dt  # ms
		h = {}
		for species in m:
			h[species] = np.zeros(N)
			# Leave h(0) = 0: the limit of h as t --> 0 is 0 (this assumes the exponent A*m is
			# negative, i.e. A < 0 for positive masses), although directly evaluating h(0) is undefined.
			h[species][1:] = np.exp((A * m[species]) / (t[1:])**2) / (t[1:])**4

		# Compute the convolved signals (y_conv)
		y_conv = {}
		for species in y:
			y_conv[species] = np.zeros(t.size)
		for i in range(t.size):
			for species in y_conv:
				norm_factor = (h[species][:i+1]).sum()
				# Initially, norm_factor = 0 since the t and y arrays are finite and there is a lag between molecules exiting the pinhole and being detected.
				# Therefore, to avoid divide-by-0 errors when norm_factor = 0, it makes the most physical sense to set y_conv[species][i] = y[species][0].
				# Once norm_factor != 0, we can start calculating y_conv[species][i] as a weighted average.
				if norm_factor == 0.:
					y_conv[species][i] = y[species][0]
				else:
					y_conv[species][i] = (y[species][:i+1] * np.flip(h[species][:i+1],0)).sum() / norm_factor
		return y_conv
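
# --- Usage sketch (hedged; the authoritative examples live in ex_notebook_1.ipynb) ---
# This demo wires the illustrative _example_user_model defined above into a KineticModel and plots
# the modeled concentration of the assumed species 'X'. All parameter values here ('k', 'A', 'B',
# 't0', 'X0') are illustrative assumptions, not recommendations.
if __name__ == '__main__':
	df_model_params = pd.DataFrame(
		{'val': [1.0e11, 0.25], 'err': [0.0, 0.0], 'fit': [False, False]},
		index=['X0', 'k'])
	df_ALS_params = pd.DataFrame(
		{'val': [0.0, -0.05, 0.0], 'err': [0.0, 0.0, 0.0], 'fit': [False, False, False]},
		index=['t0', 'A', 'B'])
	model = KineticModel(_example_user_model, apply_IRF=True, apply_PG=True, t_PG=1.0)
	# Plot the modeled concentration from 0 to 20 ms with tbin = 5 (0.1 ms output steps)
	model.plot_model(0.0, 20.0, 5, df_model_params, df_ALS_params)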