# function.py
#### modified for paper
import os
import pandas as pd
import numpy as np
from scipy.stats import linregress
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import math
import sys
import pathlib
from scipy.optimize import curve_fit
from sklearn.preprocessing import MinMaxScaler
def find_exponents(df,
fractionToAnalyze=1,
outputPath='outputs',
outputTable=True,
outputPlots=True,
outputTablename='Testtable',
logToScreen=True,
columnFillThreshold=0.5,
exp_b_threshold=0.01,
exp_r_s_threshold=0.95,
logistic_r_s_threshold=0.95,
k_threshold=0.2,
maxrows=5000,
debug=False):
'''Find and plot exponential and logistic growth in a pandas dataframe.
:param dataframe df: The dataframe containing date-like and metric columns
:param dbl fractionToAnalyze: Analyze only the last fraction (0.0-1.0) of the data, indexed by the detected date columns
:param str outputPath: The subfolder to write the output table and plots to (created if it does not exist)
:param bln outputTable: Write the result table to outputPath/table (True|False)
:param bln outputPlots: Write plots to outputPath/plots (True|False)
:param str outputTablename: The table name used in the result table and in the output file names
:param bln logToScreen: Print progress messages (True|False)
:param dbl columnFillThreshold: Filter out columns with more than this fraction (0.0-1.0) of the data missing
:param dbl exp_b_threshold: Only report and plot exponential fits with an exponent |b| above this value
:param dbl exp_r_s_threshold: Only report and plot exponential fits with an r squared above this value
:param dbl logistic_r_s_threshold: Only report and plot logistic fits with an r squared above this value
:param dbl k_threshold: Only report and plot logistic fits with a steepness |k| above this value
:param int maxrows: Maximum number of rows to analyze. If df has more rows, a random sample of maxrows rows is used
:param bln debug: Raise errors instead of continuing if True
:return: A pandas dataframe listing all date and metric column combinations with their exponential and logistic fit parameters and r squared values
:rtype: dataframe
An illustrative usage sketch is appended at the end of this file.
'''
######################## Functions: #########################
def is_nan(x):
return (x is np.nan or x != x)
def printL(text):
if logToScreen:
print(text)
def normalize(lst):
minval=min(lst)
maxval=max(lst)
return [(float(i)-minval)/(maxval-minval) for i in lst]
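# powerfit fits y = a*exp(b*x) in closed form: it linearizes to ln(y) = ln(a) + b*x
# and solves the y-weighted least-squares normal equations, so large values are not
# over-penalized by the log transform. The S_* variables below are the sums
# (sum x^2*y, sum y*ln y, sum x*y, sum x*y*ln y, sum y) appearing in that solution;
# a is recovered as exp(intercept) and b is the exponent per unit of x.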
def powerfit(xs, ys):
S_x2_y = 0.0
S_y_lny = 0.0
S_x_y = 0.0
S_x_y_lny = 0.0
S_y = 0.0
for (x,y) in zip(xs, ys):
S_x2_y += x * x * y
S_y_lny += y * np.log(y)
S_x_y += x * y
S_x_y_lny += x * y * np.log(y)
S_y += y
#end
a = (S_x2_y * S_y_lny - S_x_y * S_x_y_lny) / (S_y * S_x2_y - S_x_y * S_x_y)
b = (S_y * S_x_y_lny - S_x_y * S_y_lny) / (S_y * S_x2_y - S_x_y * S_x_y)
return np.exp(a), b
def logisticfit(x,y):
popt, pcov = curve_fit(sigmoid,xdata= x,ydata= y)
return popt
def sigmoid(x, x0, k):
y = 1 / (1 + np.exp(-k*(x-x0)))
return y
def normalize_values(values):
values=np.asarray(values,dtype=np.float64).reshape(-1,1)
scaler = MinMaxScaler()
scaler.fit(values)
scaled_values=scaler.transform(values)
return_values=scaled_values.squeeze()
return return_values, scaler
def denormalize_values(values, scaler):
return scaler.inverse_transform(values.reshape(-1,1)).squeeze()
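# The logistic fit below (logisticfit/sigmoid/normalize_values/denormalize_values)
# works in normalized space: x and y are min-max scaled to [0, 1] before curve_fit,
# so the sigmoid's implicit amplitude is 1. Predictions and the midpoint x0 are
# mapped back to the original scale with the stored MinMaxScaler; the steepness k
# is reported in normalized x units.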
def cleanfilename(filename,cleanlist):
text=filename
for cleanword in cleanlist:
text = text.replace(cleanword, '-')
#collapse any runs of dashes created by the replacements above:
while '--' in text:
text=text.replace('--','-')
if text!=filename:
printL("Warning: Provided filename '"+filename+"' is invalid. Set to '"+text+"'")
return text
def checkAndGetBool(val):
if isinstance(val, str):
return val.lower() in ['true', '1', 't', 'y', 'yes']
if isinstance(val, (int, float)):
return val>=1
return bool(val)
############################ Input handling: #############################
data=df
##### Input handling: logToScreen ###
logToScreen=checkAndGetBool(logToScreen)
##### Input handling: df ###################
if df is None or df.shape[0]==0:
printL("Warning: Dataframe is empty. Returning None")
return None
#### Input handling: maxrows ##############
if maxrows<=0:
maxrows=min(df.shape[0],5000)
printL("Warning: maxrows cannot be <1. Set to " + str(maxrows))
if df.shape[0]>maxrows:
data=data.sample(maxrows)
##### Input handling: fractionToAnalyze ###
##### ensure that the fraction is between 0 and 1:
if fractionToAnalyze>1:
printL("Warning: Fraction to analyze was specified > 1 ! Set to 1" )
fractionToAnalyze=1
if fractionToAnalyze<=0:
printL("Warning: Fraction to analyze was specified <= 0 ! Set to 0.1" )
fractionToAnalyze=0.1
##### Input handling: outputPath ###
outputPath=outputPath.replace("\\","/") #normalize backslashes first ...
outputPath=outputPath.replace("//","/") #... then collapse duplicate separators
if outputPath[:1]==".":
if outputPath[0:2]!="./":
outputPath="./"+outputPath[1:]
elif outputPath[0:1]=="/":
outputPath="."+outputPath
else:
outputPath="./"+outputPath
##### Input handling: outputTable ###
outputTable=checkAndGetBool(outputTable)
##### Input handling: outputPlots ###
outputPlots=checkAndGetBool(outputPlots)
####ensure that output folders exist (mkdir with exist_ok=True also creates missing parent folders):
if outputPlots:
pathlib.Path("./"+outputPath+"/plots").mkdir(parents=True, exist_ok=True)
if outputTable:
pathlib.Path("./"+outputPath+"/table").mkdir(parents=True, exist_ok=True)
##### Input handling: outputTablename ###
if outputTablename is None or str(outputTablename)=="":
printL("Warning: No valid outputTablename provided. As it is used for file naming, 'testtable' is used.")
outputTablename="testtable"
##### Input handling: columnFillThreshold ###
if columnFillThreshold>1:
printL("Warning: columnFillThreshold was specified > 1 ! Set to 1" )
columnFillThreshold=1
if columnFillThreshold<=0:
printL("Warning: columnFillThreshold was specified <= 0 ! Set to 0.1" )
columnFillThreshold=0.1
############################ Code: #############################
#### specify the columns of the returntable:
resulttablecols=['tablename','datecol','valuecol','analyzed_fraction','delta_y','exp_B','exp_A','exp_r_squared','logistic_k','logistic_r_s']
printL ('######## Starting ' + outputTablename + ' ###### fraction '+str(fractionToAnalyze)+'########')
printL ('##########################################')
#### specify which column types are regarded as numeric (and get analyzed):
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#### specify which chars to remove in the filenames:
cleanlist=['(',')',' ',':','[',']','#','/']
#### initialize an empty data frame with the columns for returning the results:
df_result = pd.DataFrame(columns=resulttablecols)
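#### Overview of the main loop: every column that can be parsed as a date is used
#### as a time axis; the data is restricted to the last fractionToAnalyze of that
#### time range, and for every remaining numeric column an exponential curve
#### (powerfit) and a logistic curve (curve_fit on sigmoid) are fitted. Each
#### date/metric combination is appended to df_result and optionally written to
#### CSV and plotted.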
#### Loop through all datetimecols:
for datetimecol in data.columns:
df_temp=data.copy()
### identify date columns and append as converted column (datetime) and converted2 (numeric - for correlation)
if any(df_temp[datetimecol].astype(str).str.len()<4):
printL("####### The datetime column "+ datetimecol+": has less than 4 chars --> not used as datetime column.")
continue
try:
df_temp['date_converted']=pd.to_datetime(df_temp[datetimecol])
#df_temp['date_days_since_first_date']=mdates.date2num(df_temp['date_converted'])
dates_converted=mdates.date2num(df_temp['date_converted'])
df_temp['date_days_since_first_date']=dates_converted-min(dates_converted)
except:
if debug:
raise
else:
printL("####### Datetime column "+ datetimecol+": conversion error!")
continue
if any(df_temp['date_converted'] < '1700-01-01'): #1700 because of pandas nanoseconds limitation
printL("####### The datetime column "+ datetimecol+" had values before 1700-01-01 --> not used as date")
continue
if datetimecol!='date_converted':
printL("####### The datetime column: "+ datetimecol + " is used as datetimecol!")
### filter out columns where the fraction of missing values is columnFillThreshold or more:
df_temp= df_temp.loc[:, df_temp.isnull().mean() < columnFillThreshold]
### Only use the last fraction of the data, indexed / determined by the time column at hand:
maxval=max(df_temp['date_days_since_first_date']) #use converted time columns to allow for easy multiplication
minval=min(df_temp['date_days_since_first_date'])
timepoint_to_cut=(maxval-minval)*(1-fractionToAnalyze)+minval
df_temp=df_temp[(df_temp['date_days_since_first_date'] > timepoint_to_cut)]
###loop through all numeric cols:
for numericcol in data.select_dtypes(include=numerics).columns:
if numericcol==datetimecol or numericcol not in df_temp.columns:
continue ###skip the date col itself and columns dropped by the fill threshold filter
#### Start computing the values:
printL('##### Computing numeric col: '+ numericcol)
printL('##### Date col: '+datetimecol)
fitted_exponential=False
fitted_logistic=False
fitted_powerlaw=False
#initialize fit outputs so a failed fit still yields NaN entries in the result table:
exp_a=np.nan
exp_b=np.nan
exp_r_s=np.nan
delta_y=np.nan
try: #trying exponential fit
#### zero values would break the log calculations of the fit.
#### Therefore, we move all data points slightly above 0.
#### 'slightly' is defined as min_max_diff/1000000
if df_temp[numericcol].shape[0]==0:continue #skip this numeric column if it has no values
min_max_diff=max(df_temp[numericcol])-min(df_temp[numericcol])
min0=False
if min(df_temp[numericcol])==0:
min0=True
shiftamount=min_max_diff/1000000
cols_to_keep=['date_converted','date_days_since_first_date',datetimecol,numericcol]
df_temp_2=df_temp[cols_to_keep].copy()# this df is used for cleaning and is split up afterwards
#remove all rows with nan or inf:
df_temp_2=df_temp_2.replace([np.inf, -np.inf], np.nan)
df_temp_2=df_temp_2.dropna(subset=[numericcol])
df_temp_2=df_temp_2.sort_values(['date_converted'],ascending=True)
#get minimum val, and move all points if negative:
minval=min(df_temp_2[numericcol])
df_temp_3=df_temp_2
if minval<=0:
df_temp_3[numericcol]=df_temp_3[numericcol]+shiftamount
df_temp_3[numericcol]=df_temp_3[numericcol]-minval
printL('! values shifted by '+str(shiftamount-minval) +' to correct for non-positive values')
x=df_temp_3['date_days_since_first_date']
y=df_temp_3[numericcol]
delta_y=max(y)-min(y)
datetime=df_temp_3['date_converted']#datetime
if len(y)<100: #minimum 100 cases
continue
#### fit an exponential curve y = a*exp(b*x) and take b as the exponential factor:
exp_a,exp_b=powerfit(x,y)
y_exp=exp_a*np.exp(exp_b*x)
#compute r²:
exp_residuals = y-y_exp
ss_res = np.sum(exp_residuals**2)
ss_tot = np.sum((y-np.mean(y))**2)
exp_r_s = 1 - (ss_res / ss_tot)
if (abs(exp_r_s)<exp_r_s_threshold or abs(exp_r_s)>1 or abs(exp_b)<exp_b_threshold or is_nan(exp_r_s) or is_nan(exp_b)):
fitted_exponential=False
exp_r_s=np.nan
exp_b=np.nan
exp_a=np.nan
else:
fitted_exponential=True
except:
if debug:
raise
else:
printL("Unexpected error exp fit:" + str(sys.exc_info()[0]))
try: #trying logistic fit
k=np.nan
x0=np.nan
logistic_r_s=np.nan
#### fit a logistic curve:
x_norm,scaler_x=normalize_values(x)
y_norm,scaler_y=normalize_values(df_temp_3[numericcol])
popt=logisticfit(x_norm,y_norm)
##get r²:
y_logistic= sigmoid(x_norm, *popt)
y_logistic2=denormalize_values(y_logistic,scaler_y)
residuals = y-y_logistic2
ss_res = np.sum(residuals**2)
ss_tot = np.sum((y-np.mean(y))**2)
logistic_r_s = 1 - (ss_res / ss_tot)
x0=mdates.num2date(denormalize_values(popt[0],scaler_x))
k=popt[1]
if (abs(logistic_r_s)<logistic_r_s_threshold or abs(logistic_r_s)>1 or abs(k)<k_threshold or is_nan(logistic_r_s) or is_nan(k)):
fitted_exponential=False
logistic_r_s=np.nan
k=np.nan
x0=np.nan
else:
fitted_exponential=True
except:
if debug:
raise
else:
printL("Unexpected error exp fit:" + str(sys.exc_info()[0]))
#append to output table:
df_result=pd.concat([df_result,pd.DataFrame([[outputTablename,datetimecol,numericcol,fractionToAnalyze,delta_y,exp_b,exp_a,exp_r_s,k,logistic_r_s]],columns=resulttablecols)],ignore_index=True) #DataFrame.append was removed in pandas 2.x
if outputTable:
df_result.to_csv(str(pathlib.Path(outputPath+'/table/'+cleanfilename(outputTablename+"-"
+str(int(round(fractionToAnalyze*100,0)))+"perc.csv",cleanlist))))
################################### Plotting #########################################
try: #plotting
if outputPlots and (fitted_exponential or fitted_logistic or fitted_powerlaw):
########### Plotparameters:
figsize_plot=(6,12) #inches
fontsize_plot=10
linestyle='-'#'--'
linewidth_plot=2
############################ Plot 1: Logarithmized - Dev ############################
fig, (ax1,ax2) = plt.subplots(2,1,figsize=figsize_plot)
fig.subplots_adjust(bottom=0.3,left=0.2,hspace = 0.8)
ax1.plot(datetime, y,'.', alpha=.3,color='#000000', markersize=5) #plotting the values
label_start_y=-0.5
if fitted_exponential:
expa=round(exp_a,5)
expb=round(exp_b,5)
ax1.plot(datetime,y_exp,linestyle, linewidth=linewidth_plot,color='#00A287') #plotting the line
#plotting additional information:
ax1.annotate(f'${expa} \\cdot e^{{{expb}x}}$',xy=(0.1,label_start_y-0.16),xycoords='axes fraction',
fontsize=fontsize_plot,color='#00A287' )
ax1.annotate("r²="+str(round(exp_r_s,3)), xy=(0.1,label_start_y-0.08),xycoords='axes fraction',
fontsize=fontsize_plot,color='#00A287')
ax1.annotate("Exponential Fit:", xy=(0.1,label_start_y),xycoords='axes fraction',
fontsize=fontsize_plot+2, color='#00A287')
if fitted_logistic:
# logistic regression plot:
ax1.plot(datetime, y_logistic2, color='#C87000', linewidth=2)
#plotting additional information:
ax1.annotate("k="+str(round(k,3)), xy=(0.6,label_start_y-0.24),xycoords='axes fraction',
fontsize=fontsize_plot, color='#C87000')
ax1.annotate("x₀="+str(x0), xy=(0.6,label_start_y-0.16),xycoords='axes fraction',
fontsize=fontsize_plot, color='#C87000')
ax1.annotate("r²="+str(round(r_squared_sigmoid,3)), xy=(0.6,label_start_y-0.08),xycoords='axes fraction',
fontsize=fontsize_plot, color='#C87000')
ax1.annotate("Logistic Fit:", xy=(0.6,label_start_y),xycoords='axes fraction',
fontsize=fontsize_plot+2, color='#C87000')
ax1.set_title("Last "+str(100*fractionToAnalyze)+"%", fontsize=19)
ax1.set_yscale('log')
if min0: #only set min y axis if 0 values are existing:
x1,x2,y1,y2 = ax1.axis()
ax1.axis((x1,x2,shiftamount/10,y2))
#plt.annotate("0 (no log)", xy=(-0.005,shiftamount),xycoords=('axes fraction','data'),fontsize=fontsize_plot,color='red',ha='right')
ax1.tick_params('x', labelrotation=90)
############################ Plot 2: Regular Plot - Dev ############################
#fig, ax = plt.subplots(figsize=figsize_plot)
#fig.subplots_adjust(bottom=0.3,left=0.2)
ax2.plot(datetime, y,'.', alpha=.3,color='#000000', markersize=5) #plotting the values
if fitted_exponential:
ax2.plot(datetime,y_exp,linestyle, linewidth=linewidth_plot,color='#00A287') #plotting the line
if fitted_logistic:
# logistic regression plot:
y_logistic= sigmoid(x_norm, *popt)
y_logistic2=denormalize_values(y_logistic,scaler_y)
ax2.plot(datetime, y_logistic2, color='#C87000', linewidth=2)
if min0: #only set min y axis if 0 values are existing:
x1,x2,y1,y2 = ax2.axis()
ax2.axis((x1,x2,shiftamount/10,y2))
#plt.annotate("0", xy=(-0.005,shiftamount),xycoords=('axes fraction','data'),fontsize=fontsize_plot,color='red',ha='right')
ax2.tick_params('x', labelrotation=90)
plt.savefig(str(pathlib.Path(outputPath+'/plots/'+cleanfilename(outputTablename+'-'+datetimecol+
'-'+numericcol,cleanlist)+'-'+str(int(round(fractionToAnalyze*100,0)))+'perc-nonlog.png')))
plt.close('all')
except:
if debug:
raise
else:
printL("Unexpected error:" + str(sys.exc_info()[0]))
#if df_result.shape[0]>0: ##only append if there are results:
#df_result=df_result.sort_values([''],ascending=False)
return df_result
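

# Illustrative usage sketch: the synthetic dataframe, its column names ("date",
# "users"), and the parameter values below are assumptions chosen only to show
# how find_exponents is typically called on a date column plus a numeric metric.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    n = 365
    example_df = pd.DataFrame({
        # one date-like column and one exponentially growing metric with mild noise
        "date": pd.date_range("2015-01-01", periods=n, freq="D"),
        "users": 100 * np.exp(0.02 * np.arange(n)) * (1 + 0.05 * rng.standard_normal(n)),
    })
    results = find_exponents(example_df,
                             fractionToAnalyze=1,
                             outputPath="outputs",
                             outputTablename="example",
                             outputPlots=False,  # skip plots; only write the CSV table
                             logToScreen=True)
    print(results)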