-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstats_funcs_library.py
359 lines (285 loc) · 10.7 KB
/
stats_funcs_library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
import numpy as np
def get_mean_and_se(df, col):
    '''
    Compute the mean and standard error of a column in a pandas DataFrame.

    Missing values are excluded from every quantity: the mean, the sample
    standard deviation and the observation count used as the divisor.

    Parameters
    ----------
    df : pandas DataFrame
        The DataFrame containing the column to summarise.
    col : str
        The name of the column to compute the mean and standard error for.

    Returns
    -------
    mean, se : tuple
        The mean of the column and the standard error of that mean
        (sample standard deviation divided by the square root of the
        number of non-null observations).
    '''
    values = df[col]
    # Series.mean() and Series.std() skip NaN, so the divisor has to be the
    # non-null count as well; len(df) would understate the standard error
    # whenever the column contains missing values.
    n_obs = values.count()
    mean = values.mean()
    se = values.std() / np.sqrt(n_obs)
    return mean, se
def critical_value(sig, dof, test_kind='two-tail'):
    '''
    Return the Student's t critical value for a hypothesis test.

    Parameters:
    -----------
    sig: float
        Significance level between 0 and 1 e.g. 0.05
    dof: int
        Degrees of freedom i.e. number of data points - 1
    test_kind: string
        Type of test: 'left', 'right' or 'two-tail'.
        Any value other than 'left' or 'right' is treated as two-tail.
        Default is two-tail.

    Returns:
    --------
    critical_value: float
        The t quantile at `sig` (left), `1 - sig` (right) or
        `1 - sig/2` (two-tail).
    '''
    from scipy.stats import t

    # Pick the cumulative probability first, then evaluate the quantile once.
    if test_kind == 'left':
        quantile = sig
    elif test_kind == 'right':
        quantile = 1-sig
    else:
        quantile = 1-sig/2
    return t.ppf(q=quantile, df=dof)
def confidence_interval(mean, se, critical_value):
    '''
    Build a symmetric confidence interval around a sample mean.

    Parameters
    ----------
    mean : float
        The sample mean the interval is centred on.
    se : float
        The standard error of the mean.
    critical_value : float
        The critical value for the desired confidence level.

    Returns
    -------
    tuple
        (lower, upper) bounds, i.e. mean -/+ critical_value * se.
    '''
    # Half-width of the interval (margin of error).
    margin = critical_value * se
    return mean - margin, mean + margin
def t_test_statistic_1_samp (sample, sample_col, hypothesis_mean):
    '''
    Calculates the t-test statistic for one sample.

    T = (x̄ - μ₀) / (s / √n)

    Parameters:
    -----------
    sample: df
        This is your sample dataframe.
    sample_col: string
        This is the column to be aggregated.
    hypothesis_mean: float
        μ₀: the hypothesized population mean, which is the value you want
        to test against.

    Returns:
    --------
    T: float
        The calculated t-value: the difference between the sample mean (x̄)
        and the hypothesized population mean (μ₀), expressed in units of the
        standard error of the sample mean. It is meant to be compared against
        a t-distribution to judge how likely such a difference would be by
        chance alone; when that probability falls below the chosen
        significance level (commonly 0.05) the null hypothesis that the
        population mean equals μ₀ is rejected.
    '''
    # The standard error returned by the helper already accounts for the
    # sample size, so no separate count is needed here.
    mean, se = get_mean_and_se(sample, sample_col)
    t_1_samp = (mean - hypothesis_mean) / se
    return t_1_samp
def t_test_statistic_2_samp (control_sample, treatment_sample, control_col, treatment_col, hypothesis_mean=0):
    '''
    Calculates the t-test statistic for two independent samples.

    T = ((x̄_treatment - x̄_control) - Δ₀) / √(se_control² + se_treatment²)

    Parameters:
    -----------
    control_sample : pandas DataFrame
        The control DataFrame used for computing the mean and standard error.
    treatment_sample : pandas DataFrame
        The treatment DataFrame used for computing the mean and standard error.
    control_col : str
        The column used for analysis within the control DataFrame.
    treatment_col : str
        The column used for analysis within the treatment DataFrame.
    hypothesis_mean : float, optional
        Δ₀: the difference in means assumed under the null hypothesis.
        The default is 0.

    Returns:
    --------
    float
        The t-test statistic for two independent samples.
    '''
    mean_c, se_c = get_mean_and_se(control_sample, control_col)
    mean_t, se_t = get_mean_and_se(treatment_sample, treatment_col)
    # Standard error of the difference: the two standard errors combined
    # in quadrature.
    se_diff = np.sqrt((se_c**2 + se_t**2))
    observed_diff = mean_t - mean_c
    return (observed_diff - hypothesis_mean) / se_diff
def confidence_interval_diff_mean(control_sample, treamtent_sample, control_col, treatment_col, sig):
    '''
    Compute a two-tailed confidence interval for the difference between the
    means of two samples (treatment mean minus control mean).

    Parameters
    ----------
    control_sample : pandas DataFrame
        The control sample data containing the column `control_col`.
    treamtent_sample : pandas DataFrame
        The treatment sample data containing the column `treatment_col`.
        (The parameter name keeps its historical spelling so existing
        keyword callers continue to work.)
    control_col : str
        The name of the column in `control_sample` to compute the mean and
        standard error for.
    treatment_col : str
        The name of the column in `treamtent_sample` to compute the mean and
        standard error for.
    sig : float
        The desired significance level (e.g., 0.05).

    Returns
    -------
    tuple
        A tuple containing the lower and upper bounds of the confidence
        interval for the difference between the means of the two samples.
    '''
    control_mean, control_se = get_mean_and_se(control_sample, control_col)
    treatment_mean, treatment_se = get_mean_and_se(treamtent_sample, treatment_col)

    # NOTE(review): the standard error below is the unpooled combination,
    # while the degrees of freedom follow the n1 + n2 - 2 convention; a
    # Welch-Satterthwaite dof would be the usual pairing - confirm intent.
    dof = (len(control_sample) - 1) + (len(treamtent_sample) - 1)

    sample_stat = treatment_mean - control_mean
    se = np.sqrt(control_se**2 + treatment_se**2)
    crit_value = critical_value(sig, dof, test_kind='two-tail')
    # Reuse the shared helper so every interval in the module is built the
    # same way.
    return confidence_interval(sample_stat, se, crit_value)
def conversion_rate(df, col):
    '''
    Calculate the conversion rate of a column in a pandas DataFrame.

    Parameters:
        df (pandas.DataFrame): The DataFrame containing the data.
        col (str): The name of the column to calculate the conversion rate for.

    Returns:
        float: The conversion rate as a fraction, i.e. the number of rows
        whose value in the specified column is not equal to zero divided by
        the total number of rows.

    Example:
        >>> data = pd.DataFrame({'col1': [0, 1, 0, 1], 'col2': [1, 0, 1, 0]})
        >>> conversion_rate(data, 'col1')
        0.5
    '''
    # Boolean mask of rows counted as converted (value != 0).
    converted_mask = df[col] != 0
    n_converted = int(converted_mask.sum())
    n_total = len(df)
    return n_converted / n_total
def proportion_stats(df, col):
    '''
    Calculate the sample proportion and its standard error for a column in a
    pandas DataFrame.

    Parameters:
        df (pandas.DataFrame): The DataFrame containing the data.
        col (str): The name of the column to calculate the sample proportion
            and standard error for.

    Returns:
        tuple: (proportion, se) where proportion is the share of rows whose
        value in `col` is not equal to zero and se is
        sqrt(proportion * (1 - proportion) / n), with n the number of rows.

    Example:
        >>> data = pd.DataFrame({'col1': [0, 1, 0, 1], 'col2': [1, 0, 1, 0]})
        >>> p, se = proportion_stats(data, 'col1')
        >>> p
        0.5
        >>> float(se)
        0.25
    '''
    n_rows = len(df)
    # Rows with a non-zero value count as successes.
    n_success = int((df[col] != 0).sum())
    proportion = n_success / n_rows
    se = np.sqrt(proportion*(1-proportion)/n_rows)
    return proportion, se
def critical_value_proportion(sig, test_kind='two-tail'):
    '''Return the standard normal critical value for a hypothesis test.

    Parameters:
    -----------
    sig: float
        Significance level between 0 and 1 e.g. 0.05
    test_kind: string
        Type of test: 'left', 'right' or 'two-tail'.
        Any value other than 'left' or 'right' is treated as two-tail.
        Default is two-tail.

    Returns:
    --------
    critical_value: float
        The normal quantile at `sig` (left), `1 - sig` (right) or
        `1 - sig/2` (two-tail).

    Example:
    >>> float(critical_value_proportion(0.05, 'two-tail'))
    1.959963984540054
    '''
    from scipy.stats import norm

    # Pick the cumulative probability first, then evaluate the quantile once.
    if test_kind == 'left':
        quantile = sig
    elif test_kind == 'right':
        quantile = 1-sig
    else:
        quantile = 1-sig/2
    return norm.ppf(q=quantile)
def pooled_prop_se(p1, p2, df1, df2):
    '''Compute the pooled standard error for two sample proportions.

    Parameters
    ----------
    p1 : float
        Proportion of successes in sample 1.
    p2 : float
        Proportion of successes in sample 2.
    df1 : sized collection
        Sample 1; only its length is used.
    df2 : sized collection
        Sample 2; only its length is used.

    Returns
    -------
    pooled_se : float
        sqrt(p * (1 - p) * (1 / n1 + 1 / n2)), where p is the size-weighted
        average of p1 and p2.
    '''
    size_1, size_2 = len(df1), len(df2)
    # Pooled proportion: the two sample proportions weighted by sample size.
    pooled_p = (p1 * size_1 + p2 * size_2) / (size_1 + size_2)
    return np.sqrt(pooled_p*(1-pooled_p) * (1/size_1 + 1/size_2))
def pooled_confidence(sample_stat, critical_value_proportion, pooled_prop_se):
    '''Build a symmetric confidence interval around a sample statistic.

    Parameters
    ----------
    sample_stat : float
        The point estimate the interval is centred on.
    critical_value_proportion : float
        The critical value (a number, not the function of the same name).
    pooled_prop_se : float
        The standard error (a number, not the function of the same name).

    Returns
    -------
    tuple
        (lower, upper) bounds, i.e.
        sample_stat -/+ critical_value_proportion * pooled_prop_se.
    '''
    # Half-width of the interval (margin of error).
    margin = critical_value_proportion * pooled_prop_se
    return sample_stat - margin, sample_stat + margin
def pooled_t_statistic_prop(p1 , p2, df1, df2 ):
    '''Computes the pooled test statistic for two independent sample proportions.

    Parameters
    ----------
    p1 : float
        Proportion of successes in sample 1.
    p2 : float
        Proportion of successes in sample 2.
    df1 : array-like
        Sample 1; only its length is used.
    df2 : array-like
        Sample 2; only its length is used.

    Returns
    -------
    pooled_t : float
        The pooled test statistic.

    Notes
    -----
    The statistic is intended for testing whether two independent samples
    share the same population proportion. The formula is:

        pooled_t = (p1 - p2) / sqrt(p * (1 - p) * (1 / n1 + 1 / n2))

    where:
    - p is the pooled proportion of successes,
      (p1 * n1 + p2 * n2) / (n1 + n2)
    - n1 and n2 are the sample sizes of sample 1 and sample 2, respectively'''
    size_1, size_2 = len(df1), len(df2)
    # Pooled proportion: the two sample proportions weighted by sample size.
    pooled_p = (p1 * size_1 + p2 * size_2) / (size_1 + size_2)
    pooled_se = np.sqrt( pooled_p*(1-pooled_p) * (1/size_1 + 1/size_2) )
    return (p1-p2) / pooled_se
def se_unpooled_prop(p1 , p2, df1, df2):
    '''Computes the unpooled standard error for two independent sample proportions.

    Parameters
    ----------
    p1 : float
        Proportion of successes in sample 1.
    p2 : float
        Proportion of successes in sample 2.
    df1 : array-like
        Sample 1; only its length is used.
    df2 : array-like
        Sample 2; only its length is used.

    Returns
    -------
    unpooled_se : float
        sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2).'''
    # Each sample contributes its own variance term; nothing is pooled.
    var_term_1 = (p1*(1-p1)/len(df1))
    var_term_2 = (p2*(1-p2)/len(df2))
    return np.sqrt(var_term_1 + var_term_2)
def unpooled_t_statistic_prop(p1 , p2, df1, df2 ):
    '''Computes the unpooled test statistic for two independent sample proportions.

    Parameters
    ----------
    p1 : float
        Proportion of successes in sample 1.
    p2 : float
        Proportion of successes in sample 2.
    df1 : array-like
        Sample 1; only its length is used.
    df2 : array-like
        Sample 2; only its length is used.

    Returns
    -------
    unpooled_t : float
        The unpooled test statistic:

            (p1 - p2) / sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)

        where n1 and n2 are the sizes of sample 1 and sample 2.'''
    n1 = len(df1)
    n2 = len(df2)
    # Unpooled standard error: each sample contributes its own variance term.
    unpooled_se = np.sqrt((p1*(1-p1)/n1) + (p2*(1-p2)/n2))
    # Same sign convention as the pooled statistic: sample 1 minus sample 2.
    unpooled_t = (p1 - p2) / unpooled_se
    return unpooled_t
def determine_pool( h0):
    '''Print which standard error (pooled or unpooled) to use.

    Parameters
    ----------
    h0 : number
        Value compared against zero; presumably the difference between the
        two proportions assumed under the null hypothesis - confirm with
        callers.

    Returns
    -------
    None
        The recommendation is printed, not returned.
    '''
    # Only an exact zero selects the pooled recommendation.
    advice = (
        "The proportions are equal, therefore use the pooled standard error. "
        if h0 == 0
        else "The proportions are not equal, therefore use the unpooled standard error."
    )
    print(advice)