-
Notifications
You must be signed in to change notification settings - Fork 0
/
helpers.py
49 lines (36 loc) · 1.56 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#imports
import random
import pandas as pd
import numpy as np
#=============================================================================
#helper functions
def drop_nans(df, limit=0.5):
    """Remove sparse columns, then every row that still holds a NaN.

    A column is discarded when its NaN count is strictly greater than
    ``limit`` times the number of rows. Afterwards, any row with at least
    one NaN in the remaining columns is dropped.

    Args:
        df: Input dataframe.
        limit: Fraction of the row count that a column's NaN count may
            reach before the column is dropped.

    Returns:
        A new dataframe without the sparse columns and without NaN rows.
    """
    max_nans = len(df) * limit
    nan_counts = df.isna().sum(axis=0)
    # Labels of the columns whose NaN count exceeds the allowed maximum.
    sparse_columns = nan_counts.index[nan_counts > max_nans]
    return df.drop(columns=sparse_columns).dropna()
def pick_n_from_k(df, n, seed=0, onlynames=False):
    """Return n randomly selected columns of a dataframe.

    Args:
        df: Dataframe to pick columns from.
        n: Number of columns to select (without replacement).
        seed: Seed for the random selection, so the pick is reproducible.
        onlynames: When true, return only the list of selected column names
            instead of a dataframe.

    Returns:
        A list of n column names when ``onlynames`` is true, otherwise a
        dataframe made of n columns of ``df``. The two modes use different
        random generators (``random`` vs. pandas' ``random_state``), so the
        same seed need not select the same columns in both modes.

    Raises:
        ValueError: If ``df`` has fewer than n columns.
    """
    # The population being sampled is the columns, so K is the column
    # count (axis 1), not the row count.
    k = df.shape[1]
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if k < n:
        raise ValueError('K should be >= N')
    if onlynames:
        # NOTE: this reseeds the module-level `random` generator.
        random.seed(seed)
        return random.sample(list(df.columns), n)
    return df.sample(n=n, random_state=seed, axis='columns')
def bootstrap_CI(data, nbr_draws, ci=99.0):
    """Estimate a confidence interval of the mean by bootstrapping.

    ``data`` is resampled with replacement ``nbr_draws`` times (each resample
    has as many elements as ``data``), the NaN-ignoring mean of every
    resample is recorded, and the interval bounds are read off as percentiles
    of those means. Resampling uses the global NumPy random state, so call
    ``np.random.seed`` beforehand for reproducible results.

    Args:
        data: Array-like of observations.
        nbr_draws: Number of bootstrap resamples (e.g. 1000).
        ci: Confidence level in percent, strictly between 0 and 100. The
            default of 99 yields the 0.5th and 99.5th percentiles.

    Returns:
        ``[lower bound, upper bound]`` of the confidence interval of the mean.

    Raises:
        ValueError: If ``data`` is empty or ``ci`` is outside (0, 100).
    """
    if not 0 < ci < 100:
        raise ValueError('ci must be strictly between 0 and 100')
    means = np.zeros(nbr_draws)
    data = np.array(data)
    size = len(data)
    # Fail with a clear message instead of randint's opaque "low >= high".
    if size == 0:
        raise ValueError('data must not be empty')
    for draw in range(nbr_draws):
        # Draw `size` indices with replacement.
        indices = np.random.randint(0, size, size)
        means[draw] = np.nanmean(data[indices])
    # Split the excluded probability mass evenly between both tails.
    tail = (100 - ci) / 2
    return [np.nanpercentile(means, tail), np.nanpercentile(means, 100 - tail)]