import random
import numpy as np
import os
import pandas as pd
import torch
import openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import make_column_transformer
import xgboost as xgb
import time
from torch.optim import Adam
# Global parameters
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!!!")
RESULT_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'experiments')
os.makedirs(RESULT_DIR, exist_ok=True)
CONTRASTIVE_LEARNING_MAX_EPOCHS = 500
SUPERVISED_LEARNING_MAX_EPOCHS = 100
CLS_CORR_REFRESH_SAMPLER_PERIOD = 10
FRACTION_LABELED = 0.3
CORRUPTION_RATE = 0.4
BATCH_SIZE = 256
SEEDS = [614579, 336466, 974761, 450967, 743562, 843198, 502837, 328984]
assert len(SEEDS) == len(set(SEEDS))
# All the methods to experiment with
ALL_METHODS = [
    'no_pretrain',
    'rand_corr-rand_feats',
    'cls_corr-rand_feats',
    'orc_corr-rand_feats',
    'cls_corr-leastRela_feats',
    'cls_corr-mostRela_feats',
]
P_VAL_SIGNIFICANCE = 0.05
CORRELATED_FEATURES_RANDOMIZE_SAMPLING = True
CORRELATED_FEATURES_RANDOMIZE_SAMPLING_TEMPERATURE = 0.25
# Result processing metric
METRIC = "accuracy"
# METRIC = "auroc"
XGB_FEATURECORR_CONFIG = {
    "n_estimators": 100,
    "max_depth": 10,
    "eta": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 0.8,
    "enable_categorical": True,
    "tree_method": "hist",
}

def fix_seed(seed):
    # seed all RNG sources used in the experiments for reproducibility
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
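
# Hedged usage sketch (not part of the original file): each experiment repetition is expected
# to be seeded with one entry of SEEDS before any data shuffling or model initialization.
def _example_seeded_runs():
    for seed in SEEDS:
        fix_seed(seed)
        # ... run one experiment repetition under this seed ...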

def load_openml_list(DIDS):
    datasets = []
    datasets_list = openml.datasets.list_datasets(DIDS, output_format='dataframe')
    for ds in datasets_list.index:
        entry = datasets_list.loc[ds]
        print('Loading', entry['name'], entry.did, '..')
        if entry['NumberOfClasses'] == 0.0:
            raise Exception("Regression not supported for now")
        else:
            dataset = openml.datasets.get_dataset(int(entry.did))
            # under SCARF corruption, the replacement by sampling happens before one-hot encoding,
            # so load the data in its original form
            X, y, categorical_indicator, attribute_names = dataset.get_data(
                dataset_format="dataframe", target=dataset.default_target_attribute
            )
            assert isinstance(X, pd.DataFrame) and isinstance(y, pd.Series)
            order = np.arange(y.shape[0])
            # don't think we need to re-seed here
            np.random.shuffle(order)
            X, y = X.iloc[order], y.iloc[order]
            assert X is not None
            datasets += [[entry['name'],
                          entry.did,
                          int(entry['NumberOfClasses']),
                          np.sum(categorical_indicator),
                          len(X.columns),
                          X,
                          y]]
    return datasets
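
# Hedged usage sketch (not part of the original file): load a small suite of OpenML datasets.
# The dataset IDs below are placeholders; substitute the DIDS actually used in the experiments.
def _example_load_datasets(dids=(11, 37)):
    datasets = load_openml_list(list(dids))
    for name, did, n_classes, n_cat_feats, n_feats, X, y in datasets:
        print(f"{name} (did={did}): {n_classes} classes, "
              f"{n_cat_feats}/{n_feats} categorical features, {len(X)} rows")
    return datasets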

def preprocess_datasets(train_data, test_data, normalize_numerical_features):
    assert isinstance(train_data, pd.DataFrame) and \
        isinstance(test_data, pd.DataFrame)
    assert np.all(train_data.columns == test_data.columns)
    features_dropped = []
    for col in train_data.columns:
        # drop columns that are all null or constant on the training data
        if train_data[col].isnull().all() or train_data[col].nunique() == 1:
            train_data.drop(columns=col, inplace=True)
            test_data.drop(columns=col, inplace=True)
            features_dropped.append(col)
            continue
        # fill the missing values
        if train_data[col].isnull().any() or test_data[col].isnull().any():
            # for categorical features, fill with the mode of the training data
            if train_data[col].dtype.name == "category":
                val_fill = train_data[col].mode(dropna=True)[0]
            # for numerical features, fill with the mean of the training data
            else:
                val_fill = train_data[col].mean(skipna=True)
            train_data[col].fillna(val_fill, inplace=True)
            test_data[col].fillna(val_fill, inplace=True)
    if normalize_numerical_features:
        # z-score transform numerical values
        scaler = StandardScaler()
        non_categorical_cols = train_data.select_dtypes(exclude='category').columns
        if len(non_categorical_cols) == 0:
            print("No numerical features present! Skipping numerical z-score normalization.")
        else:
            train_data[non_categorical_cols] = scaler.fit_transform(train_data[non_categorical_cols])
            test_data[non_categorical_cols] = scaler.transform(test_data[non_categorical_cols])
    print(f"Data preprocessing finished! Dropped {len(features_dropped)} features: {features_dropped}. "
          f"{'Normalized numerical features.' if normalize_numerical_features else ''}")
    # retain the pandas DataFrame format for the later one-hot encoder
    return train_data, test_data
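
# Hedged usage sketch (not part of the original file): a plausible call pattern, assuming X and y
# come from load_openml_list and a simple 80/20 row split; the split strategy here is an assumption.
def _example_preprocess(X, y, normalize_numerical_features=True):
    n_train = int(0.8 * len(X))
    train_data, test_data = X.iloc[:n_train].copy(), X.iloc[n_train:].copy()
    train_data, test_data = preprocess_datasets(train_data, test_data, normalize_numerical_features)
    return train_data, test_data, y.iloc[:n_train], y.iloc[n_train:]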

def fit_one_hot_encoder(one_hot_encoder_raw, train_data):
    categorical_cols = train_data.select_dtypes(include='category').columns
    one_hot_encoder = make_column_transformer((one_hot_encoder_raw, categorical_cols), remainder='passthrough')
    one_hot_encoder.fit(train_data)
    return one_hot_encoder
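
# Hedged usage sketch (not part of the original file): fit_one_hot_encoder expects an unfitted
# encoder for the categorical columns; a dense sklearn OneHotEncoder is assumed here so that the
# transformed output can be passed straight to torch.tensor(...) in get_bootstrapped_targets.
def _example_fit_encoder(train_data):
    from sklearn.preprocessing import OneHotEncoder
    # on scikit-learn < 1.2, use sparse=False instead of sparse_output=False
    one_hot_encoder_raw = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    return fit_one_hot_encoder(one_hot_encoder_raw, train_data)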

def get_bootstrapped_targets(data, targets, classifier_model, mask_labeled, one_hot_encoder):
    # use the classifier to predict for all data first
    classifier_model.module.eval()
    with torch.no_grad():
        pred_logits = classifier_model.module.get_classification_prediction_logits(
            torch.tensor(one_hot_encoder.transform(data).astype(float), dtype=torch.float32).to(DEVICE)
        ).cpu().numpy()
    preds = np.argmax(pred_logits, axis=1)
    return np.where(mask_labeled, targets, preds)

# expects a pandas DataFrame;
# fits one XGBoost model per column to measure how well the other columns predict it
def compute_feature_mutual_influences(data):
    assert isinstance(data, pd.DataFrame)
    label_encoder_tmp = LabelEncoder()
    feat_impt = []
    start_time = time.time()
    feat_impt_range_avg = 0
    for i, col in enumerate(data.columns):
        if data[col].dtype == "category":
            xgb_model = xgb.XGBClassifier(**XGB_FEATURECORR_CONFIG)
            target = label_encoder_tmp.fit_transform(data[col])
        else:
            xgb_model = xgb.XGBRegressor(**XGB_FEATURECORR_CONFIG)
            target = data[col]
        xgb_model.fit(data.drop(col, axis=1), target)
        # xgb_model.feature_importances_ is the normalized gain score
        feat_impt_range_avg += np.ptp(xgb_model.feature_importances_)
        feat_impt.append(np.insert(xgb_model.feature_importances_, obj=i, values=0))
    feat_impt = np.array(feat_impt)
    feat_impt_range_avg = feat_impt_range_avg / len(data.columns)
    print(f"Feature importances computed for {len(data)} samples, each with {np.shape(data)[1]} features! "
          f"Took {time.time() - start_time:.2f} seconds. The average range is {feat_impt_range_avg}")
    return feat_impt, feat_impt_range_avg
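
# Hedged usage sketch (not part of the original file): row i of the returned matrix holds the
# importance of every other feature for predicting feature i (with a 0 inserted at column i),
# so each row can be read as a per-feature relatedness profile.
def _example_feature_influences(train_data):
    feat_impt, feat_impt_range_avg = compute_feature_mutual_influences(train_data)
    # per feature, column indices sorted from most to least related
    most_related = np.argsort(feat_impt, axis=1)[:, ::-1]
    return feat_impt, feat_impt_range_avg, most_related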

def initialize_adam_optimizer(model):
    # only optimize parameters that require gradients
    return Adam(filter(lambda p: p.requires_grad, model.module.parameters()), lr=0.001)
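
# Hedged usage sketch (not part of the original file): the helpers above access `model.module`,
# which suggests the network is wrapped in torch.nn.DataParallel before being passed around;
# a minimal setup could look like this.
def _example_model_setup(model):
    wrapped = torch.nn.DataParallel(model).to(DEVICE)
    optimizer = initialize_adam_optimizer(wrapped)
    return wrapped, optimizer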