-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
273 lines (224 loc) · 9.18 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
"""
End-to-end pipeline for training and evaluating the model.
Most of the code is taken from main.ipynb and converted into a script.
We propose a pipeline that is robust (limits the impact of weight initialization), efficient (uses the GPU when available) and reproducible (fixes the random seed).
The pipeline is divided into 3 main steps:
1. Data preparation
2. Model training
3. Model evaluation
"""
## 0. LIBRARIES
from pathlib import Path
import sys
import warnings
from copy import deepcopy
import multiprocessing
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datetime import datetime
from IPython.display import clear_output
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
## In-project libraries
from datasets.core import SlideFeaturesDataset
from models.chowder import Chowder
from utils.features import pad_collate_fn
from utils.functional import sigmoid, softmax
from trainer import TorchTrainer
from trainer.utils import slide_level_train_step, slide_level_val_step
# Absolute path of the current working directory at import time; main() builds
# its data, output and log paths from it.
working_directory = Path(".").resolve()
# NOTE(review): this append runs after the in-project imports above, so it
# cannot influence how they are resolved -- confirm whether it is still needed.
sys.path.append(str(working_directory))
def _hyperparameter_summary(trainer, mlp_hidden, mlp_activation, mlp_dropout):
    """Return the logged hyperparameters as an ordered ``name -> value`` dict.

    Shared by the console log and the hyperparameter text file so that the two
    listings cannot drift apart.
    """
    return {
        "num_epochs": trainer.num_epochs,
        "learning_rate": trainer.learning_rate,
        "weight_decay": trainer.weight_decay,
        "mlp_hidden": mlp_hidden,
        "mlp_activation": mlp_activation,
        "mlp_dropout": mlp_dropout,
        "batch_size": trainer.batch_size,
        "device": trainer.device,
    }


def main():
    """Run the whole pipeline: data preparation, training and test prediction.

    Reads the precomputed features, the metadata and the training labels from
    ``data/`` under ``working_directory``, trains a Chowder model on an 80/20
    stratified train/validation split, predicts the test set, then writes a
    submission CSV to ``test_output/`` and a hyperparameter log to ``logs/``.

    Raises:
        ValueError: If the predictions or the submission table do not have the
            expected shape, columns or value range.
    """
    ## I- DATA PREPARATION
    # 1. Locate the precomputed features
    data_dir = working_directory / "data"
    train_features_dir = data_dir / "train_input" / "moco_features"
    test_features_dir = data_dir / "test_input" / "moco_features"
    # `Path.glob` yields files in an OS-dependent order; sorting makes the
    # sample order identical from one run (and one machine) to the next.
    # NOTE(review): labels and submission IDs are taken in metadata row order
    # while features are taken in file order -- confirm that the two orders
    # agree (or index the files by "Sample ID").
    train_features_path_all = sorted(train_features_dir.glob("*.npy"))
    test_features_path_all = sorted(test_features_dir.glob("*.npy"))

    # 2. Load the metadata
    supplementary_dir = data_dir / "supplementary_data"
    train_metadata_df = pd.read_csv(supplementary_dir / "train_metadata.csv")
    test_metadata_df = pd.read_csv(supplementary_dir / "test_metadata.csv")

    # 3. Load the training labels and attach them to the metadata
    y_train = pd.read_csv(data_dir / "train_output.csv")
    train_metadata_df = pd.merge(train_metadata_df, y_train, on="Sample ID")
    # float32 is required by the BCE loss
    y_train = train_metadata_df["Target"].values.astype(np.float32)

    # 4. Train dataset
    train_dataset = SlideFeaturesDataset(
        features=train_features_path_all,
        labels=y_train,
        n_tiles=1000,
        shuffle=True,
        transform=None,
    )

    # 5. Test dataset (dummy labels, never used)
    test_dataset = SlideFeaturesDataset(
        features=test_features_path_all,
        labels=np.zeros(len(test_features_path_all), dtype=np.float32),
        n_tiles=1000,
        shuffle=False,
        transform=None,
    )

    train_indices = np.arange(len(train_dataset))
    train_labels = train_dataset.labels

    ## II- MODEL TRAINING
    # 1. Hyperparameters
    in_features = 2048
    out_features = 1
    n_top = 5
    n_bottom = 5
    tiles_mlp_hidden = None
    mlp_hidden = [128, 64]
    mlp_activation = torch.nn.LeakyReLU()
    mlp_dropout = [0.1, 0.1]
    bias = True

    # 2. Model initialization
    chowder = Chowder(
        in_features=in_features,
        out_features=out_features,
        n_top=n_top,
        n_bottom=n_bottom,
        tiles_mlp_hidden=tiles_mlp_hidden,
        mlp_hidden=mlp_hidden,
        mlp_activation=mlp_activation,
        mlp_dropout=mlp_dropout,
        bias=bias,
    )
    print_trainable_parameters(chowder)

    ## Loss function, optimizer class and tracked metrics
    criterion = torch.nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss
    optimizer = torch.optim.Adam  # the optimizer class, not an instance
    metrics = {"auc": roc_auc_score}  # AUC will be the tracking metric

    # 3. Instantiate the trainer
    trainer = TorchTrainer(
        model=chowder,
        criterion=criterion,
        metrics=metrics,
        batch_size=8,  # you can tweak this
        num_epochs=20,  # you can tweak this
        learning_rate=1e-3,  # you can tweak this
        weight_decay=0,  # you can tweak this
        device="cuda:0" if torch.cuda.is_available() else "cpu",
        # Passed as-is: deep-copying a class returns the class itself.
        optimizer=optimizer,
        train_step=slide_level_train_step,
        val_step=slide_level_val_step,
        collator=pad_collate_fn,
        use_tqdm=True,
    )

    # 4. Train-validation split (80/20, stratified on the labels)
    train_indices_, val_indices_ = train_test_split(
        train_indices, test_size=0.2, stratify=train_labels, random_state=42
    )
    train_dataset_ = torch.utils.data.Subset(train_dataset, train_indices_)
    val_dataset_ = torch.utils.data.Subset(train_dataset, val_indices_)

    # 5. Training
    ## Logging of the hyperparameters
    separator = "-" * 50
    print(separator)
    print("Training the {} model".format(chowder.__class__.__name__))
    print(
        "On {} samples, validating on {} samples\n".format(
            len(train_dataset_), len(val_dataset_)
        )
    )
    print(separator)
    print("Hyperparameters:")
    for name, value in _hyperparameter_summary(
        trainer, mlp_hidden, mlp_activation, mlp_dropout
    ).items():
        print(f"{name}: {value}")
    print(separator)

    ## Training loop
    start = datetime.now()
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        # Training step for the given number of epochs
        train_metrics, val_metrics = trainer.train(train_dataset_, val_dataset_)
        # Stop the clock here so the reported duration covers training only,
        # not test prediction or the time the plot window stays open.
        training_time = datetime.now() - start
        # Predictions on test (logits, sigmoid(logits) = probability)
        test_logits = trainer.predict(test_dataset)[1]

    ## Plot the training and validation loss and metrics
    plot_loss_auc(trainer, train_metrics, val_metrics)
    print("Time taken to train the model: {}".format(training_time))

    ## III- MODEL EVALUATION
    # Convert logits to probabilities and average over the first axis.
    # NOTE(review): if `trainer.predict` returns a single (n_samples, 1) array
    # rather than one array per run, this mean collapses every slide to the
    # same value -- confirm against TorchTrainer.predict.
    test_probas = np.mean(
        [sigmoid(logits) for logits in test_logits], axis=0
    ).squeeze()
    n_test = len(test_metadata_df)
    if np.shape(test_probas) != (n_test,):
        # Without this guard a scalar could be broadcast to every row.
        raise ValueError(
            f"Expected one probability per test sample, i.e. shape ({n_test},), "
            f"got shape {np.shape(test_probas)}"
        )

    ## Prediction to dataframe, sorted by sample ID
    submission = pd.DataFrame(
        {"Sample ID": test_metadata_df["Sample ID"].values, "Target": test_probas}
    ).sort_values("Sample ID")

    # Sanity checks (explicit raises: `assert` is stripped under `python -O`)
    if not submission["Target"].between(0, 1).all():
        raise ValueError("`Target` values must be in [0, 1]")
    if submission.shape != (149, 2):
        raise ValueError("Your submission file must be of shape (149, 2)")
    if list(submission.columns) != ["Sample ID", "Target"]:
        raise ValueError(
            "Your submission file must have columns `Sample ID` and `Target`"
        )

    # One timestamp for both files so they can be matched to the same run
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M')

    # Save the submission as a csv file
    output_dir = working_directory / "test_output"
    output_dir.mkdir(parents=True, exist_ok=True)
    saving_path = output_dir / f"submission_{run_stamp}.csv"
    submission.to_csv(saving_path, index=False)

    ## Save all the hyperparameters in a txt file
    log_dir = working_directory / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)
    log_path = log_dir / f"hyperparameters_{run_stamp}.txt"
    with open(log_path, "w", encoding="utf-8") as f:
        f.writelines(
            f"{name}: {value}\n"
            for name, value in _hyperparameter_summary(
                trainer, mlp_hidden, mlp_activation, mlp_dropout
            ).items()
        )
        f.write(f"Time taken to train the model: {training_time}\n")

    print("Submission file saved in \"test_output\" folder.")
    print("Hyperparameters saved in \"logs\" folder.")
    print(separator)
def print_trainable_parameters(model: torch.nn.Module) -> None:
    """Print the number of trainable parameters of ``model``.

    Prints one line with the count of parameters that require gradients, the
    total parameter count and the trainable share as a percentage.

    Args:
        model: The module whose parameters are counted.
    """
    trainable_params = 0
    all_param = 0
    for param in model.parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A module without parameters would otherwise divide by zero.
    trainable_share = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param}"
        f" || trainable%: {trainable_share:.2f}"
    )
def plot_loss_auc(trainer, train_metrics, val_metrics):
    """Plot the loss and AUC curves side by side and save them as a PNG.

    The figure is written to ``figures/loss_auc_<timestamp>.png`` relative to
    the current working directory and then displayed.

    Args:
        trainer: Object exposing ``train_losses`` and ``val_losses``.
        train_metrics: Mapping with an ``"auc"`` entry for the training set.
        val_metrics: Mapping with an ``"auc"`` entry for the validation set.
    """
    # Create the target folder first: savefig fails if it does not exist.
    figures_dir = Path("figures")
    figures_dir.mkdir(parents=True, exist_ok=True)

    fig, (loss_ax, auc_ax) = plt.subplots(1, 2, figsize=(10, 6))

    loss_ax.plot(trainer.train_losses, label="train")
    loss_ax.plot(trainer.val_losses, label="val")
    loss_ax.set_title("Loss")
    loss_ax.legend()

    auc_ax.plot(train_metrics["auc"], label="train")
    auc_ax.plot(val_metrics["auc"], label="val")
    auc_ax.set_title("AUC")
    auc_ax.legend()

    fig.savefig(figures_dir / f"loss_auc_{datetime.now().strftime('%Y%m%d_%H%M')}.png")
    plt.show()
    print("Loss and AUC plots saved in figures folder.")
## Set seed for reproducibility
def set_seed(seed):
    """Seed every random number generator used by the pipeline.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA), and
    switches cuDNN to deterministic mode with auto-tuning disabled.

    Args:
        seed: Integer seed shared by all generators.
    """
    import random  # local import: the file does not import it at top level

    # Python's own RNG was previously left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    # Seed the random number generators before running the pipeline so that
    # results are repeatable from one run to the next.
    set_seed(317)
    print("Start of the pipeline")
    main()
    print("End of the pipeline")