@@ -19,7 +19,9 @@ PyTorch implementation of L2L execution algorithm from paper [Training Large Neu
You need to define a torch model where all layers are specified in a `ModuleList`.
- for example
+ See [examples folder](examples)
+
+ ### Basic usage

```python
import torch
@@ -55,15 +57,15 @@ class M(nn.Module):
        return x
+
+ model = M(depth=5, dim=40).train() # on CPU
```
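The hunks above only show fragments of the model definition (the `import` and the trailing `return x` inside `class M`), so here is a minimal sketch of what a `ModuleList`-based model like `M(depth=5, dim=40)` could look like. The `nn.Linear` layers are an assumption for illustration; the repo's actual example may use different layers.

```python
import torch
from torch import nn


class M(nn.Module):
    def __init__(self, depth: int, dim: int):
        super().__init__()
        # All layers live in a ModuleList; the attribute name ("layers")
        # must match layers_attr="layers" passed to the L2L wrapper below.
        self.layers = nn.ModuleList([nn.Linear(dim, dim) for _ in range(depth)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            x = layer(x)
        return x
```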
Then, you can use the L2L wrapper over this model.

```python
from layer_to_layer_pytorch.l2l import Layer2Layer
- model = M(depth=5, dim=40).train() # on CPU
-
l2l_model = Layer2Layer(
    model,
    layers_attr="layers", # attribute with ModuleList
@@ -81,23 +83,46 @@ x = torch.rand(1_000, 40) # on CPU
y = torch.rand(1_000, 40) # on CPU
losses = []
- loss_fn = nn.MSELoss(reduction="sum") # since L2L calcs average losses itself, we just need to save them
+ criterion = nn.MSELoss()
- optimizer = optim.AdamW(l2l_model.main_model.parameters(), lr=0.001) # optimizer works with the main model on CPU
+ optimizer = optim.AdamW(l2l_model.main_params) # optimizer works with the main model on CPU
- for i in trange(5000):
+ for i in trange(2000):
    l2l_model.zero_grad()
-     l2l_model.forward(x)
+     _ = l2l_model.forward(x)
-     loss_value = l2l_model.backward(x, y, loss_fn)
+     loss_value: float = l2l_model.compute_loss(y, criterion)
    if i % 50 == 0:
-         tqdm.write(f"[{i}] loss = {loss_value.item()}")
-         losses.append(loss_value.item())
+         tqdm.write(f"[{i}] loss = {loss_value}")
+         losses.append(loss_value)
+
+     l2l_model.backward()
    optimizer.step()
+     l2l_model.update_main_model_params() # Sync params with CPU
+ ```
+
+ ### FP-16 usage
+
+ Mixed-precision training is available via the init params:
+
+ ```python
+ from layer_to_layer_pytorch.l2l import Layer2Layer
+
+ l2l_model = Layer2Layer(
+     model,
+     layers_attr="layers",
+     microbatch_size=100,
+
+     # fp-16
+     mixed_precision=True,
+     loss_scale=128.0
+ )
```
+ And then train the same way 😉
+
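For completeness, a recap sketch of what "the same way" means with the mixed-precision wrapper: it repeats exactly the calls shown in the basic-usage loop above (`forward`, `compute_loss`, `backward`, `optimizer.step`, `update_main_model_params`), with `model`, `x`, and `y` as the same placeholders as before.

```python
from torch import nn, optim
from tqdm.auto import trange

# Same loop as in "Basic usage"; only the Layer2Layer construction changed
# (mixed_precision=True, loss_scale=128.0). l2l_model, x, y come from above.
criterion = nn.MSELoss()
optimizer = optim.AdamW(l2l_model.main_params)

for i in trange(2000):
    l2l_model.zero_grad()
    _ = l2l_model.forward(x)

    loss_value: float = l2l_model.compute_loss(y, criterion)

    l2l_model.backward()
    optimizer.step()
    l2l_model.update_main_model_params()  # sync updated params back to the CPU copy
```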
## Installation
```bash