 from torch.utils.data import DataLoader
 from tqdm import trange
 import numpy as np
-import numpy.typing as npt
 import torch.nn.functional as F
 from matplotlib import pyplot as plt
+import random

 # plt.style.use('seaborn')
-plt.rcParams['figure.autolayout'] = True
+# plt.rcParams['figure.autolayout'] = True
+
+torch.cuda.empty_cache()
+device = torch.device('cpu')

 norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-T = transforms.Compose([transforms.ToTensor(), transforms.Lambda(torch.flatten)])
-train_data = datasets.MNIST(root=f'datasets/MNIST', download=True, transform=T)  # (BS, 784)
+T = transforms.Compose([transforms.ToTensor(), transforms.Lambda(lambda x: torch.cat((x.flatten(), torch.Tensor([1]))))])
+train_data = datasets.MNIST(root=f'datasets/MNIST', download=True, transform=T)  # (BS, 785)
 test_data = datasets.MNIST(root=f'datasets/MNIST', train=False, transform=T)

-BS = 100
-LR = 10
-EPOCHS = 5
+BS: int = 100
+LR: float = 10.
+EPOCHS: int = 5
+SEED: int = 0

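Note on the transform change above: appending a constant 1 to every flattened image makes each input 785-dimensional so the bias can be folded into the weight matrix, which the BGD section below relies on (w becomes 785x100 and wh becomes 101x10, with no separate b / bh). A minimal sketch of the equivalence, with illustrative shapes only (not part of the commit):

import torch

x = torch.randn(100, 784)                              # a batch of flattened images
W, b = torch.randn(784, 100), torch.randn(100)

x1 = torch.cat((x, torch.ones(100, 1)), dim=1)         # append the constant feature -> (100, 785)
Wb = torch.cat((W, b.unsqueeze(0)), dim=0)             # stack the bias as an extra row -> (785, 100)

assert torch.allclose(x @ W + b, x1 @ Wb, atol=1e-5)   # [x, 1] @ [[W], [b]] == x @ W + b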
 # SGD:
-torch.manual_seed(0)
-np.random.seed(0)
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+random.seed(SEED)

-train_loader = DataLoader(train_data, batch_size=BS, shuffle=True, pin_memory=True)
-test_loader = DataLoader(test_data, batch_size=BS, pin_memory=True)
+train_loader = DataLoader(train_data, batch_size=BS, num_workers=0, shuffle=True, pin_memory=True, generator=torch.Generator().manual_seed(SEED))
+test_loader = DataLoader(test_data, batch_size=BS, num_workers=0, pin_memory=True)

-w = torch.randn(784, 100, requires_grad=True)
-b = torch.randn(100, requires_grad=True)
-wh = torch.randn(100, 10, requires_grad=True)
-bh = torch.randn(10, requires_grad=True)
+w = torch.randn(784, 100, requires_grad=True, device=device)
+b = torch.randn(100, requires_grad=True, device=device)
+wh = torch.randn(100, 10, requires_grad=True, device=device)
+bh = torch.randn(10, requires_grad=True, device=device)

 losses_sgd = []
 for _ in trange(EPOCHS):
     loss_batch = []
     for x, y in train_loader:
-        out = torch.softmax(torch.tanh(x @ w + b) @ wh + bh, dim=1)
+        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+        x = x[:, :784]  # drop the appended bias feature; this section keeps the explicit bias vectors b and bh
+        out = torch.tanh(x @ w + b) @ wh + bh  # raw logits
         loss = F.cross_entropy(out, y)
         loss_batch.append(loss.item())
-        grad_w = torch.autograd.grad(loss, w, retain_graph=True)[0]  # (784, 100)
-        grad_b = torch.autograd.grad(loss, b, retain_graph=True)[0]  # (100)
-        grad_wh = torch.autograd.grad(loss, wh, retain_graph=True)[0]  # (100, 10)
-        grad_bh = torch.autograd.grad(loss, bh)[0]  # (10)
-        with torch.no_grad():
-            w -= grad_w * LR
-            b -= grad_b * LR
-            wh -= grad_wh * LR
-            bh -= grad_bh * LR
+        grad_w, grad_b, grad_wh, grad_bh = torch.autograd.grad(loss, (w, b, wh, bh))
+
+        with torch.no_grad():  # computations below are untracked as tensors are treated as "detached" tensors
+            w -= grad_w * LR  # (784, 100)
+            b -= grad_b * LR  # (100)
+            wh -= grad_wh * LR  # (100, 10)
+            bh -= grad_bh * LR  # (10)
+
     losses_sgd.append(np.mean(loss_batch))
 plt.semilogy(losses_sgd, label='SGD')

 acc = []
 with torch.no_grad():
     for x, y in test_loader:
+        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+        x = x[:, :784]  # drop the appended bias feature (see training loop)
         out = torch.tanh(x @ w + b) @ wh + bh
         _, pred = out.max(1)
         acc.append((pred == y).float().mean().item())
 print(f'Accuracy: {np.mean(acc) * 100:.4}%')

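A behavioral fix in this hunk: the old forward pass applied torch.softmax before F.cross_entropy, but F.cross_entropy already applies log_softmax internally, so the outputs were effectively softmaxed twice; the new code passes raw logits. A small sketch of the identity this relies on (illustrative tensors, not part of the commit):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)
y = torch.randint(0, 10, (4,))

a = F.cross_entropy(logits, y)                    # expects raw logits
b = F.nll_loss(F.log_softmax(logits, dim=1), y)   # what cross_entropy computes internally
assert torch.allclose(a, b)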
 ############################################################################################################################################################

 # BGD:
-torch.manual_seed(0)
-np.random.seed(0)
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+random.seed(SEED)
+
+train_loader = DataLoader(train_data, batch_size=BS, num_workers=0, shuffle=True, pin_memory=True, generator=torch.Generator().manual_seed(SEED))
+test_loader = DataLoader(test_data, batch_size=BS, num_workers=0, pin_memory=True)

-train_loader = DataLoader(train_data, batch_size=BS, shuffle=True, pin_memory=True)
-test_loader = DataLoader(test_data, batch_size=BS, pin_memory=True)
+w = torch.randn(785, 100, requires_grad=True, device=device)  # each column corresponds to an output node (neuron) in the network
+wh = torch.randn(101, 10, requires_grad=True, device=device)

-w = torch.randn(784, 100, requires_grad=True)
-b = torch.randn(100, requires_grad=True)
-wh = torch.randn(100, 10, requires_grad=True)
-bh = torch.randn(10, requires_grad=True)
+TH = 0.9  # Threshold value is inversely proportional to the initial learning rate
+EPS = torch.Tensor([1e-11])
+BIAS = torch.ones(100, device=device).unsqueeze(1)  # (BS, 1) column of ones appended to the hidden activations
+SHRINK = torch.tensor(1.1, device=device)
+LRw = torch.ones(100, device=device) * LR  # per output node (neuron) adaptive learning rate (not per parameter/weight)
+LRwh = torch.ones(10, device=device) * LR

-def dist(g1: torch.Tensor, g2: torch.Tensor) -> npt.NDArray[float]:
-    e = 1e-11
+def dist(g1: torch.Tensor, g2: torch.Tensor) -> torch.Tensor:
     flatness_1, flatness_2 = g1.norm().item(), g2.norm().item()
-    dists = np.array([flatness_2, flatness_1])
-    return dists / (dists.sum() + e)
+    dists = torch.Tensor([flatness_2, flatness_1])
+    return dists / (dists.sum() + EPS)
+

+def bounce_update(weight, oracle, weight_gradient, oracle_gradient, weight_LR) -> None:
+    dot_prods: torch.Tensor = torch.einsum("kj,kj->j", weight_gradient, oracle_gradient)  # per-column dot products between current and oracle gradients
+    for i, dot_prod in enumerate(dot_prods):
+        if dot_prod.item() < 0:
+            # print('bounce')
+            d1, d2 = dist(weight_gradient[:, i], oracle_gradient[:, i])
+            if d1.item() > TH:
+                weight_LR[i] /= SHRINK
+
+            weight[:, i] = weight[:, i] * d1 + oracle[:, i] * d2
+
+        else:
+            weight[:, i] = oracle[:, i] - oracle_gradient[:, i] * weight_LR[i]

-TH = 0.9  # TH value is inversely proportional to "ini"
-shrink = 1.1
-LRw = torch.ones(100) * LR
-LRwh = torch.ones(10) * LR

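bounce_update factors the old per-column update loops (previously written out once for w and once for wh) into one helper, with one learning rate per output neuron: the oracle weights are a plain lookahead step, and if the gradient at the oracle point has a negative dot product with the current gradient for that column, the step has jumped over a minimum along that direction, so the new column interpolates the current and oracle columns, each weighted by the other point's share of the total gradient norm (the flatter point dominates); if the coefficient on the current point (d1, the oracle gradient's share) exceeds TH, that column's learning rate is divided by SHRINK. A 1-D sketch of the bounce case on f(w) = w**2, with made-up numbers (not part of the commit):

import torch

w, lr, th = torch.tensor(1.), 10., 0.9
g = 2 * w                                   # gradient at the current point
oracle = w - lr * g                         # lookahead step overshoots the minimum at 0
g_orc = 2 * oracle                          # gradient at the oracle point

assert g * g_orc < 0                        # opposite signs -> a minimum lies between w and oracle
d1 = g_orc.abs() / (g.abs() + g_orc.abs())  # weight on the current point (= 0.95)
d2 = g.abs() / (g.abs() + g_orc.abs())      # weight on the oracle point  (= 0.05)
if d1 > th:
    lr /= 1.1                               # oracle landed in a much steeper region -> shrink this LR
w_new = w * d1 + oracle * d2
print(w_new)                                # ~0: for a 1-D quadratic the interpolation lands on the minimum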
 losses_bgd = []
 for _ in trange(EPOCHS):
     loss_batch = []
     for x, y in train_loader:
-        out = torch.softmax(torch.tanh(x @ w + b) @ wh + bh, dim=1)
+        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+        out = torch.cat(((x @ w).tanh(), BIAS), dim=1) @ wh
         loss = F.cross_entropy(out, y)
         loss_batch.append(loss.item())
-        grad_w = torch.autograd.grad(loss, w, retain_graph=True)[0]  # (784, 100)
-        grad_b = torch.autograd.grad(loss, b, retain_graph=True)[0]  # (100)
-        grad_wh = torch.autograd.grad(loss, wh, retain_graph=True)[0]  # (100, 10)
-        grad_bh = torch.autograd.grad(loss, bh)[0]  # (10)
-
-        # Find oracle weights:
-        oracle_w = w - grad_w * LRw
-        oracle_wh = wh - grad_wh * LRwh
+        loss.backward()
+        grad_w, grad_wh = w.grad, wh.grad

-        # Update bias vectors:
-        with torch.no_grad():
-            b -= grad_b * LR
-            bh -= grad_bh * LR
+        with torch.no_grad():  # computations below are untracked as tensors are treated as "detached" tensors
+            # Find oracle weights:
+            w_oracle = (w - grad_w * LRw).requires_grad_()  # (785, 100)
+            wh_oracle = (wh - grad_wh * LRwh).requires_grad_()  # (101, 10)

-        out = torch.softmax(torch.tanh(x @ oracle_w + b) @ oracle_wh + bh, dim=1)
+        out = torch.cat(((x @ w_oracle).tanh(), BIAS), dim=1) @ wh_oracle
         loss = F.cross_entropy(out, y)
-        grad_orc_w = torch.autograd.grad(loss, oracle_w, retain_graph=True)[0]  # (784, 100)
-        grad_orc_wh = torch.autograd.grad(loss, oracle_wh, retain_graph=True)[0]  # (100, 10)
+        loss.backward()
+        grad_w_orc, grad_wh_orc = w_oracle.grad, wh_oracle.grad

         with torch.no_grad():
-            # Update w:
-            for i, (g, g_orc) in enumerate(zip(grad_w.T, grad_orc_w.T)):  # (100, 784)
-                if g @ g_orc < 0:  # (784)
-                    # print('bounce')
-                    d1, d2 = dist(g, g_orc)
-                    if d1 > TH:
-                        LRw[i] /= shrink
-                    w[:, i] = w[:, i] * d1 + oracle_w[:, i] * d2
-                else:
-                    w[:, i] = oracle_w[:, i] - g_orc * LRw[i]
-
-            # Update wh:
-            for i, (g, g_orc) in enumerate(zip(grad_wh.T, grad_orc_wh.T)):  # (10, 100)
-                if g @ g_orc < 0:  # (100)
-                    # print('bounce')
-                    d1, d2 = dist(g, g_orc)
-                    if d1 > TH:
-                        LRwh[i] /= shrink
-                    wh[:, i] = wh[:, i] * d1 + oracle_wh[:, i] * d2
-                else:
-                    wh[:, i] = oracle_wh[:, i] - g_orc * LRwh[i]
+            bounce_update(w, w_oracle, grad_w, grad_w_orc, LRw)
+            bounce_update(wh, wh_oracle, grad_wh, grad_wh_orc, LRwh)
+
+            # Zero each parameter's gradients to avoid gradient accumulation across iterations:
+            w.grad.zero_()
+            wh.grad.zero_()

     losses_bgd.append(np.mean(loss_batch))
 plt.semilogy(losses_bgd, label='BGD')

 acc = []
 with torch.no_grad():
     for x, y in test_loader:
-        out = torch.tanh(x @ w + b) @ wh + bh
+        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+        out = torch.cat(((x @ w).tanh(), BIAS), dim=1) @ wh
         _, pred = out.max(1)
         acc.append((pred == y).float().mean().item())
 print(f'Accuracy: {np.mean(acc) * 100:.4}%')