 from torch.utils.data import DataLoader
 from tqdm import trange
 import numpy as np
-import numpy.typing as npt
 import torch.nn.functional as F
 from matplotlib import pyplot as plt
+import random

 # plt.style.use('seaborn')
-plt.rcParams['figure.autolayout'] = True
+# plt.rcParams['figure.autolayout'] = True
+
+torch.cuda.empty_cache()
+device = torch.device('cpu')

 norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-T = transforms.Compose([transforms.ToTensor(), transforms.Lambda(torch.flatten)])
-train_data = datasets.MNIST(root=f'datasets/MNIST', download=True, transform=T)  # (BS, 784)
+T = transforms.Compose([transforms.ToTensor(), transforms.Lambda(lambda x: torch.cat((x.flatten(), torch.Tensor([1]))))])
+train_data = datasets.MNIST(root=f'datasets/MNIST', download=True, transform=T)  # (BS, 785)
 test_data = datasets.MNIST(root=f'datasets/MNIST', train=False, transform=T)

-BS = 100
-LR = 10
-EPOCHS = 5
+BS: int = 100
+LR: float = 10.
+EPOCHS: int = 5
+SEED: int = 0

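Note on the transform change above: appending a constant 1 to every flattened image makes each input 785-dimensional so the bias can be folded into the weight matrix, which the BGD section below relies on (w becomes 785x100 and wh becomes 101x10, with no separate b / bh). A minimal sketch of the equivalence, with illustrative shapes only (not part of the commit):

import torch

x = torch.randn(100, 784)                              # a batch of flattened images
W, b = torch.randn(784, 100), torch.randn(100)

x1 = torch.cat((x, torch.ones(100, 1)), dim=1)         # append the constant feature -> (100, 785)
Wb = torch.cat((W, b.unsqueeze(0)), dim=0)             # stack the bias as an extra row -> (785, 100)

assert torch.allclose(x @ W + b, x1 @ Wb, atol=1e-5)   # [x, 1] @ [[W], [b]] == x @ W + b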
 # SGD:
-torch.manual_seed(0)
-np.random.seed(0)
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+random.seed(SEED)

-train_loader = DataLoader(train_data, batch_size=BS, shuffle=True, pin_memory=True)
-test_loader = DataLoader(test_data, batch_size=BS, pin_memory=True)
+train_loader = DataLoader(train_data, batch_size=BS, num_workers=0, shuffle=True, pin_memory=True, generator=torch.Generator().manual_seed(SEED))
+test_loader = DataLoader(test_data, batch_size=BS, num_workers=0, pin_memory=True)

-w = torch.randn(784, 100, requires_grad=True)
-b = torch.randn(100, requires_grad=True)
-wh = torch.randn(100, 10, requires_grad=True)
-bh = torch.randn(10, requires_grad=True)
+w = torch.randn(784, 100, requires_grad=True, device=device)
+b = torch.randn(100, requires_grad=True, device=device)
+wh = torch.randn(100, 10, requires_grad=True, device=device)
+bh = torch.randn(10, requires_grad=True, device=device)

 losses_sgd = []
 for _ in trange(EPOCHS):
     loss_batch = []
     for x, y in train_loader:
-        out = torch.softmax(torch.tanh(x @ w + b) @ wh + bh, dim=1)
+        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+        x = x[:, :784]  # drop the appended bias feature; this section keeps the explicit bias vectors b and bh
+        out = torch.tanh(x @ w + b) @ wh + bh  # raw logits
         loss = F.cross_entropy(out, y)
         loss_batch.append(loss.item())
-        grad_w = torch.autograd.grad(loss, w, retain_graph=True)[0]  # (784, 100)
-        grad_b = torch.autograd.grad(loss, b, retain_graph=True)[0]  # (100)
-        grad_wh = torch.autograd.grad(loss, wh, retain_graph=True)[0]  # (100, 10)
-        grad_bh = torch.autograd.grad(loss, bh)[0]  # (10)
-        with torch.no_grad():
-            w -= grad_w * LR
-            b -= grad_b * LR
-            wh -= grad_wh * LR
-            bh -= grad_bh * LR
+        grad_w, grad_b, grad_wh, grad_bh = torch.autograd.grad(loss, (w, b, wh, bh))
+
+        with torch.no_grad():  # computations below are untracked as tensors are treated as "detached" tensors
+            w -= grad_w * LR  # (784, 100)
+            b -= grad_b * LR  # (100)
+            wh -= grad_wh * LR  # (100, 10)
+            bh -= grad_bh * LR  # (10)
+
     losses_sgd.append(np.mean(loss_batch))
 plt.semilogy(losses_sgd, label='SGD')

 acc = []
 with torch.no_grad():
     for x, y in test_loader:
+        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+        x = x[:, :784]  # drop the appended bias feature (see training loop)
         out = torch.tanh(x @ w + b) @ wh + bh
         _, pred = out.max(1)
         acc.append((pred == y).float().mean().item())
 print(f'Accuracy: {np.mean(acc) * 100:.4}%')

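A behavioral fix in this hunk: the old forward pass applied torch.softmax before F.cross_entropy, but F.cross_entropy already applies log_softmax internally, so the outputs were effectively softmaxed twice; the new code passes raw logits. A small sketch of the identity this relies on (illustrative tensors, not part of the commit):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)
y = torch.randint(0, 10, (4,))

a = F.cross_entropy(logits, y)                    # expects raw logits
b = F.nll_loss(F.log_softmax(logits, dim=1), y)   # what cross_entropy computes internally
assert torch.allclose(a, b)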
 ############################################################################################################################################################

 # BGD:
-torch.manual_seed(0)
-np.random.seed(0)
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+random.seed(SEED)
+
+train_loader = DataLoader(train_data, batch_size=BS, num_workers=0, shuffle=True, pin_memory=True, generator=torch.Generator().manual_seed(SEED))
+test_loader = DataLoader(test_data, batch_size=BS, num_workers=0, pin_memory=True)

-train_loader = DataLoader(train_data, batch_size=BS, shuffle=True, pin_memory=True)
-test_loader = DataLoader(test_data, batch_size=BS, pin_memory=True)
+w = torch.randn(785, 100, requires_grad=True, device=device)  # each column corresponds to an output node (neuron) in the network
+wh = torch.randn(101, 10, requires_grad=True, device=device)

-w = torch.randn(784, 100, requires_grad=True)
-b = torch.randn(100, requires_grad=True)
-wh = torch.randn(100, 10, requires_grad=True)
-bh = torch.randn(10, requires_grad=True)
+TH = 0.9  # Threshold value is inversely proportional to the initial learning rate
+EPS = torch.Tensor([1e-11])
+BIAS = torch.ones(100, device=device).unsqueeze(1)  # (BS, 1) column of ones appended to the hidden activations
+SHRINK = torch.tensor(1.1, device=device)
+LRw = torch.ones(100, device=device) * LR  # per output node (neuron) adaptive learning rate (not per parameter/weight)
+LRwh = torch.ones(10, device=device) * LR

-def dist(g1: torch.Tensor, g2: torch.Tensor) -> npt.NDArray[float]:
-    e = 1e-11
+def dist(g1: torch.Tensor, g2: torch.Tensor) -> torch.Tensor:
     flatness_1, flatness_2 = g1.norm().item(), g2.norm().item()
-    dists = np.array([flatness_2, flatness_1])
-    return dists / (dists.sum() + e)
+    dists = torch.Tensor([flatness_2, flatness_1])
+    return dists / (dists.sum() + EPS)
+

+def bounce_update(weight, oracle, weight_gradient, oracle_gradient, weight_LR) -> None:
+    dot_prods: torch.Tensor = torch.einsum("kj,kj->j", weight_gradient, oracle_gradient)  # per-column dot products between current and oracle gradients
+    for i, dot_prod in enumerate(dot_prods):
+        if dot_prod.item() < 0:
+            # print('bounce')
+            d1, d2 = dist(weight_gradient[:, i], oracle_gradient[:, i])
+            if d1.item() > TH:
+                weight_LR[i] /= SHRINK
+
+            weight[:, i] = weight[:, i] * d1 + oracle[:, i] * d2
+
+        else:
+            weight[:, i] = oracle[:, i] - oracle_gradient[:, i] * weight_LR[i]

-TH = 0.9  # TH value is inversely proportional to "ini"
-shrink = 1.1
-LRw = torch.ones(100) * LR
-LRwh = torch.ones(10) * LR

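bounce_update factors the old per-column update loops (previously written out once for w and once for wh) into one helper, with one learning rate per output neuron: the oracle weights are a plain lookahead step, and if the gradient at the oracle point has a negative dot product with the current gradient for that column, the step has jumped over a minimum along that direction, so the new column interpolates the current and oracle columns, each weighted by the other point's share of the total gradient norm (the flatter point dominates); if the coefficient on the current point (d1, the oracle gradient's share) exceeds TH, that column's learning rate is divided by SHRINK. A 1-D sketch of the bounce case on f(w) = w**2, with made-up numbers (not part of the commit):

import torch

w, lr, th = torch.tensor(1.), 10., 0.9
g = 2 * w                                   # gradient at the current point
oracle = w - lr * g                         # lookahead step overshoots the minimum at 0
g_orc = 2 * oracle                          # gradient at the oracle point

assert g * g_orc < 0                        # opposite signs -> a minimum lies between w and oracle
d1 = g_orc.abs() / (g.abs() + g_orc.abs())  # weight on the current point (= 0.95)
d2 = g.abs() / (g.abs() + g_orc.abs())      # weight on the oracle point  (= 0.05)
if d1 > th:
    lr /= 1.1                               # oracle landed in a much steeper region -> shrink this LR
w_new = w * d1 + oracle * d2
print(w_new)                                # ~0: for a 1-D quadratic the interpolation lands on the minimum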
 losses_bgd = []
 for _ in trange(EPOCHS):
     loss_batch = []
     for x, y in train_loader:
-        out = torch.softmax(torch.tanh(x @ w + b) @ wh + bh, dim=1)
+        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+        out = torch.cat(((x @ w).tanh(), BIAS), dim=1) @ wh
         loss = F.cross_entropy(out, y)
         loss_batch.append(loss.item())
-        grad_w = torch.autograd.grad(loss, w, retain_graph=True)[0]  # (784, 100)
-        grad_b = torch.autograd.grad(loss, b, retain_graph=True)[0]  # (100)
-        grad_wh = torch.autograd.grad(loss, wh, retain_graph=True)[0]  # (100, 10)
-        grad_bh = torch.autograd.grad(loss, bh)[0]  # (10)
-
-        # Find oracle weights:
-        oracle_w = w - grad_w * LRw
-        oracle_wh = wh - grad_wh * LRwh
+        loss.backward()
+        grad_w, grad_wh = w.grad, wh.grad

-        # Update bias vectors:
-        with torch.no_grad():
-            b -= grad_b * LR
-            bh -= grad_bh * LR
+        with torch.no_grad():  # computations below are untracked as tensors are treated as "detached" tensors
+            # Find oracle weights:
+            w_oracle = (w - grad_w * LRw).requires_grad_()  # (785, 100)
+            wh_oracle = (wh - grad_wh * LRwh).requires_grad_()  # (101, 10)

-        out = torch.softmax(torch.tanh(x @ oracle_w + b) @ oracle_wh + bh, dim=1)
+        out = torch.cat(((x @ w_oracle).tanh(), BIAS), dim=1) @ wh_oracle
         loss = F.cross_entropy(out, y)
-        grad_orc_w = torch.autograd.grad(loss, oracle_w, retain_graph=True)[0]  # (784, 100)
-        grad_orc_wh = torch.autograd.grad(loss, oracle_wh, retain_graph=True)[0]  # (100, 10)
+        loss.backward()
+        grad_w_orc, grad_wh_orc = w_oracle.grad, wh_oracle.grad

         with torch.no_grad():
-            # Update w:
-            for i, (g, g_orc) in enumerate(zip(grad_w.T, grad_orc_w.T)):  # (100, 784)
-                if g @ g_orc < 0:  # (784)
-                    # print('bounce')
-                    d1, d2 = dist(g, g_orc)
-                    if d1 > TH:
-                        LRw[i] /= shrink
-                    w[:, i] = w[:, i] * d1 + oracle_w[:, i] * d2
-                else:
-                    w[:, i] = oracle_w[:, i] - g_orc * LRw[i]
-
-            # Update wh:
-            for i, (g, g_orc) in enumerate(zip(grad_wh.T, grad_orc_wh.T)):  # (10, 100)
-                if g @ g_orc < 0:  # (100)
-                    # print('bounce')
-                    d1, d2 = dist(g, g_orc)
-                    if d1 > TH:
-                        LRwh[i] /= shrink
-                    wh[:, i] = wh[:, i] * d1 + oracle_wh[:, i] * d2
-                else:
-                    wh[:, i] = oracle_wh[:, i] - g_orc * LRwh[i]
+            bounce_update(w, w_oracle, grad_w, grad_w_orc, LRw)
+            bounce_update(wh, wh_oracle, grad_wh, grad_wh_orc, LRwh)
+
+            # Zero each parameter's gradients to avoid gradient accumulation across iterations:
+            w.grad.zero_()
+            wh.grad.zero_()

     losses_bgd.append(np.mean(loss_batch))
 plt.semilogy(losses_bgd, label='BGD')

 acc = []
 with torch.no_grad():
     for x, y in test_loader:
-        out = torch.tanh(x @ w + b) @ wh + bh
+        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+        out = torch.cat(((x @ w).tanh(), BIAS), dim=1) @ wh
         _, pred = out.max(1)
         acc.append((pred == y).float().mean().item())
 print(f'Accuracy: {np.mean(acc) * 100:.4}%')