diff --git a/challenges/3.7 challenge.py b/challenges/3.7 challenge.py
new file mode 100644
index 0000000..c7d3a98
--- /dev/null
+++ b/challenges/3.7 challenge.py
@@ -0,0 +1,154 @@
+import numpy as np
+import pandas as pd
+
+#=== Bonus Challenge ============================================================================================
+# Add validation monitoring to the network you implemented in challenge 3.6
+# The fit method should accept Xval and yval, and print the progress of both train and validation accuracy and cross entropy
+
+def softmax(x):
+    """
+    Calculate row-wise softmax
+
+    :param x: 2d array where (i,j) gives the jth input value for the ith sample
+    :return: 2d array with the same shape as the input, with softmax applied to each row-vector
+             As a result, the elements in each row can be interpreted as probabilities that sum to one
+    """
+
+    return np.exp(x)/np.sum(np.exp(x), axis=1)[:, None]
+
+
+def one_hot(x):
+    """
+    One-hot-encode an array
+
+    :param x: 1d array where element i gives the true label for sample i
+    :return: tuple of (onehot, classes) where:
+             - onehot is a NxK array where N = len(x), K = len(np.unique(x)) and
+               element (i,j) = 1 if x[i] == np.unique(x)[j], 0 otherwise
+             - classes is a 1d array of classes corresponding to the columns of onehot
+    """
+
+    classes, inverse = np.unique(x, return_inverse=True)
+    onehot = np.eye(classes.shape[0], dtype='int64')[inverse]
+    return (onehot, classes)
+
+
+def cross_entropy(Yhat, Y):
+    """
+    Calculate row-wise cross entropy
+
+    :param Yhat: NxK array where (i,j) gives the predicted probability of class j for sample i
+    :param Y: either:
+              1) NxK array where (i,j) gives the true probability of class j for sample i or
+              2) a 1-D array where element i gives the index of the true class for sample i
+    :return: 1-D array with N elements, where element i gives the cross entropy for the ith sample
+    """
+
+    if Y.ndim == 1:
+        ce = -np.log(Yhat[np.arange(len(Y)), Y])
+    else:
+        ce = -np.sum(Y * np.log(Yhat), axis=1)
+
+    return ce
+
+
+def logistic(x):
+    """
+    standard logistic function
+
+    uses the identity: 1/(1 + e^(-x)) = e^x/(e^x + 1)
+    to prevent double precision issues when x is a big negative number
+
+    :param x: numpy array
+    :return: 1/(1 + e^(-x))
+    """
+
+    mask = x > 0
+    y = np.full(shape=x.shape, fill_value=np.nan)
+    y[mask] = 1 / (1 + np.exp(-(x[mask])))
+    y[~mask] = np.exp(x[~mask]) / (np.exp(x[~mask]) + 1)
+    return y
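+
+
+# A tiny worked example of the helpers above (illustrative only, values rounded;
+# this comment block is not part of the original challenge):
+#
+#   softmax(np.array([[0.0, np.log(3.0)]]))                 -> array([[0.25, 0.75]])
+#   cross_entropy(np.array([[0.25, 0.75]]), np.array([1]))  -> array([0.28768207])
+#
+# That is, putting 75% probability on the true class costs -log(0.75) ~= 0.288.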
+
+
+class NNet():
+    """
+    NNet with stochastic gradient descent and validation loss monitoring
+    """
+
+    def __init__(self, Ws=None, y_classes=None):
+        """
+        Initialization
+
+        :param Ws: optional list of weight matrices (list of 2-D numpy arrays)
+        :param y_classes: optional array of y_classes (1-D numpy array with >= 2 elements)
+        """
+
+        self.Ws = Ws
+        self.y_classes = y_classes
+
+    def fit(self, X, y, hiddenNodes, Xval=None, yval=None, stepSize=0.01, ITERS=100, batchSize=None, seed=None):
+        """
+        Find the best weights via stochastic gradient descent
+
+        :param X: training features
+        :param y: training labels. 1-d array with >= 2 classes
+        :param hiddenNodes: list indicating how many nodes to use in each hidden layer, excluding bias nodes
+        :param Xval: optional validation features
+        :param yval: optional validation labels. 1-d array with >= 2 classes
+        :param stepSize: AKA "learning rate" AKA "alpha" used in gradient descent
+        :param ITERS: How many passes (epochs) to make over the training data?
+        :param batchSize: How many samples to use per batch? If None, use all samples
+        :param seed: optional random seed for reproducible batching and weight initialization
+        :return: None. Updates self.y_classes and self.Ws
+        """
+
+        # Validate X dimensionality
+        if X.ndim != 2:
+            raise AssertionError(f"X should have 2 dimensions but it has {X.ndim}")
+
+        # Validate hiddenNodes type
+        if not isinstance(hiddenNodes, list):
+            raise AssertionError("hiddenNodes should be a list of integers")
+
+        # Determine unique y classes
+        y01, y_classes = one_hot(y)
+        if len(y_classes) < 2:
+            raise AssertionError(f"y should have at least 2 distinct classes, but instead it has {len(y_classes)}")
+
+        pass
+
+    def predict(self, X, type='classes', Ws=None, y_classes=None):
+        """
+        Predict on X
+
+        :param X: 2-D array with >= 1 column of real-valued features
+        :param type: If 'classes', predicted classes, else if 'probs', predicted class probabilities
+        :param Ws: list of 2-D arrays (weight matrices). If None, use self.Ws
+        :param y_classes: numpy array of y classes. If None, use self.y_classes
+        :return: if type = 'probs' then probabilities else if type = 'classes' then classes
+        """
+
+        pass
+
+#=== Test ============================================================================================
+
+# Load MNIST images data
+mnist_train = pd.read_csv("https://raw.githubusercontent.com/ben519/nnets-for-your-dog/master/data/mnist_train.csv")
+mnist_test = pd.read_csv("https://raw.githubusercontent.com/ben519/nnets-for-your-dog/master/data/mnist_test.csv")
+
+# Split train into train and validation
+gen = np.random.default_rng(seed = 1234)
+val_ids, train_ids = np.split(gen.choice(len(mnist_train), len(mnist_train), replace=False), [5000])
+
+# Initialize & fit neural network
+nn = NNet()
+nn.fit(
+    X = mnist_train.iloc[train_ids].drop(columns='label').to_numpy(),
+    y = mnist_train.iloc[train_ids].label.to_numpy(),
+    Xval = mnist_train.iloc[val_ids].drop(columns='label').to_numpy(),
+    yval = mnist_train.iloc[val_ids].label.to_numpy(),
+    hiddenNodes = [50, 20, 8],
+    stepSize = 0.1,
+    batchSize = 100,
+    ITERS = 200,
+    seed = 0
+)
\ No newline at end of file
diff --git a/challenges/3.7 solution.py b/challenges/3.7 solution.py
new file mode 100644
index 0000000..46f2689
--- /dev/null
+++ b/challenges/3.7 solution.py
@@ -0,0 +1,273 @@
+import numpy as np
+import pandas as pd
+
+#=== Bonus Challenge ============================================================================================
+# Add validation monitoring to the network you implemented in challenge 3.6
+# The fit method should accept Xval and yval, and print the progress of both train and validation accuracy and cross entropy
+
+def softmax(x):
+    """
+    Calculate row-wise softmax
+
+    :param x: 2d array where (i,j) gives the jth input value for the ith sample
+    :return: 2d array with the same shape as the input, with softmax applied to each row-vector
+             As a result, the elements in each row can be interpreted as probabilities that sum to one
+    """
+
+    return np.exp(x)/np.sum(np.exp(x), axis=1)[:, None]
+
+
+def one_hot(x):
+    """
+    One-hot-encode an array
+
+    :param x: 1d array where element i gives the true label for sample i
+    :return: tuple of (onehot, classes) where:
+             - onehot is a NxK array where N = len(x), K = len(np.unique(x)) and
+               element (i,j) = 1 if x[i] == np.unique(x)[j], 0 otherwise
+             - classes is a 1d array of classes corresponding to the columns of onehot
+    """
+
+    classes, inverse = np.unique(x, return_inverse=True)
+    onehot = np.eye(classes.shape[0], dtype='int64')[inverse]
+    return (onehot, classes)
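+
+
+# Example (illustrative; not part of the original solution):
+#
+#   onehot, classes = one_hot(np.array(['dog', 'cat', 'dog']))
+#   classes  ->  array(['cat', 'dog'], dtype='<U3')
+#   onehot   ->  array([[0, 1],
+#                       [1, 0],
+#                       [0, 1]])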
+
+
+def cross_entropy(Yhat, Y):
+    """
+    Calculate row-wise cross entropy
+
+    :param Yhat: NxK array where (i,j) gives the predicted probability of class j for sample i
+    :param Y: either:
+              1) NxK array where (i,j) gives the true probability of class j for sample i or
+              2) a 1-D array where element i gives the index of the true class for sample i
+    :return: 1-D array with N elements, where element i gives the cross entropy for the ith sample
+    """
+
+    if Y.ndim == 1:
+        ce = -np.log(Yhat[np.arange(len(Y)), Y])
+    else:
+        ce = -np.sum(Y * np.log(Yhat), axis=1)
+
+    return ce
+
+
+def logistic(x):
+    """
+    standard logistic function
+
+    uses the identity: 1/(1 + e^(-x)) = e^x/(e^x + 1)
+    to prevent double precision issues when x is a big negative number
+
+    :param x: numpy array
+    :return: 1/(1 + e^(-x))
+    """
+
+    mask = x > 0
+    y = np.full(shape=x.shape, fill_value=np.nan)
+    y[mask] = 1 / (1 + np.exp(-(x[mask])))
+    y[~mask] = np.exp(x[~mask]) / (np.exp(x[~mask]) + 1)
+    return y
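+
+
+# Why the masking matters (an illustrative note, not part of the original solution):
+#
+#   logistic(np.array([-1000., 0., 1000.]))  ->  array([0. , 0.5, 1. ])
+#
+# A naive 1/(1 + np.exp(-x)) triggers a float64 overflow warning at x = -1000,
+# because it evaluates np.exp(1000); the identity above never exponentiates a
+# large positive number.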
+
+
+class NNet():
+    """
+    NNet with stochastic gradient descent and validation loss monitoring
+    """
+
+    def __init__(self, Ws=None, y_classes=None):
+        """
+        Initialization
+
+        :param Ws: optional list of weight matrices (list of 2-D numpy arrays)
+        :param y_classes: optional array of y_classes (1-D numpy array with >= 2 elements)
+        """
+
+        self.Ws = Ws
+        self.y_classes = y_classes
+
+    def fit(self, X, y, hiddenNodes, Xval=None, yval=None, stepSize=0.01, ITERS=100, batchSize=None, seed=None):
+        """
+        Find the best weights via stochastic gradient descent
+
+        :param X: training features
+        :param y: training labels. 1-d array with >= 2 classes
+        :param hiddenNodes: list indicating how many nodes to use in each hidden layer, excluding bias nodes
+        :param Xval: optional validation features
+        :param yval: optional validation labels. 1-d array with >= 2 classes
+        :param stepSize: AKA "learning rate" AKA "alpha" used in gradient descent
+        :param ITERS: How many passes (epochs) to make over the training data?
+        :param batchSize: How many samples to use per batch? If None, use all samples
+        :param seed: optional random seed for reproducible batching and weight initialization
+        :return: None. Updates self.y_classes and self.Ws
+        """
+
+        # Validate X dimensionality
+        if X.ndim != 2:
+            raise AssertionError(f"X should have 2 dimensions but it has {X.ndim}")
+
+        # Validate hiddenNodes type
+        if not isinstance(hiddenNodes, list):
+            raise AssertionError("hiddenNodes should be a list of integers")
+
+        # Determine unique y classes
+        y01, y_classes = one_hot(y)
+        if len(y_classes) < 2:
+            raise AssertionError(f"y should have at least 2 distinct classes, but instead it has {len(y_classes)}")
+
+        if yval is not None:
+            y01_val, yval_classes = one_hot(yval)
+
+        # Initialization (note Ws is a list of weight matrices)
+        gen = np.random.default_rng(seed)
+        X1 = np.insert(X / 255, obj=X.shape[1], values=1, axis=1)
+        Ws = [None] * (len(hiddenNodes) + 1)
+        Ws[0] = gen.uniform(low=-1, high=1, size=(X1.shape[1], hiddenNodes[0]))
+        for i in range(1, len(hiddenNodes)):
+            Ws[i] = gen.uniform(low=-1, high=1, size=(hiddenNodes[i - 1] + 1, hiddenNodes[i]))
+        Ws[-1] = gen.uniform(low=-1, high=1, size=(hiddenNodes[-1] + 1, len(y_classes)))
+
+        # Initialize lists to store Xs, Zs, and gradients
+        Zs = [None] * len(Ws)
+        Xs = [None] * len(Ws)
+        gradWs = [None] * len(Ws)
+
+        # Determine number of batches
+        if batchSize is None:
+            Nbatches = 1
+        else:
+            Nbatches = np.ceil(X1.shape[0]/batchSize).astype('int64')
+
+        # Initialize lists to store performance stats
+        CE_train_list = []
+        Accuracy_train_list = []
+        CE_validation_list = []
+        Accuracy_validation_list = []
+
+        # Train
+        for i in range(ITERS):
+
+            # mini batches
+            idxs = gen.choice(X1.shape[0], size=X1.shape[0], replace=False)
+            batches = np.array_split(idxs, Nbatches)
+
+            # Loop over batches
+            for b in range(Nbatches):
+                batch_idxs = batches[b]
+                Xs[0] = X1[batch_idxs]
+
+                # Make predictions (forward pass)
+                for j in range(len(Ws)):
+                    Zs[j] = Xs[j] @ Ws[j]
+                    if j + 1 < len(Xs):
+                        Xs[j + 1] = np.insert(logistic(Zs[j]), obj=Zs[j].shape[1], values=1, axis=1)
+                yhat_probs = softmax(Zs[-1])
+
+                if b == Nbatches - 1:
+                    # Calculate training cross entropy loss, accuracy
+                    yhat_probs_train = self.predict(X, type='probs', Ws=Ws, y_classes=y_classes)
+                    yhat_classes_train = y_classes[np.argmax(yhat_probs_train, axis=1)]
+                    ce_train = cross_entropy(yhat_probs_train, y01)
+                    CE_train = np.mean(ce_train)
+                    accuracy_train = np.mean(yhat_classes_train == y)
+
+                    CE_train_list.append(CE_train)
+                    Accuracy_train_list.append(accuracy_train)
+
+                    if Xval is None or yval is None:
+                        print(f'iteration: {i}, train cross entropy loss: {CE_train}, train accuracy: {accuracy_train}')
+                    else:
+                        # Calculate validation cross entropy loss, accuracy
+                        yhat_probs_val = self.predict(Xval, type='probs', Ws=Ws, y_classes=y_classes)
+                        yhat_classes_val = y_classes[np.argmax(yhat_probs_val, axis=1)]
+                        ce_val = cross_entropy(yhat_probs_val, y01_val)
+                        CE_val = np.mean(ce_val)
+                        accuracy_val = np.mean(yhat_classes_val == yval)
+
+                        CE_validation_list.append(CE_val)
+                        Accuracy_validation_list.append(accuracy_val)
+
+                        print(f'iteration: {i}, '
+                              f'train cross entropy loss: {CE_train}, validation cross entropy loss: {CE_val}, '
+                              f'train accuracy: {accuracy_train}, validation accuracy: {accuracy_val}')
+
+                # Calculate gradients (backward pass)
+                gradZ = (yhat_probs - y01[batch_idxs])[:, None, :]
+                for j in range(len(Ws) - 1, -1, -1):
+                    gradWs[j] = np.transpose(Xs[j][:, None, :], axes=[0, 2, 1]) @ gradZ
+                    gradWs[j] = gradWs[j].mean(axis=0)
+                    gradX = (gradZ @ np.transpose(Ws[j]))[:, :, :-1]
+                    gradZ = gradX * (Xs[j] * (1 - Xs[j]))[:, None, :-1]
+
+                # Update weights (gradient step)
+                for j in range(len(Ws)):
+                    Ws[j] -= gradWs[j] * stepSize
+
+        # Update class vars
+        self.y_classes = y_classes
+        self.Ws = Ws
+        self.CE_train_list = CE_train_list
+        self.Accuracy_train_list = Accuracy_train_list
+        self.CE_validation_list = CE_validation_list
+        self.Accuracy_validation_list = Accuracy_validation_list
+
+    def predict(self, X, type='classes', Ws=None, y_classes=None):
+        """
+        Predict on X
+
+        :param X: 2-D array with >= 1 column of real-valued features
+        :param type: If 'classes', predicted classes, else if 'probs', predicted class probabilities
+        :param Ws: list of 2-D arrays (weight matrices). If None, use self.Ws
+        :param y_classes: numpy array of y classes. If None, use self.y_classes
+        :return: if type = 'probs' then probabilities else if type = 'classes' then classes
+        """
+
+        if Ws is None:
+            Ws = self.Ws
+        if y_classes is None:
+            y_classes = self.y_classes
+
+        if Ws is None:
+            raise AssertionError("Need to fit() before predict()")
+        if X.ndim != 2:
+            raise AssertionError(f"X should have 2 dimensions but it has {X.ndim}")
+        if X.shape[1] != len(Ws[0]) - 1:
+            raise AssertionError(
+                f"Network was fit on X with {len(Ws[0]) - 1} columns but this X has {X.shape[1]} columns")
+
+        # Make predictions (forward pass)
+        X1 = np.insert(X / 255, obj=X.shape[1], values=1, axis=1)
+        for j in range(len(Ws)):
+            Z = X1 @ Ws[j]
+            if j < len(Ws) - 1:
+                X1 = np.insert(logistic(Z), obj=Z.shape[1], values=1, axis=1)
+        yhat_probs = softmax(Z)
+
+        if type == 'probs':
+            return yhat_probs
+        elif type == 'classes':
+            yhat_classes = y_classes[np.argmax(yhat_probs, axis=1)]
+            return yhat_classes
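+
+# Shape check (a worked example, not part of the original solution): with MNIST's
+# 784 pixel columns, hiddenNodes=[50, 20, 8], and 10 digit classes, the
+# initialization in fit() produces weight matrices of these shapes (each layer's
+# input dimension is +1 for the appended bias column):
+#
+#   Ws[0]: (785, 50)    Ws[1]: (51, 20)    Ws[2]: (21, 8)    Ws[3]: (9, 10)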
+
+#=== Test ============================================================================================
+
+# Load MNIST images data
+mnist_train = pd.read_csv("https://raw.githubusercontent.com/ben519/nnets-for-your-dog/master/data/mnist_train.csv")
+mnist_test = pd.read_csv("https://raw.githubusercontent.com/ben519/nnets-for-your-dog/master/data/mnist_test.csv")
+
+# Split train into train and validation
+gen = np.random.default_rng(seed = 1234)
+val_ids, train_ids = np.split(gen.choice(len(mnist_train), len(mnist_train), replace=False), [5000])
+
+# Initialize & fit neural network
+nn = NNet()
+nn.fit(
+    X = mnist_train.iloc[train_ids].drop(columns='label').to_numpy(),
+    y = mnist_train.iloc[train_ids].label.to_numpy(),
+    Xval = mnist_train.iloc[val_ids].drop(columns='label').to_numpy(),
+    yval = mnist_train.iloc[val_ids].label.to_numpy(),
+    hiddenNodes = [50, 20, 8],
+    stepSize = 0.1,
+    batchSize = 100,
+    ITERS = 200,
+    seed = 0
+)
\ No newline at end of file