This repository has been archived by the owner on Sep 1, 2022. It is now read-only.

many updates to neural networks, layers are made REP-compatible
arogozhnikov committed Apr 16, 2015
1 parent 22ebf4a commit 3b0de88
Showing 2 changed files with 111 additions and 83 deletions.
163 changes: 96 additions & 67 deletions cmsuml/nnet.py
@@ -3,11 +3,12 @@
The neural networks from this library provide the sklearn classifier interface.
"""
from __future__ import print_function, division
from copy import deepcopy

import numpy
import theano
import theano.tensor as T
from sklearn.utils.validation import check_random_state
from sklearn.utils.validation import check_random_state, check_arrays
from sklearn.base import BaseEstimator, ClassifierMixin

from . import utils
@@ -24,31 +25,43 @@


def squared_loss(y, pred, w):
""" Squared loss for classification, not to be messed up with MSE"""
return T.mean(w * (y - T.nnet.sigmoid(pred)) ** 2)


def log_loss(y, pred, w):
""" Logistic loss (aka cross-entropy, aka binomial deviance) """
margin = pred * (1 - 2 * y)
return T.mean(w * T.nnet.softplus(margin))


def ada_loss(y, pred, w):
"""important - ada loss should be used with nnets without sigmoid,
output should be arbitrary real, not [0,1]"""
def exp_loss(y, pred, w):
"""Exponential loss (aka AdaLoss function) """
margin = pred * (1 - 2 * y)
return T.mean(w * T.exp(margin))


def exp_log_loss(y, pred, w):
""" Combines logistic loss for signal and exponential loss for background """
return 2 * log_loss(y, pred, w=w * y) + exp_loss(y, pred, w=w * (1 - y))


# regression loss
def mse_loss(y, pred, w):
return T.mean(w * (y - pred) ** 2)

losses = {'mse_loss': mse_loss,
'exp_loss': exp_loss,
'log_loss': log_loss,
'squared_loss': squared_loss,
'exp_log_loss': exp_log_loss,
}

# endregion
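
As a sanity check on the margin reformulation used above: with margin = pred * (1 - 2 * y), softplus(margin) coincides with the usual cross-entropy of sigmoid(pred), which is exactly what log_loss computes. A standalone NumPy sketch (editorial illustration, not part of the diff):

import numpy

def sigmoid(z):
    return 1.0 / (1.0 + numpy.exp(-z))

y = numpy.array([0, 1, 1, 0])              # binary labels
pred = numpy.array([-2.0, 0.5, 3.0, 1.0])  # raw network outputs
margin = pred * (1 - 2 * y)                # positive margin = wrong side of the decision

# softplus of the margin equals the binomial deviance of sigmoid(pred)
softplus = numpy.log1p(numpy.exp(margin))
cross_entropy = -(y * numpy.log(sigmoid(pred)) + (1 - y) * numpy.log(1 - sigmoid(pred)))
assert numpy.allclose(softplus, cross_entropy)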


# region Trainers
def get_batch(x, y, w, batch=10, random=numpy.random):
def get_batch(x, y, w, random, batch=10):
""" Generates subset of training dataset, of size batch"""
if len(y) > batch:
indices = random.choice(len(x), size=batch)
@@ -59,8 +72,9 @@ def get_batch(x, y, w, batch=10, random=numpy.random):

def sgd_trainer(x, y, w, parameters, derivatives, loss,
stages=1000, batch=10, learning_rate=0.1, l2_penalty=0.001, momentum=0.9,
random=numpy.random, verbose=SILENT):
random=0, verbose=SILENT):
"""Simple gradient descent with backpropagation"""
random = check_random_state(random)
momentums = {name: 0. for name in parameters}

for stage in range(stages):
@@ -75,9 +89,9 @@ def sgd_trainer(x, y, w, parameters, derivatives, loss,
print(loss(x, y, w))

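The elided loop body performs the classical momentum update. A hedged sketch of one stage under the names from the signature above (the exact elided code may handle l2_penalty differently):

# one stage of momentum SGD (sketch; parameters are Theano shared variables)
xp, yp, wp = get_batch(x, y, w, random=random, batch=batch)
for name, param in parameters.items():
    gradient = derivatives[name](xp, yp, wp)
    momentums[name] = momentum * momentums[name] - learning_rate * gradient
    # assumed L2 shrinkage; the real implementation may fold it into the gradient
    param.set_value(param.get_value() * (1.0 - l2_penalty) + momentums[name])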

def irprop_minus_trainer(x, y, w, parameters, derivatives, loss, stages=100,
positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6,
random=numpy.random, verbose=SILENT):
def irprop_minus_trainer(x, y, w, parameters, derivatives, loss,
stages=100, positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6,
random=0, verbose=SILENT):
""" IRPROP- trainer, see http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3428 """
deltas = {name: 1e-3 for name in parameters}
prev_derivatives = {name: 0. for name in parameters}
@@ -100,8 +114,8 @@ def irprop_minus_trainer(x, y, w, parameters, derivatives, loss, stages=100,

def irprop_star_trainer(x, y, w, parameters, derivatives, loss, stages=100,
positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6,
random=numpy.random, verbose=SILENT):
""" IRPROP* trainer (own modification, not recommended) """
random=0, verbose=SILENT):
""" IRPROP* trainer (own modification, not recommended for usage) """
from collections import defaultdict

deltas = defaultdict(lambda: 1e-3)
@@ -126,10 +140,9 @@ def irprop_star_trainer(x, y, w, parameters, derivatives, loss, stages=100,
print(loss(x, y, w))

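For reference, the core IRPROP- rule that these variants build on adapts one step size per weight from the sign of successive gradients only. A hedged standalone sketch of a single parameter update (NumPy; names mirror the trainers above):

# IRPROP- update for one parameter (sketch after the cited Igel & Huesken paper)
sign_change = new_derivative * prev_derivative
delta = numpy.where(sign_change > 0, delta * positive_step, delta)
delta = numpy.where(sign_change < 0, delta * negative_step, delta)
delta = numpy.clip(delta, min_step, max_step)
# where the gradient sign flipped, skip the step this stage
effective_derivative = numpy.where(sign_change < 0, 0.0, new_derivative)
value -= delta * numpy.sign(effective_derivative)
prev_derivative = effective_derivative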

# TODO check this IRPROP implementation
def irprop_plus_trainer(x, y, w, parameters, derivatives, loss, stages=100,
positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6,
random=numpy.random, verbose=SILENT):
random=0, verbose=SILENT):
"""IRPROP+ trainer, see http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.17.1332"""
deltas = dict([(name, 1e-3) for name in parameters])
prev_derivatives = dict([(name, 0.) for name in parameters])
@@ -142,7 +155,7 @@ def irprop_plus_trainer(x, y, w, parameters, derivatives, loss, stages=100,
val = parameters[name].get_value()
delta = deltas[name]
if loss_value > prev_loss_value:
# step back
# step back on those variables where sign was changed
val += numpy.where(new_derivative * old_derivative < 0, delta * numpy.sign(old_derivative), 0)
delta = numpy.where(new_derivative * old_derivative > 0, delta * positive_step, delta * negative_step)
delta = numpy.clip(delta, min_step, max_step)
@@ -157,7 +170,8 @@ def irprop_plus_trainer(x, y, w, parameters, derivatives, loss, stages=100,


def adadelta_trainer(x, y, w, parameters, derivatives, loss, stages=1000, decay_rate=0.95,
epsilon=1e-5, learning_rate=1., batch=1000, random=numpy.random, verbose=SILENT):
epsilon=1e-5, learning_rate=1., batch=1000, random=0, verbose=SILENT):
random = check_random_state(random)
cumulative_derivatives = {name: epsilon for name in parameters}
cumulative_steps = {name: epsilon for name in parameters}

@@ -185,20 +199,29 @@ def adadelta_trainer(x, y, w, parameters, derivatives, loss, stages=1000, decay_
'irprop+': irprop_plus_trainer,
'irprop*': irprop_star_trainer,
'adadelta': adadelta_trainer,
}
}
# endregion
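
All trainers share one calling convention (x, y, w, parameters, derivatives, loss, plus trainer-specific keyword options), which is what makes them interchangeable through the dictionary above. A hedged sketch of how fit presumably dispatches to one of them:

# inside fit (sketch): resolve the trainer by name, hand over the compiled pieces
trainer_function = trainers['irprop-']
trainer_function(X, y, sample_weight,
                 parameters=self.parameters, derivatives=self.derivatives,
                 loss=self.Loss, stages=200)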


# TODO think of dropper and noises
# TODO think about case of vector output

def normalize_weight(y, sample_weight):
""" guarantees that sample weight is numpy.array of shape [n_samples],
moreover normalizes it """
sample_weight = utils.check_sample_weight(y, sample_weight=sample_weight)
sample_weight /= numpy.mean(sample_weight)
return sample_weight
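
For example, weights [1, 1, 2] come out as [0.75, 0.75, 1.5], so the mean weight is exactly 1 (assuming utils.check_sample_weight passes an explicit weight array through unchanged):

sample_weight = normalize_weight(numpy.zeros(3), sample_weight=numpy.array([1.0, 1.0, 2.0]))
# -> array([ 0.75,  0.75,  1.5 ])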


class AbstractNeuralNetworkClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, layers=None, loss=log_loss, trainer='irprop-', trainer_parameters=None, random_state=None):
"""
Constructs the neural network based on Theano (for classification purposes).
Supports only binary classification, supports weights, which makes it usable in boosting.
Works in sklearn fit-predict way: X is [n_samples, n_features], y is [n_samples], sample_weight is [n_samples].
Works as a usual sklearn classifier: can be used in boosting, pickled, etc.
:param layers: list of int, e.g [14, 7, 1] - the number of units in each layer
:param layers: list of int, e.g. [9, 7] - the number of units in each *hidden* layer
:param loss: loss function used (log_loss by default)
:param trainer: string, describes the method
:param trainer_parameters: parameters passed to trainer function (learning_rate, etc., trainer-specific).
@@ -209,38 +232,51 @@ def __init__(self, layers=None, loss=log_loss, trainer='irprop-', trainer_parame
self.parameters = {}
self.derivatives = {}
self.trainer = trainer
self.trainer_parameters = trainer_parameters
self.trainer_parameters = deepcopy(trainer_parameters)
self.random_state = random_state
self.classes_ = numpy.array([0, 1]) # Dirty hack for AdaBoost
self.classes_ = numpy.array([0, 1])

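Since the class follows the sklearn protocol described above, typical usage looks like the hedged sketch below (synthetic data; layers lists hidden-layer sizes only, per the updated docstring, and the module is assumed importable as cmsuml.nnet):

import numpy
from cmsuml.nnet import MultiLayerNetwork

X = numpy.random.normal(size=(1000, 9)).astype('float32')
y = (X[:, 0] + X[:, 1] > 0).astype(int)

clf = MultiLayerNetwork(layers=[7], loss='log_loss', trainer='irprop-')
clf.fit(X, y)
probabilities = clf.predict_proba(X)  # shape [n_samples, 2]
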
def _create_shared_matrix(self, name, n1, n2):
"""Creates a parameter of neural network, which is typically a matrix """
matrix = theano.shared(value=self.random_state.normal(size=[n1, n2]).astype(floatX) * 0.01, name=name)
self.parameters[name] = matrix
return matrix

def prepare(self):
"""This method should provide activation function and set parameters
:return Activation function, f: X -> p,
X of shape [n_events, n_outputs], p of shape [n_events, n_outputs]
X of shape [n_events, n_outputs], p of shape [n_events].
For classification, p is arbitrary real, the greater p, the more event
looks like signal event (label 1).
"""
raise NotImplementedError()

def _prepare(self):
def _prepare(self, n_input_features):
"""This function is called once, it creates the activation function, it's gradient
and initializes the weights"""
self.random_state = check_random_state(self.random_state)
activation = self.prepare()
loss_ = lambda x, y, w: self.loss(y, activation(x).flatten(), w)
self.layers_ = [n_input_features] + self.layers + [1]
self.parameters = {}
activation_raw = self.prepare()
activation = lambda x: activation_raw(x).flatten()
loss_function = losses.get(self.loss, self.loss)
loss_ = lambda x, y, w: loss_function(y, activation(x), w)
x = T.matrix('X')
y = T.vector('y')
w = T.vector('w')
self.Activation = theano.function([x], activation(x).flatten())
self.Activation = theano.function([x], activation(x))
self.Loss = theano.function([x, y, w], loss_(x, y, w))
for name, param in self.parameters.iteritems():
self.derivatives[name] = theano.function([x, y, w], T.grad(loss_(x, y, w), param))
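
The idiom in _prepare is standard Theano: declare symbolic inputs, build a single scalar loss, then compile one callable per quantity of interest. A minimal self-contained sketch of the same pattern:

import numpy
import theano
import theano.tensor as T

x = T.matrix('X')
W = theano.shared(numpy.zeros((3, 1), dtype=theano.config.floatX), name='W')
output = T.nnet.sigmoid(T.dot(x, W)).flatten()
loss = T.mean((output - 1.0) ** 2)

predict = theano.function([x], output)          # analogous to self.Activation
grad_W = theano.function([x], T.grad(loss, W))  # analogous to self.derivatives['W']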

def activate(self, X):
""" Activates NN on particular dataset
:param numpy.array X: of shape [n_samples, n_features] """
:param numpy.array X: of shape [n_samples, n_features]
:return: numpy.array with results of shape [n_samples] """
return self.Activation(X)

def predict_proba(self, X):
"""Computes praobability of each event to belong to a particular class
"""Computes probability of each event to belong to a particular class
:param numpy.array X: of shape [n_samples, n_features]
:return: numpy.array of shape [n_samples, n_classes]
"""
@@ -263,8 +299,7 @@ def compute_loss(self, X, y, sample_weight=None):
:param sample_weight: optional, numpy.array of shape [n_samples],
weights are normalized (so that 'mean' == 1).
:return float, the loss value computed"""
sample_weight = utils.check_sample_weight(y, sample_weight=sample_weight)
sample_weight /= numpy.mean(sample_weight)
sample_weight = normalize_weight(y, sample_weight)
return self.Loss(X, y, sample_weight)

def fit(self, X, y, sample_weight=None, trainer=None, **trainer_parameters):
@@ -276,11 +311,13 @@ def fit(self, X, y, sample_weight=None, trainer=None, **trainer_parameters):
:param trainer: str, method used to minimize loss, overrides one in the ctor
:param trainer_parameters: parameters for this method, override ones in ctor
:return: self """
X, y, sample_weight = check_arrays(X, y, sample_weight)
self.classes_ = numpy.array([0, 1])
assert (numpy.unique(y) == self.classes_).all(), 'only two-class classification supported, labels are 0 and 1'
if not self.prepared:
self._prepare()
self._prepare(X.shape[1])
self.prepared = True
sample_weight = utils.check_sample_weight(y, sample_weight=sample_weight)
sample_weight /= numpy.mean(sample_weight)
sample_weight = normalize_weight(y, sample_weight)

trainer = trainers[self.trainer if trainer is None else trainer]
parameters_ = {} if self.trainer_parameters is None else self.trainer_parameters.copy()
@@ -295,10 +332,9 @@ class SimpleNeuralNetwork(AbstractNeuralNetworkClassifier):
"""The most simple NN with one hidden layer (sigmoid activation), for example purposes """

def prepare(self):
n1, n2, n3 = self.layers
W1 = theano.shared(value=self.random_state.normal(size=[n1, n2]).astype(floatX), name='W1')
W2 = theano.shared(value=self.random_state.normal(size=[n2, n3]).astype(floatX), name='W2')
self.parameters = {'W1': W1, 'W2': W2}
n1, n2, n3 = self.layers_
W1 = self._create_shared_matrix('W1', n1, n2)
W2 = self._create_shared_matrix('W2', n2, n3)

def activation(input):
first = T.nnet.sigmoid(T.dot(input, W1))
@@ -308,30 +344,27 @@ def activation(input):


class MultiLayerNetwork(AbstractNeuralNetworkClassifier):
"""Supports arbitrary number of layers (sigmoid activation each)."""
"""Supports arbitrary number of layers (sigmoid activation each).
aka MLP (MultiLayerPerceptron)"""

def prepare(self):
activations = [lambda x: x]
for i, layer in list(enumerate(self.layers))[1:]:
W = theano.shared(value=self.random_state.normal(size=[self.layers[i - 1], self.layers[i]]),
name='W' + str(i))
self.parameters[i] = W
# j = i trick is to avoid lambda-capturing of i
pred_activation = lambda x, j=i: T.dot(activations[j - 1](x), self.parameters[j])
activations.append(lambda x, j=i: T.tanh(pred_activation(x, j)))
return pred_activation
activation = lambda x: x
for i, layer in list(enumerate(self.layers_))[1:]:
W = self._create_shared_matrix('W' + str(i), self.layers_[i - 1], self.layers_[i])
# act=activation and W_=W are tricks to avoid lambda-capturing
activation = lambda x, act=activation, W_=W: T.tanh(T.dot(act(x), W_))
return activation
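
The default-argument trick in the loop above matters because Python closures capture variables, not values: without act=activation and W_=W, every layer would end up calling the objects from the last iteration. A tiny standalone illustration:

# closures look up i when called, not when defined
fs_late = [lambda x: x + i for i in range(3)]
fs_bound = [lambda x, i=i: x + i for i in range(3)]
print([f(0) for f in fs_late])   # [2, 2, 2] - all share the final i
print([f(0) for f in fs_bound])  # [0, 1, 2] - defaults freeze each i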


class RBFNeuralNetwork(AbstractNeuralNetworkClassifier):
"""One hidden layer with normalized RBF activation (Radial Basis Function)"""

def prepare(self):
n1, n2, n3 = self.layers
W1 = theano.shared(value=self.random_state.normal(size=[n1, n2]).astype(floatX), name='W1')
W2 = theano.shared(value=self.random_state.normal(size=[n2, n3]).astype(floatX), name='W2')
n1, n2, n3 = self.layers_
W1 = self._create_shared_matrix('W1', n1, n2)
W2 = self._create_shared_matrix('W2', n2, n3)
# this parameter is responsible for scaling; it is optimized during training as well
G = theano.shared(value=0.1, name='G')
self.parameters = {'W1': W1, 'W2': W2, 'G': G}
self.parameters['G'] = G

def activation(input):
translation_vectors = W1.reshape((1, W1.shape[0], -1)) - input.reshape((input.shape[0], 1, -1))
@@ -346,10 +379,9 @@ class SoftmaxNeuralNetwork(AbstractNeuralNetworkClassifier):
"""One hidden layer, softmax activation function """

def prepare(self):
n1, n2, n3 = self.layers
W1 = theano.shared(value=self.random_state.normal(size=[n1, n2]).astype(floatX), name='W1')
W2 = theano.shared(value=self.random_state.normal(size=[n2, n3]).astype(floatX), name='W2')
self.parameters = {'W1': W1, 'W2': W2}
n1, n2, n3 = self.layers_
W1 = self._create_shared_matrix('W1', n1, n2)
W2 = self._create_shared_matrix('W2', n2, n3)

def activation(input):
first = T.nnet.softmax(T.dot(input, W1))
@@ -362,10 +394,9 @@ class PairwiseNeuralNetwork(AbstractNeuralNetworkClassifier):
"""The result is computed as h = sigmoid(Ax), output = sum_{ij} B_ij h_i (1 - h_j) """

def prepare(self):
n1, n2, n3 = self.layers
W1 = theano.shared(value=self.random_state.normal(size=[n1, n2]).astype(floatX) * 0.01, name='W1')
W2 = theano.shared(value=self.random_state.normal(size=[n2, n2]).astype(floatX) * 0.01, name='W2')
self.parameters = {'W1': W1, 'W2': W2}
n1, n2, n3 = self.layers_
W1 = self._create_shared_matrix('W1', n1, n2)
W2 = self._create_shared_matrix('W2', n2, n2)

def activation(input):
first = T.nnet.sigmoid(T.dot(input, W1))
@@ -378,10 +409,9 @@ class PairwiseSoftplusNeuralNetwork(AbstractNeuralNetworkClassifier):
"""The result is computed as h = sigmoid(Ax), output = sum_{ij} B_ij h_i (1 - h_j) """

def prepare(self):
n1, n2, n3 = self.layers
W1 = theano.shared(value=self.random_state.normal(size=[n1, n2]).astype(floatX) * 0.01, name='W1')
W2 = theano.shared(value=self.random_state.normal(size=[n2, n2]).astype(floatX) * 0.01, name='W2')
self.parameters = {'W1': W1, 'W2': W2}
n1, n2, n3 = self.layers_
W1 = self._create_shared_matrix('W1', n1, n2)
W2 = self._create_shared_matrix('W2', n2, n2)

def activation(input):
z = T.dot(input, W1)
@@ -395,11 +425,10 @@ def activation(input):
class ObliviousNeuralNetwork(AbstractNeuralNetworkClassifier):
""" Uses idea of oblivious trees,
but not strict cuts on features and not rectangular cuts, but linear conditions
TODO: needs pretraining, like oblivious tree first
"""

"""
# TODO: needs pretraining, like oblivious tree first
def prepare(self):
n1, n2, n3 = self.layers
n1, n2, n3 = self.layers_
W1 = theano.shared(value=self.random_state.normal(size=[n1, n2]).astype(floatX), name='W1')
W2 = theano.shared(value=self.random_state.normal(size=[2] * n2).astype(floatX), name='W2')
self.parameters = {'W1': W1, 'W2': W2}