# AdaBoostTrees/boost.py at master - simsicon/AdaBoostTrees - GitHub
from __future__ import division

import random
import time
import numpy as np
from collections import Counter

from tree import Tree

class Boosting:
    """AdaBoost-style ensemble of ``Tree`` classifiers.

    Each round draws ``n_samples`` training rows (with replacement) according
    to the current per-row weights, fits a ``Tree`` on that resample, and then
    raises the weight of the rows that tree misclassified.  Prediction is a
    vote across all trees in which each tree's vote counts ``alpha`` times.

    Attributes:
        X, y: training features and labels exactly as passed in.
        n_estimators: number of boosting rounds run by ``train``.
        n_samples: number of rows drawn to fit each tree.
        N: number of training rows (``y.shape[0]``).
        weights: per-row sampling distribution; always sums to 1.
        alphas: vote weight of each trained tree, aligned with ``estimators``.
        estimators: trained trees, or ``None`` before any training.
        count: 1-based index of the round in progress (progress output only).
    """

    # Keeps the weighted error strictly inside (0, 1) so that a perfect (or
    # perfectly wrong) tree yields a large finite alpha instead of inf/NaN.
    _EPSILON_FLOOR = 1e-10

    def __init__(self, X, y, n_estimators=10, n_samples=1024):
        """Store the training data and start from uniform row weights.

        Args:
            X: training features; must expose ``.shape`` and accept indexing
                by an array of row indices (assumed to be a 2-D numpy array).
            y: training labels; must expose ``.shape`` with one entry per row
                of ``X`` (assumed to be a 1-D numpy array).
            n_estimators: number of trees to train.
            n_samples: number of rows resampled for each tree.
        """
        self.X = X
        self.y = y

        self.n_estimators = n_estimators
        self.n_samples = n_samples
        self.N = self.y.shape[0]
        # 1.0 (not 1) so the result is a float even without true division.
        self.weights = np.full(self.N, 1.0 / self.N)
        self.alphas = []
        self.estimators = None
        self.count = 0

    def init_estimator(self):
        """Fit one tree on a weighted resample and add it to the ensemble.

        A tree whose unweighted accuracy on the full training set is exactly
        0.5 is rejected and a new one is fitted.  The resample is redrawn on
        every attempt so that a retry actually sees different data.

        Returns:
            ``(tree, predictions)`` where ``predictions`` is the accepted
            tree's output for the full training set ``self.X``.
        """
        if self.estimators is None:
            self.estimators = []

        print("%s / %s" % (self.count, self.n_estimators))

        while True:
            indices = np.random.choice(self.X.shape[0], self.n_samples,
                                       p=self.weights)
            X_tree = self.X[indices]
            y_tree = self.y[indices]

            t1 = time.time()
            tree = Tree(X_tree, y_tree)
            t2 = time.time()

            print("tree generation time: %s" % (t2 - t1))

            predictions = tree.predict(self.X)
            # Fraction of training rows predicted correctly.
            accuracy = np.mean(np.ravel(predictions) == np.ravel(self.y))
            print("accuracy: %s" % accuracy)
            if accuracy != 0.50:
                self.estimators.append(tree)
                break

        return tree, predictions

    def train(self):
        """Run ``n_estimators`` boosting rounds from a clean state.

        Any previously trained trees, their alphas and the row weights are
        discarded first, so ``estimators`` and ``alphas`` always stay aligned
        even when ``train`` is called more than once.
        """
        self.count = 0
        self.estimators = []
        self.alphas = []
        self.weights = np.full(self.N, 1.0 / self.N)

        y_true = np.ravel(self.y)
        t1 = time.time()
        for _ in range(self.n_estimators):
            self.count += 1

            estimator, y_pred = self.init_estimator()

            errors = np.ravel(y_pred) != y_true
            # Weighted error rate of this round's tree.
            epsilon = float(np.dot(errors, self.weights))
            print("epsilon: %s" % epsilon)

            epsilon = min(max(epsilon, self._EPSILON_FLOOR),
                          1.0 - self._EPSILON_FLOOR)
            alpha = 0.5 * np.log((1 - epsilon) / epsilon)

            # +1 where the tree was right, -1 where it was wrong: wrong rows
            # are scaled by exp(alpha), right rows by exp(-alpha).
            agreements = np.where(errors, -1.0, 1.0)
            updated = self.weights * np.exp(-alpha * agreements)
            # Dividing by the actual sum equals the analytic normaliser
            # 2 * sqrt(epsilon * (1 - epsilon)) but is immune to float drift
            # and to the clipping above; np.random.choice needs sum(p) == 1.
            self.weights = updated / updated.sum()
            print("weights sum: %s" % self.weights.sum())
            self.alphas.append(alpha)
        t2 = time.time()
        print("train took %s s" % (t2 - t1))

    def predict(self, X):
        """Predict one label per row of ``X`` by alpha-weighted vote.

        Args:
            X: feature rows, in whatever form ``Tree.predict`` accepts.

        Returns:
            A list with one label per element of each tree's prediction
            (assumed to be one per row of ``X``).

        Raises:
            ValueError: if no tree has been trained yet.
        """
        if not self.estimators:
            raise ValueError("Boosting.predict() called before train()")

        per_tree = [estimator.predict(X) for estimator in self.estimators]
        # zip(*per_tree) regroups "per tree" predictions into "per row" votes.
        return [self.weighted_majority_vote(list(zip(row_votes, self.alphas)))
                for row_votes in zip(*per_tree)]

    def weighted_majority_vote(self, h):
        """Return the label with the greatest total weight.

        Args:
            h: iterable of ``(label, weight)`` pairs.

        Returns:
            The label whose weights sum highest, also when every total is
            zero or negative.  ``0`` is returned only for an empty ``h``.
        """
        weighted_vote = {}
        for label, weight in h:
            weighted_vote[label] = weighted_vote.get(label, 0) + weight

        if not weighted_vote:
            return 0
        return max(weighted_vote, key=weighted_vote.get)