
Commit 1ab4b07

Author: schalkdaniel (committed)
#88 text + figure for intercept handling
1 parent 1bf1fe9 commit 1ab4b07

File tree

3 files changed: +141 -102 lines changed

Binary figure file (122 KB)
+109
@@ -0,0 +1,109 @@
# ------------------------------------------------------------------------------
# FIG: COMPBOOST INTERCEPT HANDLING
# ------------------------------------------------------------------------------

# Purpose: visualize how the estimated parameters evolve with and without an
# explicit intercept base learner

if (FALSE) {
  devtools::install_github("schalkdaniel/compboost", ref = "dev")
  install.packages("AmesHousing")
}

# HELPER -------------------------------------------------------------------------

# Aggregate a coefficient list (as returned by getCoef()) into one named vector of
# slopes plus a single global intercept:
# - If an intercept base learner is present, the linear base learners only carry a slope.
# - Otherwise, the per-learner intercepts are summed into the global intercept.
aggregator = function(cf) {
  bnames = names(cf)
  bnamesl = bnames[grepl("linear", bnames)]
  iidx = which(grepl("intercept", bnames))
  intercept = cf[["offset"]]
  if (length(iidx) == 0) {
    intercept = intercept + sum(vapply(bnamesl, FUN.VALUE = numeric(1), FUN = function(bn) cf[[bn]][1]))
    cfo = vapply(bnamesl, function(bn) cf[[bn]][2], numeric(1))
  } else {
    cfo = vapply(bnamesl, function(bn) cf[[bn]][1], numeric(1))
    intercept = intercept + cf[[iidx]]
  }
  names(cfo) = bnamesl
  cfo["Intercept"] = intercept
  return(cfo)
}

# Combine the aggregated coefficients of both models into one data frame.
intInDF = function(cf_int, cf_noint) {
  cf1 = aggregator(cf_int)
  cf2 = aggregator(cf_noint)

  bln = names(cf1)
  if (! all(names(cf1) %in% names(cf2))) stop("Names are not equal")
  dfout = data.frame(bl = bln, with_intercept = unlist(cf1)[bln], without_intercept = unlist(cf2)[bln])
  rownames(dfout) = NULL
  return(dfout)
}

# DATA -------------------------------------------------------------------------

# Get data
dat = as.data.frame(AmesHousing::make_ames())

# TRAINING ---------------------------------------------------------------------

library(compboost)

# Instantiate & train two models:
# - cboost_int:   one intercept base learner; all other base learners are linear without intercept
# - cboost_noint: linear base learners that each contain an intercept
n_iters = 10000L

fnames = c("Fireplaces", "Lot_Frontage", "Lot_Area", "Wood_Deck_SF")

# MODEL WITH INTERCEPT BL:
cboost_int = Compboost$new(data = dat, target = "Sale_Price", learning_rate = 0.01,
  loss = LossQuadratic$new())

cboost_int$addIntercept()

invisible(lapply(fnames, function(fn) {
  cboost_int$addBaselearner(fn, "linear", BaselearnerPolynomial, intercept = FALSE)
}))

cboost_int$train(n_iters, trace = 0)

# MODEL WITHOUT INTERCEPT BL:
cboost_noint = Compboost$new(data = dat, target = "Sale_Price", learning_rate = 0.01,
  loss = LossQuadratic$new())

invisible(lapply(fnames, function(fn) {
  cboost_noint$addBaselearner(fn, "linear", BaselearnerPolynomial)
}))

cboost_noint$train(n_iters, trace = 0)

# EXTRACT COEFFICIENTS -------------------------------------------------------------------------
iters = seq(1, n_iters, length.out = 100L)
ll = list()
for (m in iters) {
  # Set both models to iteration m and collect the aggregated coefficients.
  cboost_int$train(m)
  cboost_noint$train(m)

  cf_int = cboost_int$getCoef()
  cf_noint = cboost_noint$getCoef()

  dfin = cbind(intInDF(cf_int, cf_noint), iteration = m)
  ll = c(ll, list(dfin))
}
df_cf = do.call(rbind, ll)

# VISUALIZE -------------------------------------------------------------------------------------
library(ggplot2)

gg = ggplot(df_cf, aes(x = iteration, color = bl)) +
  geom_line(aes(y = with_intercept, linetype = "With intercept base learner")) +
  geom_line(aes(y = without_intercept, linetype = "Without intercept base learner")) +
  theme_minimal() +
  scale_color_brewer(palette = "Dark2") +
  scale_y_continuous(trans = "log") +
  labs(linetype = "", color = "Base learner") +
  xlab("Iteration") +
  ylab("Parameter value (log scale)")

ggsave(plot = gg, filename = here::here("slides/boosting/figure/compboost-intercept-handling.png"), height = 2.2, width = 7L)
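
A quick way to sanity-check the aggregation logic is to call aggregator() on a hand-built coefficient list. The structure below (an offset entry plus one coefficient vector per *_linear base learner) only mimics what the helper expects from getCoef(); the names and numbers are purely illustrative.

# Toy check of aggregator() (illustrative values only).
cf_toy = list(
  offset = 100,
  Fireplaces_linear = c(2, 5),    # per-learner intercept 2, slope 5
  Lot_Area_linear   = c(3, 0.1)   # per-learner intercept 3, slope 0.1
)
aggregator(cf_toy)
# Expected: slopes 5 and 0.1, plus one global Intercept of 100 + 2 + 3 = 105.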

Diff for: slides/boosting/slides-boosting-comp-boost-basics.tex

+32-102
@@ -7,7 +7,7 @@
 
 \newcommand{\titlefigure}{figure/compboost-illustration-2.png}
 \newcommand{\learninggoals}{
-  \item Learn the concept of componentwise boosting and its relation to GLM
+  \item Learn the concept of componentwise boosting (CWB)
   \item Understand the built-in feature selection process
   \item Understand the problem of fair base learner selection
 }
@@ -29,7 +29,7 @@
 
 \lz
 
-The aim of componentwise gradient boosting is to find a model that:
+The aim of componentwise gradient boosting (CWB) is to find a model that:
 
 \begin{itemize}
   \item
@@ -53,7 +53,7 @@
 
 \lz
 
-Because of this, componentwise gradient boosting is also often referred to as \textbf{model-based boosting}.
+Because of this, CWB is also often referred to as \textbf{model-based boosting}.
 
 \end{vbframe}
 
@@ -66,7 +66,7 @@
 
 \lz
 
-For componentwise gradient boosting we generalize this to multiple base learner sets $\{ \mathcal{B}_1, ... \mathcal{B}_J \}$ with associated parameter spaces
+For CWB we generalize this to multiple base learner sets $\{ \mathcal{B}_1, ... \mathcal{B}_J \}$ with associated parameter spaces
 $\{ \bm{\Theta}_1, ... \bm{\Theta}_J \}$,
 % $$
 % % b_j^{[m]}(\xv,\pmb\theta^{[m]}) \quad j = 1,\dots, J\,,
@@ -155,68 +155,50 @@
 \end{vbframe}
 
 
-
 % ------------------------------------------------------------------------------
 
-\begin{vbframe}{relation to glm}
-
-In the simplest case we use linear models (without intercept) on single features
-as base learners:
-
-$$
-b_j(x_j,\theta) = \theta x_j \quad \text{for } j = 1, 2, \dots, p \quad
-\text{and with } b_j \in \mathcal{B}_j = \{\theta x_j ~\rvert~ \theta \in
-\mathbb{R} \}.
-$$
-
-
-This definition will result in an ordinary \textbf{linear regression} model.
-
-% .\footnote{Note: a linear model base learner without intercept only makes sense if the covariates are centered (see \texttt{mboost} tutorial, page7)}
-
+\begin{vbframe}{intercept handling}
 
 \begin{itemize}
-  \item Note that linear base learners without intercept only make sense for
-  covariates that have been centered before.
-  \item If we let the boosting algorithm converge, i.e., let it run for a really
-  long time, the parameters will converge to the \textbf{same solution} as the
-  ML estimate.
-  \item This means that, by specifying a loss function according to the negative
-  likelihood of a distribution from an exponential family and defining a link
-  function accordingly, this kind of boosting is equivalent to a (regularized)
-  \textbf{generalized linear model (GLM)}.
+  \item CWB is initialized with a loss-optimal constant $\fm[0](\xv)$ as the initial model intercept.
+  \item The intercept is the part of the model that captures information independent of the features.
+  \item Suppose linear base learners $b_j(\xv) = \theta_{j1} + \theta_{j2} x_j$ with intercept $\theta_{j1}$ and slope $\theta_{j2}$.
+  \item Adding base learner $\hat{b}_j$ in iteration $m$ with parameter estimates $\thetamh = (\hat{\theta}_{j1}^{[m]}, \hat{\theta}_{j2}^{[m]})$ consequently updates the intercept to $\fm[0](\xv) + \hat{\theta}_{j1}^{[m]}$.
+  \item Throughout the fitting process, the intercept is therefore adjusted $M$ times to its final form:
+  $$
+  \fm[0](\xv) + \sum\limits_{m=1}^M \hat{\theta}^{[m]}_{j^{[m]}1}
+  $$
 \end{itemize}
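
A minimal numeric sketch of this accumulation (values made up; quadratic loss, for which the loss-optimal constant is simply the mean of the target values):

% Sketch with illustrative numbers only (not taken from any data set):
% f^{[0]}(x) = argmin_c sum_i 0.5 * (y_i - c)^2 = mean(y), say 180.
% Iteration 1 selects a base learner with estimated intercept 4,
% iteration 2 one with estimated intercept -1, so after M = 2 iterations:
$$
\fm[0](\xv) + \sum_{m=1}^{M} \hat{\theta}^{[m]}_{j^{[m]}1} = 180 + 4 + (-1) = 183.
$$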
 
-\framebreak
-
 % ------------------------------------------------------------------------------
 
-But: We do not \emph{need} an exponential family and thus are able to fit models
-to all kinds of other distributions and losses, as long as we can calculate (or
-approximate) a derivative of the loss.
-% Note, however, that this does not imply that the algorithm does something
-% meaningful (e.g., non-convex loss functions would still require some
-% additional effort).
-
-\lz
+\framebreak
 
-Usually we do not let the boosting model converge fully, but \textbf{stop
-early} for the sake of regularization and feature selection.
+Two possible options to handle the intercept in CWB are:
 
-\lz
+\begin{itemize}
 
-Even though the resulting model looks like a GLM, we do not have valid standard
-errors for our coefficients,
-so cannot provide confidence or prediction intervals or perform tests etc.
-$\rightarrow$ post-selection inference.
+  \item Include an intercept base learner:
+  \begin{itemize}
+    \item Add base learner $b_{\text{int}} = \theta$ as a potential candidate considered in each iteration.
+    \item At the same time, remove the intercept from all linear base learners so that only $b_j(\xv) = \theta_j x_j$ is used.
+    \item The final intercept is given by $\fm[0](\xv) + \hat{\theta}$.
+  \end{itemize}
+  \item Include an intercept in each linear base learner $b_j(\xv) = \theta_{j1} + \theta_{j2} x_j$ and accumulate all intercepts into one global intercept after fitting.
 
-\end{vbframe}
+\end{itemize}
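
In compboost, the two options roughly correspond to the two model configurations used in the figure script added by this commit; a condensed sketch, assuming the same data frame dat, target, and learning rate as in that script:

# Option 1: explicit intercept base learner, linear base learners without intercept.
cb_opt1 = Compboost$new(data = dat, target = "Sale_Price", learning_rate = 0.01,
  loss = LossQuadratic$new())
cb_opt1$addIntercept()
cb_opt1$addBaselearner("Lot_Area", "linear", BaselearnerPolynomial, intercept = FALSE)

# Option 2: keep the intercept inside each linear base learner and sum all
# intercepts into one global intercept after fitting (see aggregator() above).
cb_opt2 = Compboost$new(data = dat, target = "Sale_Price", learning_rate = 0.01,
  loss = LossQuadratic$new())
cb_opt2$addBaselearner("Lot_Area", "linear", BaselearnerPolynomial)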
 
 
 % ------------------------------------------------------------------------------
 
-\begin{vbframe}{intercept handling}
+\framebreak
+
+The following figure shows a comparison of the parameter updates under the two intercept handling strategies:
+\begin{center}
+  \includegraphics[width = \textwidth]{figure/compboost-intercept-handling.png}
+\end{center}
+
+The data set used is \href{https://github.com/topepo/AmesHousing}{Ames Housing}.
 
-\textcolor{red}{@Janek}
 
 \end{vbframe}
 
@@ -301,57 +283,5 @@
 \end{vbframe}
 
 
-
-\begin{vbframe}{Relation to GLM - continued}
-
-The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
-
-\begin{center}
-\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter250.png}
-\end{center}
-
-\end{vbframe}
-
-\begin{vbframe}{Relation to GLM - continued}
-
-The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
-
-\begin{center}
-\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter500.png}
-\end{center}
-
-\end{vbframe}
-
-\begin{vbframe}{Relation to GLM - continued}
-
-The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
-
-\begin{center}
-\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter1000.png}
-\end{center}
-
-\end{vbframe}
-
-\begin{vbframe}{Relation to GLM - continued}
-
-The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
-
-\begin{center}
-\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter5000.png}
-\end{center}
-
-\end{vbframe}
-
-\begin{vbframe}{Relation to GLM - continued}
-
-The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
-
-\begin{center}
-\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter10000.png}
-\end{center}
-
-\end{vbframe}
-
-
 \endlecture
 \end{document}
