7 | 7 |
8 | 8 | \newcommand{\titlefigure}{figure/compboost-illustration-2.png}
9 | 9 | \newcommand{\learninggoals}{
10 | | - \item Learn the concept of componentwise boosting and its relation to GLM
| 10 | + \item Learn the concept of componentwise boosting (CWB)
11 | 11 | \item Understand the built-in feature selection process
12 | 12 | \item Understand the problem of fair base learner selection
13 | 13 | }
29 | 29 |
30 | 30 | \lz
31 | 31 |
32 | | -The aim of componentwise gradient boosting is to find a model that:
| 32 | +The aim of componentwise gradient boosting (CWB) is to find a model that:
33 | 33 |
34 | 34 | \begin{itemize}
35 | 35 | \item
53 | 53 |
54 | 54 | \lz
55 | 55 |
56 | | -Because of this, componentwise gradient boosting is also often referred to as \textbf{model-based boosting}.
| 56 | +Because of this, CWB is also often referred to as \textbf{model-based boosting}.
57 | 57 |
58 | 58 | \end{vbframe}
59 | 59 |
66 | 66 |
67 | 67 | \lz
68 | 68 |
69 | | -For componentwise gradient boosting we generalize this to multiple base learner sets $\{ \mathcal{B}_1, ... \mathcal{B}_J \}$ with associated parameter spaces
| 69 | +For CWB we generalize this to multiple base learner sets $\{ \mathcal{B}_1, ... \mathcal{B}_J \}$ with associated parameter spaces
70 | 70 | $\{ \bm{\Theta}_1, ... \bm{\Theta}_J \}$,
71 | 71 | % $$
72 | 72 | % % b_j^{[m]}(\xv,\pmb\theta^{[m]}) \quad j = 1,\dots, J\,,
155 | 155 | \end{vbframe}
156 | 156 |
157 | 157 |
158 | | -
159 | 158 | % ------------------------------------------------------------------------------
160 | 159 |
161 | | -\begin{vbframe}{relation to glm}
162 | | -
163 | | -In the simplest case we use linear models (without intercept) on single features
164 | | -as base learners:
165 | | -
166 | | -$$
167 | | -  b_j(x_j,\theta) = \theta x_j \quad \text{for } j = 1, 2, \dots, p \quad
168 | | -  \text{and with } b_j \in \mathcal{B}_j = \{\theta x_j ~\rvert~ \theta \in
169 | | -  \mathbb{R} \}.
170 | | -$$
171 | | -
172 | | -
173 | | -This definition will result in an ordinary \textbf{linear regression} model.
174 | | -
175 | | -% .\footnote{Note: a linear model base learner without intercept only makes sense if the covariates are centered (see \texttt{mboost} tutorial, page7)}
176 | | -
| 160 | +\begin{vbframe}{intercept handling}
177 | 161 |
178 | 162 | \begin{itemize}
179 | | - \item Note that linear base learners without intercept only make sense for
180 | | - covariates that have been centered before.
181 | | - \item If we let the boosting algorithm converge, i.e., let it run for a really
182 | | - long time, the parameters will converge to the \textbf{same solution} as the
183 | | - ML estimate.
184 | | - \item This means that, by specifying a loss function according to the negative
185 | | - likelihood of a distribution from an exponential family and defining a link
186 | | - function accordingly, this kind of boosting is equivalent to a (regularized)
187 | | - \textbf{generalized linear model (GLM)}.
| 163 | + \item CWB is initialized with a loss-optimal constant $\fm[0](\xv)$ as the initial model intercept.
| 164 | + \item The intercept is often described as the part of the model that contains information independent of the features.
| 165 | + \item Suppose linear base learners $b_j(\xv) = \theta_{j1} + \theta_{j2} x_j$ with intercept $\theta_{j1}$ and slope $\theta_{j2}$.
| 166 | + \item Adding base learner $\hat{b}_j$ in iteration $m$ with parameter estimates $\thetamh = (\hat{\theta}_{j1}^{[m]}, \hat{\theta}_{j2}^{[m]})$ consequently updates the intercept to $\fm[0](\xv) + \hat{\theta}_{j1}^{[m]}$.
| 167 | + \item Throughout the fitting process, the intercept is therefore adjusted $M$ times, resulting in the final intercept (see the worked example below):
| 168 | + $$
| 169 | + \fm[0](\xv) + \sum\limits_{m=1}^M \hat{\theta}^{[m]}_{j^{[m]}1}
| 170 | + $$
188 | 171 | \end{itemize}
189 | 172 |
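For concreteness, a small worked example of this accumulation (the loss, the selected features, and all numbers below are made up purely for illustration, and a possible learning rate is ignored): with squared-error loss, the loss-optimal constant is the mean of the target, $\fm[0](\xv) = \bar{y}$. Say $\bar{y} = 3$, iteration $m = 1$ selects the base learner of $x_2$ with intercept estimate $\hat{\theta}^{[1]}_{21} = 0.4$, and iteration $m = 2$ selects the base learner of $x_1$ with $\hat{\theta}^{[2]}_{11} = -0.1$. After $M = 2$ iterations the aggregated intercept is
$$
\fm[0](\xv) + \sum\limits_{m=1}^{2} \hat{\theta}^{[m]}_{j^{[m]}1} = 3 + 0.4 - 0.1 = 3.3,
$$
while the slope estimates $\hat{\theta}^{[m]}_{j^{[m]}2}$ accumulate in the corresponding feature effects.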
190 | | -\framebreak
191 | | -
192 | 173 | % ------------------------------------------------------------------------------
193 | 174 |
194 | | -But: We do not \emph{need} an exponential family and thus are able to fit models
195 | | -to all kinds of other distributions and losses, as long as we can calculate (or
196 | | -approximate) a derivative of the loss.
197 | | -% Note, however, that this does not imply that the algorithm does something
198 | | -% meaningful (e.g., non-convex loss functions would still require some
199 | | -% additional effort).
200 | | -
201 | | -\lz
| 175 | +\framebreak
202 | 176 |
203 | | -Usually we do not let the boosting model converge fully, but \textbf{stop
204 | | -early} for the sake of regularization and feature selection.
| 177 | +Two options for handling the intercept in CWB are (option 1 is sketched below the list):
205 | 178 |
206 | | -\lz
| 179 | +\begin{itemize}
207 | 180 |
208 | | -Even though the resulting model looks like a GLM, we do not have valid standard
209 | | -errors for our coefficients,
210 | | -so cannot provide confidence or prediction intervals or perform tests etc.
211 | | -$\rightarrow$ post-selection inference.
| 181 | +\item Include an intercept base learner:
| 182 | +  \begin{itemize}
| 183 | +    \item Add an intercept base learner $b_{\text{int}} = \theta$ as a candidate that is considered in each iteration.
| 184 | +    \item At the same time, remove the intercept from all linear base learners, i.e., use only $b_j(\xv) = \theta_j x_j$.
| 185 | +    \item The final intercept is given by $\fm[0](\xv) + \hat{\theta}$.
| 186 | +  \end{itemize}
| 187 | +\item Include an intercept in each linear base learner, $b_j(\xv) = \theta_{j1} + \theta_{j2} x_j$, and accumulate all estimated intercepts into one global intercept after fitting.
212 | 188 |
213 | | -\end{vbframe}
| 189 | +\end{itemize}
214 | 190 |
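As a rough sketch of how option 1 (the intercept base learner) plugs into the usual CWB iteration — the notation for pseudo-residuals $\tilde{r}^{[m](i)}$ and observations $\xv^{(i)}$ is chosen ad hoc here and may differ from the rest of the deck: in iteration $m$, every candidate base learner, now including $b_{\text{int}}$, is fit to the pseudo-residuals by least squares, and the best-fitting one is selected,
$$
j^{[m]} = \arg\min_{j \in \{\text{int}, 1, \dots, J\}} \sum\limits_{i=1}^{n} \left( \tilde{r}^{[m](i)} - \hat{b}_j\big(\xv^{(i)}\big) \right)^2.
$$
Whenever $b_{\text{int}}$ wins, only the global intercept is updated; otherwise one feature effect (which by construction carries no intercept) is updated.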
215 | 191 | % ------------------------------------------------------------------------------
216 | 192 |
217 | | -\begin{vbframe}{intercept handling}
| 193 | +\framebreak
| 194 | +
| 195 | +The following figure compares the parameter updates under the two intercept-handling options:
| 196 | +\begin{center}
| 197 | +\includegraphics[width = \textwidth]{figure/compboost-intercept-handling.png}
| 198 | +\end{center}
| 199 | +
| 200 | +The data set used is \href{https://github.com/topepo/AmesHousing}{Ames Housing}.
218 | 201 |
219 | | -\textcolor{red}{@Janek}
220 | 202 |
221 | 203 | \end{vbframe}
222 | 204 |
301 | 283 | \end{vbframe}
302 | 284 |
303 | 285 |
304 | | -
305 | | -\begin{vbframe}{Relation to GLM - continued}
306 | | -
307 | | -The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
308 | | -
309 | | -\begin{center}
310 | | -\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter250.png}
311 | | -\end{center}
312 | | -
313 | | -\end{vbframe}
314 | | -
315 | | -\begin{vbframe}{Relation to GLM - continued}
316 | | -
317 | | -The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
318 | | -
319 | | -\begin{center}
320 | | -\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter500.png}
321 | | -\end{center}
322 | | -
323 | | -\end{vbframe}
324 | | -
325 | | -\begin{vbframe}{Relation to GLM - continued}
326 | | -
327 | | -The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
328 | | -
329 | | -\begin{center}
330 | | -\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter1000.png}
331 | | -\end{center}
332 | | -
333 | | -\end{vbframe}
334 | | -
335 | | -\begin{vbframe}{Relation to GLM - continued}
336 | | -
337 | | -The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
338 | | -
339 | | -\begin{center}
340 | | -\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter5000.png}
341 | | -\end{center}
342 | | -
343 | | -\end{vbframe}
344 | | -
345 | | -\begin{vbframe}{Relation to GLM - continued}
346 | | -
347 | | -The following figure shows the parameter values after $m \in \{250, 500, 1000, 5000, 10000\}$ iterations as well as the estimates from a linear model as crosses (GLM with normally distributed errors):
348 | | -
349 | | -\begin{center}
350 | | -\includegraphics[width=\textwidth]{figure/compboost-to-glm-iter10000.png}
351 | | -\end{center}
352 | | -
353 | | -\end{vbframe}
354 | | -
355 | | -
356 | 286 | \endlecture
357 | 287 | \end{document}