BB: final edits chap 5
ludwigbothmann committed Dec 14, 2022
1 parent e98eed3 commit 8475f86
Showing 3 changed files with 65 additions and 59 deletions.
@@ -7,7 +7,7 @@

\newcommand{\titlefigure}{figure_man/NR_2.png}
\newcommand{\learninggoals}{
\item First- vs. second-order methods
\item Newton-Raphson
}

@@ -36,7 +36,7 @@

\end{vbframe}

\begin{vbframe}{Newton-Raphson}

\textbf{Assumption:} $f$ twice differentiable with Hessian $\nabla^2 f(\bm{x})$

@@ -57,6 +57,8 @@

Update: $\bm{x}^{[t+1]} = \bm{x}^{[t]} + \bm{d}^{[t]}$ with $\bm{d}^{[t]} = -\left(\nabla^2 f(\mathbf{x}^{[t]})\right)^{-1}\nabla f(\mathbf{x}^{[t]})$.

Or with step size: $\bm{x}^{[t+1]} = \bm{x}^{[t]} + \alpha \bm{d}^{[t]}$

\vspace*{0.3cm}

\textbf{Note: } Numerically, we determine $\bm{d}^{[t]}$ by solving the linear system $\nabla^2 f(\mathbf{x}^{[t]})\bm{d}^{[t]} = - \nabla f(\mathbf{x}^{[t]})$ with symmetric matrix $\nabla^2 f(\mathbf{x}^{[t]})$.
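
As a concrete illustration, here is a minimal NumPy sketch (an addition, not part of the original slides) of one damped NR step; \texttt{grad\_f} and \texttt{hess\_f} are assumed user-supplied callables returning gradient and Hessian.

\begin{verbatim}
import numpy as np

def newton_step(x, grad_f, hess_f, alpha=1.0):
    # One (damped) Newton-Raphson step: solve H d = -g
    # instead of forming the inverse Hessian explicitly.
    g = grad_f(x)
    H = hess_f(x)
    d = np.linalg.solve(H, -g)   # update direction d^[t]
    return x + alpha * d         # x^[t+1] = x^[t] + alpha * d^[t]
\end{verbatim}

A dedicated symmetric solver (e.g.\ a Cholesky factorization) could additionally exploit the symmetry of the Hessian.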
@@ -81,11 +83,9 @@
% Compared to \enquote{steepest descent}: Newton-Raphson also diverges easily, but has quadratic
% convergence close to the minimum.

\end{vbframe}

\begin{vbframe}{Analytical Example on a Quadratic Form}

$$
f(x, y) = \left(x^2 + \frac{y^2}{2}\right)
@@ -141,38 +141,43 @@
&=& \mathbf{0}
\end{eqnarray*}

Newton-Raphson only needs one iteration to solve the problem, since the second-order Taylor expansion of a quadratic function is exact!
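
A quick numerical check of this statement (an added sketch, not from the slides), using the gradient $(2x, y)$ and Hessian $\mathrm{diag}(2, 1)$ of $f$:

\begin{verbatim}
import numpy as np

# f(x, y) = x^2 + y^2 / 2:  gradient (2x, y), Hessian diag(2, 1)
grad = lambda z: np.array([2 * z[0], z[1]])
hess = lambda z: np.diag([2.0, 1.0])

x0 = np.array([3.0, -4.0])   # arbitrary starting point
x1 = x0 + np.linalg.solve(hess(x0), -grad(x0))
print(x1)                    # [0. 0.] -- the minimum, after one step
\end{verbatim}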

\end{vbframe}

\begin{vbframe}{NR vs. GD on Branin}

\textbf{Example:} Optimize the Branin function with Newton-Raphson (NR) and gradient descent (GD).

\begin{figure}
\centering
\includegraphics[width=0.45\textwidth]{slides/05-multivariate-second-order/figure_man/NR_1.png} ~~ \includegraphics[width=0.45\textwidth]{slides/05-multivariate-second-order/figure_man/NR_2.png} \\
Red: NR; green: GD.
\end{figure}

NR converges considerably faster here.

\end{vbframe}

\begin{vbframe}{Discussion}

\textbf{Advantages:}

\begin{itemize}
\item If $f$ is sufficiently smooth, NR converges locally at a quadratic rate (i.e., if the starting point is close enough to the optimum)
\end{itemize}

\textbf{Disadvantages:}

\begin{itemize}
\item At \enquote{bad} starting points, NR may not converge at all
\item The Hessian must be computed and the descent direction determined by solving a linear system in each iteration
\end{itemize}

\end{vbframe}

\begin{vbframe}{Newton-Raphson: Limitations}

\textbf{Problem 1:} The update direction is not necessarily a descent direction.
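
A small numerical illustration (an added example with assumed values, not from the slides): for the saddle $f(x, y) = x^2 - y^2$ the Hessian is indefinite, and at $(1, 2)$ the NR direction points uphill.

\begin{verbatim}
import numpy as np

# Saddle f(x, y) = x^2 - y^2: Hessian diag(2, -2) is indefinite
x = np.array([1.0, 2.0])
g = np.array([2 * x[0], -2 * x[1]])   # gradient = (2, -4)
H = np.diag([2.0, -2.0])
d = np.linalg.solve(H, -g)            # NR direction = (-1, -2)
print(g @ d)                          # 6.0 > 0  =>  ascent, not descent
\end{verbatim}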

\vspace*{-0.3cm}
\begin{figure}
@@ -204,8 +209,8 @@

\textbf{Aim}: Find methods that do not require computing the Hessian matrix:
\begin{itemize}
\item Quasi-Newton method
\item Gauss-Newton algorithm (for least squares)
\end{itemize}


@@ -47,42 +47,41 @@
% $\mathbf{A}_{i} = \nabla^2 f(\mathbf{x}_{i})$ is optimal for quadratic problems; for general
% problems, the Hessian need not even be positive definite far away from the optimum.

The starting point of the \textbf{quasi-Newton (QN) method} is (as with NR) a Taylor approximation of the gradient, except that the Hessian is replaced by a \textbf{positive definite (pd)} matrix $\bm{A}^{[t]}$:

\vspace*{-0.2cm}
\begin{footnotesize}
\begin{alignat*}{4}
\nabla f(\mathbf{x}) &\approx \nabla f(\mathbf{x}^{[t]}) + \nabla^2 f(\mathbf{x}^{[t]}) & (\mathbf{x} - \mathbf{x}^{[t]}) ~ &=& ~\mathbf{0} &\qquad& \text{ NR} \\
\nabla f(\mathbf{x}) &\approx \nabla f(\mathbf{x}^{[t]}) + \bm{A}^{[t]} & (\mathbf{x} - \mathbf{x}^{[t]}) ~ &=& ~ \mathbf{0} &\qquad& \text{ QN}
\end{alignat*}
\end{footnotesize}

The update direction is defined analogously:

\begin{footnotesize}
\begin{alignat*}{3}
\bm{d}^{[t]} &= - \nabla^2 f(\mathbf{x}^{[t]})^{-1} & \nabla f(\mathbf{x}^{[t]}) &\qquad& \text{ NR} \\
\bm{d}^{[t]} &= - (\bm{A}^{[t]})^{-1} & \nabla f(\mathbf{x}^{[t]}) &\qquad& \text{ QN} \\
\end{alignat*}
\end{footnotesize}

\framebreak

\textbf{Quasi-Newton method:}


\begin{enumerate}
\item Select a starting point $\mathbf{x}^{[0]}$ and initialize a pd matrix $\mathbf{A}^{[0]}$ (can also be a diagonal matrix - a very rough approximation of the Hessian).
\item Calculate update direction by solving

$$
\bm{A}^{[t]} \bm{d}^{[t]} = - \nabla f(\mathbf{x}^{[t]})
$$

and set $\bm{x}^{[t+1]} = \bm{x}^{[t]} + \alpha^{[t]} \bm{d}^{[t]}$ (step size $\alpha^{[t]}$ via backtracking)
\item Calculate an efficient update $\mathbf{A}^{[t+1]}$, based on $\mathbf{x}^{[t]}$,
$\mathbf{x}^{[t+1]}$, $\nabla f(\mathbf{x}^{[t]})$, $\nabla f(\mathbf{x}^{[t+1]})$ and
$\mathbf{A}^{[t]}$ (a code sketch of this loop follows the list).
\end{enumerate}
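
The three steps above can be sketched as follows (an added illustration; the Armijo constants and the initialization $\bm{A}^{[0]} = \bm{I}$ are assumptions, and \texttt{update\_A} stands for any of the update rules on the following slides):

\begin{verbatim}
import numpy as np

def quasi_newton(f, grad_f, x0, update_A, tol=1e-8, max_iter=100):
    # Generic quasi-Newton loop; update_A(A, s, y) returns A^[t+1].
    x = np.asarray(x0, dtype=float)
    A = np.eye(x.size)                  # step 1: rough pd initialization
    for _ in range(max_iter):
        g = grad_f(x)
        if np.linalg.norm(g) < tol:
            break
        d = np.linalg.solve(A, -g)      # step 2: A^[t] d^[t] = -grad f(x^[t])
        alpha = 1.0                     # backtracking (Armijo) line search,
        while f(x + alpha * d) > f(x) + 1e-4 * alpha * (g @ d):
            alpha *= 0.5                # assumes d is a descent direction
        x_new = x + alpha * d
        s, y = x_new - x, grad_f(x_new) - g
        A = update_A(A, s, y)           # step 3: cheap update of A
        x = x_new
    return x
\end{verbatim}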
@@ -95,17 +94,17 @@
\bm{A}^{[t+1]} = \bm{A}^{[t]} + \bm{B}^{[t]}.
$$

How $\bm{B}^{[t]}$ is constructed is shown on the next slides. \\
\textbf{Requirements} for the matrix sequence $\bm{A}^{[t]}$:
\begin{enumerate}
\item Symmetric and pd, so that the $\bm{d}^{[t]}$ are descent directions.
\item Low computational effort when solving the linear system

$$
\bm{A}^{[t]} \bm{d}^{[t]} = - \nabla f(\mathbf{x}^{[t]})
$$

\item Good approximation of the Hessian: The \enquote{modified} Taylor series for $\nabla f(\bm{x})$ (especially for $t \to \infty$) should provide a good approximation

$$
\nabla f(\mathbf{x}) \approx \nabla f(\mathbf{x}^{[t]}) +
@@ -119,46 +118,46 @@

\begin{vbframe}{Symmetric rank 1 update (SR1)}

The simplest approach uses symmetric rank 1 updates (\textbf{SR1}) of the form

$$
\bm{A}^{[t+1]} \leftarrow \bm{A}^{[t]} + \bm{B}^{[t]} = \bm{A}^{[t]} + \beta \bm{u}^{[t]}(\bm{u}^{[t]})^{\top}
$$

with appropriate vector $\bm{u}^{[t]} \in \R^n$, $\beta \in \R$.


\framebreak

\textbf{Choice of} $\bm{u}^{[t]}$:

The vectors should be chosen such that the \enquote{modified} Taylor series matches the gradient:

\begin{eqnarray*}
\nabla f(\mathbf{x}) &\overset{!}{=}& \nabla f(\mathbf{x}^{[t+1]}) +
\bm{A}^{[t+1]}(\mathbf{x} - \mathbf{x}^{[t+1]}) \\
\nabla f(\mathbf{x}) &=& \nabla f(\mathbf{x}^{[t+1]}) + \left(\bm{A}^{[t]} +
\beta \bm{u}^{[t]}(\bm{u}^{[t]})^\top\right)\underbrace{(\mathbf{x} - \mathbf{x}^{[t+1]})}_{:= \bm{s}^{[t+1]}} \\
\underbrace{\nabla f(\mathbf{x}) - \nabla f(\mathbf{x}^{[t+1]})}_{\bm{y}^{[t+1]}} &=& \left(\bm{A}^{[t]} + \beta \bm{u}^{[t]} (\bm{u}^{[t]})^{\top}\right) \bm{s}^{[t+1]} \\
\bm{y}^{[t+1]} - \bm{A}^{[t]} \bm{s}^{[t+1]} &=& \left(\beta (\bm{u}^{[t]})^{\top} \bm{s}^{[t+1]}\right) \bm{u}^{[t]}
\end{eqnarray*}

For $\bm{u}^{[t]} = \bm{y}^{[t+1]} - \bm{A}^{[t]} \bm{s}^{[t+1]}$ and $\beta = \frac{1}{\left(\bm{y}^{[t+1]} - \bm{A}^{[t]}\bm{s}^{[t+1]}\right)^\top\bm{s}^{[t+1]}}$ the equation is satisfied.
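
Written out as code (an added sketch; the safeguard against a vanishing denominator is a common practical addition, not part of the slide):

\begin{verbatim}
import numpy as np

def sr1_update(A, s, y, eps=1e-10):
    # A^[t+1] = A^[t] + (y - A s)(y - A s)^T / ((y - A s)^T s)
    v = y - A @ s
    denom = v @ s
    if abs(denom) < eps:   # skip the update if the denominator is tiny
        return A
    return A + np.outer(v, v) / denom
\end{verbatim}

This function can be passed as \texttt{update\_A} to the quasi-Newton loop sketched earlier.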

\framebreak

\textbf{Advantage}
\begin{itemize}
\item The updates provide a sequence of \textbf{symmetric} matrices
\item The matrices can be inverted efficiently and stably using the Sherman-Morrison formula (a special case of the Woodbury formula; a numerical check follows below):
$$
(\bm{A} + \beta \bm{u}\bm{u}^{\top})^{-1} = \bm{A}^{-1} - \beta \frac{\bm{A}^{-1}\bm{u}\bm{u}^{\top}\bm{A}^{-1}}{1 + \beta\bm{u}^\top\bm{A}^{-1}\bm{u}}.
$$
\end{itemize}
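
A quick numerical check of the formula above (an added sketch with arbitrary test values):

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
A = np.diag(rng.uniform(1.0, 2.0, size=4))   # any invertible matrix
u = rng.standard_normal(4)
beta = 0.7

A_inv = np.linalg.inv(A)
lhs = np.linalg.inv(A + beta * np.outer(u, u))
rhs = A_inv - beta * (A_inv @ np.outer(u, u) @ A_inv) \
      / (1 + beta * u @ A_inv @ u)
print(np.allclose(lhs, rhs))                 # True
\end{verbatim}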

\textbf{Disadvantage}
\begin{itemize}
\item The constructed matrices are not necessarily pd, and the update directions $\bm{d}^{[t]}$ are therefore not necessarily descent directions
\end{itemize}

\end{vbframe}
@@ -168,15 +167,15 @@
Instead of rank 1 updates, the \textbf{BFGS} procedure (published simultaneously in 1970 by Broyden, Fletcher, Goldfarb and Shanno) uses rank 2 modifications of the form

$$
\bm{A}^{[t]} + \beta \bm{u}^{[t]}(\bm{u}^{[t]})^{\top} + \gamma \bm{v}^{[t]}(\bm{v}^{[t]})^{\top}
$$

with $\bm{s}^{[t]} := \bm{x}^{[t+1]} - \bm{x}^{[t]}$

\begin{itemize}
\item $\bm{u}^{[t]} = \nabla f(\bm{x}^{[t+1]}) - \nabla f(\bm{x}^{[t]})$
\item $\bm{v}^{[t]} = \bm{A}^{[t]} \bm{s}^{[t]}$
\item $\beta = \frac{1}{(\bm{u}^{[t]})^\top \bm{s}^{[t]}}$
\item $\gamma = - \frac{1}{(\bm{s}^{[t]})^\top \bm{A}^{[t]} \bm{s}^{[t]}}$
\end{itemize}
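
In code, writing $\bm{y} = \bm{u}^{[t]}$ for the gradient difference and $\bm{v} = \bm{A}\bm{s}$ (an added sketch; the curvature condition $\bm{y}^\top\bm{s} > 0$, which a suitable line search guarantees, is assumed):

\begin{verbatim}
import numpy as np

def bfgs_update(A, s, y):
    # Rank-2 BFGS update of the Hessian approximation A, with
    # s = x^[t+1] - x^[t] and y = grad f(x^[t+1]) - grad f(x^[t]).
    v = A @ s
    return A + np.outer(y, y) / (y @ s) - np.outer(v, v) / (s @ v)
\end{verbatim}

Like the SR1 rule, this can be plugged into the generic quasi-Newton loop sketched earlier; unlike SR1, it keeps the matrices positive definite as long as $\bm{y}^\top \bm{s} > 0$.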

@@ -35,10 +35,10 @@
\min_{\bm{\theta}} &&g(\thetab) \\ \text{ with } && g(\thetab) = \|r(\bm{\theta})\|_2^2 = \sum_{i = 1}^n \left[r_i(\bm{\theta})\right]^2 = r(\thetab)^\top r(\thetab).
\end{eqnarray*}

We define $r$ as the function that maps a parameter vector $\thetab$ to the vector of residuals

\begin{eqnarray*}
r: \R^d &\to& \R^n, \\
\thetab &\mapsto& r(\thetab) = \begin{pmatrix} r_1(\thetab) \\ ... \\ r_n(\thetab)\end{pmatrix}
\end{eqnarray*}

@@ -55,7 +55,8 @@

\begin{columns}
\begin{column}{0.55\textwidth}
\textbf{Example:} Consider a regression problem with data

\begin{footnotesize}
\begin{eqnarray*}
@@ -116,15 +117,16 @@
% We now want to solve this optimization problem using Newton's method. To do so, we start by computing the Jacobian and Hessian matrices.


The vector of residuals is
\begin{footnotesize}
$$
r(\bm{\theta}) = \mat{\theta_1 \exp(\theta_2 x^{(1)}) - y^{(1)} \\ \theta_1 \exp(\theta_2 x^{(2)}) - y^{(2)}\\ \theta_1 \exp(\theta_2 x^{(3)}) - y^{(3)} \\ \theta_1 \exp(\theta_2 x^{(4)}) - y^{(4)} \\ \theta_1 \exp(\theta_2 x^{(5)}) - y^{(5)} } = \mat{
\theta_1 \exp(1 \theta_2) - 3 \\ \theta_1 \exp(2 \theta_2) - 7\\ \theta_1 \exp(4 \theta_2) - 12 \\ \theta_1 \exp(5 \theta_2) - 13 \\ \theta_1 \exp(7 \theta_2) - 20
}.
$$
\end{footnotesize}

and the least squares problem is to minimize

$$
g(\thetab) = r(\thetab)^\top r(\thetab) = \sum_{i=1}^{5} \left(\yi - \theta_1 \exp\left(\theta_2 x^{(i)}\right)\right)^2.
@@ -134,10 +136,10 @@

\end{vbframe}

\begin{vbframe}{Newton-Raphson Idea}


\textbf{Approach:} Calculate the NR update direction by solving

$$
\nabla^2 g(\bm{\theta}^{[t]}) \bm{d}^{[t]} = - \nabla g(\thetab^{[t]}).
@@ -168,7 +170,7 @@

\framebreak

The Hessian matrix is obtained by applying the product rule and has elements

\begin{eqnarray*}
H_{jk} &=& 2 \sumin \left(\frac{\partial r_i}{\partial \thetab_j}\frac{\partial r_i}{\partial \thetab_k} + r_i \frac{\partial^2 r_i}{\partial \thetab_j \partial \thetab_k}\right)
@@ -190,13 +192,13 @@
% % &= J(\bm{\theta})^\top J(\bm{\theta}) + W(\bm{\theta})
% \end{align*}

\textbf{Problem with NR:} Second derivatives can be challenging to compute!

\end{vbframe}

\begin{vbframe}{Gauss-Newton for least squares}

Gauss-Newton (GN) approximates the Hessian by dropping its second part:

\begin{eqnarray*}
H_{jk} &=& 2 \sumin \left(\frac{\partial r_i}{\partial \thetab_j}\frac{\partial r_i}{\partial \thetab_k} + r_i \frac{\partial^2 r_i}{\partial \thetab_j \partial \thetab_k}\right) \\
@@ -449,7 +451,7 @@

\lz

Note: The diagonal elements of a positive definite matrix are always $\geq 0$

\end{vbframe}
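
To make the procedure concrete, here is a minimal sketch of the plain Gauss-Newton iteration for the example data above (an added illustration; the starting point and the fixed number of iterations are assumptions, and a robust implementation would add step-size control or Levenberg-Marquardt damping; the factors of 2 in gradient and approximate Hessian cancel in the linear system):

\begin{verbatim}
import numpy as np

# Data of the regression example: model y approx theta_1 * exp(theta_2 * x)
x = np.array([1.0, 2.0, 4.0, 5.0, 7.0])
y = np.array([3.0, 7.0, 12.0, 13.0, 20.0])

def residuals(theta):
    return theta[0] * np.exp(theta[1] * x) - y

def jacobian(theta):
    e = np.exp(theta[1] * x)
    return np.column_stack([e, theta[0] * x * e])  # dr_i/dtheta_1, dr_i/dtheta_2

theta = np.array([2.0, 0.3])                       # assumed starting point
for _ in range(20):
    J, r = jacobian(theta), residuals(theta)
    d = np.linalg.solve(J.T @ J, -J.T @ r)         # GN direction: J'J d = -J'r
    theta = theta + d
print(theta)                                       # fitted (theta_1, theta_2)
\end{verbatim}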

