From 8475f8610ad12801dd37e1c2670edfbb1f0d8ca7 Mon Sep 17 00:00:00 2001 From: ludwigbothmann <46222472+ludwigbothmann@users.noreply.github.com> Date: Wed, 14 Dec 2022 02:52:41 +0100 Subject: [PATCH] BB: final edits chap 5 --- ...slides-optim-multivar-1-newton-raphson.tex | 41 +++++++------ .../slides-optim-multivar-2-quasi-newton.tex | 59 +++++++++---------- .../slides-optim-multivar-3-gauss-newton.tex | 24 ++++---- 3 files changed, 65 insertions(+), 59 deletions(-) diff --git a/slides/05-multivariate-second-order/slides-optim-multivar-1-newton-raphson.tex b/slides/05-multivariate-second-order/slides-optim-multivar-1-newton-raphson.tex index 1b8f607..ad25752 100644 --- a/slides/05-multivariate-second-order/slides-optim-multivar-1-newton-raphson.tex +++ b/slides/05-multivariate-second-order/slides-optim-multivar-1-newton-raphson.tex @@ -7,7 +7,7 @@ \newcommand{\titlefigure}{figure_man/NR_2.png} \newcommand{\learninggoals}{ -\item First vs. Second order methods +\item 1st vs. 2nd order methods \item Newton-Raphson } @@ -36,7 +36,7 @@ \end{vbframe} -\begin{vbframe}{Newton-Raphson method} +\begin{vbframe}{Newton-Raphson} \textbf{Assumption:} $f$ twice differentiable with Hessian $\nabla^2 f(\bm{x})$ @@ -57,6 +57,8 @@ Update: $\bm{x}^{[t+1]} = \bm{x}^{[t]} + \bm{d}^{[t]}$ with $\bm{d}^{[t]} = \left(\nabla^2 f(\mathbf{ x}^{[t]})\right)^{-1}\nabla f(\mathbf{x}^{[t]})$. +Or with step size: $\bm{x}^{[t+1]} = \bm{x}^{[t]} + \alpha \bm{d}^{[t]}$ + \vspace*{0.3cm} \textbf{Note: } Numerically, we determine $\bm{d}^{[t]}$ by solving the LGS $\nabla^2 f(\mathbf{x}^{[t]})\bm{d}^{[t]} = - \nabla f(\mathbf{x}^{[t]})$ with symmetic matrix $\nabla^2 f(\mathbf{x}^{[t]})$. @@ -81,11 +83,9 @@ % Im Vergleich zum \enquote{steilsten Abstieg}: Newton-Raphson divergiert ebenfalls leicht, hat aber quadratische % Konvergenz nahe beim Minimum. -\framebreak - -Example: +\end{vbframe} -\lz +\begin{vbframe}{Analytical Example on QF} $$ f(x, y) = \left(x^2 + \frac{y^2}{2}\right) @@ -141,38 +141,43 @@ &=& \mathbf{0} \end{eqnarray*} -Newton-Raphson only needs one iteration to solve the problem! +NR only needs one iteration to solve! -\framebreak -Example: Optimize Branin function with Newton Raphson and gradient descent (GD). +\end{vbframe} + +\begin{vbframe}{NR vs GD on Branin} \begin{figure} \centering \includegraphics[width=0.45\textwidth]{slides/05-multivariate-second-order/figure_man/NR_1.png} ~~ \includegraphics[width=0.45\textwidth]{slides/05-multivariate-second-order/figure_man/NR_2.png} \\ - Red: Newton Raphson. Right: Gradient descent. Newton raphson + Red=NR; Green=GD \end{figure} -\framebreak +NR has much better convergence speed here. + +\end{vbframe} + +\begin{vbframe}{Discussion} \textbf{Advantages:} \begin{itemize} -\item If $f$ sufficiently smooth, the procedure converges quadratically locally (i.e. if the starting point is close enough to optimum) +\item If $f$ sufficiently smooth, NR converges quadratically locally (i.e. 
if starting point is close enough to optimum) \end{itemize} \textbf{Disadvantages} \begin{itemize} -\item At \enquote{bad} starting points the procedure may not converge at all -\item The Hessian must be calculated and the direction of descent determined by solving a system of equations +\item At \enquote{bad} starting points, NR may not converge at all +\item Hessian must be calculated and the direction of descent determined by solving a system of equations \end{itemize} \end{vbframe} -\begin{vbframe}{Newton-Raphson: Limitations} +\begin{vbframe}{Limitations} -\textbf{Problem 1:} The update direction is generally not a direction of descent. +\textbf{Problem 1:} Update is generally not a direction of descent. \vspace*{-0.3cm} \begin{figure} @@ -204,8 +209,8 @@ \textbf{Aim}: Find methods that can be applied without the Hessian matrix \begin{itemize} -\item Quasi-Newton method. -\item Gauss-Newton algorithm (for least squares). +\item Quasi-Newton method +\item Gauss-Newton algorithm (for least squares) \end{itemize} diff --git a/slides/05-multivariate-second-order/slides-optim-multivar-2-quasi-newton.tex b/slides/05-multivariate-second-order/slides-optim-multivar-2-quasi-newton.tex index d7e480d..03b1c86 100644 --- a/slides/05-multivariate-second-order/slides-optim-multivar-2-quasi-newton.tex +++ b/slides/05-multivariate-second-order/slides-optim-multivar-2-quasi-newton.tex @@ -47,42 +47,41 @@ % $\mathbf{A}_{i} = \nabla^2 f(\mathbf{x}_{i})$ ist für quadratische Probleme optimal, bei allgemeinen % Problemen muss die Hessematrix weit weg vom Optimum noch nicht einmal positiv definit sein. -The starting point of the \textbf{Quasi-Newton method} is (as with Newton-Raphson) a Taylor approximation of the gradient, except that the Hessian matrix is replaced by a \textbf{positive definite} matrix $\bm{A}^{[t]}$: +Start point of \textbf{QN method} is (as with NR) a Taylor approximation of the gradient, except that H is replaced by a \textbf{pd} matrix $\bm{A}^{[t]}$: \vspace*{-0.2cm} \begin{footnotesize} \begin{alignat*}{4} -\nabla f(\mathbf{x}) &\approx \nabla f(\mathbf{x}^{[t]}) + \nabla^2 f(\mathbf{x}^{[t]}) & (\mathbf{x} - \mathbf{x}^{[t]}) ~ &=& ~\mathbf{0} &\qquad& \text{ Approach Newton-Raphson} \\ -\nabla f(\mathbf{x}) &\approx \nabla f(\mathbf{x}^{[t]}) + \bm{A}^{[t]} & (\mathbf{x} - \mathbf{x}^{[t]}) ~ &=& ~ \mathbf{0} &\qquad& \text{ Approach Quasi-Newton} +\nabla f(\mathbf{x}) &\approx \nabla f(\mathbf{x}^{[t]}) + \nabla^2 f(\mathbf{x}^{[t]}) & (\mathbf{x} - \mathbf{x}^{[t]}) ~ &=& ~\mathbf{0} &\qquad& \text{ NR} \\ +\nabla f(\mathbf{x}) &\approx \nabla f(\mathbf{x}^{[t]}) + \bm{A}^{[t]} & (\mathbf{x} - \mathbf{x}^{[t]}) ~ &=& ~ \mathbf{0} &\qquad& \text{ QN} \end{alignat*} \end{footnotesize} -The update direction is the same: +The update direction: \begin{footnotesize} \begin{alignat*}{3} -\bm{d}^{[t]} &= - \nabla^2 f(\mathbf{x}^{[t]})^{-1} & \nabla f(\mathbf{x}^{[t]}) &\qquad& \text{ Update direction Newton-Raphson} \\ -\bm{d}^{[t]} &= - (\bm{A}^{[t]})^{-1} & \nabla f(\mathbf{x}^{[t]}) &\qquad& \text{ Update direction Quasi-Newton} \\ +\bm{d}^{[t]} &= - \nabla^2 f(\mathbf{x}^{[t]})^{-1} & \nabla f(\mathbf{x}^{[t]}) &\qquad& \text{ NR} \\ +\bm{d}^{[t]} &= - (\bm{A}^{[t]})^{-1} & \nabla f(\mathbf{x}^{[t]}) &\qquad& \text{ QN} \\ \end{alignat*} \end{footnotesize} \framebreak -\textbf{Quasi-Newton method}: +%\textbf{Quasi-Newton method}: \begin{enumerate} -\item Select a starting point $\mathbf{x}^{[0]}$ and initialize a positive definite matrix -$\mathbf{A}^{[0]}$ (can also be a diagonal 
matrix - a very rough approximation of the -Hessian matrix). +\item Select a starting point $\mathbf{x}^{[0]}$ and initialize pd matrix +$\mathbf{A}^{[0]}$ (can also be a diagonal matrix - a very rough approximation of Hessian). \item Calculate update direction by solving $$ \bm{A}^{[t]} \bm{d}^{[t]} = - \nabla f(\mathbf{x}^{[t]}) $$ -and calculate $\bm{x}^{[t+1]} = \bm{x}^{[t]} + \lambda^{[t]} \bm{d}^{[t]}$ (Step size through backtracking) -\item Calculate an efficient update $\mathbf{A}^{[t+1]}$, which is based on $\mathbf{x}^{[t]}$, +and set $\bm{x}^{[t+1]} = \bm{x}^{[t]} + \alpha^{[t]} \bm{d}^{[t]}$ (Step size through backtracking) +\item Calculate an efficient update $\mathbf{A}^{[t+1]}$,\\based on $\mathbf{x}^{[t]}$, $\mathbf{x}^{[t+1]}$, $\nabla f(\mathbf{x}^{[t]})$, $\nabla f(\mathbf{x}^{[t+1]})$ and $\mathbf{A}^{[t]}$. \end{enumerate} @@ -95,17 +94,17 @@ \bm{A}^{[t+1]} = \bm{A}^{[t]} + \bm{B}^{[t]}. $$ -How $ \bm{B}^{[t]}$ is constructed is shown on the next slides. +How $ \bm{B}^{[t]}$ is constructed is shown on the next slides. \\ \textbf{Requirements} for the matrix sequence $\bm{A}^{[t]}$: \begin{enumerate} -\item Symmetric \& positive definite matrices so that $\bm{d}^{[t]}$ are descent directions. -\item Low calculation effort when calculating the descent direction +\item Symmetric pd, so that $\bm{d}^{[t]}$ are descent directions. +\item Low computational effort when solving LES $$ \bm{A}^{[t]} \bm{d}^{[t]} = - \nabla f(\mathbf{x}^{[t]}) $$ -\item Good approximation for Hessian: The \enquote{modified} Taylor series for $\nabla f(\bm{x})$ (especially for $i \to \infty$) should provide a good approximation +\item Good approximation of Hessian: The \enquote{modified} Taylor series for $\nabla f(\bm{x})$ (especially for $t \to \infty$) should provide a good approximation $$ \nabla f(\mathbf{x}) \approx \nabla f(\mathbf{x}^{[t]}) + @@ -119,46 +118,46 @@ \begin{vbframe}{Symmetric rank 1 update (SR1)} -The simplest approach are symmetric rank 1 modifications (\textbf{SR1}) of the form +Simplest approach: symmetric rank 1 updates (\textbf{SR1}) of form $$ -\bm{A}^{[t+1]} \leftarrow \bm{A}^{[t]} + \bm{B}^{[t]} = \bm{A}^{[t]} + \alpha \bm{u}^{[t]}(\bm{u}^{[t]})^{\top} +\bm{A}^{[t+1]} \leftarrow \bm{A}^{[t]} + \bm{B}^{[t]} = \bm{A}^{[t]} + \beta \bm{u}^{[t]}(\bm{u}^{[t]})^{\top} $$ -with appropriate vector $\bm{u}^{[t]} \in \R^n$, $\alpha \in \R$. +with appropriate vector $\bm{u}^{[t]} \in \R^n$, $\beta \in \R$. 
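A minimal numerical sketch of the quasi-Newton iteration (steps 1--3 above) with the SR1 update just introduced, purely for illustration and not part of the slides. The function name, the start $\bm{A}^{[0]} = \bm{I}$, the Armijo constants and the tolerances are assumptions; the concrete choice $\bm{u}^{[t]} = \bm{y}^{[t+1]} - \bm{A}^{[t]}\bm{s}^{[t+1]}$, $\beta = 1/(\bm{u}^{[t]})^\top\bm{s}^{[t+1]}$ anticipates the derivation on the next slide.

# Sketch of a quasi-Newton loop with SR1 updates (illustrative assumptions only).
import numpy as np

def sr1_quasi_newton(f, grad_f, x0, max_iter=100, tol=1e-8):
    x = np.asarray(x0, dtype=float)
    A = np.eye(x.size)                      # step 1: rough initial pd approximation A^[0]
    g = grad_f(x)
    for _ in range(max_iter):
        if np.linalg.norm(g) < tol:
            break
        d = np.linalg.solve(A, -g)          # step 2: solve A^[t] d^[t] = -grad f(x^[t])
        alpha = 1.0                         # step size via simple backtracking (Armijo)
        while f(x + alpha * d) > f(x) + 1e-4 * alpha * (g @ d) and alpha > 1e-10:
            alpha *= 0.5
        x_new = x + alpha * d
        g_new = grad_f(x_new)
        s = x_new - x                       # s^[t+1]
        u = (g_new - g) - A @ s             # u^[t] = y^[t+1] - A^[t] s^[t+1]
        denom = u @ s                       # beta = 1 / (u^T s)
        if abs(denom) > 1e-12:              # step 3: SR1 update A^[t] + beta * u u^T
            A = A + np.outer(u, u) / denom  # (skipped when the denominator is tiny)
        x, g = x_new, g_new
    return x

# Quadratic f(x, y) = x^2 + y^2 / 2 from the Newton-Raphson example
f = lambda z: z[0] ** 2 + 0.5 * z[1] ** 2
grad_f = lambda z: np.array([2.0 * z[0], z[1]])
print(sr1_quasi_newton(f, grad_f, x0=[1.0, 1.0]))   # iterates approach (0, 0)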
\framebreak

\textbf{Choice of} $\bm{u}^{[t]}$:

-The vectors should be chosen so that the \enquote{modified} Taylor series corresponds to the gradient:
+Vectors should be chosen so that the \enquote{modified} Taylor series corresponds to the gradient:

\begin{eqnarray*}
\nabla f(\mathbf{x}) &\overset{!}{=}& \nabla f(\mathbf{x}^{[t+1]}) + \bm{A}^{[t+1]}(\mathbf{x} - \mathbf{x}^{[t+1]}) \\
\nabla f(\mathbf{x}) &=& \nabla f(\mathbf{x}^{[t+1]}) + \left(\bm{A}^{[t]} +
-\alpha \bm{u}^{[t]}(\bm{u}^{[t]})^\top\right)\underbrace{(\mathbf{x} - \mathbf{x}^{[t+1]})}_{:= \bm{s}^{[t+1]}} \\
-\underbrace{\nabla f(\mathbf{x}) - \nabla f(\mathbf{x}^{[t+1]})}_{\bm{y}^{[t+1]}} &=& \left(\bm{A}^{[t]} + \alpha \bm{u}^{[t]} (\bm{u}^{[t]})^{\top}\right) \bm{s}^{[t+1]} \\
-\bm{y}^{[t+1]} - \bm{A}^{[t]} \bm{s}^{[t+1]} &=& \left(\alpha (\bm{u}^{[t]})^{\top} \bm{s}^{[t+1]}\right) \bm{u}^{[t]}
+\beta \bm{u}^{[t]}(\bm{u}^{[t]})^\top\right)\underbrace{(\mathbf{x} - \mathbf{x}^{[t+1]})}_{:= \bm{s}^{[t+1]}} \\
+\underbrace{\nabla f(\mathbf{x}) - \nabla f(\mathbf{x}^{[t+1]})}_{\bm{y}^{[t+1]}} &=& \left(\bm{A}^{[t]} + \beta \bm{u}^{[t]} (\bm{u}^{[t]})^{\top}\right) \bm{s}^{[t+1]} \\
+\bm{y}^{[t+1]} - \bm{A}^{[t]} \bm{s}^{[t+1]} &=& \left(\beta (\bm{u}^{[t]})^{\top} \bm{s}^{[t+1]}\right) \bm{u}^{[t]}
\end{eqnarray*}

-For $\bm{u}^{[t]} = \bm{y}^{[t+1]} - \bm{A}^{[t]} \bm{s}^{[t+1]}$ and $\alpha = \frac{1}{\left(\bm{y}^{[t+1]} - \bm{A}^{[t]}\bm{s}^{[t+1]}\right)^\top\bm{s}^{[t+1]}}$ the equation is satisfied.
+For $\bm{u}^{[t]} = \bm{y}^{[t+1]} - \bm{A}^{[t]} \bm{s}^{[t+1]}$ and $\beta = \frac{1}{\left(\bm{y}^{[t+1]} - \bm{A}^{[t]}\bm{s}^{[t+1]}\right)^\top\bm{s}^{[t+1]}}$ the equation is satisfied.

\framebreak

\textbf{Advantage}

\begin{itemize}
-\item The updates provide a sequence of \textbf{symmetric} matrices
-\item The matrices can be inverted efficiently and stably using the Sherman-Morrison formula (special case of Woodbury formula) using the formula
+\item Provides a sequence of \textbf{symmetric} matrices
+\item Matrices can be inverted efficiently and stably using Sherman-Morrison:
$$
-(\bm{A} + \alpha \bm{u}\bm{u}^{\top})^{-1} = \bm{A} + \alpha \frac{\bm{u}\bm{u}^{\top}}{1 + \alpha\bm{u}^\top\bm{u}}.
+(\bm{A} + \beta \bm{u}\bm{u}^{\top})^{-1} = \bm{A}^{-1} - \beta \frac{\bm{A}^{-1}\bm{u}\bm{u}^{\top}\bm{A}^{-1}}{1 + \beta\bm{u}^\top\bm{A}^{-1}\bm{u}}.
$$
\end{itemize}

\textbf{Disadvantage}
\begin{itemize}
-\item The constructed matrices are not necessarily positive definite, and the update directions $\bm{d}^{[t]}$ are therefore not necessarily descent directions
+\item The constructed matrices are not necessarily pd, and the update directions $\bm{d}^{[t]}$ are therefore not necessarily descent directions
\end{itemize}

\end{vbframe}

@@ -168,7 +167,7 @@

Instead of Rank 1 updates, the \textbf{BFGS} procedure (published simultaneously in 1970 by Broyden, Fletcher, Goldfarb and Shanno) uses rank 2 modifications of the form
$$
-\bm{A}^{[t]} + \alpha \bm{u}^{[t]}(\bm{u}^{[t]})^{\top} + \beta \bm{v}^{[t]}(\bm{v}^{[t]})^{\top}
+\bm{A}^{[t]} + \gamma \bm{u}^{[t]}(\bm{u}^{[t]})^{\top} + \beta \bm{v}^{[t]}(\bm{v}^{[t]})^{\top}
$$

with $\bm{s}^{[t]} := \bm{x}^{[t+1]} - \bm{x}^{[t]}$
@@ -176,7 +175,7 @@
\begin{itemize}
\item $\bm{u}^{[t]} = \nabla f(\bm{x}^{[t+1]}) - \nabla f(\bm{x}^{[t]})$
\item $\bm{v}^{[t]} = \bm{A}^{[t]} \bm{s}^{[t]}$
-	\item $\alpha = \frac{1}{(\bm{u}^{[t]})^\top (\bm{s}^{[t]})}$
+	\item $\gamma = \frac{1}{(\bm{u}^{[t]})^\top (\bm{s}^{[t]})}$
\item $\beta = - \frac{1}{(\bm{s}^{[t]})^\top \bm{A}^{[t]} \bm{s}^{[t]}}$
\end{itemize}

diff --git a/slides/05-multivariate-second-order/slides-optim-multivar-3-gauss-newton.tex b/slides/05-multivariate-second-order/slides-optim-multivar-3-gauss-newton.tex
index 9d1e90d..09ac778 100644
--- a/slides/05-multivariate-second-order/slides-optim-multivar-3-gauss-newton.tex
+++ b/slides/05-multivariate-second-order/slides-optim-multivar-3-gauss-newton.tex
@@ -35,10 +35,10 @@
\min_{\bm{\theta}} &&g(\thetab) \\
\text{ with } && g(\thetab) = \|r(\bm{\theta})\|_2^2 = \sum_{i = 1}^n \left[r_i(\bm{\theta})\right]^2 = r(\thetab)^\top r(\thetab).
\end{eqnarray*}
-We define $r$ as the function that maps a parameter $\thetab$ to the vector of residuals
+$r$: map $\thetab$ to residuals

\begin{eqnarray*}
-	r: \R^p &\to& \R^n, \\
+	r: \R^d &\to& \R^n, \\
	\thetab &\mapsto& r(\thetab) = \begin{pmatrix} r_1(\thetab) \\ ... \\ r_n(\thetab)\end{pmatrix}
\end{eqnarray*}

@@ -55,7 +55,8 @@

\begin{columns}
\begin{column}{0.55\textwidth}

-\textbf{Example:} Consider, for example, a regression problem with data
+\textbf{Example:}
+%Consider, for example, a regression problem with data

\begin{footnotesize}
\begin{eqnarray*}
@@ -116,7 +117,8 @@

% Dieses Optimierungsproblem möchten wir nun mit Hilfe des Newton Verfahrens lösen. Hierfür beginnen wir mit der Berechnung der Jakobi- und Hessematrix.

-The vector of residuals is \begin{footnotesize}
+Residuals:
+\begin{footnotesize}
$$
r(\bm{\theta}) = \mat{\theta_1 exp(\theta_2 x^{(1)}) - y^{(1)} \\\theta_1 exp(\theta_2 x^{(2)}) - y^{(2)}\\ \theta_1 exp(\theta_2 x^{(3)}) - y^{(3)} \\ \theta_1 exp(\theta_2 x^{(4)}) - y^{(4)} \\ \theta_1 exp(\theta_2 x^{(5)}) - y^{(5)} } = \mat{ \theta_1 exp(1 \theta_2) - 3 \\\theta_1 exp(2 \theta_2) - 7\\ \theta_1 exp(4 \theta_2) - 12 \\ \theta_1 exp(5 \theta_2) - 13 \\ \theta_1 exp(7 \theta_2) - 20
}
$$
\end{footnotesize}

-and the least squares problem is to minimize
+LS problem:

$$
g(\thetab) = r(\thetab)^\top r(\thetab) = \sum_{i=1}^{5} \left(\yi - \theta_1 \exp\left(\theta_2 x^{(i)}\right)\right)^2.
@@ -134,10 +136,10 @@

\end{vbframe}

-\begin{vbframe}{Limitations of Newton-Raphson}
+\begin{vbframe}{Newton-Raphson Idea}

-\textbf{Approach:} Calculate Newton-Raphson search direction by solving
+\textbf{Approach:} Calculate NR update direction by solving:
$$
\nabla^2 g(\bm{\theta}^{[t]}) \bm{d}^{[t]} = - \nabla g(\thetab^{[t]}).
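A minimal numerical sketch of the BFGS rank 2 update from the quasi-Newton slides above, with $\gamma = 1/((\bm{u}^{[t]})^\top\bm{s}^{[t]})$ and $\beta = -1/((\bm{s}^{[t]})^\top\bm{A}^{[t]}\bm{s}^{[t]})$; the function name and the curvature safeguard are assumptions, not part of the slides.

# Sketch of one BFGS update A^[t] -> A^[t+1] (illustrative assumptions only).
import numpy as np

def bfgs_update(A, x_old, x_new, g_old, g_new):
    s = x_new - x_old                  # s^[t] = x^[t+1] - x^[t]
    u = g_new - g_old                  # u^[t] = grad f(x^[t+1]) - grad f(x^[t])
    v = A @ s                          # v^[t] = A^[t] s^[t]
    if u @ s <= 1e-12:                 # curvature safeguard (assumption): keep A^[t] otherwise
        return A
    # A^[t] + u u^T / (u^T s) - v v^T / (s^T A^[t] s)
    return A + np.outer(u, u) / (u @ s) - np.outer(v, v) / (s @ v)

As long as $(\bm{u}^{[t]})^\top\bm{s}^{[t]} > 0$ and $\bm{A}^{[t]}$ is symmetric pd, the updated matrix stays symmetric pd, which is exactly requirement 1 for the matrix sequence.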
@@ -168,7 +170,7 @@ \framebreak -The Hessian matrix is obtained by applying the product rule and has the elements +Hessian is obtained by applying product rule and has elements \begin{eqnarray*} H_{jk} &=& 2 \sumin \left(\frac{\partial r_i}{\partial \thetab_j}\frac{\partial r_i}{\partial \thetab_k} + r_i \frac{\partial^2 r_i}{\partial \thetab_j \partial \thetab_k}\right) @@ -190,13 +192,13 @@ % % &= J(\bm{\theta})^\top J(\bm{\theta}) + W(\bm{\theta}) % \end{align*} -\textbf{Problem with Newton-Raphson:} Second derivatives can be challenging to compute! +\textbf{Problem with NR:} 2nd derivatives can be challenging to compute! \end{vbframe} \begin{vbframe}{Gauss Newton for least squares} -Gauss-Newton approximates the Hessian by dropping its second part: +GN approximates H by dropping its second part: \begin{eqnarray*} H_{jk} &=& 2 \sumin \left(\frac{\partial r_i}{\partial \thetab_j}\frac{\partial r_i}{\partial \thetab_k} + r_i \frac{\partial^2 r_i}{\partial \thetab_j \partial \thetab_k}\right) \\ @@ -449,7 +451,7 @@ \lz -Note: The diagonal elements of a positive definite matrix are always $\geq 0$ +Note: The diag elements of a pd matrix are always $\geq 0$ \end{vbframe}
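
A minimal numerical sketch of Gauss-Newton on the exponential example above ($x = (1, 2, 4, 5, 7)$, $y = (3, 7, 12, 13, 20)$, model $\theta_1 \exp(\theta_2 x)$). Plugging the approximated Hessian $2 J^\top J$ and the gradient $\nabla g = 2 J^\top r$ into the NR system gives $J^\top J \, \bm{d} = -J^\top r$ (the factor 2 cancels). The starting value, the fixed number of iterations and the lack of any step size control are assumptions for illustration only.

# Sketch of Gauss-Newton for the exponential least-squares example (illustrative only).
import numpy as np

x = np.array([1.0, 2.0, 4.0, 5.0, 7.0])
y = np.array([3.0, 7.0, 12.0, 13.0, 20.0])

def residuals(theta):
    return theta[0] * np.exp(theta[1] * x) - y           # r_i(theta)

def jacobian(theta):
    e = np.exp(theta[1] * x)
    return np.column_stack([e, theta[0] * x * e])         # dr_i/dtheta_1, dr_i/dtheta_2

theta = np.array([1.0, 0.5])                               # assumed starting value theta^[0]
for _ in range(20):
    r = residuals(theta)
    J = jacobian(theta)
    d = np.linalg.solve(J.T @ J, -J.T @ r)                 # GN step: solve J^T J d = -J^T r
    theta = theta + d

print(theta, residuals(theta) @ residuals(theta))          # fitted (theta_1, theta_2) and g(theta)

In practice one would combine the step with a step size control (e.g. backtracking) to safeguard the iteration.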