course.tex

\documentclass{beamer}
\usepackage{graphicx}
\usepackage{multicol}
\usepackage{amsmath}
\usepackage{booktabs}

\graphicspath{{latex/}{img/}}

% for custom colors
\definecolor{maroon}{RGB}{215,0,0}

\mode<presentation>
{
    % theme + colorscheme
    \usetheme{Madrid}
    \usecolortheme{beaver}

    % customize bullets in items list
    \setbeamertemplate{itemize item}{\color{red}$\bullet$}
    \setbeamertemplate{itemize subitem}{\color{red}$\circ$}

    % customize bullets in table of content
    \setbeamerfont{section number projected}{%
      family=\sffamily,series=\bfseries,size=\normalsize}
    \setbeamercolor{section number projected}{bg=maroon, fg=white}
    \setbeamercolor{subsection number projected}{bg=maroon}
    \setbeamertemplate{section in toc}[circle]
    \setbeamertemplate{subsection in toc}[square]

    % remove navigation symbols
    \setbeamertemplate{navigation symbols}{}
}

% customize footline: print section/subsection/date+slides_number instead of
% author/short_title/date+slides_number.
% You might need to remove it if you use another theme than Madrid.
\makeatletter
\setbeamertemplate{footline}
{
  \leavevmode%
  \hbox{%
  \begin{beamercolorbox}[wd=.33333\paperwidth,ht=2.25ex,dp=1ex,center]
    {section in head/foot}\usebeamerfont{section in head/foot}\insertsection
  \end{beamercolorbox}% if no % at the end, there is a gap between subboxes
  \begin{beamercolorbox}[wd=.33333\paperwidth,ht=2.25ex,dp=1ex,center]
    {title in head/foot}\usebeamerfont{title in head/foot}\insertsubsection
  \end{beamercolorbox}%
  \begin{beamercolorbox}[wd=.33333\paperwidth,ht=2.25ex,dp=1ex,right]
    {date in head/foot}\usebeamerfont{date in head/foot}\insertshortdate{}
    \hspace*{2em} \insertframenumber{} / \inserttotalframenumber\hspace*{2ex}
  \end{beamercolorbox}}%
  \vskip0pt
}
\makeatother

% main information
\title[Algorithms for Data Analysis]{Algorithms for Data Analysis}
\author{Julien Tissier}
\date{\today}

\begin{document}

%%%%%%%%%%%%%%%%%%%%%%%
%%    F R A M E S    %%
%%%%%%%%%%%%%%%%%%%%%%%


% section -> on the left in footline; subsection -> in the center in footline
\section*{\insertshortauthor} % asterisk -> do not appear in table of content
\subsection*{\insertshorttitle}

% frame with title + main information
\begin{frame}
    \titlepage
\end{frame}

% summary frame
\begin{frame}
  \frametitle{Overview}
  \begin{multicols}{2}
    \tableofcontents
  \end{multicols}
\end{frame}

%------------------
%    I N T R O    -
%------------------

\section{Introduction to Data Analysis}

% data (analysis) definition
\subsection{Definition}
\begin{frame}
  \frametitle{What is data analysis?}
  \begin{block}{Data}
    Pieces of information (measurement, values, facts...) that can be:
    \begin{itemize}
      \item structured (matrices, tabular data, RDBMS, time series...)
      \item unstructured (news articles, webpages, images/video...)
    \end{itemize}
  \end{block}
  \begin{block}{Data Analysis}
  Process of preparing, transforming and using models to find more information
  from data, as well as visualizing results.
  \end{block}
\end{frame}

% tools (why Python?)
\subsection{Tools and libraries}
\begin{frame}
  \frametitle{How to analyze data?}
  There are usually two steps in data analysis. The first one is to find and
  \textbf{develop models} that can extract useful information from data (with
  languages like R or MATLAB). The second one is to \textbf{develop programs}
  that can be used in production systems (with languages like Java or C++).
  \\~\\

  With a growing popularity among scientists as well as the development of
  efficient libraries (numpy, pandas), \textbf{Python} became a great tool for
  data analysis. Python has many advantages:
  \begin{itemize}
    \item great for string/data processing
    \item can be used for both prototyping/production
    \item has a lot of existing libraries
    \item can easily integrate C/C++/FORTRAN legacy code
    \item easy to read/develop
  \end{itemize}
\end{frame}

% list of libraries
\begin{frame}
  \frametitle{Libraries}
  This course will be based on Python 3.7 (or above) and the following
  libraries:
  \begin{itemize}
    \item IPython (7.8+): enhanced Python shell
    \item numpy (1.17+): fast/efficient arrays and operations
    \item pandas (0.25+): data structures (Series/DataFrame)
    \item matplotlib (3.1+): plots and 2D visualization
    \item scipy (1.3.1+): scientific algorithms
    \item scikit-learn (0.21+): machine learning algorithms
  \end{itemize}

  Jupyter Notebook will also be used to give you samples of code, as they
  provide a more interactive way to learn and discover how these libraries
  work.
\end{frame}

% numpy
\subsection{Numpy}
\begin{frame}
  \frametitle{Numpy}
  Numpy (\textbf{Num}erical \textbf{Py}thon) is a high performance scientific
  computing library that can be used for matrices computations, Fourier
  transforms, linear algebra, statistical computations...
  \\~\\

  The main type of data in Numpy is the \textbf{ndarray}:
  \begin{itemize}
    \item n-dimensional array
    \item fixed size
    \item homogeneous datatypes
    \item similar to C arrays (continuous block of memory)
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Why is Numpy efficient?}
  As a high level language, Python is slow to do any heavy computations,
  especially if very large arrays are involved. Numpy solves this problem thanks
  to the ndarray datatypes:
  \begin{itemize}
    \item efficient memory management (continuous block)
    \item use C loops instead of Python loops for computations on array
    \item vectorized operations (computations are done block by block, not
      element by element)
    \item rely on low-level routines for some operations (BLAS/LAPACK)
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Numpy}
  \begin{example}
    \begin{verbatim}

  import numpy as np

  a = np.array([1, 2, 3, 4])
  b = np.array([6, 7, 8, 9])

  c = a * b

  d = np.array([[1, 2], [3, 4]])
    \end{verbatim}
  \end{example}
\end{frame}

% pandas
\subsection{Pandas}
\begin{frame}
  \frametitle{Pandas}
  Pandas is a high-performance Python library used to work with data and analyze
  them. It contains many pre-implemented methods to read and parse data, as well
  as common statistical computations (mean, variance, correlation...).
  \\~\\

  Pandas has two main datatypes:
  \begin{itemize}
    \item Series: one-dimensional container. Indexes can be integers (like an
      array) or other objects (string, date...)
    \item DataFrame: tabular data, like a spreadsheet. It contains multiple
      rows and multiple columns. It can be seen as a collection of Series.
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Pandas}
  \begin{example}[Series]
    \begin{verbatim}

  from pandas import Series

  s1 = Series([4, 7, 9, -1])
  s2 = Series([12, 3.2, "John"],
              index=["mark", "gpa", "name"])
    \end{verbatim}
  \end{example}

  \begin{example}[DataFrame]
    \begin{verbatim}

  from pandas import DataFrame
  df = DataFrame({"key1": [1, 2, 3],
                  "key2": [4, 5, 6]})
    \end{verbatim}
  \end{example}
\end{frame}

% matplotlib
\subsection{Matplotlib}
\begin{frame}
  \frametitle{Matplotlib}
  Matplotlib is a plotting library used to visualize data and create graphics.
  Pandas directly uses matplotlib for representation.
  \begin{figure}[h]
    \includegraphics[width=8cm]{img/matplotlib_1.png}
  \end{figure}
\end{frame}

\begin{frame}
  \frametitle{Matplotlib}
  Matplotlib is a plotting library used to visualize data and create graphics.
  Pandas directly uses matplotlib for representation.
  \begin{figure}[h]
    \includegraphics[width=8cm]{img/matplotlib_2.png}
  \end{figure}
\end{frame}


%----------------------------------------
%    M A C H I N E   L E A R N I N G    -
%----------------------------------------

\section{Machine Learning}

% ML definition
\subsection{Definition}
\begin{frame}
  \frametitle{What is Machine Learning?}
  \begin{block}{Machine Learning (ML)}
    ML is when a computational system can \textbf{automatically learn} and
    extract \textbf{knowledge from data} without being explicitly programmed.
    ML is at the intersection of statistics, artificial intelligence and
    computer science.

    ML is now everywhere:
    \begin{itemize}
      \item automatic friend tagging in your Facebook photos
      \item music recommendation for Spotify/YouTube
      \item self-driven cars
      \item medical diagnosis in a hospital
      \item and many more...
    \end{itemize}
    ML can be \textbf{supervised} or \textbf{unsupervised}.
  \end{block}
\end{frame}

% ML history
\begin{frame}
  \frametitle{What is Machine Learning?}
  \begin{alertblock}{Brief history}
    At the beginning, many decision-making applications (like spam detection)
    were built with a lot of manually programmed if/else conditions. These
    methods have some drawbacks:
    \begin{itemize}
      \item it requires the \textbf{knowledge of an expert} to design the rules
      \item it is \textbf{very specific} to a task
    \end{itemize}

    With ML, you only need to feed a model with your data. The algorithm will
    find the rules by itself.
  \end{alertblock}
\end{frame}

% Supervised
\subsection{Supervised vs. Unsupervised}
\begin{frame}
  \frametitle{Supervised Learning}
  In supervised  learning, you give the algorithm two kinds of data:
  \begin{itemize}
    \item \textbf{inputs} (an image, information about a person...)
    \item \textbf{outputs} (name of the object, the salary...)
  \end{itemize}

  The algorithm learns to find a way to \textbf{produce the output given the
  input}. The algorithm is then capable of producing the output of an input it
  has never seen before. \\~\\
  Supervised learning can be used to:
  \begin{itemize}
    \item recognize handwritten digits (input=image, output=digit)
    \item identify spams (input=mail, output=yes/no)
    \item predict a price (input=house infomation, output=price)
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Supervised Learning}
  The inputs are in general real value vectors (sometimes it can be binary
  vectors). Each dimension of this vector is
  called a \textbf{feature}. \\~\\

  There are two main problems that can be solved with supervised learning:
  \begin{itemize}
    \item \textbf{classification}: the output is the class the item belongs to.
      It can be a binary classification problem (yes/no) or a multi-class
      classification problem (class 1, class 2, class 3...)
    \item \textbf{regression}: the output is a real value number
  \end{itemize}
\end{frame}

% Unsupervised
\begin{frame}
  \frametitle{Unsupervised Learning}
  In unsupervised  learning, there are no known output (or label), so you only
  give the algorithm the inputs. For example, if you want to regroup similar
  clients according to their customer habits, you do not know in advance what
  are the groups.\\~\\

  The main problems that can be solved with unsupervised learning are:
  \begin{itemize}
    \item \textbf{clustering}
    \item \textbf{dimensionality reduction}
  \end{itemize}

  The main difficulty of unsupervised learning algorithms is to \textbf{evaluate
  them}. Since we do not have any labels, there are not any "true" valid answer.
  Most of the time, we need to manually evaluate if the output of the algoritm
  is relevant. Sometimes, unsupervised learning algorithms are used as a
  \textbf{pre-processing} step before supervised learning algorithms and can
  lead to a more accurate model or a faster training.
\end{frame}

% Evaluation
\subsection{Training, test \& validation sets}
\begin{frame}
  \frametitle{How to evaluate a ML algorithm?}
  After training a ML algorithm with data, we need to evaluate its performance.
  One way would be to feed the algorithm with the same data, compute the output
  and compare it with the original output.\\~\\

  This method is bad because it does not tell us the capacity of the algorithm
  \textbf{to generalize} (is the algorithm able to perform well on unseen data
  ?). One way to solve this problem is to separate the data into 3 sets:
  \begin{itemize}
    \item training set: used to train the algorithm
    \item validation set: used to adjust the algorithm
    \item test set: used to measure the performance of the algorithm
  \end{itemize}
\end{frame}

% overfitting/underfitting
\subsection{Overfitting/Underfitting}
\begin{frame}
  \frametitle{Problems of ML}
  \begin{itemize}
    \item \textbf{underfitting}: when the model is too simple. The training
    score and the test score are both bad (left)
    \item \textbf{overfitting}: when the model is too complex (right). The
    training score is good but the test score is bad. The model is not able to
    generalize well.
  \end{itemize}
  \begin{center}
    \includegraphics[height=4.7cm]{img/underfitting.png}
  \end{center}
\end{frame}


%----------------------------
%    S U P E R V I S E D    -
%----------------------------

\section{Supervised Learning}

% KNN
\subsection{k-Nearest Neighbors}
\begin{frame}
  \frametitle{k-Nearest Neighbors - Classification}
  \textbf{Inputs}: set of training points $\mathcal{S}$, point $X$ to
  classify\\
  \textbf{Procedure}:
  \begin{itemize}
    \item find the $k$ closest points (the neighbors) to $X$ from $\mathcal{S}$
    \item find the most frequent class $c$ among the neighbors
    \item assign $c$ to $X$
  \end{itemize}

  \begin{columns}
    \column{0.48\linewidth}
      \parbox{\linewidth}{The $k$ closest points are defined as the points with
      minimal distance (usually the Euclidean distance) to $X$. If there is a
      tie in finding $c$, we can increase/decrease $k$ until the tie is broken.}

    \column{0.48\linewidth}
      \centering
      \includegraphics[height=3.8cm]{img/knn.png}
    \end{columns}
\end{frame}

\begin{frame}
  \frametitle{k-Nearest Neighbors - Regression}
  \textbf{Inputs}: set of training points $\mathcal{S}$, point $X$ to predict
  value

  \textbf{Procedure}:
  \begin{itemize}
    \item find the $k$ closest points (the neighbors) to $X$ from $\mathcal{S}$
    \item compute the average value $v$ of all neighbors
    \item assign $v$ to $X$
  \end{itemize}

  If $k$ is set to 1, the value assigned to $X$ is simply the value of its
  nearest neighbor.
\end{frame}

\begin{frame}
  \frametitle{k-Nearest Neighbors}
  The k-nearest neighbors is one of the simplest model in ML. It usually has two
  main parameters: \textbf{the number of neighbors} used and \textbf{the
  distance} used to find the closest points.
  \\~\\
  \textbf{Pros}
  \begin{itemize}
    \item model is easy to understand
    \item does not require extended tuning to achieve good performance
    \item works well as a baseline before training more complex models
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item slow when many training points or many features
    \item not very good on sparse data
  \end{itemize}
\end{frame}

% Linear models
\subsection{Linear Models}
\begin{frame}
  \frametitle{Linear Models - Regression}
  Linear models are usually used to make values prediction. The output $\hat{y}$
  is a \textbf{linear function} of each features $x_i$ of a given input vector
  $X$ (hence the name of \textbf{linear} models).

  \begin{equation*}
    \hat{y} = \sum_{i = 1}^k w_i * x_i + b
  \end{equation*}

  \begin{columns}
    \column{0.48\linewidth}
      \parbox{\linewidth}{The model learns the coefficients $w_i$ and $b$. The
      model is trained to minimize the \textbf{Mean Squared Error (MSE)} on all
      training points $X$ in $\mathcal{S}$ (where $y$ is the correct value of
      $X$).

      \begin{equation*}
        MSE = \frac{1}{\#\mathcal{S}} \sum_{X \in \mathcal{S}} (\hat{y} - y)^2
      \end{equation*}
      }

    \column{0.48\linewidth}
      \centering
      \includegraphics[height=3.6cm]{img/linear_reg.png}
  \end{columns}
\end{frame}

\begin{frame}
  \frametitle{Linear Models - Regression}
  \begin{itemize}
    \item \underline{Training}\\
      \textbf{Inputs}: set of training points $\mathcal{S}$\\
      \textbf{Procedure}:
      \begin{itemize}
        \item compute the MSE
        \item compute the partial derivatives $\frac{\partial MSE}{\partial
          w_i}$ and  $\frac{\partial MSE}{\partial b}$
        \item solve the linear system given by the equations $\frac{\partial
          MSE}{\partial w_i}=0$ and $\frac{\partial MSE}{\partial b}=0$
        \item the solution is unique and gives you the $w_i$ and $b$

      \end{itemize}
      \textbf{Output}: learned $w_i$ and $b$

    \item \underline{Predicting}\\
      \textbf{Inputs}: learned $w_i$ and $b$, point $X$ to predict value\\
      \textbf{Procedure}: compute $\hat{y}$ with the formula and $w_i$, $b$,
      $x_i$\\
      \textbf{Output}: predicted value $\hat{y}$

  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Linear Models - Classification}
  Linear models can also be used for either binary or multi-class classification
  problems. For binary classification (1 or 0 labels), the model evaluates the
  output with:

  \begin{equation*}
    \hat{y} =
      \begin{cases}
        1 & \text{if } w \cdot x + b \geq 0\\
        0 & \text{otherwise}
      \end{cases}
  \end{equation*}

  We can rewrite the condition to remove the bias $b$ with $\sum_{i = 0}^k w_i *
  x_i \geq 0$ and set $x_0$ to 1 for all vectors $X$. The objective of the model
  is to find a good $w$ to predict the correct label (such that $\hat{y} =
  y$). Different methods exist to find $w$:
  \begin{itemize}
    \item perceptron
    \item logistic regression
    \item support vector machines (SVM)
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Linear Models - Perceptron}
  We can define a loss $\ell$ (called the zero-one loss) for each point $(x, y)$
  from our training set as follows:

  \begin{equation*}
    \ell(x, y, w) =  1_{[(w \cdot x)y \leq 0]}
  \end{equation*}

  The perceptron algorithm learns the $w$ weights as follows:
  \begin{itemize}
    \item initialize $w$ to 0
    \item \textbf{for} step in [1..n]
      \begin{itemize}
        \item \textbf{for} x in training dataset
        \begin{itemize}
          \item[] \textbf{if $ (w \cdot x) \times y \leq 0$}\\
            \hspace{.5cm} w = w + $\lambda$ y $\cdot$ x
        \end{itemize}
      \end{itemize}
  \end{itemize}

  The perceptron algorithm only updates the weights $w$ when it encounters a
  misclassified example. If the dataset is \textbf{linearly separable}, the
  perceptron algorithm is guaranteed to find an optimal solution.
\end{frame}

\begin{frame}
  \frametitle{Linear Models - Logistic Regression}
  Instead of directly finding the output $\hat{y}$ with the sign function, we
  can compute the \textbf{probability of belonging to class 1} defined as
  follows:

  \begin{equation*}
    P(y=1 | x) = \sigma(w \cdot x) = \frac{1}{1 + e^{-w \cdot x}} \in [0, 1]
  \end{equation*}

  The idea is to find $w$ such that this probability is high (near 1) for points
  labeled with 1, and low (near 0) for points labeled with 0. We can define
  another loss function (called negative log-likelihood) :

  \begin{equation*}
    \ell(x, y, w) = - y \times \log(\sigma(w \cdot x))
                 - (1 - y) \times \log(1 - \sigma(w \cdot x))
  \end{equation*}

  The model is trained to minimize the loss for all training examples. Since
  this loss function is convex, we can use different algorithms like gradient
  descent, L-BFGS...
\end{frame}

\begin{frame}
  \frametitle{Linear Models}
  Linear models have two
  main parameters : \textbf{the weight of regularization} and \textbf{the
  type of regularization} used.
  \\~\\
  \textbf{Pros}
  \begin{itemize}
    \item models are fast to train and predict
    \item can be used with large datasets (scalability)
    \item works well on sparse data
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item learned coefficients are not very interpretable
    \item not very good at detecting correlated features
  \end{itemize}
\end{frame}

%% Decision Trees
\subsection{Decision Trees}
\begin{frame}
  \frametitle{Decision Trees}
  Decision trees are mostly used for classification problems. They learn a
  \textbf{sequence of conditions} (if/else questions) to find the class of a vector
  $x$.

  Let's suppose we have the following dataset and we want to predict if the game
  can be played (9 yes / 5 no) :
  \begin{table}[htb]
    \centering
    \tiny
      \begin{tabular}{@{}rcccc@{}}
      \cmidrule{2-5}
         & Outlook  & Humidity & Wind   & Play\\
      \midrule
      1  & Sunny    & High     & Weak   & No  \\
      2  & Sunny    & High     & Strong & No  \\
      3  & Overcast & High     & Weak   & Yes \\
      4  & Rain     & High     & Weak   & Yes \\
      5  & Rain     & Normal   & Weak   & Yes \\
      6  & Rain     & Normal   & Strong & No  \\
      7  & Overcast & Normal   & Strong & Yes \\
      8  & Sunny    & High     & Weak   & No  \\
      9  & Sunny    & Normal   & Weak   & Yes \\
      10 & Rain     & Normal   & Weak   & Yes \\
      11 & Sunny    & Normal   & Strong & Yes \\
      12 & Overcast & High     & Strong & Yes \\
      13 & Overcast & Normal   & Weak   & Yes \\
      14 & Rain     & High     & Strong & No  \\
      \bottomrule
    \end{tabular}
    \normalsize
  \end{table}
\end{frame}

\begin{frame}
\frametitle{Decision Trees}
  Many algorithms exist to create a decision tree. One of them is ID3. At each
  step, it computes the entropy of each unused attributes  with :

  \begin{equation*}
    Entropy(S) = \sum_{x \in X} -p(x) \log_{2}p(x)
  \end{equation*}

  where $p(x)$ is the ratio between the number of elements in $x$ and in $S$.
  The attribute that has the higher entropy is chosen as a node in the tree.

  \begin{columns}
    \column{0.48\linewidth}
    \begin{table}[htb]
      \centering
      \tiny
        \begin{tabular}{@{}rcccc@{}}
        \cmidrule{2-5}
           & Outlook  & Humidity & Wind   & Play\\
        \midrule
        1  & Sunny    & High     & Weak   & No  \\
        2  & Sunny    & High     & Strong & No  \\
        3  & Overcast & High     & Weak   & Yes \\
        4  & Rain     & High     & Weak   & Yes \\
        5  & Rain     & Normal   & Weak   & Yes \\
        6  & Rain     & Normal   & Strong & No  \\
        7  & Overcast & Normal   & Strong & Yes \\
        8  & Sunny    & High     & Weak   & No  \\
        9  & Sunny    & Normal   & Weak   & Yes \\
        10 & Rain     & Normal   & Weak   & Yes \\
        11 & Sunny    & Normal   & Strong & Yes \\
        12 & Overcast & High     & Strong & Yes \\
        13 & Overcast & Normal   & Weak   & Yes \\
        14 & Rain     & High     & Strong & No  \\
        \bottomrule
      \end{tabular}
      \normalsize
    \end{table}

    \column{0.48\linewidth}
      \centering
      \includegraphics[height=3.2cm]{img/id3.png}
  \end{columns}
\end{frame}

\begin{frame}
  \frametitle{Decision Trees}
  Decision trees have many parameters to control their size: maximum depth,
  number of leaf, acceptable ratio in a node... They provide a strong
  understandable tool for people not familiar with the ML field.
  \\~\\
  \textbf{Pros}
  \begin{itemize}
    \item can visually be understood
    \item can be used with large datasets
    \item no preprocessing needed (normalization of features)
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item tend to easily overfit
    \item poor generalization performance
  \end{itemize}
\end{frame}

% SVM
\subsection{Support Vector Machine}
\begin{frame}
  \frametitle{Support Vector Machine}
  Support Vector Machine (SVM) is an algorithm used to solve
  \textbf{classification problems}. The main goal is to find the \textbf{optimal
  hyperplane}.\\~\\

  \begin{columns}
    \column{0.48\linewidth}
      \centering
      \includegraphics[height=4.2cm]{img/svm_goal.png}

    \column{0.48\linewidth}
      \begin{itemize}
        \item ${H_1}$ is not correct
        \item ${H_2}$ is correct, but not the best
        \item ${H_3}$ is correct and optimal
      \end{itemize}
    \end{columns}
\end{frame}

\begin{frame}
  \frametitle{What is the optimal hyperplane?}
  If the training data is linearly separable, an infinite number of correct
  hyperplanes can be found. So we need to discriminate them to choose the
  optimal one. We introduce the notion of \textbf{margin} as the distance
  between the hyperplane and the closest points from training set.
  \\~\\
  The \textbf{optimal hyperplane} is the one with the \textbf{largest margin}.
  The data points on the margin are called \textbf{support vectors}.

  \begin{center}
    \includegraphics[height=3.8cm]{img/svm_margin.png}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{Finding the optimal hyperplane}
  \begin{columns}
    \column{0.48\linewidth}
      \parbox{\linewidth}{The equation of an hyperplane is:

      \begin{equation}
        \vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b = 0
      \end{equation}

      The equations of the hyperplanes representing the margins are:

      \begin{equation}
        \begin{cases}
          \vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b = +1\\
          \vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b = -1
        \end{cases}
      \end{equation}
      }

      The distance between the margins and the optimal hyperplane is:

      \begin{equation}
        \frac{1}{{\lvert \lvert w \rvert \rvert}^2}
      \end{equation}

    \column{0.48\linewidth}
      \centering
      \includegraphics[height=4.2cm]{img/svm_hyperplanes.png}
  \end{columns}
\end{frame}

\begin{frame}
  \frametitle{Finding the optimal hyperplane}
  The main goal of SVM is to \textbf{maximize the margin}, this means to
  \textbf{minimize the value of ${\lvert \lvert w \rvert \rvert}^2$}.

  We also want to correctly classify the points w.r.t. the margin (i.e. no
  points are between the two hyperplanes defining the margin). This means:
  \begin{equation}
    \begin{cases}
      \vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b \geq +1
        \text{ for labels = +1}\\
      \vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b \leq -1
        \text{ for labels = -1}
    \end{cases}
  \end{equation}

  This can be rewritten as:
  \begin{equation}
    y \times (\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b) \geq +1
  \end{equation}

  So the SVM solves (with the \textbf{Lagrangian multiplier} method):
  \begin{equation}
    \min_{w, b} {\lvert \lvert w \rvert \rvert}^2 \quad s.t. \quad
    y_i \times (\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x_i}} - b) \geq +1
    \quad \forall ~i
  \end{equation}
\end{frame}

\begin{frame}
  \frametitle{Soft margin}
  The assumption that our dataset is linearly separable is strong and not true
  for most of the cases. We can create a new model that will find the optimal
  hyperplane, but do not try to correctly classify every training point.  We
  introduce the \textbf{hinge loss} as:

  \begin{equation}
    \max (0, 1 - y_i \times
      (\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x_i}} - b))
  \end{equation}

  The problem to optimize becomes:
  \begin{equation}
    \min_{w, b} \left( {\lvert \lvert w \rvert \rvert}^2 + \lambda \sum_{i = 1}^n
    \max (0, 1 - y_i \times
      (\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x_i}} - b)) \right)
  \end{equation}
\end{frame}

\begin{frame}
  \frametitle{Support Vector Machine}
  SVM finds the optimal hyperplane by maximizing the margin. With hard margin,
  no tuning is required but it does not work if the data is not linearly
  separable. With soft margin, there is always a correct solution and the model
  is more robust to outliers.
  \\~\\
  \textbf{Pros}
  \begin{itemize}
    \item work well on high or low dimension data
    \item allow very complex boundaries (with kernel trick)
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item do not scale well
    \item require preprocessing and tuning
    \item hard to inspect and understand
  \end{itemize}
\end{frame}

% Neural nets
\subsection{Neural networks}
\begin{frame}
  \frametitle{Neural Networks}
  A neural network is a model used for either \textbf{classification} or
  \textbf{regression} problems. The main unit of a neural network is called a
  \textbf{neuron}:

  \begin{center}
    \includegraphics[height=2cm]{img/neuron.png}
  \end{center}

  Each neuron has a weight vector $w$ and a bias $b$. Its output is:
  \begin{equation}
    y = f(\boldsymbol{w} \cdot \boldsymbol{x} + b)
  \end{equation}
\end{frame}

\begin{frame}
  \frametitle{Neural Networks}
  When we combine several neurons, we obtain a neural network:

  \begin{center}
    \includegraphics[height=5cm]{img/nn_full.png}
  \end{center}

  Each neuron from one layer is connect to each neurons from the following
  layer. We call this the \textbf{fully connected} architecture.
\end{frame}

\begin{frame}
  \frametitle{Training neural networks}
  \begin{center}
    \includegraphics[height=3cm]{img/nn_basic.png}
  \end{center}
  The goal is to be able to predict the output given the input. We can compute
  the cost for one training example as the difference between the predicted
  output and its label:

  \begin{equation}
    C_i = \frac{1}{2}
    {\lvert \lvert \boldsymbol{y_i} - a^l(\boldsymbol{x_i}) \rvert \rvert}^2
  \end{equation}
\end{frame}

\begin{frame}
  \frametitle{Training neural networks}
  We want to minimize the cost $C_i$. We can use the \textbf{gradient descent}
  to do so. We compute the partial derivative of $C_i$ w.r.t. to the weights and
  biases of the last layer.

  \begin{equation}
    \frac{\partial C_i}{\partial w_{jk}^l} \quad and \quad
    \frac{\partial C_i}{\partial b_j^l}
  \end{equation}

  And then update the weights and biases with:
  \begin{equation}
    w_{jk}^l = w_{jk}^l - \lambda \times\frac{\partial C_i}{\partial w_{jk}^l}
  \end{equation}

  \begin{equation}
    b_j^l = b_j^l - \lambda \times\frac{\partial C_i}{\partial b_j^l}
  \end{equation}
\end{frame}

\begin{frame}
  \frametitle{Training neural networks}
  We repeat the same process on the layer before the last one, and again on the
  layer before and again until the input layer. Since we are moving backward, we
  are calling this the \textbf{backpropagation}.\\~\\

  We repeat this process for each training examples. Once we have seen all
  training examples, we call this an \textbf{epoch}. We repeat the backprogation
  algorithm until a certain number of epoch is done, or until the network has a
  good accuracy (or a cost below a threshold).
\end{frame}

\begin{frame}
  \frametitle{Neural Networks}
  Neural network is a really big subject and we only cover a fraction of it in
  this course. Many more implementations exist (convolutional neural network,
  deep network, recurrent neural network...). Neural networks have been proved
  to be able to compute any function (so in theory, it can always predict the
  right output given the input).
  \\~\\
  \textbf{Pros}
  \begin{itemize}
    \item can capture information in large datasets
    \item can build complex models
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item slow to train (if deep)
    \item require preprocessing (homogeneous data)
    \item hard to tune
  \end{itemize}
\end{frame}

%--------------------------------
%    U N S U P E R V I S E D    -
%--------------------------------

\section{Unsupervised Learning}

% PCA
\subsection{Principal Component Analysis}
\begin{frame}
  \frametitle{Representing data}
  \begin{columns}
    \column{0.48\linewidth}
      \begin{figure}
        \centering
        \includegraphics[height=1.2cm]{img/pca_1dim.png}
        \caption{1 dimension}
      \end{figure}
      \begin{figure}
        \centering
        \includegraphics[height=3.2cm]{img/pca_2dim.png}
        \caption{2 dimensions}
      \end{figure}

    \column{0.48\linewidth}
      \begin{figure}
        \centering
        \includegraphics[height=3.2cm]{img/pca_3dim.png}
        \caption{3 dimensions}
      \end{figure}
      \parbox{\linewidth}{
        \centering
        4 dimensions ?\\
        $n$ dimensions ?}
  \end{columns}
\end{frame}

\begin{frame}
  \frametitle{Reducing the dimension}
  To visualize data, we need to project it into a 1, 2 or 3 dimensional space
  (because we can not draw a representation if there are more than 3
  dimensions). So we \textbf{need to reduce the number of dimensions}. One way
  is to remove unimportant dimensions.\\~\\

  \centering
  \includegraphics[height=5cm]{img/pca_bef-aft.png}
\end{frame}

\begin{frame}
  \frametitle{Finding the important dimensions}
  We consider a feature to be important if \textbf{it has a lot of variations}.
  Indeed, if all the examples share almost the same value for a certain
  features, we can not extract useful information from this feature.  So the
  most important dimensions are the ones that \textbf{maximize the variance}.
  \\~\\
  \centering
  \includegraphics[height=4.5cm]{img/pca_2dim.png}
\end{frame}

\begin{frame}
  \frametitle{Principal Component Analysis}
  PCA finds the directions that have the most variations. If the vectors have
  $n$ dimension, PCA will find $n$ directions. The directions found are ranked
  according to their variance, so first direction has the most variations,
  second one has the second most variations...\\~\\

  These directions are called \textbf{principal components (PC)}: PC1 has the most
  variations, PC2 the second most variations...
  All principal components are \textbf{orthogonal} to each other. So PCA is like
  "finding a new set of axis" to represent the data. Then, by only selecting the
  2 or 3 first PC, we can draw our dataset into a space where we can visualize
  our data.\\~\\

  We can compute the PC with:
  \begin{itemize}
    \item eigenvalue decomposition of the covariance matrix
    \item singular value decomposition
  \end{itemize}
\end{frame}

% Clustering
\subsection{Clustering}
\begin{frame}
  \frametitle{Clustering}
  Clustering is a method to divide a dataset into a certain number of
  \textbf{clusters}. A cluster is a group of points that have minimal distance
  between them. Below is an example with 3 clusters.

  \center
  \includegraphics[height=5cm]{img/kmeans.png}
\end{frame}

\begin{frame}
  \frametitle{K-means clustering algorithm}
  K-means is one fast algorithm that can be used to separate a dataset into $k$
  clusters with complexity $\mathcal{O}(k*m*d)$.\\~\\

  \textbf{Inputs}: set of $n$ data points $\mathcal{X}$, number of clusters $k$

  \textbf{Procedure}:
  \begin{itemize}
    \item select $k$ points at random. They form the \textbf{centroids} $c_i \in
      C$
    \item \textbf{while} centroids do not converge
      \begin{itemize}
        \item \textbf{for} $x_i$ in $\mathcal{X}$
        \item[] - find nearest centroid $c_j$
        \item[] - assign $x_i$ to cluster $C_j$
      \end{itemize}
      \begin{itemize}
        \item \textbf{for} each cluster $C_j$
        \item[] - define new centroid $c_j$ as
                  $\frac{1}{n_j}\sum_{x_i \in C_j} x_i$
      \end{itemize}
  \end{itemize}
\end{frame}

\end{document}