\documentclass[aspectratio=169]{beamer}
\mode<presentation>
% \usetheme{Warsaw}
% \usetheme{Goettingen}
\usetheme{Hannover}
% \useoutertheme{default}
% \useoutertheme{infolines}
\useoutertheme{sidebar}
\usecolortheme{dolphin}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{enumerate}
% some bold math symbols
\newcommand{\Cov}{\mathrm{Cov}}
\newcommand{\Cor}{\mathrm{Cor}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\brho}{\boldsymbol{\rho}}
\newcommand{\bSigma}{\boldsymbol{\Sigma}}
\newcommand{\btheta}{\boldsymbol{\theta}}
\newcommand{\bbeta}{\boldsymbol{\beta}}
\newcommand{\bmu}{\boldsymbol{\mu}}
\newcommand{\bW}{\mathbf{W}}
\newcommand{\one}{\mathbf{1}}
\newcommand{\bH}{\mathbf{H}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bolde}{\mathbf{e}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\cpp}[1]{\texttt{#1}}
\title{Mathematical Biostatistics Boot Camp: Lecture 6, Likelihood}
\author{Brian Caffo}
\date{\today}
\institute[Department of Biostatistics]{
Department of Biostatistics \\
Johns Hopkins Bloomberg School of Public Health\\
Johns Hopkins University
}
\begin{document}
\frame{\titlepage}
\section{Table of contents}
\frame{
\frametitle{Table of contents}
\tableofcontents
}
\section{Defining likelihood}
\begin{frame}\frametitle{Likelihood}
\begin{itemize}
\item A common and fruitful approach to statistics is to assume that
the data arises from a family of distributions indexed by a
parameter that represents a useful summary of the distribution
\item The {\bf likelihood} of a collection of data is the joint
density evaluated as a function of the parameters with the data fixed
\item Likelihood analysis of data uses the likelihood to perform inference
regarding the unknown parameter
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Likelihood}
Given a statistical probability mass function or density, say $f(x,
\theta)$, where $\theta$ is an unknown parameter, the {\bf likelihood}
is $f$ viewed as a function of $\theta$ for a fixed, observed value of
$x$.
\end{frame}
\section{Interpreting likelihoods}
\begin{frame}\frametitle{Interpretations of likelihoods}
The likelihood has the following properties:
\begin{enumerate}
\item Ratios of likelihood values measure the relative {\bf
  evidence} for one value of the unknown parameter versus another.
\item Given a statistical model and observed data, all of the relevant
information contained in the data regarding the unknown parameter is
contained in the likelihood.
\item If $\{X_i\}$ are independent random variables, then their likelihoods
multiply. That is, the likelihood of the parameters given all of
the $X_i$ is simply the product of the individual likelihoods.
\end{enumerate}
\end{frame}
\begin{frame}\frametitle{Example}
\begin{itemize}
\item Suppose that we flip a coin with success probability $\theta$
\item Recall that the mass function for $x$ is
$$
f(x,\theta) = \theta^x(1 - \theta)^{1 - x} ~~~\mbox{for}~~~ \theta \in [0,1].
$$
where $x$ is either $0$ (Tails) or $1$ (Heads)
\item Suppose that the result is a head
\item The likelihood is
$$
{\cal L}(\theta, 1) = \theta^1 (1 - \theta)^{1 - 1} = \theta ~~~\mbox{for} ~~~ \theta \in [0,1].
$$
\item Therefore, ${\cal L}(.5, 1) / {\cal L}(.25, 1) = 2$
\item That is, there is twice as much evidence supporting the hypothesis that $\theta = .5$
  as the hypothesis that $\theta = .25$ (sketched in code on the next slide)
\end{itemize}
\end{frame}
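\begin{frame}[fragile]\frametitle{Computing the ratio}
A minimal Python sketch of this calculation (the function name is illustrative, not part of the original example):
\begin{verbatim}
# Bernoulli likelihood, viewed as a function of theta with x fixed
def likelihood(theta, x):
    return theta**x * (1 - theta)**(1 - x)

# One observed head (x = 1): evidence for theta = .5 over theta = .25
print(likelihood(0.5, 1) / likelihood(0.25, 1))  # prints 2.0
\end{verbatim}
\end{frame}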
\begin{frame}\frametitle{Example continued}
\begin{itemize}
\item Suppose now that we flip our coin from the previous example 4 times and
get the sequence 1, 0, 1, 1
\item The likelihood is:
\begin{eqnarray*}
{\cal L}(\theta, 1,0,1,1) & = & \theta^1 (1 - \theta)^{1 - 1}
\theta^0 (1 - \theta)^{1 - 0} \\
& \times & \theta^1 (1 - \theta)^{1 - 1}
\theta^1 (1 - \theta)^{1 - 1}\\
& = & \theta^3(1 - \theta)^1
\end{eqnarray*}
\item This likelihood only depends on the total number of heads and
the total number of tails; we might write ${\cal L}(\theta, 1, 3)$
for shorthand
\item Now consider ${\cal L}(.5, 1, 3) / {\cal L}(.25, 1, 3) = 5.33$
\item There is over five times as much evidence supporting the
  hypothesis that $\theta = .5$ as the hypothesis that $\theta = .25$
  (verified in code on the next slide)
\end{itemize}
\end{frame}
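\begin{frame}[fragile]\frametitle{Multiplying likelihoods}
Because the flips are independent, the joint likelihood is the product of the individual
likelihoods; a minimal Python sketch (names are illustrative):
\begin{verbatim}
def likelihood(theta, x):
    return theta**x * (1 - theta)**(1 - x)

flips = [1, 0, 1, 1]              # the observed sequence

def joint_likelihood(theta):
    prod = 1.0
    for x in flips:
        prod *= likelihood(theta, x)
    return prod                   # equals theta**3 * (1 - theta)

print(joint_likelihood(0.5) / joint_likelihood(0.25))  # about 5.33
\end{verbatim}
\end{frame}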
\section{Plotting likelihoods}
\begin{frame}\frametitle{Plotting likelihoods}
\begin{itemize}
\item Generally, we want to consider all the values of $\theta$ between 0 and 1
\item A {\bf likelihood plot} displays ${\cal L}(\theta, x)$ as a
  function of $\theta$
\item Usually, it is divided by its maximum value so that its height
is 1
\item Because the likelihood measures {\em relative evidence}, dividing the
curve by its maximum value (or any other value for that matter) does
not change its interpretation
\end{itemize}
\end{frame}
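\begin{frame}[fragile]\frametitle{Sketch of a likelihood plot}
A figure like the one on the next slide can be produced along these lines (a minimal
sketch, not the script used for the original figure):
\begin{verbatim}
import numpy as np
import matplotlib.pyplot as plt

# Likelihood for 3 heads in 4 flips, scaled so its maximum is 1
theta = np.linspace(0.001, 0.999, 500)
lik = theta**3 * (1 - theta)
lik = lik / lik.max()

plt.plot(theta, lik)
plt.xlabel("theta")
plt.ylabel("relative likelihood")
plt.show()
\end{verbatim}
\end{frame}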
\begin{frame}
\includegraphics[width=4.5in]{coinLikelihood.pdf}
\end{frame}
\section{Maximum likelihood}
\begin{frame}\frametitle{Maximum likelihood}
\begin{itemize}
\item The value of $\theta$ where the curve reaches its maximum has a special meaning
\item It is the value of $\theta$ that is best supported by the
  data
\item This point is called the {\bf maximum likelihood estimate} (or
MLE) of $\theta$
$$
\mathrm{MLE} = \mathrm{argmax}_\theta \, {\cal L}(\theta, x).
$$
\item Another interpretation of the MLE is that it is the value of
$\theta$ that would make the data that we observed most probable
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Maximum likelihood, coin example}
\begin{itemize}
\item The maximum likelihood estimate for $\theta$ is always the proportion of heads
\item Proof: Let $x$ be the number of heads and $n$ be the number of trials
\item Recall
$$
{\cal L}(\theta, x) = \theta^x(1-\theta)^{n-x}
$$
\item It's easier to maximize the {\bf log-likelihood}
$$
l(\theta, x) = x \log(\theta) + (n - x)\log(1 - \theta)
$$
\end{itemize}
\end{frame}
\begin{frame}\frametitle{Continued}
\begin{itemize}
\item Taking the derivative we get
$$
\frac{d}{d\theta} l(\theta, x) = \frac{x}{\theta} - \frac{n-x}{1 - \theta}
$$
\item Setting this equal to zero implies
$$
\left(1 - \frac{x}{n}\right)\theta = (1 - \theta) \frac{x}{n}
$$
\item This is solved by $\theta = \frac{x}{n}$ (checked numerically on the next slide)
\item Notice that the second derivative
$$
\frac{d^2}{d\theta^2} l(\theta, x) = -\frac{x}{\theta^2} - \frac{n-x}{(1 - \theta)^2} < 0
$$
provided that $x$ is not $0$ or $n$ (do these cases on your own)
\end{itemize}
\end{frame}
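\begin{frame}[fragile]\frametitle{Checking the MLE numerically}
A quick numerical check of the result $\hat\theta = x/n$, maximizing the log-likelihood
over a grid (a sketch; any optimizer would do):
\begin{verbatim}
import numpy as np

x, n = 3, 4                       # 3 heads in 4 flips
theta = np.linspace(0.001, 0.999, 100001)
loglik = x * np.log(theta) + (n - x) * np.log(1 - theta)
print(theta[np.argmax(loglik)])   # approximately 0.75 = x/n
\end{verbatim}
\end{frame}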
\section{Interpreting likelihood ratios}
\begin{frame}\frametitle{What constitutes strong evidence?}
\begin{itemize}
\item Again imagine an experiment where a person repeatedly flips a
coin
\item Consider the possibility that we are entertaining three
hypotheses: $H_1:\theta = 0$, $H_2:\theta=.5$, and $H_3:\theta = 1$
\end{itemize}
\end{frame}
\begin{frame}
\tiny
\begin{tabular}{rccccc}
Outcome $X$ & $P(X ~|~ H_1)$ & $P(X ~|~ H_2)$ & $P(X ~|~ H_3)$ & ${\cal L}(H_1) / {\cal L}(H_2)$ & ${\cal L}(H_3) / {\cal L}(H_2)$ \\ \hline
H & 0 & .5 & 1 & 0 & 2 \\
T & 1 & .5 & 0 & 2 & 0 \\ \hline
HH & 0 & .25 & 1 & 0 & 4 \\
HT & 0 & .25 & 0 & 0 & 0 \\
TH & 0 & .25 & 0 & 0 & 0 \\
TT & 1 & .25 & 0 & 4 & 0 \\ \hline
HHH & 0 & .125 & 1 & 0 & 8 \\
HHT & 0 & .125 & 0 & 0 & 0 \\
HTH & 0 & .125 & 0 & 0 & 0 \\
THH & 0 & .125 & 0 & 0 & 0 \\
HTT & 0 & .125 & 0 & 0 & 0 \\
THT & 0 & .125 & 0 & 0 & 0 \\
TTH & 0 & .125 & 0 & 0 & 0 \\
TTT & 1 & .125 & 0 & 8 & 0 \\ \hline
\end{tabular}
\normalsize
\end{frame}
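\begin{frame}[fragile]\frametitle{Reproducing the table}
The table on the previous slide can be reproduced by enumerating outcomes; a minimal
Python sketch (names are illustrative):
\begin{verbatim}
from itertools import product

def seq_prob(seq, theta):         # P(sequence | theta)
    p = 1.0
    for flip in seq:
        p *= theta if flip == "H" else (1 - theta)
    return p

for n in (1, 2, 3):
    for outcome in product("HT", repeat=n):
        s = "".join(outcome)
        p1, p2, p3 = (seq_prob(s, t) for t in (0.0, 0.5, 1.0))
        print(s, p1, p2, p3, p1 / p2, p3 / p2)
\end{verbatim}
\end{frame}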
\begin{frame}\frametitle{Benchmarks}
\begin{itemize}
\item Using this example as a guide, researchers tend to think of a likelihood ratio
\begin{itemize}
\item of $8$ as being moderate evidence
\item of $16$ as being moderately strong evidence
\item of $32$ as being strong evidence
\end{itemize}
of one hypothesis over another
\item Because of this, it is common to draw reference lines at these values on likelihood plots (sketched on the next slide)
\item Parameter values above the $1/8$ reference line, for example, are such that no other point
is more than 8 times better supported given the data
\end{itemize}
\end{frame}
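\begin{frame}[fragile]\frametitle{Reference lines, sketched}
The customary $1/8$, $1/16$, and $1/32$ reference lines can be added to a likelihood
plot as follows (a sketch, continuing the earlier plotting example):
\begin{verbatim}
import numpy as np
import matplotlib.pyplot as plt

theta = np.linspace(0.001, 0.999, 500)
lik = theta**3 * (1 - theta)
lik = lik / lik.max()             # scale so the maximum is 1

plt.plot(theta, lik)
for level in (1/8, 1/16, 1/32):
    plt.axhline(level, linestyle="--")
plt.xlabel("theta")
plt.ylabel("relative likelihood")
plt.show()
\end{verbatim}
\end{frame}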
\end{document}