-
Notifications
You must be signed in to change notification settings - Fork 1
/
10-local-decoding-cis.Rmd
355 lines (306 loc) · 13.7 KB
/
10-local-decoding-cis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
# Local decoding confidence intervals {#local-decoding-cis}
`TMB` allows us to retrieve standard errors for any quantity computed in `C++` that is derived from the model parameters.
Therefore, we can derive CIs for the quantities defined in the [state inference section](#state-inference).
We compute CIs for smoothing probabilities of a Poisson and a Gaussian HMM.
## Likelihood function (Poisson HMM)
We define the `C++` code first, available in the file *[code/poi_hmm_smoothing.cpp](#poi_hmm_smoothing.cpp)* and below.
Similarly to the `R` code, we compute the log-forward and log-backward probabilities, derive smoothing probabilities, truncate them (optional), and return them with `ADREPORT`.
We also produce the sequence of most likely states as a by-product using the variable `ldecode`.
```{Rcpp 10poi_hmm_smoothing.cpp, code=readLines("code/poi_hmm_smoothing.cpp"), eval=FALSE}
```
## Estimation
In `R`, we can use the `C++` objective function with `TMB` to return smoothing probabilities.
Note that they are returned column-wise. For example, the first `m` (here 2) values correspond to the smoothing probabilities of the first and second hidden states at the first data point.
```{r 10smoothing_tyt}
set.seed(1)
# Load TMB, the data set, and utility functions
library(TMB)
source("functions/utils.R")
load("data/tinnitus.RData")
# Compile and load the C++ objective function that ADREPORTs the
# smoothing probabilities (code/poi_hmm_smoothing.cpp)
TMB::compile("code/poi_hmm_smoothing.cpp")
dyn.load(dynlib("code/poi_hmm_smoothing"))
# Number of hidden states
m <- 2
data_set <- tinn_data
# Setup initial parameters
# Poisson means spread across the empirical quantiles of the data
lambda <- seq(quantile(data_set, 0.1),
quantile(data_set, 0.9),
length.out = m)
# Transition probability matrix: 0.8 on the diagonal, remaining mass
# shared equally among off-diagonal entries
gamma <- matrix(0.2 / (m - 1),
nrow = m,
ncol = m)
diag(gamma) <- 0.8
# Stationary distribution implied by gamma (helper from functions/utils.R)
delta <- stat.dist(gamma)
# Create the TMB objects for estimation
TMB_data <- list(x = data_set, m = m)
# Map natural parameters to unconstrained working scale (helper from utils.R)
parameters <- pois.HMM.pn2pw(m, lambda, gamma)
obj_tmb <- MakeADFun(data = TMB_data,
parameters = parameters,
DLL = "poi_hmm_smoothing",
silent = TRUE)
# Estimation
mod_tmb <- nlminb(start = obj_tmb$par, objective = obj_tmb$fn,
gradient = obj_tmb$gr, hessian = obj_tmb$he)
# Retrieve estimates and delta-method standard errors of ADREPORTed quantities
adrep <- summary(sdreport(obj_tmb))
lambda <- obj_tmb$report(obj_tmb$env$last.par.best)$lambda
gamma <- obj_tmb$report(obj_tmb$env$last.par.best)$gamma
# We are interested only in the smoothing probabilities
adrep <- adrep[rownames(adrep) == "smoothing_probs", ]
# We follow the notation of (Zucchini et al., 2016)
# One row per hidden state, one column per data
# (values come back column-wise, so matrix() restores the m x n layout)
smoothing_probs <- matrix(adrep[, "Estimate"], nrow = m)
smoothing_probs_std_error <- matrix(adrep[, "Std. Error"], nrow = m)
# 95% confidence intervals
# Normal quantile for two-sided Wald-type CIs
q95_norm <- qnorm(1 - 0.05 / 2)
```
## Results
Next, we plot smoothing probabilities for each state along with their [Wald-type CI](#Wald-type).
```{r 10plot_smoothing_probabilities}
# Aesthetic parameters used in the article
ERRORBAR_LINE_WIDTH <- 0.2
ERRORBAR_END_WIDTH <- 1.1
POINT_SIZE <- 0.2
LINE_SIZE <- 0.25
COLORS <- c("dodgerblue2", "tomato3", "green", "purple", "gold")
# Number of data points: one column of smoothing probabilities per point
n <- ncol(smoothing_probs)
## Plotting (with ggplot2)
library(ggplot2)
state_ggplot <- list()
for (state in seq_len(m)) {
  # Smoothing probabilities of the current state with their standard errors
  stateprobs <- data.frame(idx = seq_len(n),
                           estimate = smoothing_probs[state, ],
                           std.error = smoothing_probs_std_error[state, ])
  # Wald-type 95% CI bounds, truncated to the probability scale [0, 1]
  stateprobs$ci_lower <- pmax(0, stateprobs$estimate - q95_norm * stateprobs$std.error)
  stateprobs$ci_upper <- pmin(stateprobs$estimate + q95_norm * stateprobs$std.error, 1)
  state_ggplot[[state]] <- ggplot(stateprobs, aes(x = idx)) +
    geom_ribbon(aes(ymin = ci_lower,
                    ymax = ci_upper),
                fill = "lightgrey") +
    # width is a constant, so it is a layer parameter, not an aes() mapping
    # (mapping constants inside aes() is deprecated ggplot2 usage)
    geom_errorbar(aes(ymin = ci_lower,
                      ymax = ci_upper),
                  width = ERRORBAR_END_WIDTH,
                  color = COLORS[state],
                  show.legend = FALSE,
                  linewidth = ERRORBAR_LINE_WIDTH) +
    geom_point(aes(y = estimate), size = POINT_SIZE, shape = 16) +
    geom_line(aes(y = estimate), linewidth = LINE_SIZE) +
    ggtitle(bquote("State" ~ .(state) * "," ~ widehat(lambda) ~ "=" ~ .(round(lambda[state], 2)))) +
    theme_Publication() + # Custom defined function, this is optional
    ylim(0, 1) +
    xlab("") +
    ylab("Probability")
}
# STATE MOST LIKELY
# Most likely state at each time, as reported by the C++ objective (ldecode)
ldecode <- obj_tmb$report(obj_tmb$env$last.par.best)$ldecode
# Pre-allocate instead of growing vectors with c() inside the loop
ldecode_estimates <- numeric(n)
ldecode_std.errors <- numeric(n)
for (i in seq_len(n)) {
  # Probability (and std. error) of the most likely state at time i
  ldecode_estimates[i] <- smoothing_probs[ldecode[i], i]
  ldecode_std.errors[i] <- smoothing_probs_std_error[ldecode[i], i]
}
stateprobs <- data.frame(idx = seq_len(n),
                         estimate = ldecode_estimates,
                         std.error = ldecode_std.errors,
                         state = as.factor(ldecode))
# Wald-type 95% CI bounds, truncated to [0, 1]
stateprobs$ci_lower <- pmax(0, stateprobs$estimate - q95_norm * stateprobs$std.error)
stateprobs$ci_upper <- pmin(stateprobs$estimate + q95_norm * stateprobs$std.error, 1)
state_likely_ggplot <- ggplot(stateprobs, aes(x = idx)) +
  geom_ribbon(aes(ymin = ci_lower,
                  ymax = ci_upper),
              fill = "lightgrey") +
  # Color is mapped to the decoded state; width stays a constant parameter
  geom_errorbar(aes(ymin = ci_lower,
                    ymax = ci_upper,
                    color = state),
                width = ERRORBAR_END_WIDTH,
                show.legend = FALSE,
                linewidth = ERRORBAR_LINE_WIDTH) +
  geom_point(aes(y = estimate), size = POINT_SIZE) +
  geom_line(aes(y = estimate), linewidth = LINE_SIZE) +
  scale_color_manual(values = COLORS[seq_len(m)], breaks = seq_len(m)) +
  ggtitle("Most likely states") +
  theme_Publication() + # Custom defined function, this is optional
  xlab("Time (days)") +
  ylab("Probability") +
  ylim(0, 1)
# Display the graphs in one figure
library(ggpubr)
ggarrange(ggarrange(plotlist = state_ggplot,
                    ncol = 2, nrow = ceiling(m / 2)),
          state_likely_ggplot,
          ncol = 1, nrow = 2)
```
## Limitation
The TYT data set used above is short and does not pose computing issues.
However, with a larger data set, retrieving the standard errors of the smoothing probabilities becomes time-consuming and can take days or weeks depending on the amount of data, because the delta method is applied repeatedly when these errors are retrieved via `summary(sdreport(obj_tmb))`.
A solution to this problem is to truncate the smoothing probability matrix before it is returned by `TMB`, in the objective function defined above.
An example of what the function can be is in the file *[code/poi_hmm_smoothing_truncated.cpp](#poi_hmm_smoothing_truncated.cpp)* and below.
```{Rcpp 10poi_hmm_smoothing_truncated.cpp, code=readLines("code/poi_hmm_smoothing_truncated.cpp"), eval=FALSE}
```
:::{#note-text .note}
It is important to note that the values `start_row`, `start_col`, `nb_rows`, and `nb_cols` can be set to `NA` to retrieve the entire matrix of smoothing probabilities.
They serve to truncate the number of smoothing probabilities returned, as there is one per combination of data point and hidden state. With 2000 data points and 5 hidden states, there would be 10000 smoothing probabilities in a 5 x 2000 matrix.
:::
If needed, one can return all smoothing probabilities by estimating the HMM multiple times and returning each time a different block of the matrix of smoothing probabilities, then aggregating these blocks.
To make use of this, in `R`, we need to load the modified likelihood function as follows.
```{r 10load_truncated_nll_cpp, eval = FALSE}
TMB::compile("code/poi_hmm_smoothing_truncated.cpp")
dyn.load(dynlib("code/poi_hmm_smoothing_truncated"))
```
.
Then, the `TMB_data` object can be defined as
```{r 10smoothing_tyt_truncated_no_change, eval = FALSE}
TMB_data <- list(x = data_set, m = m,
start_row = 0,
start_col = 0,
nb_rows = m,
nb_cols = length(data_set))
```
or
```{r 10smoothing_tyt_truncated_NA, eval = FALSE}
TMB_data <- list(x = data_set, m = m,
start_row = NA,
start_col = NA,
nb_rows = NA,
nb_cols = NA)
```
and the same smoothing probabilities will be returned as above.
Returning truncated smoothing probabilities can be done as follows
```{r 10smoothing_tyt_truncated, eval = FALSE}
TMB_data <- list(x = data_set, m = m,
start_row = 0, # C++ indexes at 0, unlike R which indexes from 1. Be careful.
start_col = 50, # C++ indexes at 0, unlike R which indexes from 1. Be careful.
nb_rows = m,
nb_cols = 30)
```
.
This returns the \(m\) smoothing probabilities associated with data points 51, 52, \(\ldots\), 80.
:::{#important-text .important}
Note that the name of the returned smoothing probabilities changes from `smoothing_probs` to `truncated_smoothing_probs` for clarity. This is reflected in the row names of `summary(sdreport(obj_tmb))`.
:::
Let us show this in action and plot these probabilities as above. We first build the model and retrieve the estimates.
```{r 10truncated_smoothing_tyt}
set.seed(1)
# Load TMB, the data set, and utility functions
library(TMB)
source("functions/utils.R")
load("data/tinnitus.RData")
# Compile and load the truncated variant of the objective function
# (code/poi_hmm_smoothing_truncated.cpp)
TMB::compile("code/poi_hmm_smoothing_truncated.cpp")
dyn.load(dynlib("code/poi_hmm_smoothing_truncated"))
# Number of hidden states
m <- 2
data_set <- tinn_data
# Setup initial parameters
# Poisson means spread across the empirical quantiles of the data
lambda <- seq(quantile(data_set, 0.1),
quantile(data_set, 0.9),
length.out = m)
# Transition probability matrix: 0.8 on the diagonal, remaining mass
# shared equally among off-diagonal entries
gamma <- matrix(0.2 / (m - 1),
nrow = m,
ncol = m)
diag(gamma) <- 0.8
# Stationary distribution implied by gamma (helper from functions/utils.R)
delta <- stat.dist(gamma)
# Create the TMB objects for estimation
# start_row/start_col are 0-based (C++ indexing): this requests the m x 30
# block of smoothing probabilities for data points 51 to 80
TMB_data <- list(x = data_set, m = m,
start_row = 0,
start_col = 50,
nb_rows = m,
nb_cols = 30)
# Map natural parameters to unconstrained working scale (helper from utils.R)
parameters <- pois.HMM.pn2pw(m, lambda, gamma)
obj_tmb <- MakeADFun(data = TMB_data,
parameters = parameters,
DLL = "poi_hmm_smoothing_truncated",
silent = TRUE)
# Estimation
mod_tmb <- nlminb(start = obj_tmb$par, objective = obj_tmb$fn,
gradient = obj_tmb$gr, hessian = obj_tmb$he)
# Retrieve estimates and delta-method standard errors of ADREPORTed quantities
adrep <- summary(sdreport(obj_tmb))
lambda <- obj_tmb$report(obj_tmb$env$last.par.best)$lambda
gamma <- obj_tmb$report(obj_tmb$env$last.par.best)$gamma
# We are interested only in the smoothing probabilities
# (row name matches the ADREPORT name in the truncated C++ objective)
adrep <- adrep[rownames(adrep) == "truncated_smoothing_probs", ]
# We follow the notation of (Zucchini et al., 2016)
# One row per hidden state, one column per data
smoothing_probs <- matrix(adrep[, "Estimate"], nrow = m)
smoothing_probs_std_error <- matrix(adrep[, "Std. Error"], nrow = m)
# 95% confidence intervals
# Normal quantile for two-sided Wald-type CIs
q95_norm <- qnorm(1 - 0.05 / 2)
```
Then we plot the smoothing probabilities and confidence intervals.
```{r 10plot_truncated_smoothing_probabilities}
# Aesthetic parameters used in the article
ERRORBAR_LINE_WIDTH <- 0.2
ERRORBAR_END_WIDTH <- 1.1
POINT_SIZE <- 0.2
LINE_SIZE <- 0.25
COLORS <- c("dodgerblue2", "tomato3", "green", "purple", "gold")
# Number of returned (truncated) smoothing probability columns (here 30)
n <- ncol(smoothing_probs)
# Original time indices covered by the truncated block (start_col = 50,
# nb_cols = 30, shifted to R's 1-based indexing)
indices_to_plot <- 51:80
## Plotting (with ggplot2)
library(ggplot2)
state_ggplot <- list()
for (state in seq_len(m)) {
  # Smoothing probabilities of the current state with their standard errors
  stateprobs <- data.frame(idx = indices_to_plot,
                           estimate = smoothing_probs[state, ],
                           std.error = smoothing_probs_std_error[state, ])
  # Wald-type 95% CI bounds, truncated to the probability scale [0, 1]
  stateprobs$ci_lower <- pmax(0, stateprobs$estimate - q95_norm * stateprobs$std.error)
  stateprobs$ci_upper <- pmin(stateprobs$estimate + q95_norm * stateprobs$std.error, 1)
  state_ggplot[[state]] <- ggplot(stateprobs, aes(x = idx)) +
    geom_ribbon(aes(ymin = ci_lower,
                    ymax = ci_upper),
                fill = "lightgrey") +
    # width is a constant, so it is a layer parameter, not an aes() mapping
    # (mapping constants inside aes() is deprecated ggplot2 usage)
    geom_errorbar(aes(ymin = ci_lower,
                      ymax = ci_upper),
                  width = ERRORBAR_END_WIDTH,
                  color = COLORS[state],
                  show.legend = FALSE,
                  linewidth = ERRORBAR_LINE_WIDTH) +
    geom_point(aes(y = estimate), size = POINT_SIZE, shape = 16) +
    geom_line(aes(y = estimate), linewidth = LINE_SIZE) +
    ggtitle(bquote("State" ~ .(state) * "," ~ widehat(lambda) ~ "=" ~ .(round(lambda[state], 2)))) +
    theme_Publication() +
    ylim(0, 1) +
    xlab("") +
    ylab("Probability")
}
# STATE MOST LIKELY
ldecode <- obj_tmb$report(obj_tmb$env$last.par.best)$ldecode
# NOTE(review): smoothing_probs only covers data points 51-80, but ldecode
# is indexed below with i = 1..n. If the C++ side reports ldecode for the
# full series, this pairs the decoded state of day i with the probability of
# day i + 50; confirm against poi_hmm_smoothing_truncated.cpp whether
# ldecode is truncated to the same block.
# Pre-allocate instead of growing vectors with c() inside the loop
ldecode_estimates <- numeric(n)
ldecode_std.errors <- numeric(n)
for (i in seq_len(n)) {
  ldecode_estimates[i] <- smoothing_probs[ldecode[i], i]
  ldecode_std.errors[i] <- smoothing_probs_std_error[ldecode[i], i]
}
stateprobs <- data.frame(idx = indices_to_plot,
                         estimate = ldecode_estimates,
                         std.error = ldecode_std.errors,
                         state = as.factor(ldecode))
# Wald-type 95% CI bounds, truncated to [0, 1]
stateprobs$ci_lower <- pmax(0, stateprobs$estimate - q95_norm * stateprobs$std.error)
stateprobs$ci_upper <- pmin(stateprobs$estimate + q95_norm * stateprobs$std.error, 1)
state_likely_ggplot <- ggplot(stateprobs, aes(x = idx)) +
  geom_ribbon(aes(ymin = ci_lower,
                  ymax = ci_upper),
              fill = "lightgrey") +
  # Color is mapped to the decoded state; width stays a constant parameter
  geom_errorbar(aes(ymin = ci_lower,
                    ymax = ci_upper,
                    color = state),
                width = ERRORBAR_END_WIDTH,
                show.legend = FALSE,
                linewidth = ERRORBAR_LINE_WIDTH) +
  geom_point(aes(y = estimate), size = POINT_SIZE) +
  geom_line(aes(y = estimate), linewidth = LINE_SIZE) +
  scale_color_manual(values = COLORS[seq_len(m)], breaks = seq_len(m)) +
  ggtitle("Most likely states") +
  theme_Publication() +
  xlab("Time (days)") +
  ylab("Probability") +
  ylim(0, 1)
# Display the graphs in one figure
library(ggpubr)
ggarrange(ggarrange(plotlist = state_ggplot,
                    ncol = 2, nrow = ceiling(m / 2)),
          state_likely_ggplot,
          ncol = 1, nrow = 2)
```