From a909ceaa138b6bdfc71c443d0906150d92ef0273 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Sun, 31 Aug 2025 19:58:48 +0300 Subject: [PATCH 01/40] add diff diagnostics --- R/loo_compare.R | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 028fa59b..9dd182f0 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -116,7 +116,23 @@ loo_compare.default <- function(x, ...) { diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) elpd_diff <- apply(diffs, 2, sum) se_diff <- apply(diffs, 2, se_elpd_diff) - comp <- cbind(elpd_diff = elpd_diff, se_diff = se_diff, comp) + p_worse <- pnorm(0, elpd_diff, se_diff) + p_worse[elpd_diff==0] <- NA + N <- nrow(diffs) + if (N<100) { + diag_pnorm <- rep("N < 100", length(elpd_diff)) + diag_pnorm[elpd_diff==0] = "" + } else { + diag_pnorm <- rep("", length(elpd_diff)) + diag_pnorm[elpd_diff>-4 & elpd_diff!=0] <- "similar predictions" + khat_diff <- rep(NA, length(elpd_diff)) + khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) + diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > ", .fr(ps_khat_threshold(N), 2)) + } + rownames(comp) <- rnms + comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, + p_worse = p_worse, diag_pnorm = diag_pnorm), + as.data.frame(comp)) rownames(comp) <- rnms # run order statistics-based checks on models @@ -143,7 +159,7 @@ print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE) { } else if (NCOL(xcopy) >= 2 && simplify) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - print(.fr(xcopy, digits), quote = FALSE) + print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) invisible(x) } @@ -172,7 +188,6 @@ se_elpd_diff <- function(diffs) { sqrt(N) * sd(diffs) } - #' Perform checks on `"loo"` objects before comparison #' @noRd #' @param loos List of `"loo"` objects. From b94019298d58433455092257a4f53c21e1897420 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Thu, 16 Oct 2025 16:41:41 +0300 Subject: [PATCH 02/40] add comments and option whether the probabilities are printed --- R/loo_compare.R | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 9dd182f0..3c5bd883 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -116,15 +116,22 @@ loo_compare.default <- function(x, ...) 
{ diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) elpd_diff <- apply(diffs, 2, sum) se_diff <- apply(diffs, 2, se_elpd_diff) + # compute probabilities that a model has worse elpd than the best model + # using a normal approximation (Sivula et al., 2025) p_worse <- pnorm(0, elpd_diff, se_diff) p_worse[elpd_diff==0] <- NA N <- nrow(diffs) + # diagnostics to assess whether the normal approximation can be trusted if (N<100) { + # small N (Sivula et al., 2025) diag_pnorm <- rep("N < 100", length(elpd_diff)) diag_pnorm[elpd_diff==0] = "" } else { diag_pnorm <- rep("", length(elpd_diff)) + # similar predictions (Sivula et al., 2025) diag_pnorm[elpd_diff>-4 & elpd_diff!=0] <- "similar predictions" + # possible outliers in differences (Sivula et al., 2025; + # Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > ", .fr(ps_khat_threshold(N), 2)) @@ -147,9 +154,10 @@ loo_compare.default <- function(x, ...) { #' @param digits For the print method only, the number of digits to use when #' printing. #' @param simplify For the print method only, should only the essential columns -#' of the summary matrix be printed? The entire matrix is always returned, but -#' by default only the most important columns are printed. -print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE) { +#' of the summary matrix be printed? The entire matrix is always returned, bu#' @param pnorm For the print method only, should we include the normal +#' approximation based probability of model having worse performance than +#' the best model +print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, pnorm = FALSE) { xcopy <- x if (inherits(xcopy, "old_compare.loo")) { if (NCOL(xcopy) >= 2 && simplify) { @@ -159,12 +167,16 @@ print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE) { } else if (NCOL(xcopy) >= 2 && simplify) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) - invisible(x) + if (p_worse) { + print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) + invisible(x) + } else { + print(cbind(.fr(xcopy, digits), quote = FALSE)) + invisible(x) + } } - # internal ---------------------------------------------------------------- #' Compute pointwise elpd differences From 6fe41bb9b7d2760e9274a2a084e8d50b3f59dca7 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:22:28 +0300 Subject: [PATCH 03/40] Apply Jonah's suggestions from code review Co-authored-by: Jonah Gabry --- R/loo_compare.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 3c5bd883..0355c0ef 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -118,7 +118,7 @@ loo_compare.default <- function(x, ...) { se_diff <- apply(diffs, 2, se_elpd_diff) # compute probabilities that a model has worse elpd than the best model # using a normal approximation (Sivula et al., 2025) - p_worse <- pnorm(0, elpd_diff, se_diff) + p_worse <- stats::pnorm(0, elpd_diff, se_diff) p_worse[elpd_diff==0] <- NA N <- nrow(diffs) # diagnostics to assess whether the normal approximation can be trusted @@ -154,9 +154,11 @@ loo_compare.default <- function(x, ...) 
{ #' @param digits For the print method only, the number of digits to use when #' printing. #' @param simplify For the print method only, should only the essential columns -#' of the summary matrix be printed? The entire matrix is always returned, bu#' @param pnorm For the print method only, should we include the normal -#' approximation based probability of model having worse performance than -#' the best model +#' of the summary matrix be printed? The entire matrix is always returned, but +#' by default only the most important columns are printed. +#' @param pnorm For the print method only, should we include the normal +#' approximation based probability of each model having worse performance than +#' the best model? print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, pnorm = FALSE) { xcopy <- x if (inherits(xcopy, "old_compare.loo")) { @@ -171,7 +173,7 @@ print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, pnorm = FALSE print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) invisible(x) } else { - print(cbind(.fr(xcopy, digits), quote = FALSE)) + print(cbind(.fr(xcopy, digits)), quote = FALSE) invisible(x) } } From fa0d79d29db4f78e48f1400290d05f7e4b289fc5 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:24:09 +0300 Subject: [PATCH 04/40] fix argument name --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 0355c0ef..06730baa 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -159,7 +159,7 @@ loo_compare.default <- function(x, ...) { #' @param pnorm For the print method only, should we include the normal #' approximation based probability of each model having worse performance than #' the best model? -print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, pnorm = FALSE) { +print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, p_worse = TRUE) { xcopy <- x if (inherits(xcopy, "old_compare.loo")) { if (NCOL(xcopy) >= 2 && simplify) { From 65ef286a12cd3b788694bdef0c991429f3b62d54 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:33:46 +0300 Subject: [PATCH 05/40] add more documentation (+ khat threshold 0.5 as we don't yet smooth) --- R/loo_compare.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 06730baa..6af99682 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -41,8 +41,14 @@ #' standard approach of comparing differences of deviances to a Chi-squared #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. -#' Sivula et al. (2022) discuss the conditions when the normal -#' approximation used for SE and `se_diff` is good. +#' +#' The values in `p_worse` column are computed using the normal +#' approximation and values from the columns `elpd_diff` and +#' `se_diff`. Sivula et al. (2025) discuss the conditions when the +#' normal approximation used for SE and `se_diff` is good., and +#' column `diag_pnorm` contains possible diagnostic messages: 1) +#' small data (N < 100), 2) similar predictions (|elpd_diff| < 4), +#' or 3) possible outliers (khat > 0.5). #' #' If more than \eqn{11} models are compared, we internally recompute the model #' differences using the median model by ELPD as the baseline model. We then @@ -134,7 +140,7 @@ loo_compare.default <- function(x, ...) 
{ # Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) - diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > ", .fr(ps_khat_threshold(N), 2)) + diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > 0.5") } rownames(comp) <- rnms comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, From b0ebac7540d86ec4003311bc1b4cd585e9e65266 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:38:40 +0300 Subject: [PATCH 06/40] oops, fix khat comparison --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 6af99682..8f781a92 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -140,7 +140,7 @@ loo_compare.default <- function(x, ...) { # Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) - diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > 0.5") + diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } rownames(comp) <- rnms comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, From 7410b81bcec0f1752be78ec2e3bf493dd2020609 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:44:16 +0300 Subject: [PATCH 07/40] skip pareto-k check for low number of unique values --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 8f781a92..5be1adee 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -139,7 +139,7 @@ loo_compare.default <- function(x, ...) { # possible outliers in differences (Sivula et al., 2025; # Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) - khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) + khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) ifelse(length(unique(x))<=20, NA, posterior::pareto_khat(x, tail="both"))) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } rownames(comp) <- rnms From b993b44dd9e759828e554c69fcaf0e203f06d9aa Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 13:27:45 -0600 Subject: [PATCH 08/40] remove `simplify` argument (need to check reverse dependencies) --- R/loo_compare.R | 56 +++++++++++++++++++++++----------------------- man/loo_compare.Rd | 18 ++++++++++----- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 5be1adee..658bb9f7 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -41,11 +41,11 @@ #' standard approach of comparing differences of deviances to a Chi-squared #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. -#' -#' The values in `p_worse` column are computed using the normal +#' +#' The values in the `p_worse` column are computed using the normal #' approximation and values from the columns `elpd_diff` and #' `se_diff`. Sivula et al. (2025) discuss the conditions when the -#' normal approximation used for SE and `se_diff` is good., and +#' normal approximation used for SE and `se_diff` is good, and the #' column `diag_pnorm` contains possible diagnostic messages: 1) #' small data (N < 100), 2) similar predictions (|elpd_diff| < 4), #' or 3) possible outliers (khat > 0.5). 
@@ -58,7 +58,7 @@ #' selection process. In that case users are recommended to avoid model #' selection based on LOO-CV, and instead to favor model averaging/stacking or #' projection predictive inference. -#' +#' #' @seealso #' * The [FAQ page](https://mc-stan.org/loo/articles/online-only/faq.html) on #' the __loo__ website for answers to frequently asked questions. @@ -122,24 +122,28 @@ loo_compare.default <- function(x, ...) { diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) elpd_diff <- apply(diffs, 2, sum) se_diff <- apply(diffs, 2, se_elpd_diff) + # compute probabilities that a model has worse elpd than the best model # using a normal approximation (Sivula et al., 2025) p_worse <- stats::pnorm(0, elpd_diff, se_diff) - p_worse[elpd_diff==0] <- NA - N <- nrow(diffs) + p_worse[elpd_diff == 0] <- NA + # diagnostics to assess whether the normal approximation can be trusted - if (N<100) { + N <- nrow(diffs) + if (N < 100) { # small N (Sivula et al., 2025) diag_pnorm <- rep("N < 100", length(elpd_diff)) - diag_pnorm[elpd_diff==0] = "" + diag_pnorm[elpd_diff == 0] <- "" } else { diag_pnorm <- rep("", length(elpd_diff)) # similar predictions (Sivula et al., 2025) - diag_pnorm[elpd_diff>-4 & elpd_diff!=0] <- "similar predictions" - # possible outliers in differences (Sivula et al., 2025; - # Vehtari et al., 2024) + diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "similar predictions" + # possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) - khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) ifelse(length(unique(x))<=20, NA, posterior::pareto_khat(x, tail="both"))) + khat_diff[elpd_diff != 0] <- apply( + diffs[, elpd_diff != 0, drop = FALSE], 2, + \(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") + )) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } rownames(comp) <- rnms @@ -159,29 +163,25 @@ loo_compare.default <- function(x, ...) { #' @export #' @param digits For the print method only, the number of digits to use when #' printing. -#' @param simplify For the print method only, should only the essential columns -#' of the summary matrix be printed? The entire matrix is always returned, but -#' by default only the most important columns are printed. -#' @param pnorm For the print method only, should we include the normal +#' @param p_worse For the print method only, should we include the normal #' approximation based probability of each model having worse performance than -#' the best model? -print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, p_worse = TRUE) { +#' the best model? The default is `TRUE`. 
+print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { xcopy <- x - if (inherits(xcopy, "old_compare.loo")) { - if (NCOL(xcopy) >= 2 && simplify) { - patts <- "^elpd_|^se_diff|^p_|^waic$|^looic$" - xcopy <- xcopy[, grepl(patts, colnames(xcopy))] - } - } else if (NCOL(xcopy) >= 2 && simplify) { - xcopy <- xcopy[, c("elpd_diff", "se_diff")] + if (NCOL(xcopy) >= 2) { + xcopy <- xcopy[, c("elpd_diff", "se_diff")] } if (p_worse) { - print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) - invisible(x) + print( + cbind(.fr(xcopy, digits), + p_worse = .fr(x[, "p_worse"], 2), + diag_pnorm = x[, "diag_pnorm"]), + quote = FALSE + ) } else { print(cbind(.fr(xcopy, digits)), quote = FALSE) - invisible(x) } + invisible(x) } diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index ee6a8db3..4d380f7e 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -11,7 +11,7 @@ loo_compare(x, ...) \method{loo_compare}{default}(x, ...) -\method{print}{compare.loo}(x, ..., digits = 1, simplify = TRUE) +\method{print}{compare.loo}(x, ..., digits = 1, p_worse = TRUE) \method{print}{compare.loo_ss}(x, ..., digits = 1, simplify = TRUE) } @@ -26,9 +26,9 @@ list.} \item{digits}{For the print method only, the number of digits to use when printing.} -\item{simplify}{For the print method only, should only the essential columns -of the summary matrix be printed? The entire matrix is always returned, but -by default only the most important columns are printed.} +\item{p_worse}{For the print method only, should we include the normal +approximation based probability of each model having worse performance than +the best model? The default is \code{TRUE}.} } \value{ A matrix with class \code{"compare.loo"} that has its own @@ -66,8 +66,14 @@ better sense of uncertainty than what is obtained using the current standard approach of comparing differences of deviances to a Chi-squared distribution, a practice derived for Gaussian linear models or asymptotically, and which only applies to nested models in any case. -Sivula et al. (2022) discuss the conditions when the normal -approximation used for SE and \code{se_diff} is good. + +The values in the \code{p_worse} column are computed using the normal +approximation and values from the columns \code{elpd_diff} and +\code{se_diff}. Sivula et al. (2025) discuss the conditions when the +normal approximation used for SE and \code{se_diff} is good, and the +column \code{diag_pnorm} contains possible diagnostic messages: 1) +small data (N < 100), 2) similar predictions (|elpd_diff| < 4), +or 3) possible outliers (khat > 0.5). If more than \eqn{11} models are compared, we internally recompute the model differences using the median model by ELPD as the baseline model. We then From b118da2834561d34a5dad8ad9cf5a0d287163bf2 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 14:08:31 -0600 Subject: [PATCH 09/40] remove simplify argument from print.compare.loo_ss --- R/loo_compare.R | 7 ------- R/loo_compare.psis_loo_ss_list.R | 9 ++------- man/loo_compare.Rd | 9 +-------- 3 files changed, 3 insertions(+), 22 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 658bb9f7..9b35be5b 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -2,9 +2,6 @@ #' #' @description Compare fitted models based on [ELPD][loo-glossary]. #' -#' By default the print method shows only the most important information. Use -#' `print(..., simplify=FALSE)` to print a more detailed summary. 
-#' #' @export #' @param x An object of class `"loo"` or a list of such objects. If a list is #' used then the list names will be used as the model names in the output. See @@ -74,10 +71,6 @@ #' comp <- loo_compare(loo1, loo2, loo3) #' print(comp, digits = 2) #' -#' # show more details with simplify=FALSE -#' # (will be the same for all models in this artificial example) -#' print(comp, simplify = FALSE, digits = 3) -#' #' # can use a list of objects with custom names #' # will use apple, banana, and cherry, as the names in the output #' loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) diff --git a/R/loo_compare.psis_loo_ss_list.R b/R/loo_compare.psis_loo_ss_list.R index acd0690b..9c0db564 100644 --- a/R/loo_compare.psis_loo_ss_list.R +++ b/R/loo_compare.psis_loo_ss_list.R @@ -173,14 +173,9 @@ loo_compare_checks.psis_loo_ss_list <- function(loos) { #' @rdname loo_compare #' @export -print.compare.loo_ss <- function(x, ..., digits = 1, simplify = TRUE) { +print.compare.loo_ss <- function(x, ..., digits = 1) { xcopy <- x - if (inherits(xcopy, "old_compare.loo")) { - if (NCOL(xcopy) >= 2 && simplify) { - patts <- "^elpd_|^se_diff|^p_|^waic$|^looic$" - xcopy <- xcopy[, grepl(patts, colnames(xcopy))] - } - } else if (NCOL(xcopy) >= 2 && simplify) { + if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff", "subsampling_se_diff")] } print(.fr(xcopy, digits), quote = FALSE) diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index 4d380f7e..6fad19e0 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -13,7 +13,7 @@ loo_compare(x, ...) \method{print}{compare.loo}(x, ..., digits = 1, p_worse = TRUE) -\method{print}{compare.loo_ss}(x, ..., digits = 1, simplify = TRUE) +\method{print}{compare.loo_ss}(x, ..., digits = 1) } \arguments{ \item{x}{An object of class \code{"loo"} or a list of such objects. If a list is @@ -36,9 +36,6 @@ print method. See the \strong{Details} section. } \description{ Compare fitted models based on \link[=loo-glossary]{ELPD}. - -By default the print method shows only the most important information. Use -\code{print(..., simplify=FALSE)} to print a more detailed summary. } \details{ When comparing two fitted models, we can estimate the difference in their @@ -94,10 +91,6 @@ loo3 <- loo(LL + 2) # should be best model when compared comp <- loo_compare(loo1, loo2, loo3) print(comp, digits = 2) -# show more details with simplify=FALSE -# (will be the same for all models in this artificial example) -print(comp, simplify = FALSE, digits = 3) - # can use a list of objects with custom names # will use apple, banana, and cherry, as the names in the output loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) From 25f93fdb0535d6548f4da87750466a53afe0d305 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 14:22:54 -0600 Subject: [PATCH 10/40] start fixing tests --- R/loo_compare.R | 6 ++--- man/loo_compare.Rd | 2 +- tests/testthat/_snaps/compare.md | 44 +++++++++++++++++--------------- tests/testthat/test_compare.R | 5 +++- 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 9b35be5b..66dd6515 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -9,7 +9,7 @@ #' @param ... Additional objects of class `"loo"`, if not passed in as a single #' list. #' -#' @return A matrix with class `"compare.loo"` that has its own +#' @return A data frame with class `"compare.loo"` that has its own #' print method. See the **Details** section. 
#' #' @details @@ -164,7 +164,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - if (p_worse) { + if (p_worse && !inherits(x, "old_compare.loo")) { print( cbind(.fr(xcopy, digits), p_worse = .fr(x[, "p_worse"], 2), @@ -172,7 +172,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { quote = FALSE ) } else { - print(cbind(.fr(xcopy, digits)), quote = FALSE) + print(.fr(xcopy, digits), quote = FALSE) } invisible(x) } diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index 6fad19e0..e1888539 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -31,7 +31,7 @@ approximation based probability of each model having worse performance than the best model? The default is \code{TRUE}.} } \value{ -A matrix with class \code{"compare.loo"} that has its own +A data frame with class \code{"compare.loo"} that has its own print method. See the \strong{Details} section. } \description{ diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index cf27900e..5458d967 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -1,28 +1,32 @@ # loo_compare returns expected results (2 models) - WAoAAAACAAQFAAACAwAAAAMOAAAAEAAAAAAAAAAAwBA6U1+cRe4AAAAAAAAAAD+2ake0LxMB - wFTh8N3JQljAVeWWE8MGuUARCD2zEXBfQBEalRIN2T9ACijAYdW5U0AmZ5XrANCKP/H9Zexy - 814/8ZtgnG1nx0Bk4fDdyUJYQGXllhPDBrlAIQg9sxFwX0AhGpUSDdk/AAAEAgAAAAEABAAJ - AAAAA2RpbQAAAA0AAAACAAAAAgAAAAgAAAQCAAAAAQAEAAkAAAAIZGltbmFtZXMAAAATAAAA - AgAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1vZGVsMgAAABAAAAAIAAQACQAAAAll - bHBkX2RpZmYABAAJAAAAB3NlX2RpZmYABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2VfZWxw - ZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkA - AAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAADAAQACQAAAAtjb21wYXJl - LmxvbwAEAAkAAAAGbWF0cml4AAQACQAAAAVhcnJheQAAAP4= + WAoAAAACAAQEAgACAwAAAAMTAAAACgAAAA4AAAACAAAAAAAAAADAEDpTX5xF7gAAAA4AAAAC + AAAAAAAAAAA/tmpHtC8TAQAAAA4AAAACf/AAAAAAB6I/8AAAAAAAAAAAABAAAAACAAQACQAA + AAAABAAJAAAAB04gPCAxMDAAAAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAAAkARCD2z + EXBfQBEalRIN2T8AAAAOAAAAAkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/Gb + YJxtZ8cAAAAOAAAAAkBk4fDdyUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8A + AAQCAAAAAQAEAAkAAAAFbmFtZXMAAAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdz + ZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dh + aWMABAAJAAAADHNlX2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMA + BAAJAAAABHdhaWMABAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAA + AgAEAAkAAAALY29tcGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJ + cm93Lm5hbWVzAAAAEAAAAAIABAAJAAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAAA/g== # loo_compare returns expected result (3 models) - WAoAAAACAAQFAAACAwAAAAMOAAAAGAAAAAAAAAAAwBA6U1+cRe7AMA3KkbYEGAAAAAAAAAAA - P7ZqR7QvEwE/y6/t4TTtXsBU4fDdyUJYwFXllhPDBrnAWOVjgjbDYkARCD2zEXBfQBEalRIN - 2T9AEPIF3GigE0AKKMBh1blTQCZnlesA0IpAQcjYUhrdCj/x/WXscvNeP/GbYJxtZ8c/8YDQ - kmfJX0Bk4fDdyUJYQGXllhPDBrlAaOVjgjbDYkAhCD2zEXBfQCEalRIN2T9AIPIF3GigEwAA - BAIAAAABAAQACQAAAANkaW0AAAANAAAAAgAAAAMAAAAIAAAEAgAAAAEABAAJAAAACGRpbW5h - bWVzAAAAEwAAAAIAAAAQAAAAAwAEAAkAAAAGbW9kZWwxAAQACQAAAAZtb2RlbDIABAAJAAAA - Bm1vZGVsMwAAABAAAAAIAAQACQAAAAllbHBkX2RpZmYABAAJAAAAB3NlX2RpZmYABAAJAAAA - CWVscGRfd2FpYwAEAAkAAAAMc2VfZWxwZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNl - X3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkAAAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFz - cwAAABAAAAADAAQACQAAAAtjb21wYXJlLmxvbwAEAAkAAAAGbWF0cml4AAQACQAAAAVhcnJh - eQAAAP4= + 
WAoAAAACAAQEAgACAwAAAAMTAAAACgAAAA4AAAADAAAAAAAAAADAEDpTX5xF7sAwDcqRtgQY + AAAADgAAAAMAAAAAAAAAAD+2ake0LxMBP8uv7eE07V4AAAAOAAAAA3/wAAAAAAeiP/AAAAAA + AAA/8AAAAAAAAAAAABAAAAADAAQACQAAAAAABAAJAAAAB04gPCAxMDAABAAJAAAAB04gPCAx + MDAAAAAOAAAAA8BU4fDdyUJYwFXllhPDBrnAWOVjgjbDYgAAAA4AAAADQBEIPbMRcF9AERqV + Eg3ZP0AQ8gXcaKATAAAADgAAAANACijAYdW5U0AmZ5XrANCKQEHI2FIa3QoAAAAOAAAAAz/x + /WXscvNeP/GbYJxtZ8c/8YDQkmfJXwAAAA4AAAADQGTh8N3JQlhAZeWWE8MGuUBo5WOCNsNi + AAAADgAAAANAIQg9sxFwX0AhGpUSDdk/QCDyBdxooBMAAAQCAAAAAQAEAAkAAAAFbmFtZXMA + AAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAAAAdwX3dvcnNl + AAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNlX2VscGRfd2Fp + YwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMABAAJAAAAB3Nl + X3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29tcGFyZS5sb28A + BAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAAEAAAAAMABAAJ + AAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAQACQAAAAZtb2RlbDMAAAD+ # compare returns expected result (2 models) diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 256c263c..220d87c8 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -62,6 +62,8 @@ test_that("loo_compare throws appropriate warnings", { comp_colnames <- c( "elpd_diff", "se_diff", + "p_worse", + "diag_pnorm", "elpd_waic", "se_elpd_waic", "p_waic", @@ -73,6 +75,7 @@ comp_colnames <- c( test_that("loo_compare returns expected results (2 models)", { comp1 <- loo_compare(w1, w1) expect_s3_class(comp1, "compare.loo") + expect_s3_class(comp1, "data.frame") expect_equal(colnames(comp1), comp_colnames) expect_equal(rownames(comp1), c("model1", "model2")) expect_output(print(comp1), "elpd_diff") @@ -98,7 +101,7 @@ test_that("loo_compare returns expected result (3 models)", { expect_equal(rownames(comp1), c("model1", "model2", "model3")) expect_equal(comp1[1, 1], 0) expect_s3_class(comp1, "compare.loo") - expect_s3_class(comp1, "matrix") + expect_s3_class(comp1, "data.frame") expect_snapshot_value(comp1, style = "serialize") From 62027d1944fe2e040f7871cc92ddea64abd64c20 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 15:34:03 -0600 Subject: [PATCH 11/40] fix issues in preliminary reverse dependency checks --- R/loo_compare.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 66dd6515..7f8d394d 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -113,6 +113,7 @@ loo_compare.default <- function(x, ...) { # compute elpd_diff and se_elpd_diff relative to best model rnms <- rownames(comp) diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) + colnames(diffs) <- rnms elpd_diff <- apply(diffs, 2, sum) se_diff <- apply(diffs, 2, se_elpd_diff) @@ -135,7 +136,7 @@ loo_compare.default <- function(x, ...) 
{ khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, - \(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") + function (x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } From cc3edfe797f63ae35fb231477ce0aefc398b4bb6 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 15:34:57 -0600 Subject: [PATCH 12/40] Update loo_compare.R --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 7f8d394d..79d506fb 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -136,7 +136,7 @@ loo_compare.default <- function(x, ...) { khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, - function (x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") + function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } From 22f442f1d9b735054edaba9d5176d3b57b923d06 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 17:02:49 -0600 Subject: [PATCH 13/40] make sure p_worse is available --- R/loo_compare.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 79d506fb..aabd92f2 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -140,7 +140,6 @@ loo_compare.default <- function(x, ...) { )) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } - rownames(comp) <- rnms comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, diag_pnorm = diag_pnorm), as.data.frame(comp)) @@ -165,7 +164,9 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - if (p_worse && !inherits(x, "old_compare.loo")) { + if (p_worse && + "p_worse" %in% colnames(xcopy) && + !inherits(x, "old_compare.loo")) { print( cbind(.fr(xcopy, digits), p_worse = .fr(x[, "p_worse"], 2), From ca843f494f4b7ed7efc1b1c7455aab49ac70ad7f Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 17:16:25 -0600 Subject: [PATCH 14/40] use x instead of xcopy --- R/loo_compare.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index aabd92f2..2871a564 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -164,8 +164,9 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } + browser() if (p_worse && - "p_worse" %in% colnames(xcopy) && + "p_worse" %in% colnames(x) && !inherits(x, "old_compare.loo")) { print( cbind(.fr(xcopy, digits), From d579c061970303a916c2de30137140aeab794f01 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 17:18:30 -0600 Subject: [PATCH 15/40] Update loo_compare.R --- R/loo_compare.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 2871a564..e9e45ea4 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -164,7 +164,6 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - browser() if (p_worse && "p_worse" %in% colnames(x) && !inherits(x, "old_compare.loo")) { From 846a891e7ea3f6172eece53c6821bf6d5d26fed5 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Sat, 
18 Oct 2025 11:44:50 +0300 Subject: [PATCH 16/40] unify diagnostic messages --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index e9e45ea4..41cdb843 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -131,7 +131,7 @@ loo_compare.default <- function(x, ...) { } else { diag_pnorm <- rep("", length(elpd_diff)) # similar predictions (Sivula et al., 2025) - diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "similar predictions" + diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" # possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( From 64b365c9230e7759aea05c416c19bf8dcf60eeee Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Sat, 18 Oct 2025 17:27:55 +0300 Subject: [PATCH 17/40] improved loo_compare documentation --- R/loo_compare.R | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 41cdb843..c6128e05 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -39,13 +39,18 @@ #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. #' -#' The values in the `p_worse` column are computed using the normal -#' approximation and values from the columns `elpd_diff` and -#' `se_diff`. Sivula et al. (2025) discuss the conditions when the -#' normal approximation used for SE and `se_diff` is good, and the -#' column `diag_pnorm` contains possible diagnostic messages: 1) -#' small data (N < 100), 2) similar predictions (|elpd_diff| < 4), -#' or 3) possible outliers (khat > 0.5). +#' The values in the `p_worse` column show the probabilities for models +#' having worse ELPD than the best model. These probabilities are +#' computed using the normal approximation and values from the +#' columns `elpd_diff` and `se_diff`. Sivula et al. (2025) present +#' the conditions when the normal approximation used for SE and +#' `se_diff` is good, and the column `diag_pnorm` contains possible +#' diagnostic messages: 1) small data (N < 100), 2) similar +#' predictions (|elpd_diff| < 4), or 3) possible outliers (khat > 0.5). +#' If any of these diagnostic messages is shown, the normal +#' approximation is not well calibrated and the shown probabilities +#' can be too large (small data or similar predictions) or too small +#' (outliers). #' #' If more than \eqn{11} models are compared, we internally recompute the model #' differences using the median model by ELPD as the baseline model. We then From 185e5703564b3e62201433ababfc8c559cf7218c Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 12:22:26 -0600 Subject: [PATCH 18/40] add subsections to loo_compare doc and put diagnostic messages in bulleted list --- R/loo_compare.R | 44 ++++++++++++++++++++++++-------------------- man/loo_compare.Rd | 45 ++++++++++++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 35 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index c6128e05..5a0b0462 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -18,14 +18,14 @@ #' [`elpd_loo`][loo-glossary] or `elpd_waic` (or multiplied by \eqn{-2}, if #' desired, to be on the deviance scale). #' -#' When using `loo_compare()`, the returned matrix will have one row per model -#' and several columns of estimates. 
The values in the -#' [`elpd_diff`][loo-glossary] and [`se_diff`][loo-glossary] columns of the -#' returned matrix are computed by making pairwise comparisons between each -#' model and the model with the largest ELPD (the model in the first row). For -#' this reason the `elpd_diff` column will always have the value `0` in the -#' first row (i.e., the difference between the preferred model and itself) and -#' negative values in subsequent rows for the remaining models. +#' ## `elpd_diff` and `se_diff` +#' When using `loo_compare()`, the returned data frame will have one row per +#' model and several columns of estimates. The values of +#' [`elpd_diff`][loo-glossary] and [`se_diff`][loo-glossary] are computed by +#' making pairwise comparisons between each model and the model with the +#' largest ELPD (the model listed first). Therefore, the first `elpd_diff` +#' value will always be `0` (i.e., the difference between the preferred model +#' and itself) and the rest of the values will be negative. #' #' To compute the standard error of the difference in [ELPD][loo-glossary] --- #' which should not be expected to equal the difference of the standard errors @@ -39,19 +39,23 @@ #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. #' -#' The values in the `p_worse` column show the probabilities for models -#' having worse ELPD than the best model. These probabilities are -#' computed using the normal approximation and values from the -#' columns `elpd_diff` and `se_diff`. Sivula et al. (2025) present -#' the conditions when the normal approximation used for SE and -#' `se_diff` is good, and the column `diag_pnorm` contains possible -#' diagnostic messages: 1) small data (N < 100), 2) similar -#' predictions (|elpd_diff| < 4), or 3) possible outliers (khat > 0.5). -#' If any of these diagnostic messages is shown, the normal -#' approximation is not well calibrated and the shown probabilities -#' can be too large (small data or similar predictions) or too small -#' (outliers). +#' ## `p_worse` and `diag_pnorm` +#' The values in the `p_worse` column show the probability of each model +#' having worse ELPD than the best model. These probabilities are computed +#' with a normal approximation using the values from `elpd_diff` and +#' `se_diff`. Sivula et al. (2025) present the conditions when the normal +#' approximation used for SE and `se_diff` is good, and the column +#' `diag_pnorm` contains possible diagnostic messages: #' +#' * small data (`N < 100`), +#' * similar predictions (`|elpd_diff| < 4`) +#' * possible outliers (`khat > 0.5`) +#' +#' If any of these diagnostic messages is shown, the normal approximation is +#' not well calibrated and the probabilities can be too large (small data or +#' similar predictions) or too small (outliers). +#' +#' ## Warnings for many model comparisons #' If more than \eqn{11} models are compared, we internally recompute the model #' differences using the median model by ELPD as the baseline model. 
We then #' estimate whether the differences in predictive performance are potentially diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index 46d1a428..ea17aafb 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -42,15 +42,15 @@ When comparing two fitted models, we can estimate the difference in their expected predictive accuracy by the difference in \code{\link[=loo-glossary]{elpd_loo}} or \code{elpd_waic} (or multiplied by \eqn{-2}, if desired, to be on the deviance scale). +\subsection{\code{elpd_diff} and \code{se_diff}}{ -When using \code{loo_compare()}, the returned matrix will have one row per model -and several columns of estimates. The values in the -\code{\link[=loo-glossary]{elpd_diff}} and \code{\link[=loo-glossary]{se_diff}} columns of the -returned matrix are computed by making pairwise comparisons between each -model and the model with the largest ELPD (the model in the first row). For -this reason the \code{elpd_diff} column will always have the value \code{0} in the -first row (i.e., the difference between the preferred model and itself) and -negative values in subsequent rows for the remaining models. +When using \code{loo_compare()}, the returned data frame will have one row per +model and several columns of estimates. The values of +\code{\link[=loo-glossary]{elpd_diff}} and \code{\link[=loo-glossary]{se_diff}} are computed by +making pairwise comparisons between each model and the model with the +largest ELPD (the model listed first). Therefore, the first \code{elpd_diff} +value will always be \code{0} (i.e., the difference between the preferred model +and itself) and the rest of the values will be negative. To compute the standard error of the difference in \link[=loo-glossary]{ELPD} --- which should not be expected to equal the difference of the standard errors @@ -63,14 +63,28 @@ better sense of uncertainty than what is obtained using the current standard approach of comparing differences of deviances to a Chi-squared distribution, a practice derived for Gaussian linear models or asymptotically, and which only applies to nested models in any case. +} + +\subsection{\code{p_worse} and \code{diag_pnorm}}{ + +The values in the \code{p_worse} column show the probability of each model +having worse ELPD than the best model. These probabilities are computed +with a normal approximation using the values from \code{elpd_diff} and +\code{se_diff}. Sivula et al. (2025) present the conditions when the normal +approximation used for SE and \code{se_diff} is good, and the column +\code{diag_pnorm} contains possible diagnostic messages: +\itemize{ +\item small data (\code{N < 100}), +\item similar predictions (\verb{|elpd_diff| < 4}) +\item possible outliers (\code{khat > 0.5}) +} + +If any of these diagnostic messages is shown, the normal approximation is +not well calibrated and the probabilities can be too large (small data or +similar predictions) or too small (outliers). +} -The values in the \code{p_worse} column are computed using the normal -approximation and values from the columns \code{elpd_diff} and -\code{se_diff}. Sivula et al. (2025) discuss the conditions when the -normal approximation used for SE and \code{se_diff} is good, and the -column \code{diag_pnorm} contains possible diagnostic messages: 1) -small data (N < 100), 2) similar predictions (|elpd_diff| < 4), -or 3) possible outliers (khat > 0.5). 
+\subsection{Warnings for many model comparisons}{ If more than \eqn{11} models are compared, we internally recompute the model differences using the median model by ELPD as the baseline model. We then @@ -81,6 +95,7 @@ selection process. In that case users are recommended to avoid model selection based on LOO-CV, and instead to favor model averaging/stacking or projection predictive inference. } +} \examples{ # very artificial example, just for demonstration! LL <- example_loglik_array() From 8625e10e58eda5d741ae7947ab6bc48ea71f7ffa Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 13:02:56 -0600 Subject: [PATCH 19/40] minor cleanup --- R/loo_compare.R | 38 ++++++++++++++++++++++---------------- man/loo_compare.Rd | 8 ++++---- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 5a0b0462..bb2af3ac 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -47,9 +47,9 @@ #' approximation used for SE and `se_diff` is good, and the column #' `diag_pnorm` contains possible diagnostic messages: #' -#' * small data (`N < 100`), -#' * similar predictions (`|elpd_diff| < 4`) -#' * possible outliers (`khat > 0.5`) +#' * `N < 100` (small data) +#' * `|elpd_diff| < 4` (models make similar predictions) +#' * `khat > 0.5` (possible outliers) #' #' If any of these diagnostic messages is shown, the normal approximation is #' not well calibrated and the probabilities can be too large (small data or @@ -81,7 +81,7 @@ #' print(comp, digits = 2) #' #' # can use a list of objects with custom names -#' # will use apple, banana, and cherry, as the names in the output +#' # the names will be used in the output #' loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) #' #' \dontrun{ @@ -109,17 +109,17 @@ loo_compare.default <- function(x, ...) { loos <- x } - # If subsampling is used + # if subsampling is used if (any(sapply(loos, inherits, "psis_loo_ss"))) { return(loo_compare.psis_loo_ss_list(loos)) } + # run pre-comparison checks loo_compare_checks(loos) + # compute elpd_diff and se_elpd_diff relative to best model comp <- loo_compare_matrix(loos) ord <- loo_compare_order(loos) - - # compute elpd_diff and se_elpd_diff relative to best model rnms <- rownames(comp) diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) colnames(diffs) <- rnms @@ -132,33 +132,39 @@ loo_compare.default <- function(x, ...) 
{ p_worse[elpd_diff == 0] <- NA # diagnostics to assess whether the normal approximation can be trusted + # * N < 100: small data (Sivula et al., 2025) + # * |elpd_diff| < 4: similar predictions (Sivula et al., 2025) + # * khat_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) N <- nrow(diffs) if (N < 100) { - # small N (Sivula et al., 2025) diag_pnorm <- rep("N < 100", length(elpd_diff)) diag_pnorm[elpd_diff == 0] <- "" } else { diag_pnorm <- rep("", length(elpd_diff)) - # similar predictions (Sivula et al., 2025) diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" - # possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) - diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") + diag_pnorm[khat_diff > 0.5] <- "khat_diff > 0.5" } - comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, - p_worse = p_worse, diag_pnorm = diag_pnorm), - as.data.frame(comp)) + comp <- cbind( + data.frame( + elpd_diff = elpd_diff, + se_diff = se_diff, + p_worse = p_worse, + diag_pnorm = diag_pnorm + ), + as.data.frame(comp) + ) rownames(comp) <- rnms - # run order statistics-based checks on models + # run order statistics-based checks for many model comparisons loo_order_stat_check(loos, ord) class(comp) <- c("compare.loo", class(comp)) - return(comp) + comp } #' @rdname loo_compare diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index ea17aafb..01ab1c31 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -74,9 +74,9 @@ with a normal approximation using the values from \code{elpd_diff} and approximation used for SE and \code{se_diff} is good, and the column \code{diag_pnorm} contains possible diagnostic messages: \itemize{ -\item small data (\code{N < 100}), -\item similar predictions (\verb{|elpd_diff| < 4}) -\item possible outliers (\code{khat > 0.5}) +\item \code{N < 100} (small data) +\item \verb{|elpd_diff| < 4} (models make similar predictions) +\item \code{khat > 0.5} (possible outliers) } If any of these diagnostic messages is shown, the normal approximation is @@ -107,7 +107,7 @@ comp <- loo_compare(loo1, loo2, loo3) print(comp, digits = 2) # can use a list of objects with custom names -# will use apple, banana, and cherry, as the names in the output +# the names will be used in the output loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) \dontrun{ From 64a19c380e2f95e96df4cf905846376724a4fff8 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 13:29:19 -0600 Subject: [PATCH 20/40] Add `model` column to `loo_compare()` output --- R/loo_compare.R | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index bb2af3ac..3b9c20ae 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -149,8 +149,10 @@ loo_compare.default <- function(x, ...) { )) diag_pnorm[khat_diff > 0.5] <- "khat_diff > 0.5" } + comp <- cbind( data.frame( + model = rnms, elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, @@ -158,7 +160,7 @@ loo_compare.default <- function(x, ...) { ), as.data.frame(comp) ) - rownames(comp) <- rnms + rownames(comp) <- NULL # run order statistics-based checks for many model comparisons loo_order_stat_check(loos, ord) @@ -175,22 +177,27 @@ loo_compare.default <- function(x, ...) 
{ #' approximation based probability of each model having worse performance than #' the best model? The default is `TRUE`. print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { + if (inherits(x, "old_compare.loo")) { + stop( + "Output from old loo::compare() detected. ", + "Please rerun loo_compare() to get updated output ", + "or run unclass() on the object to print it.", + call. = FALSE + ) + } xcopy <- x if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - if (p_worse && - "p_worse" %in% colnames(x) && - !inherits(x, "old_compare.loo")) { - print( - cbind(.fr(xcopy, digits), - p_worse = .fr(x[, "p_worse"], 2), - diag_pnorm = x[, "diag_pnorm"]), - quote = FALSE + xcopy <- cbind(model = x$model, .fr(xcopy, digits)) + if (p_worse && "p_worse" %in% colnames(x)) { + xcopy <- cbind( + xcopy, + p_worse = .fr(x[, "p_worse"], 2), + diag_pnorm = x[, "diag_pnorm"] ) - } else { - print(.fr(xcopy, digits), quote = FALSE) } + print(xcopy, quote = FALSE) invisible(x) } From a84154ec87fb93fcb6a8a9cfd4c3ecc2acb67887 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 13:29:48 -0600 Subject: [PATCH 21/40] remove old loo::compare() --- NAMESPACE | 1 - R/compare.R | 138 ---------------------------------- man/compare.Rd | 80 -------------------- tests/testthat/test_compare.R | 80 -------------------- 4 files changed, 299 deletions(-) delete mode 100644 R/compare.R delete mode 100644 man/compare.Rd diff --git a/NAMESPACE b/NAMESPACE index 4d1a9118..a77e925c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -99,7 +99,6 @@ export(.compute_point_estimate) export(.ndraws) export(.thin_draws) export(E_loo) -export(compare) export(crps) export(elpd) export(example_loglik_array) diff --git a/R/compare.R b/R/compare.R deleted file mode 100644 index a6081d2f..00000000 --- a/R/compare.R +++ /dev/null @@ -1,138 +0,0 @@ -#' Model comparison (deprecated, old version) -#' -#' **This function is deprecated**. Please use the new [loo_compare()] function -#' instead. -#' -#' @export -#' @param ... At least two objects returned by [loo()] (or [waic()]). -#' @param x A list of at least two objects returned by [loo()] (or -#' [waic()]). This argument can be used as an alternative to -#' specifying the objects in `...`. -#' -#' @return A vector or matrix with class `'compare.loo'` that has its own -#' print method. If exactly two objects are provided in `...` or -#' `x`, then the difference in expected predictive accuracy and the -#' standard error of the difference are returned. If more than two objects are -#' provided then a matrix of summary information is returned (see **Details**). -#' -#' @details -#' When comparing two fitted models, we can estimate the difference in their -#' expected predictive accuracy by the difference in `elpd_loo` or -#' `elpd_waic` (or multiplied by -2, if desired, to be on the -#' deviance scale). -#' -#' *When that difference, `elpd_diff`, is positive then the expected -#' predictive accuracy for the second model is higher. A negative -#' `elpd_diff` favors the first model.* -#' -#' When using `compare()` with more than two models, the values in the -#' `elpd_diff` and `se_diff` columns of the returned matrix are -#' computed by making pairwise comparisons between each model and the model -#' with the best ELPD (i.e., the model in the first row). -#' Although the `elpd_diff` column is equal to the difference in -#' `elpd_loo`, do not expect the `se_diff` column to be equal to the -#' the difference in `se_elpd_loo`. 
-#' -#' To compute the standard error of the difference in ELPD we use a -#' paired estimate to take advantage of the fact that the same set of _N_ -#' data points was used to fit both models. These calculations should be most -#' useful when _N_ is large, because then non-normality of the -#' distribution is not such an issue when estimating the uncertainty in these -#' sums. These standard errors, for all their flaws, should give a better -#' sense of uncertainty than what is obtained using the current standard -#' approach of comparing differences of deviances to a Chi-squared -#' distribution, a practice derived for Gaussian linear models or -#' asymptotically, and which only applies to nested models in any case. -#' -#' @template loo-and-psis-references -#' -#' @examples -#' \dontrun{ -#' loo1 <- loo(log_lik1) -#' loo2 <- loo(log_lik2) -#' print(compare(loo1, loo2), digits = 3) -#' print(compare(x = list(loo1, loo2))) -#' -#' waic1 <- waic(log_lik1) -#' waic2 <- waic(log_lik2) -#' compare(waic1, waic2) -#' } -#' -compare <- function(..., x = list()) { - .Deprecated("loo_compare") - dots <- list(...) - if (length(dots)) { - if (length(x)) { - stop("If 'x' is specified then '...' should not be specified.", - call. = FALSE) - } - nms <- as.character(match.call(expand.dots = TRUE))[-1L] - } else { - if (!is.list(x) || !length(x)) { - stop("'x' must be a list.", call. = FALSE) - } - dots <- x - nms <- names(dots) - if (!length(nms)) { - nms <- paste0("model", seq_along(dots)) - } - } - - if (!all(sapply(dots, is.loo))) { - stop("All inputs should have class 'loo'.") - } - if (length(dots) <= 1L) { - stop("'compare' requires at least two models.") - } else if (length(dots) == 2L) { - loo1 <- dots[[1]] - loo2 <- dots[[2]] - comp <- compare_two_models(loo1, loo2) - class(comp) <- c(class(comp), "old_compare.loo") - return(comp) - } else { - Ns <- sapply(dots, function(x) nrow(x$pointwise)) - if (!all(Ns == Ns[1L])) { - stop("Not all models have the same number of data points.", call. = FALSE) - } - - x <- sapply(dots, function(x) { - est <- x$estimates - setNames(c(est), nm = c(rownames(est), paste0("se_", rownames(est))) ) - }) - colnames(x) <- nms - rnms <- rownames(x) - comp <- x - ord <- order(x[grep("^elpd", rnms), ], decreasing = TRUE) - comp <- t(comp)[ord, ] - patts <- c("elpd", "p_", "^waic$|^looic$", "^se_waic$|^se_looic$") - col_ord <- unlist(sapply(patts, function(p) grep(p, colnames(comp))), - use.names = FALSE) - comp <- comp[, col_ord] - - # compute elpd_diff and se_elpd_diff relative to best model - rnms <- rownames(comp) - diffs <- mapply(elpd_diffs, dots[ord[1]], dots[ord]) - elpd_diff <- apply(diffs, 2, sum) - se_diff <- apply(diffs, 2, se_elpd_diff) - comp <- cbind(elpd_diff = elpd_diff, se_diff = se_diff, comp) - rownames(comp) <- rnms - class(comp) <- c("compare.loo", class(comp), "old_compare.loo") - comp - } -} - - - -# internal ---------------------------------------------------------------- -compare_two_models <- function(loo_a, loo_b, return = c("elpd_diff", "se"), check_dims = TRUE) { - if (check_dims) { - if (dim(loo_a$pointwise)[1] != dim(loo_b$pointwise)[1]) { - stop(paste("Models don't have the same number of data points.", - "\nFound N_1 =", dim(loo_a$pointwise)[1], "and N_2 =", dim(loo_b$pointwise)[1]), call. 
= FALSE) - } - } - - diffs <- elpd_diffs(loo_a, loo_b) - comp <- c(elpd_diff = sum(diffs), se = se_elpd_diff(diffs)) - structure(comp, class = "compare.loo") -} diff --git a/man/compare.Rd b/man/compare.Rd deleted file mode 100644 index 11412fe4..00000000 --- a/man/compare.Rd +++ /dev/null @@ -1,80 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compare.R -\name{compare} -\alias{compare} -\title{Model comparison (deprecated, old version)} -\usage{ -compare(..., x = list()) -} -\arguments{ -\item{...}{At least two objects returned by \code{\link[=loo]{loo()}} (or \code{\link[=waic]{waic()}}).} - -\item{x}{A list of at least two objects returned by \code{\link[=loo]{loo()}} (or -\code{\link[=waic]{waic()}}). This argument can be used as an alternative to -specifying the objects in \code{...}.} -} -\value{ -A vector or matrix with class \code{'compare.loo'} that has its own -print method. If exactly two objects are provided in \code{...} or -\code{x}, then the difference in expected predictive accuracy and the -standard error of the difference are returned. If more than two objects are -provided then a matrix of summary information is returned (see \strong{Details}). -} -\description{ -\strong{This function is deprecated}. Please use the new \code{\link[=loo_compare]{loo_compare()}} function -instead. -} -\details{ -When comparing two fitted models, we can estimate the difference in their -expected predictive accuracy by the difference in \code{elpd_loo} or -\code{elpd_waic} (or multiplied by -2, if desired, to be on the -deviance scale). - -\emph{When that difference, \code{elpd_diff}, is positive then the expected -predictive accuracy for the second model is higher. A negative -\code{elpd_diff} favors the first model.} - -When using \code{compare()} with more than two models, the values in the -\code{elpd_diff} and \code{se_diff} columns of the returned matrix are -computed by making pairwise comparisons between each model and the model -with the best ELPD (i.e., the model in the first row). -Although the \code{elpd_diff} column is equal to the difference in -\code{elpd_loo}, do not expect the \code{se_diff} column to be equal to the -the difference in \code{se_elpd_loo}. - -To compute the standard error of the difference in ELPD we use a -paired estimate to take advantage of the fact that the same set of \emph{N} -data points was used to fit both models. These calculations should be most -useful when \emph{N} is large, because then non-normality of the -distribution is not such an issue when estimating the uncertainty in these -sums. These standard errors, for all their flaws, should give a better -sense of uncertainty than what is obtained using the current standard -approach of comparing differences of deviances to a Chi-squared -distribution, a practice derived for Gaussian linear models or -asymptotically, and which only applies to nested models in any case. -} -\examples{ -\dontrun{ -loo1 <- loo(log_lik1) -loo2 <- loo(log_lik2) -print(compare(loo1, loo2), digits = 3) -print(compare(x = list(loo1, loo2))) - -waic1 <- waic(log_lik1) -waic2 <- waic(log_lik2) -compare(waic1, waic2) -} - -} -\references{ -Vehtari, A., Gelman, A., and Gabry, J. (2017). Practical Bayesian model -evaluation using leave-one-out cross-validation and WAIC. -\emph{Statistics and Computing}. 27(5), 1413--1432. 
doi:10.1007/s11222-016-9696-4 -(\href{https://link.springer.com/article/10.1007/s11222-016-9696-4}{journal version}, -\href{https://arxiv.org/abs/1507.04544}{preprint arXiv:1507.04544}). - -Vehtari, A., Simpson, D., Gelman, A., Yao, Y., and Gabry, J. (2024). -Pareto smoothed importance sampling. \emph{Journal of Machine Learning Research}, -25(72):1-58. -\href{https://jmlr.org/papers/v25/19-556.html}{PDF} -} diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 220d87c8..172b73bb 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -109,83 +109,3 @@ test_that("loo_compare returns expected result (3 models)", { # except rownames) to using 'x' argument expect_equal(comp1, loo_compare(x = list(w1, w2, w3)), ignore_attr = TRUE) }) - -# Tests for deprecated compare() ------------------------------------------ - -test_that("compare throws deprecation warnings", { - expect_warning(loo::compare(w1, w2), "Deprecated") - expect_warning(loo::compare(w1, w1, w2), "Deprecated") -}) - -test_that("compare returns expected result (2 models)", { - expect_warning(comp1 <- loo::compare(w1, w1), "Deprecated") - expect_snapshot(comp1) - expect_equal(comp1[1:2], c(elpd_diff = 0, se = 0)) - - expect_warning(comp2 <- loo::compare(w1, w2), "Deprecated") - expect_snapshot(comp2) - expect_named(comp2, c("elpd_diff", "se")) - expect_s3_class(comp2, "compare.loo") - - # specifying objects via ... and via arg x gives equal results - expect_warning(comp_via_list <- loo::compare(x = list(w1, w2)), "Deprecated") - expect_equal(comp2, comp_via_list) -}) - -test_that("compare returns expected result (3 models)", { - w3 <- suppressWarnings(waic(LLarr3)) - expect_warning(comp1 <- loo::compare(w1, w2, w3), "Deprecated") - - expect_equal( - colnames(comp1), - c( - "elpd_diff", - "se_diff", - "elpd_waic", - "se_elpd_waic", - "p_waic", - "se_p_waic", - "waic", - "se_waic" - ) - ) - expect_equal(rownames(comp1), c("w1", "w2", "w3")) - expect_equal(comp1[1, 1], 0) - expect_s3_class(comp1, "compare.loo") - expect_s3_class(comp1, "matrix") - expect_snapshot_value(comp1, style = "serialize") - - # specifying objects via '...' gives equivalent results (equal - # except rownames) to using 'x' argument - expect_warning( - comp_via_list <- loo::compare(x = list(w1, w2, w3)), - "Deprecated" - ) - expect_equal(comp1, comp_via_list, ignore_attr = TRUE) -}) - -test_that("compare throws appropriate errors", { - expect_error( - suppressWarnings(loo::compare(w1, w2, x = list(w1, w2))), - "should not be specified" - ) - expect_error(suppressWarnings(loo::compare(x = 2)), "must be a list") - expect_error( - suppressWarnings(loo::compare(x = list(2))), - "should have class 'loo'" - ) - expect_error( - suppressWarnings(loo::compare(x = list(w1))), - "requires at least two models" - ) - - w3 <- suppressWarnings(waic(LLarr2[,, -1])) - expect_error( - suppressWarnings(loo::compare(x = list(w1, w3))), - "same number of data points" - ) - expect_error( - suppressWarnings(loo::compare(x = list(w1, w2, w3))), - "same number of data points" - ) -}) From 16f67d47d63be1640d37c286d845dad433532992 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 13:49:54 -0600 Subject: [PATCH 22/40] improve backwards compatibility --- R/loo_compare.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/loo_compare.R b/R/loo_compare.R index bb2af3ac..6820c406 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -175,6 +175,9 @@ loo_compare.default <- function(x, ...) 
{ #' approximation based probability of each model having worse performance than #' the best model? The default is `TRUE`. print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { + if (!inherits(x, "data.frame")) { + class(x) <- c(class(x), "data.frame") + } xcopy <- x if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] From 4e16d2fcfacf02644442351bea36f8284023bc10 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 14:14:35 -0600 Subject: [PATCH 23/40] Update loo_compare.R --- R/loo_compare.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/loo_compare.R b/R/loo_compare.R index 790becfa..3c3c5348 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -184,6 +184,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { "or run unclass() on the object to print it.", call. = FALSE ) + } if (!inherits(x, "data.frame")) { class(x) <- c(class(x), "data.frame") } From 23a79c0de892c35f5c6a4f7c84dee9c81798bb1d Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 14:18:13 -0600 Subject: [PATCH 24/40] fix failing test --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 6820c406..d56687ad 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -175,7 +175,7 @@ loo_compare.default <- function(x, ...) { #' approximation based probability of each model having worse performance than #' the best model? The default is `TRUE`. print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { - if (!inherits(x, "data.frame")) { + if (!inherits(x, "data.frame") && !inherits(x, "old_compare.loo")) { class(x) <- c(class(x), "data.frame") } xcopy <- x From cff3c2cf5ebaf4a531037c332274cf8086968a83 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 17:06:57 -0600 Subject: [PATCH 25/40] Revert "remove old loo::compare()" This reverts commit a84154ec87fb93fcb6a8a9cfd4c3ecc2acb67887. --- NAMESPACE | 1 + R/compare.R | 138 ++++++++++++++++++++++++++++++++++ man/compare.Rd | 80 ++++++++++++++++++++ tests/testthat/test_compare.R | 80 ++++++++++++++++++++ 4 files changed, 299 insertions(+) create mode 100644 R/compare.R create mode 100644 man/compare.Rd diff --git a/NAMESPACE b/NAMESPACE index a77e925c..4d1a9118 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -99,6 +99,7 @@ export(.compute_point_estimate) export(.ndraws) export(.thin_draws) export(E_loo) +export(compare) export(crps) export(elpd) export(example_loglik_array) diff --git a/R/compare.R b/R/compare.R new file mode 100644 index 00000000..a6081d2f --- /dev/null +++ b/R/compare.R @@ -0,0 +1,138 @@ +#' Model comparison (deprecated, old version) +#' +#' **This function is deprecated**. Please use the new [loo_compare()] function +#' instead. +#' +#' @export +#' @param ... At least two objects returned by [loo()] (or [waic()]). +#' @param x A list of at least two objects returned by [loo()] (or +#' [waic()]). This argument can be used as an alternative to +#' specifying the objects in `...`. +#' +#' @return A vector or matrix with class `'compare.loo'` that has its own +#' print method. If exactly two objects are provided in `...` or +#' `x`, then the difference in expected predictive accuracy and the +#' standard error of the difference are returned. If more than two objects are +#' provided then a matrix of summary information is returned (see **Details**). 
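As a rough, self-contained sketch of the two return shapes described above (not part of this patch), the deprecated interface and its replacement can be run side by side on simulated log-likelihood matrices; the objects ll1, ll2, w1, and w2 below are made up purely for illustration.

library(loo)
set.seed(123)
S <- 1000; N <- 50
ll1 <- matrix(rnorm(S * N, mean = -1.0), S, N)  # draws x observations log-likelihood, model 1
ll2 <- matrix(rnorm(S * N, mean = -1.1), S, N)  # model 2, slightly worse on average
w1 <- suppressWarnings(waic(ll1))
w2 <- suppressWarnings(waic(ll2))
suppressWarnings(compare(w1, w2))  # deprecated: named vector c(elpd_diff = ..., se = ...)
loo_compare(w1, w2)                # replacement: one row per model, best model first
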
+#' +#' @details +#' When comparing two fitted models, we can estimate the difference in their +#' expected predictive accuracy by the difference in `elpd_loo` or +#' `elpd_waic` (or multiplied by -2, if desired, to be on the +#' deviance scale). +#' +#' *When that difference, `elpd_diff`, is positive then the expected +#' predictive accuracy for the second model is higher. A negative +#' `elpd_diff` favors the first model.* +#' +#' When using `compare()` with more than two models, the values in the +#' `elpd_diff` and `se_diff` columns of the returned matrix are +#' computed by making pairwise comparisons between each model and the model +#' with the best ELPD (i.e., the model in the first row). +#' Although the `elpd_diff` column is equal to the difference in +#' `elpd_loo`, do not expect the `se_diff` column to be equal to the +#' the difference in `se_elpd_loo`. +#' +#' To compute the standard error of the difference in ELPD we use a +#' paired estimate to take advantage of the fact that the same set of _N_ +#' data points was used to fit both models. These calculations should be most +#' useful when _N_ is large, because then non-normality of the +#' distribution is not such an issue when estimating the uncertainty in these +#' sums. These standard errors, for all their flaws, should give a better +#' sense of uncertainty than what is obtained using the current standard +#' approach of comparing differences of deviances to a Chi-squared +#' distribution, a practice derived for Gaussian linear models or +#' asymptotically, and which only applies to nested models in any case. +#' +#' @template loo-and-psis-references +#' +#' @examples +#' \dontrun{ +#' loo1 <- loo(log_lik1) +#' loo2 <- loo(log_lik2) +#' print(compare(loo1, loo2), digits = 3) +#' print(compare(x = list(loo1, loo2))) +#' +#' waic1 <- waic(log_lik1) +#' waic2 <- waic(log_lik2) +#' compare(waic1, waic2) +#' } +#' +compare <- function(..., x = list()) { + .Deprecated("loo_compare") + dots <- list(...) + if (length(dots)) { + if (length(x)) { + stop("If 'x' is specified then '...' should not be specified.", + call. = FALSE) + } + nms <- as.character(match.call(expand.dots = TRUE))[-1L] + } else { + if (!is.list(x) || !length(x)) { + stop("'x' must be a list.", call. = FALSE) + } + dots <- x + nms <- names(dots) + if (!length(nms)) { + nms <- paste0("model", seq_along(dots)) + } + } + + if (!all(sapply(dots, is.loo))) { + stop("All inputs should have class 'loo'.") + } + if (length(dots) <= 1L) { + stop("'compare' requires at least two models.") + } else if (length(dots) == 2L) { + loo1 <- dots[[1]] + loo2 <- dots[[2]] + comp <- compare_two_models(loo1, loo2) + class(comp) <- c(class(comp), "old_compare.loo") + return(comp) + } else { + Ns <- sapply(dots, function(x) nrow(x$pointwise)) + if (!all(Ns == Ns[1L])) { + stop("Not all models have the same number of data points.", call. 
= FALSE) + } + + x <- sapply(dots, function(x) { + est <- x$estimates + setNames(c(est), nm = c(rownames(est), paste0("se_", rownames(est))) ) + }) + colnames(x) <- nms + rnms <- rownames(x) + comp <- x + ord <- order(x[grep("^elpd", rnms), ], decreasing = TRUE) + comp <- t(comp)[ord, ] + patts <- c("elpd", "p_", "^waic$|^looic$", "^se_waic$|^se_looic$") + col_ord <- unlist(sapply(patts, function(p) grep(p, colnames(comp))), + use.names = FALSE) + comp <- comp[, col_ord] + + # compute elpd_diff and se_elpd_diff relative to best model + rnms <- rownames(comp) + diffs <- mapply(elpd_diffs, dots[ord[1]], dots[ord]) + elpd_diff <- apply(diffs, 2, sum) + se_diff <- apply(diffs, 2, se_elpd_diff) + comp <- cbind(elpd_diff = elpd_diff, se_diff = se_diff, comp) + rownames(comp) <- rnms + class(comp) <- c("compare.loo", class(comp), "old_compare.loo") + comp + } +} + + + +# internal ---------------------------------------------------------------- +compare_two_models <- function(loo_a, loo_b, return = c("elpd_diff", "se"), check_dims = TRUE) { + if (check_dims) { + if (dim(loo_a$pointwise)[1] != dim(loo_b$pointwise)[1]) { + stop(paste("Models don't have the same number of data points.", + "\nFound N_1 =", dim(loo_a$pointwise)[1], "and N_2 =", dim(loo_b$pointwise)[1]), call. = FALSE) + } + } + + diffs <- elpd_diffs(loo_a, loo_b) + comp <- c(elpd_diff = sum(diffs), se = se_elpd_diff(diffs)) + structure(comp, class = "compare.loo") +} diff --git a/man/compare.Rd b/man/compare.Rd new file mode 100644 index 00000000..11412fe4 --- /dev/null +++ b/man/compare.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare.R +\name{compare} +\alias{compare} +\title{Model comparison (deprecated, old version)} +\usage{ +compare(..., x = list()) +} +\arguments{ +\item{...}{At least two objects returned by \code{\link[=loo]{loo()}} (or \code{\link[=waic]{waic()}}).} + +\item{x}{A list of at least two objects returned by \code{\link[=loo]{loo()}} (or +\code{\link[=waic]{waic()}}). This argument can be used as an alternative to +specifying the objects in \code{...}.} +} +\value{ +A vector or matrix with class \code{'compare.loo'} that has its own +print method. If exactly two objects are provided in \code{...} or +\code{x}, then the difference in expected predictive accuracy and the +standard error of the difference are returned. If more than two objects are +provided then a matrix of summary information is returned (see \strong{Details}). +} +\description{ +\strong{This function is deprecated}. Please use the new \code{\link[=loo_compare]{loo_compare()}} function +instead. +} +\details{ +When comparing two fitted models, we can estimate the difference in their +expected predictive accuracy by the difference in \code{elpd_loo} or +\code{elpd_waic} (or multiplied by -2, if desired, to be on the +deviance scale). + +\emph{When that difference, \code{elpd_diff}, is positive then the expected +predictive accuracy for the second model is higher. A negative +\code{elpd_diff} favors the first model.} + +When using \code{compare()} with more than two models, the values in the +\code{elpd_diff} and \code{se_diff} columns of the returned matrix are +computed by making pairwise comparisons between each model and the model +with the best ELPD (i.e., the model in the first row). +Although the \code{elpd_diff} column is equal to the difference in +\code{elpd_loo}, do not expect the \code{se_diff} column to be equal to the +the difference in \code{se_elpd_loo}. 
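A minimal sketch (toy vectors only, not package internals beyond the formula used by se_elpd_diff()) shows why se_diff is not a simple combination of the per-model standard errors: because the same N data points enter both models, the pointwise contributions are strongly correlated and the paired standard error is much smaller than the value obtained by treating the two ELPD estimates as independent. The vectors lpd1 and lpd2 below are made up for illustration.

set.seed(1)
N <- 500
lpd1 <- rnorm(N, mean = -1.0, sd = 1.0)         # pointwise log predictive densities, model 1
lpd2 <- lpd1 + rnorm(N, mean = -0.1, sd = 0.2)  # model 2, slightly worse but strongly correlated

diffs <- lpd2 - lpd1                             # pointwise elpd differences
c(elpd_diff = sum(diffs),
  se_diff   = sqrt(N) * sd(diffs),               # paired estimate (same formula as se_elpd_diff())
  se_indep  = sqrt(N * (var(lpd1) + var(lpd2)))) # what ignoring the pairing would give
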
+ +To compute the standard error of the difference in ELPD we use a +paired estimate to take advantage of the fact that the same set of \emph{N} +data points was used to fit both models. These calculations should be most +useful when \emph{N} is large, because then non-normality of the +distribution is not such an issue when estimating the uncertainty in these +sums. These standard errors, for all their flaws, should give a better +sense of uncertainty than what is obtained using the current standard +approach of comparing differences of deviances to a Chi-squared +distribution, a practice derived for Gaussian linear models or +asymptotically, and which only applies to nested models in any case. +} +\examples{ +\dontrun{ +loo1 <- loo(log_lik1) +loo2 <- loo(log_lik2) +print(compare(loo1, loo2), digits = 3) +print(compare(x = list(loo1, loo2))) + +waic1 <- waic(log_lik1) +waic2 <- waic(log_lik2) +compare(waic1, waic2) +} + +} +\references{ +Vehtari, A., Gelman, A., and Gabry, J. (2017). Practical Bayesian model +evaluation using leave-one-out cross-validation and WAIC. +\emph{Statistics and Computing}. 27(5), 1413--1432. doi:10.1007/s11222-016-9696-4 +(\href{https://link.springer.com/article/10.1007/s11222-016-9696-4}{journal version}, +\href{https://arxiv.org/abs/1507.04544}{preprint arXiv:1507.04544}). + +Vehtari, A., Simpson, D., Gelman, A., Yao, Y., and Gabry, J. (2024). +Pareto smoothed importance sampling. \emph{Journal of Machine Learning Research}, +25(72):1-58. +\href{https://jmlr.org/papers/v25/19-556.html}{PDF} +} diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 172b73bb..220d87c8 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -109,3 +109,83 @@ test_that("loo_compare returns expected result (3 models)", { # except rownames) to using 'x' argument expect_equal(comp1, loo_compare(x = list(w1, w2, w3)), ignore_attr = TRUE) }) + +# Tests for deprecated compare() ------------------------------------------ + +test_that("compare throws deprecation warnings", { + expect_warning(loo::compare(w1, w2), "Deprecated") + expect_warning(loo::compare(w1, w1, w2), "Deprecated") +}) + +test_that("compare returns expected result (2 models)", { + expect_warning(comp1 <- loo::compare(w1, w1), "Deprecated") + expect_snapshot(comp1) + expect_equal(comp1[1:2], c(elpd_diff = 0, se = 0)) + + expect_warning(comp2 <- loo::compare(w1, w2), "Deprecated") + expect_snapshot(comp2) + expect_named(comp2, c("elpd_diff", "se")) + expect_s3_class(comp2, "compare.loo") + + # specifying objects via ... and via arg x gives equal results + expect_warning(comp_via_list <- loo::compare(x = list(w1, w2)), "Deprecated") + expect_equal(comp2, comp_via_list) +}) + +test_that("compare returns expected result (3 models)", { + w3 <- suppressWarnings(waic(LLarr3)) + expect_warning(comp1 <- loo::compare(w1, w2, w3), "Deprecated") + + expect_equal( + colnames(comp1), + c( + "elpd_diff", + "se_diff", + "elpd_waic", + "se_elpd_waic", + "p_waic", + "se_p_waic", + "waic", + "se_waic" + ) + ) + expect_equal(rownames(comp1), c("w1", "w2", "w3")) + expect_equal(comp1[1, 1], 0) + expect_s3_class(comp1, "compare.loo") + expect_s3_class(comp1, "matrix") + expect_snapshot_value(comp1, style = "serialize") + + # specifying objects via '...' 
gives equivalent results (equal + # except rownames) to using 'x' argument + expect_warning( + comp_via_list <- loo::compare(x = list(w1, w2, w3)), + "Deprecated" + ) + expect_equal(comp1, comp_via_list, ignore_attr = TRUE) +}) + +test_that("compare throws appropriate errors", { + expect_error( + suppressWarnings(loo::compare(w1, w2, x = list(w1, w2))), + "should not be specified" + ) + expect_error(suppressWarnings(loo::compare(x = 2)), "must be a list") + expect_error( + suppressWarnings(loo::compare(x = list(2))), + "should have class 'loo'" + ) + expect_error( + suppressWarnings(loo::compare(x = list(w1))), + "requires at least two models" + ) + + w3 <- suppressWarnings(waic(LLarr2[,, -1])) + expect_error( + suppressWarnings(loo::compare(x = list(w1, w3))), + "same number of data points" + ) + expect_error( + suppressWarnings(loo::compare(x = list(w1, w2, w3))), + "same number of data points" + ) +}) From 8f521a56c7f448cc3f2908848f76bbb46fde3306 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 17:44:51 -0600 Subject: [PATCH 26/40] update tests --- R/loo_compare.R | 7 +-- tests/testthat/_snaps/compare.md | 95 +++++++++++++++++++++----------- tests/testthat/test_compare.R | 29 ++++++---- 3 files changed, 80 insertions(+), 51 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 3c3c5348..2a739045 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -178,12 +178,7 @@ loo_compare.default <- function(x, ...) { #' the best model? The default is `TRUE`. print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (inherits(x, "old_compare.loo")) { - stop( - "Output from old loo::compare() detected. ", - "Please rerun loo_compare() to get updated output ", - "or run unclass() on the object to print it.", - call. = FALSE - ) + return(unclass(x)) } if (!inherits(x, "data.frame")) { class(x) <- c(class(x), "data.frame") diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index 5458d967..82548aca 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -1,48 +1,77 @@ # loo_compare returns expected results (2 models) - WAoAAAACAAQEAgACAwAAAAMTAAAACgAAAA4AAAACAAAAAAAAAADAEDpTX5xF7gAAAA4AAAAC - AAAAAAAAAAA/tmpHtC8TAQAAAA4AAAACf/AAAAAAB6I/8AAAAAAAAAAAABAAAAACAAQACQAA - AAAABAAJAAAAB04gPCAxMDAAAAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAAAkARCD2z - EXBfQBEalRIN2T8AAAAOAAAAAkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/Gb - YJxtZ8cAAAAOAAAAAkBk4fDdyUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8A - AAQCAAAAAQAEAAkAAAAFbmFtZXMAAAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdz - ZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dh - aWMABAAJAAAADHNlX2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMA - BAAJAAAABHdhaWMABAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAA - AgAEAAkAAAALY29tcGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJ - cm93Lm5hbWVzAAAAEAAAAAIABAAJAAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAAA/g== + WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + ZGVsMgAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAA + AA4AAAACf/AAAAAAB6J/8AAAAAAHogAAABAAAAACAAQACQAAAAAABAAJAAAAAAAAAA4AAAAC + wFTh8N3JQljAVOHw3clCWAAAAA4AAAACQBEIPbMRcF9AEQg9sxFwXwAAAA4AAAACQAoowGHV + uVNACijAYdW5UwAAAA4AAAACP/H9Zexy814/8f1l7HLzXgAAAA4AAAACQGTh8N3JQlhAZOHw + 3clCWAAAAA4AAAACQCEIPbMRcF9AIQg9sxFwXwAABAIAAAABAAQACQAAAAVuYW1lcwAAABAA + AAALAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAA + 
AAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNl + X2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMA + BAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29t + cGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAA + DQAAAAKAAAAA/////gAAAP4= -# loo_compare returns expected result (3 models) +--- + + Code + print(comp1) + Output + model elpd_diff se_diff p_worse diag_pnorm + 1 model1 0.0 0.0 NA + 2 model2 0.0 0.0 NA + +--- + + WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + ZGVsMgAAAA4AAAACAAAAAAAAAADAEDpTX5xF7gAAAA4AAAACAAAAAAAAAAA/tmpHtC8TAQAA + AA4AAAACf/AAAAAAB6I/8AAAAAAAAAAAABAAAAACAAQACQAAAAAABAAJAAAAB04gPCAxMDAA + AAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAAAkARCD2zEXBfQBEalRIN2T8AAAAOAAAA + AkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/GbYJxtZ8cAAAAOAAAAAkBk4fDd + yUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8AAAQCAAAAAQAEAAkAAAAFbmFt + ZXMAAAAQAAAACwAEAAkAAAAFbW9kZWwABAAJAAAACWVscGRfZGlmZgAEAAkAAAAHc2VfZGlm + ZgAEAAkAAAAHcF93b3JzZQAEAAkAAAAKZGlhZ19wbm9ybQAEAAkAAAAJZWxwZF93YWljAAQA + CQAAAAxzZV9lbHBkX3dhaWMABAAJAAAABnBfd2FpYwAEAAkAAAAJc2VfcF93YWljAAQACQAA + AAR3YWljAAQACQAAAAdzZV93YWljAAAEAgAAAAEABAAJAAAABWNsYXNzAAAAEAAAAAIABAAJ + AAAAC2NvbXBhcmUubG9vAAQACQAAAApkYXRhLmZyYW1lAAAEAgAAAAEABAAJAAAACXJvdy5u + YW1lcwAAAA0AAAACgAAAAP////4AAAD+ - WAoAAAACAAQEAgACAwAAAAMTAAAACgAAAA4AAAADAAAAAAAAAADAEDpTX5xF7sAwDcqRtgQY - AAAADgAAAAMAAAAAAAAAAD+2ake0LxMBP8uv7eE07V4AAAAOAAAAA3/wAAAAAAeiP/AAAAAA - AAA/8AAAAAAAAAAAABAAAAADAAQACQAAAAAABAAJAAAAB04gPCAxMDAABAAJAAAAB04gPCAx - MDAAAAAOAAAAA8BU4fDdyUJYwFXllhPDBrnAWOVjgjbDYgAAAA4AAAADQBEIPbMRcF9AERqV - Eg3ZP0AQ8gXcaKATAAAADgAAAANACijAYdW5U0AmZ5XrANCKQEHI2FIa3QoAAAAOAAAAAz/x - /WXscvNeP/GbYJxtZ8c/8YDQkmfJXwAAAA4AAAADQGTh8N3JQlhAZeWWE8MGuUBo5WOCNsNi - AAAADgAAAANAIQg9sxFwX0AhGpUSDdk/QCDyBdxooBMAAAQCAAAAAQAEAAkAAAAFbmFtZXMA - AAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAAAAdwX3dvcnNl - AAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNlX2VscGRfd2Fp - YwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMABAAJAAAAB3Nl - X3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29tcGFyZS5sb28A - BAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAAEAAAAAMABAAJ - AAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAQACQAAAAZtb2RlbDMAAAD+ - -# compare returns expected result (2 models) +--- Code - comp1 + print(comp2) Output - elpd_diff se - 0.0 0.0 + model elpd_diff se_diff p_worse diag_pnorm + 1 model1 0.0 0.0 NA + 2 model2 -4.1 0.1 1.00 N < 100 + +# loo_compare returns expected result (3 models) + + WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAADAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + ZGVsMgAEAAkAAAAGbW9kZWwzAAAADgAAAAMAAAAAAAAAAMAQOlNfnEXuwDANypG2BBgAAAAO + AAAAAwAAAAAAAAAAP7ZqR7QvEwE/y6/t4TTtXgAAAA4AAAADf/AAAAAAB6I/8AAAAAAAAD/w + AAAAAAAAAAAAEAAAAAMABAAJAAAAAAAEAAkAAAAHTiA8IDEwMAAEAAkAAAAHTiA8IDEwMAAA + AA4AAAADwFTh8N3JQljAVeWWE8MGucBY5WOCNsNiAAAADgAAAANAEQg9sxFwX0ARGpUSDdk/ + QBDyBdxooBMAAAAOAAAAA0AKKMBh1blTQCZnlesA0IpAQcjYUhrdCgAAAA4AAAADP/H9Zexy + 814/8ZtgnG1nxz/xgNCSZ8lfAAAADgAAAANAZOHw3clCWEBl5ZYTwwa5QGjlY4I2w2IAAAAO + AAAAA0AhCD2zEXBfQCEalRIN2T9AIPIF3GigEwAABAIAAAABAAQACQAAAAVuYW1lcwAAABAA + AAALAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAA + AAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNl + X2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMA + BAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29t + 
cGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAA + DQAAAAKAAAAA/////QAAAP4= --- Code - comp2 + print(comp1) Output - elpd_diff se - -4.1 0.1 + model elpd_diff se_diff p_worse diag_pnorm + 1 model1 0.0 0.0 NA + 2 model2 -4.1 0.1 1.00 N < 100 + 3 model3 -16.1 0.2 1.00 N < 100 # compare returns expected result (3 models) diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 220d87c8..c1bf352a 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -60,6 +60,7 @@ test_that("loo_compare throws appropriate warnings", { comp_colnames <- c( + "model", "elpd_diff", "se_diff", "p_worse", @@ -77,19 +78,25 @@ test_that("loo_compare returns expected results (2 models)", { expect_s3_class(comp1, "compare.loo") expect_s3_class(comp1, "data.frame") expect_equal(colnames(comp1), comp_colnames) - expect_equal(rownames(comp1), c("model1", "model2")) - expect_output(print(comp1), "elpd_diff") - expect_equal(comp1[1:2, 1], c(0, 0), ignore_attr = TRUE) - expect_equal(comp1[1:2, 2], c(0, 0), ignore_attr = TRUE) + expect_equal(comp1$model, c("model1", "model2")) + expect_equal(comp1$elpd_diff, c(0, 0), ignore_attr = TRUE) + expect_equal(comp1$se_diff, c(0, 0), ignore_attr = TRUE) + expect_equal(comp1$p_worse, c(NA_real_, NA_real_), ignore_attr = TRUE) + expect_snapshot_value(comp1, style = "serialize") + expect_snapshot(print(comp1)) comp2 <- loo_compare(w1, w2) expect_s3_class(comp2, "compare.loo") expect_equal(colnames(comp2), comp_colnames) - expect_snapshot_value(comp2, style = "serialize") + expect_snapshot(print(comp2)) # specifying objects via ... and via arg x gives equal results expect_equal(comp2, loo_compare(x = list(w1, w2))) + + # custom naming works + comp3 <- loo_compare(x = list("A" = w2, "B" = w1)) + expect_equal(comp3$model, c("B", "A")) }) @@ -98,12 +105,12 @@ test_that("loo_compare returns expected result (3 models)", { comp1 <- loo_compare(w1, w2, w3) expect_equal(colnames(comp1), comp_colnames) - expect_equal(rownames(comp1), c("model1", "model2", "model3")) - expect_equal(comp1[1, 1], 0) + expect_equal(comp1$model, c("model1", "model2", "model3")) + expect_equal(comp1$p_worse, c(NA, 1, 1)) expect_s3_class(comp1, "compare.loo") expect_s3_class(comp1, "data.frame") - expect_snapshot_value(comp1, style = "serialize") + expect_snapshot(print(comp1)) # specifying objects via '...' gives equivalent results (equal # except rownames) to using 'x' argument @@ -119,13 +126,11 @@ test_that("compare throws deprecation warnings", { test_that("compare returns expected result (2 models)", { expect_warning(comp1 <- loo::compare(w1, w1), "Deprecated") - expect_snapshot(comp1) expect_equal(comp1[1:2], c(elpd_diff = 0, se = 0)) expect_warning(comp2 <- loo::compare(w1, w2), "Deprecated") - expect_snapshot(comp2) - expect_named(comp2, c("elpd_diff", "se")) - expect_s3_class(comp2, "compare.loo") + expect_equal(round(comp2[1:2], 3), c(elpd_diff = -4.057, se = 0.088)) + expect_s3_class(comp2, "old_compare.loo") # specifying objects via ... 
and via arg x gives equal results expect_warning(comp_via_list <- loo::compare(x = list(w1, w2)), "Deprecated") From dc9db69efdd6673c13ea12b67da6f75f20dc2542 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 18:07:26 -0600 Subject: [PATCH 27/40] cleanup print method --- R/loo_compare.R | 17 ++++++++--------- tests/testthat/_snaps/compare.md | 9 +++++++++ tests/testthat/test_compare.R | 1 + 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 2a739045..0660aea4 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -183,19 +183,18 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (!inherits(x, "data.frame")) { class(x) <- c(class(x), "data.frame") } - xcopy <- x - if (NCOL(xcopy) >= 2) { - xcopy <- xcopy[, c("elpd_diff", "se_diff")] - } - xcopy <- cbind(model = x$model, .fr(xcopy, digits)) + x2 <- cbind( + model = x$model, + .fr(x[, c("elpd_diff", "se_diff")], digits) + ) if (p_worse && "p_worse" %in% colnames(x)) { - xcopy <- cbind( - xcopy, - p_worse = .fr(x[, "p_worse"], 2), + x2 <- cbind( + x2, + p_worse = .fr(x[, "p_worse"], digits = 2), diag_pnorm = x[, "diag_pnorm"] ) } - print(xcopy, quote = FALSE) + print(x2, quote = FALSE) invisible(x) } diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index 82548aca..c0568451 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -46,6 +46,15 @@ 1 model1 0.0 0.0 NA 2 model2 -4.1 0.1 1.00 N < 100 +--- + + Code + print(comp2, p_worse = FALSE) + Output + model elpd_diff se_diff + 1 model1 0.0 0.0 + 2 model2 -4.1 0.1 + # loo_compare returns expected result (3 models) WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAADAAQACQAAAAZtb2RlbDEABAAJAAAABm1v diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index c1bf352a..6f12c5d5 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -90,6 +90,7 @@ test_that("loo_compare returns expected results (2 models)", { expect_equal(colnames(comp2), comp_colnames) expect_snapshot_value(comp2, style = "serialize") expect_snapshot(print(comp2)) + expect_snapshot(print(comp2, p_worse = FALSE)) # specifying objects via ... 
and via arg x gives equal results expect_equal(comp2, loo_compare(x = list(w1, w2))) From 3ed1c0c56231ca5631813e76ea7a2399f269df7e Mon Sep 17 00:00:00 2001 From: jgabry Date: Sun, 19 Oct 2025 13:24:01 -0600 Subject: [PATCH 28/40] improve backwards compatibility --- R/loo_compare.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/loo_compare.R b/R/loo_compare.R index 0660aea4..8e70df0c 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -183,6 +183,10 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (!inherits(x, "data.frame")) { class(x) <- c(class(x), "data.frame") } + if (!all(c("model", "elpd_diff", "se_diff") %in% colnames(x))) { + print(as.data.frame(x)) + return(x) + } x2 <- cbind( model = x$model, .fr(x[, c("elpd_diff", "se_diff")], digits) From 89d39f58e7d34f7281bd110ed442e9fb64872d3e Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Tue, 21 Oct 2025 10:17:01 +0300 Subject: [PATCH 29/40] change diag_pnorm to diag_diff --- R/loo_compare.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index d56687ad..e0f378fa 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -39,13 +39,13 @@ #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. #' -#' ## `p_worse` and `diag_pnorm` +#' ## `p_worse` and `diag_diff` #' The values in the `p_worse` column show the probability of each model #' having worse ELPD than the best model. These probabilities are computed #' with a normal approximation using the values from `elpd_diff` and #' `se_diff`. Sivula et al. (2025) present the conditions when the normal #' approximation used for SE and `se_diff` is good, and the column -#' `diag_pnorm` contains possible diagnostic messages: +#' `diag_diff` contains possible diagnostic messages: #' #' * `N < 100` (small data) #' * `|elpd_diff| < 4` (models make similar predictions) @@ -137,24 +137,24 @@ loo_compare.default <- function(x, ...) 
{ # * khat_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) N <- nrow(diffs) if (N < 100) { - diag_pnorm <- rep("N < 100", length(elpd_diff)) - diag_pnorm[elpd_diff == 0] <- "" + diag_diff <- rep("N < 100", length(elpd_diff)) + diag_diff[elpd_diff == 0] <- "" } else { - diag_pnorm <- rep("", length(elpd_diff)) - diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" + diag_diff <- rep("", length(elpd_diff)) + diag_diff[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) - diag_pnorm[khat_diff > 0.5] <- "khat_diff > 0.5" + diag_diff[khat_diff > 0.5] <- "khat_diff > 0.5" } comp <- cbind( data.frame( elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, - diag_pnorm = diag_pnorm + diag_diff = diag_diff ), as.data.frame(comp) ) @@ -188,7 +188,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { print( cbind(.fr(xcopy, digits), p_worse = .fr(x[, "p_worse"], 2), - diag_pnorm = x[, "diag_pnorm"]), + diag_diff = x[, "diag_diff"]), quote = FALSE ) } else { From 6d73537601b190d1416376df661218893f9e102d Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Tue, 21 Oct 2025 10:34:00 +0300 Subject: [PATCH 30/40] change diag_pnorm to diag_diff in tests --- tests/testthat/test_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 220d87c8..d6fe1deb 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -63,7 +63,7 @@ comp_colnames <- c( "elpd_diff", "se_diff", "p_worse", - "diag_pnorm", + "diag_diff", "elpd_waic", "se_elpd_waic", "p_waic", From abcf209e29a1df8633a71e76fa0a64d6afbd59ab Mon Sep 17 00:00:00 2001 From: jgabry Date: Tue, 21 Oct 2025 10:00:02 -0600 Subject: [PATCH 31/40] update test snapshots --- tests/testthat/_snaps/compare.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index 5458d967..61de13ce 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -6,11 +6,11 @@ EXBfQBEalRIN2T8AAAAOAAAAAkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/Gb YJxtZ8cAAAAOAAAAAkBk4fDdyUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8A AAQCAAAAAQAEAAkAAAAFbmFtZXMAAAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdz - ZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dh - aWMABAAJAAAADHNlX2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMA - BAAJAAAABHdhaWMABAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAA - AgAEAAkAAAALY29tcGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJ - cm93Lm5hbWVzAAAAEAAAAAIABAAJAAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAAA/g== + ZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAAlkaWFnX2RpZmYABAAJAAAACWVscGRfd2Fp + YwAEAAkAAAAMc2VfZWxwZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAE + AAkAAAAEd2FpYwAEAAkAAAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAAC + AAQACQAAAAtjb21wYXJlLmxvbwAEAAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAly + b3cubmFtZXMAAAAQAAAAAgAEAAkAAAAGbW9kZWwxAAQACQAAAAZtb2RlbDIAAAD+ # loo_compare returns expected result (3 models) @@ -22,11 +22,11 @@ /WXscvNeP/GbYJxtZ8c/8YDQkmfJXwAAAA4AAAADQGTh8N3JQlhAZeWWE8MGuUBo5WOCNsNi AAAADgAAAANAIQg9sxFwX0AhGpUSDdk/QCDyBdxooBMAAAQCAAAAAQAEAAkAAAAFbmFtZXMA 
AAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAAAAdwX3dvcnNl - AAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNlX2VscGRfd2Fp - YwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMABAAJAAAAB3Nl - X3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29tcGFyZS5sb28A - BAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAAEAAAAAMABAAJ - AAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAQACQAAAAZtb2RlbDMAAAD+ + AAQACQAAAAlkaWFnX2RpZmYABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2VfZWxwZF93YWlj + AAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkAAAAHc2Vf + d2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAACAAQACQAAAAtjb21wYXJlLmxvbwAE + AAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAlyb3cubmFtZXMAAAAQAAAAAwAEAAkA + AAAGbW9kZWwxAAQACQAAAAZtb2RlbDIABAAJAAAABm1vZGVsMwAAAP4= # compare returns expected result (2 models) From a375c93d27a2dfb07d1c883beeda91557c1fe279 Mon Sep 17 00:00:00 2001 From: jgabry Date: Tue, 21 Oct 2025 10:32:34 -0600 Subject: [PATCH 32/40] remove row numbers when printing --- R/loo_compare.R | 2 +- tests/testthat/_snaps/compare.md | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 12b49f5c..afe1cc4d 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -198,7 +198,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { diag_diff = x[, "diag_diff"] ) } - print(x2, quote = FALSE) + print(x2, quote = FALSE, row.names = FALSE) invisible(x) } diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index 6639fd57..5aee7b29 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -18,9 +18,9 @@ Code print(comp1) Output - model elpd_diff se_diff p_worse diag_diff - 1 model1 0.0 0.0 NA - 2 model2 0.0 0.0 NA + model elpd_diff se_diff p_worse diag_diff + model1 0.0 0.0 NA + model2 0.0 0.0 NA --- @@ -42,18 +42,18 @@ Code print(comp2) Output - model elpd_diff se_diff p_worse diag_diff - 1 model1 0.0 0.0 NA - 2 model2 -4.1 0.1 1.00 N < 100 + model elpd_diff se_diff p_worse diag_diff + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 --- Code print(comp2, p_worse = FALSE) Output - model elpd_diff se_diff - 1 model1 0.0 0.0 - 2 model2 -4.1 0.1 + model elpd_diff se_diff + model1 0.0 0.0 + model2 -4.1 0.1 # loo_compare returns expected result (3 models) @@ -77,10 +77,10 @@ Code print(comp1) Output - model elpd_diff se_diff p_worse diag_diff - 1 model1 0.0 0.0 NA - 2 model2 -4.1 0.1 1.00 N < 100 - 3 model3 -16.1 0.2 1.00 N < 100 + model elpd_diff se_diff p_worse diag_diff + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 + model3 -16.1 0.2 1.00 N < 100 # compare returns expected result (3 models) From f3fcb289103152aeb27c7f5b7293fbb7f5861dfa Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 11:53:26 +0300 Subject: [PATCH 33/40] add diag_elpd --- R/loo_compare.R | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index afe1cc4d..16372b9b 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -150,13 +150,34 @@ loo_compare.default <- function(x, ...) 
{ diag_diff[khat_diff > 0.5] <- "khat_diff > 0.5" } + # get khats for PSIS + khat_psis <- sapply(loos[ord], + \(loo) { + k <- loo$diagnostics[["pareto_k"]] + if (is.null(k)) { + out = "" + } else { + S <- dim(loo)[1] + khat_threshold <- ps_khat_threshold(S) + K <- sum(k > khat_threshold) + if (K==0) { + out <- "" + } else { + out <- paste0(K, " khat_psis > ", round(khat_threshold, 2)) + } + } + out + } + ) + comp <- cbind( data.frame( model = rnms, elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, - diag_diff = diag_diff + diag_diff = diag_diff, + diag_elpd = khat_psis ), as.data.frame(comp) ) @@ -195,7 +216,8 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { x2 <- cbind( x2, p_worse = .fr(x[, "p_worse"], digits = 2), - diag_diff = x[, "diag_diff"] + diag_diff = x[, "diag_diff"], + diag_elpd = x[, "diag_elpd"] ) } print(x2, quote = FALSE, row.names = FALSE) From 359853a8221715420c87ef654021d54d6b393777 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 11:53:36 +0300 Subject: [PATCH 34/40] improve loo_compare doc --- R/loo_compare.R | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 16372b9b..d970d2a0 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -39,7 +39,7 @@ #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. #' -#' ## `p_worse` and `diag_diff` +#' ## `p_worse`, `diag_diff`, and `diag_elpd` #' The values in the `p_worse` column show the probability of each model #' having worse ELPD than the best model. These probabilities are computed #' with a normal approximation using the values from `elpd_diff` and @@ -51,9 +51,24 @@ #' * `|elpd_diff| < 4` (models make similar predictions) #' * `khat > 0.5` (possible outliers) #' -#' If any of these diagnostic messages is shown, the normal approximation is -#' not well calibrated and the probabilities can be too large (small data or -#' similar predictions) or too small (outliers). +#' If any of these diagnostic messages is shown, the error distribution is +#' skewed or thick tailed and the normal approximation based on `elpd_diff` +#' and `se_diff` is not well calibrated. The probabilities `p_worse` are +#' likely to be too large (small data or similar predictions) or too small +#' (outliers). `elpd_diff` and `se_diff` are still indicative of the +#' differences and uncertainties, and for example, if `|elpd_diff|` is +#' many times larger than `se_diff` the difference is quite certain. +#' While `khat > 0.5` indicates possibility of outliers, it is also +#' possible that both models compared seem to be well specified based +#' on model checking, but the pointwise ELPD differences have such thick tails +#' that the normal approximation for the sum is not good. +#' +#' The column `diag_elpd` shows diagnostic for the pointwise ELPD +#' computations for each model. If `k khat_psis > 0.7` is shown, +#' where `k` is the number of high high Pareto k values in Pareto +#' smoothed importance sampling computation, then there may be +#' significant bias in `elpd_diff` favoring models with a large +#' number of high Pareto k values. 
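A minimal sketch of the quantities documented above, assuming only a numeric vector diffs of pointwise ELPD differences between a candidate model and the best model; the helper name p_worse_and_diag is made up for illustration, while stats::pnorm(), posterior::pareto_khat(), and the cutoffs N < 100, |elpd_diff| < 4, and 0.5 mirror the logic added in this patch series.

library(posterior)

p_worse_and_diag <- function(diffs) {
  N <- length(diffs)
  elpd_diff <- sum(diffs)
  se_diff <- sqrt(N) * sd(diffs)
  # normal approximation to Pr(candidate model has worse elpd than the best model)
  p_worse <- stats::pnorm(0, mean = elpd_diff, sd = se_diff)
  # conditions under which that approximation may be poorly calibrated
  msgs <- c(
    if (N < 100) "N < 100",
    if (abs(elpd_diff) < 4) "|elpd_diff| < 4",
    if (isTRUE(posterior::pareto_khat(diffs, tail = "both") > 0.5)) "khat_diff > 0.5"
  )
  list(p_worse = p_worse, diagnostics = msgs)
}

# toy example with a small, noisy difference
set.seed(2)
p_worse_and_diag(rnorm(300, mean = -0.02, sd = 0.5))
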
#' #' ## Warnings for many model comparisons #' If more than \eqn{11} models are compared, we internally recompute the model From b7db7ddacf2daedce4d9bcc54ecaab95eba14a16 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 15:57:29 +0300 Subject: [PATCH 35/40] clarifiy loo_compare diag_diff khat --- R/loo_compare.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index d970d2a0..490d8853 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -60,8 +60,10 @@ #' many times larger than `se_diff` the difference is quite certain. #' While `khat > 0.5` indicates possibility of outliers, it is also #' possible that both models compared seem to be well specified based -#' on model checking, but the pointwise ELPD differences have such thick tails -#' that the normal approximation for the sum is not good. +#' on model checking, but the pointwise ELPD differences have such thick +#' tails that the normal approximation for the sum is not good (note that +#' `khat`s in `diag_diff` column are different from the `khat`s in PSIS-LOO +#' diagnostic). #' #' The column `diag_elpd` shows diagnostic for the pointwise ELPD #' computations for each model. If `k khat_psis > 0.7` is shown, From 77e753f63462a459c0759d1f7d6eb198ad08886b Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 16:18:07 +0300 Subject: [PATCH 36/40] yet another small doc improvement --- R/loo_compare.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 490d8853..6a69445a 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -61,9 +61,9 @@ #' While `khat > 0.5` indicates possibility of outliers, it is also #' possible that both models compared seem to be well specified based #' on model checking, but the pointwise ELPD differences have such thick -#' tails that the normal approximation for the sum is not good (note that -#' `khat`s in `diag_diff` column are different from the `khat`s in PSIS-LOO -#' diagnostic). +#' tails that the normal approximation for the sum is not good (Vehtari et +#' al., 2024). Note that `khat`s in `diag_diff` column are different from +#' the `khat`s in PSIS-LOO diagnostic. #' #' The column `diag_elpd` shows diagnostic for the pointwise ELPD #' computations for each model. If `k khat_psis > 0.7` is shown, From c59414e33df1f54b9ab71e64e066bce500e4f047 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 18:52:44 +0300 Subject: [PATCH 37/40] Use function() Co-authored-by: Jonah Gabry --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 6a69445a..f9240341 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -169,7 +169,7 @@ loo_compare.default <- function(x, ...) { # get khats for PSIS khat_psis <- sapply(loos[ord], - \(loo) { + function(loo) { k <- loo$diagnostics[["pareto_k"]] if (is.null(k)) { out = "" From 93fdbff72cc363235f33a9c42d0712a3b39faaa4 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 18:59:09 +0300 Subject: [PATCH 38/40] another loo_compare doc edit --- R/loo_compare.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index f9240341..7649a746 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -58,12 +58,13 @@ #' (outliers). 
`elpd_diff` and `se_diff` are still indicative of the #' differences and uncertainties, and for example, if `|elpd_diff|` is #' many times larger than `se_diff` the difference is quite certain. -#' While `khat > 0.5` indicates possibility of outliers, it is also -#' possible that both models compared seem to be well specified based +#' The `khat` value for the `diag_diff` column is computed using +#' the pointwise elpd differences (and is different frin `khat`s in PSIS-LOO +#' diagnostic). While `khat > 0.5` indicates possibility of outliers, it is +#' also possible that both models compared seem to be well specified based #' on model checking, but the pointwise ELPD differences have such thick #' tails that the normal approximation for the sum is not good (Vehtari et -#' al., 2024). Note that `khat`s in `diag_diff` column are different from -#' the `khat`s in PSIS-LOO diagnostic. +#' al., 2024). #' #' The column `diag_elpd` shows diagnostic for the pointwise ELPD #' computations for each model. If `k khat_psis > 0.7` is shown, From ae48d6945e414c2ac432efdaceeb053c1174774c Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 20:02:58 +0300 Subject: [PATCH 39/40] adjust some diagnostic messages and documentation --- R/loo_compare.R | 57 +++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 7649a746..c53236ad 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -49,7 +49,7 @@ #' #' * `N < 100` (small data) #' * `|elpd_diff| < 4` (models make similar predictions) -#' * `khat > 0.5` (possible outliers) +#' * `k_diff > 0.5` (possible outliers) #' #' If any of these diagnostic messages is shown, the error distribution is #' skewed or thick tailed and the normal approximation based on `elpd_diff` @@ -58,16 +58,17 @@ #' (outliers). `elpd_diff` and `se_diff` are still indicative of the #' differences and uncertainties, and for example, if `|elpd_diff|` is #' many times larger than `se_diff` the difference is quite certain. -#' The `khat` value for the `diag_diff` column is computed using -#' the pointwise elpd differences (and is different frin `khat`s in PSIS-LOO -#' diagnostic). While `khat > 0.5` indicates possibility of outliers, it is +#' The `k_diff` value for the `diag_diff` column is computed using +#' the pointwise elpd differences (and is different from Pareto k's in PSIS-LOO +#' diagnostic). While `k_diff > 0.5` indicates possibility of outliers, it is #' also possible that both models compared seem to be well specified based #' on model checking, but the pointwise ELPD differences have such thick #' tails that the normal approximation for the sum is not good (Vehtari et -#' al., 2024). +#' al., 2024). Threshold 0.5 is used for `k_diff` as we do not do automatic +#' Pareto smoothing for the pointwise differences (Vehtari et al., 2024). #' #' The column `diag_elpd` shows diagnostic for the pointwise ELPD -#' computations for each model. If `k khat_psis > 0.7` is shown, +#' computations for each model. If `k_psis > 0.7` is shown, #' where `k` is the number of high high Pareto k values in Pareto #' smoothed importance sampling computation, then there may be #' significant bias in `elpd_diff` favoring models with a large @@ -152,7 +153,7 @@ loo_compare.default <- function(x, ...) 
{ # diagnostics to assess whether the normal approximation can be trusted # * N < 100: small data (Sivula et al., 2025) # * |elpd_diff| < 4: similar predictions (Sivula et al., 2025) - # * khat_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) + # * k_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) N <- nrow(diffs) if (N < 100) { diag_diff <- rep("N < 100", length(elpd_diff)) @@ -160,34 +161,30 @@ loo_compare.default <- function(x, ...) { } else { diag_diff <- rep("", length(elpd_diff)) diag_diff[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" - khat_diff <- rep(NA, length(elpd_diff)) - khat_diff[elpd_diff != 0] <- apply( + k_diff <- rep(NA, length(elpd_diff)) + k_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) - diag_diff[khat_diff > 0.5] <- "khat_diff > 0.5" + diag_diff[k_diff > 0.5] <- "k_diff > 0.5" } # get khats for PSIS - khat_psis <- sapply(loos[ord], - function(loo) { - k <- loo$diagnostics[["pareto_k"]] - if (is.null(k)) { - out = "" - } else { - S <- dim(loo)[1] - khat_threshold <- ps_khat_threshold(S) - K <- sum(k > khat_threshold) - if (K==0) { - out <- "" - } else { - out <- paste0(K, " khat_psis > ", round(khat_threshold, 2)) - } - } - out - } - ) - + k_psis <- sapply(loos[ord], + function(loo) { + k <- loo$diagnostics[["pareto_k"]] + if (is.null(k)) { + out = "" + } else { + S <- dim(loo)[1] + khat_threshold <- ps_khat_threshold(S) + K <- sum(k > khat_threshold) + out <- ifelse(K==0, "", paste0(K, " k_psis > ", round(khat_threshold, 2))) + } + out + } + ) + comp <- cbind( data.frame( model = rnms, @@ -195,7 +192,7 @@ loo_compare.default <- function(x, ...) { se_diff = se_diff, p_worse = p_worse, diag_diff = diag_diff, - diag_elpd = khat_psis + diag_elpd = k_psis ), as.data.frame(comp) ) From 4f5872b5d93ff0a9e6490cf544ddbe2b0c057b56 Mon Sep 17 00:00:00 2001 From: jgabry Date: Wed, 22 Oct 2025 16:25:48 -0600 Subject: [PATCH 40/40] edit doc, fix tests, move diagnostics to internal functions --- R/loo_compare.R | 126 ++++++++++++++++--------------- man/find_model_names.Rd | 1 - man/loo_compare.Rd | 31 ++++++-- tests/testthat/_snaps/compare.md | 87 ++++++++++----------- tests/testthat/test_compare.R | 5 ++ 5 files changed, 140 insertions(+), 110 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index c53236ad..b67f8c0e 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -53,26 +53,28 @@ #' #' If any of these diagnostic messages is shown, the error distribution is #' skewed or thick tailed and the normal approximation based on `elpd_diff` -#' and `se_diff` is not well calibrated. The probabilities `p_worse` are -#' likely to be too large (small data or similar predictions) or too small -#' (outliers). `elpd_diff` and `se_diff` are still indicative of the -#' differences and uncertainties, and for example, if `|elpd_diff|` is -#' many times larger than `se_diff` the difference is quite certain. -#' The `k_diff` value for the `diag_diff` column is computed using -#' the pointwise elpd differences (and is different from Pareto k's in PSIS-LOO -#' diagnostic). While `k_diff > 0.5` indicates possibility of outliers, it is -#' also possible that both models compared seem to be well specified based -#' on model checking, but the pointwise ELPD differences have such thick -#' tails that the normal approximation for the sum is not good (Vehtari et -#' al., 2024). 
Threshold 0.5 is used for `k_diff` as we do not do automatic -#' Pareto smoothing for the pointwise differences (Vehtari et al., 2024). +#' and `se_diff` is not well calibrated. In that case, the probabilities +#' `p_worse` are likely to be too large (small data or similar predictions) or +#' too small (outliers). However, `elpd_diff` and `se_diff` will still be +#' indicative of the differences and uncertainties (for example, if +#' `|elpd_diff|` is many times larger than `se_diff` the difference is quite +#' certain). #' -#' The column `diag_elpd` shows diagnostic for the pointwise ELPD -#' computations for each model. If `k_psis > 0.7` is shown, -#' where `k` is the number of high high Pareto k values in Pareto -#' smoothed importance sampling computation, then there may be -#' significant bias in `elpd_diff` favoring models with a large -#' number of high Pareto k values. +#' The `k_diff` value for the `diag_diff` column is computed using the +#' pointwise ELPD differences (and is different from the Pareto k's in +#' PSIS-LOO diagnostic). While `k_diff > 0.5` indicates the *possibility* of +#' outliers, it is also possible that both models compared seem to be well +#' specified based on model checking, but the pointwise ELPD differences have +#' such thick tails that the normal approximation for the sum is not good +#' (Vehtari et al., 2024). A threshold of 0.5 is used for `k_diff` as we do +#' not do automatic Pareto smoothing for the pointwise differences (Vehtari et +#' al., 2024). +#' +#' The column `diag_elpd` shows the PSIS-LOO Pareto k diagnostic for the +#' pointwise ELPD computations for each model. If `K k_psis > 0.7` is shown, +#' where `K` is the number of high high Pareto k values in the PSIS +#' computation, then there may be significant bias in `elpd_diff` favoring +#' models with a large number of high Pareto k values. #' #' ## Warnings for many model comparisons #' If more than \eqn{11} models are compared, we internally recompute the model @@ -150,49 +152,14 @@ loo_compare.default <- function(x, ...) 
{ p_worse <- stats::pnorm(0, elpd_diff, se_diff) p_worse[elpd_diff == 0] <- NA - # diagnostics to assess whether the normal approximation can be trusted - # * N < 100: small data (Sivula et al., 2025) - # * |elpd_diff| < 4: similar predictions (Sivula et al., 2025) - # * k_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) - N <- nrow(diffs) - if (N < 100) { - diag_diff <- rep("N < 100", length(elpd_diff)) - diag_diff[elpd_diff == 0] <- "" - } else { - diag_diff <- rep("", length(elpd_diff)) - diag_diff[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" - k_diff <- rep(NA, length(elpd_diff)) - k_diff[elpd_diff != 0] <- apply( - diffs[, elpd_diff != 0, drop = FALSE], 2, - function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") - )) - diag_diff[k_diff > 0.5] <- "k_diff > 0.5" - } - - # get khats for PSIS - k_psis <- sapply(loos[ord], - function(loo) { - k <- loo$diagnostics[["pareto_k"]] - if (is.null(k)) { - out = "" - } else { - S <- dim(loo)[1] - khat_threshold <- ps_khat_threshold(S) - K <- sum(k > khat_threshold) - out <- ifelse(K==0, "", paste0(K, " k_psis > ", round(khat_threshold, 2))) - } - out - } - ) - comp <- cbind( data.frame( model = rnms, elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, - diag_diff = diag_diff, - diag_elpd = k_psis + diag_diff = diag_diff(nrow(diffs), elpd_diff), + diag_elpd = diag_elpd(loos[ord]) ), as.data.frame(comp) ) @@ -311,7 +278,6 @@ loo_compare_checks <- function(loos) { #' Find the model names associated with `"loo"` objects #' #' @export -#' @keywords internal #' @param x List of `"loo"` objects. #' @return Character vector of model names the same length as `x.` #' @@ -340,7 +306,6 @@ find_model_names <- function(x) { #' Compute the loo_compare matrix -#' @keywords internal #' @noRd #' @param loos List of `"loo"` objects. loo_compare_matrix <- function(loos){ @@ -362,7 +327,6 @@ loo_compare_matrix <- function(loos){ #' Computes the order of loos for comparison #' @noRd -#' @keywords internal #' @param loos List of `"loo"` objects. loo_compare_order <- function(loos){ tmp <- sapply(loos, function(x) { @@ -377,7 +341,6 @@ loo_compare_order <- function(loos){ #' Perform checks on `"loo"` objects __after__ comparison #' @noRd -#' @keywords internal #' @param loos List of `"loo"` objects. #' @param ord List of `"loo"` object orderings. #' @return Nothing, just possibly throws errors/warnings. @@ -419,14 +382,12 @@ loo_order_stat_check <- function(loos, ord) { #' Returns the middle index of a vector #' @noRd -#' @keywords internal #' @param vec A vector. #' @return Integer index value. middle_idx <- function(vec) floor(length(vec) / 2) #' Computes maximum order statistic from K Gaussians #' @noRd -#' @keywords internal #' @param K Number of Gaussians. #' @param c Scaling of the order statistic. #' @return Numeric expected maximum from K samples from a Gaussian with mean @@ -434,3 +395,44 @@ middle_idx <- function(vec) floor(length(vec) / 2) order_stat_heuristic <- function(K, c) { qnorm(p = 1 - 1 / (K * 2), mean = 0, sd = c) } + +#' Count number of high Pareto k values in PSIS-LOO and create diagnostic message +#' @noRd +#' @param loos Ordered list of loo objects. +#' @return Character vector of diagnostic messages. 
+diag_elpd <- function(loos) { + sapply(loos, function(loo) { + k <- loo$diagnostics[["pareto_k"]] + if (is.null(k)) { + out <- "" + } else { + S <- dim(loo)[1] + khat_threshold <- ps_khat_threshold(S) + K <- sum(k > khat_threshold) + out <- ifelse(K == 0, "", paste0(K, " k_psis > ", round(khat_threshold, 2))) + } + out + }) +} + +#' Create diagnostic for elpd differences +#' @noRd +#' @param N Number of data points. +#' @param elpd_diff Vector of elpd differences. +#' @return Character vector of diagnostic messages. +diag_diff <- function(N, elpd_diff) { + if (N < 100) { + diag_diff <- rep("N < 100", length(elpd_diff)) + diag_diff[elpd_diff == 0] <- "" + } else { + diag_diff <- rep("", length(elpd_diff)) + diag_diff[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" + k_diff <- rep(NA, length(elpd_diff)) + k_diff[elpd_diff != 0] <- apply( + diffs[, elpd_diff != 0, drop = FALSE], 2, + function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") + )) + diag_diff[k_diff > 0.5] <- "k_diff > 0.5" + } + diag_diff +} diff --git a/man/find_model_names.Rd b/man/find_model_names.Rd index cdb76b80..70a79d58 100644 --- a/man/find_model_names.Rd +++ b/man/find_model_names.Rd @@ -15,4 +15,3 @@ Character vector of model names the same length as \code{x.} \description{ Find the model names associated with \code{"loo"} objects } -\keyword{internal} diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index 6b7350ea..ecac018b 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -65,7 +65,7 @@ distribution, a practice derived for Gaussian linear models or asymptotically, and which only applies to nested models in any case. } -\subsection{\code{p_worse} and \code{diag_diff}}{ +\subsection{\code{p_worse}, \code{diag_diff}, and \code{diag_elpd}}{ The values in the \code{p_worse} column show the probability of each model having worse ELPD than the best model. These probabilities are computed @@ -76,12 +76,33 @@ approximation used for SE and \code{se_diff} is good, and the column \itemize{ \item \code{N < 100} (small data) \item \verb{|elpd_diff| < 4} (models make similar predictions) -\item \code{khat > 0.5} (possible outliers) +\item \code{k_diff > 0.5} (possible outliers) } -If any of these diagnostic messages is shown, the normal approximation is -not well calibrated and the probabilities can be too large (small data or -similar predictions) or too small (outliers). +If any of these diagnostic messages is shown, the error distribution is +skewed or thick tailed and the normal approximation based on \code{elpd_diff} +and \code{se_diff} is not well calibrated. In that case, the probabilities +\code{p_worse} are likely to be too large (small data or similar predictions) or +too small (outliers). However, \code{elpd_diff} and \code{se_diff} will still be +indicative of the differences and uncertainties (for example, if +\verb{|elpd_diff|} is many times larger than \code{se_diff} the difference is quite +certain). + +The \code{k_diff} value for the \code{diag_diff} column is computed using the +pointwise ELPD differences (and is different from the Pareto k's in +PSIS-LOO diagnostic). While \code{k_diff > 0.5} indicates the \emph{possibility} of +outliers, it is also possible that both models compared seem to be well +specified based on model checking, but the pointwise ELPD differences have +such thick tails that the normal approximation for the sum is not good +(Vehtari et al., 2024). 
+automatic Pareto smoothing is not applied to the pointwise differences
+(Vehtari et al., 2024).
+
+The column \code{diag_elpd} shows the PSIS-LOO Pareto k diagnostic for the
+pointwise ELPD computations for each model. If \verb{K k_psis > 0.7} is shown,
+where \code{K} is the number of high Pareto k values in the PSIS
+computation, then there may be significant bias in \code{elpd_diff} favoring
+models with a large number of high Pareto k values.
 }
 
 \subsection{Warnings for many model comparisons}{
diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md
index 5aee7b29..1c23a744 100644
--- a/tests/testthat/_snaps/compare.md
+++ b/tests/testthat/_snaps/compare.md
@@ -1,50 +1,52 @@
 # loo_compare returns expected results (2 models)
 
- WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v
 ZGVsMgAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAA
- AA4AAAACf/AAAAAAB6J/8AAAAAAHogAAABAAAAACAAQACQAAAAAABAAJAAAAAAAAAA4AAAAC
- wFTh8N3JQljAVOHw3clCWAAAAA4AAAACQBEIPbMRcF9AEQg9sxFwXwAAAA4AAAACQAoowGHV
- uVNACijAYdW5UwAAAA4AAAACP/H9Zexy814/8f1l7HLzXgAAAA4AAAACQGTh8N3JQlhAZOHw
- 3clCWAAAAA4AAAACQCEIPbMRcF9AIQg9sxFwXwAABAIAAAABAAQACQAAAAVuYW1lcwAAABAA
- AAALAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAA
- AAdwX3dvcnNlAAQACQAAAAlkaWFnX2RpZmYABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2Vf
- ZWxwZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAE
- AAkAAAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAACAAQACQAAAAtjb21w
- YXJlLmxvbwAEAAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAlyb3cubmFtZXMAAAAN
- AAAAAoAAAAD////+AAAA/g==
+ WAoAAAACAAQEAgACAwAAAAMTAAAADAAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v
 ZGVsMgAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAA
 AA4AAAACf/AAAAAAB6J/8AAAAAAHogAAABAAAAACAAQACQAAAAAABAAJAAAAAAAAABAAAAAC
+ AAQACQAAAAAABAAJAAAAAAAAAA4AAAACwFTh8N3JQljAVOHw3clCWAAAAA4AAAACQBEIPbMR
+ cF9AEQg9sxFwXwAAAA4AAAACQAoowGHVuVNACijAYdW5UwAAAA4AAAACP/H9Zexy814/8f1l
+ 7HLzXgAAAA4AAAACQGTh8N3JQlhAZOHw3clCWAAAAA4AAAACQCEIPbMRcF9AIQg9sxFwXwAA
+ BAIAAAABAAQACQAAAAVuYW1lcwAAABAAAAAMAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9k
+ aWZmAAQACQAAAAdzZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAAlkaWFnX2RpZmYABAAJ
+ AAAACWRpYWdfZWxwZAAEAAkAAAAJZWxwZF93YWljAAQACQAAAAxzZV9lbHBkX3dhaWMABAAJ
+ AAAABnBfd2FpYwAEAAkAAAAJc2VfcF93YWljAAQACQAAAAR3YWljAAQACQAAAAdzZV93YWlj
+ AAAEAgAAAAEABAAJAAAABWNsYXNzAAAAEAAAAAIABAAJAAAAC2NvbXBhcmUubG9vAAQACQAA
+ AApkYXRhLmZyYW1lAAAEAgAAAAEABAAJAAAACXJvdy5uYW1lcwAAAA0AAAACgAAAAP////4A
+ AAD+
 
 ---
 
     Code
      print(comp1)
    Output
-      model elpd_diff se_diff p_worse diag_diff
-      model1   0.0     0.0      NA
-      model2   0.0     0.0      NA
+      model elpd_diff se_diff p_worse diag_diff diag_elpd
+      model1   0.0     0.0      NA
+      model2   0.0     0.0      NA
 
 ---
 
- WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v
+ WAoAAAACAAQEAgACAwAAAAMTAAAADAAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v
 ZGVsMgAAAA4AAAACAAAAAAAAAADAEDpTX5xF7gAAAA4AAAACAAAAAAAAAAA/tmpHtC8TAQAA
 AA4AAAACf/AAAAAAB6I/8AAAAAAAAAAAABAAAAACAAQACQAAAAAABAAJAAAAB04gPCAxMDAA
- AAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAAAkARCD2zEXBfQBEalRIN2T8AAAAOAAAA
 AkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/GbYJxtZ8cAAAAOAAAAAkBk4fDd
- yUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8AAAQCAAAAAQAEAAkAAAAFbmFt
- ZXMAAAAQAAAACwAEAAkAAAAFbW9kZWwABAAJAAAACWVscGRfZGlmZgAEAAkAAAAHc2VfZGlm
- ZgAEAAkAAAAHcF93b3JzZQAEAAkAAAAJZGlhZ19kaWZmAAQACQAAAAllbHBkX3dhaWMABAAJ
- AAAADHNlX2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAA
- BHdhaWMABAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkA
-
AAALY29tcGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5h - bWVzAAAADQAAAAKAAAAA/////gAAAP4= + AAAQAAAAAgAEAAkAAAAAAAQACQAAAAAAAAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAA + AkARCD2zEXBfQBEalRIN2T8AAAAOAAAAAkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXs + cvNeP/GbYJxtZ8cAAAAOAAAAAkBk4fDdyUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEa + lRIN2T8AAAQCAAAAAQAEAAkAAAAFbmFtZXMAAAAQAAAADAAEAAkAAAAFbW9kZWwABAAJAAAA + CWVscGRfZGlmZgAEAAkAAAAHc2VfZGlmZgAEAAkAAAAHcF93b3JzZQAEAAkAAAAJZGlhZ19k + aWZmAAQACQAAAAlkaWFnX2VscGQABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2VfZWxwZF93 + YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkAAAAH + c2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAACAAQACQAAAAtjb21wYXJlLmxv + bwAEAAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAlyb3cubmFtZXMAAAANAAAAAoAA + AAD////+AAAA/g== --- Code print(comp2) Output - model elpd_diff se_diff p_worse diag_diff - model1 0.0 0.0 NA - model2 -4.1 0.1 1.00 N < 100 + model elpd_diff se_diff p_worse diag_diff diag_elpd + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 --- @@ -57,30 +59,31 @@ # loo_compare returns expected result (3 models) - WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAADAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + WAoAAAACAAQEAgACAwAAAAMTAAAADAAAABAAAAADAAQACQAAAAZtb2RlbDEABAAJAAAABm1v ZGVsMgAEAAkAAAAGbW9kZWwzAAAADgAAAAMAAAAAAAAAAMAQOlNfnEXuwDANypG2BBgAAAAO AAAAAwAAAAAAAAAAP7ZqR7QvEwE/y6/t4TTtXgAAAA4AAAADf/AAAAAAB6I/8AAAAAAAAD/w AAAAAAAAAAAAEAAAAAMABAAJAAAAAAAEAAkAAAAHTiA8IDEwMAAEAAkAAAAHTiA8IDEwMAAA - AA4AAAADwFTh8N3JQljAVeWWE8MGucBY5WOCNsNiAAAADgAAAANAEQg9sxFwX0ARGpUSDdk/ - QBDyBdxooBMAAAAOAAAAA0AKKMBh1blTQCZnlesA0IpAQcjYUhrdCgAAAA4AAAADP/H9Zexy - 814/8ZtgnG1nxz/xgNCSZ8lfAAAADgAAAANAZOHw3clCWEBl5ZYTwwa5QGjlY4I2w2IAAAAO - AAAAA0AhCD2zEXBfQCEalRIN2T9AIPIF3GigEwAABAIAAAABAAQACQAAAAVuYW1lcwAAABAA - AAALAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAA - AAdwX3dvcnNlAAQACQAAAAlkaWFnX2RpZmYABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2Vf - ZWxwZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAE - AAkAAAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAACAAQACQAAAAtjb21w - YXJlLmxvbwAEAAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAlyb3cubmFtZXMAAAAN - AAAAAoAAAAD////9AAAA/g== + ABAAAAADAAQACQAAAAAABAAJAAAAAAAEAAkAAAAAAAAADgAAAAPAVOHw3clCWMBV5ZYTwwa5 + wFjlY4I2w2IAAAAOAAAAA0ARCD2zEXBfQBEalRIN2T9AEPIF3GigEwAAAA4AAAADQAoowGHV + uVNAJmeV6wDQikBByNhSGt0KAAAADgAAAAM/8f1l7HLzXj/xm2CcbWfHP/GA0JJnyV8AAAAO + AAAAA0Bk4fDdyUJYQGXllhPDBrlAaOVjgjbDYgAAAA4AAAADQCEIPbMRcF9AIRqVEg3ZP0Ag + 8gXcaKATAAAEAgAAAAEABAAJAAAABW5hbWVzAAAAEAAAAAwABAAJAAAABW1vZGVsAAQACQAA + AAllbHBkX2RpZmYABAAJAAAAB3NlX2RpZmYABAAJAAAAB3Bfd29yc2UABAAJAAAACWRpYWdf + ZGlmZgAEAAkAAAAJZGlhZ19lbHBkAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNlX2VscGRf + d2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMABAAJAAAA + B3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29tcGFyZS5s + b28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAADQAAAAKA + AAAA/////QAAAP4= --- Code print(comp1) Output - model elpd_diff se_diff p_worse diag_diff - model1 0.0 0.0 NA - model2 -4.1 0.1 1.00 N < 100 - model3 -16.1 0.2 1.00 N < 100 + model elpd_diff se_diff p_worse diag_diff diag_elpd + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 + model3 -16.1 0.2 1.00 N < 100 # compare returns expected result (3 models) diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 678a4858..8463c411 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -65,6 +65,7 @@ comp_colnames <- c( "se_diff", "p_worse", "diag_diff", + "diag_elpd", 
"elpd_waic", "se_elpd_waic", "p_waic", @@ -88,6 +89,9 @@ test_that("loo_compare returns expected results (2 models)", { comp2 <- loo_compare(w1, w2) expect_s3_class(comp2, "compare.loo") expect_equal(colnames(comp2), comp_colnames) + expect_equal(comp2$p_worse, c(NA, 1)) + expect_equal(comp2$diag_diff, c("", "N < 100")) + expect_equal(comp2$diag_elpd, c("", "")) expect_snapshot_value(comp2, style = "serialize") expect_snapshot(print(comp2)) expect_snapshot(print(comp2, p_worse = FALSE)) @@ -108,6 +112,7 @@ test_that("loo_compare returns expected result (3 models)", { expect_equal(colnames(comp1), comp_colnames) expect_equal(comp1$model, c("model1", "model2", "model3")) expect_equal(comp1$p_worse, c(NA, 1, 1)) + expect_equal(comp1$diag_diff, c("", "N < 100", "N < 100")) expect_s3_class(comp1, "compare.loo") expect_s3_class(comp1, "data.frame") expect_snapshot_value(comp1, style = "serialize")