From a909ceaa138b6bdfc71c443d0906150d92ef0273 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Sun, 31 Aug 2025 19:58:48 +0300 Subject: [PATCH 01/40] add diff diagnostics --- R/loo_compare.R | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 028fa59b..9dd182f0 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -116,7 +116,23 @@ loo_compare.default <- function(x, ...) { diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) elpd_diff <- apply(diffs, 2, sum) se_diff <- apply(diffs, 2, se_elpd_diff) - comp <- cbind(elpd_diff = elpd_diff, se_diff = se_diff, comp) + p_worse <- pnorm(0, elpd_diff, se_diff) + p_worse[elpd_diff==0] <- NA + N <- nrow(diffs) + if (N<100) { + diag_pnorm <- rep("N < 100", length(elpd_diff)) + diag_pnorm[elpd_diff==0] = "" + } else { + diag_pnorm <- rep("", length(elpd_diff)) + diag_pnorm[elpd_diff>-4 & elpd_diff!=0] <- "similar predictions" + khat_diff <- rep(NA, length(elpd_diff)) + khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) + diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > ", .fr(ps_khat_threshold(N), 2)) + } + rownames(comp) <- rnms + comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, + p_worse = p_worse, diag_pnorm = diag_pnorm), + as.data.frame(comp)) rownames(comp) <- rnms # run order statistics-based checks on models @@ -143,7 +159,7 @@ print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE) { } else if (NCOL(xcopy) >= 2 && simplify) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - print(.fr(xcopy, digits), quote = FALSE) + print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) invisible(x) } @@ -172,7 +188,6 @@ se_elpd_diff <- function(diffs) { sqrt(N) * sd(diffs) } - #' Perform checks on `"loo"` objects before comparison #' @noRd #' @param loos List of `"loo"` objects. From b94019298d58433455092257a4f53c21e1897420 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Thu, 16 Oct 2025 16:41:41 +0300 Subject: [PATCH 02/40] add comments and option whether the probabilities are printed --- R/loo_compare.R | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 9dd182f0..3c5bd883 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -116,15 +116,22 @@ loo_compare.default <- function(x, ...) 
{ diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) elpd_diff <- apply(diffs, 2, sum) se_diff <- apply(diffs, 2, se_elpd_diff) + # compute probabilities that a model has worse elpd than the best model + # using a normal approximation (Sivula et al., 2025) p_worse <- pnorm(0, elpd_diff, se_diff) p_worse[elpd_diff==0] <- NA N <- nrow(diffs) + # diagnostics to assess whether the normal approximation can be trusted if (N<100) { + # small N (Sivula et al., 2025) diag_pnorm <- rep("N < 100", length(elpd_diff)) diag_pnorm[elpd_diff==0] = "" } else { diag_pnorm <- rep("", length(elpd_diff)) + # similar predictions (Sivula et al., 2025) diag_pnorm[elpd_diff>-4 & elpd_diff!=0] <- "similar predictions" + # possible outliers in differences (Sivula et al., 2025; + # Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > ", .fr(ps_khat_threshold(N), 2)) @@ -147,9 +154,10 @@ loo_compare.default <- function(x, ...) { #' @param digits For the print method only, the number of digits to use when #' printing. #' @param simplify For the print method only, should only the essential columns -#' of the summary matrix be printed? The entire matrix is always returned, but -#' by default only the most important columns are printed. -print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE) { +#' of the summary matrix be printed? The entire matrix is always returned, bu#' @param pnorm For the print method only, should we include the normal +#' approximation based probability of model having worse performance than +#' the best model +print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, pnorm = FALSE) { xcopy <- x if (inherits(xcopy, "old_compare.loo")) { if (NCOL(xcopy) >= 2 && simplify) { @@ -159,12 +167,16 @@ print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE) { } else if (NCOL(xcopy) >= 2 && simplify) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) - invisible(x) + if (p_worse) { + print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) + invisible(x) + } else { + print(cbind(.fr(xcopy, digits), quote = FALSE)) + invisible(x) + } } - # internal ---------------------------------------------------------------- #' Compute pointwise elpd differences From 6fe41bb9b7d2760e9274a2a084e8d50b3f59dca7 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:22:28 +0300 Subject: [PATCH 03/40] Apply Jonah's suggestions from code review Co-authored-by: Jonah Gabry --- R/loo_compare.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 3c5bd883..0355c0ef 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -118,7 +118,7 @@ loo_compare.default <- function(x, ...) { se_diff <- apply(diffs, 2, se_elpd_diff) # compute probabilities that a model has worse elpd than the best model # using a normal approximation (Sivula et al., 2025) - p_worse <- pnorm(0, elpd_diff, se_diff) + p_worse <- stats::pnorm(0, elpd_diff, se_diff) p_worse[elpd_diff==0] <- NA N <- nrow(diffs) # diagnostics to assess whether the normal approximation can be trusted @@ -154,9 +154,11 @@ loo_compare.default <- function(x, ...) 
{ #' @param digits For the print method only, the number of digits to use when #' printing. #' @param simplify For the print method only, should only the essential columns -#' of the summary matrix be printed? The entire matrix is always returned, bu#' @param pnorm For the print method only, should we include the normal -#' approximation based probability of model having worse performance than -#' the best model +#' of the summary matrix be printed? The entire matrix is always returned, but +#' by default only the most important columns are printed. +#' @param pnorm For the print method only, should we include the normal +#' approximation based probability of each model having worse performance than +#' the best model? print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, pnorm = FALSE) { xcopy <- x if (inherits(xcopy, "old_compare.loo")) { @@ -171,7 +173,7 @@ print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, pnorm = FALSE print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) invisible(x) } else { - print(cbind(.fr(xcopy, digits), quote = FALSE)) + print(cbind(.fr(xcopy, digits)), quote = FALSE) invisible(x) } } From fa0d79d29db4f78e48f1400290d05f7e4b289fc5 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:24:09 +0300 Subject: [PATCH 04/40] fix argument name --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 0355c0ef..06730baa 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -159,7 +159,7 @@ loo_compare.default <- function(x, ...) { #' @param pnorm For the print method only, should we include the normal #' approximation based probability of each model having worse performance than #' the best model? -print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, pnorm = FALSE) { +print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, p_worse = TRUE) { xcopy <- x if (inherits(xcopy, "old_compare.loo")) { if (NCOL(xcopy) >= 2 && simplify) { From 65ef286a12cd3b788694bdef0c991429f3b62d54 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:33:46 +0300 Subject: [PATCH 05/40] add more documentation (+ khat threshold 0.5 as we don't yet smooth) --- R/loo_compare.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 06730baa..6af99682 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -41,8 +41,14 @@ #' standard approach of comparing differences of deviances to a Chi-squared #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. -#' Sivula et al. (2022) discuss the conditions when the normal -#' approximation used for SE and `se_diff` is good. +#' +#' The values in `p_worse` column are computed using the normal +#' approximation and values from the columns `elpd_diff` and +#' `se_diff`. Sivula et al. (2025) discuss the conditions when the +#' normal approximation used for SE and `se_diff` is good., and +#' column `diag_pnorm` contains possible diagnostic messages: 1) +#' small data (N < 100), 2) similar predictions (|elpd_diff| < 4), +#' or 3) possible outliers (khat > 0.5). #' #' If more than \eqn{11} models are compared, we internally recompute the model #' differences using the median model by ELPD as the baseline model. We then @@ -134,7 +140,7 @@ loo_compare.default <- function(x, ...) 
{ # Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) - diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > ", .fr(ps_khat_threshold(N), 2)) + diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > 0.5") } rownames(comp) <- rnms comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, From b0ebac7540d86ec4003311bc1b4cd585e9e65266 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:38:40 +0300 Subject: [PATCH 06/40] oops, fix khat comparison --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 6af99682..8f781a92 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -140,7 +140,7 @@ loo_compare.default <- function(x, ...) { # Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) - diag_pnorm[khat_diff > ps_khat_threshold(N)] <- paste0("khat_diff > 0.5") + diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } rownames(comp) <- rnms comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, From 7410b81bcec0f1752be78ec2e3bf493dd2020609 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Fri, 17 Oct 2025 20:44:16 +0300 Subject: [PATCH 07/40] skip pareto-k check for low number of unique values --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 8f781a92..5be1adee 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -139,7 +139,7 @@ loo_compare.default <- function(x, ...) { # possible outliers in differences (Sivula et al., 2025; # Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) - khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) posterior::pareto_khat(x, tail="both")) + khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) ifelse(length(unique(x))<=20, NA, posterior::pareto_khat(x, tail="both"))) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } rownames(comp) <- rnms From b993b44dd9e759828e554c69fcaf0e203f06d9aa Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 13:27:45 -0600 Subject: [PATCH 08/40] remove `simplify` argument (need to check reverse dependencies) --- R/loo_compare.R | 56 +++++++++++++++++++++++----------------------- man/loo_compare.Rd | 18 ++++++++++----- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 5be1adee..658bb9f7 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -41,11 +41,11 @@ #' standard approach of comparing differences of deviances to a Chi-squared #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. -#' -#' The values in `p_worse` column are computed using the normal +#' +#' The values in the `p_worse` column are computed using the normal #' approximation and values from the columns `elpd_diff` and #' `se_diff`. Sivula et al. (2025) discuss the conditions when the -#' normal approximation used for SE and `se_diff` is good., and +#' normal approximation used for SE and `se_diff` is good, and the #' column `diag_pnorm` contains possible diagnostic messages: 1) #' small data (N < 100), 2) similar predictions (|elpd_diff| < 4), #' or 3) possible outliers (khat > 0.5). 
@@ -58,7 +58,7 @@ #' selection process. In that case users are recommended to avoid model #' selection based on LOO-CV, and instead to favor model averaging/stacking or #' projection predictive inference. -#' +#' #' @seealso #' * The [FAQ page](https://mc-stan.org/loo/articles/online-only/faq.html) on #' the __loo__ website for answers to frequently asked questions. @@ -122,24 +122,28 @@ loo_compare.default <- function(x, ...) { diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) elpd_diff <- apply(diffs, 2, sum) se_diff <- apply(diffs, 2, se_elpd_diff) + # compute probabilities that a model has worse elpd than the best model # using a normal approximation (Sivula et al., 2025) p_worse <- stats::pnorm(0, elpd_diff, se_diff) - p_worse[elpd_diff==0] <- NA - N <- nrow(diffs) + p_worse[elpd_diff == 0] <- NA + # diagnostics to assess whether the normal approximation can be trusted - if (N<100) { + N <- nrow(diffs) + if (N < 100) { # small N (Sivula et al., 2025) diag_pnorm <- rep("N < 100", length(elpd_diff)) - diag_pnorm[elpd_diff==0] = "" + diag_pnorm[elpd_diff == 0] <- "" } else { diag_pnorm <- rep("", length(elpd_diff)) # similar predictions (Sivula et al., 2025) - diag_pnorm[elpd_diff>-4 & elpd_diff!=0] <- "similar predictions" - # possible outliers in differences (Sivula et al., 2025; - # Vehtari et al., 2024) + diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "similar predictions" + # possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) - khat_diff[elpd_diff!=0] <- apply(diffs[,elpd_diff!=0, drop = FALSE], 2, \(x) ifelse(length(unique(x))<=20, NA, posterior::pareto_khat(x, tail="both"))) + khat_diff[elpd_diff != 0] <- apply( + diffs[, elpd_diff != 0, drop = FALSE], 2, + \(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") + )) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } rownames(comp) <- rnms @@ -159,29 +163,25 @@ loo_compare.default <- function(x, ...) { #' @export #' @param digits For the print method only, the number of digits to use when #' printing. -#' @param simplify For the print method only, should only the essential columns -#' of the summary matrix be printed? The entire matrix is always returned, but -#' by default only the most important columns are printed. -#' @param pnorm For the print method only, should we include the normal +#' @param p_worse For the print method only, should we include the normal #' approximation based probability of each model having worse performance than -#' the best model? -print.compare.loo <- function(x, ..., digits = 1, simplify = TRUE, p_worse = TRUE) { +#' the best model? The default is `TRUE`. 
+print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { xcopy <- x - if (inherits(xcopy, "old_compare.loo")) { - if (NCOL(xcopy) >= 2 && simplify) { - patts <- "^elpd_|^se_diff|^p_|^waic$|^looic$" - xcopy <- xcopy[, grepl(patts, colnames(xcopy))] - } - } else if (NCOL(xcopy) >= 2 && simplify) { - xcopy <- xcopy[, c("elpd_diff", "se_diff")] + if (NCOL(xcopy) >= 2) { + xcopy <- xcopy[, c("elpd_diff", "se_diff")] } if (p_worse) { - print(cbind(.fr(xcopy, digits), p_worse=.fr(x[,"p_worse"],2), diag_pnorm=x[, "diag_pnorm"]), quote = FALSE) - invisible(x) + print( + cbind(.fr(xcopy, digits), + p_worse = .fr(x[, "p_worse"], 2), + diag_pnorm = x[, "diag_pnorm"]), + quote = FALSE + ) } else { print(cbind(.fr(xcopy, digits)), quote = FALSE) - invisible(x) } + invisible(x) } diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index ee6a8db3..4d380f7e 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -11,7 +11,7 @@ loo_compare(x, ...) \method{loo_compare}{default}(x, ...) -\method{print}{compare.loo}(x, ..., digits = 1, simplify = TRUE) +\method{print}{compare.loo}(x, ..., digits = 1, p_worse = TRUE) \method{print}{compare.loo_ss}(x, ..., digits = 1, simplify = TRUE) } @@ -26,9 +26,9 @@ list.} \item{digits}{For the print method only, the number of digits to use when printing.} -\item{simplify}{For the print method only, should only the essential columns -of the summary matrix be printed? The entire matrix is always returned, but -by default only the most important columns are printed.} +\item{p_worse}{For the print method only, should we include the normal +approximation based probability of each model having worse performance than +the best model? The default is \code{TRUE}.} } \value{ A matrix with class \code{"compare.loo"} that has its own @@ -66,8 +66,14 @@ better sense of uncertainty than what is obtained using the current standard approach of comparing differences of deviances to a Chi-squared distribution, a practice derived for Gaussian linear models or asymptotically, and which only applies to nested models in any case. -Sivula et al. (2022) discuss the conditions when the normal -approximation used for SE and \code{se_diff} is good. + +The values in the \code{p_worse} column are computed using the normal +approximation and values from the columns \code{elpd_diff} and +\code{se_diff}. Sivula et al. (2025) discuss the conditions when the +normal approximation used for SE and \code{se_diff} is good, and the +column \code{diag_pnorm} contains possible diagnostic messages: 1) +small data (N < 100), 2) similar predictions (|elpd_diff| < 4), +or 3) possible outliers (khat > 0.5). If more than \eqn{11} models are compared, we internally recompute the model differences using the median model by ELPD as the baseline model. We then From b118da2834561d34a5dad8ad9cf5a0d287163bf2 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 14:08:31 -0600 Subject: [PATCH 09/40] remove simplify argument from print.compare.loo_ss --- R/loo_compare.R | 7 ------- R/loo_compare.psis_loo_ss_list.R | 9 ++------- man/loo_compare.Rd | 9 +-------- 3 files changed, 3 insertions(+), 22 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 658bb9f7..9b35be5b 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -2,9 +2,6 @@ #' #' @description Compare fitted models based on [ELPD][loo-glossary]. #' -#' By default the print method shows only the most important information. Use -#' `print(..., simplify=FALSE)` to print a more detailed summary. 
-#' #' @export #' @param x An object of class `"loo"` or a list of such objects. If a list is #' used then the list names will be used as the model names in the output. See @@ -74,10 +71,6 @@ #' comp <- loo_compare(loo1, loo2, loo3) #' print(comp, digits = 2) #' -#' # show more details with simplify=FALSE -#' # (will be the same for all models in this artificial example) -#' print(comp, simplify = FALSE, digits = 3) -#' #' # can use a list of objects with custom names #' # will use apple, banana, and cherry, as the names in the output #' loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) diff --git a/R/loo_compare.psis_loo_ss_list.R b/R/loo_compare.psis_loo_ss_list.R index acd0690b..9c0db564 100644 --- a/R/loo_compare.psis_loo_ss_list.R +++ b/R/loo_compare.psis_loo_ss_list.R @@ -173,14 +173,9 @@ loo_compare_checks.psis_loo_ss_list <- function(loos) { #' @rdname loo_compare #' @export -print.compare.loo_ss <- function(x, ..., digits = 1, simplify = TRUE) { +print.compare.loo_ss <- function(x, ..., digits = 1) { xcopy <- x - if (inherits(xcopy, "old_compare.loo")) { - if (NCOL(xcopy) >= 2 && simplify) { - patts <- "^elpd_|^se_diff|^p_|^waic$|^looic$" - xcopy <- xcopy[, grepl(patts, colnames(xcopy))] - } - } else if (NCOL(xcopy) >= 2 && simplify) { + if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff", "subsampling_se_diff")] } print(.fr(xcopy, digits), quote = FALSE) diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index 4d380f7e..6fad19e0 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -13,7 +13,7 @@ loo_compare(x, ...) \method{print}{compare.loo}(x, ..., digits = 1, p_worse = TRUE) -\method{print}{compare.loo_ss}(x, ..., digits = 1, simplify = TRUE) +\method{print}{compare.loo_ss}(x, ..., digits = 1) } \arguments{ \item{x}{An object of class \code{"loo"} or a list of such objects. If a list is @@ -36,9 +36,6 @@ print method. See the \strong{Details} section. } \description{ Compare fitted models based on \link[=loo-glossary]{ELPD}. - -By default the print method shows only the most important information. Use -\code{print(..., simplify=FALSE)} to print a more detailed summary. } \details{ When comparing two fitted models, we can estimate the difference in their @@ -94,10 +91,6 @@ loo3 <- loo(LL + 2) # should be best model when compared comp <- loo_compare(loo1, loo2, loo3) print(comp, digits = 2) -# show more details with simplify=FALSE -# (will be the same for all models in this artificial example) -print(comp, simplify = FALSE, digits = 3) - # can use a list of objects with custom names # will use apple, banana, and cherry, as the names in the output loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) From 25f93fdb0535d6548f4da87750466a53afe0d305 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 14:22:54 -0600 Subject: [PATCH 10/40] start fixing tests --- R/loo_compare.R | 6 ++--- man/loo_compare.Rd | 2 +- tests/testthat/_snaps/compare.md | 44 +++++++++++++++++--------------- tests/testthat/test_compare.R | 5 +++- 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 9b35be5b..66dd6515 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -9,7 +9,7 @@ #' @param ... Additional objects of class `"loo"`, if not passed in as a single #' list. #' -#' @return A matrix with class `"compare.loo"` that has its own +#' @return A data frame with class `"compare.loo"` that has its own #' print method. See the **Details** section. 
#' #' @details @@ -164,7 +164,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - if (p_worse) { + if (p_worse && !inherits(x, "old_compare.loo")) { print( cbind(.fr(xcopy, digits), p_worse = .fr(x[, "p_worse"], 2), @@ -172,7 +172,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { quote = FALSE ) } else { - print(cbind(.fr(xcopy, digits)), quote = FALSE) + print(.fr(xcopy, digits), quote = FALSE) } invisible(x) } diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index 6fad19e0..e1888539 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -31,7 +31,7 @@ approximation based probability of each model having worse performance than the best model? The default is \code{TRUE}.} } \value{ -A matrix with class \code{"compare.loo"} that has its own +A data frame with class \code{"compare.loo"} that has its own print method. See the \strong{Details} section. } \description{ diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index cf27900e..5458d967 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -1,28 +1,32 @@ # loo_compare returns expected results (2 models) - WAoAAAACAAQFAAACAwAAAAMOAAAAEAAAAAAAAAAAwBA6U1+cRe4AAAAAAAAAAD+2ake0LxMB - wFTh8N3JQljAVeWWE8MGuUARCD2zEXBfQBEalRIN2T9ACijAYdW5U0AmZ5XrANCKP/H9Zexy - 814/8ZtgnG1nx0Bk4fDdyUJYQGXllhPDBrlAIQg9sxFwX0AhGpUSDdk/AAAEAgAAAAEABAAJ - AAAAA2RpbQAAAA0AAAACAAAAAgAAAAgAAAQCAAAAAQAEAAkAAAAIZGltbmFtZXMAAAATAAAA - AgAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1vZGVsMgAAABAAAAAIAAQACQAAAAll - bHBkX2RpZmYABAAJAAAAB3NlX2RpZmYABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2VfZWxw - ZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkA - AAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAADAAQACQAAAAtjb21wYXJl - LmxvbwAEAAkAAAAGbWF0cml4AAQACQAAAAVhcnJheQAAAP4= + WAoAAAACAAQEAgACAwAAAAMTAAAACgAAAA4AAAACAAAAAAAAAADAEDpTX5xF7gAAAA4AAAAC + AAAAAAAAAAA/tmpHtC8TAQAAAA4AAAACf/AAAAAAB6I/8AAAAAAAAAAAABAAAAACAAQACQAA + AAAABAAJAAAAB04gPCAxMDAAAAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAAAkARCD2z + EXBfQBEalRIN2T8AAAAOAAAAAkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/Gb + YJxtZ8cAAAAOAAAAAkBk4fDdyUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8A + AAQCAAAAAQAEAAkAAAAFbmFtZXMAAAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdz + ZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dh + aWMABAAJAAAADHNlX2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMA + BAAJAAAABHdhaWMABAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAA + AgAEAAkAAAALY29tcGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJ + cm93Lm5hbWVzAAAAEAAAAAIABAAJAAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAAA/g== # loo_compare returns expected result (3 models) - WAoAAAACAAQFAAACAwAAAAMOAAAAGAAAAAAAAAAAwBA6U1+cRe7AMA3KkbYEGAAAAAAAAAAA - P7ZqR7QvEwE/y6/t4TTtXsBU4fDdyUJYwFXllhPDBrnAWOVjgjbDYkARCD2zEXBfQBEalRIN - 2T9AEPIF3GigE0AKKMBh1blTQCZnlesA0IpAQcjYUhrdCj/x/WXscvNeP/GbYJxtZ8c/8YDQ - kmfJX0Bk4fDdyUJYQGXllhPDBrlAaOVjgjbDYkAhCD2zEXBfQCEalRIN2T9AIPIF3GigEwAA - BAIAAAABAAQACQAAAANkaW0AAAANAAAAAgAAAAMAAAAIAAAEAgAAAAEABAAJAAAACGRpbW5h - bWVzAAAAEwAAAAIAAAAQAAAAAwAEAAkAAAAGbW9kZWwxAAQACQAAAAZtb2RlbDIABAAJAAAA - Bm1vZGVsMwAAABAAAAAIAAQACQAAAAllbHBkX2RpZmYABAAJAAAAB3NlX2RpZmYABAAJAAAA - CWVscGRfd2FpYwAEAAkAAAAMc2VfZWxwZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNl - X3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkAAAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFz - cwAAABAAAAADAAQACQAAAAtjb21wYXJlLmxvbwAEAAkAAAAGbWF0cml4AAQACQAAAAVhcnJh - eQAAAP4= + 
WAoAAAACAAQEAgACAwAAAAMTAAAACgAAAA4AAAADAAAAAAAAAADAEDpTX5xF7sAwDcqRtgQY + AAAADgAAAAMAAAAAAAAAAD+2ake0LxMBP8uv7eE07V4AAAAOAAAAA3/wAAAAAAeiP/AAAAAA + AAA/8AAAAAAAAAAAABAAAAADAAQACQAAAAAABAAJAAAAB04gPCAxMDAABAAJAAAAB04gPCAx + MDAAAAAOAAAAA8BU4fDdyUJYwFXllhPDBrnAWOVjgjbDYgAAAA4AAAADQBEIPbMRcF9AERqV + Eg3ZP0AQ8gXcaKATAAAADgAAAANACijAYdW5U0AmZ5XrANCKQEHI2FIa3QoAAAAOAAAAAz/x + /WXscvNeP/GbYJxtZ8c/8YDQkmfJXwAAAA4AAAADQGTh8N3JQlhAZeWWE8MGuUBo5WOCNsNi + AAAADgAAAANAIQg9sxFwX0AhGpUSDdk/QCDyBdxooBMAAAQCAAAAAQAEAAkAAAAFbmFtZXMA + AAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAAAAdwX3dvcnNl + AAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNlX2VscGRfd2Fp + YwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMABAAJAAAAB3Nl + X3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29tcGFyZS5sb28A + BAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAAEAAAAAMABAAJ + AAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAQACQAAAAZtb2RlbDMAAAD+ # compare returns expected result (2 models) diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 256c263c..220d87c8 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -62,6 +62,8 @@ test_that("loo_compare throws appropriate warnings", { comp_colnames <- c( "elpd_diff", "se_diff", + "p_worse", + "diag_pnorm", "elpd_waic", "se_elpd_waic", "p_waic", @@ -73,6 +75,7 @@ comp_colnames <- c( test_that("loo_compare returns expected results (2 models)", { comp1 <- loo_compare(w1, w1) expect_s3_class(comp1, "compare.loo") + expect_s3_class(comp1, "data.frame") expect_equal(colnames(comp1), comp_colnames) expect_equal(rownames(comp1), c("model1", "model2")) expect_output(print(comp1), "elpd_diff") @@ -98,7 +101,7 @@ test_that("loo_compare returns expected result (3 models)", { expect_equal(rownames(comp1), c("model1", "model2", "model3")) expect_equal(comp1[1, 1], 0) expect_s3_class(comp1, "compare.loo") - expect_s3_class(comp1, "matrix") + expect_s3_class(comp1, "data.frame") expect_snapshot_value(comp1, style = "serialize") From 62027d1944fe2e040f7871cc92ddea64abd64c20 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 15:34:03 -0600 Subject: [PATCH 11/40] fix issues in preliminary reverse dependency checks --- R/loo_compare.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 66dd6515..7f8d394d 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -113,6 +113,7 @@ loo_compare.default <- function(x, ...) { # compute elpd_diff and se_elpd_diff relative to best model rnms <- rownames(comp) diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) + colnames(diffs) <- rnms elpd_diff <- apply(diffs, 2, sum) se_diff <- apply(diffs, 2, se_elpd_diff) @@ -135,7 +136,7 @@ loo_compare.default <- function(x, ...) 
{ khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, - \(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") + function (x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } From cc3edfe797f63ae35fb231477ce0aefc398b4bb6 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 15:34:57 -0600 Subject: [PATCH 12/40] Update loo_compare.R --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 7f8d394d..79d506fb 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -136,7 +136,7 @@ loo_compare.default <- function(x, ...) { khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, - function (x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") + function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } From 22f442f1d9b735054edaba9d5176d3b57b923d06 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 17:02:49 -0600 Subject: [PATCH 13/40] make sure p_worse is available --- R/loo_compare.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 79d506fb..aabd92f2 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -140,7 +140,6 @@ loo_compare.default <- function(x, ...) { )) diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") } - rownames(comp) <- rnms comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, diag_pnorm = diag_pnorm), as.data.frame(comp)) @@ -165,7 +164,9 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - if (p_worse && !inherits(x, "old_compare.loo")) { + if (p_worse && + "p_worse" %in% colnames(xcopy) && + !inherits(x, "old_compare.loo")) { print( cbind(.fr(xcopy, digits), p_worse = .fr(x[, "p_worse"], 2), From ca843f494f4b7ed7efc1b1c7455aab49ac70ad7f Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 17:16:25 -0600 Subject: [PATCH 14/40] use x instead of xcopy --- R/loo_compare.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index aabd92f2..2871a564 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -164,8 +164,9 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } + browser() if (p_worse && - "p_worse" %in% colnames(xcopy) && + "p_worse" %in% colnames(x) && !inherits(x, "old_compare.loo")) { print( cbind(.fr(xcopy, digits), From d579c061970303a916c2de30137140aeab794f01 Mon Sep 17 00:00:00 2001 From: jgabry Date: Fri, 17 Oct 2025 17:18:30 -0600 Subject: [PATCH 15/40] Update loo_compare.R --- R/loo_compare.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 2871a564..e9e45ea4 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -164,7 +164,6 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - browser() if (p_worse && "p_worse" %in% colnames(x) && !inherits(x, "old_compare.loo")) { From 846a891e7ea3f6172eece53c6821bf6d5d26fed5 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Sat, 
18 Oct 2025 11:44:50 +0300 Subject: [PATCH 16/40] unify diagnostic messages --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index e9e45ea4..41cdb843 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -131,7 +131,7 @@ loo_compare.default <- function(x, ...) { } else { diag_pnorm <- rep("", length(elpd_diff)) # similar predictions (Sivula et al., 2025) - diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "similar predictions" + diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" # possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( From 64b365c9230e7759aea05c416c19bf8dcf60eeee Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Sat, 18 Oct 2025 17:27:55 +0300 Subject: [PATCH 17/40] improved loo_compare documentation --- R/loo_compare.R | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 41cdb843..c6128e05 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -39,13 +39,18 @@ #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. #' -#' The values in the `p_worse` column are computed using the normal -#' approximation and values from the columns `elpd_diff` and -#' `se_diff`. Sivula et al. (2025) discuss the conditions when the -#' normal approximation used for SE and `se_diff` is good, and the -#' column `diag_pnorm` contains possible diagnostic messages: 1) -#' small data (N < 100), 2) similar predictions (|elpd_diff| < 4), -#' or 3) possible outliers (khat > 0.5). +#' The values in the `p_worse` column show the probabilities for models +#' having worse ELPD than the best model. These probabilities are +#' computed using the normal approximation and values from the +#' columns `elpd_diff` and `se_diff`. Sivula et al. (2025) present +#' the conditions when the normal approximation used for SE and +#' `se_diff` is good, and the column `diag_pnorm` contains possible +#' diagnostic messages: 1) small data (N < 100), 2) similar +#' predictions (|elpd_diff| < 4), or 3) possible outliers (khat > 0.5). +#' If any of these diagnostic messages is shown, the normal +#' approximation is not well calibrated and the shown probabilities +#' can be too large (small data or similar predictions) or too small +#' (outliers). #' #' If more than \eqn{11} models are compared, we internally recompute the model #' differences using the median model by ELPD as the baseline model. We then From 185e5703564b3e62201433ababfc8c559cf7218c Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 12:22:26 -0600 Subject: [PATCH 18/40] add subsections to loo_compare doc and put diagnostic messages in bulleted list --- R/loo_compare.R | 44 ++++++++++++++++++++++++-------------------- man/loo_compare.Rd | 45 ++++++++++++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 35 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index c6128e05..5a0b0462 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -18,14 +18,14 @@ #' [`elpd_loo`][loo-glossary] or `elpd_waic` (or multiplied by \eqn{-2}, if #' desired, to be on the deviance scale). #' -#' When using `loo_compare()`, the returned matrix will have one row per model -#' and several columns of estimates. 
The values in the -#' [`elpd_diff`][loo-glossary] and [`se_diff`][loo-glossary] columns of the -#' returned matrix are computed by making pairwise comparisons between each -#' model and the model with the largest ELPD (the model in the first row). For -#' this reason the `elpd_diff` column will always have the value `0` in the -#' first row (i.e., the difference between the preferred model and itself) and -#' negative values in subsequent rows for the remaining models. +#' ## `elpd_diff` and `se_diff` +#' When using `loo_compare()`, the returned data frame will have one row per +#' model and several columns of estimates. The values of +#' [`elpd_diff`][loo-glossary] and [`se_diff`][loo-glossary] are computed by +#' making pairwise comparisons between each model and the model with the +#' largest ELPD (the model listed first). Therefore, the first `elpd_diff` +#' value will always be `0` (i.e., the difference between the preferred model +#' and itself) and the rest of the values will be negative. #' #' To compute the standard error of the difference in [ELPD][loo-glossary] --- #' which should not be expected to equal the difference of the standard errors @@ -39,19 +39,23 @@ #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. #' -#' The values in the `p_worse` column show the probabilities for models -#' having worse ELPD than the best model. These probabilities are -#' computed using the normal approximation and values from the -#' columns `elpd_diff` and `se_diff`. Sivula et al. (2025) present -#' the conditions when the normal approximation used for SE and -#' `se_diff` is good, and the column `diag_pnorm` contains possible -#' diagnostic messages: 1) small data (N < 100), 2) similar -#' predictions (|elpd_diff| < 4), or 3) possible outliers (khat > 0.5). -#' If any of these diagnostic messages is shown, the normal -#' approximation is not well calibrated and the shown probabilities -#' can be too large (small data or similar predictions) or too small -#' (outliers). +#' ## `p_worse` and `diag_pnorm` +#' The values in the `p_worse` column show the probability of each model +#' having worse ELPD than the best model. These probabilities are computed +#' with a normal approximation using the values from `elpd_diff` and +#' `se_diff`. Sivula et al. (2025) present the conditions when the normal +#' approximation used for SE and `se_diff` is good, and the column +#' `diag_pnorm` contains possible diagnostic messages: #' +#' * small data (`N < 100`), +#' * similar predictions (`|elpd_diff| < 4`) +#' * possible outliers (`khat > 0.5`) +#' +#' If any of these diagnostic messages is shown, the normal approximation is +#' not well calibrated and the probabilities can be too large (small data or +#' similar predictions) or too small (outliers). +#' +#' ## Warnings for many model comparisons #' If more than \eqn{11} models are compared, we internally recompute the model #' differences using the median model by ELPD as the baseline model. 
We then #' estimate whether the differences in predictive performance are potentially diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index 46d1a428..ea17aafb 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -42,15 +42,15 @@ When comparing two fitted models, we can estimate the difference in their expected predictive accuracy by the difference in \code{\link[=loo-glossary]{elpd_loo}} or \code{elpd_waic} (or multiplied by \eqn{-2}, if desired, to be on the deviance scale). +\subsection{\code{elpd_diff} and \code{se_diff}}{ -When using \code{loo_compare()}, the returned matrix will have one row per model -and several columns of estimates. The values in the -\code{\link[=loo-glossary]{elpd_diff}} and \code{\link[=loo-glossary]{se_diff}} columns of the -returned matrix are computed by making pairwise comparisons between each -model and the model with the largest ELPD (the model in the first row). For -this reason the \code{elpd_diff} column will always have the value \code{0} in the -first row (i.e., the difference between the preferred model and itself) and -negative values in subsequent rows for the remaining models. +When using \code{loo_compare()}, the returned data frame will have one row per +model and several columns of estimates. The values of +\code{\link[=loo-glossary]{elpd_diff}} and \code{\link[=loo-glossary]{se_diff}} are computed by +making pairwise comparisons between each model and the model with the +largest ELPD (the model listed first). Therefore, the first \code{elpd_diff} +value will always be \code{0} (i.e., the difference between the preferred model +and itself) and the rest of the values will be negative. To compute the standard error of the difference in \link[=loo-glossary]{ELPD} --- which should not be expected to equal the difference of the standard errors @@ -63,14 +63,28 @@ better sense of uncertainty than what is obtained using the current standard approach of comparing differences of deviances to a Chi-squared distribution, a practice derived for Gaussian linear models or asymptotically, and which only applies to nested models in any case. +} + +\subsection{\code{p_worse} and \code{diag_pnorm}}{ + +The values in the \code{p_worse} column show the probability of each model +having worse ELPD than the best model. These probabilities are computed +with a normal approximation using the values from \code{elpd_diff} and +\code{se_diff}. Sivula et al. (2025) present the conditions when the normal +approximation used for SE and \code{se_diff} is good, and the column +\code{diag_pnorm} contains possible diagnostic messages: +\itemize{ +\item small data (\code{N < 100}), +\item similar predictions (\verb{|elpd_diff| < 4}) +\item possible outliers (\code{khat > 0.5}) +} + +If any of these diagnostic messages is shown, the normal approximation is +not well calibrated and the probabilities can be too large (small data or +similar predictions) or too small (outliers). +} -The values in the \code{p_worse} column are computed using the normal -approximation and values from the columns \code{elpd_diff} and -\code{se_diff}. Sivula et al. (2025) discuss the conditions when the -normal approximation used for SE and \code{se_diff} is good, and the -column \code{diag_pnorm} contains possible diagnostic messages: 1) -small data (N < 100), 2) similar predictions (|elpd_diff| < 4), -or 3) possible outliers (khat > 0.5). 
+\subsection{Warnings for many model comparisons}{ If more than \eqn{11} models are compared, we internally recompute the model differences using the median model by ELPD as the baseline model. We then @@ -81,6 +95,7 @@ selection process. In that case users are recommended to avoid model selection based on LOO-CV, and instead to favor model averaging/stacking or projection predictive inference. } +} \examples{ # very artificial example, just for demonstration! LL <- example_loglik_array() From 8625e10e58eda5d741ae7947ab6bc48ea71f7ffa Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 13:02:56 -0600 Subject: [PATCH 19/40] minor cleanup --- R/loo_compare.R | 38 ++++++++++++++++++++++---------------- man/loo_compare.Rd | 8 ++++---- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 5a0b0462..bb2af3ac 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -47,9 +47,9 @@ #' approximation used for SE and `se_diff` is good, and the column #' `diag_pnorm` contains possible diagnostic messages: #' -#' * small data (`N < 100`), -#' * similar predictions (`|elpd_diff| < 4`) -#' * possible outliers (`khat > 0.5`) +#' * `N < 100` (small data) +#' * `|elpd_diff| < 4` (models make similar predictions) +#' * `khat > 0.5` (possible outliers) #' #' If any of these diagnostic messages is shown, the normal approximation is #' not well calibrated and the probabilities can be too large (small data or @@ -81,7 +81,7 @@ #' print(comp, digits = 2) #' #' # can use a list of objects with custom names -#' # will use apple, banana, and cherry, as the names in the output +#' # the names will be used in the output #' loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) #' #' \dontrun{ @@ -109,17 +109,17 @@ loo_compare.default <- function(x, ...) { loos <- x } - # If subsampling is used + # if subsampling is used if (any(sapply(loos, inherits, "psis_loo_ss"))) { return(loo_compare.psis_loo_ss_list(loos)) } + # run pre-comparison checks loo_compare_checks(loos) + # compute elpd_diff and se_elpd_diff relative to best model comp <- loo_compare_matrix(loos) ord <- loo_compare_order(loos) - - # compute elpd_diff and se_elpd_diff relative to best model rnms <- rownames(comp) diffs <- mapply(FUN = elpd_diffs, loos[ord[1]], loos[ord]) colnames(diffs) <- rnms @@ -132,33 +132,39 @@ loo_compare.default <- function(x, ...) 
{ p_worse[elpd_diff == 0] <- NA # diagnostics to assess whether the normal approximation can be trusted + # * N < 100: small data (Sivula et al., 2025) + # * |elpd_diff| < 4: similar predictions (Sivula et al., 2025) + # * khat_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) N <- nrow(diffs) if (N < 100) { - # small N (Sivula et al., 2025) diag_pnorm <- rep("N < 100", length(elpd_diff)) diag_pnorm[elpd_diff == 0] <- "" } else { diag_pnorm <- rep("", length(elpd_diff)) - # similar predictions (Sivula et al., 2025) diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" - # possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) - diag_pnorm[khat_diff > 0.5] <- paste0("khat_diff > 0.5") + diag_pnorm[khat_diff > 0.5] <- "khat_diff > 0.5" } - comp <- cbind(data.frame(elpd_diff = elpd_diff, se_diff = se_diff, - p_worse = p_worse, diag_pnorm = diag_pnorm), - as.data.frame(comp)) + comp <- cbind( + data.frame( + elpd_diff = elpd_diff, + se_diff = se_diff, + p_worse = p_worse, + diag_pnorm = diag_pnorm + ), + as.data.frame(comp) + ) rownames(comp) <- rnms - # run order statistics-based checks on models + # run order statistics-based checks for many model comparisons loo_order_stat_check(loos, ord) class(comp) <- c("compare.loo", class(comp)) - return(comp) + comp } #' @rdname loo_compare diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index ea17aafb..01ab1c31 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -74,9 +74,9 @@ with a normal approximation using the values from \code{elpd_diff} and approximation used for SE and \code{se_diff} is good, and the column \code{diag_pnorm} contains possible diagnostic messages: \itemize{ -\item small data (\code{N < 100}), -\item similar predictions (\verb{|elpd_diff| < 4}) -\item possible outliers (\code{khat > 0.5}) +\item \code{N < 100} (small data) +\item \verb{|elpd_diff| < 4} (models make similar predictions) +\item \code{khat > 0.5} (possible outliers) } If any of these diagnostic messages is shown, the normal approximation is @@ -107,7 +107,7 @@ comp <- loo_compare(loo1, loo2, loo3) print(comp, digits = 2) # can use a list of objects with custom names -# will use apple, banana, and cherry, as the names in the output +# the names will be used in the output loo_compare(list("apple" = loo1, "banana" = loo2, "cherry" = loo3)) \dontrun{ From 64a19c380e2f95e96df4cf905846376724a4fff8 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 13:29:19 -0600 Subject: [PATCH 20/40] Add `model` column to `loo_compare()` output --- R/loo_compare.R | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index bb2af3ac..3b9c20ae 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -149,8 +149,10 @@ loo_compare.default <- function(x, ...) { )) diag_pnorm[khat_diff > 0.5] <- "khat_diff > 0.5" } + comp <- cbind( data.frame( + model = rnms, elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, @@ -158,7 +160,7 @@ loo_compare.default <- function(x, ...) { ), as.data.frame(comp) ) - rownames(comp) <- rnms + rownames(comp) <- NULL # run order statistics-based checks for many model comparisons loo_order_stat_check(loos, ord) @@ -175,22 +177,27 @@ loo_compare.default <- function(x, ...) 
{ #' approximation based probability of each model having worse performance than #' the best model? The default is `TRUE`. print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { + if (inherits(x, "old_compare.loo")) { + stop( + "Output from old loo::compare() detected. ", + "Please rerun loo_compare() to get updated output ", + "or run unclass() on the object to print it.", + call. = FALSE + ) + } xcopy <- x if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] } - if (p_worse && - "p_worse" %in% colnames(x) && - !inherits(x, "old_compare.loo")) { - print( - cbind(.fr(xcopy, digits), - p_worse = .fr(x[, "p_worse"], 2), - diag_pnorm = x[, "diag_pnorm"]), - quote = FALSE + xcopy <- cbind(model = x$model, .fr(xcopy, digits)) + if (p_worse && "p_worse" %in% colnames(x)) { + xcopy <- cbind( + xcopy, + p_worse = .fr(x[, "p_worse"], 2), + diag_pnorm = x[, "diag_pnorm"] ) - } else { - print(.fr(xcopy, digits), quote = FALSE) } + print(xcopy, quote = FALSE) invisible(x) } From a84154ec87fb93fcb6a8a9cfd4c3ecc2acb67887 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 13:29:48 -0600 Subject: [PATCH 21/40] remove old loo::compare() --- NAMESPACE | 1 - R/compare.R | 138 ---------------------------------- man/compare.Rd | 80 -------------------- tests/testthat/test_compare.R | 80 -------------------- 4 files changed, 299 deletions(-) delete mode 100644 R/compare.R delete mode 100644 man/compare.Rd diff --git a/NAMESPACE b/NAMESPACE index 4d1a9118..a77e925c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -99,7 +99,6 @@ export(.compute_point_estimate) export(.ndraws) export(.thin_draws) export(E_loo) -export(compare) export(crps) export(elpd) export(example_loglik_array) diff --git a/R/compare.R b/R/compare.R deleted file mode 100644 index a6081d2f..00000000 --- a/R/compare.R +++ /dev/null @@ -1,138 +0,0 @@ -#' Model comparison (deprecated, old version) -#' -#' **This function is deprecated**. Please use the new [loo_compare()] function -#' instead. -#' -#' @export -#' @param ... At least two objects returned by [loo()] (or [waic()]). -#' @param x A list of at least two objects returned by [loo()] (or -#' [waic()]). This argument can be used as an alternative to -#' specifying the objects in `...`. -#' -#' @return A vector or matrix with class `'compare.loo'` that has its own -#' print method. If exactly two objects are provided in `...` or -#' `x`, then the difference in expected predictive accuracy and the -#' standard error of the difference are returned. If more than two objects are -#' provided then a matrix of summary information is returned (see **Details**). -#' -#' @details -#' When comparing two fitted models, we can estimate the difference in their -#' expected predictive accuracy by the difference in `elpd_loo` or -#' `elpd_waic` (or multiplied by -2, if desired, to be on the -#' deviance scale). -#' -#' *When that difference, `elpd_diff`, is positive then the expected -#' predictive accuracy for the second model is higher. A negative -#' `elpd_diff` favors the first model.* -#' -#' When using `compare()` with more than two models, the values in the -#' `elpd_diff` and `se_diff` columns of the returned matrix are -#' computed by making pairwise comparisons between each model and the model -#' with the best ELPD (i.e., the model in the first row). -#' Although the `elpd_diff` column is equal to the difference in -#' `elpd_loo`, do not expect the `se_diff` column to be equal to the -#' the difference in `se_elpd_loo`. 
-#' -#' To compute the standard error of the difference in ELPD we use a -#' paired estimate to take advantage of the fact that the same set of _N_ -#' data points was used to fit both models. These calculations should be most -#' useful when _N_ is large, because then non-normality of the -#' distribution is not such an issue when estimating the uncertainty in these -#' sums. These standard errors, for all their flaws, should give a better -#' sense of uncertainty than what is obtained using the current standard -#' approach of comparing differences of deviances to a Chi-squared -#' distribution, a practice derived for Gaussian linear models or -#' asymptotically, and which only applies to nested models in any case. -#' -#' @template loo-and-psis-references -#' -#' @examples -#' \dontrun{ -#' loo1 <- loo(log_lik1) -#' loo2 <- loo(log_lik2) -#' print(compare(loo1, loo2), digits = 3) -#' print(compare(x = list(loo1, loo2))) -#' -#' waic1 <- waic(log_lik1) -#' waic2 <- waic(log_lik2) -#' compare(waic1, waic2) -#' } -#' -compare <- function(..., x = list()) { - .Deprecated("loo_compare") - dots <- list(...) - if (length(dots)) { - if (length(x)) { - stop("If 'x' is specified then '...' should not be specified.", - call. = FALSE) - } - nms <- as.character(match.call(expand.dots = TRUE))[-1L] - } else { - if (!is.list(x) || !length(x)) { - stop("'x' must be a list.", call. = FALSE) - } - dots <- x - nms <- names(dots) - if (!length(nms)) { - nms <- paste0("model", seq_along(dots)) - } - } - - if (!all(sapply(dots, is.loo))) { - stop("All inputs should have class 'loo'.") - } - if (length(dots) <= 1L) { - stop("'compare' requires at least two models.") - } else if (length(dots) == 2L) { - loo1 <- dots[[1]] - loo2 <- dots[[2]] - comp <- compare_two_models(loo1, loo2) - class(comp) <- c(class(comp), "old_compare.loo") - return(comp) - } else { - Ns <- sapply(dots, function(x) nrow(x$pointwise)) - if (!all(Ns == Ns[1L])) { - stop("Not all models have the same number of data points.", call. = FALSE) - } - - x <- sapply(dots, function(x) { - est <- x$estimates - setNames(c(est), nm = c(rownames(est), paste0("se_", rownames(est))) ) - }) - colnames(x) <- nms - rnms <- rownames(x) - comp <- x - ord <- order(x[grep("^elpd", rnms), ], decreasing = TRUE) - comp <- t(comp)[ord, ] - patts <- c("elpd", "p_", "^waic$|^looic$", "^se_waic$|^se_looic$") - col_ord <- unlist(sapply(patts, function(p) grep(p, colnames(comp))), - use.names = FALSE) - comp <- comp[, col_ord] - - # compute elpd_diff and se_elpd_diff relative to best model - rnms <- rownames(comp) - diffs <- mapply(elpd_diffs, dots[ord[1]], dots[ord]) - elpd_diff <- apply(diffs, 2, sum) - se_diff <- apply(diffs, 2, se_elpd_diff) - comp <- cbind(elpd_diff = elpd_diff, se_diff = se_diff, comp) - rownames(comp) <- rnms - class(comp) <- c("compare.loo", class(comp), "old_compare.loo") - comp - } -} - - - -# internal ---------------------------------------------------------------- -compare_two_models <- function(loo_a, loo_b, return = c("elpd_diff", "se"), check_dims = TRUE) { - if (check_dims) { - if (dim(loo_a$pointwise)[1] != dim(loo_b$pointwise)[1]) { - stop(paste("Models don't have the same number of data points.", - "\nFound N_1 =", dim(loo_a$pointwise)[1], "and N_2 =", dim(loo_b$pointwise)[1]), call. 
= FALSE) - } - } - - diffs <- elpd_diffs(loo_a, loo_b) - comp <- c(elpd_diff = sum(diffs), se = se_elpd_diff(diffs)) - structure(comp, class = "compare.loo") -} diff --git a/man/compare.Rd b/man/compare.Rd deleted file mode 100644 index 11412fe4..00000000 --- a/man/compare.Rd +++ /dev/null @@ -1,80 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compare.R -\name{compare} -\alias{compare} -\title{Model comparison (deprecated, old version)} -\usage{ -compare(..., x = list()) -} -\arguments{ -\item{...}{At least two objects returned by \code{\link[=loo]{loo()}} (or \code{\link[=waic]{waic()}}).} - -\item{x}{A list of at least two objects returned by \code{\link[=loo]{loo()}} (or -\code{\link[=waic]{waic()}}). This argument can be used as an alternative to -specifying the objects in \code{...}.} -} -\value{ -A vector or matrix with class \code{'compare.loo'} that has its own -print method. If exactly two objects are provided in \code{...} or -\code{x}, then the difference in expected predictive accuracy and the -standard error of the difference are returned. If more than two objects are -provided then a matrix of summary information is returned (see \strong{Details}). -} -\description{ -\strong{This function is deprecated}. Please use the new \code{\link[=loo_compare]{loo_compare()}} function -instead. -} -\details{ -When comparing two fitted models, we can estimate the difference in their -expected predictive accuracy by the difference in \code{elpd_loo} or -\code{elpd_waic} (or multiplied by -2, if desired, to be on the -deviance scale). - -\emph{When that difference, \code{elpd_diff}, is positive then the expected -predictive accuracy for the second model is higher. A negative -\code{elpd_diff} favors the first model.} - -When using \code{compare()} with more than two models, the values in the -\code{elpd_diff} and \code{se_diff} columns of the returned matrix are -computed by making pairwise comparisons between each model and the model -with the best ELPD (i.e., the model in the first row). -Although the \code{elpd_diff} column is equal to the difference in -\code{elpd_loo}, do not expect the \code{se_diff} column to be equal to the -the difference in \code{se_elpd_loo}. - -To compute the standard error of the difference in ELPD we use a -paired estimate to take advantage of the fact that the same set of \emph{N} -data points was used to fit both models. These calculations should be most -useful when \emph{N} is large, because then non-normality of the -distribution is not such an issue when estimating the uncertainty in these -sums. These standard errors, for all their flaws, should give a better -sense of uncertainty than what is obtained using the current standard -approach of comparing differences of deviances to a Chi-squared -distribution, a practice derived for Gaussian linear models or -asymptotically, and which only applies to nested models in any case. -} -\examples{ -\dontrun{ -loo1 <- loo(log_lik1) -loo2 <- loo(log_lik2) -print(compare(loo1, loo2), digits = 3) -print(compare(x = list(loo1, loo2))) - -waic1 <- waic(log_lik1) -waic2 <- waic(log_lik2) -compare(waic1, waic2) -} - -} -\references{ -Vehtari, A., Gelman, A., and Gabry, J. (2017). Practical Bayesian model -evaluation using leave-one-out cross-validation and WAIC. -\emph{Statistics and Computing}. 27(5), 1413--1432. 
doi:10.1007/s11222-016-9696-4 -(\href{https://link.springer.com/article/10.1007/s11222-016-9696-4}{journal version}, -\href{https://arxiv.org/abs/1507.04544}{preprint arXiv:1507.04544}). - -Vehtari, A., Simpson, D., Gelman, A., Yao, Y., and Gabry, J. (2024). -Pareto smoothed importance sampling. \emph{Journal of Machine Learning Research}, -25(72):1-58. -\href{https://jmlr.org/papers/v25/19-556.html}{PDF} -} diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 220d87c8..172b73bb 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -109,83 +109,3 @@ test_that("loo_compare returns expected result (3 models)", { # except rownames) to using 'x' argument expect_equal(comp1, loo_compare(x = list(w1, w2, w3)), ignore_attr = TRUE) }) - -# Tests for deprecated compare() ------------------------------------------ - -test_that("compare throws deprecation warnings", { - expect_warning(loo::compare(w1, w2), "Deprecated") - expect_warning(loo::compare(w1, w1, w2), "Deprecated") -}) - -test_that("compare returns expected result (2 models)", { - expect_warning(comp1 <- loo::compare(w1, w1), "Deprecated") - expect_snapshot(comp1) - expect_equal(comp1[1:2], c(elpd_diff = 0, se = 0)) - - expect_warning(comp2 <- loo::compare(w1, w2), "Deprecated") - expect_snapshot(comp2) - expect_named(comp2, c("elpd_diff", "se")) - expect_s3_class(comp2, "compare.loo") - - # specifying objects via ... and via arg x gives equal results - expect_warning(comp_via_list <- loo::compare(x = list(w1, w2)), "Deprecated") - expect_equal(comp2, comp_via_list) -}) - -test_that("compare returns expected result (3 models)", { - w3 <- suppressWarnings(waic(LLarr3)) - expect_warning(comp1 <- loo::compare(w1, w2, w3), "Deprecated") - - expect_equal( - colnames(comp1), - c( - "elpd_diff", - "se_diff", - "elpd_waic", - "se_elpd_waic", - "p_waic", - "se_p_waic", - "waic", - "se_waic" - ) - ) - expect_equal(rownames(comp1), c("w1", "w2", "w3")) - expect_equal(comp1[1, 1], 0) - expect_s3_class(comp1, "compare.loo") - expect_s3_class(comp1, "matrix") - expect_snapshot_value(comp1, style = "serialize") - - # specifying objects via '...' gives equivalent results (equal - # except rownames) to using 'x' argument - expect_warning( - comp_via_list <- loo::compare(x = list(w1, w2, w3)), - "Deprecated" - ) - expect_equal(comp1, comp_via_list, ignore_attr = TRUE) -}) - -test_that("compare throws appropriate errors", { - expect_error( - suppressWarnings(loo::compare(w1, w2, x = list(w1, w2))), - "should not be specified" - ) - expect_error(suppressWarnings(loo::compare(x = 2)), "must be a list") - expect_error( - suppressWarnings(loo::compare(x = list(2))), - "should have class 'loo'" - ) - expect_error( - suppressWarnings(loo::compare(x = list(w1))), - "requires at least two models" - ) - - w3 <- suppressWarnings(waic(LLarr2[,, -1])) - expect_error( - suppressWarnings(loo::compare(x = list(w1, w3))), - "same number of data points" - ) - expect_error( - suppressWarnings(loo::compare(x = list(w1, w2, w3))), - "same number of data points" - ) -}) From 16f67d47d63be1640d37c286d845dad433532992 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 13:49:54 -0600 Subject: [PATCH 22/40] improve backwards compatibility --- R/loo_compare.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/loo_compare.R b/R/loo_compare.R index bb2af3ac..6820c406 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -175,6 +175,9 @@ loo_compare.default <- function(x, ...) 
{ #' approximation based probability of each model having worse performance than #' the best model? The default is `TRUE`. print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { + if (!inherits(x, "data.frame")) { + class(x) <- c(class(x), "data.frame") + } xcopy <- x if (NCOL(xcopy) >= 2) { xcopy <- xcopy[, c("elpd_diff", "se_diff")] From 4e16d2fcfacf02644442351bea36f8284023bc10 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 14:14:35 -0600 Subject: [PATCH 23/40] Update loo_compare.R --- R/loo_compare.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/loo_compare.R b/R/loo_compare.R index 790becfa..3c3c5348 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -184,6 +184,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { "or run unclass() on the object to print it.", call. = FALSE ) + } if (!inherits(x, "data.frame")) { class(x) <- c(class(x), "data.frame") } From 23a79c0de892c35f5c6a4f7c84dee9c81798bb1d Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 14:18:13 -0600 Subject: [PATCH 24/40] fix failing test --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 6820c406..d56687ad 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -175,7 +175,7 @@ loo_compare.default <- function(x, ...) { #' approximation based probability of each model having worse performance than #' the best model? The default is `TRUE`. print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { - if (!inherits(x, "data.frame")) { + if (!inherits(x, "data.frame") && !inherits(x, "old_compare.loo")) { class(x) <- c(class(x), "data.frame") } xcopy <- x From cff3c2cf5ebaf4a531037c332274cf8086968a83 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 17:06:57 -0600 Subject: [PATCH 25/40] Revert "remove old loo::compare()" This reverts commit a84154ec87fb93fcb6a8a9cfd4c3ecc2acb67887. --- NAMESPACE | 1 + R/compare.R | 138 ++++++++++++++++++++++++++++++++++ man/compare.Rd | 80 ++++++++++++++++++++ tests/testthat/test_compare.R | 80 ++++++++++++++++++++ 4 files changed, 299 insertions(+) create mode 100644 R/compare.R create mode 100644 man/compare.Rd diff --git a/NAMESPACE b/NAMESPACE index a77e925c..4d1a9118 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -99,6 +99,7 @@ export(.compute_point_estimate) export(.ndraws) export(.thin_draws) export(E_loo) +export(compare) export(crps) export(elpd) export(example_loglik_array) diff --git a/R/compare.R b/R/compare.R new file mode 100644 index 00000000..a6081d2f --- /dev/null +++ b/R/compare.R @@ -0,0 +1,138 @@ +#' Model comparison (deprecated, old version) +#' +#' **This function is deprecated**. Please use the new [loo_compare()] function +#' instead. +#' +#' @export +#' @param ... At least two objects returned by [loo()] (or [waic()]). +#' @param x A list of at least two objects returned by [loo()] (or +#' [waic()]). This argument can be used as an alternative to +#' specifying the objects in `...`. +#' +#' @return A vector or matrix with class `'compare.loo'` that has its own +#' print method. If exactly two objects are provided in `...` or +#' `x`, then the difference in expected predictive accuracy and the +#' standard error of the difference are returned. If more than two objects are +#' provided then a matrix of summary information is returned (see **Details**). 
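As a rough, self-contained sketch of the two return shapes described above (not part of this patch), the deprecated interface and its replacement can be run side by side on simulated log-likelihood matrices; the objects ll1, ll2, w1, and w2 below are made up purely for illustration.

library(loo)
set.seed(123)
S <- 1000; N <- 50
ll1 <- matrix(rnorm(S * N, mean = -1.0), S, N)  # draws x observations log-likelihood, model 1
ll2 <- matrix(rnorm(S * N, mean = -1.1), S, N)  # model 2, slightly worse on average
w1 <- suppressWarnings(waic(ll1))
w2 <- suppressWarnings(waic(ll2))
suppressWarnings(compare(w1, w2))  # deprecated: named vector c(elpd_diff = ..., se = ...)
loo_compare(w1, w2)                # replacement: one row per model, best model first
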
+#' +#' @details +#' When comparing two fitted models, we can estimate the difference in their +#' expected predictive accuracy by the difference in `elpd_loo` or +#' `elpd_waic` (or multiplied by -2, if desired, to be on the +#' deviance scale). +#' +#' *When that difference, `elpd_diff`, is positive then the expected +#' predictive accuracy for the second model is higher. A negative +#' `elpd_diff` favors the first model.* +#' +#' When using `compare()` with more than two models, the values in the +#' `elpd_diff` and `se_diff` columns of the returned matrix are +#' computed by making pairwise comparisons between each model and the model +#' with the best ELPD (i.e., the model in the first row). +#' Although the `elpd_diff` column is equal to the difference in +#' `elpd_loo`, do not expect the `se_diff` column to be equal to the +#' the difference in `se_elpd_loo`. +#' +#' To compute the standard error of the difference in ELPD we use a +#' paired estimate to take advantage of the fact that the same set of _N_ +#' data points was used to fit both models. These calculations should be most +#' useful when _N_ is large, because then non-normality of the +#' distribution is not such an issue when estimating the uncertainty in these +#' sums. These standard errors, for all their flaws, should give a better +#' sense of uncertainty than what is obtained using the current standard +#' approach of comparing differences of deviances to a Chi-squared +#' distribution, a practice derived for Gaussian linear models or +#' asymptotically, and which only applies to nested models in any case. +#' +#' @template loo-and-psis-references +#' +#' @examples +#' \dontrun{ +#' loo1 <- loo(log_lik1) +#' loo2 <- loo(log_lik2) +#' print(compare(loo1, loo2), digits = 3) +#' print(compare(x = list(loo1, loo2))) +#' +#' waic1 <- waic(log_lik1) +#' waic2 <- waic(log_lik2) +#' compare(waic1, waic2) +#' } +#' +compare <- function(..., x = list()) { + .Deprecated("loo_compare") + dots <- list(...) + if (length(dots)) { + if (length(x)) { + stop("If 'x' is specified then '...' should not be specified.", + call. = FALSE) + } + nms <- as.character(match.call(expand.dots = TRUE))[-1L] + } else { + if (!is.list(x) || !length(x)) { + stop("'x' must be a list.", call. = FALSE) + } + dots <- x + nms <- names(dots) + if (!length(nms)) { + nms <- paste0("model", seq_along(dots)) + } + } + + if (!all(sapply(dots, is.loo))) { + stop("All inputs should have class 'loo'.") + } + if (length(dots) <= 1L) { + stop("'compare' requires at least two models.") + } else if (length(dots) == 2L) { + loo1 <- dots[[1]] + loo2 <- dots[[2]] + comp <- compare_two_models(loo1, loo2) + class(comp) <- c(class(comp), "old_compare.loo") + return(comp) + } else { + Ns <- sapply(dots, function(x) nrow(x$pointwise)) + if (!all(Ns == Ns[1L])) { + stop("Not all models have the same number of data points.", call. 
= FALSE) + } + + x <- sapply(dots, function(x) { + est <- x$estimates + setNames(c(est), nm = c(rownames(est), paste0("se_", rownames(est))) ) + }) + colnames(x) <- nms + rnms <- rownames(x) + comp <- x + ord <- order(x[grep("^elpd", rnms), ], decreasing = TRUE) + comp <- t(comp)[ord, ] + patts <- c("elpd", "p_", "^waic$|^looic$", "^se_waic$|^se_looic$") + col_ord <- unlist(sapply(patts, function(p) grep(p, colnames(comp))), + use.names = FALSE) + comp <- comp[, col_ord] + + # compute elpd_diff and se_elpd_diff relative to best model + rnms <- rownames(comp) + diffs <- mapply(elpd_diffs, dots[ord[1]], dots[ord]) + elpd_diff <- apply(diffs, 2, sum) + se_diff <- apply(diffs, 2, se_elpd_diff) + comp <- cbind(elpd_diff = elpd_diff, se_diff = se_diff, comp) + rownames(comp) <- rnms + class(comp) <- c("compare.loo", class(comp), "old_compare.loo") + comp + } +} + + + +# internal ---------------------------------------------------------------- +compare_two_models <- function(loo_a, loo_b, return = c("elpd_diff", "se"), check_dims = TRUE) { + if (check_dims) { + if (dim(loo_a$pointwise)[1] != dim(loo_b$pointwise)[1]) { + stop(paste("Models don't have the same number of data points.", + "\nFound N_1 =", dim(loo_a$pointwise)[1], "and N_2 =", dim(loo_b$pointwise)[1]), call. = FALSE) + } + } + + diffs <- elpd_diffs(loo_a, loo_b) + comp <- c(elpd_diff = sum(diffs), se = se_elpd_diff(diffs)) + structure(comp, class = "compare.loo") +} diff --git a/man/compare.Rd b/man/compare.Rd new file mode 100644 index 00000000..11412fe4 --- /dev/null +++ b/man/compare.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare.R +\name{compare} +\alias{compare} +\title{Model comparison (deprecated, old version)} +\usage{ +compare(..., x = list()) +} +\arguments{ +\item{...}{At least two objects returned by \code{\link[=loo]{loo()}} (or \code{\link[=waic]{waic()}}).} + +\item{x}{A list of at least two objects returned by \code{\link[=loo]{loo()}} (or +\code{\link[=waic]{waic()}}). This argument can be used as an alternative to +specifying the objects in \code{...}.} +} +\value{ +A vector or matrix with class \code{'compare.loo'} that has its own +print method. If exactly two objects are provided in \code{...} or +\code{x}, then the difference in expected predictive accuracy and the +standard error of the difference are returned. If more than two objects are +provided then a matrix of summary information is returned (see \strong{Details}). +} +\description{ +\strong{This function is deprecated}. Please use the new \code{\link[=loo_compare]{loo_compare()}} function +instead. +} +\details{ +When comparing two fitted models, we can estimate the difference in their +expected predictive accuracy by the difference in \code{elpd_loo} or +\code{elpd_waic} (or multiplied by -2, if desired, to be on the +deviance scale). + +\emph{When that difference, \code{elpd_diff}, is positive then the expected +predictive accuracy for the second model is higher. A negative +\code{elpd_diff} favors the first model.} + +When using \code{compare()} with more than two models, the values in the +\code{elpd_diff} and \code{se_diff} columns of the returned matrix are +computed by making pairwise comparisons between each model and the model +with the best ELPD (i.e., the model in the first row). +Although the \code{elpd_diff} column is equal to the difference in +\code{elpd_loo}, do not expect the \code{se_diff} column to be equal to the +the difference in \code{se_elpd_loo}. 
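A minimal sketch (toy vectors only, not package internals beyond the formula used by se_elpd_diff()) shows why se_diff is not a simple combination of the per-model standard errors: because the same N data points enter both models, the pointwise contributions are strongly correlated and the paired standard error is much smaller than the value obtained by treating the two ELPD estimates as independent. The vectors lpd1 and lpd2 below are made up for illustration.

set.seed(1)
N <- 500
lpd1 <- rnorm(N, mean = -1.0, sd = 1.0)         # pointwise log predictive densities, model 1
lpd2 <- lpd1 + rnorm(N, mean = -0.1, sd = 0.2)  # model 2, slightly worse but strongly correlated

diffs <- lpd2 - lpd1                             # pointwise elpd differences
c(elpd_diff = sum(diffs),
  se_diff   = sqrt(N) * sd(diffs),               # paired estimate (same formula as se_elpd_diff())
  se_indep  = sqrt(N * (var(lpd1) + var(lpd2)))) # what ignoring the pairing would give
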
+ +To compute the standard error of the difference in ELPD we use a +paired estimate to take advantage of the fact that the same set of \emph{N} +data points was used to fit both models. These calculations should be most +useful when \emph{N} is large, because then non-normality of the +distribution is not such an issue when estimating the uncertainty in these +sums. These standard errors, for all their flaws, should give a better +sense of uncertainty than what is obtained using the current standard +approach of comparing differences of deviances to a Chi-squared +distribution, a practice derived for Gaussian linear models or +asymptotically, and which only applies to nested models in any case. +} +\examples{ +\dontrun{ +loo1 <- loo(log_lik1) +loo2 <- loo(log_lik2) +print(compare(loo1, loo2), digits = 3) +print(compare(x = list(loo1, loo2))) + +waic1 <- waic(log_lik1) +waic2 <- waic(log_lik2) +compare(waic1, waic2) +} + +} +\references{ +Vehtari, A., Gelman, A., and Gabry, J. (2017). Practical Bayesian model +evaluation using leave-one-out cross-validation and WAIC. +\emph{Statistics and Computing}. 27(5), 1413--1432. doi:10.1007/s11222-016-9696-4 +(\href{https://link.springer.com/article/10.1007/s11222-016-9696-4}{journal version}, +\href{https://arxiv.org/abs/1507.04544}{preprint arXiv:1507.04544}). + +Vehtari, A., Simpson, D., Gelman, A., Yao, Y., and Gabry, J. (2024). +Pareto smoothed importance sampling. \emph{Journal of Machine Learning Research}, +25(72):1-58. +\href{https://jmlr.org/papers/v25/19-556.html}{PDF} +} diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 172b73bb..220d87c8 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -109,3 +109,83 @@ test_that("loo_compare returns expected result (3 models)", { # except rownames) to using 'x' argument expect_equal(comp1, loo_compare(x = list(w1, w2, w3)), ignore_attr = TRUE) }) + +# Tests for deprecated compare() ------------------------------------------ + +test_that("compare throws deprecation warnings", { + expect_warning(loo::compare(w1, w2), "Deprecated") + expect_warning(loo::compare(w1, w1, w2), "Deprecated") +}) + +test_that("compare returns expected result (2 models)", { + expect_warning(comp1 <- loo::compare(w1, w1), "Deprecated") + expect_snapshot(comp1) + expect_equal(comp1[1:2], c(elpd_diff = 0, se = 0)) + + expect_warning(comp2 <- loo::compare(w1, w2), "Deprecated") + expect_snapshot(comp2) + expect_named(comp2, c("elpd_diff", "se")) + expect_s3_class(comp2, "compare.loo") + + # specifying objects via ... and via arg x gives equal results + expect_warning(comp_via_list <- loo::compare(x = list(w1, w2)), "Deprecated") + expect_equal(comp2, comp_via_list) +}) + +test_that("compare returns expected result (3 models)", { + w3 <- suppressWarnings(waic(LLarr3)) + expect_warning(comp1 <- loo::compare(w1, w2, w3), "Deprecated") + + expect_equal( + colnames(comp1), + c( + "elpd_diff", + "se_diff", + "elpd_waic", + "se_elpd_waic", + "p_waic", + "se_p_waic", + "waic", + "se_waic" + ) + ) + expect_equal(rownames(comp1), c("w1", "w2", "w3")) + expect_equal(comp1[1, 1], 0) + expect_s3_class(comp1, "compare.loo") + expect_s3_class(comp1, "matrix") + expect_snapshot_value(comp1, style = "serialize") + + # specifying objects via '...' 
gives equivalent results (equal + # except rownames) to using 'x' argument + expect_warning( + comp_via_list <- loo::compare(x = list(w1, w2, w3)), + "Deprecated" + ) + expect_equal(comp1, comp_via_list, ignore_attr = TRUE) +}) + +test_that("compare throws appropriate errors", { + expect_error( + suppressWarnings(loo::compare(w1, w2, x = list(w1, w2))), + "should not be specified" + ) + expect_error(suppressWarnings(loo::compare(x = 2)), "must be a list") + expect_error( + suppressWarnings(loo::compare(x = list(2))), + "should have class 'loo'" + ) + expect_error( + suppressWarnings(loo::compare(x = list(w1))), + "requires at least two models" + ) + + w3 <- suppressWarnings(waic(LLarr2[,, -1])) + expect_error( + suppressWarnings(loo::compare(x = list(w1, w3))), + "same number of data points" + ) + expect_error( + suppressWarnings(loo::compare(x = list(w1, w2, w3))), + "same number of data points" + ) +}) From 8f521a56c7f448cc3f2908848f76bbb46fde3306 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 17:44:51 -0600 Subject: [PATCH 26/40] update tests --- R/loo_compare.R | 7 +-- tests/testthat/_snaps/compare.md | 95 +++++++++++++++++++++----------- tests/testthat/test_compare.R | 29 ++++++---- 3 files changed, 80 insertions(+), 51 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 3c3c5348..2a739045 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -178,12 +178,7 @@ loo_compare.default <- function(x, ...) { #' the best model? The default is `TRUE`. print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (inherits(x, "old_compare.loo")) { - stop( - "Output from old loo::compare() detected. ", - "Please rerun loo_compare() to get updated output ", - "or run unclass() on the object to print it.", - call. = FALSE - ) + return(unclass(x)) } if (!inherits(x, "data.frame")) { class(x) <- c(class(x), "data.frame") diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index 5458d967..82548aca 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -1,48 +1,77 @@ # loo_compare returns expected results (2 models) - WAoAAAACAAQEAgACAwAAAAMTAAAACgAAAA4AAAACAAAAAAAAAADAEDpTX5xF7gAAAA4AAAAC - AAAAAAAAAAA/tmpHtC8TAQAAAA4AAAACf/AAAAAAB6I/8AAAAAAAAAAAABAAAAACAAQACQAA - AAAABAAJAAAAB04gPCAxMDAAAAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAAAkARCD2z - EXBfQBEalRIN2T8AAAAOAAAAAkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/Gb - YJxtZ8cAAAAOAAAAAkBk4fDdyUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8A - AAQCAAAAAQAEAAkAAAAFbmFtZXMAAAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdz - ZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dh - aWMABAAJAAAADHNlX2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMA - BAAJAAAABHdhaWMABAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAA - AgAEAAkAAAALY29tcGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJ - cm93Lm5hbWVzAAAAEAAAAAIABAAJAAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAAA/g== + WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + ZGVsMgAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAA + AA4AAAACf/AAAAAAB6J/8AAAAAAHogAAABAAAAACAAQACQAAAAAABAAJAAAAAAAAAA4AAAAC + wFTh8N3JQljAVOHw3clCWAAAAA4AAAACQBEIPbMRcF9AEQg9sxFwXwAAAA4AAAACQAoowGHV + uVNACijAYdW5UwAAAA4AAAACP/H9Zexy814/8f1l7HLzXgAAAA4AAAACQGTh8N3JQlhAZOHw + 3clCWAAAAA4AAAACQCEIPbMRcF9AIQg9sxFwXwAABAIAAAABAAQACQAAAAVuYW1lcwAAABAA + AAALAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAA + 
AAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNl + X2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMA + BAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29t + cGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAA + DQAAAAKAAAAA/////gAAAP4= -# loo_compare returns expected result (3 models) +--- + + Code + print(comp1) + Output + model elpd_diff se_diff p_worse diag_pnorm + 1 model1 0.0 0.0 NA + 2 model2 0.0 0.0 NA + +--- + + WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + ZGVsMgAAAA4AAAACAAAAAAAAAADAEDpTX5xF7gAAAA4AAAACAAAAAAAAAAA/tmpHtC8TAQAA + AA4AAAACf/AAAAAAB6I/8AAAAAAAAAAAABAAAAACAAQACQAAAAAABAAJAAAAB04gPCAxMDAA + AAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAAAkARCD2zEXBfQBEalRIN2T8AAAAOAAAA + AkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/GbYJxtZ8cAAAAOAAAAAkBk4fDd + yUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8AAAQCAAAAAQAEAAkAAAAFbmFt + ZXMAAAAQAAAACwAEAAkAAAAFbW9kZWwABAAJAAAACWVscGRfZGlmZgAEAAkAAAAHc2VfZGlm + ZgAEAAkAAAAHcF93b3JzZQAEAAkAAAAKZGlhZ19wbm9ybQAEAAkAAAAJZWxwZF93YWljAAQA + CQAAAAxzZV9lbHBkX3dhaWMABAAJAAAABnBfd2FpYwAEAAkAAAAJc2VfcF93YWljAAQACQAA + AAR3YWljAAQACQAAAAdzZV93YWljAAAEAgAAAAEABAAJAAAABWNsYXNzAAAAEAAAAAIABAAJ + AAAAC2NvbXBhcmUubG9vAAQACQAAAApkYXRhLmZyYW1lAAAEAgAAAAEABAAJAAAACXJvdy5u + YW1lcwAAAA0AAAACgAAAAP////4AAAD+ - WAoAAAACAAQEAgACAwAAAAMTAAAACgAAAA4AAAADAAAAAAAAAADAEDpTX5xF7sAwDcqRtgQY - AAAADgAAAAMAAAAAAAAAAD+2ake0LxMBP8uv7eE07V4AAAAOAAAAA3/wAAAAAAeiP/AAAAAA - AAA/8AAAAAAAAAAAABAAAAADAAQACQAAAAAABAAJAAAAB04gPCAxMDAABAAJAAAAB04gPCAx - MDAAAAAOAAAAA8BU4fDdyUJYwFXllhPDBrnAWOVjgjbDYgAAAA4AAAADQBEIPbMRcF9AERqV - Eg3ZP0AQ8gXcaKATAAAADgAAAANACijAYdW5U0AmZ5XrANCKQEHI2FIa3QoAAAAOAAAAAz/x - /WXscvNeP/GbYJxtZ8c/8YDQkmfJXwAAAA4AAAADQGTh8N3JQlhAZeWWE8MGuUBo5WOCNsNi - AAAADgAAAANAIQg9sxFwX0AhGpUSDdk/QCDyBdxooBMAAAQCAAAAAQAEAAkAAAAFbmFtZXMA - AAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAAAAdwX3dvcnNl - AAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNlX2VscGRfd2Fp - YwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMABAAJAAAAB3Nl - X3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29tcGFyZS5sb28A - BAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAAEAAAAAMABAAJ - AAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAQACQAAAAZtb2RlbDMAAAD+ - -# compare returns expected result (2 models) +--- Code - comp1 + print(comp2) Output - elpd_diff se - 0.0 0.0 + model elpd_diff se_diff p_worse diag_pnorm + 1 model1 0.0 0.0 NA + 2 model2 -4.1 0.1 1.00 N < 100 + +# loo_compare returns expected result (3 models) + + WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAADAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + ZGVsMgAEAAkAAAAGbW9kZWwzAAAADgAAAAMAAAAAAAAAAMAQOlNfnEXuwDANypG2BBgAAAAO + AAAAAwAAAAAAAAAAP7ZqR7QvEwE/y6/t4TTtXgAAAA4AAAADf/AAAAAAB6I/8AAAAAAAAD/w + AAAAAAAAAAAAEAAAAAMABAAJAAAAAAAEAAkAAAAHTiA8IDEwMAAEAAkAAAAHTiA8IDEwMAAA + AA4AAAADwFTh8N3JQljAVeWWE8MGucBY5WOCNsNiAAAADgAAAANAEQg9sxFwX0ARGpUSDdk/ + QBDyBdxooBMAAAAOAAAAA0AKKMBh1blTQCZnlesA0IpAQcjYUhrdCgAAAA4AAAADP/H9Zexy + 814/8ZtgnG1nxz/xgNCSZ8lfAAAADgAAAANAZOHw3clCWEBl5ZYTwwa5QGjlY4I2w2IAAAAO + AAAAA0AhCD2zEXBfQCEalRIN2T9AIPIF3GigEwAABAIAAAABAAQACQAAAAVuYW1lcwAAABAA + AAALAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAA + AAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNl + X2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMA + BAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29t + 
cGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAA + DQAAAAKAAAAA/////QAAAP4= --- Code - comp2 + print(comp1) Output - elpd_diff se - -4.1 0.1 + model elpd_diff se_diff p_worse diag_pnorm + 1 model1 0.0 0.0 NA + 2 model2 -4.1 0.1 1.00 N < 100 + 3 model3 -16.1 0.2 1.00 N < 100 # compare returns expected result (3 models) diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 220d87c8..c1bf352a 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -60,6 +60,7 @@ test_that("loo_compare throws appropriate warnings", { comp_colnames <- c( + "model", "elpd_diff", "se_diff", "p_worse", @@ -77,19 +78,25 @@ test_that("loo_compare returns expected results (2 models)", { expect_s3_class(comp1, "compare.loo") expect_s3_class(comp1, "data.frame") expect_equal(colnames(comp1), comp_colnames) - expect_equal(rownames(comp1), c("model1", "model2")) - expect_output(print(comp1), "elpd_diff") - expect_equal(comp1[1:2, 1], c(0, 0), ignore_attr = TRUE) - expect_equal(comp1[1:2, 2], c(0, 0), ignore_attr = TRUE) + expect_equal(comp1$model, c("model1", "model2")) + expect_equal(comp1$elpd_diff, c(0, 0), ignore_attr = TRUE) + expect_equal(comp1$se_diff, c(0, 0), ignore_attr = TRUE) + expect_equal(comp1$p_worse, c(NA_real_, NA_real_), ignore_attr = TRUE) + expect_snapshot_value(comp1, style = "serialize") + expect_snapshot(print(comp1)) comp2 <- loo_compare(w1, w2) expect_s3_class(comp2, "compare.loo") expect_equal(colnames(comp2), comp_colnames) - expect_snapshot_value(comp2, style = "serialize") + expect_snapshot(print(comp2)) # specifying objects via ... and via arg x gives equal results expect_equal(comp2, loo_compare(x = list(w1, w2))) + + # custom naming works + comp3 <- loo_compare(x = list("A" = w2, "B" = w1)) + expect_equal(comp3$model, c("B", "A")) }) @@ -98,12 +105,12 @@ test_that("loo_compare returns expected result (3 models)", { comp1 <- loo_compare(w1, w2, w3) expect_equal(colnames(comp1), comp_colnames) - expect_equal(rownames(comp1), c("model1", "model2", "model3")) - expect_equal(comp1[1, 1], 0) + expect_equal(comp1$model, c("model1", "model2", "model3")) + expect_equal(comp1$p_worse, c(NA, 1, 1)) expect_s3_class(comp1, "compare.loo") expect_s3_class(comp1, "data.frame") - expect_snapshot_value(comp1, style = "serialize") + expect_snapshot(print(comp1)) # specifying objects via '...' gives equivalent results (equal # except rownames) to using 'x' argument @@ -119,13 +126,11 @@ test_that("compare throws deprecation warnings", { test_that("compare returns expected result (2 models)", { expect_warning(comp1 <- loo::compare(w1, w1), "Deprecated") - expect_snapshot(comp1) expect_equal(comp1[1:2], c(elpd_diff = 0, se = 0)) expect_warning(comp2 <- loo::compare(w1, w2), "Deprecated") - expect_snapshot(comp2) - expect_named(comp2, c("elpd_diff", "se")) - expect_s3_class(comp2, "compare.loo") + expect_equal(round(comp2[1:2], 3), c(elpd_diff = -4.057, se = 0.088)) + expect_s3_class(comp2, "old_compare.loo") # specifying objects via ... 
and via arg x gives equal results expect_warning(comp_via_list <- loo::compare(x = list(w1, w2)), "Deprecated") From dc9db69efdd6673c13ea12b67da6f75f20dc2542 Mon Sep 17 00:00:00 2001 From: jgabry Date: Sat, 18 Oct 2025 18:07:26 -0600 Subject: [PATCH 27/40] cleanup print method --- R/loo_compare.R | 17 ++++++++--------- tests/testthat/_snaps/compare.md | 9 +++++++++ tests/testthat/test_compare.R | 1 + 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 2a739045..0660aea4 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -183,19 +183,18 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (!inherits(x, "data.frame")) { class(x) <- c(class(x), "data.frame") } - xcopy <- x - if (NCOL(xcopy) >= 2) { - xcopy <- xcopy[, c("elpd_diff", "se_diff")] - } - xcopy <- cbind(model = x$model, .fr(xcopy, digits)) + x2 <- cbind( + model = x$model, + .fr(x[, c("elpd_diff", "se_diff")], digits) + ) if (p_worse && "p_worse" %in% colnames(x)) { - xcopy <- cbind( - xcopy, - p_worse = .fr(x[, "p_worse"], 2), + x2 <- cbind( + x2, + p_worse = .fr(x[, "p_worse"], digits = 2), diag_pnorm = x[, "diag_pnorm"] ) } - print(xcopy, quote = FALSE) + print(x2, quote = FALSE) invisible(x) } diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index 82548aca..c0568451 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -46,6 +46,15 @@ 1 model1 0.0 0.0 NA 2 model2 -4.1 0.1 1.00 N < 100 +--- + + Code + print(comp2, p_worse = FALSE) + Output + model elpd_diff se_diff + 1 model1 0.0 0.0 + 2 model2 -4.1 0.1 + # loo_compare returns expected result (3 models) WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAADAAQACQAAAAZtb2RlbDEABAAJAAAABm1v diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index c1bf352a..6f12c5d5 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -90,6 +90,7 @@ test_that("loo_compare returns expected results (2 models)", { expect_equal(colnames(comp2), comp_colnames) expect_snapshot_value(comp2, style = "serialize") expect_snapshot(print(comp2)) + expect_snapshot(print(comp2, p_worse = FALSE)) # specifying objects via ... 
and via arg x gives equal results expect_equal(comp2, loo_compare(x = list(w1, w2))) From 3ed1c0c56231ca5631813e76ea7a2399f269df7e Mon Sep 17 00:00:00 2001 From: jgabry Date: Sun, 19 Oct 2025 13:24:01 -0600 Subject: [PATCH 28/40] improve backwards compatibility --- R/loo_compare.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/loo_compare.R b/R/loo_compare.R index 0660aea4..8e70df0c 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -183,6 +183,10 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { if (!inherits(x, "data.frame")) { class(x) <- c(class(x), "data.frame") } + if (!all(c("model", "elpd_diff", "se_diff") %in% colnames(x))) { + print(as.data.frame(x)) + return(x) + } x2 <- cbind( model = x$model, .fr(x[, c("elpd_diff", "se_diff")], digits) From 89d39f58e7d34f7281bd110ed442e9fb64872d3e Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Tue, 21 Oct 2025 10:17:01 +0300 Subject: [PATCH 29/40] change diag_pnorm to diag_diff --- R/loo_compare.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index d56687ad..e0f378fa 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -39,13 +39,13 @@ #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. #' -#' ## `p_worse` and `diag_pnorm` +#' ## `p_worse` and `diag_diff` #' The values in the `p_worse` column show the probability of each model #' having worse ELPD than the best model. These probabilities are computed #' with a normal approximation using the values from `elpd_diff` and #' `se_diff`. Sivula et al. (2025) present the conditions when the normal #' approximation used for SE and `se_diff` is good, and the column -#' `diag_pnorm` contains possible diagnostic messages: +#' `diag_diff` contains possible diagnostic messages: #' #' * `N < 100` (small data) #' * `|elpd_diff| < 4` (models make similar predictions) @@ -137,24 +137,24 @@ loo_compare.default <- function(x, ...) 
{ # * khat_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) N <- nrow(diffs) if (N < 100) { - diag_pnorm <- rep("N < 100", length(elpd_diff)) - diag_pnorm[elpd_diff == 0] <- "" + diag_diff <- rep("N < 100", length(elpd_diff)) + diag_diff[elpd_diff == 0] <- "" } else { - diag_pnorm <- rep("", length(elpd_diff)) - diag_pnorm[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" + diag_diff <- rep("", length(elpd_diff)) + diag_diff[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" khat_diff <- rep(NA, length(elpd_diff)) khat_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) - diag_pnorm[khat_diff > 0.5] <- "khat_diff > 0.5" + diag_diff[khat_diff > 0.5] <- "khat_diff > 0.5" } comp <- cbind( data.frame( elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, - diag_pnorm = diag_pnorm + diag_diff = diag_diff ), as.data.frame(comp) ) @@ -188,7 +188,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { print( cbind(.fr(xcopy, digits), p_worse = .fr(x[, "p_worse"], 2), - diag_pnorm = x[, "diag_pnorm"]), + diag_diff = x[, "diag_diff"]), quote = FALSE ) } else { From 6d73537601b190d1416376df661218893f9e102d Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Tue, 21 Oct 2025 10:34:00 +0300 Subject: [PATCH 30/40] change diag_pnorm to diag_diff in tests --- tests/testthat/test_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 220d87c8..d6fe1deb 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -63,7 +63,7 @@ comp_colnames <- c( "elpd_diff", "se_diff", "p_worse", - "diag_pnorm", + "diag_diff", "elpd_waic", "se_elpd_waic", "p_waic", From abcf209e29a1df8633a71e76fa0a64d6afbd59ab Mon Sep 17 00:00:00 2001 From: jgabry Date: Tue, 21 Oct 2025 10:00:02 -0600 Subject: [PATCH 31/40] update test snapshots --- tests/testthat/_snaps/compare.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index 5458d967..61de13ce 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -6,11 +6,11 @@ EXBfQBEalRIN2T8AAAAOAAAAAkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/Gb YJxtZ8cAAAAOAAAAAkBk4fDdyUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8A AAQCAAAAAQAEAAkAAAAFbmFtZXMAAAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdz - ZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dh - aWMABAAJAAAADHNlX2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMA - BAAJAAAABHdhaWMABAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAA - AgAEAAkAAAALY29tcGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJ - cm93Lm5hbWVzAAAAEAAAAAIABAAJAAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAAA/g== + ZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAAlkaWFnX2RpZmYABAAJAAAACWVscGRfd2Fp + YwAEAAkAAAAMc2VfZWxwZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAE + AAkAAAAEd2FpYwAEAAkAAAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAAC + AAQACQAAAAtjb21wYXJlLmxvbwAEAAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAly + b3cubmFtZXMAAAAQAAAAAgAEAAkAAAAGbW9kZWwxAAQACQAAAAZtb2RlbDIAAAD+ # loo_compare returns expected result (3 models) @@ -22,11 +22,11 @@ /WXscvNeP/GbYJxtZ8c/8YDQkmfJXwAAAA4AAAADQGTh8N3JQlhAZeWWE8MGuUBo5WOCNsNi AAAADgAAAANAIQg9sxFwX0AhGpUSDdk/QCDyBdxooBMAAAQCAAAAAQAEAAkAAAAFbmFtZXMA 
AAAQAAAACgAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAAAAdwX3dvcnNl - AAQACQAAAApkaWFnX3Bub3JtAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNlX2VscGRfd2Fp - YwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMABAAJAAAAB3Nl - X3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29tcGFyZS5sb28A - BAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAAEAAAAAMABAAJ - AAAABm1vZGVsMQAEAAkAAAAGbW9kZWwyAAQACQAAAAZtb2RlbDMAAAD+ + AAQACQAAAAlkaWFnX2RpZmYABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2VfZWxwZF93YWlj + AAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkAAAAHc2Vf + d2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAACAAQACQAAAAtjb21wYXJlLmxvbwAE + AAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAlyb3cubmFtZXMAAAAQAAAAAwAEAAkA + AAAGbW9kZWwxAAQACQAAAAZtb2RlbDIABAAJAAAABm1vZGVsMwAAAP4= # compare returns expected result (2 models) From a375c93d27a2dfb07d1c883beeda91557c1fe279 Mon Sep 17 00:00:00 2001 From: jgabry Date: Tue, 21 Oct 2025 10:32:34 -0600 Subject: [PATCH 32/40] remove row numbers when printing --- R/loo_compare.R | 2 +- tests/testthat/_snaps/compare.md | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 12b49f5c..afe1cc4d 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -198,7 +198,7 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { diag_diff = x[, "diag_diff"] ) } - print(x2, quote = FALSE) + print(x2, quote = FALSE, row.names = FALSE) invisible(x) } diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md index 6639fd57..5aee7b29 100644 --- a/tests/testthat/_snaps/compare.md +++ b/tests/testthat/_snaps/compare.md @@ -18,9 +18,9 @@ Code print(comp1) Output - model elpd_diff se_diff p_worse diag_diff - 1 model1 0.0 0.0 NA - 2 model2 0.0 0.0 NA + model elpd_diff se_diff p_worse diag_diff + model1 0.0 0.0 NA + model2 0.0 0.0 NA --- @@ -42,18 +42,18 @@ Code print(comp2) Output - model elpd_diff se_diff p_worse diag_diff - 1 model1 0.0 0.0 NA - 2 model2 -4.1 0.1 1.00 N < 100 + model elpd_diff se_diff p_worse diag_diff + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 --- Code print(comp2, p_worse = FALSE) Output - model elpd_diff se_diff - 1 model1 0.0 0.0 - 2 model2 -4.1 0.1 + model elpd_diff se_diff + model1 0.0 0.0 + model2 -4.1 0.1 # loo_compare returns expected result (3 models) @@ -77,10 +77,10 @@ Code print(comp1) Output - model elpd_diff se_diff p_worse diag_diff - 1 model1 0.0 0.0 NA - 2 model2 -4.1 0.1 1.00 N < 100 - 3 model3 -16.1 0.2 1.00 N < 100 + model elpd_diff se_diff p_worse diag_diff + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 + model3 -16.1 0.2 1.00 N < 100 # compare returns expected result (3 models) From f3fcb289103152aeb27c7f5b7293fbb7f5861dfa Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 11:53:26 +0300 Subject: [PATCH 33/40] add diag_elpd --- R/loo_compare.R | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index afe1cc4d..16372b9b 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -150,13 +150,34 @@ loo_compare.default <- function(x, ...) 
{ diag_diff[khat_diff > 0.5] <- "khat_diff > 0.5" } + # get khats for PSIS + khat_psis <- sapply(loos[ord], + \(loo) { + k <- loo$diagnostics[["pareto_k"]] + if (is.null(k)) { + out = "" + } else { + S <- dim(loo)[1] + khat_threshold <- ps_khat_threshold(S) + K <- sum(k > khat_threshold) + if (K==0) { + out <- "" + } else { + out <- paste0(K, " khat_psis > ", round(khat_threshold, 2)) + } + } + out + } + ) + comp <- cbind( data.frame( model = rnms, elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, - diag_diff = diag_diff + diag_diff = diag_diff, + diag_elpd = khat_psis ), as.data.frame(comp) ) @@ -195,7 +216,8 @@ print.compare.loo <- function(x, ..., digits = 1, p_worse = TRUE) { x2 <- cbind( x2, p_worse = .fr(x[, "p_worse"], digits = 2), - diag_diff = x[, "diag_diff"] + diag_diff = x[, "diag_diff"], + diag_elpd = x[, "diag_elpd"] ) } print(x2, quote = FALSE, row.names = FALSE) From 359853a8221715420c87ef654021d54d6b393777 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 11:53:36 +0300 Subject: [PATCH 34/40] improve loo_compare doc --- R/loo_compare.R | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 16372b9b..d970d2a0 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -39,7 +39,7 @@ #' distribution, a practice derived for Gaussian linear models or #' asymptotically, and which only applies to nested models in any case. #' -#' ## `p_worse` and `diag_diff` +#' ## `p_worse`, `diag_diff`, and `diag_elpd` #' The values in the `p_worse` column show the probability of each model #' having worse ELPD than the best model. These probabilities are computed #' with a normal approximation using the values from `elpd_diff` and @@ -51,9 +51,24 @@ #' * `|elpd_diff| < 4` (models make similar predictions) #' * `khat > 0.5` (possible outliers) #' -#' If any of these diagnostic messages is shown, the normal approximation is -#' not well calibrated and the probabilities can be too large (small data or -#' similar predictions) or too small (outliers). +#' If any of these diagnostic messages is shown, the error distribution is +#' skewed or thick tailed and the normal approximation based on `elpd_diff` +#' and `se_diff` is not well calibrated. The probabilities `p_worse` are +#' likely to be too large (small data or similar predictions) or too small +#' (outliers). `elpd_diff` and `se_diff` are still indicative of the +#' differences and uncertainties, and for example, if `|elpd_diff|` is +#' many times larger than `se_diff` the difference is quite certain. +#' While `khat > 0.5` indicates possibility of outliers, it is also +#' possible that both models compared seem to be well specified based +#' on model checking, but the pointwise ELPD differences have such thick tails +#' that the normal approximation for the sum is not good. +#' +#' The column `diag_elpd` shows diagnostic for the pointwise ELPD +#' computations for each model. If `k khat_psis > 0.7` is shown, +#' where `k` is the number of high high Pareto k values in Pareto +#' smoothed importance sampling computation, then there may be +#' significant bias in `elpd_diff` favoring models with a large +#' number of high Pareto k values. 
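A minimal sketch of the quantities documented above, assuming only a numeric vector diffs of pointwise ELPD differences between a candidate model and the best model; the helper name p_worse_and_diag is made up for illustration, while stats::pnorm(), posterior::pareto_khat(), and the cutoffs N < 100, |elpd_diff| < 4, and 0.5 mirror the logic added in this patch series.

library(posterior)

p_worse_and_diag <- function(diffs) {
  N <- length(diffs)
  elpd_diff <- sum(diffs)
  se_diff <- sqrt(N) * sd(diffs)
  # normal approximation to Pr(candidate model has worse elpd than the best model)
  p_worse <- stats::pnorm(0, mean = elpd_diff, sd = se_diff)
  # conditions under which that approximation may be poorly calibrated
  msgs <- c(
    if (N < 100) "N < 100",
    if (abs(elpd_diff) < 4) "|elpd_diff| < 4",
    if (isTRUE(posterior::pareto_khat(diffs, tail = "both") > 0.5)) "khat_diff > 0.5"
  )
  list(p_worse = p_worse, diagnostics = msgs)
}

# toy example with a small, noisy difference
set.seed(2)
p_worse_and_diag(rnorm(300, mean = -0.02, sd = 0.5))
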
#' #' ## Warnings for many model comparisons #' If more than \eqn{11} models are compared, we internally recompute the model From b7db7ddacf2daedce4d9bcc54ecaab95eba14a16 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 15:57:29 +0300 Subject: [PATCH 35/40] clarifiy loo_compare diag_diff khat --- R/loo_compare.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index d970d2a0..490d8853 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -60,8 +60,10 @@ #' many times larger than `se_diff` the difference is quite certain. #' While `khat > 0.5` indicates possibility of outliers, it is also #' possible that both models compared seem to be well specified based -#' on model checking, but the pointwise ELPD differences have such thick tails -#' that the normal approximation for the sum is not good. +#' on model checking, but the pointwise ELPD differences have such thick +#' tails that the normal approximation for the sum is not good (note that +#' `khat`s in `diag_diff` column are different from the `khat`s in PSIS-LOO +#' diagnostic). #' #' The column `diag_elpd` shows diagnostic for the pointwise ELPD #' computations for each model. If `k khat_psis > 0.7` is shown, From 77e753f63462a459c0759d1f7d6eb198ad08886b Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 16:18:07 +0300 Subject: [PATCH 36/40] yet another small doc improvement --- R/loo_compare.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 490d8853..6a69445a 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -61,9 +61,9 @@ #' While `khat > 0.5` indicates possibility of outliers, it is also #' possible that both models compared seem to be well specified based #' on model checking, but the pointwise ELPD differences have such thick -#' tails that the normal approximation for the sum is not good (note that -#' `khat`s in `diag_diff` column are different from the `khat`s in PSIS-LOO -#' diagnostic). +#' tails that the normal approximation for the sum is not good (Vehtari et +#' al., 2024). Note that `khat`s in `diag_diff` column are different from +#' the `khat`s in PSIS-LOO diagnostic. #' #' The column `diag_elpd` shows diagnostic for the pointwise ELPD #' computations for each model. If `k khat_psis > 0.7` is shown, From c59414e33df1f54b9ab71e64e066bce500e4f047 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 18:52:44 +0300 Subject: [PATCH 37/40] Use function() Co-authored-by: Jonah Gabry --- R/loo_compare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 6a69445a..f9240341 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -169,7 +169,7 @@ loo_compare.default <- function(x, ...) { # get khats for PSIS khat_psis <- sapply(loos[ord], - \(loo) { + function(loo) { k <- loo$diagnostics[["pareto_k"]] if (is.null(k)) { out = "" From 93fdbff72cc363235f33a9c42d0712a3b39faaa4 Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 18:59:09 +0300 Subject: [PATCH 38/40] another loo_compare doc edit --- R/loo_compare.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index f9240341..7649a746 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -58,12 +58,13 @@ #' (outliers). 
`elpd_diff` and `se_diff` are still indicative of the #' differences and uncertainties, and for example, if `|elpd_diff|` is #' many times larger than `se_diff` the difference is quite certain. -#' While `khat > 0.5` indicates possibility of outliers, it is also -#' possible that both models compared seem to be well specified based +#' The `khat` value for the `diag_diff` column is computed using +#' the pointwise elpd differences (and is different frin `khat`s in PSIS-LOO +#' diagnostic). While `khat > 0.5` indicates possibility of outliers, it is +#' also possible that both models compared seem to be well specified based #' on model checking, but the pointwise ELPD differences have such thick #' tails that the normal approximation for the sum is not good (Vehtari et -#' al., 2024). Note that `khat`s in `diag_diff` column are different from -#' the `khat`s in PSIS-LOO diagnostic. +#' al., 2024). #' #' The column `diag_elpd` shows diagnostic for the pointwise ELPD #' computations for each model. If `k khat_psis > 0.7` is shown, From ae48d6945e414c2ac432efdaceeb053c1174774c Mon Sep 17 00:00:00 2001 From: Aki Vehtari Date: Wed, 22 Oct 2025 20:02:58 +0300 Subject: [PATCH 39/40] adjust some diagnostic messages and documentation --- R/loo_compare.R | 57 +++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index 7649a746..c53236ad 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -49,7 +49,7 @@ #' #' * `N < 100` (small data) #' * `|elpd_diff| < 4` (models make similar predictions) -#' * `khat > 0.5` (possible outliers) +#' * `k_diff > 0.5` (possible outliers) #' #' If any of these diagnostic messages is shown, the error distribution is #' skewed or thick tailed and the normal approximation based on `elpd_diff` @@ -58,16 +58,17 @@ #' (outliers). `elpd_diff` and `se_diff` are still indicative of the #' differences and uncertainties, and for example, if `|elpd_diff|` is #' many times larger than `se_diff` the difference is quite certain. -#' The `khat` value for the `diag_diff` column is computed using -#' the pointwise elpd differences (and is different frin `khat`s in PSIS-LOO -#' diagnostic). While `khat > 0.5` indicates possibility of outliers, it is +#' The `k_diff` value for the `diag_diff` column is computed using +#' the pointwise elpd differences (and is different from Pareto k's in PSIS-LOO +#' diagnostic). While `k_diff > 0.5` indicates possibility of outliers, it is #' also possible that both models compared seem to be well specified based #' on model checking, but the pointwise ELPD differences have such thick #' tails that the normal approximation for the sum is not good (Vehtari et -#' al., 2024). +#' al., 2024). Threshold 0.5 is used for `k_diff` as we do not do automatic +#' Pareto smoothing for the pointwise differences (Vehtari et al., 2024). #' #' The column `diag_elpd` shows diagnostic for the pointwise ELPD -#' computations for each model. If `k khat_psis > 0.7` is shown, +#' computations for each model. If `k_psis > 0.7` is shown, #' where `k` is the number of high high Pareto k values in Pareto #' smoothed importance sampling computation, then there may be #' significant bias in `elpd_diff` favoring models with a large @@ -152,7 +153,7 @@ loo_compare.default <- function(x, ...) 
{ # diagnostics to assess whether the normal approximation can be trusted # * N < 100: small data (Sivula et al., 2025) # * |elpd_diff| < 4: similar predictions (Sivula et al., 2025) - # * khat_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) + # * k_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) N <- nrow(diffs) if (N < 100) { diag_diff <- rep("N < 100", length(elpd_diff)) @@ -160,34 +161,30 @@ loo_compare.default <- function(x, ...) { } else { diag_diff <- rep("", length(elpd_diff)) diag_diff[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" - khat_diff <- rep(NA, length(elpd_diff)) - khat_diff[elpd_diff != 0] <- apply( + k_diff <- rep(NA, length(elpd_diff)) + k_diff[elpd_diff != 0] <- apply( diffs[, elpd_diff != 0, drop = FALSE], 2, function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") )) - diag_diff[khat_diff > 0.5] <- "khat_diff > 0.5" + diag_diff[k_diff > 0.5] <- "k_diff > 0.5" } # get khats for PSIS - khat_psis <- sapply(loos[ord], - function(loo) { - k <- loo$diagnostics[["pareto_k"]] - if (is.null(k)) { - out = "" - } else { - S <- dim(loo)[1] - khat_threshold <- ps_khat_threshold(S) - K <- sum(k > khat_threshold) - if (K==0) { - out <- "" - } else { - out <- paste0(K, " khat_psis > ", round(khat_threshold, 2)) - } - } - out - } - ) - + k_psis <- sapply(loos[ord], + function(loo) { + k <- loo$diagnostics[["pareto_k"]] + if (is.null(k)) { + out = "" + } else { + S <- dim(loo)[1] + khat_threshold <- ps_khat_threshold(S) + K <- sum(k > khat_threshold) + out <- ifelse(K==0, "", paste0(K, " k_psis > ", round(khat_threshold, 2))) + } + out + } + ) + comp <- cbind( data.frame( model = rnms, @@ -195,7 +192,7 @@ loo_compare.default <- function(x, ...) { se_diff = se_diff, p_worse = p_worse, diag_diff = diag_diff, - diag_elpd = khat_psis + diag_elpd = k_psis ), as.data.frame(comp) ) From 4f5872b5d93ff0a9e6490cf544ddbe2b0c057b56 Mon Sep 17 00:00:00 2001 From: jgabry Date: Wed, 22 Oct 2025 16:25:48 -0600 Subject: [PATCH 40/40] edit doc, fix tests, move diagnostics to internal functions --- R/loo_compare.R | 126 ++++++++++++++++--------------- man/find_model_names.Rd | 1 - man/loo_compare.Rd | 31 ++++++-- tests/testthat/_snaps/compare.md | 87 ++++++++++----------- tests/testthat/test_compare.R | 5 ++ 5 files changed, 140 insertions(+), 110 deletions(-) diff --git a/R/loo_compare.R b/R/loo_compare.R index c53236ad..b67f8c0e 100644 --- a/R/loo_compare.R +++ b/R/loo_compare.R @@ -53,26 +53,28 @@ #' #' If any of these diagnostic messages is shown, the error distribution is #' skewed or thick tailed and the normal approximation based on `elpd_diff` -#' and `se_diff` is not well calibrated. The probabilities `p_worse` are -#' likely to be too large (small data or similar predictions) or too small -#' (outliers). `elpd_diff` and `se_diff` are still indicative of the -#' differences and uncertainties, and for example, if `|elpd_diff|` is -#' many times larger than `se_diff` the difference is quite certain. -#' The `k_diff` value for the `diag_diff` column is computed using -#' the pointwise elpd differences (and is different from Pareto k's in PSIS-LOO -#' diagnostic). While `k_diff > 0.5` indicates possibility of outliers, it is -#' also possible that both models compared seem to be well specified based -#' on model checking, but the pointwise ELPD differences have such thick -#' tails that the normal approximation for the sum is not good (Vehtari et -#' al., 2024). 
Threshold 0.5 is used for `k_diff` as we do not do automatic -#' Pareto smoothing for the pointwise differences (Vehtari et al., 2024). +#' and `se_diff` is not well calibrated. In that case, the probabilities +#' `p_worse` are likely to be too large (small data or similar predictions) or +#' too small (outliers). However, `elpd_diff` and `se_diff` will still be +#' indicative of the differences and uncertainties (for example, if +#' `|elpd_diff|` is many times larger than `se_diff` the difference is quite +#' certain). #' -#' The column `diag_elpd` shows diagnostic for the pointwise ELPD -#' computations for each model. If `k_psis > 0.7` is shown, -#' where `k` is the number of high high Pareto k values in Pareto -#' smoothed importance sampling computation, then there may be -#' significant bias in `elpd_diff` favoring models with a large -#' number of high Pareto k values. +#' The `k_diff` value for the `diag_diff` column is computed using the +#' pointwise ELPD differences (and is different from the Pareto k's in +#' PSIS-LOO diagnostic). While `k_diff > 0.5` indicates the *possibility* of +#' outliers, it is also possible that both models compared seem to be well +#' specified based on model checking, but the pointwise ELPD differences have +#' such thick tails that the normal approximation for the sum is not good +#' (Vehtari et al., 2024). A threshold of 0.5 is used for `k_diff` as we do +#' not do automatic Pareto smoothing for the pointwise differences (Vehtari et +#' al., 2024). +#' +#' The column `diag_elpd` shows the PSIS-LOO Pareto k diagnostic for the +#' pointwise ELPD computations for each model. If `K k_psis > 0.7` is shown, +#' where `K` is the number of high high Pareto k values in the PSIS +#' computation, then there may be significant bias in `elpd_diff` favoring +#' models with a large number of high Pareto k values. #' #' ## Warnings for many model comparisons #' If more than \eqn{11} models are compared, we internally recompute the model @@ -150,49 +152,14 @@ loo_compare.default <- function(x, ...) 
{ p_worse <- stats::pnorm(0, elpd_diff, se_diff) p_worse[elpd_diff == 0] <- NA - # diagnostics to assess whether the normal approximation can be trusted - # * N < 100: small data (Sivula et al., 2025) - # * |elpd_diff| < 4: similar predictions (Sivula et al., 2025) - # * k_diff > 0.5: possible outliers in differences (Sivula et al., 2025; Vehtari et al., 2024) - N <- nrow(diffs) - if (N < 100) { - diag_diff <- rep("N < 100", length(elpd_diff)) - diag_diff[elpd_diff == 0] <- "" - } else { - diag_diff <- rep("", length(elpd_diff)) - diag_diff[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" - k_diff <- rep(NA, length(elpd_diff)) - k_diff[elpd_diff != 0] <- apply( - diffs[, elpd_diff != 0, drop = FALSE], 2, - function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") - )) - diag_diff[k_diff > 0.5] <- "k_diff > 0.5" - } - - # get khats for PSIS - k_psis <- sapply(loos[ord], - function(loo) { - k <- loo$diagnostics[["pareto_k"]] - if (is.null(k)) { - out = "" - } else { - S <- dim(loo)[1] - khat_threshold <- ps_khat_threshold(S) - K <- sum(k > khat_threshold) - out <- ifelse(K==0, "", paste0(K, " k_psis > ", round(khat_threshold, 2))) - } - out - } - ) - comp <- cbind( data.frame( model = rnms, elpd_diff = elpd_diff, se_diff = se_diff, p_worse = p_worse, - diag_diff = diag_diff, - diag_elpd = k_psis + diag_diff = diag_diff(nrow(diffs), elpd_diff), + diag_elpd = diag_elpd(loos[ord]) ), as.data.frame(comp) ) @@ -311,7 +278,6 @@ loo_compare_checks <- function(loos) { #' Find the model names associated with `"loo"` objects #' #' @export -#' @keywords internal #' @param x List of `"loo"` objects. #' @return Character vector of model names the same length as `x.` #' @@ -340,7 +306,6 @@ find_model_names <- function(x) { #' Compute the loo_compare matrix -#' @keywords internal #' @noRd #' @param loos List of `"loo"` objects. loo_compare_matrix <- function(loos){ @@ -362,7 +327,6 @@ loo_compare_matrix <- function(loos){ #' Computes the order of loos for comparison #' @noRd -#' @keywords internal #' @param loos List of `"loo"` objects. loo_compare_order <- function(loos){ tmp <- sapply(loos, function(x) { @@ -377,7 +341,6 @@ loo_compare_order <- function(loos){ #' Perform checks on `"loo"` objects __after__ comparison #' @noRd -#' @keywords internal #' @param loos List of `"loo"` objects. #' @param ord List of `"loo"` object orderings. #' @return Nothing, just possibly throws errors/warnings. @@ -419,14 +382,12 @@ loo_order_stat_check <- function(loos, ord) { #' Returns the middle index of a vector #' @noRd -#' @keywords internal #' @param vec A vector. #' @return Integer index value. middle_idx <- function(vec) floor(length(vec) / 2) #' Computes maximum order statistic from K Gaussians #' @noRd -#' @keywords internal #' @param K Number of Gaussians. #' @param c Scaling of the order statistic. #' @return Numeric expected maximum from K samples from a Gaussian with mean @@ -434,3 +395,44 @@ middle_idx <- function(vec) floor(length(vec) / 2) order_stat_heuristic <- function(K, c) { qnorm(p = 1 - 1 / (K * 2), mean = 0, sd = c) } + +#' Count number of high Pareto k values in PSIS-LOO and create diagnostic message +#' @noRd +#' @param loos Ordered list of loo objects. +#' @return Character vector of diagnostic messages. 
+diag_elpd <- function(loos) { + sapply(loos, function(loo) { + k <- loo$diagnostics[["pareto_k"]] + if (is.null(k)) { + out <- "" + } else { + S <- dim(loo)[1] + khat_threshold <- ps_khat_threshold(S) + K <- sum(k > khat_threshold) + out <- ifelse(K == 0, "", paste0(K, " k_psis > ", round(khat_threshold, 2))) + } + out + }) +} + +#' Create diagnostic for elpd differences +#' @noRd +#' @param N Number of data points. +#' @param elpd_diff Vector of elpd differences. +#' @return Character vector of diagnostic messages. +diag_diff <- function(N, elpd_diff) { + if (N < 100) { + diag_diff <- rep("N < 100", length(elpd_diff)) + diag_diff[elpd_diff == 0] <- "" + } else { + diag_diff <- rep("", length(elpd_diff)) + diag_diff[elpd_diff > -4 & elpd_diff != 0] <- "|elpd_diff| < 4" + k_diff <- rep(NA, length(elpd_diff)) + k_diff[elpd_diff != 0] <- apply( + diffs[, elpd_diff != 0, drop = FALSE], 2, + function(x) ifelse(length(unique(x)) <= 20, NA, posterior::pareto_khat(x, tail = "both") + )) + diag_diff[k_diff > 0.5] <- "k_diff > 0.5" + } + diag_diff +} diff --git a/man/find_model_names.Rd b/man/find_model_names.Rd index cdb76b80..70a79d58 100644 --- a/man/find_model_names.Rd +++ b/man/find_model_names.Rd @@ -15,4 +15,3 @@ Character vector of model names the same length as \code{x.} \description{ Find the model names associated with \code{"loo"} objects } -\keyword{internal} diff --git a/man/loo_compare.Rd b/man/loo_compare.Rd index 6b7350ea..ecac018b 100644 --- a/man/loo_compare.Rd +++ b/man/loo_compare.Rd @@ -65,7 +65,7 @@ distribution, a practice derived for Gaussian linear models or asymptotically, and which only applies to nested models in any case. } -\subsection{\code{p_worse} and \code{diag_diff}}{ +\subsection{\code{p_worse}, \code{diag_diff}, and \code{diag_elpd}}{ The values in the \code{p_worse} column show the probability of each model having worse ELPD than the best model. These probabilities are computed @@ -76,12 +76,33 @@ approximation used for SE and \code{se_diff} is good, and the column \itemize{ \item \code{N < 100} (small data) \item \verb{|elpd_diff| < 4} (models make similar predictions) -\item \code{khat > 0.5} (possible outliers) +\item \code{k_diff > 0.5} (possible outliers) } -If any of these diagnostic messages is shown, the normal approximation is -not well calibrated and the probabilities can be too large (small data or -similar predictions) or too small (outliers). +If any of these diagnostic messages is shown, the error distribution is +skewed or thick tailed and the normal approximation based on \code{elpd_diff} +and \code{se_diff} is not well calibrated. In that case, the probabilities +\code{p_worse} are likely to be too large (small data or similar predictions) or +too small (outliers). However, \code{elpd_diff} and \code{se_diff} will still be +indicative of the differences and uncertainties (for example, if +\verb{|elpd_diff|} is many times larger than \code{se_diff} the difference is quite +certain). + +The \code{k_diff} value for the \code{diag_diff} column is computed using the +pointwise ELPD differences (and is different from the Pareto k's in +PSIS-LOO diagnostic). While \code{k_diff > 0.5} indicates the \emph{possibility} of +outliers, it is also possible that both models compared seem to be well +specified based on model checking, but the pointwise ELPD differences have +such thick tails that the normal approximation for the sum is not good +(Vehtari et al., 2024). 
+automatic Pareto smoothing is not applied to the pointwise differences
+(Vehtari et al., 2024).
+
+The column \code{diag_elpd} shows the PSIS-LOO Pareto k diagnostic for the
+pointwise ELPD computations for each model. If \verb{K k_psis > 0.7} is shown,
+where \code{K} is the number of high Pareto k values in the PSIS
+computation, then there may be significant bias in \code{elpd_diff} favoring
+models with a large number of high Pareto k values.
 }
 
 \subsection{Warnings for many model comparisons}{
diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md
index 5aee7b29..1c23a744 100644
--- a/tests/testthat/_snaps/compare.md
+++ b/tests/testthat/_snaps/compare.md
@@ -1,50 +1,52 @@
 # loo_compare returns expected results (2 models)
 
- WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v
 ZGVsMgAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAA
- AA4AAAACf/AAAAAAB6J/8AAAAAAHogAAABAAAAACAAQACQAAAAAABAAJAAAAAAAAAA4AAAAC
- wFTh8N3JQljAVOHw3clCWAAAAA4AAAACQBEIPbMRcF9AEQg9sxFwXwAAAA4AAAACQAoowGHV
- uVNACijAYdW5UwAAAA4AAAACP/H9Zexy814/8f1l7HLzXgAAAA4AAAACQGTh8N3JQlhAZOHw
- 3clCWAAAAA4AAAACQCEIPbMRcF9AIQg9sxFwXwAABAIAAAABAAQACQAAAAVuYW1lcwAAABAA
- AAALAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAA
- AAdwX3dvcnNlAAQACQAAAAlkaWFnX2RpZmYABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2Vf
- ZWxwZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAE
- AAkAAAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAACAAQACQAAAAtjb21w
- YXJlLmxvbwAEAAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAlyb3cubmFtZXMAAAAN
- AAAAAoAAAAD////+AAAA/g==
+ WAoAAAACAAQEAgACAwAAAAMTAAAADAAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v
 ZGVsMgAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAAAA4AAAACAAAAAAAAAAAAAAAAAAAAAAAA
 AA4AAAACf/AAAAAAB6J/8AAAAAAHogAAABAAAAACAAQACQAAAAAABAAJAAAAAAAAABAAAAAC
+ AAQACQAAAAAABAAJAAAAAAAAAA4AAAACwFTh8N3JQljAVOHw3clCWAAAAA4AAAACQBEIPbMR
+ cF9AEQg9sxFwXwAAAA4AAAACQAoowGHVuVNACijAYdW5UwAAAA4AAAACP/H9Zexy814/8f1l
+ 7HLzXgAAAA4AAAACQGTh8N3JQlhAZOHw3clCWAAAAA4AAAACQCEIPbMRcF9AIQg9sxFwXwAA
+ BAIAAAABAAQACQAAAAVuYW1lcwAAABAAAAAMAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9k
+ aWZmAAQACQAAAAdzZV9kaWZmAAQACQAAAAdwX3dvcnNlAAQACQAAAAlkaWFnX2RpZmYABAAJ
+ AAAACWRpYWdfZWxwZAAEAAkAAAAJZWxwZF93YWljAAQACQAAAAxzZV9lbHBkX3dhaWMABAAJ
+ AAAABnBfd2FpYwAEAAkAAAAJc2VfcF93YWljAAQACQAAAAR3YWljAAQACQAAAAdzZV93YWlj
+ AAAEAgAAAAEABAAJAAAABWNsYXNzAAAAEAAAAAIABAAJAAAAC2NvbXBhcmUubG9vAAQACQAA
+ AApkYXRhLmZyYW1lAAAEAgAAAAEABAAJAAAACXJvdy5uYW1lcwAAAA0AAAACgAAAAP////4A
+ AAD+
 
 ---
 
     Code
      print(comp1)
    Output
-      model elpd_diff se_diff p_worse diag_diff
-      model1   0.0     0.0      NA
-      model2   0.0     0.0      NA
+      model elpd_diff se_diff p_worse diag_diff diag_elpd
+      model1   0.0     0.0      NA
+      model2   0.0     0.0      NA
 
 ---
 
- WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v
+ WAoAAAACAAQEAgACAwAAAAMTAAAADAAAABAAAAACAAQACQAAAAZtb2RlbDEABAAJAAAABm1v
 ZGVsMgAAAA4AAAACAAAAAAAAAADAEDpTX5xF7gAAAA4AAAACAAAAAAAAAAA/tmpHtC8TAQAA
 AA4AAAACf/AAAAAAB6I/8AAAAAAAAAAAABAAAAACAAQACQAAAAAABAAJAAAAB04gPCAxMDAA
- AAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAAAkARCD2zEXBfQBEalRIN2T8AAAAOAAAA
 AkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXscvNeP/GbYJxtZ8cAAAAOAAAAAkBk4fDd
- yUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEalRIN2T8AAAQCAAAAAQAEAAkAAAAFbmFt
- ZXMAAAAQAAAACwAEAAkAAAAFbW9kZWwABAAJAAAACWVscGRfZGlmZgAEAAkAAAAHc2VfZGlm
- ZgAEAAkAAAAHcF93b3JzZQAEAAkAAAAJZGlhZ19kaWZmAAQACQAAAAllbHBkX3dhaWMABAAJ
- AAAADHNlX2VscGRfd2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAA
- BHdhaWMABAAJAAAAB3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkA
-
AAALY29tcGFyZS5sb28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5h - bWVzAAAADQAAAAKAAAAA/////gAAAP4= + AAAQAAAAAgAEAAkAAAAAAAQACQAAAAAAAAAOAAAAAsBU4fDdyUJYwFXllhPDBrkAAAAOAAAA + AkARCD2zEXBfQBEalRIN2T8AAAAOAAAAAkAKKMBh1blTQCZnlesA0IoAAAAOAAAAAj/x/WXs + cvNeP/GbYJxtZ8cAAAAOAAAAAkBk4fDdyUJYQGXllhPDBrkAAAAOAAAAAkAhCD2zEXBfQCEa + lRIN2T8AAAQCAAAAAQAEAAkAAAAFbmFtZXMAAAAQAAAADAAEAAkAAAAFbW9kZWwABAAJAAAA + CWVscGRfZGlmZgAEAAkAAAAHc2VfZGlmZgAEAAkAAAAHcF93b3JzZQAEAAkAAAAJZGlhZ19k + aWZmAAQACQAAAAlkaWFnX2VscGQABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2VfZWxwZF93 + YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAEAAkAAAAH + c2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAACAAQACQAAAAtjb21wYXJlLmxv + bwAEAAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAlyb3cubmFtZXMAAAANAAAAAoAA + AAD////+AAAA/g== --- Code print(comp2) Output - model elpd_diff se_diff p_worse diag_diff - model1 0.0 0.0 NA - model2 -4.1 0.1 1.00 N < 100 + model elpd_diff se_diff p_worse diag_diff diag_elpd + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 --- @@ -57,30 +59,31 @@ # loo_compare returns expected result (3 models) - WAoAAAACAAQEAgACAwAAAAMTAAAACwAAABAAAAADAAQACQAAAAZtb2RlbDEABAAJAAAABm1v + WAoAAAACAAQEAgACAwAAAAMTAAAADAAAABAAAAADAAQACQAAAAZtb2RlbDEABAAJAAAABm1v ZGVsMgAEAAkAAAAGbW9kZWwzAAAADgAAAAMAAAAAAAAAAMAQOlNfnEXuwDANypG2BBgAAAAO AAAAAwAAAAAAAAAAP7ZqR7QvEwE/y6/t4TTtXgAAAA4AAAADf/AAAAAAB6I/8AAAAAAAAD/w AAAAAAAAAAAAEAAAAAMABAAJAAAAAAAEAAkAAAAHTiA8IDEwMAAEAAkAAAAHTiA8IDEwMAAA - AA4AAAADwFTh8N3JQljAVeWWE8MGucBY5WOCNsNiAAAADgAAAANAEQg9sxFwX0ARGpUSDdk/ - QBDyBdxooBMAAAAOAAAAA0AKKMBh1blTQCZnlesA0IpAQcjYUhrdCgAAAA4AAAADP/H9Zexy - 814/8ZtgnG1nxz/xgNCSZ8lfAAAADgAAAANAZOHw3clCWEBl5ZYTwwa5QGjlY4I2w2IAAAAO - AAAAA0AhCD2zEXBfQCEalRIN2T9AIPIF3GigEwAABAIAAAABAAQACQAAAAVuYW1lcwAAABAA - AAALAAQACQAAAAVtb2RlbAAEAAkAAAAJZWxwZF9kaWZmAAQACQAAAAdzZV9kaWZmAAQACQAA - AAdwX3dvcnNlAAQACQAAAAlkaWFnX2RpZmYABAAJAAAACWVscGRfd2FpYwAEAAkAAAAMc2Vf - ZWxwZF93YWljAAQACQAAAAZwX3dhaWMABAAJAAAACXNlX3Bfd2FpYwAEAAkAAAAEd2FpYwAE - AAkAAAAHc2Vfd2FpYwAABAIAAAABAAQACQAAAAVjbGFzcwAAABAAAAACAAQACQAAAAtjb21w - YXJlLmxvbwAEAAkAAAAKZGF0YS5mcmFtZQAABAIAAAABAAQACQAAAAlyb3cubmFtZXMAAAAN - AAAAAoAAAAD////9AAAA/g== + ABAAAAADAAQACQAAAAAABAAJAAAAAAAEAAkAAAAAAAAADgAAAAPAVOHw3clCWMBV5ZYTwwa5 + wFjlY4I2w2IAAAAOAAAAA0ARCD2zEXBfQBEalRIN2T9AEPIF3GigEwAAAA4AAAADQAoowGHV + uVNAJmeV6wDQikBByNhSGt0KAAAADgAAAAM/8f1l7HLzXj/xm2CcbWfHP/GA0JJnyV8AAAAO + AAAAA0Bk4fDdyUJYQGXllhPDBrlAaOVjgjbDYgAAAA4AAAADQCEIPbMRcF9AIRqVEg3ZP0Ag + 8gXcaKATAAAEAgAAAAEABAAJAAAABW5hbWVzAAAAEAAAAAwABAAJAAAABW1vZGVsAAQACQAA + AAllbHBkX2RpZmYABAAJAAAAB3NlX2RpZmYABAAJAAAAB3Bfd29yc2UABAAJAAAACWRpYWdf + ZGlmZgAEAAkAAAAJZGlhZ19lbHBkAAQACQAAAAllbHBkX3dhaWMABAAJAAAADHNlX2VscGRf + d2FpYwAEAAkAAAAGcF93YWljAAQACQAAAAlzZV9wX3dhaWMABAAJAAAABHdhaWMABAAJAAAA + B3NlX3dhaWMAAAQCAAAAAQAEAAkAAAAFY2xhc3MAAAAQAAAAAgAEAAkAAAALY29tcGFyZS5s + b28ABAAJAAAACmRhdGEuZnJhbWUAAAQCAAAAAQAEAAkAAAAJcm93Lm5hbWVzAAAADQAAAAKA + AAAA/////QAAAP4= --- Code print(comp1) Output - model elpd_diff se_diff p_worse diag_diff - model1 0.0 0.0 NA - model2 -4.1 0.1 1.00 N < 100 - model3 -16.1 0.2 1.00 N < 100 + model elpd_diff se_diff p_worse diag_diff diag_elpd + model1 0.0 0.0 NA + model2 -4.1 0.1 1.00 N < 100 + model3 -16.1 0.2 1.00 N < 100 # compare returns expected result (3 models) diff --git a/tests/testthat/test_compare.R b/tests/testthat/test_compare.R index 678a4858..8463c411 100644 --- a/tests/testthat/test_compare.R +++ b/tests/testthat/test_compare.R @@ -65,6 +65,7 @@ comp_colnames <- c( "se_diff", "p_worse", "diag_diff", + "diag_elpd", 
"elpd_waic", "se_elpd_waic", "p_waic", @@ -88,6 +89,9 @@ test_that("loo_compare returns expected results (2 models)", { comp2 <- loo_compare(w1, w2) expect_s3_class(comp2, "compare.loo") expect_equal(colnames(comp2), comp_colnames) + expect_equal(comp2$p_worse, c(NA, 1)) + expect_equal(comp2$diag_diff, c("", "N < 100")) + expect_equal(comp2$diag_elpd, c("", "")) expect_snapshot_value(comp2, style = "serialize") expect_snapshot(print(comp2)) expect_snapshot(print(comp2, p_worse = FALSE)) @@ -108,6 +112,7 @@ test_that("loo_compare returns expected result (3 models)", { expect_equal(colnames(comp1), comp_colnames) expect_equal(comp1$model, c("model1", "model2", "model3")) expect_equal(comp1$p_worse, c(NA, 1, 1)) + expect_equal(comp1$diag_diff, c("", "N < 100", "N < 100")) expect_s3_class(comp1, "compare.loo") expect_s3_class(comp1, "data.frame") expect_snapshot_value(comp1, style = "serialize")