diff --git a/NEWS.md b/NEWS.md index e67514eff..a8f1b51b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,20 @@ * Improved documentation for printing-methods. +* `check_model()` now automatically hides data points in diagnostic plots for + models with only categorical predictors. This improves the visualization of + variance and linearity patterns across groups by reducing visual clutter from + overlapping points. Users can still override this behavior by explicitly + setting `show_dots = TRUE` (#873, @DominiqueMakowski). + +## Known Issues + +* The `theme` argument in `check_model()` currently has no effect due to a bug + in the **see** package's plot method. The `check_model()` function correctly + stores the theme attribute, but the **see** package's `plot.check_model()` + function does not properly handle it. This issue is tracked in #851 and + requires a fix in the **see** package repository. + # performance 0.15.2 ## Bug fixes diff --git a/R/check_model.R b/R/check_model.R index 952885f7a..12a3806d7 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -46,7 +46,10 @@ #' to `FALSE` for models with many observations, if generating the plot is too #' time-consuming. By default, `show_dots = NULL`. In this case `check_model()` #' tries to guess whether performance will be poor due to a very large model -#' and thus automatically shows or hides dots. +#' and thus automatically shows or hides dots. For models with only categorical +#' predictors, dots are automatically hidden to improve the visualization of +#' variance patterns across groups (but you can override this by explicitly +#' setting `show_dots = TRUE`). #' @param verbose If `FALSE` (default), suppress most warning messages. #' @param ... Arguments passed down to the individual check functions, especially #' to `check_predictions()` and `binned_residuals()`. @@ -300,10 +303,15 @@ check_model.default <- function( type <- "discrete_interval" } - # set default for show_dots, based on "model size" + # set default for show_dots, based on "model size" and predictor types if (is.null(show_dots)) { n <- .safe(insight::n_obs(x)) show_dots <- is.null(n) || n <= 1e5 + # Auto-disable dots for models with only categorical predictors + # to improve visualization of variance patterns + if (show_dots && .has_only_categorical_predictors(x)) { + show_dots <- FALSE + } } attr(assumptions_data, "panel") <- panel @@ -742,3 +750,91 @@ check_model.DHARMa <- check_model.performance_simres class(dat) <- c("check_model", "see_check_model", "data.frame") dat } + +# Helper function to detect if model has only categorical predictors ---- + +.has_only_categorical_predictors <- function(model) { + tryCatch( + { + # Get the model matrix (excludes intercept) + mm <- insight::get_modelmatrix(model, verbose = FALSE) + if (is.null(mm) || ncol(mm) <= 1) { + # Only intercept or no predictors + return(FALSE) + } + + # Remove intercept column if present + mm_cols <- colnames(mm) + mm_cols <- mm_cols[mm_cols != "(Intercept)"] + + if (length(mm_cols) == 0) { + return(FALSE) + } + + # Get predictors from the model + predictors <- insight::find_predictors(model, flatten = TRUE) + if (is.null(predictors) || length(predictors) == 0) { + return(FALSE) + } + + # Get model data + model_data <- insight::get_data(model, verbose = FALSE) + if (is.null(model_data)) { + return(FALSE) + } + + # Check each predictor + # A predictor is considered categorical if: + # 1. It's a factor/character in the data, OR + # 2. It appears in the formula wrapped in factor(), as.factor(), etc., OR + # 3. It generates dummy columns in the model matrix + predictor_is_categorical <- vapply( + predictors, + function(pred) { + # Check if it's a factor/character in the data + if (pred %in% names(model_data)) { + var <- model_data[[pred]] + if (is.factor(var) || is.character(var)) { + return(TRUE) + } + } + + # Check for explicit factor conversion in formula like factor() or as.factor() + # This correctly handles binary factors converted in the formula. + factor_patterns <- paste( + c( + paste0("^factor\\(", pred, "\\)"), + paste0("^as\\.factor\\(", pred, "\\)") + ), + collapse = "|" + ) + if (any(grepl(factor_patterns, mm_cols))) { + return(TRUE) + } + + # Check if this predictor generates dummy columns + # (indicates categorical treatment for non-binary factors) + # This pattern correctly identifies dummy variables like 'cyl6' from 'cyl' + # without incorrectly matching partial names like 'cyl_long'. + pred_pattern <- paste0("^", pred, "(?![a-zA-Z_])") + pred_cols <- grep(pred_pattern, mm_cols, value = TRUE, perl = TRUE) + + # Exclude exact matches (continuous predictors appear as-is in model matrix) + # Only count dummy variable columns (e.g., 'cyl6', 'cyl8' but not 'cyl') + pred_cols <- pred_cols[pred_cols != pred] + + # If one or more dummy columns for this predictor, it's categorical + length(pred_cols) >= 1 + }, + FUN.VALUE = logical(1) + ) + + # Return TRUE only if all predictors are categorical + length(predictor_is_categorical) > 0 && all(predictor_is_categorical) + }, + error = function(e) { + # If there's any error, default to FALSE + FALSE + } + ) +} diff --git a/man/check_model.Rd b/man/check_model.Rd index 4d858f14a..89a2e827e 100644 --- a/man/check_model.Rd +++ b/man/check_model.Rd @@ -77,7 +77,10 @@ residuals.} to \code{FALSE} for models with many observations, if generating the plot is too time-consuming. By default, \code{show_dots = NULL}. In this case \code{check_model()} tries to guess whether performance will be poor due to a very large model -and thus automatically shows or hides dots.} +and thus automatically shows or hides dots. For models with only categorical +predictors, dots are automatically hidden to improve the visualization of +variance patterns across groups (but you can override this by explicitly +setting \code{show_dots = TRUE}).} \item{size_dot, size_line}{Size of line and dot-geoms.} diff --git a/tests/testthat/test-check_model.R b/tests/testthat/test-check_model.R index 7ee4be747..1adc0bd07 100644 --- a/tests/testthat/test-check_model.R +++ b/tests/testthat/test-check_model.R @@ -206,3 +206,65 @@ test_that("`check_model()` with transformed response when named as function", { out <- check_predictions(model) expect_s3_class(out, "performance_pp_check") }) + + +test_that("`check_model()` with show_dots parameter", { + data(mtcars) + m <- lm(mpg ~ factor(cyl) + wt, data = mtcars) + + # Test with show_dots = FALSE + result <- check_model(m, show_dots = FALSE, verbose = FALSE) + + expect_s3_class(result, "check_model") + expect_false(attr(result, "show_dots")) +}) + + +test_that("`check_model()` auto-detects categorical predictors", { + skip_if_not_installed("curl") + + # Load test data + star <- tryCatch( + { + read.csv("https://drmankin.github.io/disc_stats/star.csv") + }, + error = function(e) { + NULL + } + ) + + skip_if(is.null(star), message = "Could not download test data") + + star$star2 <- factor(star$star2) + + m <- lm(math2 ~ star2, data = star, na.action = na.exclude) + result <- check_model(m, verbose = FALSE) + + # Should auto-disable dots for categorical-only models + expect_s3_class(result, "check_model") + expect_false(attr(result, "show_dots")) +}) + + +test_that("`check_model()` keeps dots for mixed continuous/categorical models", { + data(mtcars) + m <- lm(mpg ~ factor(cyl) + wt + hp, data = mtcars) + + result <- check_model(m, verbose = FALSE) + + expect_s3_class(result, "check_model") + # Should keep dots by default for mixed models + expect_true(attr(result, "show_dots")) +}) + + +test_that("`check_model()` auto-disables dots for binary factor in formula", { + data(mtcars) + m <- lm(mpg ~ as.factor(am), data = mtcars) + + result <- check_model(m, verbose = FALSE) + + # Should auto-disable dots for categorical-only models (even binary) + expect_s3_class(result, "check_model") + expect_false(attr(result, "show_dots")) +})