diff --git a/DESCRIPTION b/DESCRIPTION index c80b63f..d495553 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,7 +28,7 @@ Imports: R6 (>= 2.4.1) License: GPL-3 Encoding: UTF-8 -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 URL: https://github.com/MarselScheer/bootGOF BugReports: https://github.com/MarselScheer/bootGOF/issues Suggests: diff --git a/R/GOF_model.R b/R/GOF_model.R index 344dc92..f581ba7 100644 --- a/R/GOF_model.R +++ b/R/GOF_model.R @@ -1,3 +1,5 @@ +library(parallel) + ##' @title Convenience function for creating a GOF-test for statistical models ##' ##' @description Simplifies the creation of an instance of @@ -18,6 +20,13 @@ ##' class used for performing the GOF test (\link{GOF_model_test}) ##' is injected. This parameter simply makes it easier to test the ##' convenience function properly. +##' @param n_cores positive integer specifying the number of CPU cores to use +##' for parallel resampling. If bigger than 1, the L'Ecuyer-CMRG is used; +##' if 'NULL' or 1, one core is used with the current RNG. +##' Default is "NULL". +##' @param seed integer intended to seed the internally setup +##' L'Ecuyer-CMRG, but will also be applied when RNG not replaced. +##' Default is "NULL", which will not alter the seed. ##' @export ##' @return instance of \link{GOF_model_test} ##' @examples @@ -54,7 +63,9 @@ GOF_model <- function(model, # nolint y_name, Rn1_statistic, # nolint gof_model_resample_class = GOF_model_resample, - gof_model_test_class = GOF_model_test + gof_model_test_class = GOF_model_test, + n_cores = NULL, + seed = NULL ) { checkmate::assert_subset( x = simulator_type, @@ -67,8 +78,6 @@ GOF_model <- function(model, # nolint )) } - - simulators <- list( lm = list( parametric = GOF_lm_sim_param, @@ -113,6 +122,9 @@ GOF_model <- function(model, # nolint y_name = y_name, Rn1_statistic = Rn1_statistic, gof_model_info_extractor = mie, - gof_model_resample = model_resample) + gof_model_resample = model_resample, + n_cores = n_cores, + seed = seed) + return(ret) } diff --git a/R/GOF_model_test.R b/R/GOF_model_test.R index 24a2ec2..a37bf30 100644 --- a/R/GOF_model_test.R +++ b/R/GOF_model_test.R @@ -22,6 +22,12 @@ GOF_model_test <- R6::R6Class( # nolint ##' @param gof_model_resample an instance that implements ##' \link{GOF_model_resample} in order to apply it to ##' \code{model} + ##' @param n_cores positive integer specifying the number of CPU cores to + ##' use for parallel resampling. If bigger than 1, the L'Ecuyer-CMRG is + ##' used; if 'NULL' or 1, one core is used with the current RNG. + ##' @param seed integer intended to seed the internally setup + ##' L'Ecuyer-CMRG, but will also be applied when RNG not replaced, + ##' as long as it is not "NULL". ##' @return An instance of the Class initialize = function(model, data, @@ -29,8 +35,11 @@ GOF_model_test <- R6::R6Class( # nolint y_name, Rn1_statistic, # nolint gof_model_info_extractor, - gof_model_resample) { + gof_model_resample, + n_cores, + seed) { checkmate::assert_count(x = nmb_boot_samples, positive = TRUE) + checkmate::assert_count(x = n_cores, positive = TRUE, null.ok = TRUE) private$model_org <- model private$data_org <- data private$y_name <- y_name @@ -38,6 +47,8 @@ GOF_model_test <- R6::R6Class( # nolint private$nmb_boot_samples <- nmb_boot_samples private$model_info_extractor <- gof_model_info_extractor private$model_resample <- gof_model_resample + private$n_cores <- n_cores + private$seed <- seed private$order_beta_dot_X_org <- order( # nolint private$model_info_extractor$beta_x_covariates( model = private$model_org @@ -84,6 +95,8 @@ GOF_model_test <- R6::R6Class( # nolint nmb_boot_samples = NULL, model_info_extractor = NULL, model_resample = NULL, + n_cores = NULL, + seed = NULL, Rn1_statistic = NULL, Rn1_boot = NULL, Rn1_org = NULL, @@ -108,5 +121,32 @@ GOF_model_test <- R6::R6Class( # nolint order_beta_x_covariates = private$order_beta_dot_X_org) return(Rn1_boot) } - private$Rn1_boot <- lapply(X = 1:private$nmb_boot_samples, FUN = f) # nolint + + # Replace RNG with "L'Ecuyer-CMRG" if going parallel + replaced_rng <- FALSE + if (is.null(private$n_cores)) { + private$n_cores <- 1 + } else if (private$n_cores > 1) { + # save and replace current RNG state + original_state <- if (exists(".Random.seed", .GlobalEnv)) + .GlobalEnv$.Random.seed else NULL # nolint + RNGkind("L'Ecuyer-CMRG") + set.seed(NULL) + replaced_rng <- TRUE + } + + if (!is.null(private$seed)) { + set.seed(private$seed) + } + + private$Rn1_boot <- parallel::mclapply(X = 1:private$nmb_boot_samples, FUN = f, mc.cores = private$n_cores) # nolint + + # Reset initial RNG if it has been replaced + if (replaced_rng) { + if (!is.null(original_state)) { + .GlobalEnv$.Random.seed <- original_state + } else { + RNGkind("default") + } + } })) diff --git a/README.Rmd b/README.Rmd index 6f8e167..0b38c88 100644 --- a/README.Rmd +++ b/README.Rmd @@ -81,6 +81,44 @@ mt$get_pvalue() ... +## Parallelization + +The bootstrapping process can be accelerated using the `n_cores` parameter of +the `GOF_model` function, that specifies the number of CPU cores to use. + +If this parameter is set to at least two cores, internally the currently used +RNG is replaced by the L'Ecuyer-CMRG generator, which is safe to use +in a parallel context. + +This internal generator can be seeded using the `seed` parameter of the +`GOF_model` function, which however will also apply if the `n_cores` parameter +is not used and thus the generator is not replaced. + +For example: +```{r} +set.seed(1) +N <- 100 +X1 <- rnorm(N) +X2 <- rnorm(N) +d <- data.frame( + y = rpois(n = N, lambda = exp(4 + X1 * 2 + X2 * 6)), + x1 = X1, + x2 = X2) + +fit <- glm(y ~ x1 + x2, data = d, family = poisson()) + +mt <- GOF_model( + model = fit, + data = d, + nmb_boot_samples = 100, + simulator_type = "parametric", + y_name = "y", + Rn1_statistic = Rn1_KS$new(), + n_cores = 2, + seed = 1) +mt$get_pvalue() +``` + ## Installation You can install it from CRAN diff --git a/README.md b/README.md index 62757e9..f85d7bb 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,45 @@ not be rejected by the GOF-test: … +## Parallelization + +The bootstrapping process can be accelerated using the `n_cores` +parameter of the `GOF_model` function, that specifies the number of CPU +cores to use. + +If this parameter is set to at least two cores, internally the currently +used RNG is replaced by the L’Ecuyer-CMRG generator, which is safe to +use in a parallel context. + +This internal generator can be seeded using the `seed` parameter of the +`GOF_model` function, which however will also apply if the `n_cores` +parameter is not used and thus the generator is not replaced. + +For example: + + set.seed(1) + N <- 100 + X1 <- rnorm(N) + X2 <- rnorm(N) + d <- data.frame( + y = rpois(n = N, lambda = exp(4 + X1 * 2 + X2 * 6)), + x1 = X1, + x2 = X2) + + fit <- glm(y ~ x1 + x2, data = d, family = poisson()) + + mt <- GOF_model( + model = fit, + data = d, + nmb_boot_samples = 100, + simulator_type = "parametric", + y_name = "y", + Rn1_statistic = Rn1_KS$new(), + n_cores = 2, + seed = 1) + mt$get_pvalue() + #> [1] 0.62 + ## Installation You can install it from CRAN @@ -89,9 +128,9 @@ package in your environment by calling: # sessionInfo sessionInfo() - #> R Under development (unstable) (2025-08-19 r88650) + #> R Under development (unstable) (2025-10-19 r88945) #> Platform: x86_64-pc-linux-gnu - #> Running under: Ubuntu 24.04.2 LTS + #> Running under: Ubuntu 24.04.3 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 @@ -109,15 +148,15 @@ package in your environment by calling: #> tzcode source: system (glibc) #> #> attached base packages: - #> [1] stats graphics grDevices datasets utils methods base + #> [1] parallel stats graphics grDevices utils datasets methods + #> [8] base #> #> other attached packages: - #> [1] bootGOF_0.1.1 + #> [1] bootGOF_0.1.1.9000 #> #> loaded via a namespace (and not attached): #> [1] digest_0.6.37 desc_1.4.3 backports_1.5.0 R6_2.6.1 #> [5] fastmap_1.2.0 xfun_0.53 knitr_1.50 htmltools_0.5.8.1 - #> [9] rmarkdown_2.29 cli_3.6.5 renv_1.1.5 withr_3.0.2 - #> [13] pkgload_1.4.0 compiler_4.6.0 rprojroot_2.1.0 tools_4.6.0 - #> [17] pkgbuild_1.4.8 checkmate_2.3.3 evaluate_1.0.4 yaml_2.3.10 - #> [21] rlang_1.1.6 + #> [9] rmarkdown_2.30 cli_3.6.5 pkgload_1.4.1 compiler_4.6.0 + #> [13] rprojroot_2.1.1 tools_4.6.0 pkgbuild_1.4.8 checkmate_2.3.3 + #> [17] evaluate_1.0.5 yaml_2.3.10 rlang_1.1.6 diff --git a/inst/tinytest/test_GOF_model.R b/inst/tinytest/test_GOF_model.R index 2a007ce..78a9bf7 100644 --- a/inst/tinytest/test_GOF_model.R +++ b/inst/tinytest/test_GOF_model.R @@ -33,7 +33,9 @@ GOF_model_test_dummy <- R6::R6Class( # nolint y_name, Rn1_statistic, # nolint gof_model_info_extractor, - gof_model_resample) { + gof_model_resample, + n_cores, + seed) { })) GOF_model_error_if_fit_class_is_not_lm_or_glm <- function() { # nolint @@ -86,7 +88,9 @@ GOF_model_uses_lm_info_extractor <- function() { # nolint y_name, Rn1_statistic, # nolint gof_model_info_extractor, - gof_model_resample) { + gof_model_resample, + n_cores, + seed) { inject_lm_info_extractor <<- inherits( x = gof_model_info_extractor, what = "GOF_lm_info_extractor") @@ -209,7 +213,9 @@ GOF_model_uses_glm_info_extractor <- function() { # nolint y_name, Rn1_statistic, # nolint gof_model_info_extractor, - gof_model_resample) { + gof_model_resample, + n_cores, + seed) { inject_glm_info_extractor <<- inherits( x = gof_model_info_extractor, what = "GOF_glm_info_extractor") diff --git a/inst/tinytest/test_GOF_model_test.R b/inst/tinytest/test_GOF_model_test.R index ea815b2..af35410 100644 --- a/inst/tinytest/test_GOF_model_test.R +++ b/inst/tinytest/test_GOF_model_test.R @@ -11,24 +11,37 @@ GOF_model_test_necessary_input <- function() { # nolint expect_error( GOF_model_test$new( nmb_boot_samples = 1), + pattern = "n_cores") + expect_error( + GOF_model_test$new( + nmb_boot_samples = 1, + n_cores = 0), + pattern = ".*n_cores.*>= 1") + expect_error( + GOF_model_test$new( + nmb_boot_samples = 1, + n_cores = 1), pattern = "model") expect_error( GOF_model_test$new( model = "dummy", - nmb_boot_samples = 1), + nmb_boot_samples = 1, + n_cores = 1), pattern = "data") expect_error( GOF_model_test$new( model = "dummy", nmb_boot_samples = 1, - data = "dummy"), + data = "dummy", + n_cores = 1), pattern = "y_name") expect_error( GOF_model_test$new( model = "dummy", nmb_boot_samples = 1, data = "dummy", - y_name = "dummy"), + y_name = "dummy", + n_cores = 1), pattern = "Rn1_statistic") expect_error( GOF_model_test$new( @@ -36,7 +49,8 @@ GOF_model_test_necessary_input <- function() { # nolint nmb_boot_samples = 1, data = "dummy", y_name = "dummy", - Rn1_statistic = "dummy"), + Rn1_statistic = "dummy", + n_cores = 1), pattern = "gof_model_info_extractor") expect_error( GOF_model_test$new( @@ -45,8 +59,20 @@ GOF_model_test_necessary_input <- function() { # nolint data = "dummy", y_name = "dummy", Rn1_statistic = "dummy", - gof_model_info_extractor = "dummy"), + gof_model_info_extractor = "dummy", + n_cores = 1), pattern = "gof_model_resample") + expect_error( + GOF_model_test$new( + model = "dummy", + nmb_boot_samples = 1, + data = "dummy", + y_name = "dummy", + Rn1_statistic = "dummy", + gof_model_info_extractor = "dummy", + gof_model_resample = "dummy", + n_cores = 1), + pattern = "seed") } GOF_model_test_necessary_input() @@ -63,7 +89,9 @@ GOF_model_test_calc_Rn1_org <- function() { # nolint Rn1_statistic = "dummy", nmb_boot_samples = 10, gof_model_info_extractor = GOF_lm_info_extractor$new(), - gof_model_resample = "dummy") + gof_model_resample = "dummy", + n_cores = NULL, + seed = NULL) expect_equal( mt$get_Rn1_org(), Rn1_fun(r = residuals(fit), o = order(X)) @@ -98,7 +126,9 @@ GOF_model_test_calc_Rn1_boot <- function() { # nolint gof_model_resample = GOF_model_resample$new( gof_model_simulator = lm_sim_para_mock, gof_model_trainer = GOF_lm_trainer$new() - ) + ), + n_cores = NULL, + seed = NULL ) d1 <- d @@ -137,7 +167,10 @@ GOF_model_test_calc_pvalue <- function() { # nolint gof_model_info_extractor = GOF_lm_info_extractor$new(), gof_model_resample = GOF_model_resample$new( gof_model_simulator = GOF_lm_sim_param$new(), - gof_model_trainer = GOF_lm_trainer$new())) + gof_model_trainer = GOF_lm_trainer$new()), + n_cores = NULL, + seed = NULL + ) out <- mt$get_pvalue() expect_equal( out, { @@ -168,7 +201,10 @@ GOF_model_test_expect_small_pvalue <- function() { # nolint gof_model_info_extractor = GOF_glm_info_extractor$new(), gof_model_resample = GOF_model_resample$new( gof_model_simulator = GOF_glm_sim_param$new(), - gof_model_trainer = GOF_glm_trainer$new())) + gof_model_trainer = GOF_glm_trainer$new()), + n_cores = NULL, + seed = NULL + ) expect_equal(mt$get_pvalue(), 0) @@ -189,7 +225,10 @@ GOF_model_test_expect_small_pvalue <- function() { # nolint gof_model_simulator = GOF_sim_wild_rademacher$new( gof_model_info_extractor = ie ), - gof_model_trainer = GOF_lm_trainer$new())) + gof_model_trainer = GOF_lm_trainer$new()), + n_cores = NULL, + seed = NULL + ) expect_equal(mt$get_pvalue(), 0) } @@ -211,7 +250,10 @@ GOF_model_test_expect_non_small_pvalue <- function() { # nolint gof_model_info_extractor = GOF_glm_info_extractor$new(), gof_model_resample = GOF_model_resample$new( gof_model_simulator = GOF_glm_sim_param$new(), - gof_model_trainer = GOF_glm_trainer$new())) + gof_model_trainer = GOF_glm_trainer$new()), + n_cores = NULL, + seed = NULL + ) expect_equal(mt$get_pvalue(), 0.74) @@ -232,8 +274,221 @@ GOF_model_test_expect_non_small_pvalue <- function() { # nolint gof_model_simulator = GOF_sim_wild_rademacher$new( gof_model_info_extractor = ie ), - gof_model_trainer = GOF_lm_trainer$new())) + gof_model_trainer = GOF_lm_trainer$new()), + n_cores = NULL, + seed = NULL + ) expect_equal(mt$get_pvalue(), 0.8) } GOF_model_test_expect_non_small_pvalue() + + + +GOF_model_test_no_initial_RNG <- function() { # nolint + set.seed(1) + X <- rnorm(10) # nolint + Y <- 5 * X + rnorm(10) # nolint + d <- data.frame(y = Y, x = X) + fit <- lm(y ~ x, data = d) + + remove(.Random.seed, envir = .GlobalEnv) + + mt <- GOF_model_test$new( + model = fit, + nmb_boot_samples = 1, + data = d, + y_name = "y", + Rn1_statistic = "dummy", + gof_model_info_extractor = GOF_lm_info_extractor$new(), + gof_model_resample = GOF_model_resample$new( + gof_model_simulator = GOF_lm_sim_param$new(), + gof_model_trainer = GOF_lm_trainer$new()), + n_cores = 2, + seed = 1 + ) + out <- mt$get_Rn1_boot() + + reset.state <- .GlobalEnv$.Random.seed + RNGkind("default") + expect_equal(reset.state[1:2], .GlobalEnv$.Random.seed[1:2]) +} +GOF_model_test_no_initial_RNG() + +GOF_model_test_calc_Rn1_boot_2_cores <- function() { # nolint + set.seed(1) + X <- rnorm(10) # nolint + Y <- 5 * X + rnorm(10) # nolint + d <- data.frame(y = Y, x = X) + fit <- lm(y ~ x, data = d) + + model_resample_mock <- list(rnorm(10), rnorm(10), rnorm(10)) + MODEL_MOCK_NMB <- 0 # nolint + lm_sim_para_mock <- R6::R6Class( + public = list( + resample_y = function(model) { + MODEL_MOCK_NMB <<- MODEL_MOCK_NMB + 1 # nolint + model_resample_mock[[MODEL_MOCK_NMB]] + }))$new() + + mt <- GOF_model_test$new( + model = fit, + nmb_boot_samples = 3, + data = d, + y_name = "y", + Rn1_statistic = "dummy", + gof_model_info_extractor = GOF_lm_info_extractor$new(), + gof_model_resample = GOF_model_resample$new( + gof_model_simulator = lm_sim_para_mock, + gof_model_trainer = GOF_lm_trainer$new() + ), + n_cores = 2, + seed = 1 + ) + + d1 <- d + d1$y <- model_resample_mock[[1]] + fit1 <- lm(y~x, data = d1) + d2 <- d + d2$y <- model_resample_mock[[2]] + fit2 <- lm(y~x, data = d2) + out <- mt$get_Rn1_boot() + expect_equal( + out, + list( + Rn1_fun(r = residuals(fit1), o = order(X)), + Rn1_fun(r = residuals(fit1), o = order(X)), + Rn1_fun(r = residuals(fit2), o = order(X)) + )) +} +GOF_model_test_calc_Rn1_boot_2_cores() + +GOF_model_test_calc_pvalue_2_cores <- function() { # nolint + set.seed(1) + X <- rnorm(10) # nolint + Y <- 5 * X + rnorm(10) # nolint + d <- data.frame(y = Y, x = X) + fit <- lm(y ~ x, data = d) + KS <- Rn1_KS$new() # nolint + mt <- GOF_model_test$new( + model = fit, + data = d, + y_name = "y", + Rn1_statistic = KS, + nmb_boot_samples = 10, + gof_model_info_extractor = GOF_lm_info_extractor$new(), + gof_model_resample = GOF_model_resample$new( + gof_model_simulator = GOF_lm_sim_param$new(), + gof_model_trainer = GOF_lm_trainer$new()), + n_cores = 2, + seed = 1 + ) + out <- mt$get_pvalue() + expect_equal( + out, { + stat_org <- KS$calc_statistic(mt$get_Rn1_org()) + stat_boot <- sapply(mt$get_Rn1_boot(), KS$calc_statistic) + mean(stat_org < stat_boot) + } + ) +} +GOF_model_test_calc_pvalue_2_cores() + +GOF_model_test_expect_small_pvalue_2_cores <- function() { # nolint + set.seed(1) + X1 <- rnorm(100) # nolint + X2 <- rnorm(100) # nolint + d <- data.frame( + y = rpois(n = 100, lambda = exp(4 + X1 * 2 + X2 * 6)), + x1 = X1) + fit <- glm(y~x1, data = d, family = poisson()) + mt <- GOF_model_test$new( + model = fit, + data = d, + y_name = "y", + Rn1_statistic = Rn1_KS$new(), + nmb_boot_samples = 100, + gof_model_info_extractor = GOF_glm_info_extractor$new(), + gof_model_resample = GOF_model_resample$new( + gof_model_simulator = GOF_glm_sim_param$new(), + gof_model_trainer = GOF_glm_trainer$new()), + n_cores = 2, + seed = 1 + ) + + expect_equal(mt$get_pvalue(), 0) + + X1 <- rnorm(100) # nolint + d <- data.frame( + y = rnorm(n = 100, mean = 4 + X1^2), + x1 = X1) + fit <- lm(y~x1, data = d) + ie <- GOF_lm_info_extractor$new() + mt <- GOF_model_test$new( + model = fit, + data = d, + y_name = "y", + Rn1_statistic = Rn1_KS$new(), + nmb_boot_samples = 100, + gof_model_info_extractor = ie, + gof_model_resample = GOF_model_resample$new( + gof_model_simulator = GOF_sim_wild_rademacher$new( + gof_model_info_extractor = ie + ), + gof_model_trainer = GOF_lm_trainer$new()), + n_cores = 2, + seed = 1 + ) + + expect_equal(mt$get_pvalue(), 0) +} +GOF_model_test_expect_small_pvalue_2_cores() + +GOF_model_test_expect_non_small_pvalue_2_cores <- function() { # nolint + set.seed(1) + X1 <- rnorm(100) # nolint + d <- data.frame( + y = rpois(n = 100, lambda = exp(4 + X1 * 2)), + x1 = X1) + fit <- glm(y~x1, data = d, family = poisson()) + mt <- GOF_model_test$new( + model = fit, + data = d, + y_name = "y", + Rn1_statistic = Rn1_KS$new(), + nmb_boot_samples = 100, + gof_model_info_extractor = GOF_glm_info_extractor$new(), + gof_model_resample = GOF_model_resample$new( + gof_model_simulator = GOF_glm_sim_param$new(), + gof_model_trainer = GOF_glm_trainer$new()), + n_cores = 2, + seed = 1 + ) + + expect_equal(mt$get_pvalue(), 0.8) + + X1 <- rnorm(100) # nolint + d <- data.frame( + y = rnorm(n = 100, mean = 4 + X1 + X1^2), + x1 = X1) + fit <- lm(y~x1 + I(x1^2), data = d) + ie <- GOF_lm_info_extractor$new() + mt <- GOF_model_test$new( + model = fit, + data = d, + y_name = "y", + Rn1_statistic = Rn1_CvM$new(), + nmb_boot_samples = 100, + gof_model_info_extractor = ie, + gof_model_resample = GOF_model_resample$new( + gof_model_simulator = GOF_sim_wild_rademacher$new( + gof_model_info_extractor = ie + ), + gof_model_trainer = GOF_lm_trainer$new()), + n_cores = 2, + seed = 1 + ) + + expect_equal(mt$get_pvalue(), 0.92) +} +GOF_model_test_expect_non_small_pvalue_2_cores() diff --git a/vignettes/New-Models.Rmd b/vignettes/New-Models.Rmd index 58f53be..1495489 100644 --- a/vignettes/New-Models.Rmd +++ b/vignettes/New-Models.Rmd @@ -240,7 +240,9 @@ mt <- GOF_model_test$new( gof_model_resample = GOF_model_resample$new( gof_model_simulator = my_simulator, gof_model_trainer = my_trainer - ) + ), + n_cores = NULL, + seed = NULL ) mt$get_pvalue() ``` @@ -304,7 +306,9 @@ mt <- GOF_model_test$new( gof_model_resample = GOF_model_resample$new( gof_model_simulator = GOF_glm_sim_param$new(), gof_model_trainer = my_negbin_trainer$new() - ) + ), + n_cores = NULL, + seed = NULL ) mt$get_pvalue() ```