diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index 9ae0bec..f1131c4 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,4 +1,3 @@ -Version: 0.3.4 -Date: 2023-06-13 18:09:36 UTC -SHA: - dcb6aa72d97c124bb6194b5a5334c698d7062eb3 +Version: 0.4.0 +Date: 2023-06-30 18:30:53 UTC +SHA: 2daf0161a16357e90aea1fa42e0c452226a4f3fc diff --git a/DESCRIPTION b/DESCRIPTION index 7111f7a..3cb3632 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: cbcTools -Title: Design and Evaluate Choice-Based Conjoint Survey Experiments -Version: 0.3.4 +Title: A Simulation-Based Workflow to Design and Evaluate Choice-Based Conjoint Survey Experiments +Version: 0.4.0 Maintainer: John Helveston Authors@R: c( person(given = "John", @@ -8,7 +8,7 @@ Authors@R: c( role = c("cre", "aut", "cph"), email = "john.helveston@gmail.com", comment = c(ORCID = "0000-0002-2657-9191"))) -Description: Design and evaluate choice-based conjoint survey experiments in R. Generate survey designs, including randomized designs and Bayesian D-efficient designs as well as designs with "no choice" options and labeled designs. Conveniently inspect the design balance and overlap, and simulate choice data for a survey design either randomly or according to a multinomial or mixed logit utility model defined by user-provided prior parameters. Conduct power analyses on a survey design by estimating the same model multiple times using different subsets of the data to simulate different sample sizes. Choice simulation and model estimation are handled using the 'logitr' package, and Bayesian D-efficient designs are obtained using the 'idefix' package. For more details see Helveston (2023) and Traets et al (2020) . +Description: A simulation-based workflow to design and evaluate choice-based conjoint survey experiments. Generate a variety of survey designs, including full factorial designs, orthogonal designs, and Bayesian D-efficient designs as well as designs with "no choice" options and "labeled" (also known as "alternative specific") designs. Full factorial and orthogonal designs are obtained using the 'DoE.base' package (Grömping, 2018) . Bayesian D-efficient designs are obtained using the 'idefix' package (Traets et al, 2020) . Conveniently inspect the design balance and overlap, and simulate choice data for a survey design either randomly or according to a multinomial or mixed logit utility model defined by user-provided prior parameters. Conduct a power analysis for a given survey design by estimating the same model on different subsets of the data to simulate different sample sizes. Choice simulation and model estimation in power analyses are handled using the 'logitr' package (Helveston, 2023) . License: MIT + file LICENSE Encoding: UTF-8 LazyData: true @@ -20,7 +20,7 @@ Suggests: testthat, tibble Imports: - dplyr, + DoE.base, fastDummies, ggplot2, idefix, diff --git a/NAMESPACE b/NAMESPACE index e58bae9..11c2d24 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,11 +9,13 @@ export(cbc_overlap) export(cbc_power) export(cbc_profiles) export(cbc_restrict) +export(plot_compare_power) export(randLN) export(randN) importFrom(ggplot2,aes) importFrom(ggplot2,element_blank) importFrom(ggplot2,expand_limits) +importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_hline) importFrom(ggplot2,geom_point) importFrom(ggplot2,ggplot) diff --git a/R/choices.R b/R/choices.R index afcb9b0..77e8065 100644 --- a/R/choices.R +++ b/R/choices.R @@ -1,7 +1,9 @@ #' Simulate choices for a survey design #' #' Simulate choices for a survey design, either randomly or according to a -#' utility model defined by user-provided prior parameters. +#' utility model defined by user-provided prior parameters. All choices are +#' simulated using the 'logitr' package. For more details see the JSS article +#' on the 'logitr' package (Helveston, 2023). #' @keywords logitr mnl mxl mixed logit simulation #' #' @param design A data frame of a survey design. @@ -12,6 +14,9 @@ #' If `NULL` (the default), choices will be randomly assigned. #' @param n_draws The number of Halton draws to use for simulated choices #' for mixed logit models. Defaults to `100`. +#' @references +#' Helveston, J. P. (2023). logitr: Fast Estimation of Multinomial and Mixed Logit Models with Preference Space and Willingness-to-Pay Space Utility Parameterizations. Journal of Statistical Software, 105(10), 1–37, +#' \doi{10.18637/jss.v105.i10} #' @return Returns the `design` data frame with an additional `choice` column #' identifying the simulated choices. #' @export @@ -27,12 +32,13 @@ #' freshness = c('Poor', 'Average', 'Excellent') #' ) #' -#' # Make a randomized survey design +#' # Make a survey design from all possible profiles +#' # (This is the default setting where method = 'full' for "full factorial") #' design <- cbc_design( #' profiles = profiles, #' n_resp = 300, # Number of respondents -#' n_alts = 3, # Number of alternatives per question -#' n_q = 6 # Number of questions per respondent +#' n_alts = 3, # Number of alternatives per question +#' n_q = 6 # Number of questions per respondent #' ) #' #' # Simulate random choices diff --git a/R/design.R b/R/design.R index caf79b3..ce18a08 100644 --- a/R/design.R +++ b/R/design.R @@ -1,19 +1,20 @@ -#' Make a random or Bayesian D-efficient choice-based conjoint survey design +#' Make a choice-based conjoint survey design #' #' This function creates a data frame containing a choice-based conjoint survey -#' design where each row is an alternative. Designs can be either a -#' randomized or Bayesian D-efficient, in which case an implementation of the -#' CEA or Modfed Federov algorithm is used via the {idefix} package +#' design where each row is an alternative. Generate a variety of survey +#' designs, including full factorial designs, orthogonal designs, and +#' Bayesian D-efficient designs as well as designs with "no choice" options +#' and "labeled" (also known as "alternative specific") designs. #' -#' @keywords logitr mnl mxl mixed logit design +#' @keywords experiment design mnl mxl mixed logit logitr idefix DoE.base #' @param profiles A data frame in which each row is a possible profile. #' This can be generated using the `cbc_profiles()` function. #' @param n_resp Number of survey respondents. #' @param n_alts Number of alternatives per choice question. #' @param n_q Number of questions per respondent. -#' @param n_blocks Number of blocks used in Bayesian D-efficient design. -#' Max allowable is one block per respondent, defaults to `1`, meaning every -#' respondent sees the same set of choice questions. +#' @param n_blocks Number of blocks used in Orthogonal or Bayesian D-efficient +#' designs. Max allowable is one block per respondent. Defaults to `1`, meaning +#' every respondent sees the same choice set. #' @param n_draws Number of draws used in simulating the prior distribution #' used in Bayesian D-efficient designs. Defaults to `50`. #' @param n_start A numeric value indicating the number of random start designs @@ -26,22 +27,20 @@ #' @param label The name of the variable to use in a "labeled" design #' (also called an "alternative-specific design") such that each set of #' alternatives contains one of each of the levels in the `label` attribute. -#' Currently only compatible with randomized designs. If used, the `n_alts` -#' argument will be ignored as its value is defined by the unique number of -#' levels in the `label` variable. Defaults to `NULL`. +#' Currently not compatible with Bayesian D-efficient designs. If used, +#' the `n_alts` argument will be ignored as its value is defined by the unique +#' number of levels in the `label` variable. Defaults to `NULL`. +#' @param method Choose the design method to use: `"full"`, `"orthogonal"`, +#' `"CEA"` or `"Modfed"`. Defaults to `"full"`. See details below for complete +#' description of each method. #' @param priors A list of one or more assumed prior parameters used to -#' generate a Bayesian D-efficient design. If `NULL` (the default), a -#' randomized design will be generated. +#' generate a Bayesian D-efficient design. Defaults to `NULL` #' @param prior_no_choice Prior utility value for the "no choice" alternative. #' Only required if `no_choice = TRUE`. Defaults to `NULL`. #' @param probs If `TRUE`, for Bayesian D-efficient designs the resulting #' design includes average predicted probabilities for each alternative in each #' choice set given the sample from the prior preference distribution. #' Defaults to `FALSE`. -#' @param method Which method to use for obtaining a Bayesian D-efficient -#' design, `"CEA"` or `"Modfed"`? If `priors` are specified, it defaults to -#' `"CEA"`, otherwise it defaults to `NULL`. See `?idefix::CEA` and -#' `?idefix::Modfed` for more details. #' @param keep_db_error If `TRUE`, for Bayesian D-efficient designs the returned #' object will be a list containing the design and the DB-error score. #' Defaults to `FALSE`. @@ -49,7 +48,45 @@ #' iterations when searching for a Bayesian D-efficient design. The default is #' 50. #' @param parallel Logical value indicating whether computations should be done -#' over multiple cores. The default is `TRUE`. +#' over multiple cores. The default is `FALSE`. +#' @details +#' The `method` argument determines the design method used. Options are: +#' +#' - `"full"` +#' - `"orthogonal"` +#' - `"CEA"` +#' - `"Modfed"` +#' +#' The `"full"` method uses a "full factorial" design where choice sets are +#' created by randomly selecting from the full set of `profiles`. Blocking can +#' used with these designs where blocks are created from subsets of the full +#' factorial design. For more information about blocking with full factorial +#' designs, see `?DoE.base::fac.design` as well as the JSS article on the +#' {DoE.base} package (Grömping, 2018). +#' +#' The `"orthogonal"` method first finds an orthogonal array from the full +#' set of `profiles` (if possible), then randomly selects from it to create +#' choice sets. For some designs an orthogonal array can't be found, in which +#' case a full factorial design is used. This approach is also sometimes called +#' a "main effects" design since orthogonal arrays focus the information on the +#' main effects at the expense of information about interaction effects. For +#' more information about orthogonal designs, see `?DoE.base::oa.design` as +#' well as the JSS article on the {DoE.base} package +#' (Grömping, 2018). +#' +#' For Bayesian D-efficient designs, use `"CEA"` or `"Modfed"` along with +#' specified `priors`. If `method` is set to `"CEA"` or `"Modfed"` but without +#' `priors` specified, a prior of all `0`s will be used and a warning message +#' stating this will be shown. If you are using a restricted set of `profiles`, +#' only the `"Modfed"` method can be used as `"CEA"` requires unrestricted +#' `profiles`. For more details on Bayesian D-efficient designs, see +#' `?idefix::CEA` and `?idefix::Modfed` as well as the JSS article on the +#' {idefix} package (Traets et al, 2020). +#' @references +#' Grömping, U. (2018). R Package DoE.base for Factorial Experiments. Journal of Statistical Software, 85(5), 1–41, +#' \doi{10.18637/jss.v085.i05} +#' Traets, F., Sanchez, D. G., & Vandebroek, M. (2020). Generating Optimal Designs for Discrete Choice Experiments in R: The idefix Package. Journal of Statistical Software, 96(3), 1–41, +#' \doi{10.18637/jss.v096.i03} #' @return A data frame containing a choice-based conjoint survey design where #' each row is an alternative. #' @export @@ -65,16 +102,27 @@ #' freshness = c('Poor', 'Average', 'Excellent') #' ) #' -#' # Make a randomized survey design -#' design_rand <- cbc_design( +#' # Make a survey design from all possible profiles +#' # (This is the default setting where method = 'full' for "full factorial") +#' design_full <- cbc_design( #' profiles = profiles, #' n_resp = 300, # Number of respondents #' n_alts = 3, # Number of alternatives per question #' n_q = 6 # Number of questions per respondent #' ) #' -#' # Make a randomized survey design with a "no choice" option -#' design_rand_nochoice <- cbc_design( +#' # Make a survey design from an orthogonal array of profiles +#' design_ortho <- cbc_design( +#' profiles = profiles, +#' n_resp = 300, # Number of respondents +#' n_alts = 3, # Number of alternatives per question +#' n_q = 6, # Number of questions per respondent +#' method = 'orthogonal' +#' ) +#' +#' # Make a survey design from all possible profiles +#' # with a "no choice" option +#' design_full_nochoice <- cbc_design( #' profiles = profiles, #' n_resp = 300, # Number of respondents #' n_alts = 3, # Number of alternatives per question @@ -82,9 +130,9 @@ #' no_choice = TRUE #' ) #' -#' # Make a randomized labeled survey design with each "type" appearing in -#' # each choice question -#' design_rand_labeled <- cbc_design( +#' # Make a survey design from all possible profiles +#' # with each level of the "type" attribute appearing as an alternative +#' design_full_labeled <- cbc_design( #' profiles = profiles, #' n_resp = 300, # Number of respondents #' n_alts = 3, # Number of alternatives per question @@ -93,13 +141,13 @@ #' ) #' #' # Make a Bayesian D-efficient design with a prior model specified -#' # Note that by default parallel = TRUE. -#' design_deff <- cbc_design( +#' # Note that by speed can be improved by setting parallel = TRUE +#' design_bayesian <- cbc_design( #' profiles = profiles, #' n_resp = 300, # Number of respondents -#' n_alts = 3, # Number of alternatives per question -#' n_q = 6, # Number of questions per respondent -#' n_start = 1, +#' n_alts = 3, # Number of alternatives per question +#' n_q = 6, # Number of questions per respondent +#' n_start = 1, # Defauls to 5, set to 1 here for a quick example #' priors = list( #' price = -0.1, #' type = c(0.1, 0.2), @@ -115,24 +163,18 @@ cbc_design <- function( n_q, n_blocks = 1, n_draws = 50, - no_choice = FALSE, n_start = 5, + no_choice = FALSE, label = NULL, + method = "full", priors = NULL, prior_no_choice = NULL, probs = FALSE, - method = NULL, keep_db_error = FALSE, max_iter = 50, - parallel = TRUE + parallel = FALSE ) { - if (!is.null(priors)) { - if (is.null(method)) { - # Set default method to 'CEA' if priors are specified and - # user didn't specify a method. - method <- 'CEA' - } - } + method <- check_design_method(method, priors) check_inputs_design( profiles, n_resp, @@ -140,30 +182,30 @@ cbc_design <- function( n_q, n_blocks, n_draws, - no_choice, n_start, + no_choice, label, + method, priors, prior_no_choice, probs, - method, keep_db_error, max_iter, parallel ) profiles <- as.data.frame(profiles) # tibbles break things - if (is.null(priors)) { - design <- make_design_rand(profiles, n_resp, n_alts, n_q, no_choice, label) - } else if (!is.null(label)) { - warning( - "The use of the 'label' argument is currently only compatible with ", - "randomized designs, so the provided 'priors' are being ignored.\n" + if (method == 'full') { + design <- make_design_full( + profiles, n_resp, n_alts, n_q, n_blocks, no_choice, label + ) + } else if (method == 'orthogonal') { + design <- make_design_orthogonal( + profiles, n_resp, n_alts, n_q, no_choice, label ) - design <- make_design_rand(profiles, n_resp, n_alts, n_q, no_choice, label) } else { - design <- make_design_deff( - profiles, n_resp, n_alts, n_q, n_blocks, n_draws, no_choice, n_start, - label, priors, prior_no_choice, probs, method, keep_db_error, max_iter, + design <- make_design_bayesian( + profiles, n_resp, n_alts, n_q, n_blocks, n_draws, n_start, no_choice, + label, method, priors, prior_no_choice, probs, keep_db_error, max_iter, parallel ) } @@ -172,9 +214,13 @@ cbc_design <- function( return(design) } -# Randomized Design ---- +# Randomize the design ---- + +# Sample from profiles to create randomized choice sets -make_design_rand <- function(profiles, n_resp, n_alts, n_q, no_choice, label) { +get_randomized_design <- function( + profiles, n_resp, n_alts, n_q, no_choice, label +) { if (is.null(label)) { design <- get_design_rand(profiles, n_resp, n_alts, n_q) } else { @@ -339,14 +385,6 @@ get_col_types <- function(data) { return(unlist(lapply(types, test))) } -get_type_ids <- function(profile_lvls) { - types <- get_col_types(profile_lvls) - ids <- list() - ids$discrete <- types %in% c("factor", "character") - ids$continuous <- !ids$discrete - return(ids) -} - reorder_cols <- function(design) { metaNames <- c("profileID", "respID", "qID", "altID", "obsID") varNames <- setdiff(names(design), metaNames) @@ -354,46 +392,103 @@ reorder_cols <- function(design) { return(design) } -# Bayesian D-efficient Design ---- +# Full Factorial Design ---- -make_design_deff <- function( - profiles, n_resp, n_alts, n_q, n_blocks, n_draws, no_choice, n_start, - label, priors, prior_no_choice, probs, method, keep_db_error, max_iter, - parallel +make_design_full <- function( + profiles, n_resp, n_alts, n_q, n_blocks, no_choice, label ) { - # Set up initial parameters for creating design + if (n_blocks > 1) { + design <- make_design_full_blocked( + profiles, n_resp, n_alts, n_q, n_blocks, no_choice, label + ) + } else { + design <- get_randomized_design( + profiles, n_resp, n_alts, n_q, no_choice, label + ) + } + return(design) +} - # Make sure order of priors matches order of attributes in profiles - profile_lvls <- profiles[, 2:ncol(profiles)] - varnames <- names(profile_lvls) - priors <- priors[varnames] +make_design_full_blocked <- function( + profiles, n_resp, n_alts, n_q, n_blocks, no_choice, label +) { + # Make blocks + design <- suppressMessages(as.data.frame( + DoE.base::fac.design( + factor.names = get_profile_list(profiles), + blocks = n_blocks, + block.name = "blockID" + ) + )) + design$blockID <- as.numeric(as.character(design$blockID)) + design <- design[,c(names(profiles)[2:ncol(profiles)], "blockID")] + type_ids <- get_type_ids(profiles) + profiles <- join_profiles(design, profiles, type_ids) + + # Randomize design within each block + profiles <- split(profiles, profiles$blockID) + # Make sure number of respondents divides well into blocks + n_resp_list <- rep(n_resp / n_blocks, n_blocks) + if (! all(n_resp_list %% 1 == 0)) { + n_resp_list <- floor(n_resp_list) + n_resp_list[n_blocks] <- n_resp_list[n_blocks] + 1 + } + design <- list() + for (i in 1:n_blocks) { + design[[i]] <- get_randomized_design( + profiles[[i]], n_resp_list[i], n_alts, n_q, no_choice, label + ) + } + design <- do.call(rbind, design) + design <- add_metadata(design, n_resp, n_alts, n_q) + return(design) +} - # Set up priors - mu <- unlist(priors) - if (no_choice) { - mu <- c(prior_no_choice, mu) +# Orthogonal Design ---- + +make_design_orthogonal <- function( + profiles, n_resp, n_alts, n_q, no_choice, label +) { + oa <- suppressMessages(as.data.frame( + DoE.base::oa.design( + factor.names = get_profile_list(profiles) + ) + )) + if (nrow(oa) == nrow(profiles)) { + message("No orthogonal array found; using full factorial for design") + } else { + message( + "Orthogonal array found; using ", nrow(oa), " out of ", + nrow(profiles), " profiles for design" + ) } + type_ids <- get_type_ids(profiles) + oa <- join_profiles(oa, profiles, type_ids) + design <- get_randomized_design(oa, n_resp, n_alts, n_q, no_choice, label) + return(design) +} +# Bayesian D-efficient Design ---- + +make_design_bayesian <- function( + profiles, n_resp, n_alts, n_q, n_blocks, n_draws, n_start, no_choice, + label, method, priors, prior_no_choice, probs, keep_db_error, max_iter, + parallel +) { # Set up levels and coding - ids <- get_type_ids(profile_lvls) - lvl.names <- list() - for (i in seq_len(ncol(profile_lvls))) { - if (ids$discrete[i]) { - lvl.names[[i]] <- levels(profile_lvls[,i]) - } else { - lvl.names[[i]] <- unique(profile_lvls[,i]) - } - } + profile_list <- get_profile_list(profiles) + type_ids <- get_type_ids(profiles) + lvl.names <- unname(profile_list) lvls <- unname(unlist(lapply(lvl.names, function(x) length(x)))) coding <- rep("C", length(lvls)) c.lvls <- NULL - if (any(ids$continuous)) { - c.lvls <- lvl.names[ids$continuous] + if (any(type_ids$continuous)) { + c.lvls <- lvl.names[type_ids$continuous] } # lvl.names must be all characters for decoding process lvl.names <- lapply(lvl.names, function(x) as.character(x)) - if (any(ids$discrete)) { - coding[ids$discrete] <- "D" + if (any(type_ids$discrete)) { + coding[type_ids$discrete] <- "D" } no_choice_alt <- NULL alt_cte <- rep(0, n_alts) @@ -402,6 +497,24 @@ make_design_deff <- function( alt_cte <- c(alt_cte, 1) no_choice_alt <- n_alts } + + # Setup priors + profile_lvls <- profiles[, 2:ncol(profiles)] + varnames <- names(profile_lvls) + if (is.null(priors)) { + # No priors specified, so use all 0s + warning( + 'Since the ', method, ' method is used but no priors were ', + 'specified, a zero prior will be used (all coefficients set to 0)' + ) + priors <- lapply(profile_list, function(x) rep(0, length(x) - 1)) + priors[type_ids$continuous] <- 0 + } + # Make sure order of priors matches order of attributes in profiles + mu <- unlist(priors[varnames]) + if (no_choice) { + mu <- c(prior_no_choice, mu) + } sigma <- diag(length(mu)) par_draws <- MASS::mvrnorm(n = n_draws, mu = mu, Sigma = sigma) n_alt_cte <- sum(alt_cte) @@ -412,18 +525,15 @@ make_design_deff <- function( } # Make the design - profiles_restricted <- nrow(expand.grid(lvl.names)) > nrow(profiles) - if (profiles_restricted & (method == "CEA")) { # "CEA" method only works with unrestricted profile set method <- "Modfed" warning( 'The "CEA" algorithm requires the use of an unrestricted set of ', - 'profiles, so "Modfed" is being used instead.' + 'profiles, so "Modfed" is being used instead.\n' ) } - if (method == "CEA") { D <- idefix::CEA( lvls = lvls, @@ -440,7 +550,7 @@ make_design_deff <- function( } else { D <- idefix::Modfed( cand.set = defineCandidateSet( - lvls, coding, c.lvls, profile_lvls, ids, profiles_restricted + lvls, coding, c.lvls, profile_lvls, type_ids, profiles_restricted ), par.draws = par_draws, n.alts = n_alts, @@ -466,9 +576,9 @@ make_design_deff <- function( # Join on profileIDs to design design <- design_raw$design names(design) <- varnames - design <- join_profiles(design, profiles, varnames, ids) + design <- join_profiles(design, profiles, type_ids) if (no_choice) { - design <- add_no_choice_deff(design, n_alts, varnames[ids$discrete]) + design <- add_no_choice_deff(design, n_alts, varnames[type_ids$discrete]) } # Include probs? @@ -489,7 +599,7 @@ make_design_deff <- function( design <- add_metadata(design, n_resp, n_alts, n_q) design <- reorder_cols(design) - # Print error + # Print DB error message( "Bayesian D-efficient design found with DB-error of ", round(D$error, 5) @@ -503,8 +613,32 @@ make_design_deff <- function( return(design) } +get_type_ids <- function(profiles) { + types <- get_col_types(profiles[, 2:ncol(profiles)]) + ids <- list() + ids$discrete <- types %in% c("factor", "character") + ids$continuous <- !ids$discrete + return(ids) +} + +get_profile_list <- function(profiles) { + profile_lvls <- profiles[, 2:ncol(profiles)] + varnames <- names(profile_lvls) + type_ids <- get_type_ids(profiles) + profile_list <- list() + for (i in seq_len(ncol(profile_lvls))) { + if (type_ids$discrete[i]) { + profile_list[[i]] <- levels(profile_lvls[,i]) + } else { + profile_list[[i]] <- unique(profile_lvls[,i]) + } + } + names(profile_list) <- varnames + return(profile_list) +} + defineCandidateSet <- function( - lvls, coding, c.lvls, profile_lvls, ids, profiles_restricted + lvls, coding, c.lvls, profile_lvls, type_ids, profiles_restricted ) { # Make candidate set with profiles, assuming non-restricted cand_set <- idefix::Profiles( @@ -518,7 +652,7 @@ defineCandidateSet <- function( # including restricted profiles cand_set_res <- fastDummies::dummy_cols( profile_lvls, - select_columns = names(profile_lvls)[ids$discrete], + select_columns = names(profile_lvls)[type_ids$discrete], remove_first_dummy = TRUE, remove_selected_columns = TRUE ) @@ -540,26 +674,31 @@ defineCandidateSet <- function( return(cand_set_res) } -join_profiles <- function(design, profiles, varnames, ids) { +join_profiles <- function(design, profiles, type_ids) { # Replaces the generated design with rows from profiles, which ensures # factor levels in profiles are maintained in design # Keep track of row order in design design$row_id <- seq(nrow(design)) + # Before joining profiles, ensure that all the data types are the same + # as in profiles, otherwise join won't work properly + # Convert numeric columns to actual numbers - for (id in which(ids$continuous)) { - design[,id] <- as.numeric(design[,id]) + for (id in which(type_ids$continuous)) { + design[,id] <- as.numeric(as.character(design[,id])) } # Convert character types to factors and set same levels as profiles - for (id in which(ids$discrete)) { + for (id in which(type_ids$discrete)) { design[,id] <- factor(design[,id], levels = levels(profiles[,id+1])) } # Join on profileIDs, then reorder to retain design order + varnames <- names(profiles[, 2:ncol(profiles)]) design <- merge(design, profiles, by = varnames, all.x = TRUE) design <- design[order(design$row_id),] + if ('blockID' %in% names(design)) { varnames <- c(varnames, 'blockID') } design <- design[c('profileID', varnames)] return(design) } diff --git a/R/input_checks.R b/R/input_checks.R index ef5b50c..b9308db 100644 --- a/R/input_checks.R +++ b/R/input_checks.R @@ -3,7 +3,10 @@ check_inputs_profiles <- function(levels) { check_vector <- !is.vector(levels[[i]]) check_name <- is.null(names(levels)[i]) if (check_vector | check_name) { - stop("Each item in ... must be a named vector where the names are attributes and the values in the vector are levels of that attribute.") + stop( + 'Each item in "..." must be a named vector where the names are ', + 'attributes and the values in the vector are levels of that attribute' + ) } } } @@ -11,18 +14,34 @@ check_inputs_profiles <- function(levels) { check_inputs_restrict <- function(profiles) { # Check if profiles is a data frame if (!is.data.frame(profiles)) { - stop("The 'profiles' argument must be a data frame.") + stop('The "profiles" argument must be a data frame.') } # Check if profiles has been created by the cbc_profiles function if (!"profileID" %in% colnames(profiles)) { stop( - "The 'profiles' data frame must be created using the 'cbc_profiles' function ", - "and contain the 'profileID' variable." + 'The "profiles" data frame must be created using the "cbc_profiles()"', + 'function and contain the "profileID" variable.' ) } } +check_design_method <- function(method, priors) { + if (!is.null(priors)) { + if (! method_is_bayesian(method)) { + # Set method to 'CEA' if priors are specified and + # user didn't specify an appropriate method. + warning( + 'Since "priors" are specified, the "method" must be either "CEA" ', + 'or "Modfed". The specified "method" is being ignored and set to ', + '"CEA"\n' + ) + method <- 'CEA' + } + } + return(method) +} + check_inputs_design <- function( profiles, n_resp, @@ -30,17 +49,22 @@ check_inputs_design <- function( n_q, n_blocks, n_draws, - no_choice, n_start, + no_choice, label, + method, priors, prior_no_choice, probs, - method, keep_db_error, max_iter, parallel ) { + + if (n_blocks < 1) { + stop('n_blocks must be greater than or equal to 1') + } + if (n_blocks > n_resp) { stop("Maximum allowable number of blocks is one block per respondent") } @@ -48,14 +72,30 @@ check_inputs_design <- function( # If using a Bayesian D-efficient design with a no choice option, user must # specify a value for prior_no_choice if (no_choice) { - if (!is.null(priors) & is.null(prior_no_choice)) { - stop( - "If 'no_choice = TRUE', you must specify the prior utility ", - 'value for the "no choice" option using prior_no_choice' - ) + if (!is.null(priors) & is.null(prior_no_choice)) { + stop( + 'If "no_choice = TRUE", you must specify the prior utility ', + 'value for the "no choice" option using prior_no_choice' + ) + } + } - } + # The labeled design isn't yet supported for Bayesian D-efficient designs + if (!is.null(label) & method_is_bayesian(method)) { + stop( + 'The use of the "label" argument is currently not compatible with ', + 'Bayesian D-efficient designs' + ) + } + + # Check that an appropriate method is used + + if (! method %in% c('full', 'orthogonal', 'CEA', 'Modfed')) { + stop( + 'The "method" argument must be set to "full", "orthogonal", ', + '"Modfed", or "CEA"' + ) } # Check that priors are appropriate if specified @@ -63,8 +103,13 @@ check_inputs_design <- function( if (!is.null(priors)) { # Check that user specified an appropriate method - if ((method != "CEA") & (method != "Modfed")) { - stop('The method argument must be either "Modfed" or "CEA"') + # This should already be handled + if (! method_is_bayesian(method)) { + stop( + 'Since "priors" are specified, the "method" argument must ', + 'be either "Modfed" or "CEA" to obtain a Bayesian ', + 'D-efficient design' + ) } # Check that prior names aren't missing @@ -73,28 +118,28 @@ check_inputs_design <- function( missing <- setdiff(names(profile_lvls), prior_names) if (length(missing) > 0) { stop( - "'priors' is missing the following variables: \n\n", + '"priors" is missing the following variables: \n\n', paste(missing, collapse = "\n") ) } # Check that prior levels aren't missing - ids <- get_type_ids(profile_lvls) - for (id in which(ids$discrete)) { + type_ids <- get_type_ids(profiles) + for (id in which(type_ids$discrete)) { n_lvls <- length(unique(profile_lvls[,id])) - 1 if (length(priors[[id]]) != n_lvls) { stop( - "Invalid number of values provided in 'priors' for the '", - prior_names[id], "' attribute. Please provide ", n_lvls, - " values" + 'Invalid number of values provided in "priors" for the "', + prior_names[id], '" attribute. Please provide ', n_lvls, + ' values' ) } } - for (id in which(ids$continuous)) { + for (id in which(type_ids$continuous)) { if (length(priors[[id]]) != 1) { stop( - "Invalid number of values provided in 'priors' for the '", - prior_names[id], "' attribute. Please provide 1 value" + 'Invalid number of values provided in "priors" for the "', + prior_names[id], '" attribute. Please provide 1 value' ) } } @@ -103,12 +148,12 @@ check_inputs_design <- function( # Check that the number of alternatives per observation is larger than # the number of unique profiles if (n_alts > nrow(profiles)) { - stop( - "The number of alternatives per observation, specified by n_alts, ", - "is larger than the number of unique profiles. Either decrease ", - "n_alts to be less than ", nrow(profiles), " or add more ", - "attributes / levels to increase the number of profiles." - ) + stop( + 'The number of alternatives per observation, specified by "n_alts", ', + "is larger than the number of unique profiles. Either decrease ", + '"n_alts" to be less than ', nrow(profiles), " or add more ", + "attributes / levels to increase the number of profiles." + ) } # Check that number of questions per respondents is larger than the @@ -125,13 +170,13 @@ check_inputs_design <- function( ncomb <- choose(n, k) # More robust # ncomb <- factorial(n) / (factorial(k)*(factorial(n-k))) if (n_q > ncomb) { - stop( - "The number of questions per respondent, specified by n_q, ", - "is larger than the number of unique sets of choice sets. ", - "You can correct this by decreasing n_q to be less than ", - ncomb, ", decreasing n_alts, or add more attributes / levels ", - "to increase the number of choice set combinations." - ) + stop( + 'The number of questions per respondent, specified by "n_q", ', + "is larger than the number of unique sets of choice sets. ", + 'You can correct this by decreasing "n_q" to be less than ', + ncomb, ', decreasing "n_alts", or add more attributes / levels ', + "to increase the number of choice set combinations." + ) } } } diff --git a/R/inspect.R b/R/inspect.R index 4982187..7cc4af5 100644 --- a/R/inspect.R +++ b/R/inspect.R @@ -20,12 +20,13 @@ #' freshness = c('Poor', 'Average', 'Excellent') #' ) #' -#' # Make a randomized survey design +#' # Make a survey design from all possible profiles +#' # (This is the default setting where method = 'full' for "full factorial") #' design <- cbc_design( #' profiles = profiles, #' n_resp = 300, # Number of respondents -#' n_alts = 3, # Number of alternatives per question -#' n_q = 6 # Number of questions per respondent +#' n_alts = 3, # Number of alternatives per question +#' n_q = 6 # Number of questions per respondent #' ) #' #' # Inspect the design balance diff --git a/R/methods.R b/R/methods.R index 28f97dd..e09425c 100644 --- a/R/methods.R +++ b/R/methods.R @@ -51,12 +51,13 @@ print.cbc_models <- function ( #' freshness = c('Poor', 'Average', 'Excellent') #' ) #' -#' # Make a randomized survey design +#' # Make a survey design from all possible profiles +#' # (This is the default setting where method = 'full' for "full factorial") #' design <- cbc_design( #' profiles = profiles, #' n_resp = 300, # Number of respondents -#' n_alts = 3, # Number of alternatives per question -#' n_q = 6 # Number of questions per respondent +#' n_alts = 3, # Number of alternatives per question +#' n_q = 6 # Number of questions per respondent #' ) #' #' # Simulate random choices diff --git a/R/power.R b/R/power.R index 79fbc5b..c49fad0 100644 --- a/R/power.R +++ b/R/power.R @@ -7,7 +7,8 @@ #' level of statistical power on each coefficient. The number of models to #' estimate is set by the `nbreaks` argument, which breaks up the data into #' groups of increasing sample sizes. All models are estimated models using -#' the {logitr} package. For more details see Helveston (2023) \doi{10.18637/jss.v105.i10}. +#' the 'logitr' package. For more details see the JSS article on the 'logitr' +#' package (Helveston, 2023). #' @keywords logitr mnl mxl mixed logit sample size power #' #' @param data The data, formatted as a `data.frame` object. @@ -44,6 +45,9 @@ #' @param ... Other arguments that are passed to `logitr::logitr()` for model #' estimation. See the {logitr} documentation for details about other #' available arguments. +#' @references +#' Helveston, J. P. (2023). logitr: Fast Estimation of Multinomial and Mixed Logit Models with Preference Space and Willingness-to-Pay Space Utility Parameterizations. Journal of Statistical Software, 105(10), 1–37, +#' \doi{10.18637/jss.v105.i10} #' @return Returns a data frame of estimated model coefficients and standard #' errors for the same model estimated on subsets of the `data` with increasing #' sample sizes. @@ -60,12 +64,13 @@ #' freshness = c('Poor', 'Average', 'Excellent') #' ) #' -#' # Make a randomized survey design +#' # Make a survey design from all possible profiles +#' # (This is the default setting where method = 'full' for "full factorial") #' design <- cbc_design( #' profiles = profiles, #' n_resp = 300, # Number of respondents -#' n_alts = 3, # Number of alternatives per question -#' n_q = 6 # Number of questions per respondent +#' n_alts = 3, # Number of alternatives per question +#' n_q = 6 # Number of questions per respondent #' ) #' #' # Simulate random choices @@ -230,3 +235,83 @@ extract_errors <- function(models) { row.names(results) <- NULL return(results) } + +#' Plot a comparison of different design powers +#' +#' This function creates a ggplot2 object comparing the power curves of +#' different designs. Each design is color coded and each facet (sub plot) +#' is a model coefficient. +#' @param ... Any number of data frame containing power results obtained from +#' the `cbc_power()` function, separated by commas. +#' @return A plot comparing the power curves of different designs. +#' @importFrom ggplot2 ggplot aes geom_hline geom_point expand_limits theme_bw +#' theme element_blank labs facet_wrap +#' @importFrom rlang .data +#' @export +#' @examples +#' library(cbcTools) +#' +#' # Generate all possible profiles +#' profiles <- cbc_profiles( +#' price = c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5), +#' type = c("Fuji", "Gala", "Honeycrisp"), +#' freshness = c('Poor', 'Average', 'Excellent') +#' ) +#' +#' # Make designs to compare: full factorial vs bayesian d-efficient +#' design_full <- cbc_design( +#' profiles = profiles, +#' n_resp = 300, n_alts = 3, n_q = 6 +#' ) +#' # Same priors will be used in bayesian design and simulated choices +#' priors <- list( +#' price = -0.1, +#' type = c(0.1, 0.2), +#' freshness = c(0.1, 0.2) +#' ) +#' design_bayesian <- cbc_design( +#' profiles = profiles, +#' n_resp = 300, n_alts = 3, n_q = 6, n_start = 1, method = "CEA", +#' priors = priors, parallel = FALSE +#' ) +#' +#' # Obtain power for each design by simulating choices +#' power_full <- design_full |> +#' cbc_choices(obsID = "obsID", priors = priors) |> +#' cbc_power( +#' pars = c("price", "type", "freshness"), +#' outcome = "choice", obsID = "obsID", nbreaks = 10, n_q = 6, n_cores = 2 +#' ) +#' power_bayesian <- design_bayesian |> +#' cbc_choices(obsID = "obsID", priors = priors) |> +#' cbc_power( +#' pars = c("price", "type", "freshness"), +#' outcome = "choice", obsID = "obsID", nbreaks = 10, n_q = 6, n_cores = 2 +#' ) +#' +#' # Compare power of each design +#' plot_compare_power(power_bayesian, power_full) +plot_compare_power <- function(...) { + power <- list(...) + design_names <- unlist(lapply(as.list(match.call())[-1], deparse)) + names(power) <- design_names + for (i in 1:length(power)) { + power[[i]]$design <- names(power)[i] + } + power <- do.call(rbind, power) + ggplot2::ggplot(power) + + geom_hline(yintercept = 0.05, color = "red", linetype = 2) + + geom_point( + aes(x = .data$sampleSize, y = .data$se, color = .data$design), + size = 1.8 + ) + + facet_wrap(~.data$coef) + + expand_limits(y = 0) + + theme_bw(base_size = 14) + + theme(panel.grid.minor = element_blank()) + + labs( + color = "Design", + x = "Sample size", + y = "Standard error" + ) +} diff --git a/R/profiles.R b/R/profiles.R index 81ab306..3f3a4dc 100644 --- a/R/profiles.R +++ b/R/profiles.R @@ -59,12 +59,28 @@ cbc_profiles <- function(...) { #' type == "Honeycrisp" & freshness == "Poor", #' type == "Fuji" & freshness == "Excellent" #' ) + cbc_restrict <- function(profiles, ...) { check_inputs_restrict(profiles) + + # drop_ids <- unique(unlist(lapply( + # rlang::enquos(...), + # function(x) { + # dplyr::filter(profiles, !!x) |> dplyr::pull(.data$profileID) + # } + # ))) + + # Came up with a different approach to avoid the {dplyr} dependency + drop_ids <- unique(unlist(lapply( - rlang::enquos(...), - function(x) dplyr::filter(profiles, !!x) |> dplyr::pull(.data$profileID) + rlang::enexprs(...), + function(x) { + subset_ids <- subset(profiles, eval(x), select = c("profileID")) + as.character(subset_ids$profileID) + } ))) + drop_ids <- which(profiles$profileID %in% drop_ids) + profiles <- profiles[-drop_ids,] profiles <- add_profile_ids(profiles) return(profiles) diff --git a/R/tools.R b/R/tools.R index 1e69262..a1ffd17 100644 --- a/R/tools.R +++ b/R/tools.R @@ -10,3 +10,7 @@ "Please cite the package in your publications, see:\ncitation(\"cbcTools\")" ) } + +method_is_bayesian <- function(method) { + return(method %in% c('CEA', 'Modfed')) +} diff --git a/man/cbc_balance.Rd b/man/cbc_balance.Rd index 898c5da..3c05504 100644 --- a/man/cbc_balance.Rd +++ b/man/cbc_balance.Rd @@ -29,12 +29,13 @@ profiles <- cbc_profiles( freshness = c('Poor', 'Average', 'Excellent') ) -# Make a randomized survey design +# Make a survey design from all possible profiles +# (This is the default setting where method = 'full' for "full factorial") design <- cbc_design( profiles = profiles, n_resp = 300, # Number of respondents - n_alts = 3, # Number of alternatives per question - n_q = 6 # Number of questions per respondent + n_alts = 3, # Number of alternatives per question + n_q = 6 # Number of questions per respondent ) # Inspect the design balance diff --git a/man/cbc_choices.Rd b/man/cbc_choices.Rd index b0d3746..70190c2 100644 --- a/man/cbc_choices.Rd +++ b/man/cbc_choices.Rd @@ -25,7 +25,9 @@ identifying the simulated choices. } \description{ Simulate choices for a survey design, either randomly or according to a -utility model defined by user-provided prior parameters. +utility model defined by user-provided prior parameters. All choices are +simulated using the 'logitr' package. For more details see the JSS article +on the 'logitr' package (Helveston, 2023). } \examples{ library(cbcTools) @@ -39,12 +41,13 @@ profiles <- cbc_profiles( freshness = c('Poor', 'Average', 'Excellent') ) -# Make a randomized survey design +# Make a survey design from all possible profiles +# (This is the default setting where method = 'full' for "full factorial") design <- cbc_design( profiles = profiles, n_resp = 300, # Number of respondents - n_alts = 3, # Number of alternatives per question - n_q = 6 # Number of questions per respondent + n_alts = 3, # Number of alternatives per question + n_q = 6 # Number of questions per respondent ) # Simulate random choices @@ -87,6 +90,10 @@ data <- cbc_choices( ) ) } +\references{ +Helveston, J. P. (2023). logitr: Fast Estimation of Multinomial and Mixed Logit Models with Preference Space and Willingness-to-Pay Space Utility Parameterizations. Journal of Statistical Software, 105(10), 1–37, +\doi{10.18637/jss.v105.i10} +} \keyword{logit} \keyword{logitr} \keyword{mixed} diff --git a/man/cbc_design.Rd b/man/cbc_design.Rd index e3fad94..8b1b48b 100644 --- a/man/cbc_design.Rd +++ b/man/cbc_design.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/design.R \name{cbc_design} \alias{cbc_design} -\title{Make a random or Bayesian D-efficient choice-based conjoint survey design} +\title{Make a choice-based conjoint survey design} \usage{ cbc_design( profiles, @@ -11,16 +11,16 @@ cbc_design( n_q, n_blocks = 1, n_draws = 50, - no_choice = FALSE, n_start = 5, + no_choice = FALSE, label = NULL, + method = "full", priors = NULL, prior_no_choice = NULL, probs = FALSE, - method = NULL, keep_db_error = FALSE, max_iter = 50, - parallel = TRUE + parallel = FALSE ) } \arguments{ @@ -33,32 +33,35 @@ This can be generated using the \code{cbc_profiles()} function.} \item{n_q}{Number of questions per respondent.} -\item{n_blocks}{Number of blocks used in Bayesian D-efficient design. -Max allowable is one block per respondent, defaults to \code{1}, meaning every -respondent sees the same set of choice questions.} +\item{n_blocks}{Number of blocks used in Orthogonal or Bayesian D-efficient +designs. Max allowable is one block per respondent. Defaults to \code{1}, meaning +every respondent sees the same choice set.} \item{n_draws}{Number of draws used in simulating the prior distribution used in Bayesian D-efficient designs. Defaults to \code{50}.} -\item{no_choice}{Include a "no choice" option in the choice sets? Defaults -to \code{FALSE}. If \code{TRUE}, the total number of alternatives per question will be -one more than the provided \code{n_alts} argument.} - \item{n_start}{A numeric value indicating the number of random start designs to use in obtaining a Bayesian D-efficient design. The default is \code{5}. Increasing \code{n_start} can result in a more efficient design at the expense of increased computational time.} +\item{no_choice}{Include a "no choice" option in the choice sets? Defaults +to \code{FALSE}. If \code{TRUE}, the total number of alternatives per question will be +one more than the provided \code{n_alts} argument.} + \item{label}{The name of the variable to use in a "labeled" design (also called an "alternative-specific design") such that each set of alternatives contains one of each of the levels in the \code{label} attribute. -Currently only compatible with randomized designs. If used, the \code{n_alts} -argument will be ignored as its value is defined by the unique number of -levels in the \code{label} variable. Defaults to \code{NULL}.} +Currently not compatible with Bayesian D-efficient designs. If used, +the \code{n_alts} argument will be ignored as its value is defined by the unique +number of levels in the \code{label} variable. Defaults to \code{NULL}.} + +\item{method}{Choose the design method to use: \code{"full"}, \code{"orthogonal"}, +\code{"CEA"} or \code{"Modfed"}. Defaults to \code{"full"}. See details below for complete +description of each method.} \item{priors}{A list of one or more assumed prior parameters used to -generate a Bayesian D-efficient design. If \code{NULL} (the default), a -randomized design will be generated.} +generate a Bayesian D-efficient design. Defaults to \code{NULL}} \item{prior_no_choice}{Prior utility value for the "no choice" alternative. Only required if \code{no_choice = TRUE}. Defaults to \code{NULL}.} @@ -68,11 +71,6 @@ design includes average predicted probabilities for each alternative in each choice set given the sample from the prior preference distribution. Defaults to \code{FALSE}.} -\item{method}{Which method to use for obtaining a Bayesian D-efficient -design, \code{"CEA"} or \code{"Modfed"}? If \code{priors} are specified, it defaults to -\code{"CEA"}, otherwise it defaults to \code{NULL}. See \code{?idefix::CEA} and -\code{?idefix::Modfed} for more details.} - \item{keep_db_error}{If \code{TRUE}, for Bayesian D-efficient designs the returned object will be a list containing the design and the DB-error score. Defaults to \code{FALSE}.} @@ -82,7 +80,7 @@ iterations when searching for a Bayesian D-efficient design. The default is 50.} \item{parallel}{Logical value indicating whether computations should be done -over multiple cores. The default is \code{TRUE}.} +over multiple cores. The default is \code{FALSE}.} } \value{ A data frame containing a choice-based conjoint survey design where @@ -90,9 +88,45 @@ each row is an alternative. } \description{ This function creates a data frame containing a choice-based conjoint survey -design where each row is an alternative. Designs can be either a -randomized or Bayesian D-efficient, in which case an implementation of the -CEA or Modfed Federov algorithm is used via the {idefix} package +design where each row is an alternative. Generate a variety of survey +designs, including full factorial designs, orthogonal designs, and +Bayesian D-efficient designs as well as designs with "no choice" options +and "labeled" (also known as "alternative specific") designs. +} +\details{ +The \code{method} argument determines the design method used. Options are: +\itemize{ +\item \code{"full"} +\item \code{"orthogonal"} +\item \code{"CEA"} +\item \code{"Modfed"} +} + +The \code{"full"} method uses a "full factorial" design where choice sets are +created by randomly selecting from the full set of \code{profiles}. Blocking can +used with these designs where blocks are created from subsets of the full +factorial design. For more information about blocking with full factorial +designs, see \code{?DoE.base::fac.design} as well as the JSS article on the +{DoE.base} package (Grömping, 2018). + +The \code{"orthogonal"} method first finds an orthogonal array from the full +set of \code{profiles} (if possible), then randomly selects from it to create +choice sets. For some designs an orthogonal array can't be found, in which +case a full factorial design is used. This approach is also sometimes called +a "main effects" design since orthogonal arrays focus the information on the +main effects at the expense of information about interaction effects. For +more information about orthogonal designs, see \code{?DoE.base::oa.design} as +well as the JSS article on the {DoE.base} package +(Grömping, 2018). + +For Bayesian D-efficient designs, use \code{"CEA"} or \code{"Modfed"} along with +specified \code{priors}. If \code{method} is set to \code{"CEA"} or \code{"Modfed"} but without +\code{priors} specified, a prior of all \code{0}s will be used and a warning message +stating this will be shown. If you are using a restricted set of \code{profiles}, +only the \code{"Modfed"} method can be used as \code{"CEA"} requires unrestricted +\code{profiles}. For more details on Bayesian D-efficient designs, see +\code{?idefix::CEA} and \code{?idefix::Modfed} as well as the JSS article on the +{idefix} package (Traets et al, 2020). } \examples{ library(cbcTools) @@ -106,16 +140,27 @@ profiles <- cbc_profiles( freshness = c('Poor', 'Average', 'Excellent') ) -# Make a randomized survey design -design_rand <- cbc_design( +# Make a survey design from all possible profiles +# (This is the default setting where method = 'full' for "full factorial") +design_full <- cbc_design( profiles = profiles, n_resp = 300, # Number of respondents n_alts = 3, # Number of alternatives per question n_q = 6 # Number of questions per respondent ) -# Make a randomized survey design with a "no choice" option -design_rand_nochoice <- cbc_design( +# Make a survey design from an orthogonal array of profiles +design_ortho <- cbc_design( + profiles = profiles, + n_resp = 300, # Number of respondents + n_alts = 3, # Number of alternatives per question + n_q = 6, # Number of questions per respondent + method = 'orthogonal' +) + +# Make a survey design from all possible profiles +# with a "no choice" option +design_full_nochoice <- cbc_design( profiles = profiles, n_resp = 300, # Number of respondents n_alts = 3, # Number of alternatives per question @@ -123,9 +168,9 @@ design_rand_nochoice <- cbc_design( no_choice = TRUE ) -# Make a randomized labeled survey design with each "type" appearing in -# each choice question -design_rand_labeled <- cbc_design( +# Make a survey design from all possible profiles +# with each level of the "type" attribute appearing as an alternative +design_full_labeled <- cbc_design( profiles = profiles, n_resp = 300, # Number of respondents n_alts = 3, # Number of alternatives per question @@ -134,13 +179,13 @@ design_rand_labeled <- cbc_design( ) # Make a Bayesian D-efficient design with a prior model specified -# Note that by default parallel = TRUE. -design_deff <- cbc_design( +# Note that by speed can be improved by setting parallel = TRUE +design_bayesian <- cbc_design( profiles = profiles, n_resp = 300, # Number of respondents - n_alts = 3, # Number of alternatives per question - n_q = 6, # Number of questions per respondent - n_start = 1, + n_alts = 3, # Number of alternatives per question + n_q = 6, # Number of questions per respondent + n_start = 1, # Defauls to 5, set to 1 here for a quick example priors = list( price = -0.1, type = c(0.1, 0.2), @@ -150,7 +195,16 @@ design_deff <- cbc_design( parallel = FALSE ) } +\references{ +Grömping, U. (2018). R Package DoE.base for Factorial Experiments. Journal of Statistical Software, 85(5), 1–41, +\doi{10.18637/jss.v085.i05} +Traets, F., Sanchez, D. G., & Vandebroek, M. (2020). Generating Optimal Designs for Discrete Choice Experiments in R: The idefix Package. Journal of Statistical Software, 96(3), 1–41, +\doi{10.18637/jss.v096.i03} +} +\keyword{DoE.base} \keyword{design} +\keyword{experiment} +\keyword{idefix} \keyword{logit} \keyword{logitr} \keyword{mixed} diff --git a/man/cbc_power.Rd b/man/cbc_power.Rd index d44a3d8..b3f5c86 100644 --- a/man/cbc_power.Rd +++ b/man/cbc_power.Rd @@ -83,7 +83,8 @@ is useful for determining the required sample size for obtaining a desired level of statistical power on each coefficient. The number of models to estimate is set by the \code{nbreaks} argument, which breaks up the data into groups of increasing sample sizes. All models are estimated models using -the {logitr} package. For more details see Helveston (2023) \doi{10.18637/jss.v105.i10}. +the 'logitr' package. For more details see the JSS article on the 'logitr' +package (Helveston, 2023). } \examples{ library(cbcTools) @@ -97,12 +98,13 @@ profiles <- cbc_profiles( freshness = c('Poor', 'Average', 'Excellent') ) -# Make a randomized survey design +# Make a survey design from all possible profiles +# (This is the default setting where method = 'full' for "full factorial") design <- cbc_design( profiles = profiles, n_resp = 300, # Number of respondents - n_alts = 3, # Number of alternatives per question - n_q = 6 # Number of questions per respondent + n_alts = 3, # Number of alternatives per question + n_q = 6 # Number of questions per respondent ) # Simulate random choices @@ -122,6 +124,10 @@ power <- cbc_power( n_cores = 2 ) } +\references{ +Helveston, J. P. (2023). logitr: Fast Estimation of Multinomial and Mixed Logit Models with Preference Space and Willingness-to-Pay Space Utility Parameterizations. Journal of Statistical Software, 105(10), 1–37, +\doi{10.18637/jss.v105.i10} +} \keyword{logit} \keyword{logitr} \keyword{mixed} diff --git a/man/miscmethods.cbc_errors.Rd b/man/miscmethods.cbc_errors.Rd index be47eab..00858e0 100644 --- a/man/miscmethods.cbc_errors.Rd +++ b/man/miscmethods.cbc_errors.Rd @@ -31,12 +31,13 @@ profiles <- cbc_profiles( freshness = c('Poor', 'Average', 'Excellent') ) -# Make a randomized survey design +# Make a survey design from all possible profiles +# (This is the default setting where method = 'full' for "full factorial") design <- cbc_design( profiles = profiles, n_resp = 300, # Number of respondents - n_alts = 3, # Number of alternatives per question - n_q = 6 # Number of questions per respondent + n_alts = 3, # Number of alternatives per question + n_q = 6 # Number of questions per respondent ) # Simulate random choices diff --git a/man/plot_compare_power.Rd b/man/plot_compare_power.Rd new file mode 100644 index 0000000..737cc74 --- /dev/null +++ b/man/plot_compare_power.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/power.R +\name{plot_compare_power} +\alias{plot_compare_power} +\title{Plot a comparison of different design powers} +\usage{ +plot_compare_power(...) +} +\arguments{ +\item{...}{Any number of data frame containing power results obtained from +the \code{cbc_power()} function, separated by commas.} +} +\value{ +A plot comparing the power curves of different designs. +} +\description{ +This function creates a ggplot2 object comparing the power curves of +different designs. Each design is color coded and each facet (sub plot) +is a model coefficient. +} +\examples{ +library(cbcTools) + +# Generate all possible profiles +profiles <- cbc_profiles( + price = c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5), + type = c("Fuji", "Gala", "Honeycrisp"), + freshness = c('Poor', 'Average', 'Excellent') +) + +# Make designs to compare: full factorial vs bayesian d-efficient +design_full <- cbc_design( + profiles = profiles, + n_resp = 300, n_alts = 3, n_q = 6 +) +# Same priors will be used in bayesian design and simulated choices +priors <- list( + price = -0.1, + type = c(0.1, 0.2), + freshness = c(0.1, 0.2) +) +design_bayesian <- cbc_design( + profiles = profiles, + n_resp = 300, n_alts = 3, n_q = 6, n_start = 1, method = "CEA", + priors = priors, parallel = FALSE +) + +# Obtain power for each design by simulating choices +power_full <- design_full |> +cbc_choices(obsID = "obsID", priors = priors) |> + cbc_power( + pars = c("price", "type", "freshness"), + outcome = "choice", obsID = "obsID", nbreaks = 10, n_q = 6, n_cores = 2 + ) +power_bayesian <- design_bayesian |> + cbc_choices(obsID = "obsID", priors = priors) |> + cbc_power( + pars = c("price", "type", "freshness"), + outcome = "choice", obsID = "obsID", nbreaks = 10, n_q = 6, n_cores = 2 + ) + +# Compare power of each design +plot_compare_power(power_bayesian, power_full) +} diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index 1c4161a..e92352f 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -48,6 +48,7 @@ reference: desc: "Functions for conducting a power analysis." contents: - cbc_power + - plot_compare_power - title: "Methods" desc: "Various methods." contents: diff --git a/vignettes/library.bib b/vignettes/library.bib new file mode 100644 index 0000000..c7564af --- /dev/null +++ b/vignettes/library.bib @@ -0,0 +1,35 @@ +@article{Grömping2018, + title={R Package DoE.base for Factorial Experiments}, + volume={85}, + url={https://www.jstatsoft.org/index.php/jss/article/view/v085i05}, + doi={10.18637/jss.v085.i05}, + number={5}, + journal={Journal of Statistical Software}, + author={Grömping, Ulrike}, + year={2018}, + pages={1–41} +} + +@article{Traets2020, + title={Generating Optimal Designs for Discrete Choice Experiments in R: The idefix Package}, + volume={96}, + url={https://www.jstatsoft.org/index.php/jss/article/view/v096i03}, + doi={10.18637/jss.v096.i03}, + number={3}, + journal={Journal of Statistical Software}, + author={Traets, Frits and Sanchez, Daniel Gil and Vandebroek, Martina}, + year={2020}, + pages={1–41} +} + +@article{Helveston2023, + title={logitr: Fast Estimation of Multinomial and Mixed Logit Models with Preference Space and Willingness-to-Pay Space Utility Parameterizations}, + volume={105}, + url={https://www.jstatsoft.org/index.php/jss/article/view/v105i10}, + doi={10.18637/jss.v105.i10}, + number={10}, + journal={Journal of Statistical Software}, + author={Helveston, John Paul}, + year={2023}, + pages={1–37} +} \ No newline at end of file diff --git a/vignettes/usage.Rmd b/vignettes/usage.Rmd index 9eaa394..75b2897 100644 --- a/vignettes/usage.Rmd +++ b/vignettes/usage.Rmd @@ -5,6 +5,7 @@ vignette: > %\VignetteIndexEntry{Usage} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} +bibliography: "`r here::here('vignettes', 'library.bib')`" --- ```{r setup, include=FALSE, message=FALSE, warning=FALSE} @@ -35,7 +36,7 @@ The first step in designing an experiment is to define the attributes and levels library(cbcTools) profiles <- cbc_profiles( - price = seq(1, 4, 0.5), # $ per pound + price = seq(1, 5, 0.5), # $ per pound type = c('Fuji', 'Gala', 'Honeycrisp'), freshness = c('Poor', 'Average', 'Excellent') ) @@ -51,21 +52,21 @@ Depending on the context of your survey, you may wish to eliminate some profiles > **CAUTION: including restrictions in your designs can substantially reduce the statistical power of your design, so use them cautiously and avoid them if possible**. -If you do wish to restrict some attribute level combinations, you can do so using the `cbc_restrict()` function, which takes the full set of `profiles` along with any number of restricted pairs of attribute levels, defined as pairs of logical expressions separated by commas. In the example below, I include the following restrictions: +If you do wish to restrict some attribute level combinations, you can do so using the `cbc_restrict()` function, which takes the full set of `profiles` along with any number of restricted pairs of attribute levels, defined as pairs of logical expressions separated by commas. In the example below, I include the following restrictions (these are arbitrary and just for illustration purposes): - `"Gala"` apples will not be shown with the prices `1.5`, `2.5`, and `3.5`. - `"Honeycrisp"` apples will not be shown with prices less than `2`. -- `"Honeycrisp"` apples will not be shown with the `"Poor"` freshness. +- `"Honeycrisp"` apples will only be shown with `"Average"` freshness. - `"Fuji"` apples will not be shown with the `"Excellent"` freshness. -With these restrictions, there are now only 32 profiles compared to 63 without them: +With these restrictions, there are now only 39 profiles compared to 81 without them: ```{r} restricted_profiles <- cbc_restrict( profiles, type == "Gala" & price %in% c(1.5, 2.5, 3.5), type == "Honeycrisp" & price > 2, - type == "Honeycrisp" & freshness == "Poor", + type == "Honeycrisp" & freshness %in% c("Poor", "Excellent"), type == "Fuji" & freshness == "Excellent" ) @@ -76,44 +77,68 @@ restricted_profiles Once a set of profiles is obtained, a conjoint survey can then be generated using the `cbc_design()` function. A variety of survey designs can be generated, including: -- Random designs +- Full factorial designs +- Orthogonal (a.k.a. "main effects") designs - Labeled designs (a.k.a. "alternative-specific" designs) - Designs with a "no choice" option (a.k.a. "outside good") - Bayesian D-efficient designs -## Random designs +## Full factorial designs + +Full factorial designs are obtained by setting `method = 'full'` (the default setting for `method`). This method simply samples from the full set of `profiles`, ensuring that no two profiles are the same in any choice question. Blocking can used with these designs where blocks are created from subsets of the full factorial design. For more information about blocking with full factorial designs, see `?DoE.base::fac.design` as well as the JSS article on the {DoE.base} package [@Grömping2018]. -The randomized design simply samples from the set of `profiles`, ensuring that no two profiles are the same in any choice question. The resulting `design` data frame includes the following columns: +The resulting `design` data frame includes the following columns: - `profileID`: Identifies the profile in `profiles`. - `respID`: Identifies each survey respondent. - `qID`: Identifies the choice question answered by the respondent. - `altID`:Identifies the alternative in any one choice observation. - `obsID`: Identifies each unique choice observation across all respondents. +- `blockID`: If blocking is used, identifies each unique block. ```{r} -design <- cbc_design( +design_full <- cbc_design( profiles = profiles, n_resp = 900, # Number of respondents n_alts = 3, # Number of alternatives per question - n_q = 6 # Number of questions per respondent + n_q = 6, # Number of questions per respondent + method = 'full' # This is the default method +) + +dim(design_full) # View dimensions +head(design_full) # Preview first 6 rows +``` + +## Orthogonal designs + +Orthogonal designs are obtained by setting `method = 'orthogonal'`. This method first finds an orthogonal array from the full set of `profiles` (if possible), then randomly selects from it to create choice sets. For some designs an orthogonal array can't be found, in which case a full factorial design is used. This approach is also sometimes called a "main effects" design since orthogonal arrays focus the information on the main effects at the expense of information about interaction effects. For more information about orthogonal designs, see `?DoE.base::oa.design` as well as the JSS article on the {DoE.base} package [@Grömping2018] + +```{r} +#| message: true + +design_ortho <- cbc_design( + profiles = profiles, + n_resp = 900, # Number of respondents + n_alts = 3, # Number of alternatives per question + n_q = 6, # Number of questions per respondent + method = 'orthogonal' ) -dim(design) # View dimensions -head(design) # Preview first 6 rows +dim(design_ortho) # View dimensions +head(design_ortho) # Preview first 6 rows ``` ## Labeled designs (a.k.a. "alternative-specific" designs) -You can also make a "labeled" design (also known as "alternative-specific" design) where the levels of one attribute is used as a label by setting the `label` argument to that attribute. This by definition sets the number of alternatives in each question to the number of levels in the chosen attribute, so the `n_alts` argument is overridden. Here is an example labeled survey using the `type` attribute as the label: +You can also make a "labeled" design (also known as "alternative-specific" design) where the levels of one attribute are used as a label by setting the `label` argument to that attribute. This by definition sets the number of alternatives in each question to the number of levels in the chosen attribute, so the `n_alts` argument is overridden. Here is an example of a labeled full factorial survey using the `type` attribute as the label: ```{r} design_labeled <- cbc_design( - profiles = profiles, - n_resp = 900, # Number of respondents - n_alts = 3, # Number of alternatives per question - n_q = 6, # Number of questions per respondent - label = "type" # Set the "type" attribute as the label + profiles = profiles, + n_resp = 900, # Number of respondents + n_alts = 3, # Number of alternatives per question + n_q = 6, # Number of questions per respondent + label = "type" # Set the "type" attribute as the label ) dim(design_labeled) @@ -122,9 +147,9 @@ head(design_labeled) In the above example, you can see in the first six rows of the survey that the `type` attribute is always fixed to be the same order, ensuring that each level in the `type` attribute will always be shown in each choice question. -## Designs with a "no choice" option (a.k.a. "outside good") +## Designs with a "no choice" option (a.k.a. an "outside good") -You can include a "no choice" (also known as "outside good") option in your survey by setting `no_choice = TRUE`. If included, all categorical attributes will be dummy-coded to appropriately dummy-code the "no choice" alternative. +You can include a "no choice" (also known as an "outside good") option in your survey by setting `no_choice = TRUE`. If included, all categorical attributes will be dummy-coded to appropriately dummy-code the "no choice" alternative. ```{r} design_nochoice <- cbc_design( @@ -141,7 +166,7 @@ head(design_nochoice) ## Bayesian D-efficient designs -A Bayesian D-efficient design can be obtained by providing a list of prior parameters to define an expected prior utility model. These designs are optimized to minimize the D-error of the design given a prior model. The optimization is handled using the [{idefix} package](https://www.jstatsoft.org/article/view/v096i03). For now, designs are limited to multinomial logit priors (the {idefix} package can generate designs with mixed logit priors). These designs also currently do not support the ability to specify interaction terms in the prior model or use "labeled" designs. +Bayesian D-efficient designs are obtained by setting `method = 'CEA'` or `method = 'Modfed'` along with a specified list of `priors` (parameters that define a prior utility model). These designs are optimized to minimize the D-error of the design given a prior model. The optimization is handled using the [{idefix} package](https://cran.r-project.org/web/packages/idefix/index.html). For now, designs are limited to multinomial logit priors (the {idefix} package can generate designs with mixed logit priors). These designs also currently do not support the ability to specify interaction terms in the prior model or use "labeled" designs. If `method` is set to `"CEA"` or `"Modfed"` but without `priors` specified, a prior of all `0`s will be used and a warning message stating this will be shown. If you are using a restricted set of `profiles`, only the `"Modfed"` method can be used as `"CEA"` requires unrestricted `profiles`. For more details on Bayesian D-efficient designs, see `?idefix::CEA` and `?idefix::Modfed` as well as the JSS article on the {idefix} package [@Traets2020]. In the example below, the prior model assumes the following parameters: @@ -150,7 +175,7 @@ In the example below, the prior model assumes the following parameters: - 2 categorical parameters for `freshness` (`"Average"` and `"Excellent"`) ```{r} -design_db_eff <- cbc_design( +design_bayesian <- cbc_design( profiles = profiles, n_resp = 900, # Number of respondents n_alts = 3, # Number of alternatives per question @@ -159,17 +184,18 @@ design_db_eff <- cbc_design( price = -0.1, type = c(0.1, 0.2), freshness = c(0.1, 0.2) - ) + ), + method = 'CEA' ) -dim(design_db_eff) -head(design_db_eff) +dim(design_bayesian) +head(design_bayesian) ``` Bayesian D-efficient designs that include a "no choice" option should set `no_choice = TRUE` and also define a prior for the "no choice" option using `prior_no_choice`, e.g.: ```{r} -design_db_eff_no_choice <- cbc_design( +design_bayesian_no_choice <- cbc_design( profiles = profiles, n_resp = 900, # Number of respondents n_alts = 3, # Number of alternatives per question @@ -180,11 +206,12 @@ design_db_eff_no_choice <- cbc_design( type = c(0.1, 0.2), freshness = c(0.1, 0.2) ), - prior_no_choice = -0.1 + prior_no_choice = -0.1, + method = 'CEA' ) -dim(design_db_eff_no_choice) -head(design_db_eff_no_choice) +dim(design_bayesian_no_choice) +head(design_bayesian_no_choice) ``` # Inspect survey designs @@ -194,6 +221,13 @@ The package includes some functions to quickly inspect some basic metrics of a d The `cbc_balance()` function prints out a summary of the individual and pairwise counts of each level of each attribute across all choice questions: ```{r} +design <- cbc_design( + profiles = profiles, + n_resp = 900, + n_alts = 3, + n_q = 6 +) + cbc_balance(design) ``` @@ -205,7 +239,7 @@ cbc_overlap(design) # Simulate choices -You can simulate choices for a given `design` using the `cbc_choices()` function. +You can simulate choices for a given `design` using the `cbc_choices()` function. Choices are simulated using the [{logitr} package](https://jhelvy.github.io/logitr/). For more information, see `?logitr::logitr` as well as the JSS article on the {logitr} package [@Helveston2023]. ## Random choices @@ -277,7 +311,7 @@ data <- cbc_choices( # Conduct a power analysis -The simulated choice data can be used to conduct a power analysis by estimating the same model multiple times with incrementally increasing sample sizes. As the sample size increases, the estimated coefficient standard errors will decrease (i.e. coefficient estimates become more precise). The `cbc_power()` function achieves this by partitioning the choice data into multiple sizes (defined by the `nbreaks` argument) and then estimating a user-defined choice model on each data subset. In the example below, 10 different sample sizes are used. All models are estimated using the [{logitr}](https://jhelvy.github.io/logitr/) package: +The simulated choice data can be used to conduct a power analysis by estimating the same model multiple times with incrementally increasing sample sizes. As the sample size increases, the estimated coefficient standard errors will decrease (i.e. coefficient estimates become more precise). The `cbc_power()` function achieves this by partitioning the choice data into multiple sizes (defined by the `nbreaks` argument) and then estimating a user-defined choice model on each data subset. In the example below, 10 different sample sizes are used. All models are estimated using the [{logitr} package](https://jhelvy.github.io/logitr/). For more information, see `?logitr::logitr` as well as the JSS article on the {logitr} package [@Helveston2023]. ```{r} power <- cbc_power( @@ -321,8 +355,10 @@ summary(models[[10]]) One of the convenient features of how the package is written is that the object generated in each step is used as the first argument to the function for the next step. Thus, just like in the overall program diagram, the functions can be piped together. For example, the "pipeline" below uses the Base R pipe operator (`|>`) to generate profiles, generate a design, simulate choices according to a prior utility model, conduct a power analysis, and then finally plot the results: -```{r, eval=FALSE} -design <- cbc_profiles( +```{r} +#| eval: false + +cbc_profiles( price = seq(1, 4, 0.5), # $ per pound type = c('Fuji', 'Gala', 'Honeycrisp'), freshness = c('Poor', 'Average', 'Excellent') @@ -352,3 +388,45 @@ plot() ```{r, ref.label='power', echo=FALSE} ``` + +## Comparing multiple designs + +When evaluating multiple designs, it can be helpful to visually compare their respective power curves. This can be done using the `plot_compare_power()` function. To use it, you have to first create different designs and then simulate the power of each design by simulating choices. Here is an example comparing a full factorial design against a Bayesian D-efficient design: + +```{r} +# Make designs to compare: full factorial vs bayesian d-efficient +design_full <- cbc_design( + profiles = profiles, + n_resp = 300, n_alts = 3, n_q = 6 +) +# Same priors will be used in bayesian design and simulated choices +priors <- list( + price = -0.1, + type = c(0.1, 0.2), + freshness = c(0.1, 0.2) +) +design_bayesian <- cbc_design( + profiles = profiles, + n_resp = 300, n_alts = 3, n_q = 6, n_start = 1, method = "CEA", + priors = priors, parallel = FALSE +) + +# Obtain power for each design by simulating choices +power_full <- design_full |> +cbc_choices(obsID = "obsID", priors = priors) |> + cbc_power( + pars = c("price", "type", "freshness"), + outcome = "choice", obsID = "obsID", nbreaks = 10, n_q = 6, n_cores = 2 + ) +power_bayesian <- design_bayesian |> + cbc_choices(obsID = "obsID", priors = priors) |> + cbc_power( + pars = c("price", "type", "freshness"), + outcome = "choice", obsID = "obsID", nbreaks = 10, n_q = 6, n_cores = 2 + ) + +# Compare power of each design +plot_compare_power(power_bayesian, power_full) +``` + +# References