initial commit package

ingorohlfing · Jan 9, 2018 · e2de9c4 · e2de9c4
commit e2de9c4
Show file tree

Hide file tree

Showing 25 changed files with 1,742 additions and 0 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,3 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^data-raw$
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+inst/doc
+.Rproj.user
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,24 @@
+Package: qcapower
+Type: Package
+Title: Estimate power in QCA and the number of cases needed to achieve a target
+    level
+Version: 0.0.0.9000
+Imports:
+  ggplot2,
+  ggforce
+Authors@R: c(
+  person("Ingo", "Rohlfing", email = "[email protected]", role = c("aut", "cre")),
+  person("Holger", "Doering", email = "[email protected]", role = c("aut")))
+Description: The package allows you to estimate power of a term in a truth table
+    analysis. A term can be anything: a condition, conjunction or disjunction of any
+    of these. For ex ante power estimation, you need a consistency score, the number
+    of cases, an alternative hypothesis about the actual consistency score of the
+    term and a null hypothesis about a consistency score separating consistent from
+    inconsistent terms.
+License: GPL-3
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1
+Suggests: knitr,
+    rmarkdown
+VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,9 @@
+# Generated by roxygen2: do not edit by hand
+
+export(qcapower)
+export(qp_cases)
+export(qp_cases_brute)
+export(qp_quant_plot)
+export(qp_run_plot)
+import(ggforce)
+import(ggplot2)
diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,35 @@
+#' Data simulated power estimates
+#'
+#' A dataset containing power simulations for different number of cases and
+#' different values for null- and alternative hypothesis
+#'
+#' \describe{
+#' \item{cases}{number of cases}
+#' \item{null_hypo}{null hypothesis (H0)}
+#' \item{alt_hypo}{alternative hypothesis}
+#' \item{sims}{number of simulations}
+#' \item{perms}{number of permutations}
+#' \item{perms}{calculate power}
+#' }
+#'
+#' @format A dataframe with simulation parameters and calculated power
+"qp_sim_power"
+
+
+#' Data simulated power estimates for plotting of 5\%-quantiles
+#'
+#' A dataset containing the estimated 5\%-quantiles from a power simulation with
+#' 1000 simulations each with 10000 permutations. The value for the alternative
+#' hypothesis was set to 1.
+#'
+#' \describe{
+#' \item{power}{power estimate over 1000 simulations}
+#' \item{powercum}{running power estimate for ith simulation}
+#' \item{null_hypo}{null hypothesis (H0), set to 0.8 (irrelevant here)}
+#' \item{alt_hypo}{alternative hypothesis (H1), set to 1}
+#' \item{cases}{number of cases, set to 10}
+#' \item{quant}{estimated 5\%-quantiles per simulations}
+#' }
+#'
+#' @format A dataframe with 1000 rows and 6 variables:
+"qp_sina_data"
diff --git a/R/qcapower.R b/R/qcapower.R
@@ -0,0 +1,242 @@
+#' Estimate power in Qualitative Comparative Analysis (QCA).
+#'
+#' \code{qcapower} returns a power estimate for a term, given information about
+#' the required parameters
+#'
+#' \code{qcapower} allows you to estimate power for a term, that is, to reject
+#' the null hypothesis that a set relation is in place when it is in place, in
+#' fact. A term can be a single condition, a conjunction, or a disjunction of
+#' any combination of the two.
+#'
+#' @param cases Number of cases. In fuzzy-set QCA, equal to total number of
+#'   cases
+#' @param null_hypo Null hypothesis (\emph{H0}). Consistency value separating
+#'   consistent from inconsistent terms.
+#' @param alt_hypo Alternative hypothesis (\emph{H1}). Expected, actual
+#'   consistency value of term.
+#' @param sims Number of simulations
+#' @param perms Number of permutations per simulated dataset
+#' @param alpha Level of alpha at which significance of H0 is tested
+#' @param cons_threshold Degree of tolerance in generating data with consistency
+#'   equaling \code{alt_hypo} (see vignette)
+#' @param set_seed Parameter for achieving reproducibility of estimate
+#' @return A dataframe with rows equaling the number of \code{sims}.
+#'   \code{power} is the power estimate and is identical for each rows.
+#'   \code{powercum} is the running power estimate up to this row. \code{quant}
+#'   is the 5\%-quantile of the permuted distributions. See the vignette for
+#'   more information on \code{powercum} and \code{quant}.
+#' @seealso \code{\link{qp_quant_plot}} and \code{\link{qp_run_plot}}
+#' @examples
+#' power_data <- qcapower(cases = 20, null_hypo = 0.8, alt_hypo = 0.95, sims = 10, perms = 1000)
+#' head(power_data)
+#' @export
+qcapower <- function(cases, null_hypo, alt_hypo, sims = 1000, perms = 10000,
+                     alpha = 0.05, cons_threshold = 0.01, set_seed = 135) {
+  null_sig <- vector("numeric", length = sims)
+  null_p <- vector("numeric", length = sims)
+  quant <- vector("numeric", length = sims)
+
+  set.seed(set_seed)
+  calc_consistency <- function(a, y) sum(pmin(a, y)) / sum(a)
+  random_draw <- function() round(runif(cases), digits = 2)
+  df <- data.frame(index=1:cases)
+
+  for(i in 1:sims) {
+    df$a <- random_draw()
+    df$y <- random_draw()
+    cons <- calc_consistency(df$a, df$y)
+    while(abs(alt_hypo - cons) > cons_threshold) {
+      if(cons < alt_hypo) {
+        sample_row <- sample(df[df$a > df$y, 'index'], 1)
+        df[sample_row, 'y'] <- runif(1, min = df[sample_row, 'y'], max = 1)
+      } else if(cons > alt_hypo) {
+        sample_row <- sample(df[ , 'index'], 1)
+        df[sample_row, 'y'] <- runif(1, min = 0, max = df[sample_row, 'y'])
+      }
+      cons <- calc_consistency(df$a, df$y)
+    }
+    cons_perms <- sapply(1:perms, function(x) calc_consistency(df$a, sample(df$y)))
+    quant[i] <- quantile(cons_perms, 0.05)
+    null_p[i] <- ecdf(cons_perms)(null_hypo) # p-value of null-hypo
+    null_sig[i] <- ifelse(null_p[i] + 1.96 * null_p[i] / sqrt(perms) < alpha, 1, 0)
+  }
+  power <- sum(null_sig) / sims
+  powercum <- cumsum(null_sig) / 1:sims
+  estim <- data.frame(power, powercum, alt_hypo, null_hypo, cases, quant)
+
+  return(estim)
+}
+
+
+#' Calculte the number of cases for a particular case target based on simulated
+#' data
+#'
+#' \code{qp_cases} calculates the number of cases needed for a particular
+#' power level. It is based on the pre-calculated data in \code(qp_sim_power).
+#'
+#' @param power_target Power level target
+#' @param null_hypo Null hypothesis (\emph{H0}). Consistency value separating
+#'   consistent from inconsistent terms.
+#' @param alt_hypo Alternative hypothesis (\emph{H1}). Expected, actual
+#'   consistency value of term.
+#'
+#' @seealso \code{\link{qp_cases_brute}}
+#'
+#' @examples
+#' qp_cases(0.1, null_hypo = 0.8, alt_hypo = 1)
+#'
+#' @export
+qp_cases <- function(power_target, null_hypo, alt_hypo) {
+  sim_data <- qp_sim_power
+  if(power_target < 0 | power_target > 1) {
+    stop("'power target' must be between 0 and 1")
+  }
+  if(! null_hypo %in% sim_data$null_hypo | ! alt_hypo %in% sim_data$alt_hypo) {
+    hypo_values <- sapply(c("null_hypo", "alt_hypo"),
+                          function(.x) paste(unique(sim_data[[.x]]), collapse = ", "))
+    stop(sprintf("\n\nnull hypothesis must be %s\n\nalternative hypothesis must be %s",
+                  hypo_values[["null_hypo"]], hypo_values[["alt_hypo"]]))
+  }
+
+  sim_select <- sim_data[sim_data$power >= power_target
+                         & sim_data$null_hypo == null_hypo
+                         & sim_data$alt_hypo == alt_hypo, "cases"]
+
+  if(length(sim_select) == 0) {
+    return(NA)
+  } else {
+    return(min(sim_select))
+  }
+}
+
+
+#' Calculte the number of cases for a particular case target with brute force
+#'
+#' \code{qp_cases_brute} calculates the number of cases needed for a particular
+#' power level. The function starts at a \code(start_value) for the number of
+#' cases and increments until the \code(power_target) is met or the
+#' \code(max_value) has been reached.
+#'
+#' Running the function can take a lot of time. Use \code{\link{qp_cases}} to
+#' calculate the number of cases based on simulated data.
+#'
+#' @param power_target Power level target
+#' @param start_value Default number of cases for initial search
+#' @param max_value Default maximum number of cases for search
+#' @param progress Show progress of calculation (default \code{TRUE})
+#' @param ... \code{qcapower} parameters -- see \code{\link{qcapower}}
+#'
+#' @seealso \code{\link{qp_cases_brute}}
+#'
+#' @examples
+#' \dontrun{
+#' qp_cases_brute(0.9, null_hypo = 0.80, alt_hypo = 1)
+#'
+#' qp_cases_brute(0.9, null_hypo = 0.80, alt_hypo = 1, start_value = 20,
+#'                max_value = 50, perms = 500)
+#' }
+#'
+#' @export
+qp_cases_brute <- function(power_target, start_value = 2, max_value = 100,
+                           progress = TRUE, ...) {
+  cases <- start_value - 1
+  params <- list(...)
+  power <- 0
+
+  while(power < power_target) {
+    cases <- cases + 1
+    if(cases > max_value | cases <= 0) {
+      stop("Number of cases smaller than 0 or larger than 'max_value'")
+    }
+    params[['cases']] <- cases
+    power <- unique(do.call(qcapower, params)$power)
+    if(progress) {
+      print(sprintf('cases = %d -- power = %.2f', cases, power))
+    }
+  }
+  return(cases)
+}
+
+
+#' Plot of power estimate against the number of simulations
+#'
+#' \code{qp_run_plot} allows you to plot the running power estimate to
+#' determine whether \code{sims} is sufficiently large to derive a reliable
+#' estimate
+#'
+#' Creates a plot with \code{ggplot2}
+#'
+#' @param power_est Dataframe containing the simulation results (see
+#'   \code{\link{qcapower}})
+#' @param title Option for adding title to plot (default \code{FALSE})
+#' @examples
+#' power_data <- qcapower(cases = 20, null_hypo = 0.8, alt_hypo = 0.95, sims = 10, perms = 1000)
+#' qp_run_plot(power_data)
+#'
+#' # Using data with 10000 estimates
+#' data(qp_sina_data)
+#' qp_run_plot(qp_sina_data)
+#' @import ggplot2
+#' @export
+qp_run_plot <- function(power_est, title = FALSE) {
+  pl_title = "Power estimate over simulations"
+  pl_y_label = "running power estimate"
+
+  power_est$id <- seq(1, nrow(power_est), by = 1)
+  pl <- ggplot(data=power_est, mapping=aes(x = id, y = powercum)) +
+    geom_line() +
+    scale_x_continuous(name = "simulation index",
+                       breaks = c(1, nrow(power_est))) +
+    scale_y_continuous(name = pl_y_label,
+                       breaks = seq(0, 1, by = 0.2),
+                       limits = c(0, 1)) +
+    theme_bw()
+
+  if(title == TRUE) {
+    pl <- pl +
+      ggtitle(pl_title) +
+      theme(plot.title = element_text(hjust = 0.5))
+  }
+
+  print(pl)
+}
+
+
+#' Plot of 5%-quantiles for assessing the dispersion of the permutated
+#' distributions
+#'
+#' Depending on the number of cases, the permuted distributions of consistency
+#' values can differ widely in terms of their location on the spectrum and their
+#' shape.
+#'
+#' Creates a sina plot with \code{ggforce}
+#'
+#' @param power_est Dataframe containing simulation results (see
+#'   \code{\link{qcapower}})
+#' @param title Option for adding title to plot (default \code{FALSE})
+#' @examples
+#' sim_data <- qp_sina_data
+#' qp_quant_plot(sim_data)
+#' @import ggplot2
+#' @import ggforce
+#' @export
+qp_quant_plot <- function(power_est, title = FALSE) {
+  pl_title = "Distribution of 5%-quantiles"
+  pl_y_label = "estimated 5%-quantiles"
+
+  pl <- ggplot(data=power_est, mapping=aes(factor(cases), quant)) +
+    geom_sina(size=0.1) +
+    scale_x_discrete(labels = NULL, name = NULL) +
+    scale_y_continuous(name = pl_y_label,
+                       breaks = seq(0, 1, by = 0.2),
+                       limits = c(0, 1)) +
+    theme_bw()
+
+  if(title == TRUE) {
+    pl <- pl +
+      ggtitle(pl_title)
+      theme(plot.title = element_text(hjust = 0.5))
+  }
+
+  print(pl)
+}
diff --git a/data-raw/qp_data.R b/data-raw/qp_data.R
@@ -0,0 +1,8 @@
+library(qcapower)
+library(devtools)
+
+qp_sina_data <- qcapower(cases = 10, null_hypo = 0.8, alt_hypo = 1, sims = 1000, perms = 10000)
+use_data(qp_sina_data)
+
+qp_sim_power <- read.csv("simulations.csv", as.is = TRUE)
+use_data(qp_sim_power)
diff --git a/data-raw/simulations-qcapower.R b/data-raw/simulations-qcapower.R
@@ -0,0 +1,39 @@
+# "qcapower()" function from qcapower-package
+# to run simulations in "simulation.R" without installing package
+
+qcapower <- function(cases, null_hypo, alt_hypo, sims = 1000, perms = 10000,
+                     alpha = 0.05, cons_threshold = 0.01, set_seed = 135) {
+  null_sig <- vector("numeric", length = sims)
+  null_p <- vector("numeric", length = sims)
+  quant <- vector("numeric", length = sims)
+
+  set.seed(set_seed)
+  calc_consistency <- function(a, y) sum(pmin(a, y)) / sum(a)
+  random_draw <- function() round(runif(cases), digits = 2)
+  df <- data.frame(index=1:cases)
+
+  for(i in 1:sims) {
+    df$a <- random_draw()
+    df$y <- random_draw()
+    cons <- calc_consistency(df$a, df$y)
+    while(abs(alt_hypo - cons) > cons_threshold) {
+      if(cons < alt_hypo) {
+        sample_row <- sample(df[df$a > df$y, 'index'], 1)
+        df[sample_row, 'y'] <- runif(1, min = df[sample_row, 'y'], max = 1)
+      } else if(cons > alt_hypo) {
+        sample_row <- sample(df[ , 'index'], 1)
+        df[sample_row, 'y'] <- runif(1, min = 0, max = df[sample_row, 'y'])
+      }
+      cons <- calc_consistency(df$a, df$y)
+    }
+    cons_perms <- sapply(1:perms, function(x) calc_consistency(df$a, sample(df$y)))
+    quant[i] <- quantile(cons_perms, 0.05)
+    null_p[i] <- ecdf(cons_perms)(null_hypo) # p-value of null-hypo
+    null_sig[i] <- ifelse(null_p[i] + 1.96 * null_p[i] / sqrt(perms) < alpha, 1, 0)
+  }
+  power <- sum(null_sig) / sims
+  powercum <- cumsum(null_sig) / 1:sims
+  estim <- data.frame(power, powercum, alt_hypo, null_hypo, cases, quant)
+
+  return(estim)
+}