diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 1e93f191..6e7598fe 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,85 +1,59 @@ -# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. -# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - push: - branches: - - main - - master - - devel - pull_request: - branches: - - main - - master - - devel + push: + branches: + - main + - master + - devel + pull_request: + branches: + - main + - master + - devel -name: R-CMD-check +name: R-CMD-check.yaml -jobs: - R-CMD-check: - runs-on: ${{ matrix.config.os }} - - name: ${{ matrix.config.os }} (${{ matrix.config.r }}) - - strategy: - fail-fast: false - matrix: - config: - - {os: windows-latest, r: 'release'} - - {os: macOS-latest, r: 'release'} - - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - uses: actions/checkout@v2 - - - uses: r-lib/actions/setup-r@v2 - with: - r-version: ${{ matrix.config.r }} - - - uses: r-lib/actions/setup-pandoc@v2 +permissions: read-all - - name: Query dependencies - run: | - install.packages('remotes') - saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") - shell: Rscript {0} - - - name: Cache R packages - if: runner.os != 'Windows' - uses: actions/cache@v2 - with: - path: ${{ env.R_LIBS_USER }} - key: 
${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- - - - name: Install system dependencies - if: runner.os == 'Linux' - run: | - while read -r cmd - do - eval sudo $cmd - done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') - - - name: Install dependencies - run: | - remotes::install_deps(dependencies = TRUE) - remotes::install_cran("rcmdcheck") - shell: Rscript {0} - - - name: Check - env: - _R_CHECK_CRAN_INCOMING_REMOTE_: false - run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") - shell: Rscript {0} - - - name: Upload check results - if: failure() - uses: actions/upload-artifact@main - with: - name: ${{ runner.os }}-r${{ matrix.config.r }}-results - path: check +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: windows-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: ubuntu-22.04, r: 'release', rspm: "https://packagemanager.posit.co/cran/__linux__/jammy/latest"} + - {os: ubuntu-22.04, r: 'devel', rspm: "https://packagemanager.posit.co/cran/__linux__/jammy/latest"} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true + build_args: 
'c("--no-manual","--compact-vignettes=gs+qpdf")' \ No newline at end of file diff --git a/.github/workflows/rhub.yaml b/.github/workflows/rhub.yaml new file mode 100644 index 00000000..bdfab195 --- /dev/null +++ b/.github/workflows/rhub.yaml @@ -0,0 +1,95 @@ +# R-hub's generic GitHub Actions workflow file. It's canonical location is at +# https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml +# You can update this file to a newer version using the rhub2 package: +# +# rhub::rhub_setup() +# +# It is unlikely that you need to modify this file manually. + +name: R-hub +run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" + +on: + workflow_dispatch: + inputs: + config: + description: 'A comma separated list of R-hub platforms to use.' + type: string + default: 'linux,windows,macos' + name: + description: 'Run name. You can leave this empty now.' + type: string + id: + description: 'Unique ID. You can leave this empty now.' 
+ type: string + +jobs: + + setup: + runs-on: ubuntu-latest + outputs: + containers: ${{ steps.rhub-setup.outputs.containers }} + platforms: ${{ steps.rhub-setup.outputs.platforms }} + + steps: + # NO NEED TO CHECKOUT HERE + - uses: r-hub/actions/setup@v1 + with: + config: ${{ github.event.inputs.config }} + id: rhub-setup + + linux-containers: + needs: setup + if: ${{ needs.setup.outputs.containers != '[]' }} + runs-on: ubuntu-latest + name: ${{ matrix.config.label }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.containers) }} + container: + image: ${{ matrix.config.container }} + + steps: + - uses: r-hub/actions/checkout@v1 + - uses: r-hub/actions/platform-info@v1 + with: + token: ${{ secrets.RHUB_TOKEN }} + job-config: ${{ matrix.config.job-config }} + - uses: r-hub/actions/setup-deps@v1 + with: + token: ${{ secrets.RHUB_TOKEN }} + job-config: ${{ matrix.config.job-config }} + - uses: r-hub/actions/run-check@v1 + with: + token: ${{ secrets.RHUB_TOKEN }} + job-config: ${{ matrix.config.job-config }} + + other-platforms: + needs: setup + if: ${{ needs.setup.outputs.platforms != '[]' }} + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.label }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.platforms) }} + + steps: + - uses: r-hub/actions/checkout@v1 + - uses: r-hub/actions/setup-r@v1 + with: + job-config: ${{ matrix.config.job-config }} + token: ${{ secrets.RHUB_TOKEN }} + - uses: r-hub/actions/platform-info@v1 + with: + token: ${{ secrets.RHUB_TOKEN }} + job-config: ${{ matrix.config.job-config }} + - uses: r-hub/actions/setup-deps@v1 + with: + job-config: ${{ matrix.config.job-config }} + token: ${{ secrets.RHUB_TOKEN }} + - uses: r-hub/actions/run-check@v1 + with: + job-config: ${{ matrix.config.job-config }} + token: ${{ secrets.RHUB_TOKEN }} \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index fec79b17..3f128f73 100644 --- a/DESCRIPTION +++ b/DESCRIPTION 
@@ -1,6 +1,6 @@ Package: Tplyr Title: A Traceability Focused Grammar of Clinical Data Summary -Version: 1.2.1 +Version: 1.2.1.9000 Authors@R: c( person(given = "Eli", @@ -43,7 +43,7 @@ Authors@R: ) Description: A traceability focused tool created to simplify the data manipulation necessary to create clinical summaries. License: MIT + file LICENSE -URL: https://github.com/atorus-research/Tplyr +URL: https://atorus-research.github.io/Tplyr/, https://github.com/atorus-research/Tplyr BugReports: https://github.com/atorus-research/Tplyr/issues Encoding: UTF-8 Depends: R (>= 3.5.0) @@ -71,7 +71,7 @@ Suggests: pharmaRTF, withr VignetteBuilder: knitr -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 RdMacros: lifecycle Config/testthat/edition: 3 LazyData: true diff --git a/R/regex.R b/R/regex.R index d259212f..786cca11 100644 --- a/R/regex.R +++ b/R/regex.R @@ -3,7 +3,7 @@ #' This function allows you to extract important regular expressions used inside #' Tplyr. #' -#' There are two important regular expressions used within Tplyr. The +#' There are three important regular expressions used within Tplyr. The #' format_string expression is the expression to parse format strings. This is #' what is used to make sense out of strings like 'xx (XX.x%)' or 'a+1 (A.a+2)' #' by inferring what the user is specifying about number formatting. @@ -11,7 +11,8 @@ #' The 'format_group' regex is the opposite of this, and when given a string of #' numbers, such as ' 5 (34%) \[9]' will return the separate segments of numbers #' broken into their format groups, which in this example would be ' 5', -#' '(34%)', and '\[9]'. +#' '(34%)', and '\[9]'. 
Lastly, the 'number_group' regex has a similar application +#' to the 'format_group' regex, but targets only numbers #' #' @param rx A character string with either the value 'format_string' or #' 'format_group' @@ -25,14 +26,17 @@ #' get_tplyr_regex('format_string') #' #' get_tplyr_regex('format_group') +#' +#' get_tplyr_regex('number_group') #' -get_tplyr_regex <- function(rx=c("format_string", "format_group")) { +get_tplyr_regex <- function(rx=c("format_string", "format_group", "number_group")) { rx <- match.arg(rx) switch( rx, 'format_string' = get_format_string_regex(), - 'format_group' = get_format_group_regex() + 'format_group' = get_format_group_regex(), + 'number_group' = get_numeric_group_regex() ) } @@ -110,3 +114,17 @@ get_format_group_regex <- function() { regex(paste0(nwsd, ws, num, nws)) } + +#' Return the regex for identifying numbers within an output string +#' +#' This regex targets the individual numbers within the string +#' +#' @return A regular expression +#' @noRd +get_numeric_group_regex <- function() { + #`-?` - Matches an optional negative sign + # `(?:\d*\.\d+|\d+)` - A non-capturing group with two alternatives: + # `\d*\.\d+` - Matches decimals like `.75`, `0.56`, or `123.45` + # `\d+` - Matches integers like `1`, `523`, `56` + regex("-?(?:\\d*\\.\\d+|\\d+)") +} diff --git a/R/riskdiff.R b/R/riskdiff.R index be17c0b8..a6af81fb 100644 --- a/R/riskdiff.R +++ b/R/riskdiff.R @@ -176,7 +176,21 @@ prep_two_way <- function(comp) { msg = paste0("There are no records for the following groups within the variable ", as_name(treat_var), ": ", paste(invalid_groups, collapse=", "))) - two_way <- numeric_data + # create the merge columns + mrg <- as_label(pop_treat_var) + names(mrg) <- as_label(treat_var) + mrg_cols <- append(mrg, map_chr(cols, as_label)) + + two_way <- numeric_data %>% + left_join( + select(header_n, everything(), tot_fill = n), + by = mrg_cols + ) %>% + mutate( + distinct_total = if_else(is.na(distinct_total), tot_fill, 
distinct_total) + ) + + rm(mrg, mrg_cols) # Nested layers need to plug the NAs left over - needs revision in the future if (is_built_nest && quo_is_symbol(by[[1]])) { @@ -188,7 +202,6 @@ prep_two_way <- function(comp) { ) } - # If distinct is set and distinct values are there, use them if (comp_distinct && !is.null(distinct_by)) { two_way <- two_way %>% diff --git a/R/str_extractors.R b/R/str_extractors.R index cdb73535..f3a9b46e 100644 --- a/R/str_extractors.R +++ b/R/str_extractors.R @@ -22,7 +22,7 @@ #' #' @examples #' -#' string <- c(" 0 (0.0%)", " 8 (9.3%)", "78 (90.7%)") +#' string <- c(" 0 (0.0%)", " 8 (9.3%)", "78 (90.7%)", "-1 (-.56, .75) -523%, 56 | -34") #' #' str_extract_fmt_group(string, 2) #' @@ -31,11 +31,11 @@ str_extract_fmt_group <- function(string, format_group) { if (!inherits(string, "character")) { - stop("Paramter `string` must be a character vector", call.=FALSE) + stop("Parameter `string` must be a character vector", call.=FALSE) } - if (!inherits(format_group, "numeric") || (inherits(format_group, "numeric") && format_group %% 1 != 0)) { - stop("Paramter `format_group` must be an integer", call.=FALSE) + if (!inherits(format_group, c("integer", "numeric")) || (inherits(format_group, "numeric") && format_group %% 1 != 0)) { + stop("Parameter `format_group` must be an integer", call.=FALSE) } # Pull out regex to drive the work @@ -57,15 +57,15 @@ str_extract_fmt_group <- function(string, format_group) { str_extract_num <- function(string, format_group) { if (!inherits(string, "character")) { - stop("Paramter `string` must be a character vector", call.=FALSE) + stop("Parameter `string` must be a character vector", call.=FALSE) } - if (!inherits(format_group, "numeric") || (inherits(format_group, "numeric") && format_group %% 1 != 0)) { - stop("Paramter `format_group` must be an integer", call.=FALSE) + if (!inherits(format_group, c("integer", "numeric")) || (inherits(format_group, "numeric") && format_group %% 1 != 0)) { + stop("Parameter 
`format_group` must be an integer", call.=FALSE) } # Pull out regex to drive the work - f_grp_rx <- get_format_group_regex() + f_grp_rx <- get_numeric_group_regex() # Pull out all the match groups and then get the numeric for the conditional number match_groups <- str_match_all(string, f_grp_rx) @@ -73,6 +73,6 @@ str_extract_num <- function(string, format_group) { # Get the number upon which the condition will be evaluated map_dbl( match_groups, - ~ if (nrow(.) < format_group) {NA_real_} else {as.double(.[format_group, 2])} + ~ if (nrow(.) < format_group) {NA_real_} else {as.double(.[format_group, 1])} ) } diff --git a/man/Tplyr.Rd b/man/Tplyr.Rd index 06706f33..330856e3 100644 --- a/man/Tplyr.Rd +++ b/man/Tplyr.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/zzz.R \docType{package} \name{Tplyr} -\alias{Tplyr} \alias{Tplyr-package} +\alias{Tplyr} \title{A grammar of summary data for clinical reports} \description{ `r lifecycle::badge("experimental")` diff --git a/man/get_tplyr_regex.Rd b/man/get_tplyr_regex.Rd index 448057a1..eca594b7 100644 --- a/man/get_tplyr_regex.Rd +++ b/man/get_tplyr_regex.Rd @@ -4,7 +4,7 @@ \alias{get_tplyr_regex} \title{Retrieve one of Tplyr's regular expressions} \usage{ -get_tplyr_regex(rx = c("format_string", "format_group")) +get_tplyr_regex(rx = c("format_string", "format_group", "number_group")) } \arguments{ \item{rx}{A character string with either the value 'format_string' or @@ -18,7 +18,7 @@ This function allows you to extract important regular expressions used inside Tplyr. } \details{ -There are two important regular expressions used within Tplyr. The +There are three important regular expressions used within Tplyr. The format_string expression is the expression to parse format strings. This is what is used to make sense out of strings like 'xx (XX.x\%)' or 'a+1 (A.a+2)' by inferring what the user is specifying about number formatting. @@ -26,7 +26,8 @@ by inferring what the user is specifying about number formatting. 
The 'format_group' regex is the opposite of this, and when given a string of numbers, such as ' 5 (34\%) [9]' will return the separate segments of numbers broken into their format groups, which in this example would be ' 5', -'(34\%)', and '[9]'. +'(34\%)', and '[9]'. Lastly, the 'number_group' regex has a similar application +to the 'format_group' regex, but targets only numbers } \examples{ @@ -34,4 +35,6 @@ get_tplyr_regex('format_string') get_tplyr_regex('format_group') +get_tplyr_regex('number_group') + } diff --git a/man/str_extractors.Rd b/man/str_extractors.Rd index b8fe8b9b..be09ac4d 100644 --- a/man/str_extractors.Rd +++ b/man/str_extractors.Rd @@ -31,7 +31,7 @@ are ' 5', '(34.4\%)', and '[9]'. } \examples{ -string <- c(" 0 (0.0\%)", " 8 (9.3\%)", "78 (90.7\%)") +string <- c(" 0 (0.0\%)", " 8 (9.3\%)", "78 (90.7\%)", "-1 (-.56, .75) -523\%, 56 | -34") str_extract_fmt_group(string, 2) diff --git a/tests/testthat/_snaps/riskdiff.md b/tests/testthat/_snaps/riskdiff.md index 7a376ad0..b39e111e 100644 --- a/tests/testthat/_snaps/riskdiff.md +++ b/tests/testthat/_snaps/riskdiff.md @@ -22,3 +22,122 @@ Comparison {4, 4} has duplicated values. 
Comparisons must not be duplicates +# Missing counts don't cause error in comparisons + + Code + head(as.data.frame(build(t))) + Condition + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + 
Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Warning in `prop.test()`: + Chi-squared approximation may be incorrect + Output + row_label1 row_label2 + 1 SKIN AND SUBCUTANEOUS TISSUE DISORDERS SKIN AND SUBCUTANEOUS TISSUE DISORDERS + 2 SKIN AND SUBCUTANEOUS TISSUE DISORDERS ALOPECIA + 3 SKIN AND SUBCUTANEOUS TISSUE DISORDERS BLISTER + 4 SKIN AND SUBCUTANEOUS TISSUE DISORDERS COLD SWEAT + 5 SKIN AND SUBCUTANEOUS TISSUE DISORDERS DERMATITIS ATOPIC + 6 SKIN AND SUBCUTANEOUS TISSUE DISORDERS DERMATITIS CONTACT + var1_Placebo_F var1_Placebo_M var1_Xanomeline High Dose_F + 1 13 ( 24.5%) 8 ( 24.2%) 0 ( 0.0%) + 2 1 ( 1.9%) 0 ( 0.0%) 0 ( 0.0%) + 3 0 ( 0.0%) 0 ( 0.0%) 0 ( 0.0%) + 4 0 ( 0.0%) 1 ( 3.0%) 0 ( 0.0%) + 5 0 ( 0.0%) 1 ( 3.0%) 0 ( 0.0%) + 6 0 ( 0.0%) 0 ( 0.0%) 0 ( 0.0%) + var1_Xanomeline High Dose_M var1_Xanomeline Low Dose_F + 1 0 ( 0.0%) 24 ( 48.0%) + 2 0 ( 0.0%) 0 ( 0.0%) + 3 0 ( 0.0%) 2 ( 4.0%) + 4 0 ( 0.0%) 0 ( 0.0%) + 5 0 ( 0.0%) 0 ( 0.0%) + 6 0 ( 0.0%) 0 ( 0.0%) + var1_Xanomeline Low Dose_M ord_layer_index + 1 18 ( 52.9%) 1 + 2 0 ( 0.0%) 1 + 3 3 ( 8.8%) 1 + 4 0 ( 0.0%) 1 + 5 0 ( 0.0%) 1 + 6 1 ( 2.9%) 1 + rdiff_Xanomeline High Dose_Placebo_F rdiff_Xanomeline High Dose_Placebo_M + 1 -0.245 (-0.383, -0.108) -0.242 (-0.415, -0.070) + 2 -0.019 (-0.074, 0.037) 0.000 ( 0.000, 0.000) + 3 0.000 ( 0.000, 0.000) 0.000 ( 0.000, 0.000) + 4 0.000 ( 0.000, 0.000) -0.030 (-0.115, 0.055) + 5 0.000 ( 
0.000, 0.000) -0.030 (-0.115, 0.055) + 6 0.000 ( 0.000, 0.000) 0.000 ( 0.000, 0.000) + ord_layer_1 ord_layer_2 + 1 1 Inf + 2 1 1 + 3 1 2 + 4 1 3 + 5 1 4 + 6 1 5 + diff --git a/tests/testthat/test-riskdiff.R b/tests/testthat/test-riskdiff.R index 644ffec0..5735edbe 100644 --- a/tests/testthat/test-riskdiff.R +++ b/tests/testthat/test-riskdiff.R @@ -278,3 +278,26 @@ test_that("Error generates when duplicating riskdiff comparison values", { ) }) + +test_that("Missing counts don't cause error in comparisons", { + + +adae <- filter(tplyr_adae, TRTA != "Xanomeline High Dose" & AEDECOD != "ACTINIC KERATOSIS") + +# Create table +t <- tplyr_table(adae, TRTA, cols=SEX) %>% + # Set population + set_pop_data(tplyr_adsl) %>% + set_pop_treat_var(TRT01A) %>% + # Layer 1: Organ System and OCMQ (Narrow) Count Layer + add_layer( + group_count(vars(AEBODSYS, AEDECOD)) %>% + # Set distinct counts per subject + set_distinct_by(USUBJID) %>% + # Add risk differences + add_risk_diff(c("Xanomeline High Dose", "Placebo")) + ) + + # Build the table + expect_snapshot(head(as.data.frame(build(t)))) +}) \ No newline at end of file diff --git a/tests/testthat/test-str_extractors.R b/tests/testthat/test-str_extractors.R index 393a2d67..207f968f 100644 --- a/tests/testthat/test-str_extractors.R +++ b/tests/testthat/test-str_extractors.R @@ -1,57 +1,63 @@ -string <- c(" 0 (0.0%)", " 8 (9.3%)", "78 (90.7%)") +string <- c(" 0 (0.0%)", " 8 (9.3%)", "78 (90.7%)", "-1 (-0.56, -.75) -523%") test_that("String extractor errors generate properly", { expect_error( str_extract_fmt_group(c(1), 1), - "Paramter `string`" + "Parameter `string`" ) expect_error( str_extract_fmt_group(string, "hi"), - "Paramter `format_group`" + "Parameter `format_group`" ) expect_error( str_extract_num(c(1), 1), - "Paramter `string`" + "Parameter `string`" ) expect_error( str_extract_num(string, "hi"), - "Paramter `format_group`" + "Parameter `format_group`" ) }) test_that("Format groups can be extracted", { expect_equal( 
str_extract_fmt_group(string, 1), - c(' 0', ' 8', '78') + c(' 0', ' 8', '78', "-1") ) expect_equal( str_extract_fmt_group(string, 2), - c("(0.0%)", "(9.3%)", "(90.7%)") + c("(0.0%)", "(9.3%)", "(90.7%)", "(-0.56,") ) expect_equal( str_extract_fmt_group(string, 3), - rep(NA_character_, 3) + c(rep(NA_character_, 3), "-.75)") ) }) test_that("Numbers from format groups can be extracted", { expect_equal( str_extract_num(string, 1), - c(0, 8, 78) + c(0, 8, 78, -1) ) expect_equal( str_extract_num(string, 2), - c(0.0, 9.3, 90.7) + c(0.0, 9.3, 90.7, -.56) ) expect_equal( str_extract_num(string, 3), - rep(NA_real_, 3) + c(rep(NA_real_, 3), -.75) ) + + expect_equal( + str_extract_num(string, 4), + c(rep(NA_real_, 3), -523) + ) + }) diff --git a/vignettes/denom.Rmd b/vignettes/denom.Rmd index cb657c4e..e99d97a4 100644 --- a/vignettes/denom.Rmd +++ b/vignettes/denom.Rmd @@ -30,7 +30,7 @@ Make sure you have a good understand of count and shift layers before you review ## Population Data in the Denominator -What do you do when your target dataset doesn't _have_ the information necessary to create your denominator? For example - when you create an adverse event table, the adverse event dataset likely only contains records for subjects who experienced an adverse event. But subjects who did _not_ have an adverse event are still part of the study population and must be considered in the denominator. +What do you do when your target dataset doesn't _have_ the information necessary to create your denominator? For example, when you create an adverse event table, the adverse event dataset likely only contains records for subjects who experienced an adverse event. But subjects who did _not_ have an adverse event are still part of the study population and must be considered in the denominator. For this reason,**Tplyr** allows lets you set a separate population dataset - but there are a couple things you need to do to trigger **Tplyr** to use the population data as your denominator. 
@@ -74,11 +74,11 @@ Fortunately, denominators are much simpler when they're kept within a single dat ## Denominator Grouping -When you're looking within a single dataset, there are a couple factors that you need to consider for a denominator. The first is which grouping variables create those denominators. Let's look at this from two perspectives - count layers and shift layers. +When you're looking within a single dataset, there are a couple factors that you need to consider for a denominator. Firstly, which grouping variables create those denominators? Let's look at this from two perspectives: count layers and shift layers. ### Count layers -Most of the complexity of denominators comes from nuanced situations. A solid 80% of the time, defaults will work. For example, in a frequency table, you will typically want data within a column to sum to 100%. For example: +Most of the complexity of denominators comes from nuanced situations. Tplyr is designed with practical defaults that suit most clinical summaries. For example, in a frequency table, you will typically want data within a column to sum to 100%, like so: ```{r} tplyr_adsl <- tplyr_adsl %>% @@ -180,9 +180,9 @@ There are some circumstances that you'll encounter where the filter used for a d Yeah we know - there are a lot of different places that filtering can happen... -So let's take the example shown below. The first layer has no layer level filtering applied, so the table level `where` is the only filter applied. The second layer has a layer level filter applied, so the denominators will be based on that layer level filter. Notice how in this case, the percentages in the second layer add up to 100%. This is because the denominator only includes values used in that layer. +So let's take the example shown below. The first layer has no layer-level filtering applied, so the table-level `where` is the only filter applied. 
The second layer has a layer-level filter applied, so the denominators will be based on that layer-level filter. Notice how in this case, the percentages in the second layer add up to 100%. This is because the denominator only includes values used in that layer. -The third layer has a layer level filter applied, but additionally uses `set_denom_where()`. The `set_denom_where()` in this example is actually *removing* the layer level filter for the denominators. This is because in R, when you filter using `TRUE`, the filter returns all records. So by using `TRUE` in `set_denom_where()`, the layer level filter is effectively removed. This causes the denominator to include all values available from the table and not just those selected for that layer - so for this layer, the percentages will *not add up to 100%*. This is important - this allows the percentages from Layer 3 to sum to the total percentage of "DISCONTINUED" from Layer 1. +The third layer has a layer-level filter applied, but additionally uses `set_denom_where()`. The `set_denom_where()` in this example is actually *removing* the layer-level filter for the denominators. This is because in R, when you filter using `TRUE`, the filter returns all records. So by using `TRUE` in `set_denom_where()`, the layer-level filter is effectively removed. This causes the denominator to include all values available from the table and not just those selected for that layer - so for this layer, the percentages will *not add up to 100%*. This is important - this allows the percentages from Layer 3 to sum to the total percentage of "DISCONTINUED" from Layer 1. ```{r} tplyr_adsl2 <- tplyr_adsl %>% @@ -210,9 +210,9 @@ t %>% Missing counts are a tricky area for frequency tables, and they play directly in with denominators as well. These values raise a number of questions. For example, do you want to format the missing counts the same way as the event counts? Do you want to present missing counts with percentages? 
Do missing counts belong in the denominator? -The `set_missing_count()` function can take a new `f_str()` object to set the display of missing values. If not specified, the associated count layer's format will be used. Using the `...` parameter, you are able to specify the row label desired for missing values and values that you determine to be considered 'missing'. For example, you may have NA values in the target variable, and then values like "Not Collected" that you also wish to consider "missing". `set_missing_count()` allows you to group those together. Actually - you're able to establish as many different "missing" groups as you want - even though that scenario is fairly unlikely. +The `set_missing_count()` function can take a new `f_str()` object to set the display of missing values. If not specified, the associated count layer's format will be used. Using the `...` parameter, you are able to specify the row label desired for missing values and values that you determine to be considered 'missing'. For example, you may have NA values in the target variable, and then values like "Not Collected" that you also wish to consider "missing". `set_missing_count()` allows you to group those together. Actually, you're able to establish as many different "missing" groups as you want - even though that scenario is fairly unlikely. -In the example below 50 random values are removed and NA is specified as the missing string. This leads us to another parameter - `denom_ignore`. By default, if you specify missing values they will still be considered within the denominator, but when you have missing counts, you may wish to exclude them from the totals being summarized. By setting `denom_ignore` to TRUE, your denominators will ignore any groups of missing values that you've specified. +In the example below, 50 random values are removed and NA is specified as the missing string. This leads us to another parameter: `denom_ignore`. 
By default, Tplyr will include missing values within the denominator, but you may wish to exclude them from the totals being summarized. By setting `denom_ignore` to TRUE, your denominators will ignore any groups of missing values that you've specified. ```{r} tplyr_adae2 <- tplyr_adae @@ -231,11 +231,11 @@ t %>% kable() ``` -We did one more other thing worth explaining in the example above - gave the missing count its own sort value. If you leave this field null, it will simply be the maximum value in the order layer plus 1, to put the Missing counts at the bottom during an ascending sort. But tables can be sorted a lot of different ways, as you'll see in the sort vignette. So instead of trying to come up with novel ways for you to control where the missing row goes - we decided to just let you specify your own value. +We did one more other thing worth explaining in the example above - we gave the missing count its own sort value. If you leave this field null, it will simply be the maximum value in the order layer plus 1, to put the Missing counts at the bottom during an ascending sort. But tables can be sorted a lot of different ways, as you'll see in the sort vignette. So instead of trying to come up with novel ways for you to control where the missing row goes, we decided to just let you specify your own value. ## Missing Subjects -Missing counts and counting missing subjects work two different ways within Tplyr. Missing counts, as described above, will examine the records present in the data and collect and missing values. But for these results to be counted, they need to first be provided within the input data itself. On the other hand, missing subjects are calculated by looking at the difference between the potential number of subjects within the column (i.e. the combination of the treatment variables and column variables) and the number of subjects actually present. 
Consider this example: +Missing counts and counting missing subjects work two different ways within Tplyr. Missing counts, as described above, will examine the records present in the data and collect any missing values. But for these results to be counted, they need to first be provided within the input data itself. On the other hand, missing subjects are calculated by looking at the difference between the *potential* number of subjects within the column (i.e. the combination of the treatment variables and column variables) and the number of subjects *actually* present. Consider this example: ```{r missing_subs1} missing_subs <- tplyr_table(tplyr_adae, TRTA) %>% @@ -255,7 +255,7 @@ Missing counts and counting missing subjects work two different ways within Tply kable() ``` -In the example above, we produce a nested count layer. The function `add_missing_subjects_row()` triggers the addition of the new result row for which the missing subjects are calculated. The row label applied for this can be configured using `set_missing_subjects_row_label()`, and the row label itself will default to 'Missing'. Depending on your sorting needs, a `sort_value` can be applied to whatever numeric value you provide. Lastly, you can provide an `f_str()` to format the missing subjects row separately from the rest of the layer, but whatever format is applied to the layer will apply otherwise. +In the example above, we produce a nested count layer. The function `add_missing_subjects_row()` triggers the addition of the new result row for which the missing subjects are calculated. The row label applied for this can be configured using `set_missing_subjects_row_label()`, and the row label itself will default to 'Missing'. Depending on your sorting needs, a `sort_value` can be applied to whatever numeric value you provide. You can also provide an `f_str()` to format the missing subjects row separately from the rest of the layer. 
Note that in nested count layers, missing subject rows will generate for each independent group within the outer layer. Outer layers cannot have missing subject rows calculated individually. This would best be done in an independent layer itself, as the result would apply to the whole input target dataset. @@ -306,7 +306,7 @@ tplyr_table(tplyr_adsl2, TRT01P) %>% kable() ``` -Now the table is more intuitive. We used `set_missing_count()` to update our denominators, so missing have been excluded. Now, the total row intuitively matches the denominators used within each group, and we can see how many missing records were excluded. +Now the table is more intuitive. We used `set_missing_count()` to update our denominators, so missing values have been excluded. Now, the total row intuitively matches the denominators used within each group, and we can see how many missing records were excluded. _You may have stumbled upon this portion of the vignette while searching for how to create a total column. **Tplyr** allows you to do this as well with the function `add_total_group()` and read more in `vignette("table")`._ diff --git a/vignettes/layer_templates.Rmd b/vignettes/layer_templates.Rmd index 021fc0b6..c166905b 100644 --- a/vignettes/layer_templates.Rmd +++ b/vignettes/layer_templates.Rmd @@ -19,13 +19,13 @@ library(Tplyr) library(knitr) ``` -There are several scenarios where a layer template may be useful. Some tables, like demographics tables, may have many layers that will all essentially look the same. Categorical variables will have the same count layer settings, and continuous variables will have the same desc layer settings. A template allows a user to build those settings once per layer, then reference the template when the **Tplyr** table is actually built. Another scenario might be building a set of company layer templates that are built for standard tables to reduce the footprint of code across analyses.
In either of these cases, the idea is the reduce the amount of redundant code necessary to create a table. +There are several scenarios where a layer template may be useful. Some tables, like demographics tables, may have many layers that will all essentially look the same. Categorical variables will have the same count layer settings, and continuous variables will have the same desc layer settings. A template allows a user to build those settings once per layer, then reference the template when the **Tplyr** table is actually built. Another scenario might be building a set of company layer templates that are built for standard tables to reduce the footprint of code across analyses. In either of these cases, the idea is to reduce the amount of redundant code necessary to create a table. -Tplyr has already has a couple of mechanisms to reduce redundant application of formats. For example, `vignettes('tplyr_options')` shows how the options `tplyr.count_layer_default_formats`, `tplyr.desc_layer_default_formats`, and `tplyr.shift_layer_default_formats` can be used to create default format string settings. Additionally, you can set formats table wide using `set_count_layer_formats()`, `set_desc_layer_formats()`, or `set_shift_layer_formats()`. But what these functions and options _don't_ allow you to do is pre-set and reuse the settings for an entire layer, so all of the additional potential layer modifying functions are ignored. This is where layer templates come in. +Tplyr already has mechanisms to reduce redundant application of formats. For example, `vignettes('tplyr_options')` shows how the options `tplyr.count_layer_default_formats`, `tplyr.desc_layer_default_formats`, and `tplyr.shift_layer_default_formats` can be used to create default format string settings. Additionally, you can set formats table-wide using `set_count_layer_formats()`, `set_desc_layer_formats()`, or `set_shift_layer_formats()`.
But what these functions and options _don't_ allow you to do is pre-set and reuse the settings for an entire layer, so all of the additional potential layer-modifying functions are ignored. This is where layer templates come in. # Basic Templates -The functions `new_layer_template()` and `use_template()` allow a user to create and use layer templates. Layer templates allow a user to pre-build and reuse an entire layer configuration, from the layer constructor down to all modifying functions. Furthermore, users can specify parameters they may want to be interchangeable. Additionally, layer templates are extensible, so a template can be use and then further extended with additional layer modifying functions. +The functions `new_layer_template()` and `use_template()` allow a user to create and use layer templates. Layer templates allow a user to pre-build and reuse an entire layer configuration, from the layer constructor down to all modifying functions. Furthermore, users can specify parameters they may want to be interchangeable. Additionally, layer templates are extensible, so a template can be used and then further extended with additional layer-modifying functions. Consider the following example: @@ -37,7 +37,7 @@ new_layer_template( ) ``` -In this example, we've created a basic layer template. The template is named "example_template", and this is the name we'll use to reference the template when we want to use it. When the template is created, we start with the function `group_count(...)`. Note the use of the ellipsis (i.e. `...`). This is a required part of a layer template. Templates must start with a **Tplyr** layer constructor, which is one of the function `group_count()`, `group_desc()`, or `group_shift()`. The ellipsis is necessary because when the template is used, we are able to pass arguments directly into the layer constructor. For example: +In this example, we've created a basic layer template. 
The template is named "example_template", and this is the name we'll use to reference the template when we want to use it. When the template is created, we start with the function `group_count(...)`. Note the use of the ellipsis (i.e. `...`). This is a required part of a layer template. Templates must start with a **Tplyr** layer constructor, which is one of the functions `group_count()`, `group_desc()`, or `group_shift()`. The ellipsis is necessary because when the template is used, we are able to pass arguments directly into the layer constructor. For example: ```{r using a template} tplyr_table(tplyr_adsl, TRT01P) %>% @@ -48,7 +48,7 @@ tplyr_table(tplyr_adsl, TRT01P) %>% kable() ``` -Within `use_template()`, the first parameter is the template name. After that, we supply arguments as we normally would into `group_count()`, `group_desc()`, or `group_shift()`. Additionally, note that our formats have been applied just as they would be if we used `set_format_strings()` as specified in the template. Our template was applied, the table built with all of the settings appropriately. +Within `use_template()`, the first parameter is the template name. After that, we supply arguments as we normally would into `group_count()`, `group_desc()`, or `group_shift()`. Additionally, note that our formats have been applied just as they would be if we used `set_format_strings()` as specified in the template. Our template was applied, and the table was built with all of the settings appropriately. An additional feature of layer templates is that they act just as any other function would in a **Tplyr** layer. This means that they're also extensible and can be expanded on directly within a **Tplyr** table. For example: @@ -62,7 +62,7 @@ tplyr_table(tplyr_adsl, TRT01P) %>% kable() ``` -Here we show two things - first, that the we called the template without the by variable argument from the previous example.
This allows a template to have some flexibility depending on the context of its usage. Furthermore, we added the additional modifier function `add_total_row()`. In this example, we took the layer as constructed by the template and then modified that layer further. This may be useful if most but not all of a layer is reusable. The reusable portions can be put in a template, and the rest added using normal **Tplyr** syntax. +Here we show two things - first, that we called the template without the *by* variable argument from the previous example. This allows a template to have some flexibility depending on the context of its usage. Furthermore, we added the additional modifier function `add_total_row()`. In this example, we took the layer as constructed by the template and then modified that layer further. This may be useful if most but not all of a layer is reusable. The reusable portions can be put in a template, and the rest added using normal **Tplyr** syntax. ## Templates With Parameters diff --git a/vignettes/shift.Rmd b/vignettes/shift.Rmd index 43a30e8d..d4af9376 100644 --- a/vignettes/shift.Rmd +++ b/vignettes/shift.Rmd @@ -24,14 +24,14 @@ library(knitr) Shift tables are a special kind of frequency table - but what they count are changes in state. This is most common when looking at laboratory ranges, where you may be interested in seeing how a subject's results related to normal ranges. The 'change in state' would refer to how that subject's results were at baseline versus different points of measure. Shift tables allow you to see the distribution of how subjects move between normal ranges, and if the population is improving or worsening as the study progresses. -While shift tables are very similar to a normal frequency table, there's more nuance here, and thus we decided to create `group_shift()`. This function is largely an abstraction of a count layer, and in fact re-uses a good deal of the same underlying code. 
But we handle some of the complexity for you to make the interface easy to use and the behavior similar to that of the `group_count()` and `group_desc()` APIs. Given that shift tables are built on count layers, many of functions that work with count layers behave in the same way when using shift layers. However, the following cannot be used in shift layers: +While shift tables are very similar to a normal frequency table, there's more nuance here, and thus we decided to create `group_shift()`. This function is largely an abstraction of a count layer, and in fact re-uses a good deal of the same underlying code. But we handle some of the complexity for you to make the interface easy to use and the behavior similar to that of the `group_count()` and `group_desc()` APIs. Given that shift tables are built on count layers, many functions that work with count layers behave in the same way when used on shift layers. However, the following cannot be used in shift layers: - Functions related to nested counts, including `set_nest_count()`, `set_outer_sort_position()` - Functions related to total rows and missing rows, including `set_missing_count()`, `add_total_row()`, `set_total_row_label()` - Risk difference, including `add_risk_diff()` -- and finally, result based sorting methods, including `set_order_count_method()`, `set_ordering_cols()`, `set_result_order_var()` +- and finally, result-based sorting methods, including `set_order_count_method()`, `set_ordering_cols()`, `set_result_order_var()` -One thing to note - the `group_shift()` API is intended to be used on shift tables where one group is presented in rows and the other group in columns. Occasionally, shift tables will have a row based approach that shows "Low to High", "Normal to High", etc. For those situations, `group_count()` will do just fine. +One thing to note - the `group_shift()` API is intended to be used on shift tables where one group is presented in rows and the other group in columns. 
Occasionally, shift tables will have a row-based approach that shows "Low to High", "Normal to High", etc. For those situations, `group_count()` will do just fine. ## A Basic Example @@ -49,7 +49,7 @@ tplyr_table(tplyr_adlb, TRTA, where=PARAMCD == "CK") %>% First, let's look at the differences in the shift API. Shift layers *must* take a row and a column variable, as the layer is designed to create a box for you that explains the changes in state. The row variable will typically be your "from" variable, and the column variable will typically be your "to" variable. Behind the scenes, **Tplyr** breaks this down for you to properly count and present the data. -For the most part, the last example gets us where we want to go - but there's still some that's left to be desired. It doesn’t look like there are any 'L' values for BNRIND in the dataset so we are not getting and rows containing 'L'. Let’s see if we can fix that by dummying in the possible values. +For the most part, the last example gets us where we want to go - but there's still something left to be desired. It doesn’t look like there are any 'L' values for BNRIND in the dataset so we are not getting any rows containing 'L'. Let’s see if we can fix that by dummying in the possible values. ## Filling Missing Groups Using Factors @@ -65,7 +65,7 @@ tplyr_table(tplyr_adlb, TRTA, where=PARAMCD == "CK") %>% kable() ``` -There we go. This is another situation where using factors in R let's us dummy values within the dataset. Furthermore, since factors are ordered, it automatically corrected the sort order of the row labels too. +There we go. This is another situation where using factors in R enables us to dummy values within the dataset. Furthermore, since factors are ordered, Tplyr automatically corrected the sort order of the row labels too. Now, instead of alphabetically (H then L then N), our rows are sorted by factor levels (L then N then H). ## Where to go from here