Merge pull request #23 from ethanbass/improve_parallelization

v0.5.0
ethanbass · May 7, 2023 · 0ac1312 · 0ac1312
2 parents 562025e + 1c22b69
commit 0ac1312
Show file tree

Hide file tree

Showing 44 changed files with 1,252 additions and 331 deletions.
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -18,13 +18,14 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - {os: macos-latest,   r: 'release'}
+          - {os: macos-latest,   r: 'release', visual_tests: true}
           - {os: windows-latest, r: 'release'}
           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
           - {os: ubuntu-latest,   r: 'oldrel-1'}
 
     env:
+      VISUAL_TESTS: ${{ matrix.config.visual_tests }}
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       R_KEEP_PKG_SOURCE: yes
 
@@ -39,6 +40,11 @@ jobs:
           http-user-agent: ${{ matrix.config.http-user-agent }}
           use-public-rspm: true
 
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+
       - name: Set up Python 3.9
         uses: actions/setup-python@v3
         with:
@@ -51,12 +57,7 @@ jobs:
           Rscript -e "reticulate::conda_install('r-reticulate', 'python-kaleido')"
           Rscript -e "reticulate::conda_install('r-reticulate', 'plotly', channel = 'plotly')"
           Rscript -e "reticulate::use_miniconda('r-reticulate')"
-
-      - uses: r-lib/actions/setup-r-dependencies@v2
-        with:
-          extra-packages: any::rcmdcheck
-          needs: check
-
+          
       - uses: r-lib/actions/check-r-package@v2
         with:
           upload-snapshots: true
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,13 @@
 Type: Package
 Package: chromatographR
 Title: Chromatographic Data Analysis Toolset
-Version: 0.4.8
-Authors@R: 
+Version: 0.5.0
+Authors@R: c(
     person("Ethan", "Bass", , "[email protected]", role = c("aut", "cre"),
-           comment = c(ORCID = "0000-0002-6175-6739"))
+           comment = c(ORCID = "0000-0002-6175-6739")),
+    person(c("Hans","W"), "Borchers", role = c("ctb", "cph"),
+            comment = c("Author of savgol and pinv functions bundled from pracma"))
+    )
 Maintainer: Ethan Bass <[email protected]>
 Description: Tools for high-throughput analysis of HPLC-DAD/UV
     chromatograms (or similar data). Includes functions for preprocessing, alignment,
@@ -31,25 +34,30 @@ Imports:
     dynamicTreeCut,
     fastcluster,
     Formula,
+    fs,
     graphics,
     grDevices,
     lattice,
     methods,
     minpack.lm,
     parallel,
     ptw,
+    purrr,
     pvclust,
     scales,
     stats,
     utils,
     VPdtw
 Suggests: 
+    cowplot,
     ggplot2,
     knitr,
     openxlsx,
     pbapply,
     plotly,
+    reticulate,
     rmarkdown,
+    rsvg,
     spelling,
     testthat (>= 3.0.0),
     vdiffr

diff --git a/NAMESPACE b/NAMESPACE
@@ -59,6 +59,7 @@ importFrom(lattice,panel.stripplot)
 importFrom(lattice,stripplot)
 importFrom(methods,new)
 importFrom(minpack.lm,nlsLM)
+importFrom(parallel,mclapply)
 importFrom(pvclust,pvclust)
 importFrom(pvclust,pvpick)
 importFrom(pvclust,pvrect)

diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,38 @@
+# chromatographR 0.5.0
+
+#### New features
+
+* Added `ggplot2` option to `plot_spectrum`, `plot.peak_table` and `plot_all_spectra` functions.
+* Reworked `write_chroms` for more sensible handling of paths and added `filename` argument.
+* Updated `get_purity` function to improve speed.
+* Added additional argument to `reshape_chroms` function for subsetting data by 
+retention times (`rts`).
+* Added parallel processing through the `pbapply` package for the `correct_rt`,
+`get_peaks`, and `preprocess` functions by setting the `cl` argument.
+
+#### Other changes
+
+* Changed behavior of `preprocess` when inferring retention times so chromatograms are no longer rounded down to the largest integer.
+* In `preprocess`, spectral smoothing is no longer applied on 2D chromatograms, removing error message when preprocess is used with default settings.
+* Moved position of `...` argument to end in `plot.peak_table`.
+* Changed `progress_bar` argument to `show_progress` in `correct_rt`, `preprocess`
+and `get_peaks` to fix strange `pmatch` behavior with additional arguments to
+preprocess.
+* Changed orientation of "plotly" plots generated by `plot_spectrum` to match other
+plotting engines.
+* Deprecated the `mc.cores` argument in `correct_rt` is now deprecated in favor of the new
+`cl` argument.
+* Deprecated the `parallel` argument in `preprocess` in favor of just using `cl`.
+* Changed name of first argument in `mirror_plot` from `peak_table` to `x`. Otherwise the function has not changed.
+* Added additional tests, improving test coverage to 80%.
+* Updated `get_chrom_list` (internal) to allow parsing of subsetted lists.
+
 # chromatographR 0.4.8
 
-* Fixed `merge_peaks` function so it works properly (to combine 2 or more peaks in peak table).
+* Fixed bug in `merge_peaks` function so it works properly (to combine 2 or more
+peaks in a peak table).
 * Fixed bugs in `plot_chroms` preventing plotting with `ggplot2` and plotting wrong chromatograms in base R. 
-* Added additional tests for `plot_chroms`, reshape functions.
+* Added additional tests for `plot_chroms` and reshape functions.
 
 # chromatographR 0.4.7
 
@@ -33,6 +63,7 @@
 # chromatographR 0.4.5
 
 ### New Features
+
 * Added `reshape_chroms` function for converting chromatograms to "long" format.
 * Added `write_peaktable` function to easily write peak_table to `csv` or `xlsx`.
 * Added `get_purity` function for assessing peak purity.

diff --git a/R/attach_metadata.R b/R/attach_metadata.R
@@ -144,7 +144,7 @@ get_reference_spectra <- function(peak_table, chrom_list,
 
 attach_ref_spectra <- function(peak_table, chrom_list, ref = c("max.cor","max.int")){
   check_peaktable(peak_table)
-  peak_table$ref_spectra <- get_reference_spectra(peak_table, chrom_list, ref)
+  peak_table$ref_spectra <- get_reference_spectra(peak_table, chrom_list, ref = ref)
   peak_table$args["reference_spectra"] <- ref
   return(peak_table)
 }
@@ -193,7 +193,7 @@ normalize_data <- function(peak_table, column, chrom_list,
     })))
     rownames(pktab) <- rownames(peak_table$tab)
     peak_table$tab <- pktab
-    peak_table$args[c("normalized","normalization_by")] <- c(TRUE, column)
+    peak_table$args[c("normalized", "normalization_by")] <- c(TRUE, column)
     return(peak_table)
   } else if (what == "chrom_list"){
     if (missing(chrom_list)){

diff --git a/R/correct_rt.R b/R/correct_rt.R
@@ -35,9 +35,13 @@
 #' @param maxshift Integer. Maximum allowable shift for \code{\link[VPdtw]{VPdtw}}.
 #' Defaults to 50.
 #' @param verbose Whether to print verbose output.
-#' @param progress_bar Logical. Whether to show progress bar. Defaults to 
+#' @param show_progress Logical. Whether to show progress bar. Defaults to 
 #' \code{TRUE} if \code{\link[pbapply]{pbapply}} is installed. Currently works 
 #' only for \code{ptw} alignments.
+#' @param cl Argument to \code{\link[pbapply]{pblapply}} or \code{\link[parallel]{mclapply}}.
+#' Either an integer specifying the number of clusters to use for parallel
+#' processing or a cluster object created by \code{\link[parallel]{makeCluster}}.
+#' Defaults to 2. On Windows integer values will be ignored.
 #' @param \dots Optional arguments for the \code{\link[ptw:ptw]{ptw}} function.
 #' The only argument that cannot be changed is \code{warp.type}: this is always
 #' equal to \code{"global"}.
@@ -80,7 +84,7 @@ correct_rt <- function(chrom_list, lambdas, models = NULL, reference = 'best',
                        init.coef = c(0, 1, 0), n.traces = NULL, n.zeros = 0, 
                        scale = FALSE, trwdth = 200, plot_it = FALSE,
                        penalty = 5, maxshift = 50,
-                       verbose = FALSE, progress_bar, ...){
+                       verbose = FALSE, show_progress = NULL, cl = 2, ...){
   what <- match.arg(what, c("corrected.values", "models"))
   alg <- match.arg(alg, c("ptw", "vpdtw"))
 
@@ -134,10 +138,7 @@ correct_rt <- function(chrom_list, lambdas, models = NULL, reference = 'best',
                             penalty = penalty, maxshift = maxshift))
     if (alg == "ptw"){
       if (is.null(models)){
-        if (missing(progress_bar)){
-          progress_bar <- check_for_pkg("pbapply", return_boolean = TRUE)
-        }
-        laplee <- choose_apply_fnc(progress_bar)
+        laplee <- choose_apply_fnc(show_progress, cl = cl)
         models <- laplee(seq_len(dim(allmats)[3]), function(ii){
           ptw(allmats.t[,, reference],
               allmats.t[,, ii], selected.traces = traces, init.coef = init.coef,

diff --git a/R/fit_peaks.R b/R/fit_peaks.R
@@ -56,9 +56,12 @@
 find_peaks <- function(y, smooth_type=c("gaussian", "box", "savgol", "mva","tmva","none"),
                        smooth_window = .001, slope_thresh = 0, amp_thresh = 0,
                        bounds = TRUE){
+  if (!is.vector(y)){
+    stop("Please provide a vector to argument `y` to proceed.")
+  }
   smooth_type <- match.arg(smooth_type, c("gaussian", "box","savgol", "mva","tmva","none"))
   if (smooth_window < 1){
-    smooth_window <- length(y)*smooth_window
+    smooth_window <- max(length(y) * smooth_window, 1)
   }
   # compute derivative (with or without smoothing)
   if (smooth_type == "savgol"){
@@ -71,7 +74,7 @@ find_peaks <- function(y, smooth_type=c("gaussian", "box", "savgol", "mva","tmva
   }  else if (smooth_type == "box"){
     d <- diff(ksmooth(seq_along(y), y, kernel = "box", bandwidth = smooth_window)$y)
   } else if (smooth_type == "tmva"){
-    d <- runmean(runmean(diff(y), k=smooth_window), k=smooth_window)
+    d <- runmean(runmean(diff(y), k = smooth_window), k = smooth_window)
   } else{
     d <- diff(y)
   }
@@ -159,9 +162,10 @@ find_peaks <- function(y, smooth_type=c("gaussian", "box", "savgol", "mva","tmva
 #' good model for chromatographic peaks in isocratic HPLC? \emph{Chromatographia},
 #' /bold{26}: 285-296. \doi{10.1007/BF02268168}.
 #' @export fit_peaks
+
 fit_peaks <- function (x, lambda, pos = NULL, sd.max = 50,
                        fit = c("egh", "gaussian", "raw"),  max.iter = 1000, 
-                       estimate_purity = TRUE, noise_threshold=.001, ...){
+                       estimate_purity = TRUE, noise_threshold = .001, ...){
   y <- x[,lambda]
   fit <- match.arg(fit, c("egh", "gaussian", "raw"))
   if (is.null(pos)){
@@ -209,7 +213,7 @@ fit_peaks <- function (x, lambda, pos = NULL, sd.max = 50,
 #' Gaussian function
 #' @note: Adapted from \href{https://github.com/robertdouglasmorrison/DuffyTools/blob/master/R/gaussian.R}
 #' @noRd
-gaussian <- function(x, center=0, width=1, height=NULL, floor=0) {
+gaussian <- function(x, center = 0, width = 1, height=NULL, floor=0) {
 
   # adapted from Earl F. Glynn;  Stowers Institute for Medical Research, 2007
   twoVar <- 2 * width * width
@@ -315,7 +319,7 @@ fit_egh <- function(x1, y1, start.center=NULL, start.width=NULL, start.tau=NULL,
     if (is.null( start.floor)) start.floor <- quantile( y1, seq(0,1,0.1))[2]
     starts <- c(starts, "floor"=start.floor)
     nlsAns <- try(nlsLM(y1 ~ egh(x1, center, width, height, tau, floor),
-                        start=starts, control=controlList), silent=TRUE)
+                        start = starts, control = controlList), silent = TRUE)
   }
 
   # package up the results to pass back
@@ -398,7 +402,7 @@ fitpk_raw <- function(x, pos, lambda, max.iter,
 #' @param fl Filter length (for instance fl = 51..151), has to be odd.
 #' @param forder filter order Filter order (2 = quadratic filter, 4 = quartic).
 #' @param dorder Derivative order (0 = smoothing, 1 = first derivative, etc.).
-#' @note This function is ported from \href{https://cran.r-project.org/web/packages/pracma/index.html}{pracma},
+#' @note This function is bundled from \href{https://cran.r-project.org/web/packages/pracma/index.html}{pracma},
 #' where it is licensed under GPL (>= 3).
 #' @importFrom stats convolve
 #' @noRd

diff --git a/R/get_peaks.R b/R/get_peaks.R
@@ -29,8 +29,12 @@
 #' @param estimate_purity Logical. Whether to estimate purity or not. Defaults
 #' to FALSE. (If TRUE, this will slow down the function significantly).
 #' @param noise_threshold Noise threshold. Argument to \code{get_purity}.
-#' @param progress_bar Logical. Whether to show progress bar. Defaults to 
+#' @param show_progress Logical. Whether to show progress bar. Defaults to 
 #' \code{TRUE} if \code{\link[pbapply]{pbapply}} is installed.
+#' @param cl Argument to \code{\link[pbapply]{pblapply}} or \code{\link[parallel]{mclapply}}.
+#' Either an integer specifying the number of clusters to use for parallel
+#' processing or a cluster object created by \code{\link[parallel]{makeCluster}}.
+#' Defaults to 2. On Windows integer values will be ignored.
 #' @param \dots Additional arguments to \code{\link{find_peaks}}.
 #' @return The result is an S3 object of class \code{peak_list}, containing a nested
 #' list of data.frames containing information about the peaks fitted for each
@@ -69,7 +73,7 @@ get_peaks <- function(chrom_list, lambdas, fit = c("egh", "gaussian", "raw"),
                       sd.max = 50, max.iter = 100,
                       time.units = c("min", "s", "ms"),
                       estimate_purity = FALSE,  noise_threshold = .001,
-                      progress_bar, ...){
+                      show_progress = NULL, cl = 2, ...){
   time.units <- match.arg(time.units, c("min", "s", "ms"))
   tfac <- switch(time.units, "min" = 1, "s" = 60, "ms" = 60*1000)
   fit <- match.arg(fit, c("egh", "gaussian", "raw"))
@@ -90,10 +94,7 @@ get_peaks <- function(chrom_list, lambdas, fit = c("egh", "gaussian", "raw"),
     names(chrom_list) <- seq_along(chrom_list)
   }
   peaks<-list()
-  if (missing(progress_bar)){
-    progress_bar <- check_for_pkg("pbapply", return_boolean = TRUE)
-  }
-  laplee <- choose_apply_fnc(progress_bar)
+  laplee <- choose_apply_fnc(show_progress, cl = cl)
   result <- laplee(seq_along(chrom_list), function(sample){
     suppressWarnings(ptable <- lapply(lambdas, function(lambda){
       cbind(sample = names(chrom_list)[sample], lambda,

diff --git a/R/get_peaktable.R b/R/get_peaktable.R
@@ -89,7 +89,7 @@
 #' get_peaktable(pks, response = "area")
 #' @seealso \code{\link{attach_ref_spectra}} \code{\link{attach_metadata}}
 #' @export get_peaktable
-#' 
+
 get_peaktable <- function(peak_list, chrom_list, response = c("area", "height"),
                           use.cor = FALSE, hmax = 0.2, plot_it = FALSE,
                           ask = plot_it, clust = c("rt","sp.rt"),
@@ -132,28 +132,28 @@ get_peaktable <- function(peak_list, chrom_list, response = c("area", "height"),
     }
     if (clust == 'sp.rt'){
       if (is.null(sigma.t)){
-        sigma.t <- .5*mean(do.call(rbind,unlist(pkLst,recursive = FALSE))$end - 
+        sigma.t <- 0.5 * mean(do.call(rbind,unlist(pkLst,recursive = FALSE))$end - 
                              do.call(rbind,unlist(pkLst,recursive = FALSE))$start)
       }
-      ts<- as.numeric(rownames(chrom_list[[1]]))
+      ts <- as.numeric(rownames(chrom_list[[1]]))
       sp <- sapply(seq_along(pkcenters), function(i){
         rescale(t(chrom_list[[file.idx[i]]][
           which(elementwise.all.equal(ts, pkcenters[i])),]))
       }, simplify=TRUE)
       cor.matrix <- cor(sp, method = "pearson")
       mint <- abs(outer(unlist(pkcenters), unlist(pkcenters), FUN="-"))
-      S <- (exp((-(1-abs(cor.matrix))^2)/(2*sigma.r^2)))*exp(-(mint^2)/(2*sigma.t^2))
-      D <- 1-S
+      S <- (exp((-(1 - abs(cor.matrix))^2)/(2*sigma.r^2)))*exp(-(mint^2)/(2*sigma.t^2))
+      D <- 1 - S
       linkage <- "average"
       pkcenters.hcl <- hclust(as.dist(D), method = linkage)
       pkcenters.cl <- cutreeDynamicTree(pkcenters.hcl, maxTreeHeight = hmax, 
                                       deepSplit = deepSplit, minModuleSize = 2)
       sing <- which(pkcenters.cl == 0)
       pkcenters.cl[sing] <- max(pkcenters.cl) + seq_along(sing)
     }
-    cl.centers <- aggregate(cbind(rt,start,end,sd,tau,FWHM,r.squared,purity) ~ 
-                              pkcenters.cl, data=xx, "mean", na.rm=TRUE,
-                            na.action="na.pass")[,-1]
+    vars <- c("rt", "start", "end", "sd", "tau", "FWHM", "r.squared", "purity")
+    cl.centers <- aggregate(xx[,vars], by = list(pkcenters.cl), FUN = "mean",
+                            na.rm = TRUE, na.action = "na.pass")[,-1]
     ncl <- length(cl.centers$rt)
 
     ## re-order clusters from small to large rt