Merge pull request #31 from ethanbass/dev

v0.7.0
ethanbass · Jan 26, 2024 · 708ab3a · 708ab3a
2 parents 9f4d90b + babf42a
commit 708ab3a
Show file tree

Hide file tree

Showing 67 changed files with 2,956 additions and 944 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: chromatographR
 Title: Chromatographic Data Analysis Toolset
-Version: 0.6.1
+Version: 0.7.0
 Authors@R: c(
     person("Ethan", "Bass", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-6175-6739")),
@@ -43,7 +43,6 @@ Imports:
     parallel,
     ptw,
     purrr,
-    pvclust,
     scales,
     stats,
     utils,
@@ -55,6 +54,7 @@ Suggests:
     openxlsx,
     pbapply,
     plotly,
+    pvclust,
     reticulate,
     rmarkdown,
     rsvg,
@@ -68,4 +68,4 @@ Encoding: UTF-8
 Language: en-US
 LazyData: true
 LazyDataCompression: xz
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.0
diff --git a/NAMESPACE b/NAMESPACE
@@ -19,9 +19,10 @@ export(filter_peaks)
 export(filter_peaktable)
 export(find_peaks)
 export(fit_peaks)
+export(get_agilent_threshold)
 export(get_peaks)
 export(get_peaktable)
-export(load_chroms)
+export(get_purity)
 export(merge_peaks)
 export(mirror_plot)
 export(normalize_data)
@@ -33,8 +34,8 @@ export(read_chroms)
 export(reshape_chroms)
 export(reshape_peaktable)
 export(scan_chrom)
+export(trim_peak)
 export(write_peaktable)
-import(chromConverter)
 import(ptw)
 importFrom(Formula,Formula)
 importFrom(caTools,runmean)
@@ -62,9 +63,6 @@ importFrom(lattice,stripplot)
 importFrom(methods,new)
 importFrom(minpack.lm,nlsLM)
 importFrom(parallel,mclapply)
-importFrom(pvclust,pvclust)
-importFrom(pvclust,pvpick)
-importFrom(pvclust,pvrect)
 importFrom(scales,alpha)
 importFrom(scales,rescale)
 importFrom(stats,aggregate)
@@ -92,6 +90,5 @@ importFrom(stats,smooth.spline)
 importFrom(stats,terms)
 importFrom(stats,var)
 importFrom(utils,head)
-importFrom(utils,read.csv)
 importFrom(utils,tail)
 importFrom(utils,write.csv)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,36 @@
+# chromatographR 0.7.0
+
+* Updated `correct_peaks` function so it works properly for correcting retention times in peak lists.
+* Added `fixed_levels` argument to `reshape_peaktable` so features can be plotted in the order they're provided by the user.
+* Added option for summing split peaks using the `merge_peaks` function by selecting `method = "sum"`.
+* Updated `get_peaktable` so that the `use.cor` argument works correctly (to use corrected retention times stored in a separate column).
+* Fixed `mirror_plot` so it can take numeric input for lambdas.
+* Changed default setting of `verbose` argument in `correct_rt` from `FALSE`
+to default setting.
+* Removed `load_chroms` function. Use `read_chroms` instead.
+* Eliminated spurious warning from `attach_ref_spectra` function.
+* Changed name of `index` argument in `plot.peak_list` to `idx`. The original argument is now deprecated.
+* Fixed bug affecting `plot_purity` argument in `plot.peak_list`.
+* Fixed bug in `reshape_chroms` so empty metadata column no longer appears.
+* The `plot_spectrum` function now includes the peak names when plotting spectra.
+* Fixed `correct_rt` so it no longer requires user-provided `lambdas` for 1D chromatograms.
+* Added `subset.peak_table` function for easily subsetting peak_tables (e.g. to exclude specific
+peaks or samples).
+* Added `what` argument for `plot_all_spectra` (e.g. to plot multiple spectra at a particular retention time).
+
+#### Refactoring of `cluster_spectra` function:
+
+* For simplicity, `cluster_spectra` now requires reference spectra to be attached to peak table.
+* Accordingly, the `chrom_list` argument is no longer needed.
+* Saving to RDS is now turned off by default.
+* The `pvclust` package is now suggested instead of being required.
+
+#### Updates to vignette and documentation
+
+* Suggest numeric input to lambdas instead of character input to reduce unnecessary confusion.
+* Made other minor changes to text of vignette to (hopefully) improve clarity.
+* Added a short section on the attachment of reference spectra.
+
 # chromatographR 0.6.1
 
 * Fixed bug in plot functions (e.g. `plot_chroms` and `plot_spectrum`) causing error when retention times are inconsistent between chromatograms.
@@ -12,7 +45,6 @@
 * Updated `get_peaktable` for greater flexibility (e.g. for usage of 'ChemStation' peak lists as input).
 
 #### Other changes
-
 * Made some minor changes to vignette to improve clarity (e.g. using single wavelength for integration, etc.)
 
 #### Bug fixes

diff --git a/R/attach_metadata.R b/R/attach_metadata.R
@@ -41,7 +41,7 @@ attach_metadata <- function(peak_table, metadata, column){
   missing_meta <- !(meta[, column] %in% metadata[, column])
   if (sum(missing_meta) > 0)
     warning("The supplied metadata does not include all samples.")
-  meta <- keep_order(meta, merge, y=metadata, all.x = TRUE, all.y = FALSE,
+  meta <- keep_order(meta, merge, y = metadata, all.x = TRUE, all.y = FALSE,
                      sort = FALSE, by = column)
   peak_table$sample_meta <- meta
   return(peak_table)
@@ -52,7 +52,7 @@ attach_metadata <- function(peak_table, metadata, column){
 #' @noRd
 keep_order <- function(data, fn, ...) { 
   col <- ".sortColumn"
-  data[,col] <- 1:nrow(data) 
+  data[,col] <- seq_len(nrow(data))
   out <- fn(data, ...) 
   if (!col %in% colnames(out)) stop("Ordering column not preserved by function") 
   out <- out[order(out[,col]),] 
@@ -100,11 +100,13 @@ get_reference_spectra <- function(peak_table, chrom_list,
       apply(x, 2, as.numeric)
     })
     sp.ref <- sapply(seq_along(sp.l), function(i){
-      sp.l[[i]][,which.max(colMeans(cor(sp.l[[i]][,which(apply((sp.l[[i]]), 2, sd)!=0), drop=FALSE])))]
+      sp.l[[i]][, which.max(
+        colMeans(cor(sp.l[[i]][, which(apply((sp.l[[i]]), 2, sd)!=0), 
+                                                          drop = FALSE])))]
     })
   } else {
     sp.ref <- sapply(colnames(peak_table$tab), function(pk){
-      try(plot_spectrum(loc = pk, peak_table, chrom_list, plot_trace=FALSE,
+      try(plot_spectrum(loc = pk, peak_table, chrom_list, plot_trace = FALSE,
                     plot_spectrum = FALSE, export_spectrum = TRUE,
                     verbose = FALSE,
                     scale_spectrum = TRUE, engine = "base")
@@ -144,6 +146,7 @@ get_reference_spectra <- function(peak_table, chrom_list,
 
 attach_ref_spectra <- function(peak_table, chrom_list, ref = c("max.cor","max.int")){
   check_peaktable(peak_table)
+  ref <- match.arg(ref, c("max.cor","max.int"))
   peak_table$ref_spectra <- get_reference_spectra(peak_table, chrom_list, ref = ref)
   peak_table$args["reference_spectra"] <- ref
   return(peak_table)

diff --git a/R/chromatographR-package.R b/R/chromatographR-package.R
@@ -1,57 +1,72 @@
-
-
-#' chromatographR
-#' 
 #' chromatographR
 #' 
-#' \tabular{ll}{ Package: \tab chromatographR\cr Type: \tab Package\cr Version:
-#' \tab 0.6.1 \cr Date: \tab 2023-10-23\cr License: GPL (>= 2) }
+#' Chromatographic Data Analysis Toolset
 #' 
-#' @name chromatographR-package
-#' @aliases chromatographR-package chromatographR
-#' @docType package
+#' Tools for high-throughput analysis of HPLC-DAD/UV
+#' chromatograms (or similar data). Includes functions for preprocessing, alignment,
+#' peak-finding and fitting, peak-table construction, data-visualization, etc.
+#' Preprocessing and peak-table construction follow the rough formula laid out
+#' in alsace (Wehrens, R., Bloemberg, T.G., and Eilers P.H.C., 2015.
+#' \doi{10.1093/bioinformatics/btv299}). Alignment of chromatograms is available
+#' using parametric time warping (ptw) (Wehrens, R., Bloemberg, T.G., and Eilers
+#' P.H.C. 2015. \doi{10.1093/bioinformatics/btv299}) or variable penalty dynamic
+#' time warping (VPdtw) (Clifford, D., & Stone, G. 2012. \doi{10.18637/jss.v047.i08}).
+#' Peak-finding relies on the algorithm suggested by Tom O'Haver in his
+#' \href{https://terpconnect.umd.edu/~toh/spectrum/PeakFindingandMeasurement.htm}{
+#' Pragmatic Introduction to Signal Processing}. Peaks are then fitted to a 
+#' gaussian or exponential-gaussian hybrid peak shape using non-linear least 
+#' squares (Lan, K. & Jorgenson, J. W. 2001. \doi{10.1016/S0021-9673(01)00594-5}). 
+#' More details on package usage and a suggested workflow can be found in the 
+#' vignette.
 #' @author Ethan Bass
-#' 
-#' Maintainer: Ethan Bass
-#' @keywords package
-NULL
+"_PACKAGE"
 
-#' HPLC-DAD data of goldenrod root extracts.
+#' Raw goldenrod root chromatograms
 #' 
-#' Four HPLC-DAD data matrices of *Solidago altissima* roots extracted in 90
-#' percent methanol.
+#' A list of four HPLC-DAD data matrices of \emph{Solidago altissima} roots
+#' extracted in 90\% methanol. Retention times are stored in rows and 
+#' wavelengths are stored in columns.
 #' 
 #' @name Sa
 #' @docType data
-#' @format  A list of four matrices (time x wavelength).
+#' @keywords data
+#' @usage data(Sa)
+#' @format  A list of four matrices (1301 times x 60 wavelengths).
 NULL
 
-#' HPLC-DAD data of goldenrod root extracts.
+#' Preprocessed goldenrod root chromatograms
 #' 
-#' Pre-processed chromatograms.
+#' A list of four pre-processed HPLC-DAD chromatograms derived from the raw data
+#' stored in \code{Sa}. Retention times are stored in rows and wavelengths
+#' are stored in columns. The time axis is compressed to save space and 
+#' processing time so the data are a little choppy. 
 #'
 #' @name Sa_pr
+#' @keywords data
 #' @docType data
-#' @format  Four pre-processed matrices (time x wavelength) to use in examples.
+#' @usage data(Sa_pr)
+#' @format  A list of four pre-processed matrices (434 retention times x 60 wavelengths).
 NULL
 
-#' HPLC-DAD data of goldenrod root extracts.
+#' Warped goldenrod root chromatograms.
 #' 
-#' Pre-processed and warped chromatograms.
+#' A list of four pre-processed and warped goldenrod root chromatograms derived
+#' from the raw data stored in \code{Sa}.
 #'
 #' @name Sa_warp
 #' @docType data
-#' @format  Four pre-processed and warped matrices (time x wavelength) to use in
-#' examples.
+#' @keywords data
+#' @usage data(Sa_warp)
+#' @format  A list of four pre-processed and warped matrices (434 times x 60 wavelengths).
 NULL
 
 #' Goldenrod peak table
 #' 
-#' Peak table generated from example goldenrod extracts for examples.
+#' Peak table generated from exemplary goldenrod root extracts.
 #'
 #' @name pk_tab
 #' @docType data
+#' @keywords data
+#' @usage data(pk_tab)
 #' @format  A \code{peak_table} object.
 NULL
-
-
diff --git a/R/cluster_spectra.R b/R/cluster_spectra.R
@@ -25,14 +25,10 @@ setClass("cluster", representation(peaks = "character", pval = "numeric"))
 #' the confidence threshold will be returned.
 #'
 #' @name cluster_spectra
-#' @importFrom pvclust pvclust pvrect pvpick
 #' @importFrom stats cor
 #' @importFrom methods new
 #' @importFrom graphics matplot
 #' @param peak_table Peak table from \code{\link{get_peaktable}}.
-#' @param chrom_list A list of chromatograms in matrix format (timepoints x
-#' wavelengths). If no argument is provided here, the function will try to find
-#' the \code{chrom_list} object used to create the provided \code{peak_table}.
 #' @param peak_no Minimum and maximum thresholds for the number of peaks a
 #' cluster may have.
 #' @param alpha Confidence threshold for inclusion of cluster.
@@ -78,69 +74,75 @@ setClass("cluster", representation(peaks = "character", pval = "numeric"))
 #' @examples \donttest{
 #' data(pk_tab)
 #' data(Sa_warp)
+#' pk_tab <- attach_ref_spectra(pk_tab, Sa_warp, ref="max.int")
 #' cl <- cluster_spectra(pk_tab, nboot=100, max.only = FALSE, save = FALSE, alpha = .97)
 #' }
 #' @export cluster_spectra
 #' @md
 
-cluster_spectra <- function(peak_table, chrom_list, peak_no = c(5,100),
-                            alpha=0.95, nboot=1000, plot_dend=TRUE,
-                            plot_spectra=TRUE, verbose=TRUE, save=TRUE,
-                            parallel=TRUE, max.only=FALSE,
-                            output=c("clusters", "pvclust", "both"),
+cluster_spectra <- function(peak_table, peak_no = c(5,100), alpha = 0.95, 
+                            nboot = 1000, plot_dend = TRUE, plot_spectra = TRUE, 
+                            verbose = getOption("verbose"), 
+                            save = FALSE, parallel = TRUE, max.only = FALSE,
+                            output = c("pvclust", "clusters"),
                             ...){
+  check_for_pkg("pvclust")
   check_peaktable(peak_table)
-  if (missing(chrom_list)){
-    chrom_list <- get_chrom_list(peak_table)
-  } else get_chrom_list(peak_table, chrom_list)
-  output <- match.arg(output, c("clusters","pvclust","both"))
+  output <- match.arg(output, c("pvclust", "clusters"), several.ok = TRUE)
   if (is.data.frame(peak_table$ref_spectra) | is.matrix(peak_table$ref_spectra)){
-    rep <- peak_table$ref_spectra
-  } else{
-    if (verbose)
-      print('...collecting representative spectra')
-    rep <- sapply(colnames(peak_table[[1]]), function(j){
-      sp <- plot_spectrum(loc=j, peak_table=peak_table, chrom_list = chrom_list,
-                          scale_spectrum=TRUE, plot_trace=FALSE,
-                          export_spectrum=TRUE, plot_spectrum=FALSE, verbose=verbose)
-    })
-    rep <- data.frame(do.call(cbind,rep))
-    names(rep) <- paste0('V',seq_len(ncol(rep)))
+    spectra <- peak_table$ref_spectra
+  } else {
+    stop("Please attach reference spectra (using the `attach_ref_spectra` function) before running `cluster_spectra`.")
+  }
+  rm <- which(apply(spectra, 2, sd) == 0)
+  if (length(rm) > 0){
+    if (verbose){
+      message(paste0("Removing peaks due to bad spectra: ", paste(sQuote(colnames(spectra)[rm]),collapse=", ")))
+    }
+    spectra <- spectra[, -rm]
   }
-  rm <- which(apply(rep,2,sd)==0)
-  if (length(rm)>0)
-    rep <- rep[,-rm]
-  d <- 1 - abs(cor(rep, method="pearson"))
   if (verbose)
-    print('...clustering spectra')
-  result <- suppressWarnings(pvclust(rep, method.dist = "cor",
-                             nboot = nboot, parallel = parallel, quiet=!verbose, ...)
+    message('...clustering spectra')
+  result <- suppressWarnings(pvclust::pvclust(spectra, method.dist = "cor", 
+                                              nboot = nboot, parallel = parallel,
+                                              quiet = !verbose, ...)
   )
   if (plot_dend){
-    plot(result,labels = FALSE, cex.pv=0.5, print.pv = 'au',print.num = FALSE)
-    pvrect(result, alpha = alpha, max.only = max.only)
+    plot(result, labels = FALSE, cex.pv = 0.5, print.pv = 'au',
+         print.num = FALSE)
+    pvclust::pvrect(result, alpha = alpha, max.only = max.only)
+  }
+  if (save){
+    saveRDS(result, 'pvclust.RDS')
   }
-  if (save) saveRDS(result, 'pvclust.RDS')
-  p <- pvpick(result, alpha = alpha, max.only = max.only)
-  l <- sapply(p$clusters, length)
-  sub <- p$clusters[which(l > peak_no[1] & l < peak_no[2])]
-  pval <- 1-result$edges[p$edges[which(l > peak_no[1] & l < peak_no[2])],'au']
-  sub <- lapply(seq_along(sub), function(i){
-    new("cluster", peaks=sub[[i]], pval=pval[i])})
-  pval <- format(round(
-    result$edges[p$edges[which(l > peak_no[1] & l < peak_no[2])],'au'],2),
-    nsmall=2)
-  names(sub) <- paste0('c', seq_along(sub))
+  picks <- pvclust::pvpick(result, alpha = alpha, max.only = max.only)
+  cl_size <- sapply(picks$clusters, length)
 
-  if (plot_spectra){
-    if (verbose) print('...plotting clustered spectra')
-    new.lambdas <- colnames(chrom_list[[1]])
-    sapply(seq_along(sub), function(i){ 
-      matplot(new.lambdas,rep[,sub[[i]]@peaks],
-              type='l', ylab='', yaxt='n', xlab=expression(lambda),
-              main=paste0('cluster ', i, '; p = ',
-                          format(round(sub[[i]]@pval,2),nsmall=2))
-                          )})
+  ## filter clusters ##
+  cl_idx <- which(cl_size > peak_no[1] & cl_size < peak_no[2])
+  if (verbose && length(cl_idx) < length(cl_size)){
+    message(paste0("Removing ", length(cl_size) - length(cl_idx), " under- or oversized clusters from results."))
+  }
+  clusters <- picks$clusters[cl_idx]
+  pval <- 1 - result$edges[picks$edges[cl_idx], 'au']
+  clusters <- lapply(seq_along(clusters), function(i){
+    new("cluster", peaks = clusters[[i]], pval = pval[i])
+  })
+  if (length(clusters) != 0){
+    names(clusters) <- paste0('c', seq_along(clusters))
+
+    if (plot_spectra){
+      if (verbose) message('...plotting clustered spectra')
+      lambdas <- as.numeric(rownames(spectra))
+      sapply(seq_along(clusters), function(i){ 
+        matplot(lambdas, spectra[, clusters[[i]]@peaks],
+                type = 'l', ylab = '', yaxt = 'n', xlab = expression(lambda),
+                main = paste0('cluster ', i, '; p = ',
+                              format(round(clusters[[i]]@pval, 2), nsmall = 2))
+        )
+      })
+    }
   }
-  switch(output, "clusters" = sub, "pvclust" = result, "both" = list(result,sub))
+  pvclust <- result
+  mget(output)
 }