Merge pull request #9 from RogerGinBer/devel

Update 12/21
RogerGinBer · Dec 29, 2021 · 10189ac · 10189ac
2 parents 5ca8d8c + 0aed33f
commit 10189ac
Show file tree

Hide file tree

Showing 6 changed files with 189 additions and 175 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -13,7 +13,7 @@ Description: RHermes is a broad-scoped targeted metabolomics package to identify
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
 VignetteBuilder: knitr
 biocViews: MassSpectrometry, Metabolomics, GUI
 BugReports: https://github.com/RogerGinBer/RHermes/issues

diff --git a/NAMESPACE b/NAMESPACE
@@ -68,6 +68,8 @@ importFrom(dplyr,distinct)
 importFrom(dplyr,filter)
 importFrom(grDevices,colorRamp)
 importFrom(grDevices,rgb)
+importFrom(keras,array_reshape)
+importFrom(keras,k_argmax)
 importFrom(keras,load_model_hdf5)
 importFrom(keras,predict_classes)
 importFrom(stats,IQR)

diff --git a/R/IL_methods.R b/R/IL_methods.R
@@ -68,8 +68,7 @@ function(struct, id, rts, minint = Inf) {
 #'@param maxInjections Numeric, the maximum number of planned injections to
 #'  export. Defaults to 9999 to export all of them.
 #'@param sepFiles Logical, whether to generate a single csv file or multiple
-#'  csvs, each corresponding to each injection/chromatographic run. From our
-#'  experience with an Orbitrap Fusion, separate csvs will simplify the task.
+#'  csvs, each corresponding to each injection/chromatographic run. 
 #'@return Nothing. As a side effect, it generates one/multiple .csv files with
 #'  the inclusion list data
 #'@examples
@@ -79,14 +78,14 @@ function(struct, id, rts, minint = Inf) {
 #'@export
 setGeneric("exportIL", function(struct, id, file = "./InclusionList",
                                 mode = "both", maxOver = 5, defaultIT = 100,
-                                sepFiles = FALSE, maxInjections = 9999) {
+                                sepFiles = TRUE, maxInjections = 9999) {
     standardGeneric("exportIL")
 })
 
 #'@rdname exportIL
 setMethod("exportIL", c("RHermesExp", "numeric", "ANY", "ANY", "ANY", "ANY"),
 function(struct, id, file = "./InclusionList", mode = "both", maxOver = 5,
-         defaultIT = 100, sepFiles = FALSE, maxInjections = 9999) {
+         defaultIT = 100, sepFiles = TRUE, maxInjections = 9999) {
     validObject(struct)
     if (length(struct@data@MS2Exp) == 0) {
         stop("This object doesn't have any ILs")
@@ -96,7 +95,7 @@ function(struct, id, file = "./InclusionList", mode = "both", maxOver = 5,
     }
     IL <- struct@data@MS2Exp[[id]]@IL@IL
     IL$IT <- defaultIT
-    plan <- injectionPlanner(IL, 10, maxOver, byMaxInt = TRUE,
+    plan <- injectionPlanner(IL, maxOver = maxOver, byMaxInt = TRUE,
                              injections = maxInjections,
                              mode = mode)
     if (sepFiles) {
@@ -130,9 +129,8 @@ function(struct, id, file = "./InclusionList", mode = "both", maxOver = 5,
 })
 
 
-injectionPlanner <- function(IL, injections, maxover, byMaxInt = TRUE,
-                             mode = "continuous", returnAll = FALSE) {
-    if (returnAll) {injections <- 1e4}
+injectionPlanner <- function(IL, injections, maxOver, byMaxInt = TRUE,
+                             mode = "continuous") {
     if (byMaxInt) {IL <- IL[order(-IL$MaxInt), ]}
     idx <- which(is.na(IL$start) | is.na(IL$end))  #NA depuration
     if (length(idx) != 0) {IL <- IL[-idx, ]}
@@ -157,7 +155,7 @@ injectionPlanner <- function(IL, injections, maxover, byMaxInt = TRUE,
             ok_entries <- c()
             for (i in seq_len(nrow(IL))) {
                 timeidx <- which(timeInt >= IL$start[i] & timeInt <= IL$end[i])
-                if (any(OL[timeidx] >= maxover)) {next}
+                if (any(OL[timeidx] >= maxOver)) {next}
                 ok_entries <- c(ok_entries, i)
                 OL[timeidx] <- OL[timeidx] + 1
             }
@@ -186,7 +184,8 @@ injectionPlanner <- function(IL, injections, maxover, byMaxInt = TRUE,
         if(mode == "both" & nrow(deep_IL) != 0){
             message(paste(nrow(deep_IL), "high IT scans could not be planned",
                           "within the continuous MS2 injections.",
-                          "Adding them separately."))
+                          "Adding them separately after injection",
+                          length(plan)))
             while (nrow(deep_IL) != 0 & injections > 0) {
                 timeInt <- seq(min(deep_IL$start, na.rm = TRUE),
                                max(deep_IL$end, na.rm = TRUE),
@@ -231,7 +230,7 @@ injectionPlanner <- function(IL, injections, maxover, byMaxInt = TRUE,
     if (nrow(IL) != 0) {
         message(paste0(nrow(IL), "ILs haven't been added to the injection plan",
                        " due to lack of space. Try again with more injections,",
-                       " more maxover or returnAll = TRUE"))
+                       " more maxOver or maxInjections"))
     }
     # print(plot(cumsum(sapply(plan, nrow))))
     # hist_data <- lapply(seq_along(plan), function(x){

diff --git a/R/Plot_functions.R b/R/Plot_functions.R
@@ -455,7 +455,7 @@ function(struct, id, entry, plot = TRUE) {
             isoname <- paste0("[", atom, "]", 1)
             cur <- df[df$code == isoname, ]
             th <- cur$abundance[cur$class == "Theoretical"]
-            if (th < 30000) {
+            if (th[1] < 30000) {
                 return(NA)
             }
             exp <- cur$abundance[cur$class == "Experimental"]

diff --git a/R/SOI_functions.R b/R/SOI_functions.R
@@ -172,6 +172,15 @@ PLprocesser <- function(PL, ExpParam, SOIParam, blankPL = NA, filename) {
     if (any(Groups$length > maxlen)) {
         Groups <- groupShort(Groups, maxlen)
     }
+
+    message("Number of scans:")
+    nscans <- apply(Groups, 1, function(x) {
+        d <- x["peaks"][[1]]
+        return(dim(d)[1])
+    })
+    Groups$nscans <- nscans
+    Groups <- Groups[Groups$nscans > 5, ]
+
     ## Blank substraction
     if (useblank) {
         Groups <- blankSubstraction(Groups, blankPL)
@@ -201,14 +210,6 @@ PLprocesser <- function(PL, ExpParam, SOIParam, blankPL = NA, filename) {
     Groups$MaxInt <- MaxI
     Groups <- Groups[Groups$MaxInt > 0, ]
 
-    message("Number of scans:")
-    nscans <- apply(Groups, 1, function(x) {
-        d <- x["peaks"][[1]]
-        return(dim(d)[1])
-    })
-    Groups$nscans <- nscans
-    Groups <- Groups[Groups$nscans > 2, ]
-
     message("Converting from ionic formula to F/A combinations:")
     Groups$anot <- lapply(Groups$formula, function(x) {
         apply(FA_to_ion[.(x), c(1, 3)], 1, function(y) {
@@ -378,43 +379,43 @@ parallelGroupShort <- function(i, LG, maxlen){
         starts <- pks[,2]
         ends <- pks[,3]
         known_peak <- rep(TRUE, nrow(pks))
-    if (nrow(pks) > 1) {
-        for (i in seq_len(nrow(pks) - 1)) {
-            if (ends[i] < starts[i + 1]) {
-            starts <- c(starts, ends[i])
-            ends <- c(ends, starts[i + 1])
-            known_peak <- c(known_peak, FALSE)
+        if (nrow(pks) > 1) {
+            for (i in seq_len(nrow(pks) - 1)) {
+                if (ends[i] < starts[i + 1]) {
+                starts <- c(starts, ends[i])
+                ends <- c(ends, starts[i + 1])
+                known_peak <- c(known_peak, FALSE)
+                }
             }
         }
-    }
-    if (min(ms1data[,1]) < min(starts)) {
-        ends <- c(ends, min(starts))
-        starts <- c(starts, min(ms1data[, 1]))
-        known_peak <- c(known_peak, FALSE)
-        }
-    if (max(ms1data[,1]) > max(ends)) {
-        starts <- c(starts, max(ends))
-        ends <- c(ends, max(ms1data[, 1]))
-        known_peak <- c(known_peak, FALSE)
-        }
-    if (any(!known_peak)) {
-        too_long <- ends - starts > maxlen
-        if (any(too_long & !known_peak)) {
-            for (i in which(too_long & !known_peak)) {
-            curstart <- starts[i]
-            curend <- ends[i]
-            times <- seq(from = curstart, to = curend,
+        if (min(ms1data[,1]) < min(starts)) {
+            ends <- c(ends, min(starts))
+            starts <- c(starts, min(ms1data[, 1]))
+            known_peak <- c(known_peak, FALSE)
+            }
+        if (max(ms1data[,1]) > max(ends)) {
+            starts <- c(starts, max(ends))
+            ends <- c(ends, max(ms1data[, 1]))
+            known_peak <- c(known_peak, FALSE)
+            }
+        if (any(!known_peak)) {
+            too_long <- ends - starts > maxlen
+            if (any(too_long & !known_peak)) {
+                for (i in which(too_long & !known_peak)) {
+                curstart <- starts[i]
+                curend <- ends[i]
+                times <- seq(from = curstart, to = curend,
                         length.out = ceiling((curend - curstart) / maxlen) + 1)
-            starts <- c(starts, times[-length(times)])
-            ends <- c(ends, times[-1])
+                starts <- c(starts, times[-length(times)])
+                ends <- c(ends, times[-1])
+                }
+            starts <- starts[-which(too_long & !known_peak)]
+            ends <- ends[-which(too_long & !known_peak)]
             }
-        starts <- starts[-which(too_long & !known_peak)]
-        ends <- ends[-which(too_long & !known_peak)]
         }
-    }
-    NewGR <- data.table(start = starts, end = ends,
-                        length = ends - starts, formula = curGR$formula,
-                        peaks = curGR$peaks, mass = curGR$mass)
+        NewGR <- data.table(start = starts, end = ends,
+                            length = ends - starts, formula = curGR$formula,
+                            peaks = curGR$peaks, mass = curGR$mass)
     } else {
         #Divide long group into equal-sized smaller groups
         times <- seq(from = curGR[1, 1][[1]], to = curGR[1, 2][[1]],
@@ -433,16 +434,16 @@ parallelGroupShort <- function(i, LG, maxlen){
     return(NewGR)
 }
 
+#' @importFrom keras k_argmax array_reshape
 blankSubstraction <- function(Groups, blankPL){
     message("Blank substraction:")
     setkeyv(blankPL, c("formv", "rt"))
-
     message("First cleaning")
     toKeep <- lapply(seq_len(nrow(Groups)), firstCleaning, Groups, blankPL) %>%
         unlist()
     sure <- Groups[which(toKeep), ]
     if (any(toKeep)) {Groups <- Groups[-which(toKeep),]}
-    reticulate::py_available(initialize = TRUE)
+    reticulate::py_available(initialize = TRUE)  #Start Python connection
     if (reticulate::py_module_available("keras") &
         reticulate::py_module_available("tensorflow")) {
         model <- load_model_hdf5(system.file("extdata", "ImprovedModel.h5",
@@ -465,16 +466,28 @@ blankSubstraction <- function(Groups, blankPL){
             organizeddata <- keras::array_reshape(organizeddata,
                                                 c(nrow(organizeddata), 400),
                                                 order = "C")  #ANN input
-        q <- model %>% keras::predict_classes(organizeddata)
+        q <- model %>% predict(organizeddata) %>% k_argmax()
 
-        Groups <- Groups[-which(q == 0), ]  #ANN output
+        Groups <- Groups[-which(q$numpy == 0), ]  #ANN output
         Groups <- Groups[, -c("MLdata")]
+
         }
     } else {
-        warning(paste("A Keras installation was not found and ANN blank",
-                "substraction was not performed"))
+        setkeyv(blankPL, "formv")
+        message("Preparing input for cosine similarity")
+        RES <- lapply(seq_len(nrow(Groups)), prepareNetInput, Groups, blankPL)
+        cosines <- sapply(RES, function(x){
+            if(is.na(x)[1]){return(Inf)}
+            philentropy::cosine_dist(x[1,] + 1e-12, x[2,] + 1e-12,
+                                     testNA = FALSE)
+        })
+        Groups <- Groups[cosines < 0.8, ]
+        warning("A Keras installation was not found and ANN blank ",
+                "subtraction was not performed. A less-accurate ",
+                "cosine similarity score was applied instead.")
     }
     Groups <- rbind(sure, Groups)
+    return(Groups)
 }
 
 #'@importFrom stats IQR quantile