filterProtein for Scaffold

uconn-scs · May 15, 2024 · b09140d · b09140d
1 parent 689e502
commit b09140d
Show file tree

Hide file tree

Showing 20 changed files with 80 additions and 65 deletions.
diff --git a/R/filterings.R b/R/filterings.R
@@ -209,15 +209,26 @@ filterOutIn <- function(dataSet,
 #' @param dataSet The 2d data set of experimental values.
 #' 
 #' @param proteinInformation The name of the .csv file containing protein information data
-#' (including the path to the file, if needed). The file should include the following 4
-#' columns: "PG.Genes", "PG.ProteinAccessions", "PG.ProteinDescriptions", and
-#' "PG.ProteinNames". This file is automatically generated by the function
-#' \code{\link[msDiaLogue]{preprocessing}}.
+#' (including the path to the file, if needed). The file should include the following
+#' columns:
+#' \itemize{
+#' \item For Spectronaut: "PG.Genes", "PG.ProteinAccessions", "PG.ProteinDescriptions",
+#' and "PG.ProteinNames".
+#' \item For Scaffold: "ProteinDescriptions", "AccessionNumber", and "AlternateID".
+#' }
+#' This file is automatically generated by the function
+#' \code{\link[msDiaLogue]{preprocessing}} or
+#' \code{\link[msDiaLogue]{preprocessing_scaffold}}.
 #' 
 #' @param text A character vector of regular expressions used as keys for selecting or removing proteins.
 #' 
 #' @param by A character string naming the protein information column to which the \code{text} filter
-#' is applied, with allowable options: "gene", "accession" and "description".
+#' is applied, with allowable options:
+#' \itemize{
+#' \item For Spectronaut: "PG.Genes", "PG.ProteinAccessions", "PG.ProteinDescriptions",
+#' and "PG.ProteinNames".
+#' \item For Scaffold: "ProteinDescriptions", "AccessionNumber", and "AlternateID".
+#' }
 #' 
 #' @param removeList A boolean (default = TRUE) specifying whether the list of proteins
 #' should be removed or selected.
@@ -230,8 +241,9 @@ filterOutIn <- function(dataSet,
 #' current working directory. This option only works when \code{removeList = TRUE}.
 #' 
 #' @details
-#' The function is an extension of the function \code{\link[msDiaLogue]{preprocessing}}
-#' that allows for filtering proteins based on additional information.
+#' The function is an extension of the function \code{\link[msDiaLogue]{preprocessing}} or
+#' \code{\link[msDiaLogue]{preprocessing_scaffold}} that allows for filtering proteins
+#' based on additional information.
 #' 
 #' @import dplyr
 #' @importFrom utils read.csv write.csv
@@ -245,7 +257,9 @@ filterOutIn <- function(dataSet,
 filterProtein <- function(dataSet,
                           proteinInformation = "preprocess_protein_information.csv",
                           text = c(),
-                          by = c("gene", "accession", "description", "name"),
+                          by = c("PG.Genes", "PG.ProteinAccessions",
+                                 "PG.ProteinDescriptions", "PG.ProteinNames",
+                                 "ProteinDescriptions", "AccessionNumber", "AlternateID"),
                           removeList = TRUE,
                           saveRm = TRUE) {
 
@@ -255,12 +269,14 @@ filterProtein <- function(dataSet,
   filteredData <- dataSet %>%
     select(-c("R.Condition", "R.Replicate"))
 
-  by <- paste0("PG.", if(by != "gene") {"Protein"},
-               toupper(substr(by, 1, 1)), substring(by, 2), "s")
-
   index <- grep(paste(text, collapse = "|"), proteinInformation[[by]])
 
-  proteinName <- proteinInformation[index,]$PG.ProteinNames
+  if (by %in% c("PG.Genes", "PG.ProteinAccessions",
+                "PG.ProteinDescriptions", "PG.ProteinNames")) {
+    proteinName <- proteinInformation[index,]$PG.ProteinNames
+  } else {
+    proteinName <- proteinInformation[index,]$AccessionNumber
+  }
 
   result <- dataSet %>%
     select(any_of(c("R.Condition", "R.Replicate", proteinName)))

diff --git a/R/globals.R b/R/globals.R
@@ -40,7 +40,7 @@ utils::globalVariables(c(
   "R.Replicate", # <preprocessing>
   "PG.Quantity", # <preprocessing>
   ".", # <preprocessing_scaffold>
-  "ProteinAccessions", # <preprocessing_scaffold>
+  "AccessionNumber", # <preprocessing_scaffold>
   "ConditionReplicate", # <preprocessing_scaffold>
   "R.Condition", # <preprocessing_scaffold>
   "R.Replicate", # <preprocessing_scaffold>

diff --git a/R/preprocessing.R b/R/preprocessing.R
@@ -241,13 +241,13 @@ preprocessing_scaffold <- function(fileName, dataSet = NULL) {
     select(-"#") %>%
     slice(-n()) %>%
     rename(ProteinDescriptions = names(.)[3],
-           ProteinAccessions = "Accession Number",
-           Genes = "Alternate ID",
+           AccessionNumber = "Accession Number",
+           AlternateID = "Alternate ID",
            MolecularWeight = "Molecular Weight",
            ProteinGroupingAmbiguity = "Protein Grouping Ambiguity")
 
   infoColName <- c("Visible?", "Starred?",
-                   "ProteinDescriptions", "ProteinAccessions", "Genes",
+                   "ProteinDescriptions", "AccessionNumber", "AlternateID",
                    "MolecularWeight", "ProteinGroupingAmbiguity")
 
   proteinInformation <- dataSet %>%
@@ -259,22 +259,22 @@ preprocessing_scaffold <- function(fileName, dataSet = NULL) {
 
   ## select columns necessary for analysis
   selectedData <- dataSet %>%
-    select(-infoColName[infoColName != "ProteinAccessions"]) %>%
-    mutate(across(-ProteinAccessions, ~as.numeric(replace(., . == "Missing Value", NA)))) %>%
-    pivot_longer(-ProteinAccessions, names_to = "ConditionReplicate", values_to = "Quantity") %>%
-    # gather(ConditionReplicate, Quantity, -ProteinAccessions) %>%
+    select(-infoColName[infoColName != "AccessionNumber"]) %>%
+    mutate(across(-AccessionNumber, ~as.numeric(replace(., . == "Missing Value", NA)))) %>%
+    pivot_longer(-AccessionNumber, names_to = "ConditionReplicate", values_to = "Quantity") %>%
+    # gather(ConditionReplicate, Quantity, -AccessionNumber) %>%
     mutate(ConditionReplicate = sub(".+_(.+)", "\\1", ConditionReplicate)) %>%
     mutate(R.Condition = sub("^(\\d+|[a-zA-Z0-9]+).*", "\\1", ConditionReplicate),
            R.Replicate = sub("^[^.]*[-]?([0-9]+)$", "\\1", ConditionReplicate)) %>%
-    select(R.Condition, R.Replicate, ProteinAccessions, Quantity) %>%
+    select(R.Condition, R.Replicate, AccessionNumber, Quantity) %>%
     mutate(Quantity = replace(Quantity, Quantity %in% c(0,1), NA))
 
   ## reformat the data to present proteins as the columns and
   ## to group replicates under each protein
   reformatedData <- selectedData %>%
     pivot_wider(id_cols = c(R.Condition, R.Replicate),
-                names_from = ProteinAccessions, values_from = Quantity)
-  # spread(ProteinAccessions, Quantity)
+                names_from = AccessionNumber, values_from = Quantity)
+  # spread(AccessionNumber, Quantity)
 
   ## generate a histogram of the log2-transformed values for full data set
   ## note: the Scaffold is a preprocessed data report.

diff --git a/docs/articles/scaffold.html b/docs/articles/scaffold.html
diff --git a/docs/articles/scaffold_files/figure-html/unnamed-chunk-4-1.png b/docs/articles/scaffold_files/figure-html/unnamed-chunk-4-1.png
diff --git a/docs/articles/usage_template.html b/docs/articles/usage_template.html
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-14-1.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-14-1.png
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-34-1.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-34-1.png
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-37-1.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-37-1.png
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-38-1.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-38-1.png
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-4-1.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-4-1.png
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-4-2.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-4-2.png
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-40-1.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-40-1.png
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-41-1.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-41-1.png
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-42-1.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-42-1.png
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
@@ -4,7 +4,7 @@ pkgdown_sha: ~
 articles:
   scaffold: scaffold.html
   usage_template: usage_template.html
-last_built: 2024-05-04T05:24Z
+last_built: 2024-05-15T04:17Z
 urls:
   reference: https://uconn-scs.github.io/msDiaLogue/reference
   article: https://uconn-scs.github.io/msDiaLogue/articles

diff --git a/docs/reference/filterProtein.html b/docs/reference/filterProtein.html
diff --git a/docs/search.json b/docs/search.json
diff --git a/man/filterProtein.Rd b/man/filterProtein.Rd
diff --git a/tests/testData/Toy_Scaffold_Data.xls b/tests/testData/Toy_Scaffold_Data.xls