fix dataMissing

uconn-scs · Jul 26, 2024 · 5801a83 · 5801a83
1 parent c0ee0ad
commit 5801a83
Show file tree

Hide file tree

Showing 13 changed files with 152 additions and 36 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,6 +24,7 @@ Imports:
     ggplot2,
     ggrepel,
     ggvenn,
+    glue,
     impute,
     limma,
     mice,
@@ -32,13 +33,13 @@ Imports:
     pheatmap,
     psych,
     readxl,
+    scales,
     seqinr,
     softImpute,
     tidyr (>= 1.3.0),
     tibble,
     tictoc,
-    utils,
-    visdat
+    utils
 Suggests:
     roxyglobals,
     testthat (>= 3.0.0)

diff --git a/NAMESPACE b/NAMESPACE
@@ -39,6 +39,7 @@ importFrom(Rdpack,reprompt)
 importFrom(UpSetR,fromList)
 importFrom(UpSetR,upset)
 importFrom(data.table,fread)
+importFrom(glue,glue)
 importFrom(impute,impute.knn)
 importFrom(limma,normalizeQuantiles)
 importFrom(mice,complete)
@@ -48,6 +49,7 @@ importFrom(pcaMethods,completeObs)
 importFrom(pcaMethods,pca)
 importFrom(psych,describeBy)
 importFrom(readxl,read_excel)
+importFrom(scales,percent)
 importFrom(seqinr,getName)
 importFrom(seqinr,read.fasta)
 importFrom(seqinr,write.fasta)
@@ -69,4 +71,3 @@ importFrom(tibble,column_to_rownames)
 importFrom(tibble,rownames_to_column)
 importFrom(utils,read.csv)
 importFrom(utils,write.csv)
-importFrom(visdat,vis_miss)
diff --git a/R/dataMissing.R b/R/dataMissing.R
@@ -27,45 +27,115 @@
 #' 
 #' @param dataSet The 2d data set of experimental values.
 #' 
+#' @param sort_miss A boolean (default = FALSE) specifying whether to arrange the columns
+#' in descending order of missingness (columns with the most missing values first).
+#' 
 #' @param plot A boolean (default = FALSE) specifying whether to plot the missingness.
 #' 
+#' @param show_pct_legend A boolean (default = TRUE) specifying whether the percentages of
+#' missing and present values in the entire dataset are shown in the legend of the
+#' visualization when \code{plot = TRUE}.
+#' 
 #' @param show_labels A boolean (default = TRUE) specifying whether protein names are
 #' shown in the visualization when \code{plot = TRUE}.
 #' 
+#' @param show_pct_col A boolean (default = TRUE) specifying whether each protein's
+#' percentage of missing values across samples is appended to its label in the
+#' visualization when \code{plot = TRUE} and \code{show_labels = TRUE}.
+#' 
 #' @import dplyr
 #' @import ggplot2
-#' @importFrom visdat vis_miss
+#' @import tidyr
+#' @importFrom glue glue
+#' @importFrom scales percent
 #' 
 #' @returns A 2d dataframe including:
 #' \itemize{
 #' \item "count_miss": The count of missing values for each protein.
-#' \item "pct-miss": The percentage of missing values for each protein.
-#' \item "pct_total_miss": The percentage of missing values for each protein relative to
+#' \item "pct_miss_col": The percentage of missing values for each protein, relative to the number of observations for that protein.
+#' \item "pct_miss_tot": The percentage of missing values for each protein relative to
 #' the total missing values in the entire dataset.
 #' }
 #' 
 #' @autoglobal
 #' 
 #' @export
 
-dataMissing <- function(dataSet, plot = FALSE, show_labels = TRUE) {
+dataMissing <- function(dataSet, sort_miss = FALSE,
+                        plot = FALSE, show_pct_legend = TRUE,
+                        show_labels = TRUE, show_pct_col = TRUE) {
+
   dataMissing <- select(dataSet, -c(R.Condition, R.Replicate))
-  if (plot == TRUE) {
-    if (show_labels == TRUE) {
-      plot <- visdat::vis_miss(dataMissing)
+
+  if (sort_miss) {
+    dataMissing <- dataMissing[,names(sort(colSums(is.na(dataMissing)), decreasing = TRUE))]
+  }
+
+  if (plot) {
+
+    plotdf <- dataMissing %>%
+      mutate(row = row_number()) %>%
+      pivot_longer(cols = -row, names_to = "variable", values_to = "value",
+                   values_transform = list(value = is.na))
+
+    if (show_pct_legend) {
+      pct_missing <- mean(is.na(dataMissing))*100
+      if (pct_missing == 0) {
+        lab_missing <- "No Missing Values"
+        lab_present <- "Present (100%)"
+      } else if (pct_missing < 0.1) {
+        lab_missing <- "Missing (< 0.1%)"
+        lab_present <- "Present (> 99.9%)"
+      } else {
+        pct_missing <- round(pct_missing, 1)
+        pct_present <- 100 - pct_missing
+        lab_missing <- glue::glue("Missing\n({pct_missing}%)")
+        lab_present <- glue::glue("Present\n({pct_present}%)")
+      }
+    } else {
+      lab_missing <- "Missing"
+      lab_present <- "Present"
+    }
+
+    plot <- ggplot(plotdf, aes(x = variable, y = row)) +
+      geom_raster(aes(fill = value)) +
+      scale_fill_manual(name = "", breaks = c("TRUE", "FALSE"),
+                        values = c("grey20", "grey80"),
+                        labels = c(lab_missing, lab_present)) +
+      scale_y_reverse() +
+      theme_minimal() +
+      labs(x = "", y = "Observations") +
+      theme(legend.position = "bottom",
+            axis.text.x = element_text(angle = 45, hjust = 0))
+
+   if (show_labels) {
+     if (show_pct_col) {
+       lab_pct_miss_col <- colMeans(is.na(dataMissing)) %>%
+         sapply(function(x) {
+           case_when(x == 0 ~  "0%",
+                     x < 0.001 ~ "<0.1%",
+                     x < 0.01 ~ "<1%",
+                     x >= 0.01 ~ scales::percent(x, accuracy = 1))
+         })
+       plot <- plot +
+         scale_x_discrete(position = "top", limits = names(dataMissing),
+                          labels = glue::glue("{names(lab_pct_miss_col)} ({lab_pct_miss_col})"))
+     } else {
+       plot <- plot +
+         scale_x_discrete(position = "top", limits = names(dataMissing))
+     }
     } else {
-      plotData <- dataMissing
-      colnames(plotData) <- sprintf("%0*d", nchar(ncol(dataMissing)), 1:ncol(dataMissing))
-      plot <- visdat::vis_miss(plotData) +
-        ggplot2::scale_x_discrete(labels = element_blank())
+      plot <- plot +
+        scale_x_discrete(position = "top", labels = element_blank())
     }
+
     print(plot)
   }
 
   count_miss <- colSums(is.na(dataMissing))
   result <- data.frame(count_miss,
-                       pct_miss = count_miss/nrow(dataMissing)*100,
-                       pct_total_miss = count_miss/sum(count_miss)*100)
+                       pct_miss_col = colMeans(is.na(dataMissing))*100,
+                       pct_miss_tot = count_miss/sum(count_miss)*100)
   return(as.data.frame(t(result)))
 }
 
diff --git a/R/globals.R b/R/globals.R
@@ -5,6 +5,8 @@ utils::globalVariables(c(
   "R.Replicate", # <analyze>
   "R.Condition", # <dataMissing>
   "R.Replicate", # <dataMissing>
+  "variable", # <dataMissing>
+  "value", # <dataMissing>
   "PG.Quantity", # <preProcessFiltering>
   "PG.NrOfStrippedSequencesIdentified", # <preProcessFiltering>
   "PG.ProteinNames", # <preProcessFiltering>

diff --git a/docs/articles/scaffold.html b/docs/articles/scaffold.html
diff --git a/docs/articles/usage_template.html b/docs/articles/usage_template.html
diff --git a/docs/articles/usage_template_files/figure-html/unnamed-chunk-16-1.png b/docs/articles/usage_template_files/figure-html/unnamed-chunk-16-1.png
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
@@ -4,7 +4,7 @@ pkgdown_sha: ~
 articles:
   scaffold: scaffold.html
   usage_template: usage_template.html
-last_built: 2024-07-25T19:05Z
+last_built: 2024-07-26T06:13Z
 urls:
   reference: https://uconn-scs.github.io/msDiaLogue/reference
   article: https://uconn-scs.github.io/msDiaLogue/articles
diff --git a/docs/reference/dataMissing.html b/docs/reference/dataMissing.html
diff --git a/docs/search.json b/docs/search.json
diff --git a/man/dataMissing.Rd b/man/dataMissing.Rd
diff --git a/tests/storedData/dataMissing_Toy.csv b/tests/storedData/dataMissing_Toy.csv
@@ -1,4 +1,4 @@
 "","RAB3D_HUMAN","ADH1_YEAST","LYSC_CHICK","BGAL_HUMAN","SYTC_HUMAN","CYC_BOVIN","PA1B2_HUMAN","TEBP_HUMAN","UAP1_HUMAN","B3GLT_HUMAN","NFXL1_HUMAN","VPS36_HUMAN","T126B_HUMAN","ORC3_HUMAN","BAG5_HUMAN","ANGL3_HUMAN","ZC11B_HUMAN","MAP11_HUMAN"
 "count_miss",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,3
-"pct_miss",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,40,30
-"pct_total_miss",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.5,50,37.5
+"pct_miss_col",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,40,30
+"pct_miss_tot",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.5,50,37.5
diff --git a/vignettes/usage_template.Rmd b/vignettes/usage_template.Rmd
@@ -349,9 +349,9 @@ providing the following information:
 
 + `count_miss`: The count of missing values for each protein.
 
-+ `pct-miss`: The percentage of missing values for each protein.
++ `pct_miss_col`: The percentage of missing values for each protein.
 
-+ `pct_total_miss`: The percentage of missing values for each protein relative to the
++ `pct_miss_tot`: The percentage of missing values for each protein relative to the
 total missing values in the entire dataset.
 
 <div style="overflow-x: auto;">