Skip to content

Commit

Permalink
fix dataMissing
Browse files Browse the repository at this point in the history
  • Loading branch information
Carol-seven committed Jul 26, 2024
1 parent c0ee0ad commit 5801a83
Show file tree
Hide file tree
Showing 13 changed files with 152 additions and 36 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Imports:
ggplot2,
ggrepel,
ggvenn,
glue,
impute,
limma,
mice,
Expand All @@ -32,13 +33,13 @@ Imports:
pheatmap,
psych,
readxl,
scales,
seqinr,
softImpute,
tidyr (>= 1.3.0),
tibble,
tictoc,
utils,
visdat
utils
Suggests:
roxyglobals,
testthat (>= 3.0.0)
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ importFrom(Rdpack,reprompt)
importFrom(UpSetR,fromList)
importFrom(UpSetR,upset)
importFrom(data.table,fread)
importFrom(glue,glue)
importFrom(impute,impute.knn)
importFrom(limma,normalizeQuantiles)
importFrom(mice,complete)
Expand All @@ -48,6 +49,7 @@ importFrom(pcaMethods,completeObs)
importFrom(pcaMethods,pca)
importFrom(psych,describeBy)
importFrom(readxl,read_excel)
importFrom(scales,percent)
importFrom(seqinr,getName)
importFrom(seqinr,read.fasta)
importFrom(seqinr,write.fasta)
Expand All @@ -69,4 +71,3 @@ importFrom(tibble,column_to_rownames)
importFrom(tibble,rownames_to_column)
importFrom(utils,read.csv)
importFrom(utils,write.csv)
importFrom(visdat,vis_miss)
96 changes: 83 additions & 13 deletions R/dataMissing.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,45 +27,115 @@
#'
#' @param dataSet The 2d data set of experimental values.
#'
#' @param sort_miss A boolean (default = FALSE) specifying whether to arrange the columns
#' in descending order of missingness (most missing values first).
#'
#' @param plot A boolean (default = FALSE) specifying whether to plot the missingness.
#'
#' @param show_pct_legend A boolean (default = TRUE) specifying whether the percentages of
#' missing and present values in the entire dataset are shown in the legend of the
#' visualization when \code{plot = TRUE}.
#'
#' @param show_labels A boolean (default = TRUE) specifying whether protein names are
#' shown in the visualization when \code{plot = TRUE}.
#'
#' @param show_pct_col A boolean (default = TRUE) specifying whether the percentage of
#' missing values for each protein is appended to that protein's label in the
#' visualization when \code{plot = TRUE} and \code{show_labels = TRUE}.
#'
#' @import dplyr
#' @import ggplot2
#' @importFrom visdat vis_miss
#' @import tidyr
#' @importFrom glue glue
#' @importFrom scales percent
#'
#' @returns A 2d dataframe including:
#' \itemize{
#' \item "count_miss": The count of missing values for each protein.
#' \item "pct-miss": The percentage of missing values for each protein.
#' \item "pct_total_miss": The percentage of missing values for each protein relative to
#' \item "pct_miss_col": The percentage of missing values for each protein.
#' \item "pct_miss_tot": The percentage of missing values for each protein relative to
#' the total missing values in the entire dataset.
#' }
#'
#' @autoglobal
#'
#' @export

dataMissing <- function(dataSet, sort_miss = FALSE,
                        plot = FALSE, show_pct_legend = TRUE,
                        show_labels = TRUE, show_pct_col = TRUE) {

  # Summarise (and optionally visualise) the missing values per column.
  # The R.Condition and R.Replicate columns are removed first, so every
  # remaining column is treated as one protein.
  dataProtein <- select(dataSet, -c(R.Condition, R.Replicate))

  if (sort_miss) {
    # Columns with the most missing values come first. drop = FALSE keeps a
    # data frame even when only a single protein column is left.
    missOrder <- names(sort(colSums(is.na(dataProtein)), decreasing = TRUE))
    dataProtein <- dataProtein[, missOrder, drop = FALSE]
  }

  # Logical matrix of missingness, computed once and reused for the legend,
  # the axis labels and the returned summary.
  isMissing <- is.na(dataProtein)

  if (plot) {

    # Long format: one record per (row, protein) cell, value = TRUE if missing.
    plotdf <- dataProtein %>%
      mutate(row = row_number()) %>%
      pivot_longer(cols = -row, names_to = "variable", values_to = "value",
                   values_transform = list(value = is.na))

    # Legend labels, optionally with the overall missing/present percentages.
    if (show_pct_legend) {
      pct_missing <- mean(isMissing) * 100
      if (pct_missing == 0) {
        lab_missing <- "No Missing Values"
        lab_present <- "Present (100%)"
      } else if (pct_missing < 0.1) {
        lab_missing <- "Missing (< 0.1%)"
        lab_present <- "Present (> 99.9%)"
      } else {
        pct_missing <- round(pct_missing, 1)
        # Re-round so floating-point residue never leaks into the label.
        pct_present <- round(100 - pct_missing, 1)
        lab_missing <- glue::glue("Missing\n({pct_missing}%)")
        lab_present <- glue::glue("Present\n({pct_present}%)")
      }
    } else {
      lab_missing <- "Missing"
      lab_present <- "Present"
    }

    # Decide what is printed along the (top) x axis.
    if (!show_labels) {
      # NULL is the documented way to suppress axis labels.
      x_labels <- NULL
    } else if (show_pct_col) {
      # case_when() is vectorised, so it is applied to the whole vector of
      # per-protein missing fractions at once.
      frac_miss <- colMeans(isMissing)
      lab_pct_miss_col <- case_when(
        frac_miss == 0 ~ "0%",
        frac_miss < 0.001 ~ "<0.1%",
        frac_miss < 0.01 ~ "<1%",
        frac_miss >= 0.01 ~ scales::percent(frac_miss, accuracy = 1))
      x_labels <- glue::glue("{names(dataProtein)} ({lab_pct_miss_col})")
    } else {
      # Default labels, i.e. the protein names themselves.
      x_labels <- waiver()
    }

    # `limits` is set in every case so the axis follows the column order of
    # the data (including the sort_miss ordering) rather than alphabetical
    # order, whether or not labels are shown.
    missPlot <- ggplot(plotdf, aes(x = variable, y = row)) +
      geom_raster(aes(fill = value)) +
      scale_fill_manual(name = "", breaks = c("TRUE", "FALSE"),
                        values = c("grey20", "grey80"),
                        labels = c(lab_missing, lab_present)) +
      scale_x_discrete(position = "top", limits = names(dataProtein),
                       labels = x_labels) +
      scale_y_reverse() +
      theme_minimal() +
      labs(x = "", y = "Observations") +
      theme(legend.position = "bottom",
            axis.text.x = element_text(angle = 45, hjust = 0))

    print(missPlot)
  }

  # Per-protein summary. pct_miss_tot is NaN for every protein when the data
  # set contains no missing values at all (0 / 0).
  count_miss <- colSums(isMissing)
  result <- data.frame(count_miss,
                       pct_miss_col = colMeans(isMissing) * 100,
                       pct_miss_tot = count_miss / sum(count_miss) * 100)

  # Transpose so that proteins are columns and the three statistics are rows.
  as.data.frame(t(result))
}

2 changes: 2 additions & 0 deletions R/globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ utils::globalVariables(c(
"R.Replicate", # <analyze>
"R.Condition", # <dataMissing>
"R.Replicate", # <dataMissing>
"variable", # <dataMissing>
"value", # <dataMissing>
"PG.Quantity", # <preProcessFiltering>
"PG.NrOfStrippedSequencesIdentified", # <preProcessFiltering>
"PG.ProteinNames", # <preProcessFiltering>
Expand Down
2 changes: 1 addition & 1 deletion docs/articles/scaffold.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 7 additions & 7 deletions docs/articles/usage_template.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion docs/pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ pkgdown_sha: ~
articles:
scaffold: scaffold.html
usage_template: usage_template.html
last_built: 2024-07-25T19:05Z
last_built: 2024-07-26T06:13Z
urls:
reference: https://uconn-scs.github.io/msDiaLogue/reference
article: https://uconn-scs.github.io/msDiaLogue/articles
30 changes: 27 additions & 3 deletions docs/reference/dataMissing.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion docs/search.json

Large diffs are not rendered by default.

24 changes: 21 additions & 3 deletions man/dataMissing.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions tests/storedData/dataMissing_Toy.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"","RAB3D_HUMAN","ADH1_YEAST","LYSC_CHICK","BGAL_HUMAN","SYTC_HUMAN","CYC_BOVIN","PA1B2_HUMAN","TEBP_HUMAN","UAP1_HUMAN","B3GLT_HUMAN","NFXL1_HUMAN","VPS36_HUMAN","T126B_HUMAN","ORC3_HUMAN","BAG5_HUMAN","ANGL3_HUMAN","ZC11B_HUMAN","MAP11_HUMAN"
"count_miss",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,3
"pct_miss",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,40,30
"pct_total_miss",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.5,50,37.5
"pct_miss_col",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,40,30
"pct_miss_tot",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.5,50,37.5
4 changes: 2 additions & 2 deletions vignettes/usage_template.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,9 @@ providing the following information:

+ `count_miss`: The count of missing values for each protein.

+ `pct-miss`: The percentage of missing values for each protein.
+ `pct_miss_col`: The percentage of missing values for each protein.

+ `pct_total_miss`: The percentage of missing values for each protein relative to the
+ `pct_miss_tot`: The percentage of missing values for each protein relative to the
total missing values in the entire dataset.

<div style="overflow-x: auto;">
Expand Down

0 comments on commit 5801a83

Please sign in to comment.