radiator 1.3.0 2024-02-22

thierrygosselin · thierrygosselin · commit a0c534586ffa · 2024-02-22T22:18:31.000-05:00
* Bug fix using coverage and DArT data
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: radiator
 Type: Package
 Title: RADseq Data Exploration, Manipulation and Visualization using R
-Version: 1.2.9
-Date: 2024-01-25
+Version: 1.3.0
+Date: 2024-02-22
 Encoding: UTF-8
 Authors@R: c(
   person("Thierry", "Gosselin", email = "thierrygosselin@icloud.com", role = c("aut", "cre")),
@@ -56,7 +56,7 @@ License: GPL-3
 LazyLoad: yes
 VignetteBuilder:
     knitr
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 ByteCompile: TRUE
 URL: https://thierrygosselin.github.io/radiator/
 BugReports: https://github.com/thierrygosselin/radiator/issues
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# radiator 1.3.0 2024-02-22
+
+* Bug fix using coverage and DArT data
+
+
 # radiator 1.2.9 2024-01-25
 
 * Bug fix stemming from genalex files and genind conversion
diff --git a/R/filter_coverage.R b/R/filter_coverage.R
@@ -186,7 +186,7 @@ filter_coverage <- function(
     verbose = verbose)
 
   # Verify that coverage information is present in the data...
-  depth.info <- check_coverage(gds = data, stacks.haplo.check = TRUE, dart.check = TRUE)
+  depth.info <- check_coverage(gds = data, genotypes.metadata.check = TRUE, stacks.haplo.check = TRUE, dart.check = TRUE)
   if (is.null(depth.info)) {
     message("\n\nCoverate information is not available for this dataset, returning GDS...")
     return(data)
@@ -204,6 +204,8 @@ filter_coverage <- function(
     individuals = FALSE,
     markers = TRUE,
     missing = FALSE,
+    coverage = TRUE,
+    # allele.coverage = TRUE,
     allele.coverage = FALSE,
     mac = FALSE,
     heterozygosity = FALSE,
diff --git a/R/gds.R b/R/gds.R
@@ -967,9 +967,9 @@ check_coverage <- function(gds, genotypes.metadata.check = FALSE, stacks.haplo.c
   # stacks haplotype vcf have the info fields for depth in the VCF header
   # but they do not have the info with genotypes...
   # it's laziness from stacks...
+  data.source <- extract_data_source(gds)
 
   if (stacks.haplo.check) {
-    data.source <- extract_data_source(gds)
     biallelic <- radiator::detect_biallelic_markers(data = gds)
     biallelic <- gdsfmt::read.gdsn(gdsfmt::index.gdsn(
       node = gds, path = "radiator/biallelic", silent = TRUE))
@@ -989,7 +989,10 @@ check_coverage <- function(gds, genotypes.metadata.check = FALSE, stacks.haplo.c
         markers.meta.select = c("AVG_COUNT_REF", "AVG_COUNT_SNP"),
         whitelist = TRUE
       )
-      if (!is.null(got.coverage)) got.coverage <- c("AVG_COUNT_REF", "AVG_COUNT_SNP")
+      if (!is.null(got.coverage)) {
+        got.coverage <- c("AVG_COUNT_REF", "AVG_COUNT_SNP")
+        return(got.coverage)
+      }
     }#End DART 1row and 2 rows
   }
 
@@ -1006,6 +1009,7 @@ check_coverage <- function(gds, genotypes.metadata.check = FALSE, stacks.haplo.c
       # this part might generate an error if you actually have genotypes metadata...
       # need to run tests...
       got.coverage <- geno.index
+      return(got.coverage)
     }
     geno.index <- NULL
   }
@@ -1018,13 +1022,9 @@ check_coverage <- function(gds, genotypes.metadata.check = FALSE, stacks.haplo.c
     check = "none", verbose = FALSE)$ID
 
   if (length(have) > 0) {
-    # if (!exhaustive) {
-    # want <- c("DP", "CATG")
-    # } else {
     want <- c("DP", "AD", "CATG")
-    # }
-
     got.coverage <- purrr::keep(.x = have, .p = have %in% want)
+    return(got.coverage)
   } else {
     got.coverage <- NULL
   }
@@ -2461,7 +2461,7 @@ generate_stats <- function(
       genotypes.metadata.check = TRUE,
       dart.check = TRUE
     )
-    if (!"DP" %in% got.coverage) coverage <- FALSE
+    if (!"DP" %in% got.coverage && !"READ_DEPTH" %in% got.coverage) coverage <- FALSE
     if (!"AD" %in% got.coverage) allele.coverage <- FALSE
     if (!exhaustive) allele.coverage <- FALSE
     got.coverage <- NULL
@@ -2496,28 +2496,52 @@ generate_stats <- function(
           replacement = c("COVERAGE_TOTAL", "COVERAGE_MEAN", "COVERAGE_MEDIAN", "COVERAGE_IQR"),
           vectorize_all = FALSE
         )
+        data.source <- radiator::extract_data_source(gds = gds)
+
+        if ("dart" %in% data.source) {
+          dart.data <- radiator::extract_genotypes_metadata(
+            gds = gds,
+            genotypes.meta.select = c("M_SEQ", "ID_SEQ", "READ_DEPTH"),
+            whitelist = TRUE
+          ) %>%
+            radiator::rad_wide(
+              x = .,
+              formula = "ID_SEQ ~ M_SEQ",
+              values_from = "READ_DEPTH"
+            ) %>%
+            dplyr::select(-ID_SEQ)
+          colnames(dart.data) <- NULL
+          dart.data <- as.matrix(dart.data)
+        } else {
+          dart.data <- NULL
+        }
 
         if (markers) {
-          dp_f_m <- function(gds, coverage.stats) {
-
+          dp_f_m <- function(gds, coverage.stats, dart.data) {
             # Using switch instead was not optimal for additional options in the func...
             if (coverage.stats == "sum") rad_cov_stats <- function(x) round(sum(x, na.rm = TRUE))
             if (coverage.stats == "mean") rad_cov_stats <- function(x) round(mean(x, na.rm = TRUE))
             if (coverage.stats == "median") rad_cov_stats <- function(x) round(stats::median(x, na.rm = TRUE))
             # if (coverage.stats == "iqr") rad_cov_stats <- function(x) round(abs(diff(stats::quantile(x, probs = c(0.25, 0.75), na.rm = TRUE))))
             if (coverage.stats == "iqr") rad_cov_stats <- function(x) round(matrixStats::iqr(x, na.rm = TRUE)) # faster
 
-            x <- SeqArray::seqApply(
-              gdsfile = gds,
-              var.name = "annotation/format/DP",
-              FUN = rad_cov_stats,
-              as.is = "integer",
-              margin = "by.variant",
-              parallel = TRUE
-            )
+            if (!is.null(dart.data)) {
+              x <- as.integer(apply(X = dart.data, MARGIN = 2, FUN = rad_cov_stats))
+              dart.data <- NULL
+            } else {
+              x <- SeqArray::seqApply(
+                gdsfile = gds,
+                var.name = "annotation/format/DP",
+                FUN = rad_cov_stats,
+                as.is = "integer",
+                margin = "by.variant",
+                parallel = TRUE
+              )
+            }
+            return(x)
           }
 
-          dp.m <- purrr::map_dfc(.x = coverage.stats.l, .f = dp_f_m, gds = gds)
+          dp.m <- purrr::map_dfc(.x = coverage.stats.l, .f = dp_f_m, gds = gds, dart.data = dart.data)
         }
 
         if (individuals) {
@@ -2527,10 +2551,16 @@ generate_stats <- function(
           # Note to myself: the huge speed gain by using other packages robustbse, Rfast, etc.
           # is not worth the unreliability of the results check your testing files...
 
-          dp.temp <- SeqArray::seqGetData(
-            gdsfile = gds,
-            var.name = "annotation/format/DP"
-          )
+          if ("dart" %in% data.source) {
+            dp.temp <- dart.data
+            dart.data <- NULL
+          } else {
+            dp.temp <- SeqArray::seqGetData(
+              gdsfile = gds,
+              var.name = "annotation/format/DP"
+            )
+          }
+
 
           dp_f_i <- function(coverage.stats, x) {
             if ("sum" %in% coverage.stats) x <- rowSums(x, na.rm = TRUE)
diff --git a/R/utils.R b/R/utils.R
@@ -977,9 +977,28 @@ strip_rad <- function(
 ) {
   objs <- utils::object.size(x)
 
-  # STRATA ----------
-  strata.n <- intersect(colnames(x), c("STRATA", "POP_ID"))
+  # Check if ID_SEQ, STRATA_SEQ and M_SEQ present...
+  if (rlang::has_name(x, "ID_SEQ") && rlang::has_name(x, "INDIVIDUALS")) {
+    x %<>% dplyr::select(-ID_SEQ)
+  }
+
+  if (rlang::has_name(x, "STRATA_SEQ")) {
+    strata.n <- intersect(colnames(x), c("STRATA", "POP_ID"))
+    if (length(strata.n) > 0 && strata.n %in% c("STRATA", "POP_ID")) {
+      x %<>% dplyr::select(-STRATA_SEQ)
+    }
+  }
 
+  if (rlang::has_name(x, "M_SEQ")) {
+    markers.n <- intersect(colnames(x), c("VARIANT_ID", "CHROM", "LOCUS", "POS", "MARKERS"))
+    if (length(markers.n) > 0) {
+      x %<>% dplyr::select(-M_SEQ)
+    }
+  }
+  strata.n <- markers.n <- NULL
+
+
+  # STRATA ----------
   if (rlang::has_name(x, "POP_ID")) {
     strata <- radiator::generate_strata(data = x, pop.id = TRUE) %>%
       dplyr::mutate(
@@ -1009,7 +1028,7 @@ strip_rad <- function(
     pos = env.arg,
     envir = env.arg
   )
-  cm <- keep.strata <- pop.id <- strata.n <- strata <- NULL
+  cm <- keep.strata <- pop.id <- strata <- NULL
 
   # MARKERS ---------
   x %<>%
diff --git a/README.md b/README.md
@@ -7,8 +7,8 @@ state and is being actively
 developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active)
 [![minimal R
 version](https://img.shields.io/badge/R%3E%3D-NA-6666ff.svg)](https://cran.r-project.org/)
-[![packageversion](https://img.shields.io/badge/Package%20version-1.2.9-orange.svg)](commits/master)
-[![Last-changedate](https://img.shields.io/badge/last%20change-2024--01--25-brightgreen.svg)](/commits/master)
+[![packageversion](https://img.shields.io/badge/Package%20version-1.3.0-orange.svg)](commits/master)
+[![Last-changedate](https://img.shields.io/badge/last%20change-2024--02--22-brightgreen.svg)](/commits/master)
 [![R-CMD-check](https://github.com/thierrygosselin/radiator/workflows/R-CMD-check/badge.svg)](https://github.com/thierrygosselin/radiator/actions)
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3687060.svg)](https://doi.org/10.5281/zenodo.3687060)
 <!-- badges: end -->