thierrygosselin
diff --git a/‎.DS_Store
2 KB b/‎.DS_Store
2 KB
diff --git a/‎DESCRIPTION
+2-2 b/‎DESCRIPTION
+2-2
diff --git a/‎NEWS.md
+5 b/‎NEWS.md
+5
diff --git a/‎R/global_variables.R
+2-1 b/‎R/global_variables.R
+2-1
diff --git a/‎R/tidy_genomic_data.R
+6-2 b/‎R/tidy_genomic_data.R
+6-2
diff --git a/‎R/tidy_vcf.R
+22-11 b/‎R/tidy_vcf.R
+22-11
diff --git a/‎R/tidy_wide.R
+6-2 b/‎R/tidy_wide.R
+6-2
diff --git a/‎R/utils.R
-14 b/‎R/utils.R
-14
@@ -1,8 +1,8 @@
 Package: radiator
 Type: Package
 Title: RADseq Data Exploration, Manipulation and Visualization using R
-Version: 0.0.19
-Date: 2018-11-07
+Version: 0.0.20
+Date: 2018-11-12
 Encoding: UTF-8
 Authors@R: c(
   person("Thierry", "Gosselin", email = "[email protected]", role = c("aut", "cre")),
 
@@ -1,3 +1,8 @@
+# radiator v.0.0.20 2018-11-12
+
+* `tidy_vcf`, `tidy_genomic_data` and `genomic_converter`: works better with ipyrad VCFs
+
+
 # radiator v.0.0.19 2018-11-07
 
 * `tidy_vcf`, `tidy_genomic_data` and `genomic_converter`: works better with freebayes and stacks vcf
 
@@ -94,7 +94,8 @@ if (getRversion() >= "2.15.1") {
       "FILTER_SHORT_LD", "FILTER_STRANDS", "GENOTYPED_THRESHOLD", "MARKERS_A",
       "MARKERS_B", "MISSING_POP", "NUMBER_MARKERS", "SNP_PER_LOCUS", "id.select",
       "individuals.missing", "keep.both.strands", "n.ind", "snp.select.no.maf",
-      "common.markers", "FILTER_INDIVIDUALS_MISSING", "tidy.data", "num.variant"
+      "common.markers", "FILTER_INDIVIDUALS_MISSING", "tidy.data", "num.variant",
+      "STRANDS", "COVERAGE_TOTAL", "COVERAGE_MEAN"
     )
   )
 }
@@ -1105,7 +1105,7 @@ tidy_genomic_data <- function(
 
   # Blacklist genotypes --------------------------------------------------------
   if (is.null(blacklist.genotype)) { # no Whitelist
-    if (verbose) message("Erasing genotype: no")
+      if (verbose) message("Erasing genotype: no")
   } else {
     if (verbose) message("Erasing genotype: yes")
     want <- c("MARKERS", "CHROM", "LOCUS", "POS", "INDIVIDUALS")
@@ -1260,7 +1260,11 @@ tidy_genomic_data <- function(
   }
 
   # Results --------------------------------------------------------------------
-  input %<>% dplyr::arrange(POP_ID, INDIVIDUALS, MARKERS)
+  if (!is.null(strata)) {
+    input %<>% dplyr::arrange(POP_ID, INDIVIDUALS, MARKERS)
+  } else {
+    input %<>% dplyr::arrange(INDIVIDUALS, MARKERS)
+  }
 
 
   n.markers <- dplyr::n_distinct(input$MARKERS)
 
@@ -345,6 +345,7 @@ tidy_vcf <- function(
     tidy.vcf <- TRUE
   }
 
+  # Tidying TRUE  --------------------------------------------------------------
   if (tidy.vcf) {
     # re-calibration of ref/alt alleles ------------------------------------------
     if (ref.calibration) {
@@ -484,16 +485,25 @@ tidy_vcf <- function(
         check = "none", verbose = FALSE)$ID
       # current version doesn't deal well with PL with 3 fields separated with ","
       # want <- c("DP", "AD", "GL", "PL", "HQ", "GQ", "GOF", "NR", "NV", "RO", "QR", "AO", "QA")
-      want <- c("DP", "AD", "GL", "PL", "HQ", "GQ", "GOF", "NR", "NV")
-
-      if (!is.null(overwrite.metadata)) want <- overwrite.metadata
-      parse.format.list <- purrr::keep(.x = have, .p = have %in% want)
-      # work on parallelization of this part
-      data$tidy.data.metadata <- purrr::map(
-        .x = parse.format.list, .f = parse_gds_metadata, data = data,
-        verbose = verbose, parallel.core = parallel.core) %>%
-        purrr::flatten(.) %>%
-        purrr::flatten_df(.)
+
+      if (length(have) > 0) {
+        want <- c("DP", "AD", "GL", "PL", "HQ", "GQ", "GOF", "NR", "NV")
+
+        if (!is.null(overwrite.metadata)) want <- overwrite.metadata
+        if (verbose) message("    genotypes metadata: ", stringi::stri_join(want, collapse = ", "))
+
+        parse.format.list <- purrr::keep(.x = have, .p = have %in% want)
+        # work on parallelization of this part
+        data$tidy.data.metadata <- purrr::map(
+          .x = parse.format.list, .f = parse_gds_metadata, data = data,
+          verbose = verbose, parallel.core = parallel.core) %>%
+          purrr::flatten(.) %>%
+          purrr::flatten_df(.)
+      } else {
+        if (verbose) message("    genotypes metadata: none found")
+        vcf.metadata <- FALSE
+        data$tidy.data.metadata <- NULL
+      }
     }
 
     # Remove or not gds connection and file --------------------------------------
@@ -516,7 +526,8 @@ tidy_vcf <- function(
     ## Note to myself: check timing with 1M SNPs to see
     ## if this is more efficient than data.table melt...
 
-    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "COL", "REF", "ALT")
+    want <- intersect(c("MARKERS", "CHROM", "LOCUS", "POS", "COL", "REF", "ALT"),
+                      names(data$markers.meta))
     data$tidy.data <- suppressWarnings(
       dplyr::select(data$markers.meta, dplyr::one_of(want))) %>%
       dplyr::bind_cols(
 
@@ -94,7 +94,11 @@ tidy_wide <- function(data, import.metadata = FALSE) {
 
   # Determine long (tidy) or wide dataset
   if (!"MARKERS" %in% colnames(data) && !"LOCUS" %in% colnames(data)) {
-    data <- tidyr::gather(data = data, key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS))
+    if (tibble::has_name(data, "POP_ID")) {
+      data <- tidyr::gather(data = data, key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS))
+    } else {
+      data <- tidyr::gather(data = data, key = MARKERS, value = GT, -INDIVIDUALS)
+    }
   }
 
   # necessary steps to make sure we work with unique markers and not duplicated LOCUS
@@ -142,7 +146,7 @@ tidy_wide <- function(data, import.metadata = FALSE) {
   }
 
   data$INDIVIDUALS <- clean_ind_names(data$INDIVIDUALS)# clean id names
-  data$POP_ID <- clean_pop_names(data$POP_ID)# clean pop id
+  if (tibble::has_name(data, "POP_ID"))   data$POP_ID <- clean_pop_names(data$POP_ID)# clean pop id
   data <- dplyr::ungroup(data) # Make sure no data groupings exists
   return(data)
 }#End tidy_wide
@@ -444,11 +444,6 @@ interactive_question <- function(x, answer.opt = NULL, minmax = NULL) {
 check_header_source <- function(vcf) {
 
   check.header <- SeqArray::seqVCF_Header(vcf)
-
-  if (check.header$format$Number[check.header$format$ID == "AD"] == 1) {
-    check.header$format$Number[check.header$format$ID == "AD"] <- "."
-  }
-
   problematic.id <- c("AD", "AO", "QA", "GL")
   problematic.id <- purrr::keep(.x = problematic.id, .p = problematic.id %in% check.header$format$ID)
   for (p in problematic.id) {
@@ -465,15 +460,6 @@ check_header_source <- function(vcf) {
     if (!keep.stacks.gl) {
       check.header$format <- dplyr::filter(check.header$format, ID != "GL")
     }
-    # check for HQ in FORMAT header (stacks haplotypes specific adjustments)
-    # if (TRUE %in% c(check.header$format$ID == "HQ")) {
-    #   markers.info <- character(0)
-    #   overwrite.metadata <- "GT"
-    # } else {
-    #   markers.info <- NULL
-    #   overwrite.metadata <- NULL
-    # }
-    # This trick doesnt work because stacks SNP vcf also as the HQ in the header :(
     markers.info <- NULL
     overwrite.metadata <- NULL
   } else {
Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,8 @@ if (getRversion() >= "2.15.1") {`
`94`	`94`	`"FILTER_SHORT_LD", "FILTER_STRANDS", "GENOTYPED_THRESHOLD", "MARKERS_A",`
`95`	`95`	`"MARKERS_B", "MISSING_POP", "NUMBER_MARKERS", "SNP_PER_LOCUS", "id.select",`
`96`	`96`	`"individuals.missing", "keep.both.strands", "n.ind", "snp.select.no.maf",`
`97`		`- "common.markers", "FILTER_INDIVIDUALS_MISSING", "tidy.data", "num.variant"`
	`97`	`+ "common.markers", "FILTER_INDIVIDUALS_MISSING", "tidy.data", "num.variant",`
	`98`	`+ "STRANDS", "COVERAGE_TOTAL", "COVERAGE_MEAN"`
`98`	`99`	`)`
`99`	`100`	`)`
`100`	`101`	`}`