Skip to content

Commit f42f2fc

Browse files
# radiator v.0.0.20 2018-11-12
* `tidy_vcf`, `tidy_genomic_data` and `genomic_converter`: works better with ipyrad vcf's
1 parent 94f6146 commit f42f2fc

12 files changed

+262
-183
lines changed

.DS_Store

2 KB
Binary file not shown.

DESCRIPTION

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Package: radiator
22
Type: Package
33
Title: RADseq Data Exploration, Manipulation and Visualization using R
4-
Version: 0.0.19
5-
Date: 2018-11-07
4+
Version: 0.0.20
5+
Date: 2018-11-12
66
Encoding: UTF-8
77
Authors@R: c(
88
person("Thierry", "Gosselin", email = "[email protected]", role = c("aut", "cre")),

NEWS.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# radiator v.0.0.20 2018-11-12
2+
3+
* `tidy_vcf`, `tidy_genomic_data` and `genomic_converter`: works better with ipyrad VCFs
4+
5+
16
# radiator v.0.0.19 2018-11-07
27

38
* `tidy_vcf`, `tidy_genomic_data` and `genomic_converter`: works better with freebayes and stacks vcf

R/global_variables.R

+2-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ if (getRversion() >= "2.15.1") {
9494
"FILTER_SHORT_LD", "FILTER_STRANDS", "GENOTYPED_THRESHOLD", "MARKERS_A",
9595
"MARKERS_B", "MISSING_POP", "NUMBER_MARKERS", "SNP_PER_LOCUS", "id.select",
9696
"individuals.missing", "keep.both.strands", "n.ind", "snp.select.no.maf",
97-
"common.markers", "FILTER_INDIVIDUALS_MISSING", "tidy.data", "num.variant"
97+
"common.markers", "FILTER_INDIVIDUALS_MISSING", "tidy.data", "num.variant",
98+
"STRANDS", "COVERAGE_TOTAL", "COVERAGE_MEAN"
9899
)
99100
)
100101
}

R/tidy_genomic_data.R

+6-2
Original file line numberDiff line numberDiff line change
@@ -1105,7 +1105,7 @@ tidy_genomic_data <- function(
11051105

11061106
# Blacklist genotypes --------------------------------------------------------
11071107
if (is.null(blacklist.genotype)) { # no Whitelist
1108-
if (verbose) message("Erasing genotype: no")
1108+
if (verbose) message("Erasing genotype: no")
11091109
} else {
11101110
if (verbose) message("Erasing genotype: yes")
11111111
want <- c("MARKERS", "CHROM", "LOCUS", "POS", "INDIVIDUALS")
@@ -1260,7 +1260,11 @@ tidy_genomic_data <- function(
12601260
}
12611261

12621262
# Results --------------------------------------------------------------------
1263-
input %<>% dplyr::arrange(POP_ID, INDIVIDUALS, MARKERS)
1263+
if (!is.null(strata)) {
1264+
input %<>% dplyr::arrange(POP_ID, INDIVIDUALS, MARKERS)
1265+
} else {
1266+
input %<>% dplyr::arrange(INDIVIDUALS, MARKERS)
1267+
}
12641268

12651269

12661270
n.markers <- dplyr::n_distinct(input$MARKERS)

R/tidy_vcf.R

+22-11
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,7 @@ tidy_vcf <- function(
345345
tidy.vcf <- TRUE
346346
}
347347

348+
# Tidying TRUE --------------------------------------------------------------
348349
if (tidy.vcf) {
349350
# re-calibration of ref/alt alleles ------------------------------------------
350351
if (ref.calibration) {
@@ -484,16 +485,25 @@ tidy_vcf <- function(
484485
check = "none", verbose = FALSE)$ID
485486
# current version doesn't deal well with PL with 3 fields separated with ","
486487
# want <- c("DP", "AD", "GL", "PL", "HQ", "GQ", "GOF", "NR", "NV", "RO", "QR", "AO", "QA")
487-
want <- c("DP", "AD", "GL", "PL", "HQ", "GQ", "GOF", "NR", "NV")
488-
489-
if (!is.null(overwrite.metadata)) want <- overwrite.metadata
490-
parse.format.list <- purrr::keep(.x = have, .p = have %in% want)
491-
# work on parallelization of this part
492-
data$tidy.data.metadata <- purrr::map(
493-
.x = parse.format.list, .f = parse_gds_metadata, data = data,
494-
verbose = verbose, parallel.core = parallel.core) %>%
495-
purrr::flatten(.) %>%
496-
purrr::flatten_df(.)
488+
489+
if (length(have) > 0) {
490+
want <- c("DP", "AD", "GL", "PL", "HQ", "GQ", "GOF", "NR", "NV")
491+
492+
if (!is.null(overwrite.metadata)) want <- overwrite.metadata
493+
if (verbose) message(" genotypes metadata: ", stringi::stri_join(want, collapse = ", "))
494+
495+
parse.format.list <- purrr::keep(.x = have, .p = have %in% want)
496+
# work on parallelization of this part
497+
data$tidy.data.metadata <- purrr::map(
498+
.x = parse.format.list, .f = parse_gds_metadata, data = data,
499+
verbose = verbose, parallel.core = parallel.core) %>%
500+
purrr::flatten(.) %>%
501+
purrr::flatten_df(.)
502+
} else {
503+
if (verbose) message(" genotypes metadata: none found")
504+
vcf.metadata <- FALSE
505+
data$tidy.data.metadata <- NULL
506+
}
497507
}
498508

499509
# Remove or not gds connection and file --------------------------------------
@@ -516,7 +526,8 @@ tidy_vcf <- function(
516526
## Note to myself: check timing with 1M SNPs to see
517527
## if this is more efficient than data.table melt...
518528

519-
want <- c("MARKERS", "CHROM", "LOCUS", "POS", "COL", "REF", "ALT")
529+
want <- intersect(c("MARKERS", "CHROM", "LOCUS", "POS", "COL", "REF", "ALT"),
530+
names(data$markers.meta))
520531
data$tidy.data <- suppressWarnings(
521532
dplyr::select(data$markers.meta, dplyr::one_of(want))) %>%
522533
dplyr::bind_cols(

R/tidy_wide.R

+6-2
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,11 @@ tidy_wide <- function(data, import.metadata = FALSE) {
9494

9595
# Determine long (tidy) or wide dataset
9696
if (!"MARKERS" %in% colnames(data) && !"LOCUS" %in% colnames(data)) {
97-
data <- tidyr::gather(data = data, key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS))
97+
if (tibble::has_name(data, "POP_ID")) {
98+
data <- tidyr::gather(data = data, key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS))
99+
} else {
100+
data <- tidyr::gather(data = data, key = MARKERS, value = GT, -INDIVIDUALS)
101+
}
98102
}
99103

100104
# necessary steps to make sure we work with unique markers and not duplicated LOCUS
@@ -142,7 +146,7 @@ tidy_wide <- function(data, import.metadata = FALSE) {
142146
}
143147

144148
data$INDIVIDUALS <- clean_ind_names(data$INDIVIDUALS)# clean id names
145-
data$POP_ID <- clean_pop_names(data$POP_ID)# clean pop id
149+
if (tibble::has_name(data, "POP_ID")) data$POP_ID <- clean_pop_names(data$POP_ID)# clean pop id
146150
data <- dplyr::ungroup(data) # Make sure no data groupings exists
147151
return(data)
148152
}#End tidy_wide

R/utils.R

-14
Original file line numberDiff line numberDiff line change
@@ -444,11 +444,6 @@ interactive_question <- function(x, answer.opt = NULL, minmax = NULL) {
444444
check_header_source <- function(vcf) {
445445

446446
check.header <- SeqArray::seqVCF_Header(vcf)
447-
448-
if (check.header$format$Number[check.header$format$ID == "AD"] == 1) {
449-
check.header$format$Number[check.header$format$ID == "AD"] <- "."
450-
}
451-
452447
problematic.id <- c("AD", "AO", "QA", "GL")
453448
problematic.id <- purrr::keep(.x = problematic.id, .p = problematic.id %in% check.header$format$ID)
454449
for (p in problematic.id) {
@@ -465,15 +460,6 @@ check_header_source <- function(vcf) {
465460
if (!keep.stacks.gl) {
466461
check.header$format <- dplyr::filter(check.header$format, ID != "GL")
467462
}
468-
# check for HQ in FORMAT header (stacks haplotypes specific adjustments)
469-
# if (TRUE %in% c(check.header$format$ID == "HQ")) {
470-
# markers.info <- character(0)
471-
# overwrite.metadata <- "GT"
472-
# } else {
473-
# markers.info <- NULL
474-
# overwrite.metadata <- NULL
475-
# }
476-
# This trick doesn't work because stacks SNP vcf also has the HQ in the header :(
477463
markers.info <- NULL
478464
overwrite.metadata <- NULL
479465
} else {

0 commit comments

Comments
 (0)