Skip to content

Commit

Permalink
* tidy_genind and tidy_genlight: fix a few bugs when MARKERS REF/…
Browse files Browse the repository at this point in the history
…ALT or POP_ID/STRATA where used.

* `write_genind` and `write_genlight`: works faster with GDS and tidy data.
* thanks @italo-granato that highlighted a bug with STRATA/POP_ID (pull request#40).
  • Loading branch information
thierrygosselin committed Mar 15, 2019
1 parent c75192d commit 7fef881
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 71 deletions.
6 changes: 3 additions & 3 deletions R/gds.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ radiator_gds <- function(

if (rlang::has_name(genotypes.df, "MARKERS")) {
genotypes.df %<>%
dplyr::select(-MARKERS)
data.matrix(.) %>%
dplyr::select(-MARKERS) %>%
data.matrix(.) %>%
magrittr::set_rownames(x = ., value = variant.id)
}

Expand Down Expand Up @@ -111,7 +111,7 @@ radiator_gds <- function(
separate_markers(
data = .,
sep = "__",
# markers.meta.all.only = TRUE,
markers.meta.all.only = TRUE,
biallelic = TRUE,
verbose = verbose)

Expand Down
55 changes: 37 additions & 18 deletions R/genind.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.

#' @param data A genind object in the global environment.
#' @param data (path or object) A genind object in the global environment or
#' path to a genind file that will be open with \code{readRDS}.

#' @param keep.allele.names Allows to keep allele names for the tidy dataset.
#' Requires the alleles to be numeric. To have this argument in
Expand Down Expand Up @@ -59,8 +60,9 @@ tidy_genind <- function(
) {

# Checking for missing and/or default arguments ------------------------------
if (missing(data)) stop("data argument required")
if (class(data)[1] != "genind") stop("Input is not a genind object")
if (missing(data)) rlang::abort("data argument required")
if (is.vector(data)) data <- readRDS(data)
if (class(data)[1] != "genind") rlang::abort("Input is not a genind object")
if (verbose) message("genind info:")
strata <- tibble::tibble(INDIVIDUALS = rownames(data@tab))
if (is.null(data@pop)) {
Expand All @@ -77,17 +79,31 @@ tidy_genind <- function(

if (gds) {
# prepare genind
alt.alleles <- tibble::tibble(MARKERS_ALLELES = colnames(data@tab), COUNT = colSums(x = data@tab, na.rm = TRUE)) %>%
alt.alleles <- tibble::tibble(
MARKERS_ALLELES = colnames(data@tab),
COUNT = colSums(x = data@tab, na.rm = TRUE)
) %>%
dplyr::mutate(
MARKERS = stringi::stri_extract_first_regex(str = colnames(data@tab), pattern = "^[^.]+"),
ALLELES = stringi::stri_extract_last_regex(str = colnames(data@tab), pattern = "(?<=\\.).*")
) %>%
dplyr::group_by(MARKERS) %>%
dplyr::mutate(REF = dplyr::if_else(COUNT == min(COUNT, na.rm = TRUE), "ALT", "REF")) %>%
dplyr::ungroup(.) %>%
dplyr::select(MARKERS_ALLELES, REF) %>%
dplyr::filter(REF == "ALT") %>%
dplyr::select(MARKERS_ALLELES) %$% MARKERS_ALLELES
dplyr::arrange(MARKERS, COUNT, ALLELES) %>%
dplyr::mutate(REF = rep(c("ALT", "REF"), n() / 2))

# check that REF/ALT are A, C, G, T before going further
# any(unique(alt.alleles$ALLELES) %in% c("A", "C", "G", "T"))
# any(unique(c("A", "DD", "G")) %in% c("A", "C", "G", "T"))

ref.alt <- dplyr::select(alt.alleles, MARKERS, REF, ALLELES) %>%
data.table::as.data.table(.) %>%
data.table::dcast.data.table(
data = .,
formula = MARKERS ~ REF,
value.var = "ALLELES"
) %>%
tibble::as_tibble(.)

alt.alleles %<>% dplyr::filter(REF == "ALT") %$% MARKERS_ALLELES

geno <- tibble::as_tibble(t(data@tab), rownames = "MARKERS") %>%
dplyr::filter(MARKERS %in% alt.alleles) %>%
Expand All @@ -97,14 +113,18 @@ tidy_genind <- function(
dplyr::arrange(VARIANT_ID)

alt.alleles <- NULL
markers.meta <- dplyr::select(geno, VARIANT_ID, MARKERS)
markers.meta <- dplyr::select(geno, VARIANT_ID, MARKERS) %>%
dplyr::left_join(ref.alt, by = "MARKERS")
ref.alt <- NULL

suppressWarnings(
geno %<>%
dplyr::select(-MARKERS) %>%
tibble::column_to_rownames(.data = ., var = "VARIANT_ID")
)

gds.filename <- radiator_gds(
source = "genind",
genotypes.df = geno,
strata = strata,
biallelic = TRUE,
Expand All @@ -113,7 +133,6 @@ tidy_genind <- function(
verbose = verbose
)
if (verbose) message("Written: GDS filename: ", gds.filename)

}# End gds genind

if (tidy) {
Expand Down Expand Up @@ -332,7 +351,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) {
variable.name = "ALLELES",
value.name = "n"
) %>%
tibble::as_data_frame(.) %>%
tibble::as_tibble(.) %>%
# tidyr::gather(data = ., key = ALLELES, value = n, -c(INDIVIDUALS, POP_ID, MARKERS)) %>%
dplyr::mutate(
ALLELES = dplyr::case_when(
Expand All @@ -349,7 +368,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) {
formula = POP_ID + INDIVIDUALS ~ MARKERS_ALLELES,
value.var = "n"
) %>%
tibble::as_data_frame(.) %>%
tibble::as_tibble(.) %>%
# dplyr::group_by(POP_ID, INDIVIDUALS) %>%
# tidyr::spread(data =., key = MARKERS_ALLELES, value = n) %>%
# dplyr::ungroup(.) %>%
Expand All @@ -367,7 +386,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) {
variable.name = "ALLELES",
value.name = "n"
) %>%
tibble::as_data_frame(.) %>%
tibble::as_tibble(.) %>%
# tidyr::gather(data = ., key = ALLELES, value = n, -c(INDIVIDUALS, POP_ID, MARKERS)) %>%
dplyr::mutate(MARKERS_ALLELES = stringi::stri_join(MARKERS, ALLELES, sep = ".")) %>%
dplyr::select(-MARKERS, -ALLELES) %>%
Expand All @@ -377,7 +396,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) {
formula = POP_ID + INDIVIDUALS ~ MARKERS_ALLELES,
value.var = "n"
) %>%
tibble::as_data_frame(.) %>%
tibble::as_tibble(.) %>%
# dplyr::group_by(POP_ID, INDIVIDUALS) %>%
# tidyr::spread(data =., key = MARKERS_ALLELES, value = n) %>%
# dplyr::ungroup(.) %>%
Expand Down Expand Up @@ -412,7 +431,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) {
variable.name = "ALLELES",
value.name = "GT"
) %>%
tibble::as_data_frame(.) %>%
tibble::as_tibble(.) %>%
dplyr::arrange(MARKERS, POP_ID, INDIVIDUALS, GT) %>%
dplyr::count(x = ., POP_ID, INDIVIDUALS, MARKERS, GT) %>%
dplyr::ungroup(.) %>%
Expand All @@ -431,7 +450,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) {
formula = POP_ID + INDIVIDUALS ~ MARKERS_ALLELES,
value.var = "n"
) %>%
tibble::as_data_frame(.) %>%
tibble::as_tibble(.) %>%
dplyr::arrange(POP_ID, INDIVIDUALS))
}

Expand Down
107 changes: 62 additions & 45 deletions R/genlight.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# tidy_genlight ----------------------------------------------------------------
#' @name tidy_genlight
#' @title Tidy a genlight object to a tidy dataframe
#' @description Tidy genlight object to a tidy dataframe.
#' @title Tidy a genlight object to a tidy dataframe and/or GDS object/file
#' @description Tidy genlight object to a tidy dataframe and/or GDS object/file.
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.

#' @param data A genlight object in the global environment.
#' @param data (path or object) A genlight object in the global environment or
#' path to a genlight file that will be open with \code{readRDS}.
#' @inheritParams tidy_genomic_data

#' @param tidy (logical) Generate a tidy dataset.
Expand Down Expand Up @@ -66,29 +67,23 @@ tidy_genlight <- function(
')
}

## test
# data
# verbose = TRUE

# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("data argument required")
if (class(data)[1] != "genlight") rlang::abort("Input is not a genlight object")

# Import data ---------------------------------------------------------------
if (is.vector(data)) data <- readRDS(data)
if (class(data)[1] != "genlight") rlang::abort("Input is not a genlight object")

if (verbose) message("genlight info:")
# strata ?
strata <- tibble::tibble(INDIVIDUALS = data@ind.names)
if (is.null(data@pop)) {
if (verbose) message(" strata: no")
if (verbose) message(" 'pop' will be added")
strata %<>% dplyr::mutate(STRATA = "pop")
# strata <- tibble::tibble(
# INDIVIDUALS = [email protected],
# POP_ID = rep("pop1", dim(data)[1]))
strata %<>% dplyr::mutate(POP_ID = "pop")
} else {
if (verbose) message(" strata: yes")
strata$STRATA = data@pop
strata$POP_ID = data@pop
}

n.markers <- dim(data)[2]
Expand Down Expand Up @@ -152,7 +147,7 @@ tidy_genlight <- function(
"GT_VCF", "GT_BIN", "GT")

if (verbose) message("Generating tidy data...")
data <- suppressWarnings(
tidy.data <- suppressWarnings(
data.frame(data) %>%
magrittr::set_colnames(x = ., value = markers$MARKERS) %>%
tibble::add_column(.data = ., INDIVIDUALS = rownames(.), .before = 1) %>%
Expand All @@ -163,7 +158,7 @@ tidy_genlight <- function(
variable.name = "MARKERS",
value.name = "GT_BIN"
) %>%
tibble::as_data_frame(.) %>%
tibble::as_tibble(.) %>%
dplyr::full_join(markers, by = "MARKERS") %>%
dplyr::full_join(strata, by = "INDIVIDUALS") %>%
dplyr::mutate(
Expand All @@ -188,19 +183,17 @@ tidy_genlight <- function(
dplyr::select(dplyr::one_of(want)))

if (write) {
radiator::write_rad(data = data, path = filename.genlight)
radiator::write_rad(data = tidy.data, path = filename.genlight)
if (verbose) message("File written: ", filename.short)
}

}#End tidy genlight
if (gds) {

markers %<>% dplyr::mutate(VARIANT_ID = as.integer(factor(MARKERS)))

# markers %<>% dplyr::mutate(VARIANT_ID = as.integer(factor(MARKERS)))
gds.filename <- radiator_gds(
source = "genlight",
genotypes.df = tibble::as_tibble(data.frame(data) %>% t) %>%
tibble::add_column(.data = ., VARIANT_ID = markers$VARIANT_ID, .before = 1) %>%
dplyr::arrange(VARIANT_ID),
tibble::add_column(.data = ., MARKERS = markers$MARKERS, .before = 1) %>%
dplyr::arrange(MARKERS),
strata = dplyr::rename(strata, STRATA = POP_ID),
biallelic = TRUE,
markers.meta = markers,
Expand All @@ -209,7 +202,13 @@ tidy_genlight <- function(
)
if (verbose) message("Written: GDS filename: ", gds.filename)
}# End gds genlight
return(data)

if (tidy) {
return(tidy.data)
} else {
message("returning GDS filename")
return(gds.filename)
}
} # End tidy_genlight

# write_genlight ----------------------------------------------------------------
Expand Down Expand Up @@ -271,46 +270,64 @@ write_genlight <- function(data, biallelic = TRUE, write = FALSE, verbose = FALS
data <- gds2tidy(gds = data, parallel.core = parallel::detectCores() - 1)
data.type <- "tbl_df"
} else {
want <- c("MARKERS", "POP_ID", "INDIVIDUALS", "REF", "ALT", "GT", "GT_BIN")
data <- suppressWarnings(radiator::tidy_wide(data = data, import.metadata = TRUE) %>%
dplyr::select(dplyr::one_of(want)) %>%
dplyr::arrange(POP_ID, INDIVIDUALS))
if (rlang::has_name(data, "STRATA")) data %<>% dplyr::rename(POP_ID = STRATA)
want <- c("MARKERS", "CHROM", "LOCUS", "POS", "POP_ID", "INDIVIDUALS", "REF", "ALT", "GT_VCF", "GT_BIN")
data <- suppressWarnings(
radiator::tidy_wide(data = data, import.metadata = TRUE) %>%
dplyr::select(dplyr::one_of(want)) %>%
dplyr::arrange(POP_ID, INDIVIDUALS)
)
}

# Detect if biallelic data
if (is.null(biallelic)) biallelic <- radiator::detect_biallelic_markers(data = input)
if (is.null(biallelic)) biallelic <- radiator::detect_biallelic_markers(data)
if (!biallelic) rlang::abort("genlight object requires biallelic genotypes")

# data = input
input %<>% dplyr::arrange(MARKERS, CHROM, LOCUS, POS, POP_ID, INDIVIDUALS)
marker.meta <- dplyr::distinct(.data = input, MARKERS, CHROM, LOCUS, POS)
want <- c("MARKERS", "CHROM", "LOCUS", "POS", "REF", "ALT")
markers.meta <- suppressWarnings(
dplyr::select(data, dplyr::one_of(want)) %>%
dplyr::distinct(MARKERS, .keep_all = TRUE) %>%
separate_markers(
data = .,
sep = "__",
markers.meta.all.only = TRUE,
biallelic = TRUE,
verbose = verbose)
)
data %<>% dplyr::arrange(MARKERS, POP_ID, INDIVIDUALS)


if (!tibble::has_name(input, "GT_BIN")) {
input$GT_BIN <- stringi::stri_replace_all_fixed(
str = input$GT_VCF,
if (!rlang::has_name(data, "GT_BIN") && rlang::has_name(data, "GT_VCF")) {
data$GT_BIN <- stringi::stri_replace_all_fixed(
str = data$GT_VCF,
pattern = c("0/0", "1/1", "0/1", "1/0", "./."),
replacement = c("0", "2", "1", "1", NA),
vectorize_all = FALSE
)
}

input <- dplyr::select(.data = input, MARKERS, POP_ID, INDIVIDUALS, GT_BIN) %>%
data %<>%
dplyr::select(MARKERS, POP_ID, INDIVIDUALS, GT_BIN) %>%
dplyr::mutate(GT_BIN = as.integer(GT_BIN)) %>%
dplyr::group_by(INDIVIDUALS, POP_ID) %>%
tidyr::spread(data = ., key = MARKERS, value = GT_BIN) %>%
dplyr::ungroup(.)
data.table::as.data.table(.) %>%
data.table::dcast.data.table(
data = .,
formula = POP_ID + INDIVIDUALS ~ MARKERS,
value.var = "GT_BIN"
) %>%
tibble::as_tibble(.)

# Generate genlight
genlight.object <- methods::new(
"genlight",
input[,-(1:2)],
data[,-(1:2)],
parallel = FALSE
)
adegenet::indNames(genlight.object) <- input$INDIVIDUALS
adegenet::pop(genlight.object) <- input$POP_ID
adegenet::chromosome(genlight.object) <- marker.meta$CHROM
adegenet::locNames(genlight.object) <- marker.meta$LOCUS
adegenet::position(genlight.object) <- marker.meta$POS
adegenet::indNames(genlight.object) <- data$INDIVIDUALS
adegenet::pop(genlight.object) <- data$POP_ID
adegenet::chromosome(genlight.object) <- markers.meta$CHROM
adegenet::locNames(genlight.object) <- markers.meta$LOCUS
adegenet::position(genlight.object) <- markers.meta$POS


# Check
Expand All @@ -335,4 +352,4 @@ write_genlight <- function(data, biallelic = TRUE, write = FALSE, verbose = FALS
if (verbose) message("File written: ", filename.short)
}
return(genlight.object)
} # End write_genlight
} # End write_genlight
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[![Travis-CI Build Status](https://travis-ci.org/thierrygosselin/radiator.svg?branch=master)](https://travis-ci.org/thierrygosselin/radiator) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/thierrygosselin/radiator?branch=master&svg=true)](https://ci.appveyor.com/project/thierrygosselin/radiator) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/radiator)](http://cran.r-project.org/package=radiator) [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) [![DOI](https://zenodo.org/badge/14548/thierrygosselin/radiator.svg)](https://zenodo.org/badge/latestdoi/14548/thierrygosselin/radiator)

[![packageversion](https://img.shields.io/badge/Package%20version-1.0.0-orange.svg)](commits/master) [![Last-changedate](https://img.shields.io/badge/last%20change-2019--03--12-brightgreen.svg)](/commits/master)
[![packageversion](https://img.shields.io/badge/Package%20version-1.0.0-orange.svg)](commits/master) [![Last-changedate](https://img.shields.io/badge/last%20change-2019--03--14-brightgreen.svg)](/commits/master)

------------------------------------------------------------------------

Expand Down
3 changes: 2 additions & 1 deletion man/tidy_genind.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 7fef881

Please sign in to comment.