From 7fef88175de7c8560f8248122b3bff83865deae0 Mon Sep 17 00:00:00 2001 From: thierrygosselin Date: Thu, 14 Mar 2019 21:53:14 -0400 Subject: [PATCH] * `tidy_genind` and `tidy_genlight`: fix a few bugs when MARKERS REF/ALT or POP_ID/STRATA where used. * `write_genind` and `write_genlight`: works faster with GDS and tidy data. * thanks @italo-granato that highlighted a bug with STRATA/POP_ID (pull request#40). --- R/gds.R | 6 +-- R/genind.R | 55 ++++++++++++++-------- R/genlight.R | 107 +++++++++++++++++++++++++------------------ README.md | 2 +- man/tidy_genind.Rd | 3 +- man/tidy_genlight.Rd | 7 +-- 6 files changed, 109 insertions(+), 71 deletions(-) diff --git a/R/gds.R b/R/gds.R index 1b92647f..77a83cf3 100644 --- a/R/gds.R +++ b/R/gds.R @@ -66,8 +66,8 @@ radiator_gds <- function( if (rlang::has_name(genotypes.df, "MARKERS")) { genotypes.df %<>% - dplyr::select(-MARKERS) - data.matrix(.) %>% + dplyr::select(-MARKERS) %>% + data.matrix(.) %>% magrittr::set_rownames(x = ., value = variant.id) } @@ -111,7 +111,7 @@ radiator_gds <- function( separate_markers( data = ., sep = "__", - # markers.meta.all.only = TRUE, + markers.meta.all.only = TRUE, biallelic = TRUE, verbose = verbose) diff --git a/R/genind.R b/R/genind.R index c5d786b3..4b19d213 100644 --- a/R/genind.R +++ b/R/genind.R @@ -6,7 +6,8 @@ #' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator} #' and might be of interest for users. -#' @param data A genind object in the global environment. +#' @param data (path or object) A genind object in the global environment or +#' path to a genind file that will be open with \code{readRDS}. #' @param keep.allele.names Allows to keep allele names for the tidy dataset. #' Requires the alleles to be numeric. To have this argument in @@ -59,8 +60,9 @@ tidy_genind <- function( ) { # Checking for missing and/or default arguments ------------------------------ - if (missing(data)) stop("data argument required") - if (class(data)[1] != "genind") stop("Input is not a genind object") + if (missing(data)) rlang::abort("data argument required") + if (is.vector(data)) data <- readRDS(data) + if (class(data)[1] != "genind") rlang::abort("Input is not a genind object") if (verbose) message("genind info:") strata <- tibble::tibble(INDIVIDUALS = rownames(data@tab)) if (is.null(data@pop)) { @@ -77,17 +79,31 @@ tidy_genind <- function( if (gds) { # prepare genind - alt.alleles <- tibble::tibble(MARKERS_ALLELES = colnames(data@tab), COUNT = colSums(x = data@tab, na.rm = TRUE)) %>% + alt.alleles <- tibble::tibble( + MARKERS_ALLELES = colnames(data@tab), + COUNT = colSums(x = data@tab, na.rm = TRUE) + ) %>% dplyr::mutate( MARKERS = stringi::stri_extract_first_regex(str = colnames(data@tab), pattern = "^[^.]+"), ALLELES = stringi::stri_extract_last_regex(str = colnames(data@tab), pattern = "(?<=\\.).*") ) %>% - dplyr::group_by(MARKERS) %>% - dplyr::mutate(REF = dplyr::if_else(COUNT == min(COUNT, na.rm = TRUE), "ALT", "REF")) %>% - dplyr::ungroup(.) %>% - dplyr::select(MARKERS_ALLELES, REF) %>% - dplyr::filter(REF == "ALT") %>% - dplyr::select(MARKERS_ALLELES) %$% MARKERS_ALLELES + dplyr::arrange(MARKERS, COUNT, ALLELES) %>% + dplyr::mutate(REF = rep(c("ALT", "REF"), n() / 2)) + + # check that REF/ALT are A, C, G, T before going further + # any(unique(alt.alleles$ALLELES) %in% c("A", "C", "G", "T")) + # any(unique(c("A", "DD", "G")) %in% c("A", "C", "G", "T")) + + ref.alt <- dplyr::select(alt.alleles, MARKERS, REF, ALLELES) %>% + data.table::as.data.table(.) %>% + data.table::dcast.data.table( + data = ., + formula = MARKERS ~ REF, + value.var = "ALLELES" + ) %>% + tibble::as_tibble(.) + + alt.alleles %<>% dplyr::filter(REF == "ALT") %$% MARKERS_ALLELES geno <- tibble::as_tibble(t(data@tab), rownames = "MARKERS") %>% dplyr::filter(MARKERS %in% alt.alleles) %>% @@ -97,7 +113,10 @@ tidy_genind <- function( dplyr::arrange(VARIANT_ID) alt.alleles <- NULL - markers.meta <- dplyr::select(geno, VARIANT_ID, MARKERS) + markers.meta <- dplyr::select(geno, VARIANT_ID, MARKERS) %>% + dplyr::left_join(ref.alt, by = "MARKERS") + ref.alt <- NULL + suppressWarnings( geno %<>% dplyr::select(-MARKERS) %>% @@ -105,6 +124,7 @@ tidy_genind <- function( ) gds.filename <- radiator_gds( + source = "genind", genotypes.df = geno, strata = strata, biallelic = TRUE, @@ -113,7 +133,6 @@ tidy_genind <- function( verbose = verbose ) if (verbose) message("Written: GDS filename: ", gds.filename) - }# End gds genind if (tidy) { @@ -332,7 +351,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) { variable.name = "ALLELES", value.name = "n" ) %>% - tibble::as_data_frame(.) %>% + tibble::as_tibble(.) %>% # tidyr::gather(data = ., key = ALLELES, value = n, -c(INDIVIDUALS, POP_ID, MARKERS)) %>% dplyr::mutate( ALLELES = dplyr::case_when( @@ -349,7 +368,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) { formula = POP_ID + INDIVIDUALS ~ MARKERS_ALLELES, value.var = "n" ) %>% - tibble::as_data_frame(.) %>% + tibble::as_tibble(.) %>% # dplyr::group_by(POP_ID, INDIVIDUALS) %>% # tidyr::spread(data =., key = MARKERS_ALLELES, value = n) %>% # dplyr::ungroup(.) %>% @@ -367,7 +386,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) { variable.name = "ALLELES", value.name = "n" ) %>% - tibble::as_data_frame(.) %>% + tibble::as_tibble(.) %>% # tidyr::gather(data = ., key = ALLELES, value = n, -c(INDIVIDUALS, POP_ID, MARKERS)) %>% dplyr::mutate(MARKERS_ALLELES = stringi::stri_join(MARKERS, ALLELES, sep = ".")) %>% dplyr::select(-MARKERS, -ALLELES) %>% @@ -377,7 +396,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) { formula = POP_ID + INDIVIDUALS ~ MARKERS_ALLELES, value.var = "n" ) %>% - tibble::as_data_frame(.) %>% + tibble::as_tibble(.) %>% # dplyr::group_by(POP_ID, INDIVIDUALS) %>% # tidyr::spread(data =., key = MARKERS_ALLELES, value = n) %>% # dplyr::ungroup(.) %>% @@ -412,7 +431,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) { variable.name = "ALLELES", value.name = "GT" ) %>% - tibble::as_data_frame(.) %>% + tibble::as_tibble(.) %>% dplyr::arrange(MARKERS, POP_ID, INDIVIDUALS, GT) %>% dplyr::count(x = ., POP_ID, INDIVIDUALS, MARKERS, GT) %>% dplyr::ungroup(.) %>% @@ -431,7 +450,7 @@ write_genind <- function(data, write = FALSE, verbose = FALSE) { formula = POP_ID + INDIVIDUALS ~ MARKERS_ALLELES, value.var = "n" ) %>% - tibble::as_data_frame(.) %>% + tibble::as_tibble(.) %>% dplyr::arrange(POP_ID, INDIVIDUALS)) } diff --git a/R/genlight.R b/R/genlight.R index 578a2b29..1d925029 100644 --- a/R/genlight.R +++ b/R/genlight.R @@ -1,11 +1,12 @@ # tidy_genlight ---------------------------------------------------------------- #' @name tidy_genlight -#' @title Tidy a genlight object to a tidy dataframe -#' @description Tidy genlight object to a tidy dataframe. +#' @title Tidy a genlight object to a tidy dataframe and/or GDS object/file +#' @description Tidy genlight object to a tidy dataframe and/or GDS object/file. #' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator} #' and might be of interest for users. -#' @param data A genlight object in the global environment. +#' @param data (path or object) A genlight object in the global environment or +#' path to a genlight file that will be open with \code{readRDS}. #' @inheritParams tidy_genomic_data #' @param tidy (logical) Generate a tidy dataset. @@ -66,15 +67,12 @@ tidy_genlight <- function( ') } - ## test - # data - # verbose = TRUE - # Checking for missing and/or default arguments ------------------------------ if (missing(data)) rlang::abort("data argument required") - if (class(data)[1] != "genlight") rlang::abort("Input is not a genlight object") # Import data --------------------------------------------------------------- + if (is.vector(data)) data <- readRDS(data) + if (class(data)[1] != "genlight") rlang::abort("Input is not a genlight object") if (verbose) message("genlight info:") # strata ? @@ -82,13 +80,10 @@ tidy_genlight <- function( if (is.null(data@pop)) { if (verbose) message(" strata: no") if (verbose) message(" 'pop' will be added") - strata %<>% dplyr::mutate(STRATA = "pop") - # strata <- tibble::tibble( - # INDIVIDUALS = data@ind.names, - # POP_ID = rep("pop1", dim(data)[1])) + strata %<>% dplyr::mutate(POP_ID = "pop") } else { if (verbose) message(" strata: yes") - strata$STRATA = data@pop + strata$POP_ID = data@pop } n.markers <- dim(data)[2] @@ -152,7 +147,7 @@ tidy_genlight <- function( "GT_VCF", "GT_BIN", "GT") if (verbose) message("Generating tidy data...") - data <- suppressWarnings( + tidy.data <- suppressWarnings( data.frame(data) %>% magrittr::set_colnames(x = ., value = markers$MARKERS) %>% tibble::add_column(.data = ., INDIVIDUALS = rownames(.), .before = 1) %>% @@ -163,7 +158,7 @@ tidy_genlight <- function( variable.name = "MARKERS", value.name = "GT_BIN" ) %>% - tibble::as_data_frame(.) %>% + tibble::as_tibble(.) %>% dplyr::full_join(markers, by = "MARKERS") %>% dplyr::full_join(strata, by = "INDIVIDUALS") %>% dplyr::mutate( @@ -188,19 +183,17 @@ tidy_genlight <- function( dplyr::select(dplyr::one_of(want))) if (write) { - radiator::write_rad(data = data, path = filename.genlight) + radiator::write_rad(data = tidy.data, path = filename.genlight) if (verbose) message("File written: ", filename.short) } - }#End tidy genlight if (gds) { - - markers %<>% dplyr::mutate(VARIANT_ID = as.integer(factor(MARKERS))) - + # markers %<>% dplyr::mutate(VARIANT_ID = as.integer(factor(MARKERS))) gds.filename <- radiator_gds( + source = "genlight", genotypes.df = tibble::as_tibble(data.frame(data) %>% t) %>% - tibble::add_column(.data = ., VARIANT_ID = markers$VARIANT_ID, .before = 1) %>% - dplyr::arrange(VARIANT_ID), + tibble::add_column(.data = ., MARKERS = markers$MARKERS, .before = 1) %>% + dplyr::arrange(MARKERS), strata = dplyr::rename(strata, STRATA = POP_ID), biallelic = TRUE, markers.meta = markers, @@ -209,7 +202,13 @@ tidy_genlight <- function( ) if (verbose) message("Written: GDS filename: ", gds.filename) }# End gds genlight - return(data) + + if (tidy) { + return(tidy.data) + } else { + message("returning GDS filename") + return(gds.filename) + } } # End tidy_genlight # write_genlight ---------------------------------------------------------------- @@ -271,46 +270,64 @@ write_genlight <- function(data, biallelic = TRUE, write = FALSE, verbose = FALS data <- gds2tidy(gds = data, parallel.core = parallel::detectCores() - 1) data.type <- "tbl_df" } else { - want <- c("MARKERS", "POP_ID", "INDIVIDUALS", "REF", "ALT", "GT", "GT_BIN") - data <- suppressWarnings(radiator::tidy_wide(data = data, import.metadata = TRUE) %>% - dplyr::select(dplyr::one_of(want)) %>% - dplyr::arrange(POP_ID, INDIVIDUALS)) + if (rlang::has_name(data, "STRATA")) data %<>% dplyr::rename(POP_ID = STRATA) + want <- c("MARKERS", "CHROM", "LOCUS", "POS", "POP_ID", "INDIVIDUALS", "REF", "ALT", "GT_VCF", "GT_BIN") + data <- suppressWarnings( + radiator::tidy_wide(data = data, import.metadata = TRUE) %>% + dplyr::select(dplyr::one_of(want)) %>% + dplyr::arrange(POP_ID, INDIVIDUALS) + ) } # Detect if biallelic data - if (is.null(biallelic)) biallelic <- radiator::detect_biallelic_markers(data = input) + if (is.null(biallelic)) biallelic <- radiator::detect_biallelic_markers(data) if (!biallelic) rlang::abort("genlight object requires biallelic genotypes") - # data = input - input %<>% dplyr::arrange(MARKERS, CHROM, LOCUS, POS, POP_ID, INDIVIDUALS) - marker.meta <- dplyr::distinct(.data = input, MARKERS, CHROM, LOCUS, POS) + want <- c("MARKERS", "CHROM", "LOCUS", "POS", "REF", "ALT") + markers.meta <- suppressWarnings( + dplyr::select(data, dplyr::one_of(want)) %>% + dplyr::distinct(MARKERS, .keep_all = TRUE) %>% + separate_markers( + data = ., + sep = "__", + markers.meta.all.only = TRUE, + biallelic = TRUE, + verbose = verbose) + ) + data %<>% dplyr::arrange(MARKERS, POP_ID, INDIVIDUALS) + - if (!tibble::has_name(input, "GT_BIN")) { - input$GT_BIN <- stringi::stri_replace_all_fixed( - str = input$GT_VCF, + if (!rlang::has_name(data, "GT_BIN") && rlang::has_name(data, "GT_VCF")) { + data$GT_BIN <- stringi::stri_replace_all_fixed( + str = data$GT_VCF, pattern = c("0/0", "1/1", "0/1", "1/0", "./."), replacement = c("0", "2", "1", "1", NA), vectorize_all = FALSE ) } - input <- dplyr::select(.data = input, MARKERS, POP_ID, INDIVIDUALS, GT_BIN) %>% + data %<>% + dplyr::select(MARKERS, POP_ID, INDIVIDUALS, GT_BIN) %>% dplyr::mutate(GT_BIN = as.integer(GT_BIN)) %>% - dplyr::group_by(INDIVIDUALS, POP_ID) %>% - tidyr::spread(data = ., key = MARKERS, value = GT_BIN) %>% - dplyr::ungroup(.) + data.table::as.data.table(.) %>% + data.table::dcast.data.table( + data = ., + formula = POP_ID + INDIVIDUALS ~ MARKERS, + value.var = "GT_BIN" + ) %>% + tibble::as_tibble(.) # Generate genlight genlight.object <- methods::new( "genlight", - input[,-(1:2)], + data[,-(1:2)], parallel = FALSE ) - adegenet::indNames(genlight.object) <- input$INDIVIDUALS - adegenet::pop(genlight.object) <- input$POP_ID - adegenet::chromosome(genlight.object) <- marker.meta$CHROM - adegenet::locNames(genlight.object) <- marker.meta$LOCUS - adegenet::position(genlight.object) <- marker.meta$POS + adegenet::indNames(genlight.object) <- data$INDIVIDUALS + adegenet::pop(genlight.object) <- data$POP_ID + adegenet::chromosome(genlight.object) <- markers.meta$CHROM + adegenet::locNames(genlight.object) <- markers.meta$LOCUS + adegenet::position(genlight.object) <- markers.meta$POS # Check @@ -335,4 +352,4 @@ write_genlight <- function(data, biallelic = TRUE, write = FALSE, verbose = FALS if (verbose) message("File written: ", filename.short) } return(genlight.object) - } # End write_genlight +} # End write_genlight diff --git a/README.md b/README.md index 9075435a..1775eca1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Travis-CI Build Status](https://travis-ci.org/thierrygosselin/radiator.svg?branch=master)](https://travis-ci.org/thierrygosselin/radiator) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/thierrygosselin/radiator?branch=master&svg=true)](https://ci.appveyor.com/project/thierrygosselin/radiator) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/radiator)](http://cran.r-project.org/package=radiator) [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) [![DOI](https://zenodo.org/badge/14548/thierrygosselin/radiator.svg)](https://zenodo.org/badge/latestdoi/14548/thierrygosselin/radiator) -[![packageversion](https://img.shields.io/badge/Package%20version-1.0.0-orange.svg)](commits/master) [![Last-changedate](https://img.shields.io/badge/last%20change-2019--03--12-brightgreen.svg)](/commits/master) +[![packageversion](https://img.shields.io/badge/Package%20version-1.0.0-orange.svg)](commits/master) [![Last-changedate](https://img.shields.io/badge/last%20change-2019--03--14-brightgreen.svg)](/commits/master) ------------------------------------------------------------------------ diff --git a/man/tidy_genind.Rd b/man/tidy_genind.Rd index d7934428..ddb0ba3e 100644 --- a/man/tidy_genind.Rd +++ b/man/tidy_genind.Rd @@ -8,7 +8,8 @@ tidy_genind(data, keep.allele.names = FALSE, tidy = TRUE, gds = TRUE, write = FALSE, verbose = FALSE) } \arguments{ -\item{data}{A genind object in the global environment.} +\item{data}{(path or object) A genind object in the global environment or +path to a genind file that will be open with \code{readRDS}.} \item{keep.allele.names}{Allows to keep allele names for the tidy dataset. Requires the alleles to be numeric. To have this argument in diff --git a/man/tidy_genlight.Rd b/man/tidy_genlight.Rd index 970c4a56..dade5bea 100644 --- a/man/tidy_genlight.Rd +++ b/man/tidy_genlight.Rd @@ -2,13 +2,14 @@ % Please edit documentation in R/genlight.R \name{tidy_genlight} \alias{tidy_genlight} -\title{Tidy a genlight object to a tidy dataframe} +\title{Tidy a genlight object to a tidy dataframe and/or GDS object/file} \usage{ tidy_genlight(data, tidy = TRUE, gds = TRUE, write = FALSE, verbose = FALSE) } \arguments{ -\item{data}{A genlight object in the global environment.} +\item{data}{(path or object) A genlight object in the global environment or +path to a genlight file that will be open with \code{readRDS}.} \item{tidy}{(logical) Generate a tidy dataset. Default: \code{tidy = TRUE}.} @@ -25,7 +26,7 @@ the function is a little more chatty during execution. Default: \code{verbose = TRUE}.} } \description{ -Tidy genlight object to a tidy dataframe. +Tidy genlight object to a tidy dataframe and/or GDS object/file. Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator} and might be of interest for users. }