diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 00000000..19582aa4 Binary files /dev/null and b/.DS_Store differ diff --git a/DESCRIPTION b/DESCRIPTION index b69ffc8e..1bd94e70 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: radiator Type: Package Title: RADseq Data Exploration, Manipulation and Visualization using R -Version: 1.1.6 -Date: 2020-06-23 +Version: 1.1.7 +Date: 2020-08-21 Encoding: UTF-8 Authors@R: c( person("Thierry", "Gosselin", email = "thierrygosselin@icloud.com", role = c("aut", "cre")), @@ -46,7 +46,7 @@ Suggests: stringdist, stringi, tibble, - tidyr (>= 1.0.0), + tidyr (>= 1.1.0), utils License: GPL-3 LazyLoad: yes diff --git a/NAMESPACE b/NAMESPACE index 0595914c..a91b4931 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,7 +46,6 @@ export(detect_mixed_genomes) export(detect_paralogs) export(detect_ref_genome) export(diagnostic_maf) -export(discard_monomorphic_markers) export(distance2tibble) export(distance_individuals) export(erase_genotypes) @@ -63,19 +62,16 @@ export(extract_markers_metadata) export(filter_blacklist_genotypes) export(filter_common_markers) export(filter_coverage) -export(filter_dart) export(filter_dart_reproducibility) export(filter_fis) export(filter_genotyping) export(filter_het) export(filter_hwe) -export(filter_individual) export(filter_individuals) export(filter_ld) export(filter_mac) export(filter_maf) export(filter_monomorphic) -export(filter_population) export(filter_rad) export(filter_snp_number) export(filter_snp_position_read) @@ -105,7 +101,6 @@ export(ind_total_reads) export(individuals2strata) export(integrate_ref) export(join_strata) -export(keep_common_markers) export(ld2df) export(ld_boxplot) export(ld_missing) diff --git a/NEWS.md b/NEWS.md index c45d67ae..8df02f86 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# radiator 1.1.7 2020-08-21 + +* PLINK files: fix a couple of bugs reading the tped +* removed `tidyr::gather` and `tidyr::spread` dependencies (they are deprecated) +* DArT data in 1-row format was not working properly with latest `data.table` melt function. +Changed to `tidyr::pivot_long`. + + # radiator 1.1.6 2020-06-23 * updated radiator so that it work with latest release of SeqArray (v.1.28.1), that diff --git a/R/allele_frequencies.R b/R/allele_frequencies.R index f518245c..86253800 100644 --- a/R/allele_frequencies.R +++ b/R/allele_frequencies.R @@ -92,7 +92,8 @@ allele_frequencies <- function(data, verbose = TRUE) { maf <- maf.data maf.data <- NULL maf.local.wide <- dplyr::select(.data = maf, MARKERS, POP_ID, MAF_LOCAL) %>% - tidyr::spread(data = ., key = MARKERS, value = MAF_LOCAL) + tidyr::pivot_wider(data = ., names_from = "MARKERS", values_from = "MAF_LOCAL") + maf.global <- dplyr::distinct(.data = maf, MARKERS, MAF_GLOBAL) @@ -104,16 +105,21 @@ allele_frequencies <- function(data, verbose = TRUE) { freq.wide <- dplyr::ungroup(freq) %>% dplyr::select(MARKERS, POP_ID, REF = FREQ_REF, ALT = MAF_LOCAL) %>% - tidyr::gather(data = ., key = ALLELES, value = FREQ, -c(MARKERS, POP_ID)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "MARKERS"), + names_to = "ALLELES", + values_to = "FREQ" + ) %>% dplyr::mutate(MARKERS_ALLELES = stringi::stri_join(MARKERS, ALLELES, sep = ".")) %>% dplyr::select(-MARKERS, -ALLELES) %>% dplyr::arrange(MARKERS_ALLELES, POP_ID) %>% dplyr::group_by(POP_ID) %>% - tidyr::spread(data = ., key = MARKERS_ALLELES, value = FREQ) + tidyr::pivot_wider(data = ., names_from = "MARKERS_ALLELES", values_from = "FREQ") freq.mat <- suppressWarnings( freq.wide %>% - tibble::remove_rownames(df = .) %>% + tibble::remove_rownames(.data = .) %>% tibble::column_to_rownames(.data = ., var = "POP_ID") %>% as.matrix(.) ) @@ -132,7 +138,12 @@ allele_frequencies <- function(data, verbose = TRUE) { A2 = stringi::stri_sub(GT, 4,6) ) %>% dplyr::select(MARKERS, POP_ID, INDIVIDUALS, A1, A2) %>% - tidyr::gather(key = ALLELES_GROUP, ALLELES, -c(INDIVIDUALS, POP_ID, MARKERS)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"), + names_to = "ALLELES_GROUP", + values_to = "ALLELES" + ) %>% dplyr::group_by(MARKERS, ALLELES, POP_ID) %>% dplyr::tally(.) %>% dplyr::ungroup(.) %>% @@ -153,7 +164,7 @@ allele_frequencies <- function(data, verbose = TRUE) { dplyr::select(-MARKERS, -ALLELES) %>% dplyr::arrange(MARKERS_ALLELES, POP_ID) %>% dplyr::group_by(POP_ID) %>% - tidyr::spread(data = ., key = MARKERS_ALLELES, value = FREQ) + tidyr::pivot_wider(data = ., names_from = "MARKERS_ALLELES", values_from = "FREQ") freq.mat <- suppressWarnings( freq.wide %>% diff --git a/R/bayescan.R b/R/bayescan.R index 161eff2e..3a1c7f62 100644 --- a/R/bayescan.R +++ b/R/bayescan.R @@ -420,7 +420,7 @@ run_bayescan <- function( ) %>% tidyr::pivot_longer( data = ., - cols = tidyselect::everything(), + cols = dplyr::everything(), names_to = "ACCURACY_MARKERS", values_to = "N" ) %>% diff --git a/R/betas_estimator.R b/R/betas_estimator.R index 49f2104c..397bf87c 100644 --- a/R/betas_estimator.R +++ b/R/betas_estimator.R @@ -116,7 +116,12 @@ betas_estimator <- function( A2 = stringi::stri_sub(GT, 4,6) ) %>% dplyr::select(MARKERS, POP_ID, INDIVIDUALS, A1, A2) %>% - tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"), + names_to = "ALLELES", + values_to = "GT" + ) %>% dplyr::group_by(MARKERS, GT, POP_ID) %>% dplyr::tally(.) %>% dplyr::ungroup() %>% diff --git a/R/dart.R b/R/dart.R index 2a49197e..734695bc 100644 --- a/R/dart.R +++ b/R/dart.R @@ -391,16 +391,24 @@ read_dart <- function( suppressWarnings( data %<>% dplyr::select(dplyr::one_of(want)) %>% - data.table::as.data.table(x = .) %>% - data.table::melt.data.table( + tidyr::pivot_longer( data = ., - id.vars = c("CLONE_ID", "SEQUENCE"), - variable.name = "INDIVIDUALS", - variable.factor = FALSE, - value.name = "VALUE" - ) %>% - tibble::as_tibble(.) + cols = -c("CLONE_ID", "SEQUENCE"), + names_to = "INDIVIDUALS", + values_to = "VALUE" + ) ) + + # data.table::as.data.table(x = .) %>% + # data.table::melt.data.table( + # data = ., + # id.vars = c("CLONE_ID", "SEQUENCE"), + # variable.name = "INDIVIDUALS", + # variable.factor = FALSE, + # value.name = "VALUE" + # ) %>% + # tibble::as_tibble(.) + n.clone <- length(unique(data$CLONE_ID)) data <- radiator::join_strata(data = data, strata = strata) @@ -1134,7 +1142,7 @@ dart2gds <- function( alt %<>% dplyr::bind_rows(ref.s) %>% dplyr::arrange(VARIANT_ID) - ref %<>% dplyr::bind_rows(alt.s)%>% + ref %<>% dplyr::bind_rows(alt.s) %>% dplyr::arrange(VARIANT_ID) alt.s <- ref.s <- NULL @@ -1175,25 +1183,41 @@ dart2gds <- function( } genotypes.meta <- suppressMessages( - data.table::as.data.table(alt) %>% - data.table::melt.data.table( - data = ., - id.vars = want, - variable.name = "INDIVIDUALS", - value.name = "ALLELE_ALT_DEPTH", - variable.factor = FALSE) %>% - tibble::as_tibble(.) %>% + tidyr::pivot_longer( + data = alt, + cols = -want, + names_to = "INDIVIDUALS", + values_to = "ALLELE_ALT_DEPTH" + ) %>% dplyr::bind_cols( - data.table::as.data.table(ref) %>% - data.table::melt.data.table( - data = ., - id.vars = c("VARIANT_ID", "MARKERS"), - variable.name = "INDIVIDUALS", - value.name = "ALLELE_REF_DEPTH", - variable.factor = FALSE) %>% - tibble::as_tibble(.) + tidyr::pivot_longer( + data = ref, + cols = -c("VARIANT_ID", "MARKERS"), + names_to = "INDIVIDUALS", + values_to = "ALLELE_REF_DEPTH" + ) ) ) + + # data.table::as.data.table(alt) %>% + # data.table::melt.data.table( + # data = ., + # id.vars = want, + # variable.name = "INDIVIDUALS", + # value.name = "ALLELE_ALT_DEPTH", + # variable.factor = FALSE) %>% + # tibble::as_tibble(.) %>% + # dplyr::bind_cols( + # data.table::as.data.table(ref) %>% + # data.table::melt.data.table( + # data = ., + # id.vars = c("VARIANT_ID", "MARKERS"), + # variable.name = "INDIVIDUALS", + # value.name = "ALLELE_REF_DEPTH", + # variable.factor = FALSE) %>% + # tibble::as_tibble(.) + # ) + # ) ref <- alt <- NULL # Faster to check that the bind_cols worked by checking the variant id @@ -1428,7 +1452,12 @@ tidy_dart_metadata <- function( COL_TYPE = c("c", "c", "i", "d", "d", "d", "d")) dart.col.type <- dart.col.type %>% - tidyr::gather(data = .,key = DELETE, value = INFO) %>% + tidyr::pivot_longer( + data = ., + cols = dplyr::everything(), + names_to = "DELETE", + values_to = "INFO" + ) %>% dplyr::select(-DELETE) %>% dplyr::mutate(INFO = stringi::stri_trans_toupper(INFO)) %>% dplyr::left_join(want, by = "INFO") %>% diff --git a/R/detect_biallelic_markers.R b/R/detect_biallelic_markers.R index de5218f5..e5d87a28 100644 --- a/R/detect_biallelic_markers.R +++ b/R/detect_biallelic_markers.R @@ -159,9 +159,6 @@ detect_biallelic_markers <- function(data, verbose = FALSE, parallel.core = para dplyr::filter(MARKERS %in% sampled.markers) %>% dplyr::distinct(MARKERS, GT) %>% separate_gt(x = ., gt = "GT", sep = 3, exclude = "MARKERS", parallel.core = parallel.core) %>% - # dplyr::mutate(A1 = stringi::stri_sub(GT, 1, 3), A2 = stringi::stri_sub(GT, 4,6)) %>% - # dplyr::select(-GT) %>% - # tidyr::gather(data = ., key = ALLELES_GROUP, value = ALLELES, -MARKERS) %>% dplyr::distinct(MARKERS, HAPLOTYPES) %>% dplyr::count(x = ., MARKERS) #%>% dplyr::select(n) } @@ -170,8 +167,6 @@ detect_biallelic_markers <- function(data, verbose = FALSE, parallel.core = para data <- dplyr::filter(data, GT_VCF != "./.") %>% dplyr::filter(MARKERS %in% sampled.markers) %>% dplyr::distinct(MARKERS, GT_VCF) %>% - # tidyr::separate(data = ., col = GT_VCF, into = c("A1", "A2"), sep = "/") %>% - # tidyr::gather(data = ., key = ALLELES_GROUP, value = ALLELES, -MARKERS) %>% separate_gt(x = ., gt = "GT_VCF", exclude = "MARKERS", parallel.core = parallel.core) %>% dplyr::distinct(MARKERS, HAPLOTYPES) %>% # Here read alleles, not haplotypes dplyr::count(x = ., MARKERS) #%>% dplyr::select(n) @@ -181,8 +176,6 @@ detect_biallelic_markers <- function(data, verbose = FALSE, parallel.core = para data <- dplyr::filter(data, GT_VCF_NUC != "./.") %>% dplyr::filter(MARKERS %in% sampled.markers) %>% dplyr::distinct(MARKERS, GT_VCF_NUC) %>% - # tidyr::separate(data = ., col = GT_VCF_NUC, into = c("A1", "A2"), sep = "/") %>% - # tidyr::gather(data = ., key = ALLELES_GROUP, value = ALLELES, -MARKERS) %>% separate_gt(x = ., exclude = "MARKERS", parallel.core = parallel.core) %>% dplyr::distinct(MARKERS, HAPLOTYPES) %>% dplyr::count(x = ., MARKERS) diff --git a/R/detect_biallelic_problems.R b/R/detect_biallelic_problems.R index fdbd62e7..9f17afe6 100644 --- a/R/detect_biallelic_problems.R +++ b/R/detect_biallelic_problems.R @@ -155,7 +155,7 @@ detect_biallelic_problems <- function( dplyr::mutate(REF = dplyr::if_else(REF == max(REF), "REF", "ALT")) %>% dplyr::ungroup(.) %>% dplyr::group_by_at(dplyr::vars(c(markers.metadata, "ALLELES", "REF"))) %>% - tidyr::spread(data = ., key = POP_ID, value = N) %>% + tidyr::pivot_wider(data = ., names_from = "POP_ID", values_from = "N") %>% readr::write_tsv(x = ., path = blacklist.info.filename, na = "-") res$blacklist.info <- blacklist.info dodgy.markers <- dplyr::n_distinct(blacklist.info$MARKERS) diff --git a/R/detect_duplicate_genomes.R b/R/detect_duplicate_genomes.R index a93b4bd3..debd28e9 100644 --- a/R/detect_duplicate_genomes.R +++ b/R/detect_duplicate_genomes.R @@ -382,7 +382,6 @@ detect_duplicate_genomes <- function( data = ., id.vars = c("MARKERS", "INDIVIDUALS"), variable.name = "ALLELES", value.name = "n", variable.factor = FALSE) %>% tibble::as_tibble(.) %>% - # tidyr::gather(data = ., key = ALLELES, value = n, -c(MARKERS, INDIVIDUALS)) %>% dplyr::mutate(MARKERS_ALLELES = stringi::stri_join(MARKERS, ALLELES, sep = ".")) %>% dplyr::select(-ALLELES) %>% dplyr::arrange(MARKERS_ALLELES, INDIVIDUALS) @@ -996,7 +995,7 @@ distance_individuals <- function( value.var = "n" ) %>% tibble::as_tibble(.) %>% - tibble::remove_rownames(.) %>% + tibble::remove_rownames(.data = .) %>% tibble::column_to_rownames(.data = ., var = "INDIVIDUALS")) x <- suppressWarnings( @@ -1221,8 +1220,12 @@ allele_count <- function(x) { A2 = stringi::stri_sub(str = GT, from = 4, to = 6) ) %>% dplyr::select(-GT) %>% - tidyr::gather( - data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("INDIVIDUALS", "MARKERS"), + names_to = "ALLELES", + values_to = "GT" + ) %>% dplyr::arrange(MARKERS, INDIVIDUALS, GT) %>% dplyr::count(x = ., INDIVIDUALS, MARKERS, GT) %>% dplyr::ungroup(.) %>% diff --git a/R/detect_het_outliers.R b/R/detect_het_outliers.R index ca8020ce..8a74db46 100644 --- a/R/detect_het_outliers.R +++ b/R/detect_het_outliers.R @@ -201,7 +201,12 @@ res$summary.alt.allele <- dplyr::ungroup(res$outlier.summary$het.summary) %>% ONE_HET_ONLY = length(MARKERS[HOM_ALT == 0 & HET == 1]), ONE_HOM_ALT_ONE_HET_ONLY = length(MARKERS[HOM_ALT == 1 & HET == 1]) ) %>% - tidyr::gather(data = ., key = MARKERS, value = NUMBERS) %>% + tidyr::pivot_longer( + data = ., + cols = dplyr::everything(), + names_to = "MARKERS", + values_to = "NUMBERS" + ) %>% dplyr::mutate(PROPORTION = NUMBERS / (NUMBERS[MARKERS == "TOTAL"])) # Estimate heterozygotes miscall rate ------------------------------------------- @@ -397,11 +402,21 @@ plot_het_outliers <- function(data, path.folder = NULL) { freq.summary <- dplyr::bind_cols( res$het.summary %>% dplyr::select(MARKERS, POP_ID, HOM_REF = FREQ_HOM_REF_O, HET = FREQ_HET_O, HOM_ALT = FREQ_HOM_ALT_O) %>% - tidyr::gather(data = ., key = GENOTYPES, value = OBSERVED, -c(MARKERS, POP_ID)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "MARKERS"), + names_to = "GENOTYPES", + values_to = "OBSERVED" + ) %>% dplyr::arrange(MARKERS, POP_ID), res$het.summary %>% dplyr::select(MARKERS, POP_ID, HOM_REF = FREQ_HOM_REF_E, HET = FREQ_HET_E, HOM_ALT = FREQ_HOM_ALT_E) %>% - tidyr::gather(data = ., key = GENOTYPES, value = EXPECTED, -c(MARKERS, POP_ID)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "MARKERS"), + names_to = "GENOTYPES", + values_to = "EXPECTED" + ) %>% dplyr::arrange(MARKERS, POP_ID) %>% dplyr::select(EXPECTED) ) %>% @@ -507,7 +522,7 @@ estimate_m <- function( ) { D <- dplyr::select(data, INDIVIDUALS, MARKERS, GT_BIN) %>% dplyr::group_by(INDIVIDUALS) %>% - tidyr::spread(data = ., key = MARKERS, value = GT_BIN) %>% + tidyr::pivot_wider(data = ., names_from = "MARKERS", values_from = "GT_BIN") %>% dplyr::ungroup(.) %>% dplyr::select(-INDIVIDUALS) %>% as.matrix(.) diff --git a/R/detect_mixed_genomes.R b/R/detect_mixed_genomes.R index 3ebd70b0..35e6c7b9 100644 --- a/R/detect_mixed_genomes.R +++ b/R/detect_mixed_genomes.R @@ -330,7 +330,12 @@ detect_mixed_genomes <- function( het.ind.overall <- dplyr::mutate(.data = het.ind, POP_ID = as.character(POP_ID)) %>% dplyr::bind_rows(dplyr::mutate(.data = het.ind, POP_ID = rep("OVERALL", n()))) %>% dplyr::mutate(POP_ID = factor(POP_ID, levels = c(pop.levels, "OVERALL"))) %>% - tidyr::gather(data = ., key = MISSING_GROUP, value = MISSING_PROP, -c(POP_ID, INDIVIDUALS, GENOTYPED, HET_NUMBER, HET_PROP)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "GENOTYPED", "HET_NUMBER", "HET_PROP"), + names_to = "MISSING_GROUP", + values_to = "MISSING_PROP" + ) %>% dplyr::mutate(MISSING_GROUP = factor(MISSING_GROUP, levels = c("MISSING_PROP_POP", "MISSING_PROP_OVERALL"))) %>% dplyr::arrange(POP_ID, MISSING_GROUP) @@ -384,7 +389,12 @@ detect_mixed_genomes <- function( het.ind.overall <- dplyr::mutate(.data = het.ind, POP_ID = as.character(POP_ID)) %>% dplyr::bind_rows(dplyr::mutate(.data = het.ind, POP_ID = rep("OVERALL", n()))) %>% dplyr::mutate(POP_ID = factor(POP_ID, levels = c(pop.levels, "OVERALL"), ordered = TRUE)) %>% - tidyr::gather(data = ., key = MISSING_GROUP, value = MISSING_PROP, -c(POP_ID, INDIVIDUALS, HET_PROP)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "HET_PROP"), + names_to = "MISSING_GROUP", + values_to = "MISSING_PROP" + ) %>% dplyr::mutate( MISSING_GROUP = factor(MISSING_GROUP, levels = c("MISSING_PROP_POP", "MISSING_PROP_OVERALL")), diff --git a/R/filter_common_markers.R b/R/filter_common_markers.R index b8503252..cb81dd90 100644 --- a/R/filter_common_markers.R +++ b/R/filter_common_markers.R @@ -72,11 +72,6 @@ filter_common_markers <- function( # path.folder <- NULL # parameters <- NULL # internal <- FALSE - # parameters = filters.parameters - # path.folder = wf - - - # obj.keeper <- c(ls(envir = globalenv()), "data") if (!filter.common.markers) { return(data) @@ -95,8 +90,6 @@ filter_common_markers <- function( on.exit(options(width = opt.change), add = TRUE) on.exit(radiator_toc(timing, verbose = verbose), add = TRUE) on.exit(radiator_function_header(f.name = "filter_common_markers", start = FALSE, verbose = verbose), add = TRUE) - # on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L))) - # message("\nKeeping only common markers between strata") # Checking for missing and/or default arguments ------------------------------ if (missing(data)) rlang::abort("Input file missing") @@ -189,7 +182,7 @@ filter_common_markers <- function( # plot_upset-------------------------------------------------------------- if (fig) { plot.filename <- stringi::stri_join( - "common.markers.upsetrplot_", file.date, ".pdf") + "common.markers.upsetrplot_", file.date) plot.filename <- file.path(path.folder, plot.filename) plot_upset(x = data, data.type = data.type, @@ -268,7 +261,7 @@ filter_common_markers <- function( } if (fig) { plot.filename <- stringi::stri_join( - "common.markers.upsetrplot_", file.date, ".pdf") + "common.markers.upsetrplot_", file.date) plot.filename <- file.path(path.folder, plot.filename) plot_upset(x = data, data.type = data.type, @@ -454,35 +447,21 @@ plot_upset <- function( ) n.pop = length(unique(strata$STRATA)) - # PLAN B while SeqArray bug is fixed - # plot.data <- extract_genotypes_metadata( - # gds = x, - # genotypes.meta.select = c("INDIVIDUALS", "MARKERS", "GT_BIN") - # ) %>% - # dplyr::filter(!is.na(GT_BIN)) %>% - # join_strata(data = ., strata = strata, verbose = FALSE) %>% - # dplyr::distinct(MARKERS, STRATA) %>% - # dplyr::mutate( - # n = rep(1, n()), - # STRATA = stringi::stri_join("POP_", STRATA) - # ) %>% - # tidyr::spread(data = ., key = STRATA, value = n, fill = 0) %>% - # data.frame(.) - sample.bk <- strata$INDIVIDUALS missing_markers_pop <- function( strata.split, x, parallel.core = parallel::detectCores() - 2 ) { - # max.core <-length(unique(strata.split$INDIVIDUALS)) + # strata.split <- dplyr::group_split(.tbl = strata, STRATA)[[4]] parallel.core.opt <- parallel_core_opt(parallel.core) SeqArray::seqSetFilter( object = x, sample.id = strata.split$INDIVIDUALS, action = "set", - verbose = FALSE) + verbose = FALSE + ) res <- tibble::tibble( STRATA = SeqArray::seqMissing( @@ -508,6 +487,7 @@ plot_upset <- function( ) %>% data.frame(.)#UpSetR requires data.frame + readr::write_tsv(x = plot.data, path = stringi::stri_join(plot.filename, ".tsv")) SeqArray::seqSetFilter( object = x, sample.id = sample.bk, @@ -526,15 +506,26 @@ plot_upset <- function( n = rep(1, n()), POP_ID = stringi::stri_join("POP_", POP_ID) ) %>% - tidyr::spread(data = ., key = POP_ID, value = n, fill = 0) %>% + tidyr::pivot_wider(data = ., names_from = "POP_ID", values_from = "n", values_fill = 0) %>% data.frame(.)#UpSetR requires data.frame } - pdf(file = plot.filename, onefile = FALSE) - UpSetR::upset( - plot.data, + # generate the plot + # print(dev.list()) + + # pdf( + # file = stringi::stri_join(plot.filename, ".pdf"), + # onefile = FALSE + # ) + grDevices::png(filename = stringi::stri_join(plot.filename, ".png")) + print( + UpSetR::upset( + data = plot.data, nsets = n.pop, order.by = "freq", empty.intersections = NULL) + ) dev.off() + # print(dev.list()) + }#End plot_upset diff --git a/R/filter_coverage.R b/R/filter_coverage.R index 6019ff60..c11271b3 100644 --- a/R/filter_coverage.R +++ b/R/filter_coverage.R @@ -309,9 +309,12 @@ filter_coverage <- function( if (nrow(helper.table.low) > 1) { markers.plot.low <- ggplot2::ggplot( - data = tidyr::gather( + data = tidyr::pivot_longer( data = helper.table.low, - key = LIST, value = MARKERS, -COVERAGE_LOW), + cols = -COVERAGE_LOW, + names_to = "LIST", + values_to = "MARKERS" + ), ggplot2::aes(x = COVERAGE_LOW, y = MARKERS)) + ggplot2::geom_line() + ggplot2::geom_point(size = 2, shape = 21, fill = "white") + @@ -336,7 +339,7 @@ filter_coverage <- function( units = "cm", useDingbats = FALSE, limitsize = FALSE - ) + ) } helper.table.high <- tibble::tibble(COVERAGE_HIGH = ch.range) %>% @@ -350,9 +353,12 @@ filter_coverage <- function( if (nrow(helper.table.high) > 1) { markers.plot.high <- ggplot2::ggplot( - data = tidyr::gather( + data = tidyr::pivot_longer( data = helper.table.high, - key = LIST, value = MARKERS, -COVERAGE_HIGH), + cols = -COVERAGE_HIGH, + names_to = "LIST", + values_to = "MARKERS" + ), ggplot2::aes(x = COVERAGE_HIGH, y = MARKERS)) + ggplot2::geom_line() + ggplot2::geom_point(size = 2, shape = 21, fill = "white") + @@ -398,7 +404,7 @@ filter_coverage <- function( dplyr::filter( COVERAGE_MEAN < filter.coverage[1] | COVERAGE_MEAN > filter.coverage[2] - ) %$% VARIANT_ID + ) %$% VARIANT_ID markers.meta <- extract_markers_metadata(gds = data, whitelist = FALSE) %>% dplyr::mutate( FILTERS = dplyr::if_else( diff --git a/R/filter_dart_reproducibility.R b/R/filter_dart_reproducibility.R index 7ecccff5..75243116 100644 --- a/R/filter_dart_reproducibility.R +++ b/R/filter_dart_reproducibility.R @@ -217,9 +217,13 @@ filter_dart_reproducibility <- function( # figures markers.plot <- ggplot2::ggplot( - data = helper.table %<>% tidyr::gather( - data = ., - key = LIST, value = MARKERS, -REPRODUCIBILITY), + data = helper.table %<>% + tidyr::pivot_longer( + data = ., + cols = -REPRODUCIBILITY, + names_to = "LIST", + values_to = "MARKERS" + ), ggplot2::aes(x = REPRODUCIBILITY, y = MARKERS)) + ggplot2::geom_line() + ggplot2::geom_point(size = 2, shape = 21, fill = "white") + diff --git a/R/filter_fis.R b/R/filter_fis.R index 75c19c7a..71fd4bb0 100644 --- a/R/filter_fis.R +++ b/R/filter_fis.R @@ -40,7 +40,7 @@ filter_fis <- function(data, approach = "haplotype", fis.min.threshold, fis.max. if (is.vector(data)) { - data <- readr::read_tsv(data, col_names = T) + data <- readr::read_tsv(data, col_names = TRUE) } pop.number <- dplyr::n_distinct(data$POP_ID) diff --git a/R/filter_genotyping.R b/R/filter_genotyping.R index b4b2629f..a9bca46c 100644 --- a/R/filter_genotyping.R +++ b/R/filter_genotyping.R @@ -271,11 +271,11 @@ filter_genotyping <- function( if (verbose) message("File written: markers.pop.missing.helper.table.tsv") helper.table %<>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = LIST, - value = MARKERS, - -c(MISSING_PROP, STRATA) + cols = -c("MISSING_PROP", "STRATA"), + names_to = "LIST", + values_to = "MARKERS" ) %>% dplyr::mutate(STRATA = factor(STRATA, levels = strata.levels, ordered = TRUE)) %>% dplyr::arrange(STRATA) @@ -284,9 +284,12 @@ filter_genotyping <- function( strata.levels <- NULL } else { helper.table %<>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = LIST, value = MARKERS, -MISSING_PROP) + cols = -MISSING_PROP, + names_to = "LIST", + values_to = "MARKERS" + ) n.pop <- 1L strata <- FALSE } diff --git a/R/filter_het.R b/R/filter_het.R index 4101343b..f5fab514 100644 --- a/R/filter_het.R +++ b/R/filter_het.R @@ -403,7 +403,12 @@ information") het.ind.overall <- dplyr::mutate(.data = het.ind, POP_ID = as.character(POP_ID)) %>% dplyr::bind_rows(dplyr::mutate(.data = het.ind, POP_ID = rep("OVERALL", n()))) %>% dplyr::mutate(POP_ID = factor(POP_ID, levels = c(levels(het.ind$POP_ID), "OVERALL"))) %>% - tidyr::gather(data = ., key = MISSING_GROUP, value = MISSING_PROP, -c(POP_ID, INDIVIDUALS, GENOTYPED, HET_NUMBER, HET_PROP)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "GENOTYPED", "HET_NUMBER", "HET_PROP"), + names_to = "MISSING_GROUP", + values_to = "MISSING_PROP" + ) %>% dplyr::mutate(MISSING_GROUP = factor(MISSING_GROUP, levels = c("MISSING_PROP_POP", "MISSING_PROP_OVERALL"))) # Get stats... @@ -662,11 +667,11 @@ use the overall approach.\n") het.summary.tidy <- suppressWarnings( dplyr::bind_rows(het.summary.pop, het.summary.overall) %>% dplyr::full_join(het.missing.overall, by = c("LOCUS", "POP_ID")) %>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = HET_GROUP, - value = VALUE, - -c(LOCUS, POP_ID, MISSING_PROP) + cols = -c("LOCUS", "POP_ID", "MISSING_PROP"), + names_to = "HET_GROUP", + values_to = "VALUE" ) %>% dplyr::rename(MARKERS = LOCUS) %>% dplyr::mutate( @@ -715,11 +720,11 @@ use the overall approach.\n") het.summary.tidy <- suppressWarnings( dplyr::bind_rows(het.summary.pop, het.summary.overall) %>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = HET_GROUP, - value = VALUE, - -c(MARKERS, POP_ID, MISSING_PROP) + cols = -c("MARKERS", "POP_ID", "MISSING_PROP"), + names_to = "HET_GROUP", + values_to = "VALUE" ) ) } @@ -812,11 +817,11 @@ use the overall approach.\n") MAX_0.8 = dplyr::if_else(HET_MAX <= 0.8, "whitelist", "blacklist"), MAX_0.9 = dplyr::if_else(HET_MAX <= 0.9, "whitelist", "blacklist") ) %>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = MAX_THRESHOLD, - value = MAX_OUTLIERS, - MAX_0.4:MAX_0.9 + cols = MAX_0.4:MAX_0.9, + names_to = "MAX_THRESHOLD", + values_to = "MAX_OUTLIERS" ) %>% dplyr::mutate( DIF_0.1 = dplyr::if_else(HET_DIF <= 0.1, "whitelist", "blacklist"), @@ -829,11 +834,11 @@ use the overall approach.\n") DIF_0.8 = dplyr::if_else(HET_DIF <= 0.8, "whitelist", "blacklist"), DIF_0.9 = dplyr::if_else(HET_DIF <= 0.9, "whitelist", "blacklist") ) %>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = DIF_THRESHOLD, - value = DIF_OUTLIERS, - DIF_0.1:DIF_0.9 + cols = DIF_0.1:DIF_0.9, + names_to = "DIF_THRESHOLD", + values_to = "DIF_OUTLIERS" ) %>% tidyr::unite( data = ., @@ -880,7 +885,7 @@ use the overall approach.\n") ) %>% dplyr::select(-c(n, WHITELIST, BLACKLIST)) %>% dplyr::group_by(MAX_THRESHOLD, MAX_OUTLIERS) %>% - tidyr::spread(data = ., key = MAX_THRESHOLD, value = MARKERS, fill = 0) + tidyr::pivot_wider(data = ., names_from = "MAX_THRESHOLD", values_from = "MARKERS", values_fill = 0) dif.threshold <- dplyr::distinct( het.helper, LOCUS, DIF_THRESHOLD, DIF_OUTLIERS @@ -902,7 +907,7 @@ use the overall approach.\n") ) %>% dplyr::select(-c(n, WHITELIST, BLACKLIST)) %>% dplyr::group_by(DIF_THRESHOLD, DIF_OUTLIERS) %>% - tidyr::spread(data = ., key = DIF_THRESHOLD, value = MARKERS, fill = 0) + tidyr::pivot_wider(data = ., names_from = "DIF_THRESHOLD", values_from = "MARKERS", values_fill = 0) max.dif.threshold.combined <- dplyr::distinct( @@ -925,7 +930,8 @@ use the overall approach.\n") ) %>% dplyr::select(-c(n, WHITELIST, BLACKLIST)) %>% dplyr::group_by(MAX_DIF_THRESHOLD, MAX_DIF_OUTLIERS) %>% - tidyr::spread(data = ., key = MAX_DIF_THRESHOLD, value = MARKERS, fill = 0) + tidyr::pivot_wider(data = ., names_from = "MAX_DIF_THRESHOLD", values_from = "MARKERS", values_fill = 0) + # overall het.helper.overall <- dplyr::select(het.summary.overall, LOCUS, HET_MAX, HET_DIF) %>% @@ -937,11 +943,11 @@ use the overall approach.\n") MAX_0.8 = dplyr::if_else(HET_MAX <= 0.8, "whitelist", "blacklist"), MAX_0.9 = dplyr::if_else(HET_MAX <= 0.9, "whitelist", "blacklist") ) %>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = MAX_THRESHOLD, - value = MAX_OUTLIERS, - MAX_0.4:MAX_0.9 + cols = MAX_0.4:MAX_0.9, + names_to = "MAX_THRESHOLD", + values_to = "MAX_OUTLIERS" ) %>% dplyr::mutate( DIF_0.1 = dplyr::if_else(HET_DIF <= 0.1, "whitelist", "blacklist"), @@ -954,11 +960,11 @@ use the overall approach.\n") DIF_0.8 = dplyr::if_else(HET_DIF <= 0.8, "whitelist", "blacklist"), DIF_0.9 = dplyr::if_else(HET_DIF <= 0.9, "whitelist", "blacklist") ) %>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = DIF_THRESHOLD, - value = DIF_OUTLIERS, - DIF_0.1:DIF_0.9 + cols = DIF_0.1:DIF_0.9, + names_to = "DIF_THRESHOLD", + values_to = "DIF_OUTLIERS" ) %>% tidyr::unite( data = ., @@ -1005,7 +1011,7 @@ use the overall approach.\n") ) %>% dplyr::select(-c(n, WHITELIST, BLACKLIST)) %>% dplyr::group_by(MAX_THRESHOLD, MAX_OUTLIERS) %>% - tidyr::spread(data = ., key = MAX_THRESHOLD, value = MARKERS, fill = 0) %>% + tidyr::pivot_wider(data = ., names_from = "MAX_THRESHOLD", values_from = "MARKERS", values_fill = 0) %>% dplyr::filter(MAX_OUTLIERS == 0) %>% dplyr::ungroup(.) %>% dplyr::select(-MAX_OUTLIERS) @@ -1030,7 +1036,7 @@ use the overall approach.\n") ) %>% dplyr::select(-c(n, WHITELIST, BLACKLIST)) %>% dplyr::group_by(DIF_THRESHOLD, DIF_OUTLIERS) %>% - tidyr::spread(data = ., key = DIF_THRESHOLD, value = MARKERS, fill = 0) %>% + tidyr::pivot_wider(data = ., names_from = "DIF_THRESHOLD", values_from = "MARKERS", values_fill = 0) %>% dplyr::filter(DIF_OUTLIERS == 0) %>% dplyr::ungroup(.) %>% dplyr::select(-DIF_OUTLIERS) @@ -1056,7 +1062,7 @@ use the overall approach.\n") ) %>% dplyr::select(-c(n, WHITELIST, BLACKLIST)) %>% dplyr::group_by(MAX_DIF_THRESHOLD, MAX_DIF_OUTLIERS) %>% - tidyr::spread(data = ., key = MAX_DIF_THRESHOLD, value = MARKERS, fill = 0) %>% + tidyr::pivot_wider(data = ., names_from = "MAX_DIF_THRESHOLD", values_from = "MARKERS", values_fill = 0) %>% dplyr::filter(MAX_DIF_OUTLIERS == 0) %>% dplyr::ungroup(.) %>% dplyr::select(-MAX_DIF_OUTLIERS) @@ -1110,7 +1116,12 @@ use the overall approach.\n") `0.8` = dplyr::if_else(HET_MEAN <= 0.8, "whitelist", "blacklist"), `0.9` = dplyr::if_else(HET_MEAN <= 0.9, "whitelist", "blacklist") ) %>% - tidyr::gather(data = ., key = THRESHOLD, value = OUTLIERS, `0.4`:`0.9`) %>% + tidyr::pivot_longer( + data = ., + cols = `0.4`:`0.9`, + names_to = "THRESHOLD", + values_to = "OUTLIERS" + ) %>% dplyr::group_by(MARKERS, THRESHOLD, THRESHOLD) %>% dplyr::summarise(OUTLIERS = length(OUTLIERS[OUTLIERS == "whitelist"])) %>% dplyr::ungroup(.) %>% @@ -1127,7 +1138,8 @@ use the overall approach.\n") ) %>% dplyr::select(-c(n, WHITELIST, BLACKLIST)) %>% dplyr::group_by(THRESHOLD, OUTLIERS) %>% - tidyr::spread(data = ., key = THRESHOLD, value = MARKERS, fill = 0) + tidyr::pivot_wider(data = ., names_from = "MAX_THRESHOLD", values_from = "MARKERS", values_fill = 0) + # by overall helper.table.het.overall <- dplyr::select(het.summary.overall, MARKERS, HET_MEAN) %>% @@ -1139,7 +1151,12 @@ use the overall approach.\n") `0.8` = dplyr::if_else(HET_MEAN <= 0.8, "whitelist", "blacklist"), `0.9` = dplyr::if_else(HET_MEAN <= 0.9, "whitelist", "blacklist") ) %>% - tidyr::gather(data = ., key = THRESHOLD, value = OUTLIERS, `0.4`:`0.9`) %>% + tidyr::pivot_longer( + data = ., + cols = `0.4`:`0.9`, + names_to = "THRESHOLD", + values_to = "OUTLIERS" + ) %>% dplyr::group_by(MARKERS, THRESHOLD, THRESHOLD) %>% dplyr::summarise(OUTLIERS = length(OUTLIERS[OUTLIERS == "whitelist"])) %>% dplyr::ungroup(.) %>% @@ -1156,7 +1173,7 @@ use the overall approach.\n") ) %>% dplyr::select(-c(n, WHITELIST, BLACKLIST)) %>% dplyr::group_by(THRESHOLD, OUTLIERS) %>% - tidyr::spread(data = ., key = THRESHOLD, value = MARKERS, fill = 0) %>% + tidyr::pivot_wider(data = ., names_from = "THRESHOLD", values_from = "MARKERS", values_fill = 0) %>% dplyr::filter(OUTLIERS == 0) %>% dplyr::ungroup(.) %>% dplyr::select(-OUTLIERS) @@ -1229,11 +1246,11 @@ use the overall approach.\n") dplyr::rename(SNP_NUM = n) %>% dplyr::left_join(het.summary.overall, by = "LOCUS") %>% dplyr::select(-POP_ID) %>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = HET_GROUP, - value = VALUE, - -c(LOCUS, SNP_NUM) + cols = -c("LOCUS", "SNP_NUM"), + names_to = "HET_GROUP", + values_to = "VALUE" ) %>% dplyr::mutate( HET_GROUP = factor( @@ -1274,11 +1291,11 @@ use the overall approach.\n") dplyr::summarise(COVERAGE = mean(READ_DEPTH, na.rm = TRUE)) %>% dplyr::left_join(het.summary.overall, by = "LOCUS") %>% dplyr::select(-POP_ID) %>% - tidyr::gather( + tidyr::pivot_longer( data = ., - key = HET_GROUP, - value = VALUE, - -c(LOCUS, COVERAGE) + cols = -c("LOCUS", "COVERAGE"), + names_to = "HET_GROUP", + values_to = "VALUE" ) %>% dplyr::mutate( HET_GROUP = factor( diff --git a/R/filter_hwe.R b/R/filter_hwe.R index 0858d727..e3a86479 100644 --- a/R/filter_hwe.R +++ b/R/filter_hwe.R @@ -551,7 +551,7 @@ filter_hwe <- function( hwd.helper.table <- hwd.helper.table.long %>% dplyr::group_by(N_POP_HWD) %>% - tidyr::spread(data = ., key = SIGNIFICANCE, value = n) %>% + tidyr::pivot_wider(data = ., names_from = "SIGNIFICANCE", values_from = "n") %>% dplyr::filter(N_POP_HWD != 0) %>% dplyr::ungroup(.) %>% readr::write_tsv(x = ., path = file.path(path.folder, "hwd.helper.table.tsv")) diff --git a/R/filter_ld.R b/R/filter_ld.R index cf2e75c0..2f08b7c1 100644 --- a/R/filter_ld.R +++ b/R/filter_ld.R @@ -533,7 +533,12 @@ filter_ld <- function( A2 = stringi::stri_sub(GT, 4,6) ) %>% dplyr::select(MARKERS, INDIVIDUALS, A1, A2) %>% - tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("MARKERS", "INDIVIDUALS"), + names_to = "ALLELES", + values_to = "GT" + ) %>% dplyr::group_by(MARKERS, GT) %>% dplyr::tally(.) %>% dplyr::group_by(MARKERS) %>% diff --git a/R/filter_mac.R b/R/filter_mac.R index 177f143e..a5153585 100644 --- a/R/filter_mac.R +++ b/R/filter_mac.R @@ -379,9 +379,12 @@ filter_mac <- function( if (verbose) message("File written: maf.helper.table.tsv") mac.markers.plot <- ggplot2::ggplot( - data = tidyr::gather( + data = tidyr::pivot_longer( data = maf.helper.table[1:25, -2], - key = LIST, value = MARKERS, -MAC), + cols = -MAC, + names_to = "LIST", + values_to = "MARKERS" + ), ggplot2::aes(x = MAC, y = MARKERS)) + ggplot2::geom_line() + ggplot2::geom_point(size = 2, shape = 21, fill = "white") + @@ -544,7 +547,12 @@ compute_maf <- function(x, biallelic) { A2 = stringi::stri_sub(GT, 4,6) ) %>% dplyr::select(MARKERS, POP_ID, INDIVIDUALS, A1, A2) %>% - tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"), + names_to = "ALLELES", + values_to = "GT" + ) maf.local <- x %>% dplyr::group_by(MARKERS, POP_ID, GT) %>% @@ -581,9 +589,12 @@ compute_maf <- function(x, biallelic) { sep = "/", extra = "drop", remove = TRUE ) %>% - tidyr::gather( - data = ., key = ALLELE_GROUP, value = HAPLOTYPES, - -dplyr::one_of(c("MARKERS", "INDIVIDUALS", "POP_ID"))) %>% + tidyr::pivot_longer( + data = ., + cols = -dplyr::one_of(c("MARKERS", "INDIVIDUALS", "POP_ID")), + names_to = "ALLELE_GROUP", + values_to = "HAPLOTYPES" + ) %>% dplyr::select(-ALLELE_GROUP) %>% dplyr::group_by(MARKERS, HAPLOTYPES, POP_ID) %>% dplyr::tally(.) %>% @@ -780,7 +791,12 @@ mac_one <- function(x) { A2 = stringi::stri_sub(GT, 4,6) ) %>% dplyr::select(-GT) %>% - tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("MARKERS", "INDIVIDUALS"), + names_to = "ALLELES", + values_to = "GT" + ) %>% dplyr::group_by(MARKERS, GT) %>% dplyr::tally(.) %>% dplyr::group_by(MARKERS) %>% diff --git a/R/filter_maf.R b/R/filter_maf.R index ada3e550..07dbd6b2 100644 --- a/R/filter_maf.R +++ b/R/filter_maf.R @@ -470,7 +470,12 @@ filter_maf <- function( RANGE = stringi::stri_join(round(min(MAF_GLOBAL, na.rm = TRUE), 4), " - ", round(max(MAF_GLOBAL, na.rm = TRUE), 4)) ) - global.data <- tidyr::gather(data = global.data, key = "GROUP", value = "GLOBAL", -c(MARKERS, OVERALL)) %>% + global.data <- tidyr::pivot_longer( + data = global.data, + cols = -c("MARKERS", "OVERALL"), + names_to = "GROUP", + values_to = "GLOBAL" + ) %>% dplyr::mutate(GROUP = stringi::stri_replace_all_fixed(str = GROUP, pattern = c("ALT_GLOBAL", "MAF_GLOBAL"), replacement = c("ALT count", "ALT frequency (MAF)"), vectorize_all = FALSE)) histo.maf.global <- ggplot2::ggplot(global.data, ggplot2::aes(x = GLOBAL)) + @@ -514,7 +519,12 @@ filter_maf <- function( histo.maf.local <- histo.maf.local %>% dplyr::select(MARKERS, POP_ID, `ALT count` = ALT_LOCAL, `ALT frequency` = MAF_LOCAL) %>% - tidyr::gather(data = ., key = "GROUP", value = "LOCAL", -c(MARKERS, POP_ID)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "MARKERS"), + names_to = "GROUP", + values_to = "LOCAL" + ) %>% ggplot2::ggplot( data = ., ggplot2::aes(x = LOCAL, na.rm = FALSE)) + ggplot2::geom_histogram(bins = 30) + diff --git a/R/filter_monomorphic.R b/R/filter_monomorphic.R index b2cd8992..09d981aa 100644 --- a/R/filter_monomorphic.R +++ b/R/filter_monomorphic.R @@ -242,7 +242,12 @@ filter_monomorphic <- function( A2 = stringi::stri_sub(GT, 4,6) ) %>% dplyr::select(-GT) %>% - tidyr::gather(data = ., key = ALLELES_GROUP, value = ALLELES, -MARKERS) %>% + tidyr::pivot_longer( + data = ., + cols = -MARKERS, + names_to = "ALLELES_GROUP", + values_to = "ALLELES" + ) %>% dplyr::distinct(MARKERS, ALLELES) %>% dplyr::count(x = ., MARKERS) %>% dplyr::filter(n == 1) %>% diff --git a/R/filter_snp_number.R b/R/filter_snp_number.R index 6d0440bd..5893298a 100644 --- a/R/filter_snp_number.R +++ b/R/filter_snp_number.R @@ -306,9 +306,13 @@ filter_snp_number <- function( # figures markers.plot <- ggplot2::ggplot( - data = helper.table %<>% tidyr::gather( - data = ., - key = LIST, value = MARKERS, -SNP_PER_LOCUS), + data = helper.table %<>% + tidyr::pivot_longer( + data = ., + cols = -SNP_PER_LOCUS, + names_to = "LIST", + values_to = "MARKERS" + ), ggplot2::aes(x = SNP_PER_LOCUS, y = MARKERS)) + ggplot2::geom_line() + ggplot2::geom_point(size = 2, shape = 21, fill = "white") + diff --git a/R/filter_snp_position_read.R b/R/filter_snp_position_read.R index 29fe6140..0f98b1af 100644 --- a/R/filter_snp_position_read.R +++ b/R/filter_snp_position_read.R @@ -282,9 +282,12 @@ filter_snp_position_read <- function( # figures markers.plot <- ggplot2::ggplot( - data = tidyr::gather( + data = tidyr::pivot_longer( data = helper.table, - key = LIST, value = MARKERS, -STATS), + cols = -STATS, + names_to = "LIST", + values_to = "MARKERS" + ), ggplot2::aes(x = STATS, y = MARKERS)) + # ggplot2::geom_line() + ggplot2::geom_point(size = 2, shape = 21, fill = "white") + diff --git a/R/fstat.R b/R/fstat.R index a662fe29..c12ee3fc 100644 --- a/R/fstat.R +++ b/R/fstat.R @@ -109,14 +109,11 @@ tidy_fstat <- function(data, strata = NULL, tidy = TRUE, filename = NULL) { data <- dplyr::slice(.data = data, -(1:(as.numeric(fstat.first.line$nl) + 1))) # separate the dataset by tab - data <- tibble::as_data_frame( - purrr::invoke(#similar to do.call - rbind, stringi::stri_split_fixed(str = data$data, pattern = "\t") - ) - ) - - # new colnames - colnames(data) <- c("POP_ID", markers) + data <- tibble::as_tibble( + do.call(rbind, stringi::stri_split_fixed(str = data$data, pattern = "\t")), + .name_repair = "minimal" + ) %>% + magrittr::set_colnames(x = ., c("POP_ID", markers)) # Create a string of id id <- dplyr::data_frame(INDIVIDUALS = paste0("IND-", seq_along(1:length(data$POP_ID)))) @@ -191,7 +188,7 @@ tidy_fstat <- function(data, strata = NULL, tidy = TRUE, filename = NULL) { if (!tidy) { data <- data %>% dplyr::group_by(POP_ID, INDIVIDUALS) %>% - tidyr::spread(data = ., key = MARKERS, value = GENOTYPE) %>% + tidyr::pivot_wider(data = ., names_from = "MARKERS", values_from = "GENOTYPE") %>% dplyr::arrange(POP_ID, INDIVIDUALS) } @@ -291,7 +288,7 @@ write_hierfstat <- function(data, filename = NULL) { tidyr::unite(data = data, GT, A1, A2, sep = "") %>% dplyr::mutate(GT = as.numeric(GT)) %>% dplyr::group_by(POP_ID, INDIVIDUALS) %>% - tidyr::spread(data = ., MARKERS, GT) %>% + tidyr::pivot_wider(data = ., names_from = "MARKERS", values_from = "GT") %>% dplyr::ungroup(.) %>% dplyr::arrange(POP_ID, INDIVIDUALS) %>% dplyr::mutate(POP_ID = as.integer(POP_ID), INDIVIDUALS = NULL) diff --git a/R/genepop.R b/R/genepop.R index 23e70802..26a43ce0 100644 --- a/R/genepop.R +++ b/R/genepop.R @@ -159,14 +159,16 @@ tidy_genepop <- function(data, strata = NULL, tidy = TRUE, filename = NULL) { col_names = "data", col_types = "c") } else { + V1 <- NULL data %<>% - dplyr::rename(data = V1) %>% + dplyr::rename(.data = ., data = V1) %>% dplyr::slice(-1) %>% # removes genepop header tibble::as_tibble() } # Replace white space with only 1 space - data$data %<>% stringi::stri_replace_all_regex( + data$data %<>% + stringi::stri_replace_all_regex( str = ., pattern = "\\s+", replacement = " ", @@ -264,21 +266,22 @@ tidy_genepop <- function(data, strata = NULL, tidy = TRUE, filename = NULL) { # create a data frame -------------------------------------------------------- # separate the dataset by space data <- tibble::as_tibble( - purrr::invoke(#similar to do.call - rbind, stringi::stri_split_fixed(str = data$GT, pattern = " ") - ) - ) - - # add column names - colnames(data) <- markers + do.call(rbind, stringi::stri_split_fixed(str = data$GT, pattern = " ")), + .name_repair = "minimal" + ) %>% + magrittr::set_colnames(x = ., markers) + markers <- NULL # Population info ------------------------------------------------------------ # Strata if (!is.null(strata)) { - strata.df <- radiator::read_strata(strata = strata, pop.id = TRUE, verbose = FALSE) #join strata and data individuals %<>% - dplyr::left_join(strata.df, by = "INDIVIDUALS") + dplyr::left_join( + radiator::read_strata(strata = strata, pop.id = TRUE, verbose = FALSE) %$% + strata, + by = "INDIVIDUALS" + ) } else { # add pop based on internal genepop: integer and reorder the columns @@ -288,9 +291,10 @@ tidy_genepop <- function(data, strata = NULL, tidy = TRUE, filename = NULL) { # combine the individuals back to the dataset data <- dplyr::bind_cols(individuals, data) + individuals <- NULL # Scan for genotype coding and tidy ------------------------------------------ - gt.coding <- dplyr::select(data, -INDIVIDUALS, -POP_ID) %>% + gt.coding <- dplyr::select(.data = data, -INDIVIDUALS, -POP_ID) %>% purrr::flatten_chr(.) %>% unique(.) %>% nchar(.) %>% @@ -307,8 +311,12 @@ tidy_genepop <- function(data, strata = NULL, tidy = TRUE, filename = NULL) { if (gt.coding == 2) { gt.sep <- 1 } - data <- tidyr::gather( - data = data, key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS)) %>% + data <- tidyr::pivot_longer( + data = data, + cols = -c("POP_ID", "INDIVIDUALS"), + names_to = "MARKERS", + values_to = "GT" + ) %>% tidyr::separate( data = ., col = GT, into = c("A1", "A2"), sep = gt.sep, remove = TRUE, extra = "drop" @@ -319,10 +327,18 @@ tidy_genepop <- function(data, strata = NULL, tidy = TRUE, filename = NULL) { ) %>% tidyr::unite(data = ., col = GT, A1, A2, sep = "") %>% dplyr::arrange(MARKERS, POP_ID, INDIVIDUALS) - } - if (tidy) { - data <- tidyr::gather( - data = data, key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS)) + + if (!tidy) { + data <- tidyr::pivot_wider(data = data, names_from = "MARKERS", values_from = "GT") + } + } else { + if (tidy) { + data <- tidyr::pivot_longer( + data = data, + cols = -c("POP_ID", "INDIVIDUALS"), + names_to = "MARKERS", + values_to = "GT") + } } } diff --git a/R/genind.R b/R/genind.R index ae269c0e..cbc64fc0 100644 --- a/R/genind.R +++ b/R/genind.R @@ -161,7 +161,6 @@ tidy_genind <- function( tibble::add_column(.data = ., INDIVIDUALS = rownames(data@tab), .before = 1) %>% tibble::add_column(.data = ., POP_ID = data@pop) %>% dplyr::select(POP_ID, INDIVIDUALS, dplyr::ends_with(match = ".A2")) %>% - # tidyr::gather(data = ., key = MARKERS, value = GT_BIN, -c(POP_ID, INDIVIDUALS)) %>% data.table::as.data.table(.) %>% data.table::melt.data.table( data = ., @@ -184,7 +183,6 @@ tidy_genind <- function( data <- tibble::as_tibble(data@tab) %>% tibble::add_column(.data = ., INDIVIDUALS = rownames(data@tab), .before = 1) %>% tibble::add_column(.data = ., POP_ID = data@pop) %>% - # tidyr::gather(data = ., key = MARKERS_ALLELES, value = COUNT, -c(POP_ID, INDIVIDUALS)) %>% data.table::as.data.table(.) %>% data.table::melt.data.table( data = ., diff --git a/R/genomic_converter.R b/R/genomic_converter.R index 5cf58b6a..8560c75c 100644 --- a/R/genomic_converter.R +++ b/R/genomic_converter.R @@ -28,8 +28,17 @@ #' Use a character string, #' e.g. \code{output = c("genind", "genepop", "structure")}, to have preferred #' output formats generated. With default, only the tidy format is generated. +#' +#' Make sure to read the particularities of each format, some might +#' requires extra columns in the strata file. You can find the info in the +#' corresponding \emph{write_} functions of radiator +#' (\href{https://thierrygosselin.github.io/radiator/reference/index.html}{reference}). +#' #' Default: \code{output = NULL}. + + + #' @param filename (optional) The filename prefix for the object in the global environment #' or the working directory. Default: \code{filename = NULL}. A default name will be used, #' customized with the output file(s) selected. diff --git a/R/gtypes.R b/R/gtypes.R index aba170a9..3e1dc14b 100644 --- a/R/gtypes.R +++ b/R/gtypes.R @@ -30,8 +30,13 @@ tidy_gtypes <- function(data) { tibble::as_data_frame(data@data) %>% dplyr::rename(INDIVIDUALS = ids, POP_ID = strata) %>% dplyr::mutate(ALLELES = rep(c("A1", "A2"), n() / 2)) %>% - tidyr::gather(data = ., key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS, ALLELES))) - + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "ALLELES"), + names_to = "MARKERS", + values_to = "GT" + ) + ) # detect stratg genotype coding ---------------------------------------------- # For GT = c("A", "C", "G", "T") gt.format <- sort(unique(input$GT)) @@ -60,7 +65,7 @@ tidy_gtypes <- function(data) { GT = replace(GT, which(is.na(GT)), "000"), POP_ID = as.character(POP_ID)) %>% dplyr::group_by(POP_ID, INDIVIDUALS, MARKERS) %>% - tidyr::spread(data = ., key = ALLELES, value = GT) %>% + tidyr::pivot_wider(data = ., names_from = "ALLELES", values_from = "GT") %>% dplyr::ungroup(.) %>% tidyr::unite(data = ., col = GT, A1, A2, sep = "") %>% dplyr::select(POP_ID, INDIVIDUALS, MARKERS, GT) diff --git a/R/haplotype_reconstruction.R b/R/haplotype_reconstruction.R index 53fb0b8b..f1611f46 100644 --- a/R/haplotype_reconstruction.R +++ b/R/haplotype_reconstruction.R @@ -20,11 +20,13 @@ haplotype_reconstruction <- function( data <- tidyr::separate( data = data, col = HAPLOTYPES, - into = as.character(seq(1, n.snp, 1)), sep = 1:(n.snp-1), remove = FALSE) %>% - tidyr::gather( - data = ., key = "SNP", - value = "NUC", - -c(MARKERS, HAPLOTYPES, SNP_N)) %>% + into = as.character(seq(1, n.snp, 1)), sep = 1:(n.snp - 1), remove = FALSE) %>% + tidyr::pivot_longer( + data = ., + cols = -c("MARKERS", "HAPLOTYPES", "SNP_N"), + names_to = "SNP", + values_to = "NUC" + ) %>% dplyr::mutate(SNP = as.integer(SNP)) %>% dplyr::group_by(SNP) %>% dplyr::mutate( @@ -35,7 +37,8 @@ haplotype_reconstruction <- function( dplyr::select(-POLYMORPHIC) %>% dplyr::arrange(SNP, HAPLOTYPES) %>% dplyr::group_by(MARKERS, HAPLOTYPES, SNP_N) %>% - tidyr::spread(SNP, NUC, convert = TRUE) %>% + # tidyr spread(SNP, NUC, convert = TRUE) %>% + tidyr::pivot_wider(data = ., names_from = "SNP", values_from = "NUC") %>% tidyr::unite( data = ., col = HAPLOTYPES_NEW, -c(MARKERS, HAPLOTYPES, SNP_N), diff --git a/R/ibdg_fh.R b/R/ibdg_fh.R index 9c57e0d3..e3d744cb 100644 --- a/R/ibdg_fh.R +++ b/R/ibdg_fh.R @@ -256,7 +256,12 @@ ibdg_fh <- function( dplyr::select(-GT) freq <- input.alleles %>% - tidyr::gather(data = ., key = ALLELE_GROUP, value = ALLELES, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("MARKERS", "INDIVIDUALS", "POP_ID"), + names_to = "ALLELE_GROUP", + values_to = "ALLELES" + ) %>% dplyr::group_by(MARKERS, ALLELES, POP_ID) %>% dplyr::tally(.) %>% dplyr::ungroup(.) %>% diff --git a/R/pi.R b/R/pi.R index df46a855..5573f1d5 100644 --- a/R/pi.R +++ b/R/pi.R @@ -177,8 +177,12 @@ pi <- function( # Pi: by pop------------------------------------------------------------------ message(" Pi calculations: populations...") data %<>% dplyr::select(POP_ID, INDIVIDUALS, MARKERS, ALLELE1, ALLELE2) %>% - tidyr::gather(ALLELE_GROUP, ALLELES, -c(POP_ID, INDIVIDUALS, MARKERS)) - + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"), + names_to = "ALLELE_GROUP", + values_to = "ALLELES" + ) res$pi.populations <- data %>% split(x = ., f = .$POP_ID) %>% purrr::map_df( diff --git a/R/plink.R b/R/plink.R index d5cc2e67..0056e7bb 100644 --- a/R/plink.R +++ b/R/plink.R @@ -101,7 +101,6 @@ read_plink <- function( ) { # #Test - # data = "my_plink.bed" # filename <- NULL # parallel.core <- parallel::detectCores() - 1 # verbose <- TRUE @@ -112,7 +111,7 @@ read_plink <- function( if (verbose) message("Execution date@time: ", file.date) old.dir <- getwd() opt.change <- getOption("width") - options(future.globals.maxSize= Inf) + options(future.globals.maxSize = Inf) options(width = 70) timing <- radiator_tic() res <- list() @@ -164,15 +163,20 @@ read_plink <- function( NUMBER = seq(1, n()), ALLELE1 = rep("A1", n()), ALLELE2 = rep("A2", n()) ) %>% - tidyr::gather(ALLELES_GROUP, ALLELES, -c(INDIVIDUALS, NUMBER)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("INDIVIDUALS", "NUMBER"), + names_to = "ALLELES_GROUP", + values_to = "ALLELES" + ) %>% dplyr::arrange(NUMBER) %>% dplyr::select(-ALLELES_GROUP) %>% tidyr::unite(INDIVIDUALS_ALLELES, c(INDIVIDUALS, ALLELES), sep = "_", remove = FALSE) %>% dplyr::arrange(NUMBER) %>% dplyr::mutate(NUMBER = seq(from = (1 + 4), to = n() + 4)) %>% dplyr::select(-ALLELES) - tped.header.names <- c("LOCUS", tped.header.prep$INDIVIDUALS_ALLELES) - tped.header.integer <- c(2, tped.header.prep$NUMBER) + tped.header.names <- c("CHROM", "LOCUS", "POS", tped.header.prep$INDIVIDUALS_ALLELES) + tped.header.integer <- c(1, 2, 4, tped.header.prep$NUMBER) tped.header.prep <- NULL # import PLINK @@ -187,7 +191,12 @@ read_plink <- function( showProgress = TRUE, data.table = FALSE) %>% tibble::as_tibble(.) %>% - dplyr::mutate(LOCUS = as.character(LOCUS)) + dplyr::mutate( + CHROM = as.character(CHROM), + LOCUS = as.character(LOCUS), + POS = as.character(POS) + ) %>% + tidyr::unite(data = ., col = "MARKERS", c("CHROM", "LOCUS", "POS"), remove = FALSE, sep = "__") # Unused objects tped.header.integer <- tped.header.names <- NULL @@ -489,9 +498,10 @@ tidy_plink <- function( ... ) { # Test - # verbose = FALSE + # verbose = TRUE # strata = NULL # ref.calibration = TRUE + # parallel.core = parallel::detectCores() - 1 # Cleanup--------------------------------------------------------------------- radiator_function_header(f.name = "tidy_plink", verbose = verbose) @@ -499,7 +509,7 @@ tidy_plink <- function( if (verbose) message("Execution date@time: ", file.date) old.dir <- getwd() opt.change <- getOption("width") - options(future.globals.maxSize= Inf) + options(future.globals.maxSize = Inf) options(width = 70) timing <- radiator_tic() #back to the original directory and options @@ -512,6 +522,7 @@ tidy_plink <- function( if (missing(data)) rlang::abort("PLINK file missing") # Function call and dotslist ------------------------------------------------- + path.folder <- filename <- ref.calibration <- NULL rad.dots <- radiator_dots( func.name = as.list(sys.call())[[1]], fd = rlang::fn_fmls_names(), @@ -540,17 +551,24 @@ tidy_plink <- function( if (verbose) message("Tidying the PLINK tped file ...") # Filling GT and new separating INDIVIDUALS from ALLELES # combining alleles - # input <- tidyr::gather(data = input, key = INDIVIDUALS_ALLELES, value = GT, -LOCUS) strata <- data$strata - data <- data.table::as.data.table(data$data) %>% - data.table::melt.data.table( - data = ., - id.vars = "LOCUS", - variable.name = "INDIVIDUALS_ALLELES", - value.name = "GT", - variable.factor = FALSE - ) %>% - tibble::as_tibble(.) + data <- tidyr::pivot_longer( + data = data$data, + cols = -c("MARKERS", "CHROM", "LOCUS", "POS"), + names_to = "INDIVIDUALS_ALLELES", + values_to = "GT" + ) + + + # data.table::as.data.table(data$data) %>% + # data.table::melt.data.table( + # data = ., + # id.vars = "LOCUS", + # variable.name = "INDIVIDUALS_ALLELES", + # value.name = "GT", + # variable.factor = FALSE + # ) %>% + # tibble::as_tibble(.) # detect GT coding @@ -582,11 +600,11 @@ tidy_plink <- function( col = INDIVIDUALS_ALLELES, into = c("INDIVIDUALS", "ALLELES"), sep = "_") %>% - dplyr::group_by(LOCUS, INDIVIDUALS) %>% - tidyr::spread(data = ., key = ALLELES, value = GT) %>% + dplyr::group_by(MARKERS, CHROM, LOCUS, POS, INDIVIDUALS) %>% + tidyr::pivot_wider(data = ., names_from = "ALLELES", values_from = "GT") %>% dplyr::ungroup(.) %>% tidyr::unite(data = ., col = GT, A1, A2, sep = "") %>% - dplyr::select(LOCUS, INDIVIDUALS, GT) + dplyr::select(MARKERS, CHROM, LOCUS, POS, INDIVIDUALS, GT) # population levels and strata if (verbose) message("Integrating the tfam/strata file...") @@ -615,8 +633,8 @@ tidy_plink <- function( # detect if biallelic give vcf style genotypes # biallelic <- radiator::detect_biallelic_markers(input) - filename <- internal <- parameters <- path.folder <- ref.calibration <- NULL - rm(filename, internal, parameters, path.folder, ref.calibration) + # filename <- internal <- parameters <- path.folder <- ref.calibration <- NULL + # rm(filename, internal, parameters, path.folder, ref.calibration) if (ref.calibration) { data %<>% @@ -626,7 +644,7 @@ tidy_plink <- function( ) return(res = list(input = data$input, biallelic = data$biallelic)) } else { - return(res = list(input = data, biallelic = "biallelic")) + return(res = list(input = data, biallelic = radiator::detect_biallelic_markers(data))) } } #End tidy tped @@ -681,8 +699,8 @@ tidy_plink <- function( # checks # dplyr::n_distinct(tidy.data$INDIVIDUALS) # dplyr::n_distinct(tidy.data$MARKERS) - filename <- internal <- parameters <- path.folder <- ref.calibration <- NULL - rm(filename, internal, parameters, path.folder, ref.calibration) + # filename <- internal <- parameters <- path.folder <- ref.calibration <- NULL + # rm(filename, internal, parameters, path.folder, ref.calibration) return(res = list(input = tidy.data, biallelic = "biallelic")) }#End tidy bed @@ -740,7 +758,12 @@ write_plink <- function(data, filename = NULL) { A2 = stringi::stri_sub(str = GT, from = 4, to = 6) ) %>% dplyr::select(-GT) %>% - tidyr::gather(ALLELES, GENOTYPE, -c(COL1, MARKERS, COL3, COL4, INDIVIDUALS)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("COL1", "MARKERS", "COL3", "COL4", "INDIVIDUALS"), + names_to = "ALLELES", + values_to = "GENOTYPE" + ) %>% dplyr::mutate( GENOTYPE = as.character(as.numeric(GENOTYPE)), GENOTYPE = stringi::stri_pad_left(GENOTYPE, width = 2, pad = "0") @@ -748,7 +771,7 @@ write_plink <- function(data, filename = NULL) { dplyr::arrange(INDIVIDUALS, ALLELES) %>% tidyr::unite(INDIVIDUALS_ALLELES, INDIVIDUALS, ALLELES, sep = "_") %>% dplyr::group_by(COL1, MARKERS, COL3, COL4) %>% - tidyr::spread(data = ., key = INDIVIDUALS_ALLELES, value = GENOTYPE) %>% + tidyr::pivot_wider(data = ., names_from = "INDIVIDUALS_ALLELES", values_from = "GENOTYPE") %>% dplyr::arrange(MARKERS) tfam <- dplyr::distinct(.data = data, POP_ID, INDIVIDUALS) %>% diff --git a/R/radiator_deprecated_functions.R b/R/radiator_deprecated_functions.R index b8795bd1..1bb8bf6d 100644 --- a/R/radiator_deprecated_functions.R +++ b/R/radiator_deprecated_functions.R @@ -1,1323 +1 @@ -# Discard monomorphic markers -------------------------------------------------- - -#' @name discard_monomorphic_markers - -#' @title Discard monomorphic markers - -#' @description Discard monomorphic markers. -#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator} -#' and might be of interest for users. - -#' @param data A tidy data frame object in the global environment or -#' a tidy data frame in wide or long format in the working directory. -#' \emph{How to get a tidy data frame ?} -#' Look into \pkg{radiator} \code{\link{tidy_genomic_data}}. - - -#' @param verbose (optional, logical) \code{verbose = TRUE} to be chatty -#' during execution. -#' Default: \code{verbose = FALSE}. - -#' @return A list with the filtered input file and the blacklist of markers removed. - -#' @export -#' @rdname discard_monomorphic_markers -#' @keywords internal - -#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com} - -discard_monomorphic_markers <- function(data, verbose = FALSE) { - message("\n\nDeprecated function, update your code to use: filter_monomorphic function or filter.monomorphic argument\n\n") - - # Checking for missing and/or default arguments ------------------------------ - if (missing(data)) rlang::abort("Input file missing") - - # Detect format -------------------------------------------------------------- - data.type <- radiator::detect_genomic_format(data) - - if (data.type == "SeqVarGDSClass") { - if (verbose) message("Scanning for monomorphic markers...") - # radiator::summary_gds(gds = data) - # radiator::sync_gds(gds = data, samples = NULL, markers = NULL) - markers <- SeqArray::seqGetData(data, "variant.id") - n.markers.before <- length(markers) - mono.markers <- markers[SeqArray::seqNumAllele(gdsfile = data) == 1] - # mono.markers <- c(6L,7L) - n.markers.removed <- length(mono.markers) - - if (n.markers.removed > 0) { - whitelist.polymorphic.markers <- markers[-mono.markers] - n.markers.after <- n.markers.before - n.markers.removed - radiator::sync_gds(gds = data, markers = whitelist.polymorphic.markers) - } else { - whitelist.polymorphic.markers <- markers - n.markers.after <- n.markers.before - } - - # check for markers meta node - # markers.meta <- radiator_gds_markers_meta(gds = data) - # - # if (is.null(markers.meta)) { - # suppressWarnings(gdsfmt::add.gdsn( - # node = radiator.gds, - # name = "markers.meta", - # val = res$markers.meta, - # replace = TRUE, - # compress = "ZIP_RA", - # closezip = TRUE)) - # } - res <- list(input = data, - blacklist.monomorphic.markers = mono.markers, - whitelist.polymorphic.markers = whitelist.polymorphic.markers - ) - if (verbose) { - n.markers <- stringi::stri_join(n.markers.before, n.markers.removed, n.markers.after, sep = "/") - message(" Number of markers before/blacklisted/after: ", n.markers) - } - - - - } else { - # Import data --------------------------------------------------------------- - data <- radiator::tidy_wide(data = data, import.metadata = TRUE) - - - if (tibble::has_name(data, "CHROM")) { - markers.df <- dplyr::distinct(.data = data, MARKERS, CHROM, LOCUS, POS) - } - if (verbose) message("Scanning for monomorphic markers...") - n.markers.before <- dplyr::n_distinct(data$MARKERS) - - if (tibble::has_name(data, "GT_BIN")) { - mono.markers <- dplyr::select(.data = data, MARKERS, GT_BIN) %>% - dplyr::filter(!is.na(GT_BIN)) %>% - dplyr::distinct(MARKERS, GT_BIN) %>% - dplyr::count(x = ., MARKERS) %>% - dplyr::filter(n == 1) %>% - dplyr::distinct(MARKERS) - } else { - mono.markers <- dplyr::select(.data = data, MARKERS, GT) %>% - dplyr::filter(GT != "000000") %>% - dplyr::distinct(MARKERS, GT) %>% - dplyr::mutate( - A1 = stringi::stri_sub(GT, 1, 3), - A2 = stringi::stri_sub(GT, 4,6) - ) %>% - dplyr::select(-GT) %>% - tidyr::gather(data = ., key = ALLELES_GROUP, value = ALLELES, -MARKERS) %>% - dplyr::distinct(MARKERS, ALLELES) %>% - dplyr::count(x = ., MARKERS) %>% - dplyr::filter(n == 1) %>% - dplyr::distinct(MARKERS) - } - # Remove the markers from the dataset - n.markers.removed <- nrow(mono.markers) - - if (length(mono.markers$MARKERS) > 0) { - data <- dplyr::anti_join(data, mono.markers, by = "MARKERS") - if (tibble::has_name(data, "CHROM")) { - mono.markers <- dplyr::left_join(mono.markers, markers.df, by = "MARKERS") - } - } else { - mono.markers <- tibble::data_frame(MARKERS = character(0)) - } - n.markers.after <- dplyr::n_distinct(data$MARKERS) - - want <- c("MARKERS", "CHROM", "LOCUS", "POS") - whitelist.polymorphic.markers <- suppressWarnings( - dplyr::select(data, dplyr::one_of(want)) %>% - dplyr::distinct(MARKERS, .keep_all = TRUE)) - res <- list(input = data, - blacklist.monomorphic.markers = mono.markers, - whitelist.polymorphic.markers = whitelist.polymorphic.markers - ) - if (verbose) { - n.markers <- stringi::stri_join(n.markers.before, n.markers.removed, n.markers.after, sep = "/") - message(" Number of markers before/blacklisted/after: ", n.markers) - } - } - - return(res) -} # end discard mono markers - -# Keep markers in common between all populations - -#' @name keep_common_markers - -#' @title Keep markers in common between all populations - -#' @description Keep markers in common between all populations. -#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator} -#' and might be of interest for users. - - -#' @param data A tidy data frame object in the global environment or -#' a tidy data frame in wide or long format in the working directory. -#' \emph{How to get a tidy data frame ?} -#' Look into \pkg{radiator} \code{\link{tidy_genomic_data}}. - -#' @param plot (optional, logical) \code{plot = TRUE} will produce an -#' \href{https://github.com/hms-dbmi/UpSetR}{UpSet plot} to visualize the number -#' of markers between populations. The package is required for this to work... -#' Default: \code{plot = FALSE}. - -#' @param verbose (optional, logical) \code{verbose = TRUE} to be chatty -#' during execution. -#' Default: \code{verbose = FALSE}. - -#' @return A list with the filtered input data set, -#' the whitelist of markers in common and the blacklist of markers discarded. - - -#' @export -#' @keywords internal - -#' @rdname keep_common_markers - -#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com} -#' @section Life cycle: -#' -#' As of radiator v.0.0.22, keep.common.markers function is deprecated and was replaced by -#' \code{\link{filter_common_markers}} in an effort to have more meaningful functions names. - - -# keep_common_markers ---------------------------------------------------------- -keep_common_markers <- function(data, plot = FALSE, verbose = FALSE) { - - message("\n\nDeprecated function, update your code to use: filter_common_markers\n\n") - - - if (plot) { - if (!requireNamespace("UpSetR", quietly = TRUE)) { - rlang::abort("UpSetR needed for this function to work - Install with install.packages('UpSetR')") - } - } - - # Checking for missing and/or default arguments ------------------------------ - if (missing(data)) rlang::abort("Input file missing") - - # Import data --------------------------------------------------------------- - if (is.vector(data)) { - data <- radiator::tidy_wide(data = data, import.metadata = TRUE) - } - - # check genotype column naming - if (tibble::has_name(data, "GENOTYPE")) { - colnames(data) <- stringi::stri_replace_all_fixed( - str = colnames(data), - pattern = "GENOTYPE", - replacement = "GT", - vectorize_all = FALSE) - } - - # necessary steps to make sure we work with unique markers and not duplicated LOCUS - if (tibble::has_name(data, "LOCUS") && !tibble::has_name(data, "MARKERS")) { - data <- dplyr::rename(.data = data, MARKERS = LOCUS) - } - - # markers.meta - want <- c("MARKERS", "CHROM", "LOCUS", "POS") - markers.meta <- suppressWarnings(dplyr::select(data, dplyr::one_of(want)) %>% - dplyr::distinct(MARKERS, .keep_all = TRUE)) - - if (verbose) message("Using markers common in all populations:") - - if (tibble::has_name(data, "GT_BIN")) { - blacklist <- dplyr::select(.data = data, MARKERS, POP_ID, GT_BIN) %>% - dplyr::filter(!is.na(GT_BIN)) - } else { - blacklist <- dplyr::select(.data = data, MARKERS, POP_ID, GT) %>% - dplyr::filter(GT != "000000") - } - - blacklist <- dplyr::distinct(blacklist, MARKERS, POP_ID) %>% - dplyr::count(x = ., MARKERS) %>% - dplyr::filter(n != dplyr::n_distinct(data$POP_ID)) %>% - dplyr::distinct(MARKERS) %>% - dplyr::arrange(MARKERS) - - markers.data <- dplyr::n_distinct(data$MARKERS) - blacklist.markers <- nrow(blacklist) - markers.in.common <- markers.data - blacklist.markers - - if (verbose) { - n.markers <- stringi::stri_join(markers.data, blacklist.markers, markers.in.common, sep = "/") - message(" Number of markers before/blacklisted/after:", n.markers) - } - - if (plot) { - pops <- unique(data$POP_ID) - - if (length(pops) > 1) { - if (tibble::has_name(data, "GT_BIN")) { - plot.data <- dplyr::filter(data, !is.na(GT_BIN)) - } else { - plot.data <- dplyr::filter(data, GT != "000000") - } - plot.data <- dplyr::distinct(plot.data, MARKERS, POP_ID) %>% - dplyr::mutate( - n = rep(1, n()), - POP_ID = stringi::stri_join("POP_", POP_ID) - ) %>% - tidyr::spread(data = ., key = POP_ID, value = n, fill = 0) %>% - data.frame(.) - - UpSetR::upset(plot.data, nsets = length(pops), order.by = "freq", empty.intersections = "on") - message(" UpSet plot generated to visualize common markers") - } - } - - if (blacklist.markers > 0) { - # system.time(data2 <- dplyr::anti_join(data, blacklist, by = "MARKERS")) - data <- dplyr::filter(data, !MARKERS %in% blacklist$MARKERS) - - if (ncol(markers.meta) > 1) { - blacklist <- dplyr::left_join(blacklist, markers.meta, by = "MARKERS") %>% - readr::write_tsv(x = ., path = "blacklist.not.in.common.markers.tsv") - } - - } else { - blacklist <- tibble::data_frame(INDIVIDUALS = character(0)) - } - want <- c("MARKERS", "CHROM", "LOCUS", "POS") - whitelist <- suppressWarnings(dplyr::select(data, dplyr::one_of(want)) %>% - dplyr::distinct(MARKERS, .keep_all = TRUE)) - - - if (blacklist.markers > 0) { - readr::write_tsv(x = whitelist, path = "whitelist.common.markers.tsv") - } - - return(res = list(input = data, - whitelist.common.markers = whitelist, - blacklist.not.in.common.markers = blacklist)) - } - -# Filter dart function --------------------------------------------------------- -#' @name filter_dart - -#' @title filter DArT file - -#' @description Use \code{\link{filter_rad}}. -#' @param ... Read life cycle section below -#' @section Life cycle: -#' -#' As of radiator v.0.0.22, \code{filter_dart} is now replaced by -#' \code{\link{filter_rad}} in an effort to have more meaningful functions names. -#' This is THE function to rule them all. - -#' @export -#' @rdname filter_dart - -filter_dart <- function(...) { - rlang::abort("As of radiator v.0.0.22, filter_dart is now replaced by filter_rad. - \nRead documentation, for latest changes, and modify your codes!\n") -} - - -# filter_individual------------------------------------------------------------- -#' @name filter_individual -#' @title Individual filter -#' @description Filter individuals genotyped at a marker from a tidy data -#' set (long format) of any of these file format: -#' vcf, plink (tped/tfam), stacks haplotype file, genind, -#' genepop, data frame in wide format. The function uses -#' \code{\link[radiator]{tidy_genomic_data}} and -#' \code{\link[radiator]{tidy_wide}} to load the file. For filtering -#' you can consider the overall number of individuals (no concept of populations here), -#' or the number of genotyped individuals per pop. The threshold can be a fixed -#' number of individuals, a proportion or a percentage. - -# Most arguments are inherited from tidy_genomic_data -#' @inheritParams tidy_genomic_data -#' @inheritParams read_strata -#' @inheritParams radiator_common_arguments - -#' @param interactive.filter (optional, logical) Do you want the filtering session to -#' be interactive. With default: \code{interactive.filter == TRUE}, the user is -#' asked to see figures of distribution before making decisions for filtering. - -#' @param ind.approach (character). -#' The approach to filter a marker, is it based on the overall number of -#' genotyped individuals (no concept of populations here), -#' \code{ind.approach = "overall"}, or -#' the number of genotyped individuals per population \code{ind.approach = "pop"}. -#' Default: \code{ind.approach = "pop"}. - -#' @param ind.threshold The individual threshold, proportion, percentage or -#' number e.g. 0.70, 70, 15. -#' Default: \code{ind.threshold = 0.5}. - -#' @param percent Is the threshold a percentage? TRUE or FALSE. -#' This argument is necessary to distinguish percentage from integer individual -#' threshold (e.g. 70 percent or 70 individuals). -#' Default: \code{percent = FALSE}. - -#' @param prob.pop.threshold (integer, optional) Useful to incorporate problematic -#' populations dragging down polymorphism discovery, but still wanted for analysis. -#' If individuals with missing genotypes are not managed upstream, -#' use this threshold to allow variance in the number of populations passing -#' the \code{ind.threshold} argument. e.g. with \code{prob.pop.threshold = 2}, -#' you tolerate a maximum of 2 populations failing to pass the -#' \code{ind.threshold}. Manage after the problematic populations -#' downstream with blacklist of individuals and/or missing data imputations. This -#' argument is not necessary with \code{ind.approach = "overall"}. -#' Default: \code{prob.pop.threshold = 0}. See details for more info. - -#' @param filename (optional) The function uses \code{\link[fst]{write.fst}}, -#' to write the tidy data frame in -#' the folder created in the working directory. The file extension appended to -#' the \code{filename} provided is \code{.rad}. -#' Default: \code{filename = NULL}. - - -#' @rdname filter_individual -#' @export - -#' @details -#' \strong{Interactive version} -#' -#' There is 2 steps in the interactive version to visualize and filter -#' the data based on the number of genotyped individuals: -#' -#' Step 1. Impact of individual threshold on marker discovery -#' -#' Step 2. Choose the filtering approach and thresholds -#' -#' -#' -#' \strong{prob.pop.threshold} -#' -#' If your a regular radiator user, you've seen the \code{pop.num.threshold}. -#' \code{prob.pop.threshold}, is a bit different and requires more thinking, -#' because the number of populations genotyped potentially vary across markers, -#' which make difficult, sometimes, the use of \code{pop.num.threshold}. -#' e.g. If only 2 populations out of 10 are genotyped for a marker and you use -#' \code{pop.num.threshold = 0.6} this could lead to inconsistensis. Thus, -#' it's easier to use the number of population you are willing to accept -#' failling to conform to the threshold. - - -#' @return With \code{interactive.filter = FALSE}, a list in the global environment, -#' with 7 objects: -#' \enumerate{ -#' \item $tidy.filtered.ind -#' \item $whitelist.markers -#' \item $blacklist.markers -#' \item $strata -#' \item $filters.parameters -#' } -#' -#' With \code{interactive.filter = TRUE}, a list with 2 additionnal objects is created. -#' \enumerate{ -#' \item $plot.ind.threshold -#' \item $ind.threshold.helper.table -#' } -#' -#' The object can be isolated in separate object outside the list by -#' following the example below. - -#' @examples -#' \dontrun{ -#' turtle.ind <- filter_individual( -#' data = "turtle.vcf", -#' strata = "turtle.strata.tsv", -#' ind.approach = "pop", -#' ind.threshold = 50, -#' percent = TRUE, -#' prob.pop.threshold = 2, -#' filename = "tidy.data.turtle" -#' ) - -#' -#' #If interactive.filter = TRUE, a list is created and to view the filtered tidy data: -#' tidy.data <- turtle.ind$tidy.filtered.ind -#' -#' #Inside the same list, to isolate the blacklist.genotypes: -#' bg <- turtle.ind$blacklist.genotypes -#' -#' # The remaining argument are used in tidy_genomic_data during import and allow -#' # the user to apply some filtering or selection before doing the filtering. -#' } - -filter_individual <- function( - interactive.filter = TRUE, - data, - strata = NULL, - ind.approach = "pop", - ind.threshold = 0.5, - prob.pop.threshold = 0, - percent = FALSE, - filename = NULL, - parallel.core = parallel::detectCores() - 1, - ... -) { - cat("#######################################################################\n") - cat("##################### radiator::filter_individual #######################\n") - cat("#######################################################################\n") - opt.change <- getOption("width") - options(width = 70) - timing <- proc.time() - # manage missing arguments ----------------------------------------------------- - if (missing(data)) rlang::abort("Input file missing") - if (!is.null(pop.levels) & is.null(pop.labels)) { - pop.levels <- stringi::stri_replace_all_fixed(pop.levels, pattern = " ", replacement = "_", vectorize_all = FALSE) - pop.labels <- pop.levels - } - - if (!is.null(pop.labels) & is.null(pop.levels)) rlang::abort("pop.levels is required if you use pop.labels") - - if (!is.null(pop.labels)) { - if (length(pop.labels) != length(pop.levels)) rlang::abort("pop.labels and pop.levels must have the same length (number of groups)") - pop.labels <- stringi::stri_replace_all_fixed(pop.labels, pattern = " ", replacement = "_", vectorize_all = FALSE) - } - - if (!is.null(pop.select)) { - pop.select <- stringi::stri_replace_all_fixed(pop.select, pattern = " ", replacement = "_", vectorize_all = FALSE) - } - # Message about steps taken during the process --------------------------------- - if (interactive.filter) { - message("Interactive mode: on") - message("2 steps to visualize and filter the data based on the number of genotyped individuals:") - message("Step 1. Impact of individual threshold on marker discovery") - message("Step 2. Choose the filtering approach and thresholds") - } - - # Folder ------------------------------------------------------------------- - # Date and time - file.date <- format(Sys.time(), "%Y%m%d@%H%M") - folder.message <- stringi::stri_join("filter_individual_", file.date, sep = "") - path.folder <- stringi::stri_join(getwd(),"/", folder.message, sep = "") - dir.create(file.path(path.folder)) - - message("\nFolder created: \n", folder.message) - file.date <- NULL #unused object - - # Filter parameter file ------------------------------------------------------ - message("Parameters used in this run are stored in a file") - filters.parameters <- list.files(path = getwd(), pattern = "filters_parameters.tsv", full.names = TRUE) - if (length(filters.parameters) == 0) { - filters.parameters <- tibble::data_frame(FILTERS = as.character(), PARAMETERS = as.character(), VALUES = as.integer(), BEFORE = as.character(), AFTER = as.character(), BLACKLIST = as.integer(), UNITS = as.character(), COMMENTS = as.character()) - readr::write_tsv(x = filters.parameters, path = "filters_parameters.tsv", append = FALSE, col_names = TRUE) - message(" Created a parameter file: filters_parameters.tsv") - } else { - message(" Using the filters parameters file: filters_parameters.tsv") - } - - # File type detection---------------------------------------------------------- - data.type <- radiator::detect_genomic_format(data) - - # import data ---------------------------------------------------------------- - message("Importing data ...") - input <- radiator::tidy_genomic_data( - data = data, - vcf.metadata = vcf.metadata, - blacklist.id = blacklist.id, - blacklist.genotype = blacklist.genotype, - whitelist.markers = whitelist.markers, - monomorphic.out = monomorphic.out, - max.marker = max.marker, - snp.ld = snp.ld, - common.markers = common.markers, - strata = strata, - pop.levels = pop.levels, - pop.labels = pop.labels, - pop.select = pop.select, - filename = NULL, - parallel.core = parallel.core - ) - - # create a strata.df - strata.df <- input %>% - dplyr::select(INDIVIDUALS, POP_ID) %>% - dplyr::distinct(INDIVIDUALS, .keep_all = TRUE) - strata <- strata.df - pop.levels <- levels(input$POP_ID) - pop.labels <- pop.levels - - # prepare filter, table and figure---------------------------------------------- - ind.total <- dplyr::n_distinct(input$INDIVIDUALS) # total number of individuals - pop.number <- dplyr::n_distinct(input$POP_ID) # number of pop - - # individuals per pop - ind.pop <- input %>% - dplyr::distinct(POP_ID, INDIVIDUALS) %>% - dplyr::group_by(POP_ID) %>% - dplyr::tally(.) %>% - dplyr::rename(N_IND = n) - - # input genotyped - input.genotyped <- input %>% - dplyr::filter(GT != "000000") - - # overall genotyped individuals - overall <- input.genotyped %>% - dplyr::select(MARKERS, INDIVIDUALS) %>% - dplyr::group_by(MARKERS) %>% - dplyr::tally(.) %>% - dplyr::rename(GENOTYPED = n) %>% - dplyr::mutate(PERCENT = ceiling(GENOTYPED/ind.total*100)) - - # number of pop. genotyped per marker - pop.genotyped.marker <- input.genotyped %>% - dplyr::distinct(MARKERS, POP_ID) %>% - dplyr::group_by(MARKERS) %>% - dplyr::tally(.) %>% - dplyr::rename(POP_GENOTYPED = n) - - # genotyped individuals per pop - pop <- input.genotyped %>% - dplyr::select(MARKERS, INDIVIDUALS, POP_ID) %>% - dplyr::group_by(MARKERS, POP_ID) %>% - dplyr::tally(.) %>% - dplyr::rename(GENOTYPED = n) %>% - dplyr::inner_join(ind.pop, by = "POP_ID") %>% - dplyr::mutate(PERCENT = ceiling(GENOTYPED/N_IND*100)) - - input.genotyped <- NULL # unused object - - # Step 1. Impact of individual threshold on marker discovery------------------ - if (interactive.filter) { - message("Step 1. Impact of individual threshold on marker discovery") - } - # line.graph <- as.character(readLines(n = 1)) - # if (line.graph == "y") { - threshold.helper.overall <- overall %>% - dplyr::ungroup(.) %>% - dplyr::summarise( - `10` = length(PERCENT[PERCENT >= 10]), - `20` = length(PERCENT[PERCENT >= 20]), - `30` = length(PERCENT[PERCENT >= 30]), - `40` = length(PERCENT[PERCENT >= 40]), - `50` = length(PERCENT[PERCENT >= 50]), - `60` = length(PERCENT[PERCENT >= 60]), - `70` = length(PERCENT[PERCENT >= 70]), - `80` = length(PERCENT[PERCENT >= 80]), - `90` = length(PERCENT[PERCENT >= 90]), - `100` = length(PERCENT[PERCENT == 100]) - ) %>% - tidyr::gather(data = ., key = IND_THRESHOLD, value = MARKER_NUMBER) %>% - dplyr::mutate(POP_ID = rep("OVERALL", n())) - - threshold.helper.pop <- pop %>% - dplyr::group_by(POP_ID) %>% - dplyr::summarise( - `10` = length(PERCENT[PERCENT >= 10]), - `20` = length(PERCENT[PERCENT >= 20]), - `30` = length(PERCENT[PERCENT >= 30]), - `40` = length(PERCENT[PERCENT >= 40]), - `50` = length(PERCENT[PERCENT >= 50]), - `60` = length(PERCENT[PERCENT >= 60]), - `70` = length(PERCENT[PERCENT >= 70]), - `80` = length(PERCENT[PERCENT >= 80]), - `90` = length(PERCENT[PERCENT >= 90]), - `100` = length(PERCENT[PERCENT == 100]) - ) %>% - tidyr::gather(data = ., key = IND_THRESHOLD, value = MARKER_NUMBER, -POP_ID) - - mean.pop <- threshold.helper.pop %>% - dplyr::group_by(IND_THRESHOLD) %>% - dplyr::summarise( - MARKER_NUMBER = round(mean(MARKER_NUMBER), 0) - ) %>% - dplyr::mutate(POP_ID = rep("MEAN_POP", n())) - - threshold.helper <- suppressWarnings( - dplyr::bind_rows(threshold.helper.pop, mean.pop, threshold.helper.overall) %>% - dplyr::mutate( - IND_THRESHOLD = as.numeric(IND_THRESHOLD), - POP_ID = factor(POP_ID, levels = c(levels(input$POP_ID), "MEAN_POP", "OVERALL"), ordered = TRUE) - )) - - # Set the breaks for the figure - max.markers <- dplyr::n_distinct(input$MARKERS) - - #Function to replace packageplyr round_any - rounder <- function(x, accuracy, f = round) { - f(x / accuracy) * accuracy - } - - # max.markers <- 658 - if (max.markers >= 1000) { - y.breaks.by <- rounder(max.markers/10, 100, ceiling) - y.breaks.max <- rounder(max.markers, 1000, ceiling) - y.breaks <- seq(0, y.breaks.max, by = y.breaks.by) - } else { - y.breaks.by <- rounder(max.markers/10, 10, ceiling) - y.breaks.max <- rounder(max.markers, 100, ceiling) - y.breaks <- seq(0, y.breaks.max, by = y.breaks.by) - } - - plot.ind.threshold <- ggplot2::ggplot(threshold.helper, ggplot2::aes(x = IND_THRESHOLD, y = MARKER_NUMBER)) + - ggplot2::geom_line() + - ggplot2::geom_point(size = 2, shape = 21, fill = "white") + - ggplot2::scale_x_continuous(name = "Individual threshold (percent)", breaks = seq(10, 100, by = 10)) + - ggplot2::scale_y_continuous(name = "Number of markers", breaks = y.breaks, limits = c(0, y.breaks.max)) + - ggplot2::theme( - axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"), - axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"), - axis.text.x = ggplot2::element_text(size = 8, family = "Helvetica", angle = 90, hjust = 1, vjust = 0.5), - strip.text.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold") - ) + - ggplot2::facet_grid(~POP_ID) - if (interactive.filter) { - print(plot.ind.threshold) - message(" Look at the plot and inspect the change in the number of markers - in relation to the individual percentage thresholds\n") - } - - # save - ggplot2::ggsave(stringi::stri_join(path.folder, "/plot.ind.threshold.pdf"), width = pop.number*4, height = 10, dpi = 600, units = "cm", useDingbats = FALSE) - ggplot2::ggsave(stringi::stri_join(path.folder, "/plot.ind.threshold.png"), width = pop.number*4, height = 10, dpi = 300, units = "cm") - message("2 versions (pdf and png) of the line graph of individual threshold on marker discovery were written in the folder") - - # Helper table for individual thresholds - ind.threshold.helper.table <- threshold.helper %>% - dplyr::group_by(POP_ID) %>% - tidyr::spread(data = ., key = IND_THRESHOLD, MARKER_NUMBER) - - if (interactive.filter) { - message("\n Inspect the table (ind.threshold.helper.table) to help you view - the relation between individual thresholds and marker discovery:") - print(ind.threshold.helper.table) - message(" First column: POP_ID") - message(" Remaining columns: the individual thresholds in percent - with the value = the number of markers discovered") - message(" The last 2 rows: MEAN_POP is the mean across your populations - and OVERALL is if you had 1 large population") - } - readr::write_tsv( - x = ind.threshold.helper.table, - path = stringi::stri_join(path.folder, "/", "ind.threshold.helper.table.tsv")) - message(" ind.threshold.helper.table was written in the folder") - - # Step 2. Choose the filtering approach and thresholds------------------------ - # 2 approach: filtering with the overall n. ind. ("overall") or by pop ("pop") - if (interactive.filter) { - message("Step 2. Choose the filtering approach and thresholds") - message(" The approach to filter a marker: do you want it based on the overall - number of genotyped individuals or - on the number of genotyped individuals per pop ? (overall or pop):") - ind.approach <- as.character(readLines(n = 1)) - - message(" Enter the individual threshold (number, proportion or percentage). - e.g. enter 10 (for 10 individuals), 0.8 for proportion and 80 for percentage.") - ind.threshold <- as.numeric(readLines(n = 1)) - - message(" The value you just entered for the individual threshold, is it a percentage? (TRUE/FALSE)") - percent <- as.character(readLines(n = 1)) - if (ind.approach == "pop") { - message("Tolerance for deviation: look at the plot produced ealier and if you see some populations dragging down - the number of markers for certain percentage thresholds, you have 3 options:\n - 1. remove the population (use pop.select argument to keep the desired populations) - 2. remove individuals with too many missing genotypes (use blacklist.id argument) - 3. use the next threshold (below) to allow variance and then - manage the missing values with blacklist of individuals and/or - missing data imputations.\n - Enter the number of problematic population that you allow to deviate from the threshold:") - prob.pop.threshold <- as.numeric(readLines(n = 1)) - } - } - - # Filtering ------------------------------------------------------------------ - message(" Filtering...") - - # some discrepencies need to be highllighted here. If you have entire pop not genotyped for a markers - # this will compute them when doing the filtering: - # summarise(GENOTYPED = length(INDIVIDUALS[GT != "000000"])) - # if we first remove the individuals not genotyped with : - # filter(GT != "000000") - # the pop not genotyped are not accounted for. And this is what we want here. - # filter_populations take care the ungenotyped pop and common.markers make sure that for certain analysis - # you can have common markers or not. - # so here we focus on when pop got a marker is it at 50% 60% 70% ... genotyped? - if (ind.approach == "overall") { - if (stringi::stri_detect_fixed(percent, "F") & ind.threshold > 1) { - # ind.threshold <- 100 - threshold.id <- "(Using a fixed threshold)" - prob.pop.threshold <- "NA" - ind.approach <- "overall individuals (no pop)" - filter <- overall %>% - dplyr::filter(GENOTYPED >= ind.threshold) %>% - dplyr::distinct(MARKERS) - } else if (stringi::stri_detect_fixed(percent, "F") & stringi::stri_detect_fixed(ind.threshold, ".") & ind.threshold < 1) { - # ind.threshold <- 0.7 - threshold.id <- "(proportion)" - prob.pop.threshold <- "NA" - ind.approach <- "overall individuals (no pop)" - filter <- overall %>% - dplyr::filter(PERCENT >= ind.threshold*100) %>% - dplyr::distinct(MARKERS) - } else {# percent - # ind.threshold <- 70 - threshold.id <- "(percent)" - prob.pop.threshold <- "NA" - ind.approach <- "overall individuals (no pop)" - filter <- overall %>% - dplyr::filter(PERCENT >= ind.threshold) %>% - dplyr::distinct(MARKERS) - } - } else {# approach by pop - if (stringi::stri_detect_fixed(percent, "F") & ind.threshold > 1) { - message("Using a fixed threshold") - threshold.id <- "(ind.)" - ind.approach <- "individuals by pop" - # ind.threshold <- 15 - # prob.pop.threshold <- 3 - filter <- pop %>% - dplyr::ungroup(.) %>% - dplyr::filter(GENOTYPED >= ind.threshold) %>% - dplyr::group_by(MARKERS) %>% - dplyr::tally(.) %>% - dplyr::inner_join(pop.genotyped.marker, by = "MARKERS") %>% - dplyr::mutate(PROB_POP = POP_GENOTYPED - n) %>% - dplyr::filter(PROB_POP <= prob.pop.threshold) %>% - dplyr::distinct(MARKERS) - } else if (stringi::stri_detect_fixed(percent, "F") & stringi::stri_detect_fixed(ind.threshold, ".") & ind.threshold < 1) { - message("Using a proportion threshold...") - threshold.id <- "(proportion)" - ind.approach <- "individuals by pop" - # ind.threshold <- 0.6 - # prob.pop.threshold <- 3 - filter <- pop %>% - dplyr::ungroup(.) %>% - dplyr::filter(PERCENT >= ind.threshold*100) %>% - dplyr::group_by(MARKERS) %>% - dplyr::tally(.) %>% - dplyr::inner_join(pop.genotyped.marker, by = "MARKERS") %>% - dplyr::mutate(PROB_POP = POP_GENOTYPED - n) %>% - dplyr::filter(PROB_POP <= prob.pop.threshold) %>% - dplyr::distinct(MARKERS) - } else { - message("Using a percentage threshold...") - threshold.id <- "(percent)" - ind.approach <- "individuals by pop" - # ind.threshold <- 0.6 - # prob.pop.threshold <- 3 - filter <- pop %>% - dplyr::ungroup(.) %>% - dplyr::filter(PERCENT >= ind.threshold) %>% - dplyr::group_by(MARKERS) %>% - dplyr::tally(.) %>% - dplyr::inner_join(pop.genotyped.marker, by = "MARKERS") %>% - dplyr::mutate(PROB_POP = POP_GENOTYPED - n) %>% - dplyr::filter(PROB_POP <= prob.pop.threshold) %>% - dplyr::distinct(MARKERS) - } - } - - # Apply the filter to the tidy data - filter <- dplyr::left_join(x = filter, input, by = "MARKERS") - - # Update filters.parameters SNP ---------------------------------------------- - - if (data.type == "haplo.file") { - snp.before <- as.character("NA") - snp.after <- as.character("NA") - snp.blacklist <- as.character("NA") - } else { - snp.before <- as.integer(dplyr::n_distinct(input$MARKERS)) - snp.after <- as.integer(dplyr::n_distinct(filter$MARKERS)) - snp.blacklist <- as.integer(dplyr::n_distinct(input$MARKERS) - dplyr::n_distinct(filter$MARKERS)) - } - - if (tibble::has_name(input, "LOCUS")) { - locus.before <- as.integer(dplyr::n_distinct(input$LOCUS)) - locus.after <- as.integer(dplyr::n_distinct(filter$LOCUS)) - locus.blacklist <- as.integer(dplyr::n_distinct(input$LOCUS) - dplyr::n_distinct(filter$LOCUS)) - } else { - locus.before <- as.character("NA") - locus.after <- as.character("NA") - locus.blacklist <- as.character("NA") - } - - - markers.before <- stringi::stri_join(snp.before, locus.before, sep = "/") - markers.after <- stringi::stri_join(snp.after, locus.after, sep = "/") - markers.blacklist <- stringi::stri_join(snp.blacklist, locus.blacklist, sep = "/") - - - filters.parameters <- tibble::data_frame( - FILTERS = c("Individuals", rep(as.character(""), 2)), - PARAMETERS = c("ind.approach", "ind.threshold", "prob.pop.threshold"), - VALUES = c(ind.approach, paste(">=", ind.threshold, " ", threshold.id), prob.pop.threshold), - BEFORE = c("", "", markers.before), - AFTER = c("", "", markers.after), - BLACKLIST = c("", "", markers.blacklist), - UNITS = c("", "genotyped", "SNP/LOCUS"), - COMMENTS = c("", "", "") - ) - readr::write_tsv(x = filters.parameters, path = "filters_parameters.tsv", append = TRUE, col_names = FALSE) - - # saving tidy data - if (!is.null(filename)) { - tidy.name <- stringi::stri_join(filename, ".rad") - message("Writing the filtered tidy data set: ", tidy.name) - write_rad(data = filter, path = file.path(path.folder, tidy.name)) - } - - # saving whitelist - message("Writing the whitelist of markers: whitelist.markers.ind.tsv") - - if (tibble::has_name(input, "CHROM")) { - whitelist.markers <- dplyr::ungroup(filter) %>% - dplyr::distinct(CHROM, LOCUS, POS) - } else { - whitelist.markers <- dplyr::ungroup(filter) %>% - dplyr::distinct(MARKERS) - } - readr::write_tsv(whitelist.markers, stringi::stri_join(path.folder, "/", "whitelist.markers.ind.tsv"), append = FALSE, col_names = TRUE) - - - # saving blacklist - message("Writing the blacklist of markers: blacklist.markers.ind.tsv") - if (tibble::has_name(input, "CHROM")) { - blacklist.markers <- dplyr::ungroup(input) %>% - dplyr::distinct(CHROM, LOCUS, POS) %>% - dplyr::anti_join(whitelist.markers, by = c("CHROM", "LOCUS", "POS")) - } else { - blacklist.markers <- dplyr::ungroup(input) %>% - dplyr::distinct(MARKERS) %>% - dplyr::anti_join(whitelist.markers, by = "MARKERS") - } - readr::write_tsv(blacklist.markers, stringi::stri_join(path.folder, "/", "blacklist.markers.ind.tsv"), append = FALSE, col_names = TRUE) - - # results -------------------------------------------------------------------- - cat("############################### RESULTS ###############################\n") - message("ind.approach: ", ind.approach) - message("ind.threshold: ", ">= ", ind.threshold, " ", threshold.id) - message("prob.pop.threshold: ", prob.pop.threshold) - message("The number of markers (SNP/LOCUS) removed by the Individual filter:\n", markers.blacklist) - message("The number of markers (SNP/LOCUS) before -> after the Individual filter:\n", markers.before, " -> ", markers.after) - timing <- proc.time() - timing - message("\nComputation time: ", round(timing[[3]]), " sec") - cat("############################## completed ##############################\n") - res <- list() - res$tidy.filtered.ind <- filter - res$whitelist.markers <- whitelist.markers - res$blacklist.markers <- blacklist.markers - res$strata <- strata - res$filters.parameters <- filters.parameters - res$plot.ind.threshold <- plot.ind.threshold - res$ind.threshold.helper.table <- ind.threshold.helper.table - return(res) - } - - - - -# filter_population------------------------------------------------------------- - -#' @name filter_population -#' @title Population filter -#' @description Filter markers based on populations genotyped. Use a tidy data -#' set (long format) of any of these file format: -#' vcf, plink (tped/tfam), stacks haplotype file, genind, -#' genepop, data frame in wide format. The function uses -#' \code{\link[radiator]{tidy_genomic_data}} and -#' \code{\link[radiator]{tidy_wide}} to load the file. For filtering -#' The threshold can be a fixed number of population, a proportion or a percentage. - -# Most arguments are inherited from tidy_genomic_data -#' @inheritParams tidy_genomic_data -#' @inheritParams read_strata -#' @inheritParams radiator_common_arguments - - -#' @param interactive.filter (optional, logical) Do you want the filtering session to -#' be interactive. With default: \code{interactive.filter == TRUE}, the user is -#' asked to see figures of distribution before making decisions for filtering. - -#' @param pop.threshold The population threshold, proportion, percentage or -#' number e.g. 0.70, 70, 15. -#' Default: \code{pop.threshold = 100}. - -#' @param percent Is the threshold a percentage? TRUE or FALSE. -#' This argument is necessary to distinguish percentage from integer population -#' threshold (e.g. 50 percent or 50 populations). -#' Default: \code{percent = TRUE}. - -#' @param filename (optional) The function uses \code{\link[fst]{write.fst}}, -#' to write the tidy data frame in -#' the folder created in the working directory. The file extension appended to -#' the \code{filename} provided is \code{.rad}. -#' Default: \code{filename = NULL}. - -#' @rdname filter_population -#' @export - -#' @details -#' \strong{Interactive version} -#' -#' There is 2 steps in the interactive version to visualize and filter -#' the data based on the population representation: -#' -#' Step 1. Impact of population threshold on marker discovery -#' -#' Step 2. Choose the filtering threshold - - -#' @return With \code{interactive.filter = FALSE}, a list in the global environment, -#' with 7 objects: -#' \enumerate{ -#' \item $tidy.filtered.ind -#' \item $whitelist.markers -#' \item $blacklist.markers -#' \item $strata -#' \item $filters.parameters -#' } -#' -#' With \code{interactive.filter = TRUE}, a list with 2 additionnal objects is created. -#' \enumerate{ -#' \item $plot.pop.threshold -#' \item $pop.threshold.helper.table -#' } -#' -#' The object can be isolated in separate object outside the list by -#' following the example below. - -#' @examples -#' \dontrun{ -#' turtle.pop <- filter_population( -#' data = "turtle.vcf", -#' strata = "turtle.strata.tsv", -#' pop.thresholds = 100, -#' percent = TRUE, -#' filename = "tidy.data.turtle.tsv" -#' ) - -#' -#' #If interactive.filter = TRUE, a list is created and to view the filtered tidy data: -#' tidy.data <- turtle.ind$tidy.filtered.ind -#' -#' #Inside the same list, to isolate the blacklist.genotypes: -#' bg <- turtle.ind$blacklist.genotypes -#' -#' # The remaining argument are used in tidy_genomic_data during import and allow -#' # the user to apply some filtering or selection before doing the filtering. -#' } - - -filter_population <- function( - interactive.filter = TRUE, - data, - pop.threshold = 100, - percent = TRUE, - filename = NULL, - strata = NULL, - parallel.core = parallel::detectCores() - 1, - ... -) { - cat("#######################################################################\n") - cat("##################### radiator::filter_population #######################\n") - cat("#######################################################################\n") - opt.change <- getOption("width") - options(width = 70) - timing <- proc.time() - # manage missing arguments ----------------------------------------------------- - if (missing(data)) rlang::abort("Input file missing") - if (!is.null(pop.levels) & is.null(pop.labels)) { - pop.levels <- stringi::stri_replace_all_fixed(pop.levels, pattern = " ", replacement = "_", vectorize_all = FALSE) - pop.labels <- pop.levels - } - - if (!is.null(pop.labels) & is.null(pop.levels)) rlang::abort("pop.levels is required if you use pop.labels") - - if (!is.null(pop.labels)) { - if (length(pop.labels) != length(pop.levels)) rlang::abort("pop.labels and pop.levels must have the same length (number of groups)") - pop.labels <- stringi::stri_replace_all_fixed(pop.labels, pattern = " ", replacement = "_", vectorize_all = FALSE) - } - - if (!is.null(pop.select)) { - pop.select <- stringi::stri_replace_all_fixed(pop.select, pattern = " ", replacement = "_", vectorize_all = FALSE) - } - - # Message about steps taken during the process --------------------------------- - if (interactive.filter) { - message("Interactive mode: on") - message("2 steps to visualize and filter the data based on the number of genotyped individuals:") - message("Step 1. Impact of population threshold on marker discovery") - message("Step 2. Choose the filtering threshold") - } - # Folder ------------------------------------------------------------------- - # Date and time - file.date <- stringi::stri_replace_all_fixed( - Sys.time(), - pattern = " EDT", replacement = "") %>% - stringi::stri_replace_all_fixed( - str = ., - pattern = c("-", " ", ":"), replacement = c("", "@", ""), - vectorize_all = FALSE) %>% - stringi::stri_sub(str = ., from = 1, to = 13) - folder.extension <- stringi::stri_join("filter_population_", file.date, sep = "") - path.folder <- stringi::stri_join(getwd(),"/", folder.extension, sep = "") - dir.create(file.path(path.folder)) - - message("\nFolder created: \n", folder.extension) - file.date <- NULL #unused object - - # Filter parameter file ------------------------------------------------------ - message("Parameters used in this run are stored in a file") - filters.parameters <- list.files(path = getwd(), pattern = "filters_parameters.tsv", full.names = TRUE) - if (length(filters.parameters) == 0) { - filters.parameters <- tibble::data_frame(FILTERS = as.character(), PARAMETERS = as.character(), VALUES = as.integer(), BEFORE = as.character(), AFTER = as.character(), BLACKLIST = as.integer(), UNITS = as.character(), COMMENTS = as.character()) - readr::write_tsv(x = filters.parameters, path = "filters_parameters.tsv", append = FALSE, col_names = TRUE) - message(" Created a parameter file: filters_parameters.tsv") - } else { - message(" Using the filters parameters file: filters_parameters.tsv") - } - # File type detection---------------------------------------------------------- - data.type <- radiator::detect_genomic_format(data) - - # import data ---------------------------------------------------------------- - message("Importing data ...") - input <- radiator::tidy_genomic_data( - data = data, - vcf.metadata = vcf.metadata, - blacklist.id = blacklist.id, - blacklist.genotype = blacklist.genotype, - whitelist.markers = whitelist.markers, - monomorphic.out = monomorphic.out, - max.marker = max.marker, - snp.ld = snp.ld, - common.markers = common.markers, - strata = strata, - pop.levels = pop.levels, - pop.labels = pop.labels, - pop.select = pop.select, - filename = NULL, - parallel.core = parallel.core - ) - - # create a strata.df - strata.df <- input %>% - dplyr::select(INDIVIDUALS, POP_ID) %>% - dplyr::distinct(INDIVIDUALS, .keep_all = TRUE) - strata <- strata.df - pop.levels <- levels(input$POP_ID) - pop.labels <- pop.levels - - # prepare filter, table and figure-------------------------------------------- - pop.number <- dplyr::n_distinct(input$POP_ID) # number of pop - - filter.prep <- input %>% - dplyr::filter(GT != "000000") %>% - dplyr::distinct(MARKERS, POP_ID) %>% - dplyr::group_by(MARKERS) %>% - dplyr::tally(.) %>% - dplyr::rename(POP_GENOTYPED = n) - - - get_pop_number <- function(pop.indice, data) { - marker.num <- dplyr::filter( - .data = data, - POP_GENOTYPED >= pop.indice) %>% - dplyr::summarise(MARKER_NUMBER = sum(MARKER_NUMBER)) - res <- tibble::data_frame( - POP_GENOTYPED = pop.indice, - MARKER_NUMBER = marker.num$MARKER_NUMBER) - return(res) - } - - pop.threshold.helper.table <- filter.prep %>% - dplyr::group_by(POP_GENOTYPED) %>% - dplyr::summarise(MARKER_NUMBER = length(MARKERS)) - - pop.threshold.helper.table <- purrr::map_df(.x = 1:pop.number, .f = get_pop_number, data = pop.threshold.helper.table) - - # Step 1. Impact of population threshold on marker discovery------------------ - if (interactive.filter) { - message("Step 1. Impact of population threshold on marker discovery") - message(" Inspect the plot and table produced showing the relationship - between the number of markers and the number of population genotyped for - the marker") - } - # line.graph <- as.character(readLines(n = 1)) - # if (line.graph == "y") { - # Set the breaks for the figure - max.markers <- dplyr::n_distinct(input$MARKERS) - - #Function to replace packageplyr round_any - rounder <- function(x, accuracy, f = round) { - f(x / accuracy) * accuracy - } - - # max.markers <- 658 - if (max.markers >= 1000) { - y.breaks.by <- rounder(max.markers/10, 100, ceiling) - y.breaks.max <- rounder(max.markers, 1000, ceiling) - y.breaks <- seq(0, y.breaks.max, by = y.breaks.by) - } else { - y.breaks.by <- rounder(max.markers/10, 10, ceiling) - y.breaks.max <- rounder(max.markers, 100, ceiling) - y.breaks <- seq(0, y.breaks.max, by = y.breaks.by) - } - - plot.pop.threshold <- ggplot2::ggplot( - pop.threshold.helper.table, - ggplot2::aes(x = POP_GENOTYPED, y = MARKER_NUMBER)) + - ggplot2::geom_line() + - ggplot2::geom_point(size = 2, shape = 21, fill = "white") + - ggplot2::scale_x_continuous(name = "Number of populations genotyped") + - ggplot2::scale_y_continuous( - name = "Number of markers", breaks = y.breaks, limits = c(0, y.breaks.max)) + - ggplot2::theme( - axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"), - axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"), - axis.text.x = ggplot2::element_text(size = 8, family = "Helvetica", hjust = 1, vjust = 0.5), - strip.text.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold") - ) - # plot.pop.threshold - if (interactive.filter) print(plot.pop.threshold) - # save - ggplot2::ggsave(stringi::stri_join(path.folder, "/plot.pop.threshold.pdf"), width = length(pop.threshold.helper.table$POP_GENOTYPED)*2, height = 10, dpi = 600, units = "cm", useDingbats = FALSE) - ggplot2::ggsave(stringi::stri_join(path.folder, "/plot.pop.threshold.png"), width = length(pop.threshold.helper.table$POP_GENOTYPED)*2, height = 10, dpi = 300, units = "cm") - message("\n 2 versions (pdf and png) of the line graph of populations threshold - on marker discovery were writted in the folder") - - - # Helper table for individual thresholds - if (interactive.filter) { - message("\n A table (pop.threshold.helper.table) to help you view - the relation between population threshold and marker discovery, with your dataset:") - print(pop.threshold.helper.table) - } - readr::write_tsv(x = pop.threshold.helper.table, path = stringi::stri_join(path.folder, "/", "pop.threshold.helper.table.tsv")) - message(" pop.threshold.helper.table also written in the folder") - - # Step 2. Choose the filtering threshold ------------------------------------- - if (interactive.filter) { - message("Step 2. Choose the filtering threshold.") - message(" Enter the population threshold (number, proportion or percentage). - e.g. enter 10 (for 10 populations), 0.8 for proportion and 80 for percentage.") - pop.threshold <- as.numeric(readLines(n = 1)) - - message(" The value you just enter for the pop threshold, is it a percentage? (TRUE/FALSE)") - percent <- as.logical(readLines(n = 1)) - } - - # Filtering ------------------------------------------------------------------ - # pop.number <- dplyr::n_distinct(input$POP_ID) # number of pop - - filter.prep <- filter.prep %>% - dplyr::mutate(PERCENT = ceiling(POP_GENOTYPED/pop.number*100)) - - if (!percent & pop.threshold >= 1) { - message("Using a fixed threshold") - threshold.id <- "(pop.)" - # pop.threshold <- 12 - filter <- filter.prep %>% - dplyr::ungroup(.) %>% - dplyr::filter(POP_GENOTYPED >= pop.threshold) %>% - dplyr::distinct(MARKERS) - } else if (!percent & stringi::stri_detect_fixed(pop.threshold, ".") & pop.threshold < 1) { - message("Using a proportion threshold...") - threshold.id <- "(proportion)" - # pop.threshold <- 0.6 - filter <- filter.prep %>% - dplyr::ungroup(.) %>% - dplyr::filter(PERCENT >= pop.threshold*100) %>% - dplyr::distinct(MARKERS) - } else { - message("Using a percentage threshold...") - threshold.id <- "(percent)" - # pop.threshold <- 100 - filter <- filter.prep %>% - dplyr::ungroup(.) %>% - dplyr::filter(PERCENT >= pop.threshold) %>% - dplyr::distinct(MARKERS) - } - - # Apply the filter to the tidy data - filter <- dplyr::left_join(x = filter, input, by = "MARKERS") - - - # Update filters.parameters SNP ---------------------------------------------- - if (data.type == "haplo.file") { - snp.before <- as.character("NA") - snp.after <- as.character("NA") - snp.blacklist <- as.character("NA") - } else { - snp.before <- as.integer(dplyr::n_distinct(input$MARKERS)) - snp.after <- as.integer(dplyr::n_distinct(filter$MARKERS)) - snp.blacklist <- as.integer(dplyr::n_distinct(input$MARKERS) - dplyr::n_distinct(filter$MARKERS)) - } - - if (tibble::has_name(input, "LOCUS")) { - locus.before <- as.integer(dplyr::n_distinct(input$LOCUS)) - locus.after <- as.integer(dplyr::n_distinct(filter$LOCUS)) - locus.blacklist <- as.integer(dplyr::n_distinct(input$LOCUS) - dplyr::n_distinct(filter$LOCUS)) - } else { - locus.before <- as.character("NA") - locus.after <- as.character("NA") - locus.blacklist <- as.character("NA") - } - - - markers.before <- stringi::stri_join(snp.before, locus.before, sep = "/") - markers.after <- stringi::stri_join(snp.after, locus.after, sep = "/") - markers.blacklist <- stringi::stri_join(snp.blacklist, locus.blacklist, sep = "/") - - filters.parameters <- tibble::data_frame( - FILTERS = c("Populations"), - PARAMETERS = c("pop.threshold"), - VALUES = c(stringi::stri_join(pop.threshold, " ", threshold.id)), - BEFORE = c(markers.before), - AFTER = c(markers.after), - BLACKLIST = c(markers.blacklist), - UNITS = c("SNP/LOCUS"), - COMMENTS = c("") - ) - readr::write_tsv(x = filters.parameters, path = "filters_parameters.tsv", append = TRUE, col_names = FALSE) - - # saving tidy data - if (!is.null(filename)) { - tidy.name <- stringi::stri_join(filename, ".rad") - message("Writing the filtered tidy data set: ", tidy.name) - write_rad(data = filter, path = file.path(path.folder, tidy.name)) - } - - # saving whitelist - message("Writing the whitelist of markers: whitelist.markers.pop.tsv") - - if (tibble::has_name(input, "CHROM")) { - whitelist.markers <- dplyr::ungroup(filter) %>% - dplyr::distinct(CHROM, LOCUS, POS) - } else { - whitelist.markers <- dplyr::ungroup(filter) %>% - dplyr::distinct(MARKERS) - } - readr::write_tsv(whitelist.markers, stringi::stri_join(path.folder, "/", "whitelist.markers.pop.tsv"), append = FALSE, col_names = TRUE) - - # saving blacklist - message("Writing the blacklist of markers: blacklist.markers.pop.tsv") - if (tibble::has_name(input, "CHROM")) { - blacklist.markers <- dplyr::ungroup(input) %>% - dplyr::distinct(CHROM, LOCUS, POS) %>% - dplyr::anti_join(whitelist.markers, by = c("CHROM", "LOCUS", "POS")) - } else { - blacklist.markers <- dplyr::ungroup(input) %>% - dplyr::distinct(MARKERS) %>% - dplyr::anti_join(whitelist.markers, by = "MARKERS") - } - readr::write_tsv(blacklist.markers, stringi::stri_join(path.folder, "/", "blacklist.markers.pop.tsv"), append = FALSE, col_names = TRUE) - - # results -------------------------------------------------------------------- - cat("############################### RESULTS ###############################\n") - message("pop.threshold: ", ">= ", pop.threshold) - message("The number of markers (SNP/LOCUS) removed by the Population filter:\n", markers.blacklist) - message("The number of markers (SNP/LOCUS) before -> after the Population filter:\n", markers.before, " -> ", markers.after) - timing <- proc.time() - timing - message("\nComputation time: ", round(timing[[3]]), " sec") - cat("############################## completed ##############################\n") - res <- list() - res$tidy.filtered.pop <- filter - res$whitelist.markers <- whitelist.markers - res$blacklist.markers <- blacklist.markers - res$strata <- strata - res$filters.parameters <- filters.parameters - res$plot.pop.threshold <- plot.pop.threshold - res$pop.threshold.helper.table <- pop.threshold.helper.table - return(res) - } +# empty diff --git a/R/radiator_tidy.R b/R/radiator_tidy.R index d5ee4899..715c5584 100644 --- a/R/radiator_tidy.R +++ b/R/radiator_tidy.R @@ -92,9 +92,19 @@ tidy_wide <- function(data, import.metadata = FALSE) { # Determine long (tidy) or wide dataset if (!"MARKERS" %in% colnames(data) && !"LOCUS" %in% colnames(data)) { if (rlang::has_name(data, "POP_ID")) { - data <- tidyr::gather(data = data, key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS)) + data <- tidyr::pivot_longer( + data = data, + cols = -c("POP_ID", "INDIVIDUALS"), + names_to = "MARKERS", + values_to = "GT" + ) } else { - data <- tidyr::gather(data = data, key = MARKERS, value = GT, -INDIVIDUALS) + data <- tidyr::pivot_longer( + data = data, + cols = -INDIVIDUALS, + names_to = "MARKERS", + values_to = "GT" + ) } } @@ -516,6 +526,9 @@ tidy_genomic_data <- function( biallelic <- input$biallelic input <- input$input + + + } # End import PLINK # Import stacks haplotypes---------------------------------------------------- @@ -544,7 +557,12 @@ tidy_genomic_data <- function( na = "-", col_names = FALSE, col_types = readr::cols(.default = readr::col_character())) %>% - tidyr::gather(data = .,key = DELETE, value = INFO) %>% + tidyr::pivot_longer( + data = ., + cols = dplyr::everything(), + names_to = "DELETE", + values_to = "DELETE" + ) %>% dplyr::mutate(INFO = clean_ind_names(INFO)) %>% dplyr::select(-DELETE) %>% dplyr::mutate(INFO = clean_ind_names(INFO)) %>% @@ -575,11 +593,12 @@ tidy_genomic_data <- function( message("\nNumber of loci in catalog: ", n.catalog.locus) message("Number of individuals: ", n.individuals) - input <- tidyr::gather( + input <- tidyr::pivot_longer( data = input, - key = "INDIVIDUALS", - value = "GT_VCF_NUC", # previously using "GT_HAPLO" - -LOCUS) + cols = -LOCUS, + names_to = "INDIVIDUALS", + values_to = "GT_VCF_NUC"# previously using "GT_HAPLO" + ) input$INDIVIDUALS <- radiator::clean_ind_names(input$INDIVIDUALS) diff --git a/R/sex_markers.R b/R/sex_markers.R index 41d5317c..fa0c8043 100644 --- a/R/sex_markers.R +++ b/R/sex_markers.R @@ -1801,12 +1801,7 @@ This marker could be absent due to an error.") dplyr::distinct(res$sexy.summary, SEX_MARKERS, METHOD) %>% dplyr::mutate(n = rep(1, n()), METHOD = stringi::stri_join("METHOD_", METHOD)) %>% - tidyr::spread( - data = ., - key = METHOD, - value = n, - fill = 0 - ) %>% + tidyr::pivot_wider(data = ., names_from = "METHOD", values_from = "n", values_fill = 0) %>% data.frame(.) message("The 'upset' plot shows any overlapping sex-linked markers between methods") diff --git a/R/strata.R b/R/strata.R index be573042..9b42e42a 100644 --- a/R/strata.R +++ b/R/strata.R @@ -620,7 +620,12 @@ strata_haplo <- function(strata = NULL, data = NULL, blacklist.id = NULL) { na = "-", col_names = FALSE, col_types = readr::cols(.default = readr::col_character())) %>% - tidyr::gather(data = .,key = DELETE, value = INDIVIDUALS) %>% + tidyr::pivot_longer( + data = ., + cols = dplyr::everything(), + names_to = "DELETE", + values_to = "INDIVIDUALS" + ) %>% dplyr::mutate(INDIVIDUALS = clean_ind_names(INDIVIDUALS)) %>% dplyr::select(-DELETE) %>% dplyr::filter(!INDIVIDUALS %in% c("Catalog ID", "Cnt")) %>% diff --git a/R/vcf.R b/R/vcf.R index 7affebac..726ec3d0 100644 --- a/R/vcf.R +++ b/R/vcf.R @@ -2602,7 +2602,7 @@ write_vcf <- function( dplyr::mutate(GT_VCF_POP_ID = stringi::stri_join(GT_VCF, POP_ID, sep = ":")) %>% dplyr::select(-c(GT_VCF, POP_ID)) %>% dplyr::group_by(MARKERS, CHROM, LOCUS, POS, INFO, REF, ALT) %>% - tidyr::spread(data = ., key = INDIVIDUALS, value = GT_VCF_POP_ID) %>% + tidyr::pivot_wider(data = ., names_from = "INDIVIDUALS", values_from = "GT_VCF_POP_ID") %>% dplyr::ungroup(.) %>% dplyr::mutate( QUAL = rep(".", n()), @@ -2616,7 +2616,7 @@ write_vcf <- function( dplyr::left_join(data, info.field, by = "MARKERS") %>% dplyr::select(MARKERS, CHROM, LOCUS, POS, REF, ALT, INDIVIDUALS, GT_VCF, INFO) %>% dplyr::group_by(MARKERS, CHROM, LOCUS, POS, INFO, REF, ALT) %>% - tidyr::spread(data = ., key = INDIVIDUALS, value = GT_VCF) %>% + tidyr::pivot_wider(data = ., names_from = "INDIVIDUALS", values_from = "GT_VCF") %>% dplyr::ungroup(.) %>% dplyr::mutate( QUAL = rep(".", n()), @@ -3329,7 +3329,7 @@ vcf_strata <- function(data, strata, filename = NULL) { remove = TRUE ) %>% dplyr::group_by(`#CHROM`, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT) %>% - tidyr::spread(data = ., key = INDIVIDUALS, value = FORMAT_ID) %>% + tidyr::pivot_wider(data = ., names_from = "INDIVIDUALS", values_from = "FORMAT_ID") %>% dplyr::ungroup(.) %>% dplyr::mutate( FORMAT = stringi::stri_join( diff --git a/R/visualization.R b/R/visualization.R index 0aa56794..7471eb1d 100644 --- a/R/visualization.R +++ b/R/visualization.R @@ -103,10 +103,15 @@ markers_genotyped_helper <- function(x, y, overall.only = FALSE) { `90` = length(PERCENT[PERCENT <= 90]), `100` = length(PERCENT[PERCENT <= 100]) ) %>% - tidyr::gather(data = ., key = GENOTYPED_THRESHOLD, value = NUMBER_MARKERS) %>% + tidyr::pivot_longer( + data = ., + cols = dplyr::everything(), + names_to = "GENOTYPED_THRESHOLD", + values_to = "NUMBER_MARKERS" + ) %>% dplyr::mutate(POP_ID = rep("OVERALL", n())) - if (!overall.only){ + if (!overall.only) { threshold.helper.pop <- x %>% dplyr::group_by(POP_ID) %>% dplyr::summarise( @@ -122,7 +127,12 @@ markers_genotyped_helper <- function(x, y, overall.only = FALSE) { `90` = length(PERCENT[PERCENT <= 90]), `100` = length(PERCENT[PERCENT <= 100]) ) %>% - tidyr::gather(data = ., key = GENOTYPED_THRESHOLD, value = NUMBER_MARKERS, -POP_ID) + tidyr::pivot_longer( + data = ., + cols = -POP_ID, + names_to = "GENOTYPED_THRESHOLD", + values_to = "NUMBER_MARKERS" + ) mean.pop <- threshold.helper.pop %>% dplyr::group_by(GENOTYPED_THRESHOLD) %>% @@ -633,7 +643,12 @@ plot_density_distribution_het <- function(data, pop.levels, het.group, aes.colou HET_DIFF = HET_MAX - HET_MIN ) %>% dplyr::group_by(LOCUS, POP_ID) %>% - tidyr::gather(data = ., key = HET_GROUP, value = VALUE) + tidyr::pivot_longer( + data = ., + cols = dplyr::everything(), + names_to = "HET_GROUP", + values_to = "VALUE" + ) if (missing(pop.levels) == "TRUE") { data.summary <- data.summary diff --git a/R/write_arlequin.R b/R/write_arlequin.R index f2111d04..327bb6bf 100644 --- a/R/write_arlequin.R +++ b/R/write_arlequin.R @@ -77,12 +77,17 @@ write_arlequin <- function( # arlequin format ---------------------------------------------------------------- data <- data %>% tidyr::separate(col = GT, into = c("A1", "A2"), sep = 3, extra = "drop", remove = TRUE) %>% - tidyr::gather(data = ., key = ALLELES, value = GT, -c(POP_ID, INDIVIDUALS, MARKERS)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"), + names_to = "ALLELES", + values_to = "GT" + ) %>% dplyr::mutate( GT = stringi::stri_replace_all_fixed(str = GT, pattern = "000", replacement = "-9", vectorize_all = FALSE) ) %>% dplyr::select(INDIVIDUALS, POP_ID, MARKERS, ALLELES, GT) %>% - tidyr::spread(data = ., key = MARKERS, value = GT) %>% + tidyr::pivot_wider(data = ., names_from = "MARKERS", values_from = "GT") %>% dplyr::select(-ALLELES) %>% dplyr::arrange(POP_ID, INDIVIDUALS) diff --git a/R/write_colony.R b/R/write_colony.R index 96e62b69..91d90111 100644 --- a/R/write_colony.R +++ b/R/write_colony.R @@ -235,7 +235,12 @@ radiator_colony <- function( A2 = stringi::stri_sub(GT, 4, 6), GT = NULL ) %>% - tidyr::gather(data = ., key = ALLELE_GROUP, value = ALLELES, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"), + names_to = "ALLELE_GROUP", + values_to = "ALLELES" + ) %>% dplyr::mutate(ALLELES = as.numeric(ALLELES)) %>% dplyr::arrange(MARKERS, POP_ID, INDIVIDUALS) @@ -266,10 +271,15 @@ radiator_colony <- function( dplyr::arrange(MARKERS) %>% dplyr::mutate(GROUP = seq(1, n(), by = 1)) %>% dplyr::mutate_all(.tbl = ., .funs = as.character) %>% - tidyr::gather(data = ., key = ALLELES_FREQ, value = VALUE, -c(MARKERS, GROUP)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("GROUP", "MARKERS"), + names_to = "ALLELES_FREQ", + values_to = "VALUE" + ) %>% dplyr::mutate(ALLELES_FREQ = factor(ALLELES_FREQ, levels = c("ALLELES", "FREQ"), ordered = TRUE)) %>% dplyr::group_by(MARKERS, ALLELES_FREQ) %>% - tidyr::spread(data = ., key = GROUP, value = VALUE) %>% + tidyr::pivot_wider(data = ., names_from = "GROUP", values_from = "VALUE") %>% dplyr::ungroup(.) %>% tidyr::unite(data = ., col = INFO, -c(MARKERS, ALLELES_FREQ), sep = " ") %>% dplyr::mutate( @@ -285,7 +295,7 @@ radiator_colony <- function( data <- tidyr::unite(data = data, col = MARKERS.ALLELE_GROUP, MARKERS, ALLELE_GROUP, sep = ".") %>% dplyr::group_by(POP_ID, INDIVIDUALS) %>% - tidyr::spread(data = ., key = MARKERS.ALLELE_GROUP, value = ALLELES) %>% + tidyr::pivot_wider(data = ., names_from = "MARKERS.ALLELE_GROUP", values_from = "ALLELES") %>% dplyr::arrange(POP_ID, INDIVIDUALS) %>% dplyr::ungroup(.) %>% dplyr::select(-POP_ID) %>% diff --git a/R/write_dadi.R b/R/write_dadi.R index 4c374b64..288a6624 100644 --- a/R/write_dadi.R +++ b/R/write_dadi.R @@ -172,7 +172,6 @@ write_dadi <- function( ) %>% dplyr::ungroup(.) %>% dplyr::select(POP_ID, Allele1 = REF, A1, Allele2 = ALT, A2, MARKERS) %>% - # tidyr::gather(ALLELE_GROUP, COUNT, -c(POP_ID, MARKERS, Allele1, Allele2)) %>% data.table::as.data.table(.) %>% data.table::melt.data.table( data = ., @@ -183,8 +182,6 @@ write_dadi <- function( ) %>% tibble::as_tibble(.) %>% tidyr::unite(POP, POP_ID, ALLELE_GROUP, sep = "_") %>% - # dplyr::group_by(MARKERS, Allele1, Allele2) %>% - # tidyr::spread(data = ., key = POP, value = COUNT) %>% data.table::as.data.table(.) %>% data.table::dcast.data.table( data = ., @@ -476,7 +473,6 @@ write_dadi <- function( ) %>% dplyr::ungroup(.) %>% dplyr::select(POP_ID, Allele1 = REF, A1, Allele2 = ALT, A2, MARKERS) %>% - # tidyr::gather(ALLELE_GROUP, COUNT, -c(POP_ID, MARKERS, Allele1, Allele2)) %>% data.table::as.data.table(.) %>% data.table::melt.data.table( data = ., @@ -488,7 +484,6 @@ write_dadi <- function( tibble::as_tibble(.) %>% tidyr::unite(POP, POP_ID, ALLELE_GROUP, sep = "_") %>% dplyr::group_by(MARKERS, Allele1, Allele2) %>% - # tidyr::spread(data = ., key = POP, value = COUNT) %>% data.table::as.data.table(.) %>% data.table::dcast.data.table( data = ., diff --git a/R/write_faststructure.R b/R/write_faststructure.R index a4056e42..2a582f40 100644 --- a/R/write_faststructure.R +++ b/R/write_faststructure.R @@ -129,7 +129,12 @@ write_faststructure <- function( ), GT_BIN = NULL ) %>% - tidyr::gather(data = ., key = ALLELES, value = GT, -c(POP_ID, INDIVIDUALS, MARKERS)) + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"), + names_to = "ALLELES", + values_to = "GT" + ) ) } else { want <- c("INDIVIDUALS", "POP_ID", "MARKERS", "GT") @@ -137,7 +142,12 @@ write_faststructure <- function( data %<>% dplyr::select(dplyr::one_of(want)) %>% tidyr::separate(col = GT, into = c("A1", "A2"), sep = 3, extra = "drop", remove = TRUE) %>% - tidyr::gather(data = ., key = ALLELES, value = GT, -c(POP_ID, INDIVIDUALS, MARKERS)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"), + names_to = "ALLELES", + values_to = "GT" + ) %>% dplyr::mutate( GT = stringi::stri_replace_all_fixed(str = GT, pattern = "000", replacement = "-9", vectorize_all = FALSE), GT = as.integer(GT) @@ -148,7 +158,7 @@ write_faststructure <- function( # common to both GT and GT_BIN data %<>% dplyr::select(INDIVIDUALS, POP_ID, MARKERS, ALLELES, GT) %>% - tidyr::spread(data = ., key = MARKERS, value = GT) %>% + tidyr::pivot_wider(data = ., names_from = "MARKERS", values_from = "GT") %>% dplyr::mutate(POP_ID = as.integer(POP_ID)) %>% dplyr::select(-ALLELES) diff --git a/R/write_rubias.R b/R/write_rubias.R index 428f610a..3cf41c79 100644 --- a/R/write_rubias.R +++ b/R/write_rubias.R @@ -10,7 +10,7 @@ #' @param strata (optional, tibble file or object) This tibble of individual's #' metadata must contain -#' four colums: \code{SAMPLE_TYPE, REPUNIT, COLLECTION, INDIVIDUALS}. Those columns +#' four columns: \code{SAMPLE_TYPE, REPUNIT, COLLECTION, INDIVIDUALS}. Those columns #' are described in \strong{rubias}. #' Default:\code{strata = NULL}. With default, \code{SAMPLE_TYPE} is filled #' with \code{reference}. \code{REPUNIT} and \code{COLLECTION} will be filled by @@ -49,9 +49,9 @@ write_rubias <- function( ) { ## TEST - strata = NULL - filename = NULL - parallel.core = parallel::detectCores() - 1 + # strata = NULL + # filename = NULL + # parallel.core = parallel::detectCores() - 1 # Checking for missing and/or default arguments if (missing(data)) rlang::abort("Input file necessary to write the rubias file is missing") diff --git a/R/write_stockr.R b/R/write_stockr.R index 5b951df0..e9dc8367 100644 --- a/R/write_stockr.R +++ b/R/write_stockr.R @@ -65,7 +65,7 @@ write_stockr <- function(data, filename = NULL, verbose = TRUE) { ) %>% tibble::as_tibble(.) %>% dplyr::select(MARKERS, strata$INDIVIDUALS) %>% - tibble::remove_rownames(.) %>% + tibble::remove_rownames(.data = .) %>% tibble::column_to_rownames(.data = ., var = "MARKERS")) %>% as.matrix(.) diff --git a/R/write_structure.R b/R/write_structure.R index fe3c8b5f..2b157968 100644 --- a/R/write_structure.R +++ b/R/write_structure.R @@ -80,13 +80,18 @@ write_structure <- function( # Structure format ---------------------------------------------------------------- data %<>% tidyr::separate(col = GT, into = c("A1", "A2"), sep = 3, extra = "drop", remove = TRUE) %>% - tidyr::gather(data = ., key = ALLELES, value = GT, -c(POP_ID, INDIVIDUALS, MARKERS)) %>% + tidyr::pivot_longer( + data = ., + cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"), + names_to = "ALLELES", + values_to = "GT" + ) %>% dplyr::mutate( GT = stringi::stri_replace_all_fixed(str = GT, pattern = "000", replacement = "-9", vectorize_all = FALSE), GT = as.integer(GT) ) %>% dplyr::select(INDIVIDUALS, POP_ID, MARKERS, ALLELES, GT) %>% - tidyr::spread(data = ., key = MARKERS, value = GT) %>% + tidyr::pivot_wider(data = ., names_from = "MARKERS", values_from = "GT") %>% dplyr::mutate(POP_ID = as.integer(POP_ID)) %>% dplyr::select(-ALLELES) %>% dplyr::arrange(POP_ID, INDIVIDUALS) diff --git a/README.md b/README.md index 7703d1e4..cb1c8bd6 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ Status](https://travis-ci.org/thierrygosselin/radiator.svg?branch=master)](https state and is being actively developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3687060.svg)](https://doi.org/10.5281/zenodo.3687060) -[![packageversion](https://img.shields.io/badge/Package%20version-1.1.6-orange.svg)](commits/master) -[![Last-changedate](https://img.shields.io/badge/last%20change-2020--07--10-brightgreen.svg)](/commits/master) +[![packageversion](https://img.shields.io/badge/Package%20version-1.1.7-orange.svg)](commits/master) +[![Last-changedate](https://img.shields.io/badge/last%20change-2020--08--21-brightgreen.svg)](/commits/master) # radiator: an R package for RADseq Data Exploration, Manipulation and Visualization diff --git a/docs/404.html b/docs/404.html index 60dd1e35..a43e9bcc 100644 --- a/docs/404.html +++ b/docs/404.html @@ -71,7 +71,7 @@
diff --git a/docs/LICENSE.html b/docs/LICENSE.html index 96f004dd..f2a37a8e 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -71,7 +71,7 @@ diff --git a/docs/articles/get_started.html b/docs/articles/get_started.html index b5c45b7b..01d9230f 100644 --- a/docs/articles/get_started.html +++ b/docs/articles/get_started.html @@ -31,7 +31,7 @@ @@ -88,7 +88,7 @@vignettes/get_started.Rmd
get_started.Rmd
vignettes/life_cycle.Rmd
life_cycle.Rmd
vignettes/rad_genomics_computer_setup.Rmd
rad_genomics_computer_setup.Rmd
NEWS.md
+ tidyr::gather
and tidyr::spread
dependencies (they are deprecated)data.table
melt function. Changed to tidyr::pivot_long
.Assign the default values for...
-assign_defaults(n, verbose = TRUE, pos = 1)+
assign_defaults(n, env.arg, verbose = TRUE)diff --git a/docs/reference/bayescan_one.html b/docs/reference/bayescan_one.html index e07654e2..7401119b 100644 --- a/docs/reference/bayescan_one.html +++ b/docs/reference/bayescan_one.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/betas_estimator.html b/docs/reference/betas_estimator.html index b9788d34..b547565e 100644 --- a/docs/reference/betas_estimator.html +++ b/docs/reference/betas_estimator.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/blacklist_hw.html b/docs/reference/blacklist_hw.html index 03a3fbba..02c368e7 100644 --- a/docs/reference/blacklist_hw.html +++ b/docs/reference/blacklist_hw.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/boxplot_stats.html b/docs/reference/boxplot_stats.html index 677308f8..a94ec4c6 100644 --- a/docs/reference/boxplot_stats.html +++ b/docs/reference/boxplot_stats.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/calibrate_alleles.html b/docs/reference/calibrate_alleles.html index 8ff425cb..85475fd3 100644 --- a/docs/reference/calibrate_alleles.html +++ b/docs/reference/calibrate_alleles.html @@ -79,7 +79,7 @@ diff --git a/docs/reference/change_pop_names.html b/docs/reference/change_pop_names.html index 6223810f..82446752 100644 --- a/docs/reference/change_pop_names.html +++ b/docs/reference/change_pop_names.html @@ -74,7 +74,7 @@ diff --git a/docs/reference/check_args_class.html b/docs/reference/check_args_class.html index d4131952..dfdab645 100644 --- a/docs/reference/check_args_class.html +++ b/docs/reference/check_args_class.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/check_dart.html b/docs/reference/check_dart.html index f50e3535..531cd0d4 100644 --- a/docs/reference/check_dart.html +++ b/docs/reference/check_dart.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/check_header_source_vcf.html b/docs/reference/check_header_source_vcf.html index 820ae759..3500465d 100644 --- a/docs/reference/check_header_source_vcf.html +++ b/docs/reference/check_header_source_vcf.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/check_pop_levels.html b/docs/reference/check_pop_levels.html index 97f0b8ac..cfcc53eb 100644 --- a/docs/reference/check_pop_levels.html +++ b/docs/reference/check_pop_levels.html @@ -73,7 +73,7 @@ diff --git a/docs/reference/clean_ad.html b/docs/reference/clean_ad.html index 4bc47540..f26363b6 100644 --- a/docs/reference/clean_ad.html +++ b/docs/reference/clean_ad.html @@ -73,7 +73,7 @@ diff --git a/docs/reference/clean_dart_colnames.html b/docs/reference/clean_dart_colnames.html index 3c769198..703cfdd8 100644 --- a/docs/reference/clean_dart_colnames.html +++ b/docs/reference/clean_dart_colnames.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/clean_dart_locus.html b/docs/reference/clean_dart_locus.html index 23938417..85ef3e59 100644 --- a/docs/reference/clean_dart_locus.html +++ b/docs/reference/clean_dart_locus.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/clean_gl.html b/docs/reference/clean_gl.html index 2d64b840..30fb9878 100644 --- a/docs/reference/clean_gl.html +++ b/docs/reference/clean_gl.html @@ -73,7 +73,7 @@ diff --git a/docs/reference/clean_ind_names.html b/docs/reference/clean_ind_names.html index ef2dd65d..fae59fb4 100644 --- a/docs/reference/clean_ind_names.html +++ b/docs/reference/clean_ind_names.html @@ -77,7 +77,7 @@ diff --git a/docs/reference/clean_markers_names.html b/docs/reference/clean_markers_names.html index 8f347371..ef388a0c 100644 --- a/docs/reference/clean_markers_names.html +++ b/docs/reference/clean_markers_names.html @@ -78,7 +78,7 @@ diff --git a/docs/reference/clean_nr.html b/docs/reference/clean_nr.html index 6c83e5e7..9ead84f4 100644 --- a/docs/reference/clean_nr.html +++ b/docs/reference/clean_nr.html @@ -73,7 +73,7 @@ diff --git a/docs/reference/clean_nv.html b/docs/reference/clean_nv.html index 2a3c92f2..55aeae1d 100644 --- a/docs/reference/clean_nv.html +++ b/docs/reference/clean_nv.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/clean_pl.html b/docs/reference/clean_pl.html index 9ba052ef..984d2043 100644 --- a/docs/reference/clean_pl.html +++ b/docs/reference/clean_pl.html @@ -77,7 +77,7 @@ diff --git a/docs/reference/clean_pop_names.html b/docs/reference/clean_pop_names.html index 736848ce..6c9f06ab 100644 --- a/docs/reference/clean_pop_names.html +++ b/docs/reference/clean_pop_names.html @@ -76,7 +76,7 @@ diff --git a/docs/reference/compound_assignment_pipe_operator.html b/docs/reference/compound_assignment_pipe_operator.html index c90b7481..e6b4992c 100644 --- a/docs/reference/compound_assignment_pipe_operator.html +++ b/docs/reference/compound_assignment_pipe_operator.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/compute_mac.html b/docs/reference/compute_mac.html index 203c9765..c40e8281 100644 --- a/docs/reference/compute_mac.html +++ b/docs/reference/compute_mac.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/compute_maf.html b/docs/reference/compute_maf.html index 8ce88d5f..76ffabbc 100644 --- a/docs/reference/compute_maf.html +++ b/docs/reference/compute_maf.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/count_monomorphic.html b/docs/reference/count_monomorphic.html index bf0c2136..cbdc4ae7 100644 --- a/docs/reference/count_monomorphic.html +++ b/docs/reference/count_monomorphic.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/dart2gds.html b/docs/reference/dart2gds.html index bc03fb36..b014b21b 100644 --- a/docs/reference/dart2gds.html +++ b/docs/reference/dart2gds.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/data_info.html b/docs/reference/data_info.html index a1b11a8d..e1af76a7 100644 --- a/docs/reference/data_info.html +++ b/docs/reference/data_info.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/detect_all_missing.html b/docs/reference/detect_all_missing.html index ec8cf663..9d45de96 100644 --- a/docs/reference/detect_all_missing.html +++ b/docs/reference/detect_all_missing.html @@ -74,7 +74,7 @@ diff --git a/docs/reference/detect_allele_problems.html b/docs/reference/detect_allele_problems.html index ae96792e..1e5e2953 100644 --- a/docs/reference/detect_allele_problems.html +++ b/docs/reference/detect_allele_problems.html @@ -80,7 +80,7 @@ diff --git a/docs/reference/detect_biallelic_markers.html b/docs/reference/detect_biallelic_markers.html index ef1f211f..58e64355 100644 --- a/docs/reference/detect_biallelic_markers.html +++ b/docs/reference/detect_biallelic_markers.html @@ -74,7 +74,7 @@ diff --git a/docs/reference/detect_biallelic_problems.html b/docs/reference/detect_biallelic_problems.html index 399cec50..ae4c6999 100644 --- a/docs/reference/detect_biallelic_problems.html +++ b/docs/reference/detect_biallelic_problems.html @@ -76,7 +76,7 @@ diff --git a/docs/reference/detect_dart_format.html b/docs/reference/detect_dart_format.html index ace1f0bd..43a3b982 100644 --- a/docs/reference/detect_dart_format.html +++ b/docs/reference/detect_dart_format.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/detect_duplicate_genomes.html b/docs/reference/detect_duplicate_genomes.html index 41676228..c72e8b5d 100644 --- a/docs/reference/detect_duplicate_genomes.html +++ b/docs/reference/detect_duplicate_genomes.html @@ -79,7 +79,7 @@ diff --git a/docs/reference/detect_genomic_format.html b/docs/reference/detect_genomic_format.html index b25a8a65..5f2b500c 100644 --- a/docs/reference/detect_genomic_format.html +++ b/docs/reference/detect_genomic_format.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/detect_het_outliers.html b/docs/reference/detect_het_outliers.html index 2be0bbc3..6bb708d6 100644 --- a/docs/reference/detect_het_outliers.html +++ b/docs/reference/detect_het_outliers.html @@ -76,7 +76,7 @@ diff --git a/docs/reference/detect_microsatellites.html b/docs/reference/detect_microsatellites.html index 44c6d139..9871b7fb 100644 --- a/docs/reference/detect_microsatellites.html +++ b/docs/reference/detect_microsatellites.html @@ -75,7 +75,7 @@ diff --git a/docs/reference/detect_mixed_genomes.html b/docs/reference/detect_mixed_genomes.html index af602516..1bafb958 100644 --- a/docs/reference/detect_mixed_genomes.html +++ b/docs/reference/detect_mixed_genomes.html @@ -74,7 +74,7 @@ diff --git a/docs/reference/detect_paralogs.html b/docs/reference/detect_paralogs.html index ff4d10c4..e84dbea1 100644 --- a/docs/reference/detect_paralogs.html +++ b/docs/reference/detect_paralogs.html @@ -77,7 +77,7 @@ diff --git a/docs/reference/detect_ref_genome.html b/docs/reference/detect_ref_genome.html index 7426bbd1..f9134363 100644 --- a/docs/reference/detect_ref_genome.html +++ b/docs/reference/detect_ref_genome.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/diagnostic_maf.html b/docs/reference/diagnostic_maf.html index 4a767c2f..c21e83cf 100644 --- a/docs/reference/diagnostic_maf.html +++ b/docs/reference/diagnostic_maf.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/discard_monomorphic_markers.html b/docs/reference/discard_monomorphic_markers.html deleted file mode 100644 index 3162ed12..00000000 --- a/docs/reference/discard_monomorphic_markers.html +++ /dev/null @@ -1,193 +0,0 @@ - - - - - - - - -
R/radiator_deprecated_functions.R
- discard_monomorphic_markers.Rd
Discard monomorphic markers. -Used internally in radiator -and might be of interest for users.
-discard_monomorphic_markers(data, verbose = FALSE)- -
data | -A tidy data frame object in the global environment or
-a tidy data frame in wide or long format in the working directory.
-How to get a tidy data frame ?
-Look into radiator |
-
---|---|
verbose | -(optional, logical) |
-
A list with the filtered input file and the blacklist of markers removed.
- -Extract and Assign ...
-extract_dots(n, v, verbose = TRUE, pos = 1)+
extract_dots(n, v, env.arg, verbose = TRUE)diff --git a/docs/reference/extract_genotypes_metadata.html b/docs/reference/extract_genotypes_metadata.html index 3bae261b..6ff8c2c6 100644 --- a/docs/reference/extract_genotypes_metadata.html +++ b/docs/reference/extract_genotypes_metadata.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/extract_individuals_metadata.html b/docs/reference/extract_individuals_metadata.html index 0ecb6516..18527753 100644 --- a/docs/reference/extract_individuals_metadata.html +++ b/docs/reference/extract_individuals_metadata.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/extract_individuals_vcf.html b/docs/reference/extract_individuals_vcf.html index 675d4de6..c3777aa3 100644 --- a/docs/reference/extract_individuals_vcf.html +++ b/docs/reference/extract_individuals_vcf.html @@ -74,7 +74,7 @@ diff --git a/docs/reference/extract_info_vcf.html b/docs/reference/extract_info_vcf.html index 2f37c3e8..b63bee95 100644 --- a/docs/reference/extract_info_vcf.html +++ b/docs/reference/extract_info_vcf.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/extract_markers_metadata.html b/docs/reference/extract_markers_metadata.html index 373ea1f3..7a8b1634 100644 --- a/docs/reference/extract_markers_metadata.html +++ b/docs/reference/extract_markers_metadata.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/filter_blacklist_genotypes.html b/docs/reference/filter_blacklist_genotypes.html index 824a50ec..cb77a748 100644 --- a/docs/reference/filter_blacklist_genotypes.html +++ b/docs/reference/filter_blacklist_genotypes.html @@ -75,7 +75,7 @@ diff --git a/docs/reference/filter_common_markers.html b/docs/reference/filter_common_markers.html index f447f1bc..5ed21971 100644 --- a/docs/reference/filter_common_markers.html +++ b/docs/reference/filter_common_markers.html @@ -79,7 +79,7 @@ diff --git a/docs/reference/filter_coverage.html b/docs/reference/filter_coverage.html index dc67dcc4..e80f4e04 100644 --- a/docs/reference/filter_coverage.html +++ b/docs/reference/filter_coverage.html @@ -76,7 +76,7 @@ diff --git a/docs/reference/filter_dart_reproducibility.html b/docs/reference/filter_dart_reproducibility.html index e4ba9fc0..866fa418 100644 --- a/docs/reference/filter_dart_reproducibility.html +++ b/docs/reference/filter_dart_reproducibility.html @@ -75,7 +75,7 @@ diff --git a/docs/reference/filter_fis.html b/docs/reference/filter_fis.html index dcca907c..5f6f5f36 100644 --- a/docs/reference/filter_fis.html +++ b/docs/reference/filter_fis.html @@ -72,7 +72,7 @@ diff --git a/docs/reference/filter_genotyping.html b/docs/reference/filter_genotyping.html index b09f743e..5e7eddd4 100644 --- a/docs/reference/filter_genotyping.html +++ b/docs/reference/filter_genotyping.html @@ -75,7 +75,7 @@ diff --git a/docs/reference/filter_het.html b/docs/reference/filter_het.html index 31ee09ee..c707f849 100644 --- a/docs/reference/filter_het.html +++ b/docs/reference/filter_het.html @@ -92,7 +92,7 @@ diff --git a/docs/reference/filter_hwe.html b/docs/reference/filter_hwe.html index a2296bd9..95abd7df 100644 --- a/docs/reference/filter_hwe.html +++ b/docs/reference/filter_hwe.html @@ -90,7 +90,7 @@ diff --git a/docs/reference/filter_individuals.html b/docs/reference/filter_individuals.html index a97a929a..ef8c50b8 100644 --- a/docs/reference/filter_individuals.html +++ b/docs/reference/filter_individuals.html @@ -83,7 +83,7 @@ diff --git a/docs/reference/filter_ld.html b/docs/reference/filter_ld.html index ecbfcba9..469bd553 100644 --- a/docs/reference/filter_ld.html +++ b/docs/reference/filter_ld.html @@ -92,7 +92,7 @@ diff --git a/docs/reference/filter_mac.html b/docs/reference/filter_mac.html index 591caf1d..c3f249fd 100644 --- a/docs/reference/filter_mac.html +++ b/docs/reference/filter_mac.html @@ -79,7 +79,7 @@ diff --git a/docs/reference/filter_maf.html b/docs/reference/filter_maf.html index af39ca03..df301b09 100644 --- a/docs/reference/filter_maf.html +++ b/docs/reference/filter_maf.html @@ -75,7 +75,7 @@ diff --git a/docs/reference/filter_monomorphic.html b/docs/reference/filter_monomorphic.html index bfbd83ac..b7e13625 100644 --- a/docs/reference/filter_monomorphic.html +++ b/docs/reference/filter_monomorphic.html @@ -84,7 +84,7 @@ diff --git a/docs/reference/filter_rad.html b/docs/reference/filter_rad.html index 7bcc60b5..e8d83a61 100644 --- a/docs/reference/filter_rad.html +++ b/docs/reference/filter_rad.html @@ -75,7 +75,7 @@ @@ -197,8 +197,12 @@
output = c("genind", "genepop", "structure")
, to have preferred
-output formats generated. With default, only the tidy format is generated.
-Default: output = NULL
.
+output formats generated. With default, only the tidy format is generated.
+Make sure to read the particularities of each format, some might +requires extra columns in the strata file. You can find the info in the +corresponding write_ functions of radiator +(reference).
+Default: output = NULL
.
output = c("genind", "genepop", "structure")
, to have preferred
-output formats generated. With default, only the tidy format is generated.
-Default: output = NULL
.
+output formats generated. With default, only the tidy format is generated.
+Make sure to read the particularities of each format, some might +requires extra columns in the strata file. You can find the info in the +corresponding write_ functions of radiator +(reference).
+Default: output = NULL
.
filter DArT file
Individual filter
Population filter
(optional, tibble file or object) This tibble of individual's
metadata must contain
-four colums: SAMPLE_TYPE, REPUNIT, COLLECTION, INDIVIDUALS
. Those columns
+four columns: SAMPLE_TYPE, REPUNIT, COLLECTION, INDIVIDUALS
. Those columns
are described in rubias.
Default:strata = NULL
. With default, SAMPLE_TYPE
is filled
with reference
. REPUNIT
and COLLECTION
will be filled by
diff --git a/docs/reference/write_snprelate.html b/docs/reference/write_snprelate.html
index 61d31494..785b2815 100644
--- a/docs/reference/write_snprelate.html
+++ b/docs/reference/write_snprelate.html
@@ -75,7 +75,7 @@
diff --git a/docs/reference/write_stockr.html b/docs/reference/write_stockr.html
index 8ecabd67..87762273 100644
--- a/docs/reference/write_stockr.html
+++ b/docs/reference/write_stockr.html
@@ -74,7 +74,7 @@
diff --git a/docs/reference/write_structure.html b/docs/reference/write_structure.html
index 6b8c62c6..5f349eb4 100644
--- a/docs/reference/write_structure.html
+++ b/docs/reference/write_structure.html
@@ -75,7 +75,7 @@
diff --git a/docs/reference/write_vcf.html b/docs/reference/write_vcf.html
index b6bf997a..49314286 100644
--- a/docs/reference/write_vcf.html
+++ b/docs/reference/write_vcf.html
@@ -75,7 +75,7 @@
diff --git a/man/.DS_Store b/man/.DS_Store
new file mode 100644
index 00000000..5008ddfc
Binary files /dev/null and b/man/.DS_Store differ
diff --git a/man/discard_monomorphic_markers.Rd b/man/discard_monomorphic_markers.Rd
deleted file mode 100644
index d2552073..00000000
--- a/man/discard_monomorphic_markers.Rd
+++ /dev/null
@@ -1,30 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/radiator_deprecated_functions.R
-\name{discard_monomorphic_markers}
-\alias{discard_monomorphic_markers}
-\title{Discard monomorphic markers}
-\usage{
-discard_monomorphic_markers(data, verbose = FALSE)
-}
-\arguments{
-\item{data}{A tidy data frame object in the global environment or
-a tidy data frame in wide or long format in the working directory.
-\emph{How to get a tidy data frame ?}
-Look into \pkg{radiator} \code{\link{tidy_genomic_data}}.}
-
-\item{verbose}{(optional, logical) \code{verbose = TRUE} to be chatty
-during execution.
-Default: \code{verbose = FALSE}.}
-}
-\value{
-A list with the filtered input file and the blacklist of markers removed.
-}
-\description{
-Discard monomorphic markers.
-Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
-and might be of interest for users.
-}
-\author{
-Thierry Gosselin \email{thierrygosselin@icloud.com}
-}
-\keyword{internal}
diff --git a/man/filter_dart.Rd b/man/filter_dart.Rd
deleted file mode 100644
index 5e9a0fff..00000000
--- a/man/filter_dart.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/radiator_deprecated_functions.R
-\name{filter_dart}
-\alias{filter_dart}
-\title{filter DArT file}
-\usage{
-filter_dart(...)
-}
-\arguments{
-\item{...}{Read life cycle section below}
-}
-\description{
-Use \code{\link{filter_rad}}.
-}
-\section{Life cycle}{
-
-
-As of radiator v.0.0.22, \code{filter_dart} is now replaced by
-\code{\link{filter_rad}} in an effort to have more meaningful functions names.
-This is THE function to rule them all.
-}
-
diff --git a/man/filter_individual.Rd b/man/filter_individual.Rd
deleted file mode 100644
index cc82388f..00000000
--- a/man/filter_individual.Rd
+++ /dev/null
@@ -1,162 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/radiator_deprecated_functions.R
-\name{filter_individual}
-\alias{filter_individual}
-\title{Individual filter}
-\usage{
-filter_individual(
- interactive.filter = TRUE,
- data,
- strata = NULL,
- ind.approach = "pop",
- ind.threshold = 0.5,
- prob.pop.threshold = 0,
- percent = FALSE,
- filename = NULL,
- parallel.core = parallel::detectCores() - 1,
- ...
-)
-}
-\arguments{
-\item{interactive.filter}{(optional, logical) Do you want the filtering session to
-be interactive. With default: \code{interactive.filter == TRUE}, the user is
-asked to see figures of distribution before making decisions for filtering.}
-
-\item{data}{14 options for input (\strong{diploid data only}): VCFs (SNPs or Haplotypes,
-to make the vcf population ready),
-plink (tped, bed), stacks haplotype file, genind (library(adegenet)),
-genlight (library(adegenet)), gtypes (library(strataG)), genepop, DArT,
-and a data frame in long/tidy or wide format. To verify that radiator detect
-your file format use \code{\link{detect_genomic_format}} (see example below).
-Documented in \strong{Input genomic datasets} of \code{\link{tidy_genomic_data}}.
-
-\strong{DArT and VCF data}: \pkg{radiator} was not meant to generate alleles
-and genotypes if you are using a VCF file with no genotype
-(only genotype likelihood: GL or PL).
-Neither is \pkg{radiator} able to magically generate a genind object
-from a SilicoDArT dataset. Please look at the first few lines of your dataset
-to understand it's limit before asking raditor to convert or filter your dataset.}
-
-\item{strata}{(optional)
-The strata file is a tab delimited file with a minimum of 2 columns headers:
-\code{INDIVIDUALS} and \code{STRATA}. Documented in \code{\link{read_strata}}.
-DArT data: a third column \code{TARGET_ID} is required.
-Documented on \code{\link{read_dart}}. Also use the strata read function to
-blacklist individuals.
-Default: \code{strata = NULL}.}
-
-\item{ind.approach}{(character).
-The approach to filter a marker, is it based on the overall number of
-genotyped individuals (no concept of populations here),
-\code{ind.approach = "overall"}, or
-the number of genotyped individuals per population \code{ind.approach = "pop"}.
-Default: \code{ind.approach = "pop"}.}
-
-\item{ind.threshold}{The individual threshold, proportion, percentage or
-number e.g. 0.70, 70, 15.
-Default: \code{ind.threshold = 0.5}.}
-
-\item{prob.pop.threshold}{(integer, optional) Useful to incorporate problematic
-populations dragging down polymorphism discovery, but still wanted for analysis.
-If individuals with missing genotypes are not managed upstream,
-use this threshold to allow variance in the number of populations passing
-the \code{ind.threshold} argument. e.g. with \code{prob.pop.threshold = 2},
-you tolerate a maximum of 2 populations failing to pass the
-\code{ind.threshold}. Manage after the problematic populations
-downstream with blacklist of individuals and/or missing data imputations. This
-argument is not necessary with \code{ind.approach = "overall"}.
-Default: \code{prob.pop.threshold = 0}. See details for more info.}
-
-\item{percent}{Is the threshold a percentage? TRUE or FALSE.
-This argument is necessary to distinguish percentage from integer individual
-threshold (e.g. 70 percent or 70 individuals).
-Default: \code{percent = FALSE}.}
-
-\item{filename}{(optional) The function uses \code{\link[fst]{write.fst}},
-to write the tidy data frame in
-the folder created in the working directory. The file extension appended to
-the \code{filename} provided is \code{.rad}.
-Default: \code{filename = NULL}.}
-
-\item{parallel.core}{(optional) The number of core used for parallel
-execution during import.
-Default: \code{parallel.core = parallel::detectCores() - 1}.}
-
-\item{...}{(optional) To pass further arguments for fine-tuning the function.}
-}
-\value{
-With \code{interactive.filter = FALSE}, a list in the global environment,
-with 7 objects:
-\enumerate{
-\item $tidy.filtered.ind
-\item $whitelist.markers
-\item $blacklist.markers
-\item $strata
-\item $filters.parameters
-}
-
-With \code{interactive.filter = TRUE}, a list with 2 additionnal objects is created.
-\enumerate{
-\item $plot.ind.threshold
-\item $ind.threshold.helper.table
-}
-
-The object can be isolated in separate object outside the list by
-following the example below.
-}
-\description{
-Filter individuals genotyped at a marker from a tidy data
-set (long format) of any of these file format:
-vcf, plink (tped/tfam), stacks haplotype file, genind,
-genepop, data frame in wide format. The function uses
-\code{\link[radiator]{tidy_genomic_data}} and
-\code{\link[radiator]{tidy_wide}} to load the file. For filtering
-you can consider the overall number of individuals (no concept of populations here),
-or the number of genotyped individuals per pop. The threshold can be a fixed
-number of individuals, a proportion or a percentage.
-}
-\details{
-\strong{Interactive version}
-
-There is 2 steps in the interactive version to visualize and filter
-the data based on the number of genotyped individuals:
-
-Step 1. Impact of individual threshold on marker discovery
-
-Step 2. Choose the filtering approach and thresholds
-
-
-
-\strong{prob.pop.threshold}
-
-If your a regular radiator user, you've seen the \code{pop.num.threshold}.
-\code{prob.pop.threshold}, is a bit different and requires more thinking,
-because the number of populations genotyped potentially vary across markers,
-which make difficult, sometimes, the use of \code{pop.num.threshold}.
-e.g. If only 2 populations out of 10 are genotyped for a marker and you use
-\code{pop.num.threshold = 0.6} this could lead to inconsistensis. Thus,
-it's easier to use the number of population you are willing to accept
-failling to conform to the threshold.
-}
-\examples{
-\dontrun{
-turtle.ind <- filter_individual(
-data = "turtle.vcf",
-strata = "turtle.strata.tsv",
-ind.approach = "pop",
-ind.threshold = 50,
-percent = TRUE,
-prob.pop.threshold = 2,
-filename = "tidy.data.turtle"
-)
-
-#If interactive.filter = TRUE, a list is created and to view the filtered tidy data:
-tidy.data <- turtle.ind$tidy.filtered.ind
-
-#Inside the same list, to isolate the blacklist.genotypes:
-bg <- turtle.ind$blacklist.genotypes
-
-# The remaining argument are used in tidy_genomic_data during import and allow
-# the user to apply some filtering or selection before doing the filtering.
-}
-}
diff --git a/man/filter_population.Rd b/man/filter_population.Rd
deleted file mode 100644
index 576d7f97..00000000
--- a/man/filter_population.Rd
+++ /dev/null
@@ -1,125 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/radiator_deprecated_functions.R
-\name{filter_population}
-\alias{filter_population}
-\title{Population filter}
-\usage{
-filter_population(
- interactive.filter = TRUE,
- data,
- pop.threshold = 100,
- percent = TRUE,
- filename = NULL,
- strata = NULL,
- parallel.core = parallel::detectCores() - 1,
- ...
-)
-}
-\arguments{
-\item{interactive.filter}{(optional, logical) Do you want the filtering session to
-be interactive. With default: \code{interactive.filter == TRUE}, the user is
-asked to see figures of distribution before making decisions for filtering.}
-
-\item{data}{14 options for input (\strong{diploid data only}): VCFs (SNPs or Haplotypes,
-to make the vcf population ready),
-plink (tped, bed), stacks haplotype file, genind (library(adegenet)),
-genlight (library(adegenet)), gtypes (library(strataG)), genepop, DArT,
-and a data frame in long/tidy or wide format. To verify that radiator detect
-your file format use \code{\link{detect_genomic_format}} (see example below).
-Documented in \strong{Input genomic datasets} of \code{\link{tidy_genomic_data}}.
-
-\strong{DArT and VCF data}: \pkg{radiator} was not meant to generate alleles
-and genotypes if you are using a VCF file with no genotype
-(only genotype likelihood: GL or PL).
-Neither is \pkg{radiator} able to magically generate a genind object
-from a SilicoDArT dataset. Please look at the first few lines of your dataset
-to understand it's limit before asking raditor to convert or filter your dataset.}
-
-\item{pop.threshold}{The population threshold, proportion, percentage or
-number e.g. 0.70, 70, 15.
-Default: \code{pop.threshold = 100}.}
-
-\item{percent}{Is the threshold a percentage? TRUE or FALSE.
-This argument is necessary to distinguish percentage from integer population
-threshold (e.g. 50 percent or 50 populations).
-Default: \code{percent = TRUE}.}
-
-\item{filename}{(optional) The function uses \code{\link[fst]{write.fst}},
-to write the tidy data frame in
-the folder created in the working directory. The file extension appended to
-the \code{filename} provided is \code{.rad}.
-Default: \code{filename = NULL}.}
-
-\item{strata}{(optional)
-The strata file is a tab delimited file with a minimum of 2 columns headers:
-\code{INDIVIDUALS} and \code{STRATA}. Documented in \code{\link{read_strata}}.
-DArT data: a third column \code{TARGET_ID} is required.
-Documented on \code{\link{read_dart}}. Also use the strata read function to
-blacklist individuals.
-Default: \code{strata = NULL}.}
-
-\item{parallel.core}{(optional) The number of core used for parallel
-execution during import.
-Default: \code{parallel.core = parallel::detectCores() - 1}.}
-
-\item{...}{(optional) To pass further arguments for fine-tuning the function.}
-}
-\value{
-With \code{interactive.filter = FALSE}, a list in the global environment,
-with 7 objects:
-\enumerate{
-\item $tidy.filtered.ind
-\item $whitelist.markers
-\item $blacklist.markers
-\item $strata
-\item $filters.parameters
-}
-
-With \code{interactive.filter = TRUE}, a list with 2 additionnal objects is created.
-\enumerate{
-\item $plot.pop.threshold
-\item $pop.threshold.helper.table
-}
-
-The object can be isolated in separate object outside the list by
-following the example below.
-}
-\description{
-Filter markers based on populations genotyped. Use a tidy data
-set (long format) of any of these file format:
-vcf, plink (tped/tfam), stacks haplotype file, genind,
-genepop, data frame in wide format. The function uses
-\code{\link[radiator]{tidy_genomic_data}} and
-\code{\link[radiator]{tidy_wide}} to load the file. For filtering
-The threshold can be a fixed number of population, a proportion or a percentage.
-}
-\details{
-\strong{Interactive version}
-
-There is 2 steps in the interactive version to visualize and filter
-the data based on the population representation:
-
-Step 1. Impact of population threshold on marker discovery
-
-Step 2. Choose the filtering threshold
-}
-\examples{
-\dontrun{
-turtle.pop <- filter_population(
-data = "turtle.vcf",
-strata = "turtle.strata.tsv",
-pop.thresholds = 100,
-percent = TRUE,
-filename = "tidy.data.turtle.tsv"
-)
-
-#If interactive.filter = TRUE, a list is created and to view the filtered tidy data:
-tidy.data <- turtle.ind$tidy.filtered.ind
-
-#Inside the same list, to isolate the blacklist.genotypes:
-bg <- turtle.ind$blacklist.genotypes
-
-# The remaining argument are used in tidy_genomic_data during import and allow
-# the user to apply some filtering or selection before doing the filtering.
-}
-}
diff --git a/man/filter_rad.Rd b/man/filter_rad.Rd
index 727cf3cd..af8963f2 100644
--- a/man/filter_rad.Rd
+++ b/man/filter_rad.Rd
@@ -52,6 +52,12 @@ snprelate, maverick, genepopedit, rubias, hapmap and dadi.
Use a character string,
e.g. \code{output = c("genind", "genepop", "structure")}, to have preferred
output formats generated. With default, only the tidy format is generated.
+
+Make sure to read the particularities of each format, some might
+requires extra columns in the strata file. You can find the info in the
+corresponding \emph{write_} functions of radiator
+(\href{https://thierrygosselin.github.io/radiator/reference/index.html}{reference}).
+
Default: \code{output = NULL}.}
\item{filename}{(optional) The filename prefix for the objet in the global environment
diff --git a/man/genomic_converter.Rd b/man/genomic_converter.Rd
index 634d70d7..b9eb664d 100644
--- a/man/genomic_converter.Rd
+++ b/man/genomic_converter.Rd
@@ -46,6 +46,12 @@ snprelate, maverick, genepopedit, rubias, hapmap and dadi.
Use a character string,
e.g. \code{output = c("genind", "genepop", "structure")}, to have preferred
output formats generated. With default, only the tidy format is generated.
+
+Make sure to read the particularities of each format, some might
+requires extra columns in the strata file. You can find the info in the
+corresponding \emph{write_} functions of radiator
+(\href{https://thierrygosselin.github.io/radiator/reference/index.html}{reference}).
+
Default: \code{output = NULL}.}
\item{filename}{(optional) The filename prefix for the object in the global environment
diff --git a/man/keep_common_markers.Rd b/man/keep_common_markers.Rd
deleted file mode 100644
index d1d0c6e9..00000000
--- a/man/keep_common_markers.Rd
+++ /dev/null
@@ -1,43 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/radiator_deprecated_functions.R
-\name{keep_common_markers}
-\alias{keep_common_markers}
-\title{Keep markers in common between all populations}
-\usage{
-keep_common_markers(data, plot = FALSE, verbose = FALSE)
-}
-\arguments{
-\item{data}{A tidy data frame object in the global environment or
-a tidy data frame in wide or long format in the working directory.
-\emph{How to get a tidy data frame ?}
-Look into \pkg{radiator} \code{\link{tidy_genomic_data}}.}
-
-\item{plot}{(optional, logical) \code{plot = TRUE} will produce an
-\href{https://github.com/hms-dbmi/UpSetR}{UpSet plot} to visualize the number
-of markers between populations. The package is required for this to work...
-Default: \code{plot = FALSE}.}
-
-\item{verbose}{(optional, logical) \code{verbose = TRUE} to be chatty
-during execution.
-Default: \code{verbose = FALSE}.}
-}
-\value{
-A list with the filtered input data set,
-the whitelist of markers in common and the blacklist of markers discarded.
-}
-\description{
-Keep markers in common between all populations.
-Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
-and might be of interest for users.
-}
-\section{Life cycle}{
-
-
-As of radiator v.0.0.22, keep.common.markers function is deprecated and was replaced by
-\code{\link{filter_common_markers}} in an effort to have more meaningful functions names.
-}
-
-\author{
-Thierry Gosselin \email{thierrygosselin@icloud.com}
-}
-\keyword{internal}
diff --git a/man/write_rubias.Rd b/man/write_rubias.Rd
index 7367d1e1..916236a8 100644
--- a/man/write_rubias.Rd
+++ b/man/write_rubias.Rd
@@ -25,7 +25,7 @@ Look into \code{\link{tidy_genomic_data}},
\item{strata}{(optional, tibble file or object) This tibble of individual's
metadata must contain
-four colums: \code{SAMPLE_TYPE, REPUNIT, COLLECTION, INDIVIDUALS}. Those columns
+four columns: \code{SAMPLE_TYPE, REPUNIT, COLLECTION, INDIVIDUALS}. Those columns
are described in \strong{rubias}.
Default:\code{strata = NULL}. With default, \code{SAMPLE_TYPE} is filled
with \code{reference}. \code{REPUNIT} and \code{COLLECTION} will be filled by