From 2d4250e68d94510a52c0015a401e9b149019d8bb Mon Sep 17 00:00:00 2001 From: Thierry Gosselin Date: Thu, 25 Feb 2016 17:10:52 -0500 Subject: [PATCH] **v.0.1.7** * New input file: Re-introduced the haplotype data frame file from stacks. * Argument name change: `imputations` is now `impute.method`. * New argument: `impute` with 2 options: `impute = "genotype"` or `impute = "allele"`. --- DESCRIPTION | 4 +- R/assignment_ngs.R | 681 +++++++++++++++++++++++++++++------------- README.md | 26 +- man/assignment_ngs.Rd | 19 +- 4 files changed, 517 insertions(+), 213 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4d4cf48..15a00ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: assigner Type: Package Title: Assignment Analysis with GBS/RADseq Data using R -Version: 0.1.6 -Date: 2015-02-24 +Version: 0.1.7 +Date: 2015-02-25 Encoding: UTF-8 Authors@R: c( person("Thierry", "Gosselin", email = "thierrygosselin@icloud.com", role = c("aut", "cre"))) diff --git a/R/assignment_ngs.R b/R/assignment_ngs.R index 9bc0716..a6855df 100644 --- a/R/assignment_ngs.R +++ b/R/assignment_ngs.R @@ -69,8 +69,9 @@ #' choosing among these 3 options: \code{"random"} selection, \code{"first"} or #' \code{"last"} SNP on the same short read/haplotype. #' Default: \code{snp.ld = NULL}. -#' Note that for other file type, use PLINK with the linkage disequilibrium -#' based SNP pruning option. +#' Note that for other file type, use stackr package for haplotype file and +#' create a whitelist, for plink and data frames, use PLINK linkage +#' disequilibrium based SNP pruning option. #' @param common.markers (optional) Logical. Default = \code{FALSE}. #' With \code{TRUE}, will keep markers genotyped in all the populations. @@ -78,7 +79,9 @@ #' @param maf.thresholds (string, double, optional) String with #' local/populations and global/overall maf thresholds, respectively. #' Default: \code{maf.thresholds = NULL}. -#' e.g. \code{maf.thresholds = c(0.05, 0.1)} for a local maf threshold of 0.05 and a global threshold of 0.1. +#' e.g. \code{maf.thresholds = c(0.05, 0.1)} for a local maf threshold +#' of 0.05 and a global threshold of 0.1. Available for VCF, PLINK and data frame +#' files. Use stackr for haplotypes files. #' @param maf.pop.num.threshold (integer, optional) When maf thresholds are used, #' this argument is for the number of pop required to pass the maf thresholds #' to keep the locus. Default: \code{maf.pop.num.threshold = 1} @@ -136,7 +139,7 @@ #' @param folder (optional) The name of the folder created in the working directory to save the files/results. #' @param gsi_sim.filename (optional) The name of the file written to the directory. -#' Use the extension ".txt" at the end. Default \code{gsi_sim_data.txt}. +#' Use the extension ".txt" at the end. Default \code{assignment_data.txt}. #' The number of markers used will be appended to the name of the file. #' @param keep.gsi.files (Boolean) Default \code{FALSE} The input and output gsi_sim files #' will be deleted from the directory when finished processing. @@ -178,10 +181,14 @@ #' With \code{subsample = 20} and \code{iteration.subsample = 10}, #' 20 individuals/populations will be randomly chosen 10 times. -#' @param imputations Should a map-independent imputations of markers be + +#' @param imputation.method Should a map-independent imputations of markers be #' computed. Available choices are: (1) \code{FALSE} for no imputation. #' (2) \code{"max"} to use the most frequent category for imputations. -#' (3) \code{"rf"} using Random Forest algorithm. Default = \code{FALSE}. +#' (3) \code{"rf"} using Random Forest algorithm. +#' Default: \code{imputation.method = FALSE}. +#' @param impute (character) Imputation on missing genotype +#' \code{impute = "genotype"} or alleles \code{impute = "allele"}. #' @param imputations.group \code{"global"} or \code{"populations"}. #' Should the imputations be computed globally or by populations. If you choose #' global, turn the verbose to \code{TRUE}, to see progress. @@ -302,23 +309,23 @@ # required to pass the R CMD check and have 'no visible binding for global variable' if (getRversion() >= "2.15.1") { utils::globalVariables( - c('ID', '#CHROM', 'CHROM', 'FORMAT', 'INDIVIDUALS', 'FORMAT_ID', 'LOCUS', - 'POS', 'REF', 'ALT', 'POP_ID', 'READ_DEPTH', 'ALLELE_DEPTH', 'GL', - 'ERASE', 'GT', 'MARKERS', 'QQ', 'PQ', 'N', 'MAF_GLOBAL', 'MAF_LOCAL', - 'ALLELES', 'POP_ID', 'GT', 'INDIVIDUALS', 'MARKERS', 'POP_ID', 'nal', - 'ALLELES_GROUP', 'ALLELES', 'N_IND_GENE', 'P', 'N', 'nal_sq', - 'nal_sq_sum', 'nal_sq_sum_nt', 'npl', 'het', 'mho', 'mhom', 'dum', - 'dum1', 'SSG', 'ntal', 'SSP', 'ntalb', 'SSi', 'MSI', 'sigw', 'MSP', - 'siga', 'sigb', 'lsiga', 'lsigb', 'lsigw', 'FST', 'MARKERS', - 'MARKERS_ALLELES', 'ALLELES', 'POP_ID', 'INDIVIDUALS', 'filename', - 'ID', 'KEEPER', 'ASSIGN', 'OTHERS', 'CURRENT', 'INFERRED', - 'SECOND_BEST_POP', 'SCORE', 'SECOND_BEST_SCORE', 'NUMBER', 'INDIVIDUALS_ALLELES', - 'MARKER_NUMBER', 'MISSING_DATA', 'TOTAL', 'ASSIGNMENT_PERC', - 'MARKERS', 'CURRENT', 'INFERRED', 'MISSING_DATA', - 'ITERATIONS', 'METHOD', 'TOTAL', 'MEAN_i', 'MEAN', 'ASSIGNMENT_PERC', - 'SE', 'MEDIAN', 'MIN', 'MAX', 'QUANTILE25', 'QUANTILE75', 'SE_MIN', - 'SE_MAX', '.', 'QUAL', 'FILTER', 'INFO', 'pb', 'SUBSAMPLE', 'STRATA', - 'sum.pop', 'A1', 'A2', 'INDIVIDUALS_2' + c("ID", "#CHROM", "CHROM", "FORMAT", "INDIVIDUALS", "FORMAT_ID", "LOCUS", + "POS", "REF", "ALT", "POP_ID", "READ_DEPTH", "ALLELE_DEPTH", "GL", + "ERASE", "GT", "MARKERS", "QQ", "PQ", "N", "MAF_GLOBAL", "MAF_LOCAL", + "ALLELES", "POP_ID", "GT", "INDIVIDUALS", "MARKERS", "POP_ID", "nal", + "ALLELES_GROUP", "ALLELES", "N_IND_GENE", "P", "N", "nal_sq", + "nal_sq_sum", "nal_sq_sum_nt", "npl", "het", "mho", "mhom", "dum", + "dum1", "SSG", "ntal", "SSP", "ntalb", "SSi", "MSI", "sigw", "MSP", + "siga", "sigb", "lsiga", "lsigb", "lsigw", "FST", "MARKERS", + "MARKERS_ALLELES", "ALLELES", "POP_ID", "INDIVIDUALS", "filename", + "ID", "KEEPER", "ASSIGN", "OTHERS", "CURRENT", "INFERRED", + "SECOND_BEST_POP", "SCORE", "SECOND_BEST_SCORE", "NUMBER", "INDIVIDUALS_ALLELES", + "MARKER_NUMBER", "MISSING_DATA", "TOTAL", "ASSIGNMENT_PERC", + "MARKERS", "CURRENT", "INFERRED", "MISSING_DATA", + "ITERATIONS", "METHOD", "TOTAL", "MEAN_i", "MEAN", "ASSIGNMENT_PERC", + "SE", "MEDIAN", "MIN", "MAX", "QUANTILE25", "QUANTILE75", "SE_MIN", + "SE_MAX", ".", "QUAL", "FILTER", "INFO", "pb", "SUBSAMPLE", "STRATA", + "sum.pop", "A1", "A2", "INDIVIDUALS_2", "Cnt", "Catalog ID", "GROUP" ) ) } @@ -348,9 +355,10 @@ assignment_ngs <- function(data, thl = 1, iteration.method = 10, folder, - gsi_sim.filename = "gsi_sim_data.txt", + gsi_sim.filename = "assignment_data.txt", keep.gsi.files, - imputations = FALSE, + imputation.method = FALSE, + impute = "genotypes", imputations.group = "populations", num.tree = 100, iteration.rf = 10, @@ -387,10 +395,11 @@ assignment_ngs <- function(data, if (missing(iteration.method)) iteration.method <- 10 if (sampling.method == "ranked" & thl == "all") iteration.method <- 1 if (sampling.method == "ranked" & thl == 1) iteration.method <- 1 - if (missing(gsi_sim.filename)) gsi_sim.filename <- "gsi_sim_data.txt" + if (missing(gsi_sim.filename)) gsi_sim.filename <- "assignment_data.txt" if (missing(keep.gsi.files)) keep.gsi.files <- FALSE - if (missing(imputations)) imputations <- FALSE + if (missing(imputation.method)) imputation.method <- FALSE if (missing(imputations.group)) imputations.group <- "populations" + if (missing(impute)) impute <- "genotypes" if (missing(num.tree)) num.tree <- 100 if (missing(iteration.rf)) iteration.rf <- 10 if (missing(split.number)) split.number <- 100 @@ -403,17 +412,30 @@ assignment_ngs <- function(data, if (identical(data.type, "##fileformat=VCF") | stri_detect_fixed(str = data, pattern = ".vcf")) { data.type <- "vcf.file" message("File type: VCF") - } else if (stri_detect_fixed(str = data, pattern = ".tped")) { + } + + if (stri_detect_fixed(str = data, pattern = ".tped")) { data.type <- "plink.file" message("File type: PLINK") if (!file.exists(stri_replace_all_fixed(str = data, pattern = ".tped", replacement = ".tfam", vectorize_all = FALSE))) { stop("Missing tfam file with the same prefix as your tped") } - } else { + } + + if (stri_detect_fixed(str = data.type, pattern = "POP_ID") | stri_detect_fixed(str = data.type, pattern = "INDIVIDUALS")) { data.type <- "df.file" message("File type: data frame of genotype") } + if (stri_detect_fixed(str = data.type, pattern = "Catalog")) { + data.type <- "haplo.file" + message("File type: haplotypes from stacks") + if (is.null(blacklist.genotype)) { + stop("blacklist.genotype file missing. + Use stackr's missing_genotypes function to create this blacklist") + } + } + # Create a folder based on filename to save the output files ***************** if (is.null(folder)) { # Get date and time to have unique filenaming @@ -421,13 +443,13 @@ assignment_ngs <- function(data, file.date <- stri_replace_all_fixed(file.date, pattern = c("-", " ", ":"), replacement = c("", "@", ""), vectorize_all = FALSE) file.date <- stri_sub(file.date, from = 1, to = 13) - if (imputations == "FALSE") { + if (imputation.method == "FALSE") { message("Map-imputation: no") directory <- stri_join(getwd(),"/", "assignment_analysis_", "method_", sampling.method, "_no_imputations_", file.date, "/", sep = "") dir.create(file.path(directory)) } else { message("Map-imputation: yes") - directory <- stri_join(getwd(),"/","assignment_analysis_", "method_", sampling.method, "_imputations_", imputations,"_", imputations.group, "_", file.date, "/", sep = "") + directory <- stri_join(getwd(),"/","assignment_analysis_", "method_", sampling.method, "_imputations_", imputation.method,"_", imputations.group, "_", file.date, "/", sep = "") dir.create(file.path(directory)) } message(stri_join("Folder: ", directory)) @@ -450,6 +472,11 @@ assignment_ngs <- function(data, } } + if (data.type == "haplo.file") { + whitelist.markers <- select(.data = whitelist.markers, LOCUS) + columns.names.whitelist <- colnames(whitelist.markers) + } + # Import blacklist id ******************************************************** if (is.null(blacklist.id)) { # No blacklist of ID message("Blacklisted individuals: no") @@ -626,12 +653,10 @@ assignment_ngs <- function(data, mutate(INDIVIDUALS = stri_replace_all_fixed(str = INDIVIDUALS_ALLELES, pattern = c("_A1", "_A2"), replacement = "", vectorize_all = FALSE)) %>% left_join(strata.df, by = "INDIVIDUALS") %>% mutate( - # POP_ID = stri_replace_all_fixed(POP_ID, pop.levels, pop.labels, vectorize_all = FALSE), # test POP_ID = factor(POP_ID, levels = pop.levels, ordered =TRUE), - # POP_ID = factor(POP_ID, levels = unique(pop.labels), ordered =TRUE), # test GT = stri_pad_left(str = GT, width = 3, pad = "0") ) - + # Pop select if (!is.null(pop.select)) { message(stri_join(length(pop.select), "population(s) selected", sep = " ")) @@ -725,6 +750,58 @@ assignment_ngs <- function(data, } } # End import data frame of genotypes + if (data.type == "haplo.file") { # Haplotype file + message("Importing the stacks haplotype file") + input <- data.table::fread( + input = data, + sep = "\t", + header = TRUE, + stringsAsFactors = FALSE, + verbose = FALSE, + showProgress = TRUE, + data.table = FALSE + ) %>% + as_data_frame() %>% + select(-Cnt) %>% + rename(LOCUS = `Catalog ID`) %>% + tidyr::gather(INDIVIDUALS, GT, -LOCUS) + + # Filter with whitelist of markers + if (!is.null(whitelist.markers)) { + message("Filtering with whitelist of markers") + input <- suppressWarnings(semi_join(input, whitelist.markers, by = columns.names.whitelist)) + } + + # Filter with blacklist of individuals + if (!is.null(blacklist.id)) { + message("Filtering with blacklist of individuals") + input <- suppressWarnings(anti_join(input, blacklist.id, by = "INDIVIDUALS")) + } + + if (is.null(strata)){ + input <- input %>% + mutate( # Make population ready + POP_ID = substr(INDIVIDUALS, pop.id.start, pop.id.end), + POP_ID = factor(stri_replace_all_fixed(POP_ID, pop.levels, pop.labels, vectorize_all = FALSE), levels = unique(pop.labels), ordered = TRUE), + INDIVIDUALS = as.character(INDIVIDUALS) + ) + } else { # Make population ready with the strata provided + strata.df <- read_tsv(file = strata, col_names = TRUE, col_types = "cc") %>% + rename(POP_ID = STRATA) + + input <- input %>% + mutate(INDIVIDUALS = as.character(INDIVIDUALS)) %>% + left_join(strata.df, by = "INDIVIDUALS") %>% + mutate(POP_ID = factor(POP_ID, levels = unique(pop.labels), ordered =TRUE)) + } + + # Pop select + if (!is.null(pop.select)) { + message(stri_join(length(pop.select), "population(s) selected", sep = " ")) + input <- suppressWarnings(input %>% filter(POP_ID %in% pop.select)) + } + } # End import haplotypes file + # Blacklist genotypes ******************************************************** if (is.null(blacklist.genotype)) { # no Whitelist message("Erasing genotype: no") @@ -736,6 +813,11 @@ assignment_ngs <- function(data, columns.names.blacklist.genotype$CHROM <- as.character(columns.names.blacklist.genotype$CHROM) } + if (data.type == "haplo.file") { + blacklist.genotype <- select(.data = blacklist.genotype, INDIVIDUALS, LOCUS) + columns.names.blacklist.genotype <- colnames(blacklist.genotype) + } + # control check to keep only individuals in pop.select if (!is.null(pop.select)) { message("Control check to keep only individuals present in pop.select") @@ -765,13 +847,12 @@ assignment_ngs <- function(data, # control check to keep only whitelisted markers from the blacklist of genotypes if (!is.null(whitelist.markers)) { blacklist.genotype <- blacklist.genotype - message("Control check to keep only whitelisted markers -present in the blacklist of genotypes to erase.") + message("Control check to keep only whitelisted markers present in the blacklist of genotypes to erase.") # updating the whitelist of markers to have all columns that id markers if (data.type == "vcf.file"){ whitelist.markers.ind <- input %>% select(CHROM, LOCUS, POS, INDIVIDUALS) %>% distinct(CHROM, LOCUS, POS, INDIVIDUALS) } else { - whitelist.markers.ind <- input %>% select(LOCUS) %>% distinct(LOCUS) + whitelist.markers.ind <- input %>% select(LOCUS, INDIVIDUALS) %>% distinct(LOCUS, INDIVIDUALS) } # updating the blacklist.genotype @@ -781,8 +862,7 @@ present in the blacklist of genotypes to erase.") # control check to remove blacklisted individuals from the blacklist of genotypes if (!is.null(blacklist.id)) { - message("Control check to remove blacklisted individuals -present in the blacklist of genotypes to erase.") + message("Control check to remove blacklisted individuals present in the blacklist of genotypes to erase.") blacklist.genotype <- suppressWarnings(anti_join(blacklist.genotype, blacklist.id, by = "INDIVIDUALS")) columns.names.blacklist.genotype <- colnames(blacklist.genotype) } @@ -815,12 +895,19 @@ present in the blacklist of genotypes to erase.") select(-ERASE) } + if (data.type == "haplo.file") { + input <- input %>% + mutate(GT = ifelse(ERASE == "erase", "-", GT)) %>% + select(-ERASE) + } + } # End erase genotypes # dump unused object blacklist.id <- NULL whitelist.markers <- NULL whitelist.markers.ind <- NULL + blacklist.genotype <- NULL # subsampling data *********************************************************** # Function: @@ -896,8 +983,9 @@ present in the blacklist of genotypes to erase.") # LD control... keep only 1 SNP per haplotypes/reads (optional) ************ if (!is.null(snp.ld)) { if (data.type != "vcf.file") { - stop("snp.ld is only available for VCF file, for other file type, use - PLINK use the linkage disequilibrium based SNP pruning option") + stop("snp.ld is only available for VCF file, use stackr package for +haplotype file and create a whitelist, for other file type, use + PLINK linkage disequilibrium based SNP pruning option") } message("Minimizing LD...") snp.locus <- input %>% select(LOCUS, POS) %>% distinct(POS) @@ -930,7 +1018,8 @@ present in the blacklist of genotypes to erase.") message("Filtering the tidy VCF to minimize LD by keeping only 1 SNP per short read/haplotype") } # End of snp.ld control - # Unique markers id: combine CHROM, LOCUS and POS into MARKERS ************* + # Unique markers id: for VCF combine CHROM, LOCUS and POS into MARKERS ***** + # For other type of file change LOCUS to MARKERS if (data.type != "vcf.file") { input <- input %>% rename(MARKERS = LOCUS) } else { @@ -945,38 +1034,20 @@ present in the blacklist of genotypes to erase.") # Markers in common between all populations (optional) ********************* if (common.markers == TRUE) { # keep only markers present in all pop - message("Using markers common in all populations") + message("Using markers common in all populations:") pop.number <- n_distinct(input$POP_ID) - if (data.type == "df.file") { # for data frame input - pop.filter <- input %>% - filter(GT != "000000") %>% - group_by(MARKERS) %>% - filter(n_distinct(POP_ID) == pop.number) %>% - arrange(MARKERS) %>% - select(MARKERS) %>% - distinct(MARKERS) - } - - if (data.type == "vcf.file") { # for vcf input - pop.filter <- input %>% - filter(GT != "./.") %>% - group_by(MARKERS) %>% - filter(n_distinct(POP_ID) == pop.number) %>% - arrange(MARKERS) %>% - select(MARKERS) %>% - distinct(MARKERS) - } + if (data.type == "vcf.file") pop.filter <- input %>% filter(GT != "./.") + if (data.type == "plink.file") pop.filter <- input %>% filter(GT != "000") + if (data.type == "df.file") pop.filter <- input %>% filter(GT != "000000") + if (data.type == "haplo.file") pop.filter <- input %>% filter(GT != "-") - if (data.type == "plink.file") { # for PLINK input - pop.filter <- input %>% - filter(GT != "000") %>% - group_by(MARKERS) %>% - filter(n_distinct(POP_ID) == pop.number) %>% - arrange(MARKERS) %>% - select(MARKERS) %>% - distinct(MARKERS) - } + pop.filter <- pop.filter %>% + group_by(MARKERS) %>% + filter(n_distinct(POP_ID) == pop.number) %>% + arrange(MARKERS) %>% + select(MARKERS) %>% + distinct(MARKERS) message(stri_join("Number of original markers = ", n_distinct(input$MARKERS), "\n", "Number of markers present in all the populations = ", @@ -988,6 +1059,7 @@ present in the blacklist of genotypes to erase.") pop.filter <- NULL # ununsed object } # End common markers + # Minor Allele Frequency filter ******************************************** # maf.thresholds <- c(0.05, 0.1) # test if (!is.null(maf.thresholds)) { # with MAF @@ -1020,11 +1092,25 @@ present in the blacklist of genotypes to erase.") maf.global <- NULL } # end maf calculations with vcf - if (data.type != "vcf.file") { + if (data.type == "plink.file" | data.type == "df.file") { message("Calculating global and local MAF, this may take some time on large data set") - maf.data <- input %>% - select(MARKERS, GT, POP_ID) %>% - filter(GT != "000") %>% + + # For data frame we split the alleles here to prep for MAF + if (data.type == "df.file") { # for data frame of genotypes + maf.data <- input %>% + tidyr::separate(data = ., col = GT, into = .(A1, A2), sep = 3, remove = TRUE) %>% + tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% + select(MARKERS, GT, POP_ID) %>% + filter(GT != "000") + } + + if (data.type == "plink.file") { # For PLINK and common code below + maf.data <- input %>% + select(MARKERS, GT, POP_ID) %>% + filter(GT != "000") + } + + maf.data <- maf.data %>% group_by(MARKERS, GT, POP_ID) %>% tally %>% arrange(MARKERS, GT) %>% @@ -1041,6 +1127,11 @@ present in the blacklist of genotypes to erase.") select(MARKERS, POP_ID, MAF_LOCAL, MAF_GLOBAL) }# end maf calculations with PLINK or data frame of genotypes + if (data.type == "haplo.file") { + stop("MAF filtering is only available for haplotype file, use stackr +package and update your whitelist") + } + write_tsv(x = maf.data, path = paste0(directory.subsample,"maf.data.tsv"), col_names = TRUE, @@ -1158,73 +1249,187 @@ present in the blacklist of genotypes to erase.") ) %>% arrange(MARKERS, POP_ID) %>% select(-c(REF, ALT)) - } - - # more prep for the no imputation section - if (data.type == "vcf.file") { # for VCF input + gsim.prep <- input %>% tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_") %>% # separate the genotypes into alleles tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID)) - } + } - if (data.type == "df.file") { # for data frame of genotypes + if (data.type == "df.file") { # For data frame of genotypes + message("Recoding genotypes for gsi_sim") gsim.prep <- input %>% - tidyr::separate(col = GT, into = c("A1", "A2"), sep = 3) %>% # separate the genotypes into alleles - tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID)) + tidyr::separate(data = ., col = GT, into = .(A1, A2), sep = 3, remove = TRUE) %>% + tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) } if (data.type == "plink.file") { # for PLINK + message("Recoding genotypes for gsi_sim") gsim.prep <- input %>% tidyr::separate(col = INDIVIDUALS_ALLELES, into = c("INDIVIDUALS_2", "ALLELES"), sep = "_", remove = TRUE) %>% select(-INDIVIDUALS_2) } - - # Imputations ************************************************************** - if (imputations != "FALSE") { - if (data.type == "vcf.file") { # for VCF input - input.prep <- input %>% + if (data.type == "haplo.file") { # for haplotypes + message("Recoding genotypes for gsi_sim") + gsim.prep <- suppressWarnings( + input %>% mutate( - GT = stri_replace_all_fixed(GT, pattern = "0_0", replacement = "NA", vectorize_all = FALSE), + GT = stri_replace_all_fixed(GT, "-", "NA", + vectorize_all=F), GT = replace(GT, which(GT == "NA"), NA) + ) %>% + arrange(MARKERS) + ) + + gsim.prep <- suppressWarnings( + gsim.prep %>% + tidyr::separate( + col = GT, into = c("A1", "A2"), + sep = "/", extra = "drop", remove = TRUE ) %>% - group_by(INDIVIDUALS, POP_ID) %>% + mutate(A2 = ifelse(is.na(A2), A1, A2)) %>% + tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% + group_by(INDIVIDUALS, POP_ID, ALLELES) %>% tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() + ) + + gsim.prep <- suppressWarnings( + gsim.prep %>% + plyr::colwise(.fun = factor, exclude = NA)(.) + ) + + gsim.prep <- suppressWarnings( + gsim.prep %>% + mutate(GROUP = rep(1, times = nrow(.))) %>% + group_by(GROUP) %>% + mutate_each(funs(as.integer), -c(INDIVIDUALS, POP_ID, ALLELES, GROUP)) %>% ungroup() %>% - arrange(POP_ID, INDIVIDUALS) + select(-GROUP) %>% + tidyr::gather(data = ., key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% + mutate(GT = stri_replace_na(str = GT, replacement = "0")) %>% + mutate(POP_ID = droplevels(POP_ID)) + ) + } + + # Imputations ************************************************************** + if (imputation.method != "FALSE") { + message("Preparing the data for imputations") + if (data.type == "vcf.file") { # for VCF input + if (impute == "genotype") { + input.prep <- input %>% + mutate( + GT = stri_replace_all_fixed(GT, pattern = "0_0", replacement = "NA", vectorize_all = FALSE), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + group_by(INDIVIDUALS, POP_ID) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() %>% + arrange(POP_ID, INDIVIDUALS) + } + + if (impute == "allele") { + input.prep <- gsim.prep %>% + mutate( + GT = stri_replace_all_fixed(GT, pattern = "0", replacement = "NA", vectorize_all = FALSE), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + # tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ":") %>% + group_by(INDIVIDUALS, POP_ID, ALLELES) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() %>% + arrange(POP_ID, INDIVIDUALS) + } + } # End VCF prep file for imputation + if (data.type == "plink.file") { - input.prep <- gsim.prep %>% - tidyr::separate(col = INDIVIDUALS_ALLELES, into = c("INDIVIDUALS_2", "ALLELES"), sep = "_", remove = TRUE) %>% - select(-INDIVIDUALS_2) %>% - group_by(MARKERS, INDIVIDUALS, POP_ID) %>% - tidyr::spread(data = ., key = ALLELES, value = GT) %>% - tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE) %>% - mutate( - GT = stri_replace_all_fixed(GT, pattern = "000000", replacement = "NA", vectorize_all = FALSE), - GT = replace(GT, which(GT == "NA"), NA) - ) %>% - group_by(INDIVIDUALS, POP_ID) %>% - tidyr::spread(data = ., key = MARKERS, value = GT) %>% - ungroup() %>% - arrange(POP_ID, INDIVIDUALS) + + if (impute == "genotype"){ + input.prep <- gsim.prep %>% + group_by(MARKERS, INDIVIDUALS, POP_ID) %>% + tidyr::spread(data = ., key = ALLELES, value = GT) %>% + tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE) %>% + mutate( + GT = stri_replace_all_fixed(GT, pattern = "000000", replacement = "NA", vectorize_all = FALSE), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + group_by(INDIVIDUALS, POP_ID) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() %>% + arrange(POP_ID, INDIVIDUALS) + } + + if (impute == "allele"){ + input.prep <- gsim.prep %>% + mutate( + GT = stri_replace_all_fixed(GT, pattern = "000", replacement = "NA", vectorize_all = FALSE), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + group_by(INDIVIDUALS, POP_ID, ALLELES) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() %>% + arrange(POP_ID, INDIVIDUALS) + } # glimpse(test) } # End PLINK prep file for imputation + if (data.type == "df.file") { # for df input - input.prep <- input %>% - mutate( - GT = stri_replace_all_fixed(GT, pattern = "000000", replacement = "NA", vectorize_all = FALSE), - GT = replace(GT, which(GT == "NA"), NA) - ) %>% - group_by(INDIVIDUALS, POP_ID) %>% - tidyr::spread(data = ., key = MARKERS, value = GT) %>% - ungroup() %>% - arrange(POP_ID, INDIVIDUALS) + if (impute == "genotype") { + input.prep <- input %>% + mutate( + GT = stri_replace_all_fixed(GT, pattern = "000000", replacement = "NA", vectorize_all = FALSE), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + group_by(INDIVIDUALS, POP_ID) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() %>% + arrange(POP_ID, INDIVIDUALS) + } + if (impute == "allele") { + input.prep <- gsim.prep %>% + mutate( + GT = stri_replace_all_fixed(GT, pattern = "000", replacement = "NA", vectorize_all = FALSE), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + group_by(INDIVIDUALS, POP_ID, ALLELES) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() %>% + arrange(POP_ID, INDIVIDUALS) + } } # End data frame prep file for imputation + if (data.type == "haplo.file") { # for df input + if (impute == "genotype") { + input.prep <- gsim.prep %>% + group_by(MARKERS, INDIVIDUALS, POP_ID) %>% + tidyr::spread(data = ., key = ALLELES, value = GT) %>% + tidyr::unite(col = GT, A1, A2, sep = "_", remove = TRUE) %>% + mutate( + GT = stri_replace_all_fixed(GT, pattern = "0_0", replacement = "NA", vectorize_all = FALSE), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + group_by(INDIVIDUALS, POP_ID) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() %>% + arrange(POP_ID, INDIVIDUALS) + } + if (impute == "allele") { + input.prep <- gsim.prep %>% + mutate( + GT = stri_replace_all_fixed(GT, pattern = "0", replacement = "NA", vectorize_all = FALSE), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + group_by(INDIVIDUALS, POP_ID, ALLELES) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() %>% + arrange(POP_ID, INDIVIDUALS) + } + } # End haplotype prep file for imputation + # Imputation with Random Forest - if (imputations == "rf") { + if (imputation.method == "rf") { # Parallel computations options options(rf.cores = parallel.core, mc.cores = parallel.core) @@ -1294,26 +1499,47 @@ present in the blacklist of genotypes to erase.") } # End imputation RF global } # End imputation RF # Imputation using the most common genotype - if (imputations == "max") { # End imputation max + if (imputation.method == "max") { # End imputation max if (imputations.group == "populations") { message("Imputations computed by populations") - input.imp <- suppressWarnings( - input.prep %>% - tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% - group_by(MARKERS, POP_ID) %>% - mutate( - GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE)), - GT = replace(GT, which(GT == "NA"), NA) - ) %>% - # the next 2 steps are necessary to remove introduced NA if some pop don't have the markers - # will take the global observed values by markers for those cases. - group_by(MARKERS) %>% - mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% - group_by(INDIVIDUALS, POP_ID) %>% - tidyr::spread(data = ., key = MARKERS, value = GT) %>% - ungroup() - ) + if (impute == "genotype"){ + input.imp <- suppressWarnings( + input.prep %>% + tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% + group_by(MARKERS, POP_ID) %>% + mutate( + GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE)), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + # the next 2 steps are necessary to remove introduced NA if some pop don't have the markers + # will take the global observed values by markers for those cases. + group_by(MARKERS) %>% + mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% + group_by(INDIVIDUALS, POP_ID) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() + ) + } + + if (impute == "allele"){ + input.imp <- suppressWarnings( + input.prep %>% + tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% + group_by(MARKERS, POP_ID) %>% + mutate( + GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE)), + GT = replace(GT, which(GT == "NA"), NA) + ) %>% + # the next 2 steps are necessary to remove introduced NA if some pop don't have the markers + # will take the global observed values by markers for those cases. + group_by(MARKERS) %>% + mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% + group_by(INDIVIDUALS, POP_ID, ALLELES) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() + ) + } input.prep <- NULL # remove unused object @@ -1321,45 +1547,59 @@ present in the blacklist of genotypes to erase.") if (imputations.group == "global") { # Globally (not by pop_id) message("Imputations computed globally") + if (impute == "genotype"){ + input.imp <- suppressWarnings( + input.prep %>% + tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% + group_by(MARKERS) %>% + mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% + group_by(INDIVIDUALS, POP_ID) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() + ) + } - input.imp <- suppressWarnings( - input.prep %>% - tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% - group_by(MARKERS) %>% - mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% - group_by(INDIVIDUALS, POP_ID) %>% - tidyr::spread(data = ., key = MARKERS, value = GT) %>% - ungroup() - ) + if (impute == "allele"){ + input.imp <- suppressWarnings( + input.prep %>% + tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% + group_by(MARKERS) %>% + mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% + group_by(INDIVIDUALS, POP_ID, ALLELES) %>% + tidyr::spread(data = ., key = MARKERS, value = GT) %>% + ungroup() + ) + } input.prep <- NULL # remove unused object } # End imputation max global } # End imputations max - # transform the imputed dataset into gsi_sim - if (data.type == "vcf.file") { # for VCF input - message("Imputed VCF into factory for conversion into gsi_sim...") - input.imp <- suppressWarnings( - input.imp %>% - tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID)) - ) - - gsi.prep.imp <- suppressWarnings( - input.imp %>% - tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_") %>% # separate the genotypes into alleles - tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID)) # make tidy - ) - } else { # for df input - input.imp <- suppressWarnings( - input.imp %>% - tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID)) - ) - - gsim.prep.imp <- suppressWarnings( - input.imp %>% + + # prepare the imputed dataset for gsi_sim + message("Preparing imputed data set for gsi_sim...") + if (impute == "genotype") { + if (data.type == "vcf.file" | data.type == "haplo.file") { # for VCF input + gsi.prep.imp <- suppressWarnings( + input.imp %>% + tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID)) %>% # make tidy + tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>% + tidyr::gather(key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) + ) + } + if (data.type == "plink.file" | data.type == "df.file") { + gsi.prep.imp <- input.imp %>% + tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% tidyr::separate(col = GT, into = c("A1", "A2"), sep = 3) %>% # separate the genotypes into alleles tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID)) + } + } + if (impute == "allele") { + gsi.prep.imp <- suppressWarnings( + input.imp %>% + tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) ) } + input.imp <- NULL # remove unused object } # End imputations # Sampling of markers ****************************************************** @@ -1389,7 +1629,7 @@ present in the blacklist of genotypes to erase.") # Functions ****************************************************************** # Fst function: Weir & Cockerham 1984 fst_WC84 <- function(data, holdout.samples, ...) { - # data <- vcf # test + # data <- input # test # holdout.samples <- holdout$INDIVIDUALS # test # data.type <- data$GT[[1]] # VCF vs DF # test @@ -1400,7 +1640,7 @@ present in the blacklist of genotypes to erase.") # if (stri_detect_fixed(data.type, "_")){ # for VCF file input # test data.genotyped <- data %>% filter(GT != "0_0") - } else { # for df file input + } else { # for df and haplotype files data.genotyped <- data %>% filter(GT != "000000") } @@ -1410,7 +1650,7 @@ present in the blacklist of genotypes to erase.") data.genotyped <- data %>% filter(GT != "0_0") %>% # remove missing genotypes filter(!INDIVIDUALS %in% holdout.samples) # remove supplementary individual before ranking markers with Fst - } else { # for df file input + } else { # for df and haplotype files data.genotyped <- data %>% filter(GT != "000000") %>% # remove missing genotypes filter(!INDIVIDUALS %in% holdout.samples) # remove supplementary individual before ranking markers with Fst @@ -1796,10 +2036,10 @@ Progress can be monitored with activity in the folder...") # Assignment analysis without imputations filename <- stri_replace_all_fixed(gsi_sim.filename, - pattern = "txt", + pattern = ".txt", replacement = stri_join( i, m, - "no.imputation", "txt", sep = "." + "no_imputation.txt", sep = "_" ) ) @@ -1817,8 +2057,8 @@ Progress can be monitored with activity in the folder...") filename <- NULL # With imputations - if (imputations != FALSE) {# with imputations - if (imputations == "rf") { + if (imputation.method != FALSE) {# with imputations + if (imputation.method == "rf") { if (imputations.group == "populations") { missing.data <- "imputed RF populations" } else { @@ -1836,7 +2076,7 @@ Progress can be monitored with activity in the folder...") pattern = "txt", replacement = stri_join( i, m, - "imputed", "txt", sep = "." + "imputed.txt", sep = "_" ) ) assignment.imp <- assignment_analysis(data = gsi.prep.imp, @@ -1854,7 +2094,7 @@ Progress can be monitored with activity in the folder...") } # End with imputations #compile assignment results each marker number for the iteration - if (imputations == FALSE) { + if (imputation.method == FALSE) { assignment <- assignment.no.imp gsi.prep.imp <- NULL input.imp <- NULL @@ -1940,7 +2180,7 @@ Progress can be monitored with activity in the folder...") # Write the tables to directory # assignment results - if (imputations == FALSE) { + if (imputation.method == FALSE) { filename.assignment.res <- stri_join("assignment.res", "no.imputation", sampling.method, "tsv", sep = ".") } else { # with imputations filename.assignment.res <- stri_join("assignment.res", "imputed", sampling.method, "tsv", sep = ".") @@ -1948,7 +2188,7 @@ Progress can be monitored with activity in the folder...") write_tsv(x = assignment.res, path = paste0(directory.subsample,filename.assignment.res), col_names = TRUE, append = FALSE) # assignment summary stats - if (imputations == FALSE) { + if (imputation.method == FALSE) { filename.assignment.sum <- stri_join("assignment.summary.stats", "no.imputation", sampling.method, "tsv", sep = ".") } else { # with imputations filename.assignment.sum <- stri_join("assignment.summary.stats", "imputed", sampling.method, "tsv", sep = ".") @@ -1960,8 +2200,25 @@ Progress can be monitored with activity in the folder...") if (sampling.method == "ranked") { message("Conducting Assignment analysis with ranked markers") + if (data.type == "haplo.file") { + input <- gsim.prep %>% + mutate(GT = stri_pad_left(str = GT, width = 3, pad = "0")) %>% + group_by(POP_ID, INDIVIDUALS, MARKERS) %>% + tidyr::spread(data = ., key = ALLELES, value = GT) %>% + tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE) + if (imputation.method != FALSE) { + input.imp <- input.imp %>% + tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% + mutate(GT = stri_pad_left(str = GT, width = 3, pad = "0")) %>% + group_by(POP_ID, INDIVIDUALS, MARKERS) %>% + tidyr::spread(data = ., key = ALLELES, value = GT) %>% + tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE) + } + } + # List of all individuals ind.pop.df<- input %>% + ungroup %>% select(POP_ID, INDIVIDUALS) %>% distinct(POP_ID, INDIVIDUALS) @@ -2041,19 +2298,19 @@ Progress can be monitored with activity in the folder...") if (thl == "all") { holdout <- NULL fst.ranked <- fst_WC84(data = input, holdout.samples = NULL) - if (imputations != FALSE) { + if (imputation.method != FALSE) { fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = NULL) } } else if (thl == 1) { holdout <- data.frame(INDIVIDUALS = i) fst.ranked <- fst_WC84(data = input, holdout.samples = holdout$INDIVIDUALS) - if (imputations != FALSE) { + if (imputation.method != FALSE) { fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = holdout$INDIVIDUALS) } } else { holdout <- data.frame(holdout.individuals.list[i]) fst.ranked <- fst_WC84(data = input, holdout.samples = holdout$INDIVIDUALS) - if (imputations != FALSE) { + if (imputation.method != FALSE) { fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = holdout$INDIVIDUALS) } } @@ -2066,7 +2323,7 @@ Progress can be monitored with activity in the folder...") append = FALSE ) - if (imputations != FALSE) { # With imputations + if (imputation.method != FALSE) { # With imputations fst.ranked.filename.imp <- stri_join("fst.ranked_", i, "_imputed", ".tsv", @@ -2119,7 +2376,7 @@ Progress can be monitored with activity in the folder...") filename <- NULL # With imputations - if (imputations != FALSE) { # with imputations + if (imputation.method != FALSE) { # with imputations select.markers <- filter(.data = fst.ranked.imp, RANKING <= m) %>% select(MARKERS) @@ -2127,7 +2384,7 @@ Progress can be monitored with activity in the folder...") # get the list of markers after filter markers.names <- unique(select.markers$MARKERS) # not the same in no imputation - if (imputations == "rf") { + if (imputation.method == "rf") { if (imputations.group == "populations") { missing.data <- "imputed RF populations" } else { @@ -2166,7 +2423,7 @@ Progress can be monitored with activity in the folder...") } # End with imputations #compile assignment results each marker number for the iteration - if (imputations == FALSE) {# with imputations + if (imputation.method == FALSE) {# with imputations assignment <- assignment.no.imp fst.ranked.imp <- NULL gsi.prep.imp <- NULL @@ -2189,7 +2446,7 @@ Progress can be monitored with activity in the folder...") gsim.prep = gsim.prep, input.imp = input.imp, # vcf = vcf.imp, # was an error before, double check... - gsim.prep.imp = gsim.prep.imp, + gsi.prep.imp = gsi.prep.imp, pop.levels = pop.levels, pop.labels = pop.labels, pop.id.start = pop.id.start, @@ -2199,7 +2456,7 @@ Progress can be monitored with activity in the folder...") iteration.method = iteration.method, gsi_sim.filename = gsi_sim.filename, keep.gsi.files = keep.gsi.files, - imputations = imputations, + imputation.method = imputation.method, parallel.core = parallel.core ) @@ -2328,7 +2585,7 @@ Progress can be monitored with activity in the folder...") # Write the tables to directory # assignment results - if (imputations == FALSE) { + if (imputation.method == FALSE) { filename.assignment.res <- stri_join("assignment.res", "no.imputation", sampling.method, "tsv", sep = ".") } else { # with imputations filename.assignment.res <- stri_join("assignment.res", "imputed", sampling.method, "tsv", sep = ".") @@ -2336,7 +2593,7 @@ Progress can be monitored with activity in the folder...") write_tsv(x = assignment.res.summary, path = paste0(directory.subsample,filename.assignment.res), col_names = TRUE, append = FALSE) # assignment summary stats - if (imputations == FALSE) { + if (imputation.method == FALSE) { filename.assignment.sum <- stri_join("assignment.summary.stats", "no.imputation", sampling.method, "tsv", sep = ".") } else { # with imputations filename.assignment.sum <- stri_join("assignment.summary.stats", "imputed", sampling.method, "tsv", sep = ".") @@ -2368,7 +2625,8 @@ Progress can be monitored with activity in the folder...") iteration.method = iteration.method, gsi_sim.filename = gsi_sim.filename, keep.gsi.files = keep.gsi.files, - imputations = imputations, + imputation.method = imputation.method, + impute = impute, imputations.group = imputations.group, num.tree = num.tree, iteration.rf = iteration.rf, @@ -2440,22 +2698,43 @@ Progress can be monitored with activity in the folder...") } # End summary of the subsampling iterations # Assignment plot - plot.assignment <- ggplot(res, aes(x = factor(MARKER_NUMBER), y = MEAN))+ - geom_point(size = 2, alpha = 0.5) + - geom_errorbar(aes(ymin = SE_MIN, ymax = SE_MAX), width = 0.3) + - scale_y_continuous(breaks = c(0, 10, 20 ,30, 40, 50, 60, 70, 80, 90, 100))+ - labs(x = "Marker number")+ - labs(y = "Assignment success (%)")+ - theme_bw()+ - theme( - legend.position = "bottom", - panel.grid.minor.x = element_blank(), - panel.grid.major.y = element_line(colour = "grey60", linetype = "dashed"), - axis.title.x = element_text(size = 10, family = "Helvetica", face = "bold"), - axis.text.x = element_text(size = 8, family = "Helvetica", face = "bold", angle = 90, hjust = 1, vjust = 0.5), - axis.title.y = element_text(size = 10, family = "Helvetica", face = "bold"), - axis.text.y = element_text(size = 10, family = "Helvetica", face = "bold") - ) + if (imputation.method == FALSE) { # no imputation + plot.assignment <- ggplot(res, aes(x = factor(MARKER_NUMBER), y = MEAN))+ + geom_point(size = 2, alpha = 0.5) + + geom_errorbar(aes(ymin = SE_MIN, ymax = SE_MAX), width = 0.3) + + scale_y_continuous(breaks = c(0, 10, 20 ,30, 40, 50, 60, 70, 80, 90, 100))+ + labs(x = "Marker number")+ + labs(y = "Assignment success (%)")+ + theme_bw()+ + theme( + legend.position = "bottom", + panel.grid.minor.x = element_blank(), + panel.grid.major.y = element_line(colour = "grey60", linetype = "dashed"), + axis.title.x = element_text(size = 10, family = "Helvetica", face = "bold"), + axis.text.x = element_text(size = 8, family = "Helvetica", face = "bold", angle = 90, hjust = 1, vjust = 0.5), + axis.title.y = element_text(size = 10, family = "Helvetica", face = "bold"), + axis.text.y = element_text(size = 10, family = "Helvetica", face = "bold") + ) + } else { #with imputations + + plot.assignment <- ggplot(res, aes(x = factor(MARKER_NUMBER), y = MEAN))+ + geom_point(aes(colour = MISSING_DATA), size = 2, alpha = 0.8) + + geom_errorbar(aes(ymin = SE_MIN, ymax = SE_MAX), width = 0.3) + + scale_colour_manual(name = "Missing data", values = c("darkorange", "dodgerblue"))+ + scale_y_continuous(breaks = c(0, 10, 20 ,30, 40, 50, 60, 70, 80, 90, 100))+ + labs(x = "Marker number")+ + labs(y = "Assignment success (%)")+ + theme_bw()+ + theme( + legend.position = "bottom", + panel.grid.minor.x = element_blank(), + panel.grid.major.y = element_line(colour = "grey60", linetype = "dashed"), + axis.title.x = element_text(size = 10, family = "Helvetica", face = "bold"), + axis.text.x = element_text(size = 8, family = "Helvetica", face = "bold", angle = 90, hjust = 1, vjust = 0.5), + axis.title.y = element_text(size = 10, family = "Helvetica", face = "bold"), + axis.text.y = element_text(size = 10, family = "Helvetica", face = "bold") + ) + } # end plot # results res.list <- list(assignment = res, plot.assignment = plot.assignment) diff --git a/README.md b/README.md index 2a7a713..a7b17c4 100644 --- a/README.md +++ b/README.md @@ -12,15 +12,16 @@ This is the development page of the **assigner** package for the R software. * **Conduct assignment analysis** using [gsi_sim] (https://github.com/eriqande/gsi_sim), a tool developed by Eric C. Anderson (see Anderson et al. 2008 and Anderson 2010) -* The input file are: i) in the VCF file format (Danecek et al. 2011) (*batch_x.vcf*) produced by [STACKS] (http://catchenlab.life.illinois.edu/stacks/) (Catchen et al. 2011, 2013); ii) very large files (> 50 000 markers) can be imported in PLINK tped/tfam format (Purcell et al. 2007) and iii) a data frame of genotypes. +* The input file are: i) a VCF file format (Danecek et al. 2011) (*batch_x.vcf*) produced by [STACKS] (http://catchenlab.life.illinois.edu/stacks/) (Catchen et al. 2011, 2013); ii) an haplotypes data frame file (*batch_x.haplotypes.tsv*) produced by [STACKS] (http://catchenlab.life.illinois.edu/stacks/) (Catchen et al. 2011, 2013); iii)very large files (> 50 000 markers) can be imported in PLINK tped/tfam format (Purcell et al. 2007) and iv) a data frame of genotypes. * Individuals, populations and markers can be **filtered** and/or selected in several ways using **blacklist, whitelist** and other arguments -* **Map-independent imputation** of missing genotype using **Random Forest** or the most frequent category is also available to test the impact of missing data on assignment analysis +* **Map-independent imputation** of missing genotype or alleles using **Random Forest** or the most frequent category is also available to test the impact of missing data on assignment analysis * Genotypes of poor quality (e.g. in coverage, genotype likelihood or sequencing errors) can be erased prior to imputations or assignment analysis with the use of a `blacklist.genotype` argument. * Markers can be randomly selected for a **classic LOO (Leave-One-Out) assignment** or chosen based on **ranked Fst** (Weir & Cockerham, 1984) for a **THL (Training, Holdout, Leave-one-out) assignment analysis** (reviewed in Anderson 2010) * Use `iteration.method` and/or `iteration.subsample` arguments to resample markers or individuals to get statistics! * The impact of the minor allele frequency, MAF, (local and global) can also be easily explored with custom thresholds +* The impact of filters used in other software can be explored by using the `whitelist.markers` argument. * Compute the **genotype likelihood ratio distance metric (Dlr)** (Paetkau's et al. 1997, 2004) * Import and summarise the assignment results from [GenoDive] (http://www.bentleydrummer.nl/software/software/GenoDive.html) (Meirmans and Van Tienderen, 2004) * `ggplot2`-based plotting to view assignment results and create publication-ready figures @@ -32,7 +33,7 @@ You can try out the dev version of **assigner**. Follow the 4 steps below: Step 1 You will need the package **devtools** ```r -install.packages("devtools") # to install +if (!require("devtools")) install.packages("devtools") # to install library(devtools) # to load ``` @@ -129,7 +130,19 @@ sudo rm -R /Library/Frameworks/R.framework/Resources/library/package_name Here the list of packages that **assigner** is depending on: ```r -dplyr, reshape2, ggplot2, readr, stringi, tidyr, purrr, lazyeval, adegenet, randomForestSRC, foreach, parallel, doParallel +if (!require("reshape2")) install.packages("reshape2") +if (!require("ggplot2")) install.packages("ggplot2") +if (!require("stringr")) install.packages("stringr") +if (!require("stringi")) install.packages("stringi") +if (!require("plyr")) install.packages("plyr") +if (!require("dplyr")) install.packages("dplyr") +if (!require("tidyr")) install.packages("tidyr") +if (!require("readr")) install.packages("readr") +if (!require("purrr")) install.packages("purrr") +if (!require("data.table")) install.packages("data.table") +if (!require("lazyeval")) install.packages("lazyeval") +if (!require("adegenet")) install.packages("adegenet") +if (!require("parallel")) install.packages("parallel") ``` If you don't have them, no worries, it's intalled automatically during **assigner** installation. If you have them, it's your job to update them, because i'm usually using the latest versions... @@ -157,6 +170,11 @@ The Amazon image can be imported into Google Cloud Compute Engine to start a new ## New +**v.0.1.7** +* New input file: Re-introduced the haplotype data frame file from stacks. +* Argument name change: `imputations` is now `impute.method`. +* New argument: `impute` with 2 options: `impute = "genotype"` or `impute = "allele"`. + **v.0.1.6** * Input file argument is now `data` and covers the three types of files the function can use: VCF file, PLINK tped/tfam or data frame of genotypes file. diff --git a/man/assignment_ngs.Rd b/man/assignment_ngs.Rd index f97aa4e..0872d5a 100644 --- a/man/assignment_ngs.Rd +++ b/man/assignment_ngs.Rd @@ -57,8 +57,9 @@ RADseq/GBS de novo discovery, you can minimize linkage disequilibrium (LD) by choosing among these 3 options: \code{"random"} selection, \code{"first"} or \code{"last"} SNP on the same short read/haplotype. Default: \code{snp.ld = NULL}. -Note that for other file type, use PLINK with the linkage disequilibrium -based SNP pruning option.} +Note that for other file type, use stackr package for haplotype file and +create a whitelist, for plink and data frames, use PLINK linkage +disequilibrium based SNP pruning option.} \item{common.markers}{(optional) Logical. Default = \code{FALSE}. With \code{TRUE}, will keep markers genotyped in all the populations.} @@ -66,7 +67,9 @@ With \code{TRUE}, will keep markers genotyped in all the populations.} \item{maf.thresholds}{(string, double, optional) String with local/populations and global/overall maf thresholds, respectively. Default: \code{maf.thresholds = NULL}. -e.g. \code{maf.thresholds = c(0.05, 0.1)} for a local maf threshold of 0.05 and a global threshold of 0.1.} +e.g. \code{maf.thresholds = c(0.05, 0.1)} for a local maf threshold +of 0.05 and a global threshold of 0.1. Available for VCF, PLINK and data frame +files. Use stackr for haplotypes files.} \item{maf.pop.num.threshold}{(integer, optional) When maf thresholds are used, this argument is for the number of pop required to pass the maf thresholds @@ -131,7 +134,7 @@ individuals and this process is reapeated the number of times chosen by the \item{folder}{(optional) The name of the folder created in the working directory to save the files/results.} \item{gsi_sim.filename}{(optional) The name of the file written to the directory. -Use the extension ".txt" at the end. Default \code{gsi_sim_data.txt}. +Use the extension ".txt" at the end. Default \code{assignment_data.txt}. The number of markers used will be appended to the name of the file.} \item{keep.gsi.files}{(Boolean) Default \code{FALSE} The input and output gsi_sim files @@ -180,10 +183,14 @@ subsampling, default: \code{iteration.subsample = 1}. With \code{subsample = 20} and \code{iteration.subsample = 10}, 20 individuals/populations will be randomly chosen 10 times.} -\item{imputations}{Should a map-independent imputations of markers be +\item{imputation.method}{Should a map-independent imputations of markers be computed. Available choices are: (1) \code{FALSE} for no imputation. (2) \code{"max"} to use the most frequent category for imputations. - (3) \code{"rf"} using Random Forest algorithm. Default = \code{FALSE}.} +(3) \code{"rf"} using Random Forest algorithm. +Default: \code{imputation.method = FALSE}.} + +\item{impute}{(character) Imputation on missing genotype +\code{impute = "genotype"} or alleles \code{impute = "allele"}.} \item{imputations.group}{\code{"global"} or \code{"populations"}. Should the imputations be computed globally or by populations. If you choose