From 4a1b9a4f1f06dba11fd52a30ec17311a135ecfd1 Mon Sep 17 00:00:00 2001
From: Thierry Gosselin <thierrygosselin@icloud.com>
Date: Sun, 10 Apr 2016 10:16:59 +1200
Subject: [PATCH] **v.0.2.1** * updated the function `assignment_mixture` with
 `sampling.method = "ranked"` and `assignment.analysis = "adegenet"`.

---
 DESCRIPTION               |    4 +-
 R/assignment_mixture.R    | 1522 ++++++++++++++++++-------------------
 R/assignment_ngs.R        |   81 +-
 README.md                 |    6 +-
 man/assignment_mixture.Rd |   63 +-
 man/assignment_ngs.Rd     |    2 +-
 6 files changed, 839 insertions(+), 839 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 8227188..bb59a1e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: assigner
 Type: Package
 Title: Assignment Analysis with GBS/RADseq Data using R
-Version: 0.2.0
-Date: 2015-04-06
+Version: 0.2.1
+Date: 2016-04-010
 Encoding: UTF-8
 Authors@R: c(
   person("Thierry", "Gosselin", email = "thierrygosselin@icloud.com", role = c("aut", "cre")))
diff --git a/R/assignment_mixture.R b/R/assignment_mixture.R
index ca9b34a..0579133 100644
--- a/R/assignment_mixture.R
+++ b/R/assignment_mixture.R
@@ -44,7 +44,7 @@
 #' with \code{--recode transpose}.
 #' 
 #' @param mixture A text file mixture individual ID. The column header is 
-#' 'INDIVIDUALS'and the file is in the working directory (e.g. "mixture.txt").
+#' \code{INDIVIDUALS} and the file is in the working directory (e.g. "mixture.txt").
 #' 
 #' @param assignment.analysis Assignment analysis conducted with 
 #' \code{assignment.analysis = "gsi_sim"} or 
@@ -126,28 +126,12 @@
 #' chosen based on ranked Fst \code{"ranked"} using the baseline samples 
 #' for the training and the mixture samples as holdout. 
 #' Classic Leave-one-out is then used to assign individual mixture samples.
-#' @param thl (character, integer, proportion) For \code{sampling.method = "ranked"} only.
-#' Default \code{1}, 1 individual sample is used as holdout. This individual is not
-#' participating in the markers ranking. For each marker number,
-#' the analysis will be repeated with all the indiviuals in the data set
-#' (e.g. 500 individuals, 500 times 500, 1000, 2000 markers).
-#' If a proportion is used e.g. \code{0.15},= 15% of individuals in each
-#' populations are chosen randomly as holdout individuals.
-#' With \code{thl = "all"} all individuals are used for ranking (not good) and
-#' \code{iteration.method} argument below is set to \code{1} by default.
-#' For the other thl values, you can create different holdout individuals lists
-#' with the \code{iteration.method} argument below.
 #' @param iteration.method With random marker selection the iterations argument =
 #' the number of iterations to repeat marker resampling. 
 #' Default: \code{iteration.method = 10}.
 #' With \code{marker.number = c(500, 1000)} and default iterations setting,
 #' 500 markers will be randomly chosen 10 times and 1000 markers will be randomly
-#' chosen 10 times. For the ranked method, using \code{thl = 1}, the analysis
-#' will be repeated for each individuals in the data set for every
-#' \code{marker.number} selected. With a proportion argument \code{thl = 0.15},
-#' 15% of individuals in each populations are chosen randomly as holdout
-#' individuals and this process is reapeated the number of times chosen by the
-#' \code{iteration.method} value.
+#' chosen 10 times.
 
 #' @param folder (optional) The name of the folder created in the working directory to save the files/results.
 #' @param filename (optional) The name of the file written to the directory.
@@ -172,8 +156,8 @@
 #' with the pop id in it, use the \code{strata} argument.
 #' @param strata (optional) A tab delimited file with 2 columns with header:
 #' \code{INDIVIDUALS} and \code{STRATA}. Default: \code{strata = NULL}. With a 
-#' data frame of genotypes the strata is the INDIVIDUALS and POP_ID columns, with
-#' PLINK files, the \code{tfam} first 2 columns are used. 
+#' data frame of genotypes the strata is the INDIVIDUALS and POP_ID column. With
+#' a PLINK file, the first 2 columns of the \code{tfam} are used. 
 #' If a \code{strata} file is specified, the strata file will have
 #' precedence. The \code{STRATA} column can be any hierarchical grouping.
 
@@ -261,17 +245,19 @@
 
 #' @examples
 #' \dontrun{
+#' # with adegenet DAPC for the assignment and sampling.method = "random":
 #' assignment.treefrog <- assignment_mixture(
 #' data = "batch_1.vcf",
+#' mixture = "mixture.treefrog.tsv",
+#' assignment.analysis = "adegenet",
 #' whitelist.markers = "whitelist.vcf.txt",
 #' snp.ld = NULL,
 #' common.markers = TRUE,
 #' marker.number = c(500, 5000, "all"),
-#' sampling.method = "ranked",
-#' thl = 0.3,
-#' blacklist.id = "blacklist.id.lobster.tsv",
+#' sampling.method = "random",
+#' blacklist.id = "blacklist.id.tsv",
 #' subsample = 25,
-#' iteration.subsample = 10
+#' iteration.subsample = 5
 #' filename = "treefrog.txt",
 #' keep.gsi.files = FALSE,
 #' pop.levels = c("PAN", "COS")
@@ -279,12 +265,36 @@
 #' imputation.method = FALSE,
 #' parallel.core = 12
 #' )
+#' # with gsi_sim for the mixture assignment and sampling.method = "ranked"
+#' # Here I also want to impute the genotypes of the data (baseline and mixture) 
+#' # using random forest:
+#' assignment.tuna <- assignment_mixture(
+#' data = "data.frame.genotypes.tuna.tsv",
+#' mixture = "cohort.tuna.tsv"
+#' assignment.analysis = "gsi_sim",
+#' common.markers = TRUE,
+#' marker.number = c(100, 200, 300),
+#' sampling.method = "ranked",
+#' subsample = 25,
+#' iteration.subsample = 5
+#' filename = "tuna.txt",
+#' keep.gsi.files = FALSE,
+#' pop.levels = c("BAJ", "IND"),
+#' imputation.method = "rf", 
+#' impute.mixture = TRUE, 
+#' impute = "genotype", 
+#' imputations.group = "populations", 
+#' num.tree = 100, 
+#' iteration.rf = 10, 
+#' split.number = 100, 
+#' verbose = FALSE,
+#' parallel.core = 12
+#' )
 #' 
 #' Since the 'folder' argument is missing, it will be created automatically
 #' inside your working directory.
 #' 
-#' To create a dataframe with the assignment results: 
-#' assignment <- assignment.treefrog$assignment.
+#' use $ to access the data frames in the list created.
 #' }
 
 
@@ -339,7 +349,8 @@ if (getRversion() >= "2.15.1") {
       "SE_MAX", ".", "QUAL", "FILTER", "INFO", "pb", "SUBSAMPLE", "STRATA", 
       "sum.pop", "A1", "A2", "INDIVIDUALS_2", "Cnt", "Catalog ID", "GROUP",
       "COUNT", "MAX_COUNT_MARKERS", "hierarchy", "ANALYSIS", "NUMBER_ITERATIONS",
-      "TOTAL_ITERATIONS", "MEAN_ITERATIONS"
+      "TOTAL_ITERATIONS", "MEAN_ITERATIONS", "X1", "X2", "NUMBER_SUBSAMPLE",
+      "TOTAL_SUBSAMPLE", "MEAN_SUBSAMPLE"
     )
   )
 }
@@ -368,7 +379,6 @@ assignment_mixture <- function(data,
                                subsample = NULL,
                                iteration.subsample = 1,
                                sampling.method,
-                               thl = 1,
                                iteration.method = 10,
                                folder,
                                filename = "assignment_data.txt",
@@ -409,10 +419,7 @@ assignment_mixture <- function(data,
   if (missing(subsample)) subsample <- NULL
   if (missing(iteration.subsample)) iteration.subsample <- 1
   if (missing(sampling.method)) stop("Sampling method required")
-  if (sampling.method == "ranked" & missing(thl)) thl <- 1 # thl
   if (missing(iteration.method)) iteration.method <- 10
-  if (sampling.method  == "ranked" & thl == "all") iteration.method <- 1
-  if (sampling.method == "ranked" & thl == 1) iteration.method <- 1
   if (missing(filename)) filename <- "assignment_data.txt"
   if (missing(keep.gsi.files)) keep.gsi.files <- FALSE
   if (missing(imputation.method)) imputation.method <- FALSE
@@ -445,7 +452,7 @@ assignment_mixture <- function(data,
   
   if (stri_detect_fixed(str = data.type, pattern = "POP_ID") | stri_detect_fixed(str = data.type, pattern = "INDIVIDUALS")) {
     data.type <- "df.file"
-    message("File type: data frame of genotype")
+    message("File type: data frame of genotypes")
   }
   
   if (stri_detect_fixed(str = data.type, pattern = "Catalog")) {
@@ -1297,7 +1304,7 @@ package and update your whitelist")
         arrange(MARKERS, POP_ID) %>%
         select(-c(REF, ALT))
       
-      gsim.prep <- input %>%
+      gsi.prep <- input %>%
         tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_") %>%  # separate the genotypes into alleles
         tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID))
     }
@@ -1317,7 +1324,7 @@ package and update your whitelist")
     }
     
     if (data.type == "haplo.file") { # for haplotypes
-      gsim.prep <- suppressWarnings(
+      gsi.prep <- suppressWarnings(
         input %>%
           mutate(GT = stri_replace_all_fixed(GT, "-", "000/000", vectorize_all=F)) %>% 
           arrange(MARKERS) %>% 
@@ -1335,7 +1342,7 @@ package and update your whitelist")
     }
     if (data.type == "haplo.file" & assignment.analysis == "gsi_sim") {
       input <- suppressWarnings(
-        gsim.prep %>%
+        gsi.prep %>%
           filter(GT != "000") %>% 
           tidyr::spread(data = ., key = MARKERS, value = GT) %>% # this reintroduce the missing, but with NA
           ungroup() %>% 
@@ -1356,26 +1363,26 @@ package and update your whitelist")
           tidyr::unite(data = ., GT, A1, A2, sep = "", remove = TRUE)
       )
       
-      # for haplo.file we need to change back again the gsim.prep file
-      gsim.prep <- input %>% 
+      # for haplo.file we need to change back again the gsi.prep file
+      gsi.prep <- input %>% 
         tidyr::separate(data = ., col = GT, into = .(A1, A2), sep = 3, remove = TRUE) %>% 
         tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) 
     }
     if (data.type == "df.file") { # For data frame of genotypes
-      gsim.prep <- input %>% 
+      gsi.prep <- input %>% 
         tidyr::separate(data = ., col = GT, into = .(A1, A2), sep = 3, remove = TRUE) %>% 
         tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) 
     }
     if (data.type == "plink.file") { # for PLINK
       message("Recoding genotypes for gsi_sim")
-      gsim.prep <- input %>% 
+      gsi.prep <- input %>% 
         tidyr::separate(col = INDIVIDUALS_ALLELES, into = c("INDIVIDUALS_2", "ALLELES"), sep = "_", remove = TRUE) %>% 
         select(-INDIVIDUALS_2)
     }
     if (data.type == "df.file" | data.type == "plink.file" | data.type == "haplo.file") {
       if (assignment.analysis == "adegenet" ) {
         genind.prep <- suppressWarnings(
-          gsim.prep %>%
+          gsi.prep %>%
             filter(GT != "000") %>% 
             tidyr::spread(data = ., key = MARKERS, value = GT) %>% # this reintroduce the missing, but with NA
             ungroup() %>% 
@@ -1416,16 +1423,22 @@ package and update your whitelist")
       }
     }
     
-    # only adegenet
+    # Conversion to adegenet genind object
     if (assignment.analysis == "adegenet") {
       # genind arguments common to all data.type
+      genind.prep <- genind.prep %>% arrange(POP_ID, INDIVIDUALS)
       ind <- as.character(genind.prep$INDIVIDUALS)
       pop <- genind.prep$POP_ID
       genind.df <- genind.prep %>% ungroup() %>% 
         select(-c(INDIVIDUALS, POP_ID))
       rownames(genind.df) <- ind
       loc.names <- colnames(genind.df)
-      strata <- genind.prep %>% ungroup() %>% select(INDIVIDUALS, POP_ID) %>% distinct(INDIVIDUALS, POP_ID)
+      strata <- genind.prep %>% 
+        ungroup() %>% 
+        select(INDIVIDUALS, POP_ID) %>% 
+        distinct(INDIVIDUALS, POP_ID) %>% 
+        mutate(MIXTURE = ifelse(INDIVIDUALS %in% mixture.df$INDIVIDUALS, "mixture", "baseline"))
+      
       
       # genind constructor
       prevcall <- match.call()
@@ -1440,6 +1453,12 @@ package and update your whitelist")
       genind.prep <- NULL
     }
     
+    # only gsi_sim
+    if (assignment.analysis == "gsi_sim"){
+    gsi.prep <- gsi.prep %>% 
+      arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES)
+    }
+    
     # Imputations **************************************************************
     if (imputation.method != "FALSE") {
       message("Preparing the data for imputations")
@@ -1480,7 +1499,7 @@ package and update your whitelist")
       if (data.type == "plink.file") {
         
         if (impute == "genotype"){
-          input.prep <- gsim.prep %>% 
+          input.prep <- gsi.prep %>% 
             group_by(MARKERS, INDIVIDUALS, POP_ID) %>% 
             tidyr::spread(data = ., key = ALLELES, value = GT) %>% 
             tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE) %>% 
@@ -1495,7 +1514,7 @@ package and update your whitelist")
         }
         
         if (impute == "allele"){
-          input.prep <- gsim.prep %>%
+          input.prep <- gsi.prep %>%
             mutate(
               GT = stri_replace_all_fixed(GT, pattern = "000", replacement = "NA", vectorize_all = FALSE),
               GT = replace(GT, which(GT == "NA"), NA)
@@ -1522,7 +1541,7 @@ package and update your whitelist")
             arrange(POP_ID, INDIVIDUALS)
         }
         if (impute == "allele") {
-          input.prep <- gsim.prep %>%
+          input.prep <- gsi.prep %>%
             mutate(
               GT = stri_replace_all_fixed(GT, pattern = "000", replacement = "NA", vectorize_all = FALSE),
               GT = replace(GT, which(GT == "NA"), NA)
@@ -1536,25 +1555,19 @@ package and update your whitelist")
       
       # If imputations.group is by populations, we remove mixture samples conduct the imputations on
       # the baseline by populations, but the imputation for the mixture samples is conducted globally, automatically. 
-      if (imputations.group == "populations" & impute.mixture == TRUE) {
-        input.prep.mixture <- input.prep %>% 
-          filter(INDIVIDUALS %in% mixture.df$INDIVIDUALS) %>% 
-          mutate(POP_ID = droplevels(POP_ID))
-        
-        input.prep.baseline <- input.prep %>% 
-          filter(!INDIVIDUALS %in% mixture.df$INDIVIDUALS) %>% 
-          mutate(POP_ID = droplevels(POP_ID))
-        
-        input.prep <- NULL # remove unused object
-      }
+      input.prep.mixture <- input.prep %>% 
+        filter(INDIVIDUALS %in% mixture.df$INDIVIDUALS) %>% 
+        mutate(POP_ID = droplevels(POP_ID))
       
-      if (imputations.group == "populations" & impute.mixture == FALSE) {
-        input.prep.baseline <- input.prep %>% 
-          filter(!INDIVIDUALS %in% mixture.df$INDIVIDUALS) %>% 
-          mutate(POP_ID = droplevels(POP_ID))
-        
-        input.prep <- NULL # remove unused object
-      }
+      input.prep.baseline <- input.prep %>% 
+        filter(!INDIVIDUALS %in% mixture.df$INDIVIDUALS) %>% 
+        mutate(POP_ID = droplevels(POP_ID))
+      
+      strata.df.subsample <- input.prep %>% 
+        select(INDIVIDUALS, POP_ID) %>% 
+        distinct(INDIVIDUALS, POP_ID)
+      
+      input.prep <- NULL # remove unused object
       
       # Imputation with Random Forest
       if (imputation.method == "rf") {
@@ -1573,7 +1586,7 @@ package and update your whitelist")
         
         # Random Forest by pop
         if (imputations.group == "populations") {
-          message("Imputations computed by populations, take a break...")
+          message("Imputations computed by populations for baseline samples, take a break...")
           df.split.pop <- split(x = input.prep.baseline, f = input.prep.baseline$POP_ID) # slip data frame by population
           pop.list <- names(df.split.pop) # list the pop
           imputed.dataset <-list() # create empty list
@@ -1588,6 +1601,9 @@ package and update your whitelist")
             message(paste("Completed imputations for pop ", pop.list, sep = ""))
             # imputed.dataset[[i]] <- impute_markers_rf(sep.pop) # test with foreach
             imputed.dataset <- impute_genotype_rf(sep.pop)
+            imputed.dataset <- suppressWarnings(
+              plyr::colwise(as.character, exclude = NA)(imputed.dataset)
+            )
             return(imputed.dataset)
           } # End impute_rf_pop
           
@@ -1601,25 +1617,39 @@ package and update your whitelist")
           )
           
           # Compiling the results
-          message("Compiling imputations results")
           input.imp <- suppressWarnings(bind_rows(input.imp))
           
           # Second round of imputations (globally) to remove introduced NA 
           # In case that some pop don't have the markers
           input.imp <- suppressWarnings(plyr::colwise(factor, exclude = NA)(input.imp)) # Make the columns factor
           input.imp <- impute_genotype_rf(input.imp) # impute globally
+          input.imp <- plyr::colwise(as.character, exclude = NA)(input.imp)
+          
+          # combine the mixture (no imputation) + the imputed baseline
+          input.imp <- suppressWarnings(
+            bind_rows(input.imp, input.prep.mixture) %>% 
+              select(-POP_ID) # remove the column POP_ID
+          )
           
           if (impute.mixture == TRUE) {
-            # combine the mixture (no imputation) + the imputed baseline
-            input.imp <- suppressWarnings(
-              bind_rows(input.imp, input.prep.mixture) %>% 
-                select(-POP_ID) # remove the column POP_ID
-            )
-            
             # impute globally the mixture samples
+            message("Imputations computed globally for mixture samples")
             input.imp <- suppressWarnings(plyr::colwise(factor, exclude = NA)(input.imp)) # Make the columns factor
             input.imp <- impute_genotype_rf(input.imp) # impute globally
-            input.imp <- left_join(strata.df, input.imp, by = "INDIVIDUALS")
+            input.imp <- plyr::colwise(as.character, exclude = NA)(input.imp)
+            input.imp <- suppressWarnings(
+              left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                arrange(POP_ID, INDIVIDUALS) %>% 
+                ungroup()
+            )
+          }
+          
+          if (impute.mixture == FALSE) {
+            input.imp <- suppressWarnings(
+              left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                arrange(POP_ID, INDIVIDUALS) %>% 
+                ungroup()
+            )
           }
           
           # dump unused objects
@@ -1634,19 +1664,79 @@ package and update your whitelist")
         } # End imputation RF populations
         # Random Forest global
         if (imputations.group == "global") { # Globally (not by pop_id)
-          message("Imputations computed globally, take a break...")
-          input.imp <- plyr::colwise(factor, exclude = NA)(input.prep)
+          message("Imputations computed globally for baseline samples, take a break...")
+          input.imp <- plyr::colwise(factor, exclude = NA)(input.prep.baseline)
           input.imp <- input.imp %>% select(-POP_ID) # remove the column POP_ID
-          input.imp <- impute_genotype_rf(input.prep)
-          input.imp <- left_join(strata.df, input.imp, by = "INDIVIDUALS")
+          input.imp <- impute_genotype_rf(input.imp)
+          input.imp <- plyr::colwise(as.character, exclude = NA)(input.imp)
           input.prep <- NULL # remove unused object
           
+          
+          # combine the mixture (no imputation) + the imputed baseline
+          input.imp <- suppressWarnings(
+            bind_rows(input.imp, 
+                      input.prep.mixture %>% 
+                        select(-POP_ID)# remove the column POP_ID
+                      )
+          )
+
+          if (impute.mixture == TRUE) {
+            # impute globally the mixture samples
+            message("Imputations computed globally for mixture samples")
+            input.imp <- suppressWarnings(plyr::colwise(factor, exclude = NA)(input.imp)) # Make the columns factor
+            input.imp <- impute_genotype_rf(input.imp) # impute globally
+            input.imp <- plyr::colwise(as.character, exclude = NA)(input.imp)
+            input.imp <- suppressWarnings(
+              left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                arrange(POP_ID, INDIVIDUALS) %>% 
+                ungroup()
+            )
+          }
+          
+          if (impute.mixture == FALSE) {
+            input.imp <- suppressWarnings(
+              left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                arrange(POP_ID, INDIVIDUALS) %>% 
+                ungroup()
+            )
+          }
         } # End imputation RF global
+        
+        # data prep
+        if (impute == "genotype") {
+          input.imp <- suppressWarnings(
+            input.imp %>%
+              tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID))
+          )
+          
+          # Replace the GT == NA for "000000"
+          if (impute.mixture == FALSE) {
+            input.imp <- input.imp %>% 
+              mutate(
+                GT = stri_replace_na(GT, replacement = "000000")
+              )
+          }
+        }
+        if (impute == "allele") {
+          input.imp <- suppressWarnings(
+            input.imp %>%
+              tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES))
+          )
+          # Replace the GT == NA for "000"
+          if (impute.mixture == FALSE) {
+            input.imp <- input.imp %>% 
+              mutate(
+                GT = stri_replace_na(GT, replacement = "000")
+              )
+          }
+        }
       } # End imputation RF
+      
       # Imputation using the most common genotype
       if (imputation.method == "max") { # End imputation max
         if (imputations.group == "populations") {
-          message("Imputations computed by populations")
+          message("Imputations computed by populations for baseline samples")
+          message(stri_paste("Imputing: ", impute))
           
           if (impute == "genotype"){
             input.imp <- suppressWarnings(
@@ -1660,30 +1750,46 @@ package and update your whitelist")
                 # the next 2 steps are necessary to remove introduced NA if some pop don't have the markers
                 # will take the global observed values by markers for those cases.
                 group_by(MARKERS) %>%
-                mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE)))
+                mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% 
+                ungroup() %>% 
+                select(-POP_ID)
+            )
+            # combine the mixture (no imputation) + the imputed baseline
+            input.imp <- suppressWarnings(
+              bind_rows(input.imp, 
+                        input.prep.mixture %>% 
+                          tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% 
+                          select(-POP_ID)
+              )
             )
             
             if (impute.mixture == TRUE) {
-              # combine the mixture (no imputation) + the imputed baseline
-              input.imp <- suppressWarnings(
-                bind_rows(input.imp, 
-                          input.prep.mixture %>% 
-                            tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID))
-                ) %>% 
-                  mutate(POP_ID = factor(POP_ID, levels = unique(pop.labels), ordered = TRUE))
-              )
-              
               # impute globally the mixture samples
+              message("Imputations computed globally for mixture samples")
               input.imp <- suppressWarnings(
                 input.imp %>% 
                   group_by(MARKERS) %>%
-                  mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
-                  group_by(INDIVIDUALS, POP_ID) %>% 
+                  mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% 
+                  ungroup()
+              )
+                  
+              input.imp <- suppressWarnings(
+                left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
                   arrange(POP_ID, INDIVIDUALS, MARKERS) %>% 
-                  tidyr::spread(data = ., key = MARKERS, value = GT) %>%
                   ungroup()
               )
             }
+            
+            if (impute.mixture == FALSE) {
+              input.imp <- suppressWarnings(
+                left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                  arrange(POP_ID, INDIVIDUALS, MARKERS) %>% 
+                  mutate(
+                    GT = stri_replace_na(GT, replacement = "000000")
+                  ) %>% 
+                  ungroup()
+              )
+            } 
           }
           
           if (impute == "allele"){
@@ -1698,30 +1804,47 @@ package and update your whitelist")
                 # the next 2 steps are necessary to remove introduced NA if some pop don't have the markers
                 # will take the global observed values by markers for those cases.
                 group_by(MARKERS) %>%
-                mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE)))
+                mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% 
+                ungroup() %>% 
+                select(-POP_ID)
             )
             
-            if (impute.mixture == TRUE) {
-              # combine the mixture (no imputation) + the imputed baseline
-              input.imp <- suppressWarnings(
-                bind_rows(input.imp, 
-                          input.prep.mixture %>% 
-                            tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID, ALLELES))
-                ) %>% 
-                  mutate(POP_ID = factor(POP_ID, levels = unique(pop.labels), ordered = TRUE))
+            # combine the mixture (no imputation) + the imputed baseline
+            input.imp <- suppressWarnings(
+              bind_rows(input.imp, 
+                        input.prep.mixture %>% 
+                          tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
+                          select(-POP_ID)
               )
-              
+            )
+            
+            if (impute.mixture == TRUE) {
               # impute globally the mixture samples
+              message("Imputations computed globally for mixture samples")
               input.imp <- suppressWarnings(
                 input.imp %>% 
                   group_by(MARKERS) %>%
                   mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
-                  group_by(INDIVIDUALS, POP_ID, ALLELES) %>% 
-                  arrange(POP_ID, INDIVIDUALS, MARKERS) %>% 
-                  tidyr::spread(data = ., key = MARKERS, value = GT) %>%
+                  ungroup()
+              )
+
+              input.imp <- suppressWarnings(
+                left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                  arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES) %>% 
                   ungroup()
               )
             }
+
+            if (impute.mixture == FALSE) {
+              input.imp <- suppressWarnings(
+                left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                  arrange(POP_ID, INDIVIDUALS, MARKERS) %>% 
+                  mutate(
+                    GT = stri_replace_na(GT, replacement = "000")
+                  ) %>% 
+                  ungroup()
+              )
+            } 
           }
           
           input.prep <- NULL # remove unused object
@@ -1731,122 +1854,258 @@ package and update your whitelist")
         } # End imputation max populations 
         if (imputations.group == "global") {
           # Globally (not by pop_id)
-          message("Imputations computed globally")
+          message("Imputations computed globally for baseline samples")
+          message(stri_paste("Imputing: ", impute))
+          
           if (impute == "genotype"){
             input.imp <- suppressWarnings(
-              input.prep %>%
+              input.prep.baseline %>%
                 tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>%
                 group_by(MARKERS) %>%
                 mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
-                group_by(INDIVIDUALS, POP_ID) %>% 
-                tidyr::spread(data = ., key = MARKERS, value = GT) %>%
-                ungroup()
+                ungroup() %>% 
+                select(-POP_ID)
+            )
+            
+            # combine the mixture (no imputation) + the imputed baseline
+            input.imp <- suppressWarnings(
+              bind_rows(input.imp, 
+                        input.prep.mixture %>% 
+                          tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% 
+                          select(-POP_ID)
+              )
             )
+            
+            if (impute.mixture == TRUE) {
+              # impute globally the mixture samples
+              message("Imputations computed globally for mixture samples")
+              input.imp <- suppressWarnings(
+                input.imp %>% 
+                  group_by(MARKERS) %>%
+                  mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>% 
+                  ungroup()
+              )
+              
+              input.imp <- suppressWarnings(
+                left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                  arrange(POP_ID, INDIVIDUALS, MARKERS) %>% 
+                  ungroup()
+              )
+              }
+
+            if (impute.mixture == FALSE) {
+              input.imp <- suppressWarnings(
+                left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                  arrange(POP_ID, INDIVIDUALS, MARKERS) %>% 
+                  mutate(
+                    GT = stri_replace_na(GT, replacement = "000000")
+                  ) %>% 
+                  ungroup()
+              )
+            } 
           }
           
           if (impute == "allele"){
             input.imp <- suppressWarnings(
-              input.prep %>%
+              input.prep.baseline %>%
                 tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>%
                 group_by(MARKERS) %>%
                 mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
-                group_by(INDIVIDUALS, POP_ID, ALLELES) %>% 
-                tidyr::spread(data = ., key = MARKERS, value = GT) %>%
-                ungroup()
+                ungroup() %>% 
+                select(-POP_ID)
+            )
+            
+            # combine the mixture (no imputation) + the imputed baseline
+            input.imp <- suppressWarnings(
+              bind_rows(input.imp, 
+                        input.prep.mixture %>% 
+                          tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
+                          select(-POP_ID)
+              )
             )
+            if (impute.mixture == TRUE) {
+              # impute globally the mixture samples
+              message("Imputations computed globally for mixture samples")
+              input.imp <- suppressWarnings(
+                input.imp %>% 
+                  group_by(MARKERS) %>%
+                  mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
+                  ungroup()
+              )
+              
+              input.imp <- suppressWarnings(
+                left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                  arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES) %>% 
+                  ungroup()
+              )
+            }
+            if (impute.mixture == FALSE) {
+              input.imp <- suppressWarnings(
+                left_join(strata.df.subsample, input.imp, by = "INDIVIDUALS") %>% 
+                  arrange(POP_ID, INDIVIDUALS, MARKERS) %>% 
+                  mutate(
+                    GT = stri_replace_na(GT, replacement = "000")
+                  ) %>% 
+                  ungroup()
+              )
+            } 
           }
           
           input.prep <- NULL # remove unused object
         } # End imputation max global 
       } # End imputations max
       
+      # test <- input.imp %>% filter(GT == "000000")
+      # test <- input.imp %>% filter(GT == "000")
+      # test <- input.imp %>% ungroup %>% filter(is.na(GT))
+      
       # prepare the imputed dataset for gsi_sim or adegenet
-      message("Preparing imputed data set for gsi_sim...")
+      message("Preparing imputed data set")
       if (assignment.analysis == "gsi_sim") {
         if (impute == "genotype") {
           if (data.type == "vcf.file") { # for VCF input
             gsi.prep.imp <- suppressWarnings(
               input.imp %>%
-                tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID)) %>% # make tidy
+                # tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID)) %>% # make tidy
                 tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>%
                 tidyr::gather(key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID))
             )
           }
           if (data.type == "plink.file" | data.type == "df.file" | data.type == "haplo.file") {
             gsi.prep.imp <- input.imp %>%
-              tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% 
+              # tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% 
               tidyr::separate(col = GT, into = c("A1", "A2"), sep = 3) %>%  # separate the genotypes into alleles
-              tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID))
+              tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% 
+              select(POP_ID, INDIVIDUALS, MARKERS, ALLELES, GT) %>% 
+              arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES)
           }
         }
         if (impute == "allele") {
-          gsi.prep.imp <- suppressWarnings(
-            input.imp %>%
-              tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES))
-          )
+          gsi.prep.imp <- input.imp
         }
       }
       
       # adegenet
-      if (data.type == "vcf.file" & assignment.analysis == "adegenet" ) {
-        genind.prep.imp <- input.imp %>% 
-          tidyr::gather(key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS, ALLELES)) %>% # make tidy
-          tidyr::spread(data = ., key = ALLELES, value = GT) %>%
-          tidyr::unite(GT, A1, A2, sep = ":", remove = TRUE) %>%
-          mutate(GT = stri_replace_all_fixed(str = GT, pattern = c("0:0", "1:1", "0:1", "1:0"), replacement = c("2_0", "0_2", "1_1", "1_1"), vectorize_all = FALSE)) %>%
-          arrange(MARKERS, POP_ID) %>%
-          tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>%
-          tidyr::gather(key = ALLELES, value = COUNT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% # make tidy
-          tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
-          group_by(POP_ID, INDIVIDUALS) %>%
-          tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>%
-          arrange(POP_ID, INDIVIDUALS)
-      }
-      if (data.type == "df.file" | data.type == "plink.file" | data.type == "haplo.file") {
-        if (assignment.analysis == "adegenet" ) {
-          genind.prep.imp <- suppressWarnings(
-            input.imp %>%
-              ungroup() %>% 
-              plyr::colwise(.fun = factor, exclude = NA)(.)
-          )
-          
-          genind.prep.imp <- suppressWarnings(
-            genind.prep.imp %>%
-              ungroup() %>% 
-              mutate_each(funs(as.integer), -c(INDIVIDUALS, POP_ID, ALLELES)) %>%
-              ungroup() %>% 
-              tidyr::gather(data = ., key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
-              select(-ALLELES) %>%
-              group_by(POP_ID, INDIVIDUALS, MARKERS, GT) %>% 
-              tally %>%
-              ungroup() %>%
-              tidyr::unite(MARKERS_ALLELES, MARKERS, GT, sep = ":", remove = TRUE) %>%
-              arrange(POP_ID, INDIVIDUALS, MARKERS_ALLELES) %>% 
-              group_by(POP_ID, INDIVIDUALS) %>% 
-              tidyr::spread(data = ., key = MARKERS_ALLELES, value = n) %>%
-              ungroup() %>%
-              tidyr::gather(data = ., key = MARKERS_ALLELES, value = COUNT, -c(INDIVIDUALS, POP_ID)) %>% 
-              tidyr::separate(data = ., col = MARKERS_ALLELES, into = c("MARKERS", "ALLELES"), sep = ":", remove = TRUE) %>% 
-              mutate(COUNT = as.numeric(stri_replace_na(str = COUNT, replacement = "0"))) %>% 
-              ungroup() %>%
-              arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES) %>% 
+      if (assignment.analysis == "adegenet") {
+        if (impute == "genotype") {
+          if (data.type == "vcf.file") {
+            genind.prep.imp <- input.imp %>% 
+              # tidyr::gather(key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS, ALLELES)) %>% # make tidy
+              tidyr::spread(data = ., key = ALLELES, value = GT) %>%
+              tidyr::unite(GT, A1, A2, sep = ":", remove = TRUE) %>%
+              mutate(GT = stri_replace_all_fixed(str = GT, pattern = c("0:0", "1:1", "0:1", "1:0"), replacement = c("2_0", "0_2", "1_1", "1_1"), vectorize_all = FALSE)) %>%
+              arrange(MARKERS, POP_ID) %>%
+              tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>%
+              tidyr::gather(key = ALLELES, value = COUNT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% # make tidy
               tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
-              tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>% 
+              group_by(POP_ID, INDIVIDUALS) %>%
+              tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>%
               arrange(POP_ID, INDIVIDUALS)
-          )
-        }
-      }
-      
-      # only adegenet
-      if (assignment.analysis == "adegenet") {
+          }
+          if (data.type == "df.file" | data.type == "plink.file" | data.type == "haplo.file") {
+            genind.prep.imp <- input.imp %>%
+              # tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% 
+              tidyr::separate(col = GT, into = c("A1", "A2"), sep = 3) %>%  # separate the genotypes into alleles
+              tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID))
+            
+            genind.prep.imp <- suppressWarnings(
+              genind.prep.imp %>%
+                tidyr::spread(data = ., key = MARKERS, value = GT) %>% # this reintroduce the missing, but with NA
+                ungroup() %>% 
+                plyr::colwise(.fun = factor, exclude = NA)(.)
+            )
+            
+            genind.prep.imp <- suppressWarnings(
+              genind.prep.imp %>%
+                ungroup() %>% 
+                mutate_each(funs(as.integer), -c(INDIVIDUALS, POP_ID, ALLELES)) %>%
+                ungroup() %>% 
+                tidyr::gather(data = ., key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
+                select(-ALLELES) %>%
+                group_by(POP_ID, INDIVIDUALS, MARKERS, GT) %>% 
+                tally %>%
+                ungroup() %>%
+                tidyr::unite(MARKERS_ALLELES, MARKERS, GT, sep = ":", remove = TRUE) %>%
+                arrange(POP_ID, INDIVIDUALS, MARKERS_ALLELES) %>% 
+                group_by(POP_ID, INDIVIDUALS) %>% 
+                tidyr::spread(data = ., key = MARKERS_ALLELES, value = n) %>%
+                ungroup() %>%
+                tidyr::gather(data = ., key = MARKERS_ALLELES, value = COUNT, -c(INDIVIDUALS, POP_ID)) %>% 
+                tidyr::separate(data = ., col = MARKERS_ALLELES, into = c("MARKERS", "ALLELES"), sep = ":", remove = TRUE) %>% 
+                mutate(COUNT = as.numeric(stri_replace_na(str = COUNT, replacement = "0"))) %>% 
+                ungroup() %>%
+                arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES) %>% 
+                tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
+                tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>% 
+                arrange(POP_ID, INDIVIDUALS)
+            )
+          }
+        } # end impute genotype adegenet
+        if (impute == "allele") {
+          if (data.type == "vcf.file") {
+            genind.prep.imp <- input.imp %>% 
+              # tidyr::gather(key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS, ALLELES)) %>% # make tidy
+              tidyr::spread(data = ., key = ALLELES, value = GT) %>%
+              tidyr::unite(GT, A1, A2, sep = ":", remove = TRUE) %>%
+              mutate(GT = stri_replace_all_fixed(str = GT, pattern = c("0:0", "1:1", "0:1", "1:0"), replacement = c("2_0", "0_2", "1_1", "1_1"), vectorize_all = FALSE)) %>%
+              arrange(MARKERS, POP_ID) %>%
+              tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>%
+              tidyr::gather(key = ALLELES, value = COUNT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% # make tidy
+              tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
+              group_by(POP_ID, INDIVIDUALS) %>%
+              tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>%
+              arrange(POP_ID, INDIVIDUALS)
+          }
+          if (data.type == "df.file" | data.type == "plink.file" | data.type == "haplo.file") {
+            genind.prep.imp <- suppressWarnings(
+              input.imp %>%
+                tidyr::spread(data = ., key = MARKERS, value = GT) %>% # this reintroduce the missing, but with NA
+                ungroup() %>% 
+                plyr::colwise(.fun = factor, exclude = NA)(.)
+            )
+            
+            genind.prep.imp <- suppressWarnings(
+              genind.prep.imp %>%
+                ungroup() %>% 
+                mutate_each(funs(as.integer), -c(INDIVIDUALS, POP_ID, ALLELES)) %>%
+                ungroup() %>% 
+                tidyr::gather(data = ., key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
+                select(-ALLELES) %>%
+                group_by(POP_ID, INDIVIDUALS, MARKERS, GT) %>% 
+                tally %>%
+                ungroup() %>%
+                tidyr::unite(MARKERS_ALLELES, MARKERS, GT, sep = ":", remove = TRUE) %>%
+                arrange(POP_ID, INDIVIDUALS, MARKERS_ALLELES) %>% 
+                group_by(POP_ID, INDIVIDUALS) %>% 
+                tidyr::spread(data = ., key = MARKERS_ALLELES, value = n) %>%
+                ungroup() %>%
+                tidyr::gather(data = ., key = MARKERS_ALLELES, value = COUNT, -c(INDIVIDUALS, POP_ID)) %>% 
+                tidyr::separate(data = ., col = MARKERS_ALLELES, into = c("MARKERS", "ALLELES"), sep = ":", remove = TRUE) %>% 
+                mutate(COUNT = as.numeric(stri_replace_na(str = COUNT, replacement = "0"))) %>% 
+                ungroup() %>%
+                arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES) %>% 
+                tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
+                tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>% 
+                arrange(POP_ID, INDIVIDUALS)
+            )
+          }
+        } # end impute genotype adegenet
+        
         # genind arguments common to all data.type
+        genind.prep.imp <- genind.prep.imp %>% arrange(POP_ID, INDIVIDUALS)
         ind <- as.character(genind.prep.imp$INDIVIDUALS)
         pop <- genind.prep.imp$POP_ID
-        genind.df <- genind.prep.imp %>% ungroup() %>% 
+        genind.df <- genind.prep.imp %>%
+          ungroup() %>% 
           select(-c(INDIVIDUALS, POP_ID))
+        
         rownames(genind.df) <- ind
         loc.names <- colnames(genind.df)
-        strata <- genind.prep.imp %>% ungroup() %>% select(INDIVIDUALS, POP_ID) %>% distinct(INDIVIDUALS, POP_ID)
+        strata <- genind.prep.imp %>% 
+          ungroup() %>% 
+          select(INDIVIDUALS, POP_ID) %>% 
+          distinct(INDIVIDUALS, POP_ID) %>% 
+          mutate(MIXTURE = ifelse(INDIVIDUALS %in% mixture.df$INDIVIDUALS, "mixture", "baseline"))
         
         # genind constructor
         prevcall <- match.call()
@@ -1860,9 +2119,9 @@ package and update your whitelist")
         genind.df <- NULL
         # genind.prep <- NULL
         # genind.prep.imp <- NULL
-      }
+        
+      } # end adegenet
       
-      input.imp <- NULL # remove unused object
     } # End imputations
     
     # Sampling of markers ******************************************************
@@ -1873,7 +2132,6 @@ package and update your whitelist")
       distinct(MARKERS) %>% 
       arrange(MARKERS)
     
-    
     marker.number <- stri_replace_all_fixed(str = marker.number, pattern = "all", 
                                             replacement = nrow(unique.markers), 
                                             vectorize_all = TRUE)
@@ -1894,9 +2152,11 @@ package and update your whitelist")
     # Fst function: Weir & Cockerham 1984
     fst_WC84 <- function(data, holdout.samples, ...) {
       # data <- input # test
-      # holdout.samples <- holdout$INDIVIDUALS # test
+      # data <- input.imp # test
+      # data <- gsi.prep.imp # test
+      
+      # holdout.samples <- holdout.individuals$INDIVIDUALS # test
       
-      # data.type <- data$GT[[1]] # VCF vs DF # test
       pop.number <- n_distinct(data$POP_ID)
       
       if (is.null(holdout.samples)) { # use all the individuals
@@ -2106,8 +2366,8 @@ package and update your whitelist")
     
     # Assignment with gsi_sim
     if (assignment.analysis == "gsi_sim") {
-      assignment_analysis <- function(data, select.markers, markers.names, missing.data, i, m, holdout, filename, ...) {
-        # data <- gsim.prep #test
+      assignment_analysis <- function(data, select.markers, markers.names, missing.data, i, m, filename, ...) {
+        # data <- gsi.prep #test
         # missing.data <- "no.imputation" #test
         data.select <- suppressWarnings(
           data %>%
@@ -2165,12 +2425,6 @@ package and update your whitelist")
         }
         
         # Get Assignment results -------------------------------------------------
-        # Keep track of the holdout individual
-        if (sampling.method == "ranked") {
-          holdout.id <- NULL
-          # holdout.id <- holdout$INDIVIDUALS
-        }
-        
         # Number of markers
         n.locus <- m
         
@@ -2199,9 +2453,10 @@ package and update your whitelist")
                 SCORE = round(SCORE, 2),
                 SECOND_BEST_SCORE = round(SECOND_BEST_SCORE, 2),
                 MARKER_NUMBER = as.numeric(rep(n.locus, n())),
+                METHOD = rep(sampling.method, n()),
                 MISSING_DATA = rep(missing.data, n())
               ) %>%
-              select(INDIVIDUALS, ANALYSIS, CURRENT, INFERRED, SCORE, SECOND_BEST_POP, SECOND_BEST_SCORE, MARKER_NUMBER, MISSING_DATA, ITERATIONS) %>%
+              select(INDIVIDUALS, ANALYSIS, CURRENT, INFERRED, SCORE, SECOND_BEST_POP, SECOND_BEST_SCORE, MARKER_NUMBER, METHOD, MISSING_DATA) %>%
               arrange(CURRENT)
           )
         } else {
@@ -2222,36 +2477,26 @@ package and update your whitelist")
                 SCORE = round(SCORE, 2),
                 SECOND_BEST_SCORE = round(SECOND_BEST_SCORE, 2),
                 MARKER_NUMBER = as.numeric(rep(n.locus, n())),
-                MISSING_DATA = rep(missing.data, n()),
-                ITERATIONS = rep(i, n())
+                METHOD = rep(sampling.method, n()),
+                MISSING_DATA = rep(missing.data, n())
               ) %>%
-              select(INDIVIDUALS, ANALYSIS, CURRENT, INFERRED, SCORE, SECOND_BEST_POP, SECOND_BEST_SCORE, MARKER_NUMBER, MISSING_DATA, ITERATIONS) %>%
+              select(INDIVIDUALS, ANALYSIS, CURRENT, INFERRED, SCORE, SECOND_BEST_POP, SECOND_BEST_SCORE, MARKER_NUMBER, METHOD, MISSING_DATA) %>%
               arrange(CURRENT)
           )
+          
+          if (sampling.method == "random") {
+            assignment <- assignment %>% 
+              mutate(ITERATIONS = rep(i, n())) %>% 
+              select(INDIVIDUALS, ANALYSIS, CURRENT, INFERRED, SCORE, SECOND_BEST_POP, SECOND_BEST_SCORE, MARKER_NUMBER, METHOD, MISSING_DATA, ITERATIONS) %>%
+              arrange(CURRENT)
+          }
+          
         }
         
         
         # Option remove the output file from directory to save space
         if (keep.gsi.files == FALSE) file.remove(output.gsi)
         
-        # saving preliminary results
-        # if (sampling.method == "ranked") {
-        #   if (thl != 1 & thl != "all") {
-        #     assignment1 <- assignment %>%
-        #       mutate(
-        #         CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
-        #         CURRENT = droplevels(CURRENT)
-        #       ) %>% 
-        #       group_by(CURRENT, MARKER_NUMBER, MISSING_DATA) %>%
-        #       summarise(
-        #         n = length(CURRENT[as.character(CURRENT) == as.character(INFERRED)]),
-        #         TOTAL = length(CURRENT)
-        #       ) %>%
-        #       ungroup() %>% 
-        #       mutate(ASSIGNMENT_PERC = round(n/TOTAL*100, 0)) %>% 
-        #       select(-n, -TOTAL)
-        #   }
-        # }
         return(assignment)
       } # End assignment_analysis function
     }
@@ -2267,113 +2512,65 @@ package and update your whitelist")
         pop.data <- data.select@pop
         pop.data <- droplevels(pop.data)
         
-        
-        if (sampling.method == "random") {
-          # Alpha-Score DAPC
-          # When all the individuals are accounted for in the model construction
-          dapc.best.optim.a.score <- optim.a.score(dapc(data.select, n.da = length(levels(pop.data)), n.pca = round((length(indNames(data.select))/3)-1, 0)), pop = pop.data, plot = FALSE)$best
-          message(stri_paste("a-score optimisation for iteration:", i, sep = " ")) # message not working in parallel...
-          
-          # DAPC with all the data
-          dapc.all <- dapc(data.select, n.da = length(levels(pop.data)), n.pca = dapc.best.optim.a.score, pop = pop.data)
-          message(stri_paste("DAPC iteration:", i, sep = " "))
-          message(stri_paste("DAPC marker group:", m, sep = " "))
-        }
-        
-        if (sampling.method == "ranked") {
-          
-          # Alpha-Score DAPC training data
-          training.data <- data.select[!indNames(data.select) %in% holdout$INDIVIDUALS] # training dataset
-          pop.training <- training.data@pop
-          pop.training <- droplevels(pop.training)
-          
-          dapc.best.optim.a.score <- optim.a.score(dapc(training.data, n.da = length(levels(pop.training)), n.pca = round(((length(indNames(training.data))/3)-1), 0)), pop = pop.training, plot = FALSE)$best
-          message(stri_paste("a-score optimisation for iteration:", i, sep = " "))
-          
-          dapc.training <- dapc(training.data, n.da = length(levels(pop.training)), n.pca = dapc.best.optim.a.score, pop = pop.training)
-          message(stri_paste("DAPC of training data set for iteration:", i, sep = " "))
-          
-          # DAPC holdout individuals
-          holdout.data <- data.select[indNames(data.select) %in% holdout$INDIVIDUALS] # holdout dataset
-          pop.holdout <- holdout.data@pop
-          pop.holdout <- droplevels(pop.holdout)
-          assignment.levels <- levels(pop.holdout) # for figure
-          rev.assignment.levels <- rev(assignment.levels)  # for figure 
-          
-          dapc.predict.holdout <- predict.dapc(dapc.training, newdata = holdout.data)
-          message(stri_paste("Assigning holdout data for iteration:", i, sep = " "))
-        }
-        
-        
-        # Get Assignment results -------------------------------------------------
-        # Keep track of the holdout individual
-        if (sampling.method == "ranked") {
-          if (thl == "all") {
-            holdout.id <- NULL
-          } else {
-            holdout.id <- holdout$INDIVIDUALS
-          }
-        }
+        # # Alpha-Score DAPC
+        # # When all the individuals are accounted for in the model construction
+        # dapc.best.optim.a.score <- optim.a.score(dapc(data.select, n.da = length(levels(pop.data)), n.pca = round((length(indNames(data.select))/3)-1, 0)), pop = pop.data, plot = FALSE)$best
+        # message(stri_paste("a-score optimisation for iteration:", i, sep = " ")) # message not working in parallel...
+        # 
+        # # DAPC with all the data
+        # dapc.all <- dapc(data.select, n.da = length(levels(pop.data)), n.pca = dapc.best.optim.a.score, pop = pop.data)
+        # message(stri_paste("DAPC iteration:", i, sep = " "))
+        # message(stri_paste("DAPC marker group:", m, sep = " "))
+        
+        # Alpha-Score DAPC training data
+        training.data <- data.select[data.select@strata$MIXTURE == "baseline"]
+        # training.data <- data.select[!indNames(data.select) %in% holdout$INDIVIDUALS] # training dataset
+        # indNames(training.data)
+        # training.data@strata
+        pop.training <- training.data@pop
+        pop.training <- droplevels(pop.training)
+        
+        dapc.best.optim.a.score <- optim.a.score(dapc(training.data, n.da = length(levels(pop.training)), n.pca = round(((length(indNames(training.data))/3)-1), 0)), pop = pop.training, plot = FALSE)$best
+        message(stri_paste("a-score optimisation for iteration:", i, sep = " "))
+        
+        dapc.training <- dapc(training.data, n.da = length(levels(pop.training)), n.pca = dapc.best.optim.a.score, pop = pop.training)
+        message(stri_paste("DAPC of training data set for iteration:", i, sep = " "))
+        
+        # DAPC holdout individuals
+        holdout.data <- data.select[data.select@strata$MIXTURE == "mixture"]
+        # indNames(holdout.data)
+        # holdout.data@strata
+        # holdout.data <- data.select[indNames(data.select) %in% holdout$INDIVIDUALS] # holdout dataset
+        pop.holdout <- holdout.data@pop
+        pop.holdout <- droplevels(pop.holdout)
+        assignment.levels <- levels(pop.data)
+        rev.assignment.levels <- rev(assignment.levels)
+        
+        dapc.predict.holdout <- predict.dapc(dapc.training, newdata = holdout.data)
+        message(stri_paste("Assigning holdout data for iteration:", i, sep = " "))
+        
+        
+        # Get Assignment results -----------------------------------------------
         
         # Number of markers
         n.locus <- m
-        if (sampling.method == "random") {
-          assignment <- data_frame(ASSIGNMENT_PERC = summary(dapc.all)$assign.per.pop*100) %>% 
-            bind_cols(data_frame(POP_ID = levels(pop.data))) %>%
-            mutate(ASSIGNMENT_PERC = round(ASSIGNMENT_PERC, 2)) %>% 
-            select(POP_ID, ASSIGNMENT_PERC)
-        }        
         
         if (sampling.method == "ranked") {
-          assignment <- data.frame(INDIVIDUALS = indNames(holdout.data), POP_ID = pop.holdout, ASSIGN = dapc.predict.holdout$assign, dapc.predict.holdout$posterior) %>% 
-            rename(CURRENT = POP_ID, INFERRED = ASSIGN) %>%
-            mutate(
-              CURRENT = factor(CURRENT, levels = rev.assignment.levels, ordered = TRUE),
-              INFERRED = factor(INFERRED, levels = assignment.levels, ordered = TRUE)
-            )
-          
-          # assignment <- data.frame(INDIVIDUALS = indNames(holdout.data), POP_ID = pop.holdout, ASSIGN = dapc.predict.holdout$assign, dapc.predict.holdout$posterior) %>% 
-          #   select(CURRENT = POP_ID, INFERRED = ASSIGN) %>%
-          #   mutate(
-          #     CURRENT = factor(CURRENT, levels = rev.assignment.levels, ordered = TRUE),
-          #     INFERRED = factor(INFERRED, levels = assignment.levels, ordered = TRUE)
-          #   ) %>% 
-          #   group_by(CURRENT, INFERRED) %>% 
-          #   tally %>% 
-          #   group_by(CURRENT) %>% 
-          #   mutate(TOTAL = sum(n)) %>% 
-          #   ungroup() %>% 
-          #   mutate(ASSIGNMENT_PERC = round(n/TOTAL*100, 0)) %>% 
-          #   filter(CURRENT == INFERRED)
+          i <- "not available with sampling.method = ranked"
         }
         
-        assignment <- assignment %>% 
+        assignment <- data.frame(INDIVIDUALS = indNames(holdout.data), POP_ID = pop.holdout, ASSIGN = dapc.predict.holdout$assign, dapc.predict.holdout$posterior) %>% 
+          rename(CURRENT = POP_ID, INFERRED = ASSIGN) %>%
           mutate(
-            ITERATIONS = rep(i, n()),
+            ANALYSIS = rep("mixture", n()),
             MARKER_NUMBER = as.numeric(rep(n.locus, n())),
-            MISSING_DATA = rep(missing.data, n())
+            METHOD = rep(sampling.method, n()),
+            MISSING_DATA = rep(missing.data, n()),
+            SUBSAMPLE = rep(subsample.id, n()),
+            CURRENT = factor(CURRENT, levels = rev.assignment.levels, ordered = TRUE),
+            INFERRED = factor(INFERRED, levels = assignment.levels, ordered = TRUE),
+            ITERATIONS = rep(i, n())
           )
-        
-        # if (sampling.method == "ranked") {
-        #   if (thl != 1 & thl != "all") {
-        #     assignment <- assignment %>%
-        #       mutate(
-        #         CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
-        #         CURRENT = droplevels(CURRENT)
-        #       ) %>% 
-        #       group_by(CURRENT, MARKER_NUMBER, MISSING_DATA) %>%
-        #       summarise(
-        #         n = length(CURRENT[as.character(CURRENT) == as.character(INFERRED)]),
-        #         TOTAL = length(CURRENT)
-        #       ) %>%
-        #       ungroup() %>% 
-        #       mutate(
-        #         ASSIGNMENT_PERC = round(n/TOTAL*100, 0),
-        #         ITERATIONS = rep(i, n())
-        #       ) %>% 
-        #       select(-n, -TOTAL)
-        #   }
-        # }
         return(assignment)
       } # End assignment_analysis_adegenet function
     } # End assignment_function
@@ -2383,7 +2580,6 @@ package and update your whitelist")
       message("Conducting Assignment analysis with markers selected randomly")
       # Number of times to repeat the sampling of markers
       iterations.list <- 1:iteration.method
-      # iterations.list <- 1:200 # test
       
       # Function: Random selection of marker function + iteration.method
       marker_selection <- function(iteration.method) {
@@ -2420,7 +2616,6 @@ package and update your whitelist")
 First sign of progress may take some time
 Progress can be monitored with activity in the folder...")
       mrl <- NULL
-      holdout <- NULL
       assignment.random <- list()
       assignment_random <- function(markers.random.lists, ...) {
         mrl <- markers.random.lists
@@ -2446,13 +2641,12 @@ Progress can be monitored with activity in the folder...")
                                            )
         )
         if (assignment.analysis == "gsi_sim") {
-          assignment.no.imp <- assignment_analysis(data = gsim.prep,
+          assignment.no.imp <- assignment_analysis(data = gsi.prep,
                                                    select.markers = select.markers,
                                                    markers.names = markers.names,
                                                    missing.data = "no.imputation", 
                                                    i = i, 
                                                    m = m,
-                                                   holdout = NULL,
                                                    filename = filename
           )
         }
@@ -2462,8 +2656,7 @@ Progress can be monitored with activity in the folder...")
                                                             markers.names = markers.names,
                                                             missing.data = "no.imputation", 
                                                             i = i, 
-                                                            m = m,
-                                                            holdout = NULL
+                                                            m = m
           )
         }
         
@@ -2501,7 +2694,6 @@ Progress can be monitored with activity in the folder...")
                                                   missing.data = missing.data, 
                                                   i = i,
                                                   m = m,
-                                                  holdout = holdout,
                                                   filename = filename
             )
           }
@@ -2511,8 +2703,7 @@ Progress can be monitored with activity in the folder...")
                                                            markers.names = markers.names,
                                                            missing.data = missing.data, 
                                                            i = i,
-                                                           m = m,
-                                                           holdout = holdout
+                                                           m = m
             )
           }
           
@@ -2529,7 +2720,7 @@ Progress can be monitored with activity in the folder...")
         } else {
           assignment <- bind_rows(assignment.no.imp, assignment.imp)
         }
-        assignment <- mutate(.data = assignment, ITERATIONS = rep(i, n()))
+        # assignment <- mutate(.data = assignment, ITERATIONS = rep(i, n()))
         return(assignment)
       } # End of iterations for both with and without imputations
       
@@ -2544,7 +2735,6 @@ Progress can be monitored with activity in the folder...")
       
       # Compiling the results
       message("Compiling results")
-      
       assignment.res <- suppressWarnings(
         bind_rows(assignment.res) %>% 
           mutate(SUBSAMPLE = rep(subsample.id, n())) %>% 
@@ -2557,7 +2747,7 @@ Progress can be monitored with activity in the folder...")
       } else { # with imputations
         filename.assignment.res <- stri_join("assignment.mixture", "imputed", sampling.method, "tsv", sep = ".")
       }
-      write_tsv(x = assignment.res, path = paste0(directory.subsample,filename.assignment.res), col_names = TRUE, append = FALSE)
+      write_tsv(x = assignment.res, path = paste0(directory.subsample, filename.assignment.res), col_names = TRUE, append = FALSE)
       
       if (assignment.analysis == "gsi_sim") {
         assignment.mixture.summary.stats <- assignment.res %>% 
@@ -2578,511 +2768,245 @@ Progress can be monitored with activity in the folder...")
           ) %>% 
           select(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE, INFERRED, NUMBER_ITERATIONS, TOTAL_ITERATIONS, MEAN_ITERATIONS, MEAN, SE, MIN, MAX, MEDIAN, QUANTILE25, QUANTILE75) %>% 
           arrange(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE)
-        
-        # Write the tables to directory
-        # assignment summary stats
-        if (imputation.method == FALSE) {
-          filename.assignment.sum <- stri_join("assignment.mixture.summary.results", "no.imputation", sampling.method, "tsv", sep = ".")
-        } else { # with imputations
-          filename.assignment.sum <- stri_join("assignment.mixture.summary.results", "imputed", sampling.method, "tsv", sep = ".")
-        }
-        write_tsv(x = assignment.mixture.summary.stats, path = paste0(directory.subsample,filename.assignment.sum), col_names = TRUE, append = FALSE)
       }
       
       if (assignment.analysis == "adegenet") {
-        assignment.stats.pop <- suppressWarnings(
+        assignment.mixture.summary.stats <- suppressWarnings(
           assignment.res %>%
-            rename(CURRENT = POP_ID) %>% 
-            group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, METHOD) %>%
+            ungroup() %>%
+            mutate(CURRENT = factor(CURRENT)) %>% 
+            group_by(INDIVIDUALS, CURRENT, INFERRED, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE) %>%
             summarise(
-              MEAN = round(mean(ASSIGNMENT_PERC), 2),
-              SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
-              MIN = round(min(ASSIGNMENT_PERC), 2),
-              MAX = round(max(ASSIGNMENT_PERC), 2),
-              MEDIAN = round(median(ASSIGNMENT_PERC), 2),
-              QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
-              QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2)
+              NUMBER_ITERATIONS = length(ITERATIONS),
+              MEAN_ITERATIONS = round((NUMBER_ITERATIONS/iteration.method)*100, 2)
             ) %>%
-            ungroup %>% 
-            mutate(
-              CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
-              CURRENT = droplevels(CURRENT)
-            ) %>%
-            arrange(CURRENT, MARKER_NUMBER)
+            ungroup() %>%
+            arrange(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE, CURRENT, INFERRED)
         )
       }
       
       # Next step is common for gsi_sim and adegenet
-      # pop.levels.assignment.stats.overall <- c(levels(assignment.stats.pop$CURRENT), "OVERALL")
-      
-      # assignment.stats.overall <- assignment.stats.pop %>%
-      #   group_by(MARKER_NUMBER, MISSING_DATA, METHOD) %>%
-      #   rename(ASSIGNMENT_PERC = MEAN) %>%
-      #   summarise(
-      #     MEAN = round(mean(ASSIGNMENT_PERC), 2),
-      #     SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
-      #     MIN = round(min(ASSIGNMENT_PERC), 2),
-      #     MAX = round(max(ASSIGNMENT_PERC), 2),
-      #     MEDIAN = round(median(ASSIGNMENT_PERC), 2),
-      #     QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
-      #     QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2)
-      #   ) %>%
-      #   mutate(CURRENT = rep("OVERALL", n())) %>%
-      #   arrange(CURRENT, MARKER_NUMBER)
-      # 
-      # assignment.summary.stats <- suppressWarnings(
-      #   bind_rows(assignment.stats.pop, assignment.stats.overall) %>%
-      #     mutate(CURRENT = factor(CURRENT, levels = pop.levels.assignment.stats.overall, ordered = TRUE)) %>%
-      #     arrange(CURRENT, MARKER_NUMBER) %>%
-      #     mutate(
-      #       SE_MIN = MEAN - SE,
-      #       SE_MAX = MEAN + SE,
-      #       ITERATIONS = rep(iteration.method, n())
-      #     ) %>%
-      #     select(CURRENT, MARKER_NUMBER, MEAN, MEDIAN, SE, MIN, MAX, QUANTILE25, QUANTILE75, SE_MIN, SE_MAX, METHOD, MISSING_DATA, ITERATIONS)
-      # )
-      
-      
+      # Write the tables to directory
+      # assignment summary stats
+      if (imputation.method == FALSE) {
+        filename.assignment.sum <- stri_join("assignment.mixture.summary.results", "no.imputation", sampling.method, "tsv", sep = ".")
+      } else { # with imputations
+        filename.assignment.sum <- stri_join("assignment.mixture.summary.results", "imputed", sampling.method, "tsv", sep = ".")
+      }
+      write_tsv(x = assignment.mixture.summary.stats, path = paste0(directory.subsample,filename.assignment.sum), col_names = TRUE, append = FALSE)
     } # End method random
     
     # Ranked method ************************************************************
     if (sampling.method == "ranked") {
       message("Conducting Assignment analysis with ranked markers")
       
-      # if (data.type == "haplo.file") {
-      #   input <- gsim.prep %>%
-      #     mutate(GT = stri_pad_left(str = GT, width = 3, pad = "0")) %>% 
-      #     group_by(POP_ID, INDIVIDUALS, MARKERS) %>% 
-      #     tidyr::spread(data = ., key = ALLELES, value = GT) %>% 
-      #     tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE)
-      #   if (imputation.method != FALSE) {
-      #     input.imp <- input.imp %>%
-      #       tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
-      #       mutate(GT = stri_pad_left(str = GT, width = 3, pad = "0")) %>% 
-      #       group_by(POP_ID, INDIVIDUALS, MARKERS) %>% 
-      #       tidyr::spread(data = ., key = ALLELES, value = GT) %>% 
-      #       tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE)
-      #   }
-      # }
-      
       # List of all individuals
       ind.pop.df<- input %>% 
         ungroup %>% 
         select(POP_ID, INDIVIDUALS) %>% 
         distinct(POP_ID, INDIVIDUALS)
       
-      # thl selection
       message("Using thl method, ranking Fst with training samples...")
-      if (thl == 1) {
-        # Will go through the individuals in the list one by one.
-        iterations.list <- ind.pop.df$INDIVIDUALS
-        # Keep track of holdout individuals
-        holdout.individuals <- ind.pop.df %>%
-          mutate(ITERATIONS = stri_join("HOLDOUT", seq(1:n()), sep = "_"))
-      } else if (thl == "all") { # no holdout for that one
-        iterations.list <- iteration.method
-        holdout.individuals <- NULL
-        message("Warning: using all the individuals for ranking markers based on Fst\nNo holdout samples")
-        message("Recommended reading: \nAnderson, E. C. (2010) Assessing the power of informative subsets of
-                loci for population assignment: standard methods are upwardly biased.\nMolecular ecology resources 10, 4:701-710.")
-      } else {
-        # Create x (iterations) list of y (thl) proportion of individuals per pop.
-        if (stri_detect_fixed(thl, ".") & thl < 1) {
-          # iteration.method <- 5 # test
-          # thl <- 0.4 # test
-          holdout.individuals.list <- list()
-          iterations.list <- 1:iteration.method
-          for (x in 1:iteration.method) {
-            holdout.individuals <- ind.pop.df %>%
-              group_by(POP_ID) %>%
-              sample_frac(thl, replace = FALSE) %>%  # sampling fraction for each pop
-              arrange(POP_ID, INDIVIDUALS) %>%
-              ungroup() %>%
-              select(INDIVIDUALS) %>%
-              mutate(ITERATIONS = rep(x, n()))
-            holdout.individuals.list[[x]] <- holdout.individuals
-          }
-          holdout.individuals <- as.data.frame(bind_rows(holdout.individuals.list))
-        }
-        
-        # Create x (iterations) list of y (thl) individuals per pop.
-        if (thl > 1) {
-          holdout.individuals.list <- list()
-          iterations.list <- 1:iteration.method
-          for (x in 1:iteration.method) {
-            holdout.individuals <- ind.pop.df %>%
-              group_by(POP_ID) %>%
-              sample_n(thl, replace = FALSE) %>% # sampling individuals for each pop
-              arrange(POP_ID, INDIVIDUALS) %>%
-              ungroup() %>%
-              select(INDIVIDUALS) %>%
-              mutate(ITERATIONS = rep(x, n()))
-            holdout.individuals.list[[x]] <- holdout.individuals
-          }
-          holdout.individuals <- as.data.frame(bind_rows(holdout.individuals.list))
-        }
-      } # End tracking holdout individuals
+      holdout.individuals <- mixture.df
+      
       write_tsv(x = holdout.individuals, 
                 path = paste0(directory.subsample,"holdout.individuals.tsv"), 
                 col_names = TRUE, 
                 append = FALSE
       )
-      message("Holdout samples saved in your folder")
+      message("Holdout samples = mixture samples: saved in your folder")
       
       # Going through the loop of holdout individuals
       message("Starting parallel computations for the assignment analysis
 First sign of progress may take some time
 Progress can be monitored with activity in the folder...")
       
-      assignment_ranking <- function(iterations.list, ...) {
-        # i <- "TRI_09" #test
-        # i <- "CAR_01" #test
-        # i <- 1
-        # i <- 5
-        i <- iterations.list
+      # assignment_ranking <- function(iterations.list, ...) {
+      
+      # Ranking Fst with training dataset (keep holdout individuals out)
+      message("Ranking markers based on Fst with training samples")
+      fst.ranked <- fst_WC84(data = input, holdout.samples = holdout.individuals$INDIVIDUALS)
+
+      write_tsv(
+        x = fst.ranked, 
+        path = paste0(directory.subsample, "fst_ranked.tsv"), 
+        col_names = TRUE, 
+        append = FALSE
+      )
+      if (imputation.method != FALSE) {
+        fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = holdout.individuals$INDIVIDUALS)
+        write_tsv(x = fst.ranked.imp, 
+                  path = paste0(directory.subsample, "fst_ranked_imputed.tsv"), 
+                  col_names = TRUE, 
+                  append = FALSE
+        )
+      }
+      
+      # Markers numbers loop function
+      message("Going throught the marker.number")
+      # assignment.marker <- list() # Create empty lists to feed the results
+      
+      i <- NULL
+      assignment_marker_loop <- function(m, ...) {
+        message("Marker number: ", m)
+        # m <- 200 # test
+        # m <- 400 # test
+        m <- as.numeric(m)
+        RANKING <- NULL
+        select.markers <- filter(.data = fst.ranked, RANKING <= m) %>%
+          select(MARKERS)
         
-        # Ranking Fst with training dataset (keep holdout individuals out)
-        message("Ranking markers based on Fst")
-        if (thl == "all") {
-          holdout <- NULL
-          fst.ranked <- fst_WC84(data = input, holdout.samples = NULL)
-          if (imputation.method != FALSE) {
-            fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = NULL)
-          }
-        } else if (thl == 1) {
-          holdout <- data.frame(INDIVIDUALS = i)
-          fst.ranked <- fst_WC84(data = input, holdout.samples = holdout$INDIVIDUALS)
-          if (imputation.method != FALSE) {
-            fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = holdout$INDIVIDUALS)
-          }
-        } else {
-          holdout <- data.frame(holdout.individuals.list[i])
-          fst.ranked <- fst_WC84(data = input, holdout.samples = holdout$INDIVIDUALS)
-          if (imputation.method != FALSE) {
-            fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = holdout$INDIVIDUALS)
-          }
-        }
+        # get the list of markers after filter
+        markers.names <- unique(select.markers$MARKERS)
         
-        fst.ranked.filename <- stri_join("fst.ranked_", i, ".tsv", sep = "") # No imputation
-        write_tsv(
-          x = fst.ranked, 
-          path = paste0(directory.subsample, fst.ranked.filename), 
-          col_names = TRUE, 
-          append = FALSE
+        # Assignment analysis without imputations
+        filename <- stri_replace_all_fixed(base.filename,
+                                           pattern = ".txt",
+                                           replacement = stri_join(
+                                             "", "markers", m, 
+                                             "no_imputation.txt", sep = "_"
+                                           )
         )
         
-        if (imputation.method != FALSE) {  # With imputations
-          fst.ranked.filename.imp <- stri_join("fst.ranked_", i,
-                                               "_imputed",
-                                               ".tsv", 
-                                               sep = ""
-          ) 
-          write_tsv(x = fst.ranked.imp, 
-                    path = paste0(directory.subsample, fst.ranked.filename.imp), 
-                    col_names = TRUE, 
-                    append = FALSE
+        
+        if (assignment.analysis == "gsi_sim") {
+          assignment.no.imp <- assignment_analysis(data = gsi.prep,
+                                                   select.markers = select.markers,
+                                                   markers.names = markers.names,
+                                                   missing.data = "no.imputation",
+                                                   i = NULL,
+                                                   m = m,
+                                                   filename = filename
           )
         }
         
-        # Markers numbers loop function
-        message("Going throught the marker.number")
-        assignment.marker <- list() # Create empty lists to feed the results
-        assignment_marker_loop <- function(m, ...) {
-          message("Marker number: ", m)
-          # m <- 200 # test
-          # m <- 400 # test
-          m <- as.numeric(m)
-          RANKING <- NULL
-          select.markers <- filter(.data = fst.ranked, RANKING <= m) %>%
+        if (assignment.analysis == "adegenet") {
+          assignment.no.imp <- assignment_analysis_adegenet(data = genind.object,
+                                                            select.markers = select.markers,
+                                                            markers.names = markers.names,
+                                                            missing.data = "no.imputation", 
+                                                            i = NULL, 
+                                                            m = m
+          )
+        }
+        # unused objects
+        select.markers <- NULL
+        markers.names <- NULL
+        RANKING <- NULL
+        filename <- NULL
+        
+        # With imputations
+        if (imputation.method != FALSE) {  # with imputations
+          
+          select.markers <- filter(.data = fst.ranked.imp, RANKING <= m) %>%
             select(MARKERS)
           
           # get the list of markers after filter
-          markers.names <- unique(select.markers$MARKERS)
+          markers.names <- unique(select.markers$MARKERS)  # not the same in no imputation
+          
+          if (imputation.method == "rf") {
+            if (imputations.group == "populations") {
+              missing.data <- "imputed RF populations"
+            } else {
+              missing.data <- "imputed RF global"
+            }
+          } else {
+            if (imputations.group == "populations") {
+              missing.data <- "imputed max populations"
+            } else {
+              missing.data <- "imputed max global"
+            }
+          }
           
-          # Assignment analysis without imputations
-          filename <- stri_replace_all_fixed(filename,
-                                             pattern = "txt",
+          # Assignment analysis WITH imputations
+          filename <- stri_replace_all_fixed(base.filename,
+                                             pattern = ".txt",
                                              replacement = stri_join(
-                                               i, m, 
-                                               "no.imputation", "txt", sep = "."
+                                               "", "markers", m, 
+                                               "imputed.txt", sep = "_"
                                              )
           )
+          
           if (assignment.analysis == "gsi_sim") {
-            assignment.no.imp <- assignment_analysis(data = gsim.prep,
-                                                     select.markers = select.markers,
-                                                     markers.names = markers.names,
-                                                     missing.data = "no.imputation", 
-                                                     i = i, 
-                                                     m = m,
-                                                     holdout = holdout,
-                                                     filename = filename
+            assignment.imp <- assignment_analysis(data = gsi.prep.imp,
+                                                  select.markers = select.markers,
+                                                  markers.names = markers.names,
+                                                  missing.data = missing.data, 
+                                                  i = NULL,
+                                                  m = m,
+                                                  filename = filename
             )
           }
-          
           if (assignment.analysis == "adegenet") {
-            assignment.no.imp <- assignment_analysis_adegenet(data = genind.object,
-                                                              select.markers = select.markers,
-                                                              markers.names = markers.names,
-                                                              missing.data = "no.imputation", 
-                                                              i = i, 
-                                                              m = m,
-                                                              holdout = holdout
+            assignment.imp <- assignment_analysis_adegenet(data = genind.object.imp,
+                                                           select.markers = select.markers,
+                                                           markers.names = markers.names,
+                                                           missing.data = missing.data, 
+                                                           i = NULL,
+                                                           m = m
             )
           }
+          
           # unused objects
           select.markers <- NULL
           markers.names <- NULL
           RANKING <- NULL
-          filename <- NULL
-          
-          # With imputations
-          if (imputation.method != FALSE) {  # with imputations
-            
-            select.markers <- filter(.data = fst.ranked.imp, RANKING <= m) %>%
-              select(MARKERS)
-            
-            # get the list of markers after filter
-            markers.names <- unique(select.markers$MARKERS)  # not the same in no imputation
-            
-            if (imputation.method == "rf") {
-              if (imputations.group == "populations") {
-                missing.data <- "imputed RF populations"
-              } else {
-                missing.data <- "imputed RF global"
-              }
-            } else {
-              if (imputations.group == "populations") {
-                missing.data <- "imputed max populations"
-              } else {
-                missing.data <- "imputed max global"
-              }
-            }
-            
-            # Assignment analysis WITH imputations
-            filename <- stri_replace_all_fixed(filename,
-                                               pattern = "txt",
-                                               replacement = stri_join(
-                                                 i, m, 
-                                                 "imputed", "txt", sep = "."
-                                               )
-            )
-            if (assignment.analysis == "gsi_sim") {
-              assignment.imp <- assignment_analysis(data = gsi.prep.imp,
-                                                    select.markers = select.markers,
-                                                    markers.names = markers.names,
-                                                    missing.data = missing.data, 
-                                                    i = i,
-                                                    m = m,
-                                                    holdout = holdout,
-                                                    filename = filename
-              )
-            }
-            if (assignment.analysis == "adegenet") {
-              assignment.imp <- assignment_analysis_adegenet(data = gsi.prep.imp,
-                                                             select.markers = select.markers,
-                                                             markers.names = markers.names,
-                                                             missing.data = missing.data, 
-                                                             i = i,
-                                                             m = m,
-                                                             holdout = holdout
-              )
-            }
-            
-            # unused objects
-            select.markers <- NULL
-            markers.names <- NULL
-            RANKING <- NULL
-          } # End with imputations
-          
-          #compile assignment results each marker number for the iteration
-          if (imputation.method == FALSE) {# with imputations
-            assignment <- assignment.no.imp
-            fst.ranked.imp <- NULL
-            gsi.prep.imp <- NULL
-            input.imp <- NULL
-          } else {
-            assignment <- bind_rows(assignment.no.imp, assignment.imp)
-          }
-          m <- as.character(m)
-          assignment.marker[[m]] <- assignment
-          return(assignment.marker)
-        }  # End marker number loop for both with and without imputations
-        
-        assignment.marker <- map(
-          .x = marker.number, 
-          .f = assignment_marker_loop,
-          fst.ranked = fst.ranked,
-          fst.ranked.imp = fst.ranked.imp,
-          i = i,
-          holdout = holdout,
-          input = input,
-          gsim.prep = gsim.prep,
-          input.imp = input.imp,
-          # vcf = vcf.imp, # was an error before, double check...
-          gsi.prep.imp = gsi.prep.imp,
-          pop.levels = pop.levels,
-          pop.labels = pop.labels,
-          pop.id.start =  pop.id.start,
-          pop.id.end = pop.id.end,
-          sampling.method = sampling.method,
-          thl = thl,
-          iteration.method = iteration.method,
-          filename = filename,
-          keep.gsi.files = keep.gsi.files,
-          imputation.method = imputation.method,
-          parallel.core = parallel.core
-        )
-        
-        message("Summarizing the assignment analysis results by iterations and marker group")
-        assignment.res.summary <- suppressWarnings(
-          bind_rows(purrr::flatten(assignment.marker)) %>%
-            mutate(METHOD = rep(stri_join("thl_", thl) , n()))
-        )
+        } # End with imputations
         
-        res.filename <- stri_join("assignment_", i, ".tsv", sep = "") # No imputation
-        write_tsv(x = assignment.res.summary, path = paste0(directory.subsample, res.filename), 
-                  col_names = TRUE, 
-                  append = FALSE
-        )
-        return(assignment.res.summary)
-      }  # End assignment ranking function
+        #compile assignment results each marker number for the iteration
+        if (imputation.method == FALSE) {# with imputations
+          assignment <- assignment.no.imp
+          fst.ranked.imp <- NULL
+          gsi.prep.imp <- NULL
+          input.imp <- NULL
+        } else {
+          assignment <- bind_rows(assignment.no.imp, assignment.imp)
+        }
+        return(assignment)
+      }  # End marker number loop for both with and without imputations
       
       # using mclapply
       assignment.res <- list()
       assignment.res <- parallel::mclapply(
-        X = iterations.list, 
-        FUN = assignment_ranking, 
+        X = marker.number, 
+        FUN = assignment_marker_loop,
         mc.preschedule = FALSE, 
         mc.silent = FALSE, 
         mc.cores = parallel.core,
-        marker.number = marker.number
+        fst.ranked = fst.ranked,
+        fst.ranked.imp = fst.ranked.imp,
+        i = NULL,
+        input = input,
+        gsi.prep = gsi.prep,
+        input.imp = input.imp,
+        gsi.prep.imp = gsi.prep.imp,
+        pop.levels = pop.levels,
+        pop.labels = pop.labels,
+        pop.id.start =  pop.id.start,
+        pop.id.end = pop.id.end,
+        sampling.method = sampling.method,
+        iteration.method = iteration.method,
+        filename = filename,
+        keep.gsi.files = keep.gsi.files,
+        imputation.method = imputation.method,
+        parallel.core = parallel.core
       )
       
       # Compiling the results
       message("Compiling results")
-      assignment.res.summary <- suppressWarnings(
-        bind_rows(assignment.res) %>%
-          mutate(METHOD = rep(stri_join("thl_", thl) , n()))
+      assignment.res <- suppressWarnings(
+        bind_rows(assignment.res) %>% 
+          mutate(SUBSAMPLE = rep(subsample.id, n())) %>% 
+          arrange(INDIVIDUALS, MARKER_NUMBER, MISSING_DATA)
       )
       
-      if (thl == 1 | thl == "all") {
-        assignment.stats.pop <- assignment.res.summary %>%
-          mutate(
-            CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
-            CURRENT = droplevels(CURRENT)
-          ) %>% 
-          group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, METHOD) %>%
-          summarise(
-            n = length(CURRENT[as.character(CURRENT) == as.character(INFERRED)]),
-            TOTAL = length(CURRENT)
-          ) %>%
-          ungroup() %>% 
-          mutate(MEAN = round(n/TOTAL*100, 0)) %>% 
-          select(-n, -TOTAL)
-        
-        pop.levels.assignment.stats.overall <- c(levels(assignment.stats.pop$CURRENT), "OVERALL")
-        
-        assignment.stats.overall <- assignment.stats.pop %>% 
-          group_by(MARKER_NUMBER, MISSING_DATA, METHOD) %>%
-          rename(ASSIGNMENT_PERC = MEAN) %>%
-          summarise(
-            MEAN = round(mean(ASSIGNMENT_PERC), 2),
-            SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
-            MIN = round(min(ASSIGNMENT_PERC), 2),
-            MAX = round(max(ASSIGNMENT_PERC), 2),
-            MEDIAN = round(median(ASSIGNMENT_PERC), 2),
-            QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
-            QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2)
-          ) %>% 
-          mutate(CURRENT = rep("OVERALL", n())) %>% 
-          arrange(CURRENT, MARKER_NUMBER)
-        
-        assignment.summary.stats <- suppressWarnings(
-          bind_rows(assignment.stats.pop, assignment.stats.overall) %>%
-            mutate(CURRENT = factor(CURRENT, levels = pop.levels.assignment.stats.overall, ordered = TRUE)) %>%
-            arrange(CURRENT, MARKER_NUMBER) %>%
-            mutate(
-              SE_MIN = MEAN - SE,
-              SE_MAX = MEAN + SE
-            )
-        )
-        
-      } else {
-        # thl != 1 or "all"
-        # summary stats
-        if (assignment.analysis == "adegenet") {
-          assignment.res.summary <- assignment.res.summary %>% 
-            group_by(CURRENT, INFERRED, ITERATIONS, MARKER_NUMBER, MISSING_DATA, METHOD) %>% 
-            tally %>% 
-            group_by(CURRENT) %>% 
-            mutate(TOTAL = sum(n)) %>% 
-            ungroup() %>% 
-            mutate(ASSIGNMENT_PERC = round(n/TOTAL*100, 0)) %>% 
-            filter(CURRENT == INFERRED) %>% 
-            select(-n, -TOTAL)
-        }
-        
-        assignment.stats.pop <- assignment.res.summary %>%
-          mutate(
-            CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
-            CURRENT = droplevels(CURRENT)
-          ) %>%
-          group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, METHOD) %>%
-          summarise(
-            MEAN = round(mean(ASSIGNMENT_PERC), 2),
-            SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
-            MIN = round(min(ASSIGNMENT_PERC), 2),
-            MAX = round(max(ASSIGNMENT_PERC), 2),
-            MEDIAN = round(median(ASSIGNMENT_PERC), 2),
-            QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
-            QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2),
-            SE_MIN = MEAN - SE,
-            SE_MAX = MEAN + SE
-          ) %>%
-          arrange(CURRENT, MARKER_NUMBER)
-        
-        pop.levels.assignment.stats.overall <- c(levels(assignment.stats.pop$CURRENT), "OVERALL")
-        
-        assignment.stats.overall <- assignment.stats.pop %>%
-          group_by(MARKER_NUMBER, MISSING_DATA, METHOD) %>%
-          rename(ASSIGNMENT_PERC = MEAN) %>%
-          summarise(
-            MEAN = round(mean(ASSIGNMENT_PERC), 2),
-            SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
-            MIN = round(min(ASSIGNMENT_PERC), 2),
-            MAX = round(max(ASSIGNMENT_PERC), 2),
-            MEDIAN = round(median(ASSIGNMENT_PERC), 2),
-            QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
-            QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2),
-            SE_MIN = MEAN - SE,
-            SE_MAX = MEAN + SE
-          ) %>%
-          mutate(CURRENT = rep("OVERALL", n())) %>%
-          arrange(CURRENT, MARKER_NUMBER)
-        
-        assignment.summary.stats <- suppressWarnings(
-          bind_rows(assignment.stats.pop, assignment.stats.overall) %>%
-            mutate(CURRENT = factor(CURRENT, levels = pop.levels.assignment.stats.overall, ordered = TRUE)) %>%
-            arrange(CURRENT, MARKER_NUMBER)
-        )
-      } # End thl != 1
-      
-      # Write the tables to directory
-      # assignment results
-      if (imputation.method == FALSE) {
-        filename.assignment.res <- stri_join("assignment.res", "no.imputation", sampling.method, "tsv", sep = ".")
-      } else { # with imputations
-        filename.assignment.res <- stri_join("assignment.res", "imputed", sampling.method, "tsv", sep = ".")
-      }
-      write_tsv(x = assignment.res.summary, path = paste0(directory.subsample,filename.assignment.res), col_names = TRUE, append = FALSE)
-      
-      # assignment summary stats
+      # Write to the directory assignment results
       if (imputation.method == FALSE) {
-        filename.assignment.sum <- stri_join("assignment.summary.stats", "no.imputation", sampling.method, "tsv", sep = ".")
+        filename.assignment.res <- stri_join("assignment.mixture", "no.imputation", sampling.method, "tsv", sep = ".")
       } else { # with imputations
-        filename.assignment.sum <- stri_join("assignment.summary.stats", "imputed", sampling.method, "tsv", sep = ".")
+        filename.assignment.res <- stri_join("assignment.mixture", "imputed", sampling.method, "tsv", sep = ".")
       }
-      write_tsv(x = assignment.summary.stats, path = paste0(directory.subsample,filename.assignment.sum), col_names = TRUE, append = FALSE)
+      write_tsv(x = assignment.res, path = paste0(directory.subsample, filename.assignment.res), col_names = TRUE, append = FALSE)
     } # End of ranked thl method
     
     return(assignment.res)
@@ -3104,8 +3028,6 @@ Progress can be monitored with activity in the folder...")
              pop.id.start =  pop.id.start,
              pop.id.end = pop.id.end,
              sampling.method = sampling.method,
-             thl = thl,
-             iteration.method = iteration.method,
              filename = filename,
              keep.gsi.files = keep.gsi.files,
              imputation.method = imputation.method,
@@ -3118,31 +3040,97 @@ Progress can be monitored with activity in the folder...")
              parallel.core = parallel.core
   )
   res <- bind_rows(res)
-
+  
   if (!is.null(subsample)){
     write_tsv(x = res, path = paste0(directory, "assignment.mixture.results.tsv"), col_names = TRUE, append = FALSE)
-    # rename the other with subsampling results...
   }
   
   # Summary of the subsampling iterations
-  assignment.mixture.summary.subsample <- res %>% 
-    group_by(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, INFERRED, SUBSAMPLE) %>%
-    summarise(
-      NUMBER_ITERATIONS = length(ITERATIONS),
-      MEAN_ITERATIONS = round((NUMBER_ITERATIONS/iteration.method)*100, 2),
-      MEAN = round(mean(SCORE), 2),
-      SE = round(sqrt(var(SCORE)/length(SCORE)), 2),
-      MIN = round(min(SCORE), 2),
-      MAX = round(max(SCORE), 2),
-      MEDIAN = round(median(SCORE), 2),
-      QUANTILE25 = round(quantile(SCORE, 0.25), 2),
-      QUANTILE75 = round(quantile(SCORE, 0.75), 2)
-    ) %>% 
-    mutate(
-      TOTAL_ITERATIONS = rep(iteration.method, n())
-    ) %>% 
-    select(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE, INFERRED, NUMBER_ITERATIONS, TOTAL_ITERATIONS, MEAN_ITERATIONS, MEAN, SE, MIN, MAX, MEDIAN, QUANTILE25, QUANTILE75) %>% 
-    arrange(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE)
+  if (sampling.method == "random") {
+    if (assignment.analysis == "gsi_sim") {
+      assignment.mixture.summary.subsample <- res %>% 
+        group_by(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, INFERRED, SUBSAMPLE) %>%
+        summarise(
+          NUMBER_ITERATIONS = length(ITERATIONS),
+          MEAN_ITERATIONS = round((NUMBER_ITERATIONS/iteration.method)*100, 2),
+          MEAN = round(mean(SCORE), 2),
+          SE = round(sqrt(var(SCORE)/length(SCORE)), 2),
+          MIN = round(min(SCORE), 2),
+          MAX = round(max(SCORE), 2),
+          MEDIAN = round(median(SCORE), 2),
+          QUANTILE25 = round(quantile(SCORE, 0.25), 2),
+          QUANTILE75 = round(quantile(SCORE, 0.75), 2)
+        ) %>% 
+        mutate(
+          TOTAL_ITERATIONS = rep(iteration.method, n())
+        ) %>% 
+        select(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE, INFERRED, NUMBER_ITERATIONS, TOTAL_ITERATIONS, MEAN_ITERATIONS, MEAN, SE, MIN, MAX, MEDIAN, QUANTILE25, QUANTILE75) %>% 
+        arrange(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE)
+    }
+    if (assignment.analysis == "adegenet") {
+      assignment.mixture.summary.subsample <- res %>% 
+        select(-X1, -X2) %>% 
+        ungroup() %>%
+        mutate(CURRENT = factor(CURRENT)) %>% 
+        group_by(INDIVIDUALS, CURRENT, INFERRED, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE) %>%
+        summarise(
+          NUMBER_ITERATIONS = length(ITERATIONS),
+          MEAN_ITERATIONS = round((NUMBER_ITERATIONS/iteration.method)*100, 2)
+        ) %>%
+        ungroup() %>%
+        arrange(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, SUBSAMPLE, CURRENT, INFERRED) %>% 
+        group_by(INDIVIDUALS, CURRENT, INFERRED, ANALYSIS, MARKER_NUMBER, MISSING_DATA) %>%
+        summarise(
+          MEAN_SUBSAMPLE = round(mean(MEAN_ITERATIONS), 2),
+          SE = round(sqrt(var(MEAN_ITERATIONS)/length(MEAN_ITERATIONS)), 2),
+          MIN = round(min(MEAN_ITERATIONS), 2),
+          MAX = round(max(MEAN_ITERATIONS), 2),
+          MEDIAN = round(median(MEAN_ITERATIONS), 2),
+          QUANTILE25 = round(quantile(MEAN_ITERATIONS, 0.25), 2),
+          QUANTILE75 = round(quantile(MEAN_ITERATIONS, 0.75), 2)
+        ) %>%
+        ungroup() %>%
+        arrange(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, MISSING_DATA, CURRENT, INFERRED)
+    }
+  } # end random
+  if (sampling.method == "ranked") {
+    if (assignment.analysis == "gsi_sim") {
+      assignment.mixture.summary.subsample <- res %>% 
+        group_by(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, METHOD, MISSING_DATA, INFERRED) %>%
+        summarise(
+          NUMBER_SUBSAMPLE = length(SUBSAMPLE),
+          MEAN_SUBSAMPLE = round((NUMBER_SUBSAMPLE/iteration.subsample)*100, 2),
+          MEAN = round(mean(SCORE), 2),
+          SE = round(sqrt(var(SCORE)/length(SCORE)), 2),
+          MIN = round(min(SCORE), 2),
+          MAX = round(max(SCORE), 2),
+          MEDIAN = round(median(SCORE), 2),
+          QUANTILE25 = round(quantile(SCORE, 0.25), 2),
+          QUANTILE75 = round(quantile(SCORE, 0.75), 2)
+        ) %>% 
+        mutate(
+          TOTAL_SUBSAMPLE = rep(iteration.subsample, n())
+        ) %>% 
+        select(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, METHOD, MISSING_DATA, INFERRED, NUMBER_SUBSAMPLE, TOTAL_SUBSAMPLE, MEAN_SUBSAMPLE, MEAN, SE, MIN, MAX, MEDIAN, QUANTILE25, QUANTILE75) %>% 
+        arrange(INDIVIDUALS, ANALYSIS, MARKER_NUMBER, METHOD, MISSING_DATA)
+    }
+    if (assignment.analysis == "adegenet") {
+      assignment.mixture.summary.subsample <- res %>%
+        select(-X1, -X2, -ITERATIONS) %>%
+        ungroup() %>%
+        mutate(CURRENT = factor(CURRENT)) %>%
+        group_by(INDIVIDUALS, CURRENT, INFERRED, ANALYSIS, MARKER_NUMBER, METHOD, MISSING_DATA) %>%
+        summarise(
+          NUMBER_SUBSAMPLE = length(SUBSAMPLE),
+          MEAN_SUBSAMPLE = round((NUMBER_SUBSAMPLE/iteration.subsample)*100, 2)) %>% 
+        ungroup() %>% 
+        mutate(TOTAL_SUBSAMPLE = rep(iteration.subsample, n())) %>% 
+        select(INDIVIDUALS, CURRENT, INFERRED, ANALYSIS, MARKER_NUMBER, METHOD, MISSING_DATA, NUMBER_SUBSAMPLE, TOTAL_SUBSAMPLE, MEAN_SUBSAMPLE) %>% 
+        arrange(INDIVIDUALS, CURRENT, INFERRED, ANALYSIS, MARKER_NUMBER, METHOD, MISSING_DATA)
+    }
+  } # end ranked
+  
+  
   
   # assignment summary results
   if (imputation.method == FALSE) {
diff --git a/R/assignment_ngs.R b/R/assignment_ngs.R
index 8963a2a..ea6de47 100644
--- a/R/assignment_ngs.R
+++ b/R/assignment_ngs.R
@@ -260,7 +260,7 @@
 #' marker.number = c(500, 5000, "all"),
 #' sampling.method = "ranked",
 #' thl = 0.3,
-#' blacklist.id = "blacklist.id.lobster.tsv",
+#' blacklist.id = "blacklist.id.treefrog.tsv",
 #' subsample = 25,
 #' iteration.subsample = 10
 #' gsi_sim.filename = "treefrog.txt",
@@ -442,7 +442,7 @@ assignment_ngs <- function(data,
   
   if (stri_detect_fixed(str = data.type, pattern = "POP_ID") | stri_detect_fixed(str = data.type, pattern = "INDIVIDUALS")) {
     data.type <- "df.file"
-    message("File type: data frame of genotype")
+    message("File type: data frame of genotypes")
   }
   
   if (stri_detect_fixed(str = data.type, pattern = "Catalog")) {
@@ -1279,7 +1279,7 @@ package and update your whitelist")
         arrange(MARKERS, POP_ID) %>%
         select(-c(REF, ALT))
       
-      gsim.prep <- input %>%
+      gsi.prep <- input %>%
         tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_") %>%  # separate the genotypes into alleles
         tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID))
     }
@@ -1299,7 +1299,7 @@ package and update your whitelist")
     }
     
     if (data.type == "haplo.file") { # for haplotypes
-      gsim.prep <- suppressWarnings(
+      gsi.prep <- suppressWarnings(
         input %>%
           mutate(GT = stri_replace_all_fixed(GT, "-", "000/000", vectorize_all=F)) %>% 
           arrange(MARKERS) %>% 
@@ -1317,7 +1317,7 @@ package and update your whitelist")
     }
     if (data.type == "haplo.file" & assignment.analysis == "gsi_sim") {
       input <- suppressWarnings(
-        gsim.prep %>%
+        gsi.prep %>%
           filter(GT != "000") %>% 
           tidyr::spread(data = ., key = MARKERS, value = GT) %>% # this reintroduce the missing, but with NA
           ungroup() %>% 
@@ -1338,25 +1338,25 @@ package and update your whitelist")
           tidyr::unite(data = ., GT, A1, A2, sep = "", remove = TRUE)
       )
       
-      # for haplo.file we need to change back again the gsim.prep file
-      gsim.prep <- input %>% 
+      # for haplo.file we need to change back again the gsi.prep file
+      gsi.prep <- input %>% 
         tidyr::separate(data = ., col = GT, into = .(A1, A2), sep = 3, remove = TRUE) %>% 
         tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) 
     }
     if (data.type == "df.file") { # For data frame of genotypes
-      gsim.prep <- input %>% 
+      gsi.prep <- input %>% 
         tidyr::separate(data = ., col = GT, into = .(A1, A2), sep = 3, remove = TRUE) %>% 
         tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) 
     }
     if (data.type == "plink.file") { # for PLINK
-      gsim.prep <- input %>% 
+      gsi.prep <- input %>% 
         tidyr::separate(col = INDIVIDUALS_ALLELES, into = c("INDIVIDUALS_2", "ALLELES"), sep = "_", remove = TRUE) %>% 
         select(-INDIVIDUALS_2)
     }
     if (data.type == "df.file" | data.type == "plink.file" | data.type == "haplo.file") {
       if (assignment.analysis == "adegenet" ) {
         genind.prep <- suppressWarnings(
-          gsim.prep %>%
+          gsi.prep %>%
             filter(GT != "000") %>% 
             tidyr::spread(data = ., key = MARKERS, value = GT) %>% # this reintroduce the missing, but with NA
             ungroup() %>% 
@@ -1397,6 +1397,9 @@ package and update your whitelist")
       }
     }
     
+    gsi.prep <- gsi.prep %>% 
+      arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES)
+
     # only adegenet
     if (assignment.analysis == "adegenet") {
       # genind arguments common to all data.type
@@ -1461,7 +1464,7 @@ package and update your whitelist")
       if (data.type == "plink.file") {
         
         if (impute == "genotype"){
-          input.prep <- gsim.prep %>% 
+          input.prep <- gsi.prep %>% 
             group_by(MARKERS, INDIVIDUALS, POP_ID) %>% 
             tidyr::spread(data = ., key = ALLELES, value = GT) %>% 
             tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE) %>% 
@@ -1476,7 +1479,7 @@ package and update your whitelist")
         }
         
         if (impute == "allele"){
-          input.prep <- gsim.prep %>%
+          input.prep <- gsi.prep %>%
             mutate(
               GT = stri_replace_all_fixed(GT, pattern = "000", replacement = "NA", vectorize_all = FALSE),
               GT = replace(GT, which(GT == "NA"), NA)
@@ -1503,7 +1506,7 @@ package and update your whitelist")
             arrange(POP_ID, INDIVIDUALS)
         }
         if (impute == "allele") {
-          input.prep <- gsim.prep %>%
+          input.prep <- gsi.prep %>%
             mutate(
               GT = stri_replace_all_fixed(GT, pattern = "000", replacement = "NA", vectorize_all = FALSE),
               GT = replace(GT, which(GT == "NA"), NA)
@@ -1663,7 +1666,7 @@ package and update your whitelist")
       } # End imputations max
       
       # prepare the imputed dataset for gsi_sim or adegenet
-      message("Preparing imputed data set for gsi_sim...")
+      message("Preparing imputed data set...")
       if (assignment.analysis == "gsi_sim") {
         if (impute == "genotype") {
           if (data.type == "vcf.file") { # for VCF input
@@ -1690,22 +1693,22 @@ package and update your whitelist")
       }
       
       # adegenet
-      if (data.type == "vcf.file" & assignment.analysis == "adegenet" ) {
-        genind.prep.imp <- input.imp %>% 
-          tidyr::gather(key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS, ALLELES)) %>% # make tidy
-          tidyr::spread(data = ., key = ALLELES, value = GT) %>%
-          tidyr::unite(GT, A1, A2, sep = ":", remove = TRUE) %>%
-          mutate(GT = stri_replace_all_fixed(str = GT, pattern = c("0:0", "1:1", "0:1", "1:0"), replacement = c("2_0", "0_2", "1_1", "1_1"), vectorize_all = FALSE)) %>%
-          arrange(MARKERS, POP_ID) %>%
-          tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>%
-          tidyr::gather(key = ALLELES, value = COUNT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% # make tidy
-          tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
-          group_by(POP_ID, INDIVIDUALS) %>%
-          tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>%
-          arrange(POP_ID, INDIVIDUALS)
-      }
-      if (data.type == "df.file" | data.type == "plink.file" | data.type == "haplo.file") {
-        if (assignment.analysis == "adegenet" ) {
+      if (assignment.analysis == "adegenet") {
+        if (data.type == "vcf.file") {
+          genind.prep.imp <- input.imp %>% 
+            tidyr::gather(key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS, ALLELES)) %>% # make tidy
+            tidyr::spread(data = ., key = ALLELES, value = GT) %>%
+            tidyr::unite(GT, A1, A2, sep = ":", remove = TRUE) %>%
+            mutate(GT = stri_replace_all_fixed(str = GT, pattern = c("0:0", "1:1", "0:1", "1:0"), replacement = c("2_0", "0_2", "1_1", "1_1"), vectorize_all = FALSE)) %>%
+            arrange(MARKERS, POP_ID) %>%
+            tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>%
+            tidyr::gather(key = ALLELES, value = COUNT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% # make tidy
+            tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
+            group_by(POP_ID, INDIVIDUALS) %>%
+            tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>%
+            arrange(POP_ID, INDIVIDUALS)
+        }
+        if (data.type == "df.file" | data.type == "plink.file" | data.type == "haplo.file") {
           genind.prep.imp <- suppressWarnings(
             input.imp %>%
               ungroup() %>% 
@@ -1737,10 +1740,6 @@ package and update your whitelist")
               arrange(POP_ID, INDIVIDUALS)
           )
         }
-      }
-      
-      # only adegenet
-      if (assignment.analysis == "adegenet") {
         # genind arguments common to all data.type
         ind <- as.character(genind.prep.imp$INDIVIDUALS)
         pop <- genind.prep.imp$POP_ID
@@ -1762,8 +1761,8 @@ package and update your whitelist")
         genind.df <- NULL
         # genind.prep <- NULL
         # genind.prep.imp <- NULL
-      }
-      
+        
+      } # end adegenet
       input.imp <- NULL # remove unused object
     } # End imputations
     
@@ -2012,7 +2011,7 @@ package and update your whitelist")
                                  If you have internet access, you can install it
                                  from within R by invoking the function \"install_gsi_sim(fromSource = TRUE)\"")
       assignment_analysis <- function(data, select.markers, markers.names, missing.data, i, m, holdout, filename, ...) {
-        # data <- gsim.prep #test
+        # data <- gsi.prep #test
         # data <- genind.prep #test
         # missing.data <- "no.imputation" #test
         data.select <- suppressWarnings(
@@ -2338,7 +2337,7 @@ Progress can be monitored with activity in the folder...")
                                            )
         )
         if (assignment.analysis == "gsi_sim") {
-          assignment.no.imp <- assignment_analysis(data = gsim.prep,
+          assignment.no.imp <- assignment_analysis(data = gsi.prep,
                                                    select.markers = select.markers,
                                                    markers.names = markers.names,
                                                    missing.data = "no.imputation", 
@@ -2548,7 +2547,7 @@ Progress can be monitored with activity in the folder...")
       message("Conducting Assignment analysis with ranked markers")
       
       # if (data.type == "haplo.file") {
-      #   input <- gsim.prep %>%
+      #   input <- gsi.prep %>%
       #     mutate(GT = stri_pad_left(str = GT, width = 3, pad = "0")) %>% 
       #     group_by(POP_ID, INDIVIDUALS, MARKERS) %>% 
       #     tidyr::spread(data = ., key = ALLELES, value = GT) %>% 
@@ -2706,7 +2705,7 @@ Progress can be monitored with activity in the folder...")
                                              )
           )
           if (assignment.analysis == "gsi_sim") {
-            assignment.no.imp <- assignment_analysis(data = gsim.prep,
+            assignment.no.imp <- assignment_analysis(data = gsi.prep,
                                                      select.markers = select.markers,
                                                      markers.names = markers.names,
                                                      missing.data = "no.imputation", 
@@ -2814,7 +2813,7 @@ Progress can be monitored with activity in the folder...")
           i = i,
           holdout = holdout,
           input = input,
-          gsim.prep = gsim.prep,
+          gsi.prep = gsi.prep,
           input.imp = input.imp,
           # vcf = vcf.imp, # was an error before, double check...
           gsi.prep.imp = gsi.prep.imp,
diff --git a/README.md b/README.md
index 9f2e3c9..bb50ece 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ This is the development page of the **assigner** package for the R software.
 
 **Use assigner to:**
 
-* **Conduct assignment analysis** using [gsi_sim] (https://github.com/eriqande/gsi_sim), a tool developed 
+* **Conduct assignment analysis** and **mixture analysis** using [gsi_sim] (https://github.com/eriqande/gsi_sim), a tool developed 
 by Eric C. Anderson (see Anderson et al. 2008 and Anderson 2010) or [adegenet] (https://github.com/thibautjombart/adegenet), a R package developed by Thibaul Jombart, to conduct the assignment analysis.
 * The input file are:
     1. a VCF file format (Danecek et al. 2011) (*batch_x.vcf*) produced by [STACKS] (http://catchenlab.life.illinois.edu/stacks/) (Catchen et al. 2011, 2013),
@@ -140,6 +140,10 @@ The Amazon image can be imported into Google Cloud Compute Engine to start a new
 
 ## New
 
+**v.0.2.1**
+* updated the function `assignment_mixture` with `sampling.method = "ranked"` and 
+`assignment.analysis = "adegenet"`.
+
 **v.0.2.0**
 * new function: `assignment_mixture` for mixture analysis.
 
diff --git a/man/assignment_mixture.Rd b/man/assignment_mixture.Rd
index 659eeb0..0c08e69 100644
--- a/man/assignment_mixture.Rd
+++ b/man/assignment_mixture.Rd
@@ -30,7 +30,7 @@ to convert very large VCF file. For \code{.ped} file conversion to
 with \code{--recode transpose}.}
 
 \item{mixture}{A text file mixture individual ID. The column header is 
-'INDIVIDUALS'and the file is in the working directory (e.g. "mixture.txt").}
+\code{INDIVIDUALS} and the file is in the working directory (e.g. "mixture.txt").}
 
 \item{assignment.analysis}{Assignment analysis conducted with 
 \code{assignment.analysis = "gsi_sim"} or 
@@ -118,29 +118,12 @@ chosen based on ranked Fst \code{"ranked"} using the baseline samples
 for the training and the mixture samples as holdout. 
 Classic Leave-one-out is then used to assign individual mixture samples.}
 
-\item{thl}{(character, integer, proportion) For \code{sampling.method = "ranked"} only.
-Default \code{1}, 1 individual sample is used as holdout. This individual is not
-participating in the markers ranking. For each marker number,
-the analysis will be repeated with all the indiviuals in the data set
-(e.g. 500 individuals, 500 times 500, 1000, 2000 markers).
-If a proportion is used e.g. \code{0.15},= 15% of individuals in each
-populations are chosen randomly as holdout individuals.
-With \code{thl = "all"} all individuals are used for ranking (not good) and
-\code{iteration.method} argument below is set to \code{1} by default.
-For the other thl values, you can create different holdout individuals lists
-with the \code{iteration.method} argument below.}
-
 \item{iteration.method}{With random marker selection the iterations argument =
 the number of iterations to repeat marker resampling. 
 Default: \code{iteration.method = 10}.
 With \code{marker.number = c(500, 1000)} and default iterations setting,
 500 markers will be randomly chosen 10 times and 1000 markers will be randomly
-chosen 10 times. For the ranked method, using \code{thl = 1}, the analysis
-will be repeated for each individuals in the data set for every
-\code{marker.number} selected. With a proportion argument \code{thl = 0.15},
-15% of individuals in each populations are chosen randomly as holdout
-individuals and this process is reapeated the number of times chosen by the
-\code{iteration.method} value.}
+chosen 10 times.}
 
 \item{folder}{(optional) The name of the folder created in the working directory to save the files/results.}
 
@@ -172,8 +155,8 @@ with the pop id in it, use the \code{strata} argument.}
 
 \item{strata}{(optional) A tab delimited file with 2 columns with header:
 \code{INDIVIDUALS} and \code{STRATA}. Default: \code{strata = NULL}. With a 
-data frame of genotypes the strata is the INDIVIDUALS and POP_ID columns, with
-PLINK files, the \code{tfam} first 2 columns are used. 
+data frame of genotypes the strata is the INDIVIDUALS and POP_ID column. With
+a PLINK file, the first 2 columns of the \code{tfam} are used. 
 If a \code{strata} file is specified, the strata file will have
 precedence. The \code{STRATA} column can be any hierarchical grouping.}
 
@@ -278,17 +261,19 @@ instruction here: \url{https://github.com/eriqande/gsi_sim}.
 }
 \examples{
 \dontrun{
+# with adegenet DAPC for the assignment and sampling.method = "random":
 assignment.treefrog <- assignment_mixture(
 data = "batch_1.vcf",
+mixture = "mixture.treefrog.tsv",
+assignment.analysis = "adegenet",
 whitelist.markers = "whitelist.vcf.txt",
 snp.ld = NULL,
 common.markers = TRUE,
 marker.number = c(500, 5000, "all"),
-sampling.method = "ranked",
-thl = 0.3,
-blacklist.id = "blacklist.id.lobster.tsv",
+sampling.method = "random",
+blacklist.id = "blacklist.id.tsv",
 subsample = 25,
-iteration.subsample = 10
+iteration.subsample = 5
 filename = "treefrog.txt",
 keep.gsi.files = FALSE,
 pop.levels = c("PAN", "COS")
@@ -296,12 +281,36 @@ pop.id.start = 5, pop.id.end = 7,
 imputation.method = FALSE,
 parallel.core = 12
 )
+# with gsi_sim for the mixture assignment and sampling.method = "ranked"
+# Here I also want to impute the genotypes of the data (baseline and mixture) 
+# using random forest:
+assignment.tuna <- assignment_mixture(
+data = "data.frame.genotypes.tuna.tsv",
+mixture = "cohort.tuna.tsv"
+assignment.analysis = "gsi_sim",
+common.markers = TRUE,
+marker.number = c(100, 200, 300),
+sampling.method = "ranked",
+subsample = 25,
+iteration.subsample = 5
+filename = "tuna.txt",
+keep.gsi.files = FALSE,
+pop.levels = c("BAJ", "IND"),
+imputation.method = "rf", 
+impute.mixture = TRUE, 
+impute = "genotype", 
+imputations.group = "populations", 
+num.tree = 100, 
+iteration.rf = 10, 
+split.number = 100, 
+verbose = FALSE,
+parallel.core = 12
+)
 
 Since the 'folder' argument is missing, it will be created automatically
 inside your working directory.
 
-To create a dataframe with the assignment results: 
-assignment <- assignment.treefrog$assignment.
+use $ to access the data frames in the list created.
 }
 }
 \author{
diff --git a/man/assignment_ngs.Rd b/man/assignment_ngs.Rd
index b126496..a7096fa 100644
--- a/man/assignment_ngs.Rd
+++ b/man/assignment_ngs.Rd
@@ -276,7 +276,7 @@ common.markers = TRUE,
 marker.number = c(500, 5000, "all"),
 sampling.method = "ranked",
 thl = 0.3,
-blacklist.id = "blacklist.id.lobster.tsv",
+blacklist.id = "blacklist.id.treefrog.tsv",
 subsample = 25,
 iteration.subsample = 10
 gsi_sim.filename = "treefrog.txt",