From 8adcb524a3a09e3d6c9ff79ec7a3096b7845f058 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Flutre?= <timothee.flutre@inrae.fr>
Date: Fri, 21 Aug 2020 13:43:51 +0200
Subject: [PATCH 1/2] add support for parallel in find.clusters

---
 DESCRIPTION          |  8 ++++++--
 R/find.clust.R       | 28 +++++++++++++++++++++++++---
 man/find.clusters.Rd | 21 +++++++++++++++++++--
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2aecbd5..4b7111c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: adegenet
 Title: Exploratory Analysis of Genetic and Genomic Data
-Version: 2.1.3
+Version: 2.1.4
 Authors@R: 
     c(person(given = "Thibaut",
              family = "Jombart",
@@ -59,7 +59,11 @@ Authors@R:
       person(given = "Alexandre",
              family = "Courtiol",
              role = "ctb",
-             comment = c(ORCID = "0000-0003-0637-2959")))
+             comment = c(ORCID = "0000-0003-0637-2959")),
+      person(given = "Timothée",
+             family = "Flutre",
+             role = "ctb",
+             comment = c(ORCID = "0000-0003-4489-4782")))
 Description: Toolset for the exploration of genetic and genomic
     data. Adegenet provides formal (S4) classes for storing and handling
     various genetic data, including genetic markers with varying ploidy
diff --git a/R/find.clust.R b/R/find.clust.R
index 8dfc14c..dc02318 100644
--- a/R/find.clust.R
+++ b/R/find.clust.R
@@ -14,7 +14,8 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU
                                      max.n.clust = round(nrow(x)/10), n.iter = 1e5,
                                      n.start = 10, center = TRUE, scale = TRUE,
                                      pca.select = c("nbEig","percVar"),
-                                     perc.pca = NULL, ..., dudi = NULL){
+                                     perc.pca = NULL, ..., dudi = NULL,
+                                     parallel=FALSE, n.cores=NULL){
 
     ## CHECKS ##
     stat <- match.arg(stat)
@@ -103,7 +104,28 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU
         nbClust <- min.n.clust:max.n.clust
         WSS <- numeric(0)
 
-        for(i in 1:length(nbClust)){
+        if(parallel && is.null(n.cores)){
+          n.cores <- parallel::detectCores()
+        }
+
+        if(parallel){
+          WSS <- unlist(parallel::mclapply(1:length(nbClust),
+                                           function(i){
+                                             if (method == "kmeans") {
+                                               ## kmeans clustering (original method)
+                                               temp <- kmeans(XU, centers = nbClust[i], iter.max = n.iter, nstart = n.start)
+                                               ##WSS[i] <- sum(temp$withinss)
+                                             } else {
+                                               ## ward clustering
+                                               temp <- list()
+                                               temp$cluster <- cutree(hclust(dist(XU)^2, method = "ward.D2"), k = nbClust[i])
+                                             }
+                                             WSS[i] <- .compute.wss(XU, temp$cluster)
+                                           },
+                                           mc.cores=n.cores, mc.silent=TRUE,
+                                           mc.cleanup=TRUE, mc.preschedule=FALSE))
+        } else{
+          for(i in 1:length(nbClust)){
             if (method == "kmeans") {
                 ## kmeans clustering (original method)
                 temp <- kmeans(XU, centers = nbClust[i], iter.max = n.iter, nstart = n.start)
@@ -114,7 +136,7 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU
                 temp$cluster <- cutree(hclust(dist(XU)^2, method = "ward.D2"), k = nbClust[i])
             }
                 WSS[i] <- .compute.wss(XU, temp$cluster)
-            
+          }
         }
 
 
diff --git a/man/find.clusters.Rd b/man/find.clusters.Rd
index 6a57be1..13d38e3 100644
--- a/man/find.clusters.Rd
+++ b/man/find.clusters.Rd
@@ -27,7 +27,8 @@ The K-means procedure used in \code{find.clusters} is
 \code{\link[stats]{kmeans}} function from the \code{stats} package. The PCA
 function is \code{\link[ade4]{dudi.pca}} from the \code{ade4} package, except
 for \linkS4class{genlight} objects which use the \code{\link{glPca}} procedure
-from adegenet.
+from adegenet. When the \code{parallel} package is available, \code{glPca}
+uses multiple-core resources for more efficient computations.
 
 \code{find.clusters} is a generic function with methods for the
  following types of objects:\cr
@@ -46,7 +47,7 @@ from adegenet.
               "smoothNgoesup", "goodfit"), max.n.clust = round(nrow(x)/10),
               n.iter = 1e5, n.start = 10, center = TRUE, scale = TRUE,
               pca.select = c("nbEig","percVar"), perc.pca = NULL, \ldots, dudi =
-              NULL)
+              NULL, parallel = FALSE, n.cores = NULL)
 
 \method{find.clusters}{matrix}(x, \ldots)
 
@@ -158,6 +159,15 @@ from adegenet.
     the ade4 package). If provided, prior PCA will be ignored, and this object
     will be used as a prior step for variable orthogonalisation.}
 
+\item{parallel}{a logical indicating whether multiple cores -if
+    available- should be used for the computations (TRUE), or not
+    (FALSE, default); requires the package \code{parallel} to be
+    installed (see details).}
+
+\item{n.cores}{if \code{parallel} is TRUE, the number of cores to be
+    used in the computations; if NULL, then the maximum number of cores
+    available on the computer is used.}
+
 \item{glPca}{an optional \code{\link{glPca}} object; if provided, dimension
     reduction is not performed (saving computational time) but taken directly
     from this object.}
@@ -220,6 +230,13 @@ from adegenet.
   number of clusters. This approach does not rely on differences between
   successive statistics, but on absolute fit. It selects the model with
   the smallest K so that the overall fit is above a given threshold.
+
+  === Using multiple cores ===
+  
+  Most recent machines have one or several processors with multiple
+  cores. R processes usually use one single core. The package
+  \code{parallel} allows for parallelizing some computations on
+  multiple cores, which can drastically decrease computation time.
 }
 \value{
   The class \code{find.clusters} is a list with the following

From c59ad282baf248a412a9dcbda546231feb23a91b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Flutre?= <timothee.flutre@inrae.fr>
Date: Fri, 21 Aug 2020 13:43:51 +0200
Subject: [PATCH 2/2] add support for parallel in find.clusters

---
 DESCRIPTION          |  8 ++++++--
 R/find.clust.R       | 28 +++++++++++++++++++++++++---
 man/find.clusters.Rd | 21 +++++++++++++++++++--
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2aecbd5..4b7111c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: adegenet
 Title: Exploratory Analysis of Genetic and Genomic Data
-Version: 2.1.3
+Version: 2.1.4
 Authors@R: 
     c(person(given = "Thibaut",
              family = "Jombart",
@@ -59,7 +59,11 @@ Authors@R:
       person(given = "Alexandre",
              family = "Courtiol",
              role = "ctb",
-             comment = c(ORCID = "0000-0003-0637-2959")))
+             comment = c(ORCID = "0000-0003-0637-2959")),
+      person(given = "Timothée",
+             family = "Flutre",
+             role = "ctb",
+             comment = c(ORCID = "0000-0003-4489-4782")))
 Description: Toolset for the exploration of genetic and genomic
     data. Adegenet provides formal (S4) classes for storing and handling
     various genetic data, including genetic markers with varying ploidy
diff --git a/R/find.clust.R b/R/find.clust.R
index 8dfc14c..dc02318 100644
--- a/R/find.clust.R
+++ b/R/find.clust.R
@@ -14,7 +14,8 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU
                                      max.n.clust = round(nrow(x)/10), n.iter = 1e5,
                                      n.start = 10, center = TRUE, scale = TRUE,
                                      pca.select = c("nbEig","percVar"),
-                                     perc.pca = NULL, ..., dudi = NULL){
+                                     perc.pca = NULL, ..., dudi = NULL,
+                                     parallel=FALSE, n.cores=NULL){
 
     ## CHECKS ##
     stat <- match.arg(stat)
@@ -103,7 +104,28 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU
         nbClust <- min.n.clust:max.n.clust
         WSS <- numeric(0)
 
-        for(i in 1:length(nbClust)){
+        if(parallel && is.null(n.cores)){
+          n.cores <- parallel::detectCores()
+        }
+
+        if(parallel){
+          WSS <- unlist(parallel::mclapply(1:length(nbClust),
+                                           function(i){
+                                             if (method == "kmeans") {
+                                               ## kmeans clustering (original method)
+                                               temp <- kmeans(XU, centers = nbClust[i], iter.max = n.iter, nstart = n.start)
+                                               ##WSS[i] <- sum(temp$withinss)
+                                             } else {
+                                               ## ward clustering
+                                               temp <- list()
+                                               temp$cluster <- cutree(hclust(dist(XU)^2, method = "ward.D2"), k = nbClust[i])
+                                             }
+                                             WSS[i] <- .compute.wss(XU, temp$cluster)
+                                           },
+                                           mc.cores=n.cores, mc.silent=TRUE,
+                                           mc.cleanup=TRUE, mc.preschedule=FALSE))
+        } else{
+          for(i in 1:length(nbClust)){
             if (method == "kmeans") {
                 ## kmeans clustering (original method)
                 temp <- kmeans(XU, centers = nbClust[i], iter.max = n.iter, nstart = n.start)
@@ -114,7 +136,7 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU
                 temp$cluster <- cutree(hclust(dist(XU)^2, method = "ward.D2"), k = nbClust[i])
             }
                 WSS[i] <- .compute.wss(XU, temp$cluster)
-            
+          }
         }
 
 
diff --git a/man/find.clusters.Rd b/man/find.clusters.Rd
index 6a57be1..13d38e3 100644
--- a/man/find.clusters.Rd
+++ b/man/find.clusters.Rd
@@ -27,7 +27,8 @@ The K-means procedure used in \code{find.clusters} is
 \code{\link[stats]{kmeans}} function from the \code{stats} package. The PCA
 function is \code{\link[ade4]{dudi.pca}} from the \code{ade4} package, except
 for \linkS4class{genlight} objects which use the \code{\link{glPca}} procedure
-from adegenet.
+from adegenet. When the \code{parallel} package is available, \code{glPca}
+uses multiple-core resources for more efficient computations.
 
 \code{find.clusters} is a generic function with methods for the
  following types of objects:\cr
@@ -46,7 +47,7 @@ from adegenet.
               "smoothNgoesup", "goodfit"), max.n.clust = round(nrow(x)/10),
               n.iter = 1e5, n.start = 10, center = TRUE, scale = TRUE,
               pca.select = c("nbEig","percVar"), perc.pca = NULL, \ldots, dudi =
-              NULL)
+              NULL, parallel = FALSE, n.cores = NULL)
 
 \method{find.clusters}{matrix}(x, \ldots)
 
@@ -158,6 +159,15 @@ from adegenet.
     the ade4 package). If provided, prior PCA will be ignored, and this object
     will be used as a prior step for variable orthogonalisation.}
 
+\item{parallel}{a logical indicating whether multiple cores -if
+    available- should be used for the computations (TRUE), or not
+    (FALSE, default); requires the package \code{parallel} to be
+    installed (see details).}
+
+\item{n.cores}{if \code{parallel} is TRUE, the number of cores to be
+    used in the computations; if NULL, then the maximum number of cores
+    available on the computer is used.}
+
 \item{glPca}{an optional \code{\link{glPca}} object; if provided, dimension
     reduction is not performed (saving computational time) but taken directly
     from this object.}
@@ -220,6 +230,13 @@ from adegenet.
   number of clusters. This approach does not rely on differences between
   successive statistics, but on absolute fit. It selects the model with
   the smallest K so that the overall fit is above a given threshold.
+
+  === Using multiple cores ===
+  
+  Most recent machines have one or several processors with multiple
+  cores. R processes usually use one single core. The package
+  \code{parallel} allows for parallelizing some computations on
+  multiple cores, which can drastically decrease computation time.
 }
 \value{
   The class \code{find.clusters} is a list with the following