From 8adcb524a3a09e3d6c9ff79ec7a3096b7845f058 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Flutre?= Date: Fri, 21 Aug 2020 13:43:51 +0200 Subject: [PATCH 1/2] add support for parallel in find.clusters --- DESCRIPTION | 8 ++++++-- R/find.clust.R | 28 +++++++++++++++++++++++++--- man/find.clusters.Rd | 21 +++++++++++++++++++-- 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2aecbd5..4b7111c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: adegenet Title: Exploratory Analysis of Genetic and Genomic Data -Version: 2.1.3 +Version: 2.1.4 Authors@R: c(person(given = "Thibaut", family = "Jombart", @@ -59,7 +59,11 @@ Authors@R: person(given = "Alexandre", family = "Courtiol", role = "ctb", - comment = c(ORCID = "0000-0003-0637-2959"))) + comment = c(ORCID = "0000-0003-0637-2959")), + person(given = "Timothée", + family = "Flutre", + role = "ctb", + comment = c(ORCID = "0000-0003-4489-4782"))) Description: Toolset for the exploration of genetic and genomic data. 
Adegenet provides formal (S4) classes for storing and handling various genetic data, including genetic markers with varying ploidy diff --git a/R/find.clust.R b/R/find.clust.R index 8dfc14c..dc02318 100644 --- a/R/find.clust.R +++ b/R/find.clust.R @@ -14,7 +14,8 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU max.n.clust = round(nrow(x)/10), n.iter = 1e5, n.start = 10, center = TRUE, scale = TRUE, pca.select = c("nbEig","percVar"), - perc.pca = NULL, ..., dudi = NULL){ + perc.pca = NULL, ..., dudi = NULL, + parallel=FALSE, n.cores=NULL){ ## CHECKS ## stat <- match.arg(stat) @@ -103,7 +104,28 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU nbClust <- min.n.clust:max.n.clust WSS <- numeric(0) - for(i in 1:length(nbClust)){ + if(parallel && is.null(n.cores)){ + n.cores <- parallel::detectCores() + } + + if(parallel){ + WSS <- unlist(parallel::mclapply(1:length(nbClust), + function(i){ + if (method == "kmeans") { + ## kmeans clustering (original method) + temp <- kmeans(XU, centers = nbClust[i], iter.max = n.iter, nstart = n.start) + ##WSS[i] <- sum(temp$withinss) + } else { + ## ward clustering + temp <- list() + temp$cluster <- cutree(hclust(dist(XU)^2, method = "ward.D2"), k = nbClust[i]) + } + WSS[i] <- .compute.wss(XU, temp$cluster) + }, + mc.cores=n.cores, mc.silent=TRUE, + mc.cleanup=TRUE, mc.preschedule=FALSE)) + } else{ + for(i in 1:length(nbClust)){ if (method == "kmeans") { ## kmeans clustering (original method) temp <- kmeans(XU, centers = nbClust[i], iter.max = n.iter, nstart = n.start) @@ -114,7 +136,7 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU temp$cluster <- cutree(hclust(dist(XU)^2, method = "ward.D2"), k = nbClust[i]) } WSS[i] <- .compute.wss(XU, temp$cluster) - + } } diff --git a/man/find.clusters.Rd b/man/find.clusters.Rd index 6a57be1..13d38e3 100644 --- a/man/find.clusters.Rd +++ b/man/find.clusters.Rd @@ -27,7 +27,8 @@ The K-means 
procedure used in \code{find.clusters} is \code{\link[stats]{kmeans}} function from the \code{stats} package. The PCA function is \code{\link[ade4]{dudi.pca}} from the \code{ade4} package, except for \linkS4class{genlight} objects which use the \code{\link{glPca}} procedure -from adegenet. +from adegenet. When the \code{parallel} package is available, \code{glPca} +uses multiple-core resources for more efficient computations. \code{find.clusters} is a generic function with methods for the following types of objects:\cr @@ -46,7 +47,7 @@ from adegenet. "smoothNgoesup", "goodfit"), max.n.clust = round(nrow(x)/10), n.iter = 1e5, n.start = 10, center = TRUE, scale = TRUE, pca.select = c("nbEig","percVar"), perc.pca = NULL, \ldots, dudi = - NULL) + NULL, parallel = FALSE, n.cores = NULL) \method{find.clusters}{matrix}(x, \ldots) @@ -158,6 +159,15 @@ from adegenet. the ade4 package). If provided, prior PCA will be ignored, and this object will be used as a prior step for variable orthogonalisation.} +\item{parallel}{a logical indicating whether multiple cores -if + available- should be used for the computations (TRUE), or + not (FALSE, default); requires the package \code{parallel} to be installed + (see details).} + +\item{n.cores}{if \code{parallel} is TRUE, the number of cores to be + used in the computations; if NULL, then the maximum number of cores + available on the computer is used.} + \item{glPca}{an optional \code{\link{glPca}} object; if provided, dimension reduction is not performed (saving computational time) but taken directly from this object.} @@ -220,6 +230,13 @@ from adegenet. number of clusters. This approach does not rely on differences between successive statistics, but on absolute fit. It selects the model with the smallest K so that the overall fit is above a given threshold. + + === Using multiple cores === + + Most recent machines have one or several processors with multiple + cores. R processes usually use one single core. 
The package + \code{parallel} allows for parallelizing some computations on + multiple cores, which can drastically decrease computation time. } \value{ The class \code{find.clusters} is a list with the following From c59ad282baf248a412a9dcbda546231feb23a91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Flutre?= Date: Fri, 21 Aug 2020 13:43:51 +0200 Subject: [PATCH 2/2] add support for parallel in find.clusters --- DESCRIPTION | 8 ++++++-- R/find.clust.R | 28 +++++++++++++++++++++++++--- man/find.clusters.Rd | 21 +++++++++++++++++++-- 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2aecbd5..4b7111c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: adegenet Title: Exploratory Analysis of Genetic and Genomic Data -Version: 2.1.3 +Version: 2.1.4 Authors@R: c(person(given = "Thibaut", family = "Jombart", @@ -59,7 +59,11 @@ Authors@R: person(given = "Alexandre", family = "Courtiol", role = "ctb", - comment = c(ORCID = "0000-0003-0637-2959"))) + comment = c(ORCID = "0000-0003-0637-2959")), + person(given = "Timothée", + family = "Flutre", + role = "ctb", + comment = c(ORCID = "0000-0003-4489-4782"))) Description: Toolset for the exploration of genetic and genomic data. 
Adegenet provides formal (S4) classes for storing and handling various genetic data, including genetic markers with varying ploidy diff --git a/R/find.clust.R b/R/find.clust.R index 8dfc14c..dc02318 100644 --- a/R/find.clust.R +++ b/R/find.clust.R @@ -14,7 +14,8 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU max.n.clust = round(nrow(x)/10), n.iter = 1e5, n.start = 10, center = TRUE, scale = TRUE, pca.select = c("nbEig","percVar"), - perc.pca = NULL, ..., dudi = NULL){ + perc.pca = NULL, ..., dudi = NULL, + parallel=FALSE, n.cores=NULL){ ## CHECKS ## stat <- match.arg(stat) @@ -103,7 +104,28 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU nbClust <- min.n.clust:max.n.clust WSS <- numeric(0) - for(i in 1:length(nbClust)){ + if(parallel && is.null(n.cores)){ + n.cores <- parallel::detectCores() + } + + if(parallel){ + WSS <- unlist(parallel::mclapply(1:length(nbClust), + function(i){ + if (method == "kmeans") { + ## kmeans clustering (original method) + temp <- kmeans(XU, centers = nbClust[i], iter.max = n.iter, nstart = n.start) + ##WSS[i] <- sum(temp$withinss) + } else { + ## ward clustering + temp <- list() + temp$cluster <- cutree(hclust(dist(XU)^2, method = "ward.D2"), k = nbClust[i]) + } + WSS[i] <- .compute.wss(XU, temp$cluster) + }, + mc.cores=n.cores, mc.silent=TRUE, + mc.cleanup=TRUE, mc.preschedule=FALSE)) + } else{ + for(i in 1:length(nbClust)){ if (method == "kmeans") { ## kmeans clustering (original method) temp <- kmeans(XU, centers = nbClust[i], iter.max = n.iter, nstart = n.start) @@ -114,7 +136,7 @@ find.clusters.data.frame <- function(x, clust = NULL, n.pca = NULL, n.clust = NU temp$cluster <- cutree(hclust(dist(XU)^2, method = "ward.D2"), k = nbClust[i]) } WSS[i] <- .compute.wss(XU, temp$cluster) - + } } diff --git a/man/find.clusters.Rd b/man/find.clusters.Rd index 6a57be1..13d38e3 100644 --- a/man/find.clusters.Rd +++ b/man/find.clusters.Rd @@ -27,7 +27,8 @@ The K-means 
procedure used in \code{find.clusters} is \code{\link[stats]{kmeans}} function from the \code{stats} package. The PCA function is \code{\link[ade4]{dudi.pca}} from the \code{ade4} package, except for \linkS4class{genlight} objects which use the \code{\link{glPca}} procedure -from adegenet. +from adegenet. When the \code{parallel} package is available, \code{glPca} +uses multiple-core resources for more efficient computations. \code{find.clusters} is a generic function with methods for the following types of objects:\cr @@ -46,7 +47,7 @@ from adegenet. "smoothNgoesup", "goodfit"), max.n.clust = round(nrow(x)/10), n.iter = 1e5, n.start = 10, center = TRUE, scale = TRUE, pca.select = c("nbEig","percVar"), perc.pca = NULL, \ldots, dudi = - NULL) + NULL, parallel = FALSE, n.cores = NULL) \method{find.clusters}{matrix}(x, \ldots) @@ -158,6 +159,15 @@ from adegenet. the ade4 package). If provided, prior PCA will be ignored, and this object will be used as a prior step for variable orthogonalisation.} +\item{parallel}{a logical indicating whether multiple cores -if + available- should be used for the computations (TRUE), or + not (FALSE, default); requires the package \code{parallel} to be installed + (see details).} + +\item{n.cores}{if \code{parallel} is TRUE, the number of cores to be + used in the computations; if NULL, then the maximum number of cores + available on the computer is used.} + \item{glPca}{an optional \code{\link{glPca}} object; if provided, dimension reduction is not performed (saving computational time) but taken directly from this object.} @@ -220,6 +230,13 @@ from adegenet. number of clusters. This approach does not rely on differences between successive statistics, but on absolute fit. It selects the model with the smallest K so that the overall fit is above a given threshold. + + === Using multiple cores === + + Most recent machines have one or several processors with multiple + cores. R processes usually use one single core. 
The package + \code{parallel} allows for parallelizing some computations on + multiple cores, which can drastically decrease computation time. } \value{ The class \code{find.clusters} is a list with the following