Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

glPca order of magnitude faster #150

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 168 additions & 0 deletions R/glFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,174 @@ glDotProd <- function(x, center=FALSE, scale=FALSE, alleleAsUnit=FALSE,



##
## PCA for genlight objects, trading memory for speed
## for 5 MB `genlight` the full matrix is 16.6 MB and the
## scaling is linear
##
## from the timing tests it looks like the NA handling in
## particular is killing the performance
##
## 'elapsed' time for
## system.time(pca <- X) where X is:
## A) glPca(aa.genlight.tatry, nf=10)
## B) glPca(aa.genlight.tatry, nf=10, useC = F)
## C) glPca(aa.genlight.tatry, nf=10, parallel = F, useC = F)
## D) glPcaBigFast(aa.genlight.tatry, nf=10)
##
## for second test NAs were substituted by 0
##
## 43 individuals, 50k loci, 119414 NAs
## A) 170.1 s no NAs ~80 s
## B) 169 s
## C) 14 s
## D) 0.75 s both
##
## 88 individuals, 73631 loci, 273025 NAs
## A) 1171.3 s no NAs 46.7 s
## B) 65.2 no NAs 68.5 s
## C) 59.0 no NAs 68.5 s
## D) 3.2 s both
##
## 105 individuals, 85979 loci, 433076 NAs
## A) 2365.7 s no NAs 83.8 s
## B) 104.6 no NAs 122.6 s
## C) 105.3 s no NAs 119.6
## D) 4.5 s both

glPcaFast <- function(x,
                      center=TRUE,
                      scale=FALSE,
                      nf=NULL,
                      loadings=TRUE,
                      alleleAsUnit=FALSE,
                      returnDotProd=FALSE){
    ## Fast PCA for genlight objects: materialises the full
    ## individuals x loci matrix so all pairwise dot products are
    ## computed in one BLAS call (tcrossprod) instead of the
    ## per-pair loops used by glPca().
    ##
    ## Arguments mirror glPca():
    ##   x             a genlight object
    ##   center, scale centre / scale allele frequencies before PCA
    ##   nf            number of axes to retain (prompted for if NULL)
    ##   loadings      also compute the per-locus loadings
    ##   alleleAsUnit  passed through to glMean()/glVar()
    ##   returnDotProd also return the n x n dot-product matrix
    ## Returns a list of class "glPca" (eig, scores, optionally
    ## loadings and dotProd, plus the matched call).

    if(!inherits(x, "genlight")) stop("x is not a genlight object")

    ## keep the original mean / var code, as it's used further down
    ## and carries the NA checks
    if(center) {
        vecMeans <- glMean(x, alleleAsUnit=alleleAsUnit)
        if(any(is.na(vecMeans))) stop("NAs detected in the vector of means")
    }

    if(scale){
        vecVar <- glVar(x, alleleAsUnit=alleleAsUnit)
        if(any(is.na(vecVar))) stop("NAs detected in the vector of variances")
        ## scale by the standard deviation in BOTH the scores and the
        ## loadings computations; previously the scores were divided by
        ## the variance while the loadings were divided by the sd,
        ## which was inconsistent
        vecSd <- sqrt(vecVar)
    }

    ## convert to a dense individuals x loci matrix, trying to keep
    ## the NA handling as similar to the original as possible
    ## - dividing by ploidy keeps the NAs
    mx <- t(sapply(x$gen, as.integer)) / ploidy(x)

    ## impute NAs with the per-locus mean (or 0 when not centering)
    NAidx <- which(is.na(mx), arr.ind = TRUE)
    if (center) {
        mx[NAidx] <- vecMeans[NAidx[, 2]]
    } else {
        mx[NAidx] <- 0
    }

    ## center and scale
    mx <- scale(mx,
                center = if (center) vecMeans else FALSE,
                scale = if (scale) vecSd else FALSE)

    ## all dot products at once using the underlying BLAS;
    ## to support thousands of samples this could be replaced by a
    ## truncated SVD, but it would require more changes in the
    ## code around
    allProd <- tcrossprod(mx) / nInd(x) # assume uniform weights

    ## PERFORM THE ANALYSIS ##
    ## eigenanalysis
    eigRes <- eigen(allProd, symmetric=TRUE, only.values=FALSE)
    rank <- sum(eigRes$values > 1e-12)
    eigRes$values <- eigRes$values[1:rank]
    eigRes$vectors <- eigRes$vectors[, 1:rank, drop=FALSE]

    ## scan nb of axes retained (interactive prompt when nf is NULL)
    if(is.null(nf)){
        barplot(eigRes$values, main="Eigenvalues", col=heat.colors(rank))
        cat("Select the number of axes: ")
        nf <- as.integer(readLines(n = 1))
    }

    ## rescale PCs
    res <- list()
    res$eig <- eigRes$values
    ## never retain more axes than numerically meaningful eigenvalues
    nf <- min(nf, sum(res$eig > 1e-10))

    ## use: li = XQU = V\Lambda^(1/2)
    eigRes$vectors <- eigRes$vectors * sqrt(nInd(x)) # D-normalize vectors
    res$scores <- sweep(eigRes$vectors[, 1:nf, drop=FALSE], 2,
                        sqrt(eigRes$values[1:nf]), FUN="*")

    ## GET LOADINGS ##
    ## need to decompose X^TDV into a sum of n matrices of dim p*r,
    ## but only two such matrices are represented at a time
    if(loadings){
        res$loadings <- matrix(0, nrow=nLoc(x), ncol=nf) # preallocate
        ## use: c1 = X^TDV
        ## and X^TV = A_1 + ... + A_n
        ## with A_k = X_[k-]^T v[k-]
        myPloidy <- ploidy(x)
        for(k in seq_len(nInd(x))){
            temp <- as.integer(x@gen[[k]]) / myPloidy[k]
            if(center) {
                ## mean-impute NAs, then centre
                temp[is.na(temp)] <- vecMeans[is.na(temp)]
                temp <- temp - vecMeans
            } else {
                temp[is.na(temp)] <- 0
            }
            if(scale){
                temp <- temp / vecSd
            }
            res$loadings <- res$loadings +
                matrix(temp) %*% eigRes$vectors[k, 1:nf, drop=FALSE]
        }

        res$loadings <- res$loadings / nInd(x) # don't forget the /n of X_tDV
        res$loadings <- sweep(res$loadings, 2,
                              sqrt(eigRes$values[1:nf]), FUN="/")
    }

    ## FORMAT OUTPUT ##
    colnames(res$scores) <- paste("PC", 1:nf, sep="")
    if(!is.null(indNames(x))){
        rownames(res$scores) <- indNames(x)
    } else {
        rownames(res$scores) <- 1:nInd(x)
    }

    if(!is.null(res$loadings)){
        colnames(res$loadings) <- paste("Axis", 1:nf, sep="")
        ## scalar condition: use && rather than elementwise &
        if(!is.null(locNames(x)) && !is.null(alleles(x))){
            rownames(res$loadings) <- paste(locNames(x), alleles(x), sep=".")
        } else {
            rownames(res$loadings) <- 1:nLoc(x)
        }
    }

    if(returnDotProd){
        res$dotProd <- allProd
        rownames(res$dotProd) <- colnames(res$dotProd) <- indNames(x)
    }

    res$call <- match.call()

    class(res) <- "glPca"

    return(res)
}


########
Expand Down