Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

glPca order of magnitude faster #150

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 168 additions & 0 deletions R/glFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,174 @@ glDotProd <- function(x, center=FALSE, scale=FALSE, alleleAsUnit=FALSE,



##
## PCA for genlight objects, trading memory for speed
## for 5 MB `genlight` the full matrix is 16.6 MB and the
## scaling is linear
##
## from the timing tests it looks like the NA handling in
## particular is killing the performance
##
## 'elapsed' time for
## system.time(pca <- X) where X is:
## A) glPca(aa.genlight.tatry, nf=10)
## B) glPca(aa.genlight.tatry, nf=10, useC = F)
## C) glPca(aa.genlight.tatry, nf=10, parallel = F, useC = F)
## D) glPcaBigFast(aa.genlight.tatry, nf=10)
##
## for second test NAs were substituted by 0
##
## 43 individuals, 50k loci, 119414 NAs
## A) 170.1 s no NAs ~80 s
## B) 169 s
## C) 14 s
## D) 0.75 s both
##
## 88 individuals, 73631 loci, 273025 NAs
## A) 1171.3 s no NAs 46.7 s
## B) 65.2 no NAs 68.5 s
## C) 59.0 no NAs 68.5 s
## D) 3.2 s both
##
## 105 individuals, 85979 loci, 433076 NAs
## A) 2365.7 s no NAs 83.8 s
## B) 104.6 no NAs 122.6 s
## C) 105.3 s no NAs 119.6
## D) 4.5 s both

glPcaFast <- function(x,
                      center=TRUE,
                      scale=FALSE,
                      nf=NULL,
                      loadings=TRUE,
                      alleleAsUnit=FALSE,
                      returnDotProd=FALSE){
    ## Fast PCA for genlight objects: materialises the full
    ## individuals x loci matrix so all pairwise dot products are
    ## computed in one BLAS call (tcrossprod) instead of the
    ## per-pair loops used by glPca().
    ##
    ## Arguments mirror glPca():
    ##   x             a genlight object
    ##   center, scale centre / scale allele frequencies before PCA
    ##   nf            number of axes to retain (prompted for if NULL)
    ##   loadings      also compute the per-locus loadings
    ##   alleleAsUnit  passed through to glMean()/glVar()
    ##   returnDotProd also return the n x n dot-product matrix
    ## Returns a list of class "glPca" (eig, scores, optionally
    ## loadings and dotProd, plus the matched call).

    if(!inherits(x, "genlight")) stop("x is not a genlight object")

    ## keep the original mean / var code, as it's used further down
    ## and carries the NA checks
    if(center) {
        vecMeans <- glMean(x, alleleAsUnit=alleleAsUnit)
        if(any(is.na(vecMeans))) stop("NAs detected in the vector of means")
    }

    if(scale){
        vecVar <- glVar(x, alleleAsUnit=alleleAsUnit)
        if(any(is.na(vecVar))) stop("NAs detected in the vector of variances")
        ## scale by the standard deviation in BOTH the scores and the
        ## loadings computations; previously the scores were divided by
        ## the variance while the loadings were divided by the sd,
        ## which was inconsistent
        vecSd <- sqrt(vecVar)
    }

    ## convert to a dense individuals x loci matrix, trying to keep
    ## the NA handling as similar to the original as possible
    ## - dividing by ploidy keeps the NAs
    mx <- t(sapply(x$gen, as.integer)) / ploidy(x)

    ## impute NAs with the per-locus mean (or 0 when not centering)
    NAidx <- which(is.na(mx), arr.ind = TRUE)
    if (center) {
        mx[NAidx] <- vecMeans[NAidx[, 2]]
    } else {
        mx[NAidx] <- 0
    }

    ## center and scale
    mx <- scale(mx,
                center = if (center) vecMeans else FALSE,
                scale = if (scale) vecSd else FALSE)

    ## all dot products at once using the underlying BLAS;
    ## to support thousands of samples this could be replaced by a
    ## truncated SVD, but it would require more changes in the
    ## code around
    allProd <- tcrossprod(mx) / nInd(x) # assume uniform weights

    ## PERFORM THE ANALYSIS ##
    ## eigenanalysis
    eigRes <- eigen(allProd, symmetric=TRUE, only.values=FALSE)
    rank <- sum(eigRes$values > 1e-12)
    eigRes$values <- eigRes$values[1:rank]
    eigRes$vectors <- eigRes$vectors[, 1:rank, drop=FALSE]

    ## scan nb of axes retained (interactive prompt when nf is NULL)
    if(is.null(nf)){
        barplot(eigRes$values, main="Eigenvalues", col=heat.colors(rank))
        cat("Select the number of axes: ")
        nf <- as.integer(readLines(n = 1))
    }

    ## rescale PCs
    res <- list()
    res$eig <- eigRes$values
    ## never retain more axes than numerically meaningful eigenvalues
    nf <- min(nf, sum(res$eig > 1e-10))

    ## use: li = XQU = V\Lambda^(1/2)
    eigRes$vectors <- eigRes$vectors * sqrt(nInd(x)) # D-normalize vectors
    res$scores <- sweep(eigRes$vectors[, 1:nf, drop=FALSE], 2,
                        sqrt(eigRes$values[1:nf]), FUN="*")

    ## GET LOADINGS ##
    ## need to decompose X^TDV into a sum of n matrices of dim p*r,
    ## but only two such matrices are represented at a time
    if(loadings){
        res$loadings <- matrix(0, nrow=nLoc(x), ncol=nf) # preallocate
        ## use: c1 = X^TDV
        ## and X^TV = A_1 + ... + A_n
        ## with A_k = X_[k-]^T v[k-]
        myPloidy <- ploidy(x)
        for(k in seq_len(nInd(x))){
            temp <- as.integer(x@gen[[k]]) / myPloidy[k]
            if(center) {
                ## mean-impute NAs, then centre
                temp[is.na(temp)] <- vecMeans[is.na(temp)]
                temp <- temp - vecMeans
            } else {
                temp[is.na(temp)] <- 0
            }
            if(scale){
                temp <- temp / vecSd
            }
            res$loadings <- res$loadings +
                matrix(temp) %*% eigRes$vectors[k, 1:nf, drop=FALSE]
        }

        res$loadings <- res$loadings / nInd(x) # don't forget the /n of X_tDV
        res$loadings <- sweep(res$loadings, 2,
                              sqrt(eigRes$values[1:nf]), FUN="/")
    }

    ## FORMAT OUTPUT ##
    colnames(res$scores) <- paste("PC", 1:nf, sep="")
    if(!is.null(indNames(x))){
        rownames(res$scores) <- indNames(x)
    } else {
        rownames(res$scores) <- 1:nInd(x)
    }

    if(!is.null(res$loadings)){
        colnames(res$loadings) <- paste("Axis", 1:nf, sep="")
        ## scalar condition: use && rather than elementwise &
        if(!is.null(locNames(x)) && !is.null(alleles(x))){
            rownames(res$loadings) <- paste(locNames(x), alleles(x), sep=".")
        } else {
            rownames(res$loadings) <- 1:nLoc(x)
        }
    }

    if(returnDotProd){
        res$dotProd <- allProd
        rownames(res$dotProd) <- colnames(res$dotProd) <- indNames(x)
    }

    res$call <- match.call()

    class(res) <- "glPca"

    return(res)
}


########
Expand Down