Merge branch 'devel'

kharchenkolab · May 3, 2021 · 58812c6 · 58812c6
2 parents 0711f57 + f95d5db
commit 58812c6
Show file tree

Hide file tree

Showing 45 changed files with 734 additions and 1,288 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,4 @@
 .Rhistory
 .RData
 .Ruserdata
-pagoda2*.tar.gz
+pagoda2*.tar.gz
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 ## Upcoming
 
+## [1.0.3] - 2021-05-01
+
+- Removed `jsDist()` as it's in sccore
+- Removed `multi2dend()` as it's in sccore
+- Removed strong dependency for p2data
+
 ## [1.0.2] - 2020-03-03
 
 ### Changed

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,11 @@
 Package: pagoda2
 Title: Single Cell Analysis and Differential Expression
-Version: 1.0.2
+Version: 1.0.3
 Authors@R: c(person("Nikolas","Barkas", email="[email protected]", role="aut"), person("Viktor", "Petukhov", email="[email protected]", role="aut"), person("Peter", "Kharchenko", email = "[email protected]", role = "aut"), person("Simon", "Steiger", email = "[email protected]", role = "ctb"), person("Evan", "Biederstedt", email="[email protected]", role=c("cre", "aut")))
 Description: Analyzing and interactively exploring large-scale single-cell RNA-seq datasets. 'pagoda2' primarily performs normalization and differential gene expression analysis, with an interactive application for exploring single-cell RNA-seq datasets. It performs basic tasks such as cell size normalization, gene variance normalization, and can be used to identify subpopulations and run differential expression within individual samples. 'pagoda2' was written to rapidly process modern large-scale scRNAseq datasets of approximately 1e6 cells. The companion web application allows users to explore which gene expression patterns form the different subpopulations within your data. The package also serves as the primary method for preprocessing data for conos, <https://github.com/kharchenkolab/conos>. This package interacts with data available through the 'p2data' package, which is available in a 'drat' repository. To access this data package, see the instructions at <https://github.com/kharchenkolab/pagoda2>. The size of the 'p2data' package is approximately 6 MB.
 License: GPL-3 
 Copyright: See the file COPYRIGHTS for various pagoda2 copyright details
 Encoding: UTF-8
-LazyData: true
 Depends: 
     R (>= 3.5.0), Matrix, igraph
 biocViews:

diff --git a/R/Pagoda2.R b/R/Pagoda2.R
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -41,10 +41,6 @@ inplaceWinsorizeSparseCols <- function(sY, n, ncores = 1L) {
     .Call('_pagoda2_inplaceWinsorizeSparseCols', PACKAGE = 'pagoda2', sY, n, ncores)
 }
 
-jsDist <- function(m, ncores = 1L) {
-    .Call('_pagoda2_jsDist', PACKAGE = 'pagoda2', m, ncores)
-}
-
 orderColumnRows <- function(p, i) {
     .Call('_pagoda2_orderColumnRows', PACKAGE = 'pagoda2', p, i)
 }

diff --git a/R/helpers.R b/R/helpers.R
@@ -18,48 +18,6 @@ NULL
   library.dynam.unload("pagoda2", libpath)
 }
 
-#' Translate multilevel segmentation into a dendrogram, with the lowest level of the dendrogram listing the cells
-#'
-#' @param cl clusters
-#' @param counts matrix of counts
-#' @param deep boolean (default=FALSE)
-#' @param dist character vector Distance metric (default='cor')
-#' @return cell dendrogram
-#' @keywords internal
-multi2dend <- function(cl, counts, deep=FALSE, dist='cor') {
-  if (deep) {
-    clf <- as.integer(cl$memberships[1,]) # take the lowest level
-  } else {
-    clf <- as.integer(membership(cl))
-  }
-  names(clf) <- names(membership(cl))
-  clf.size <- unlist(tapply(clf,factor(clf,levels=seq(1,max(clf))),length))
-  rowFac <- rep(NA, nrow(counts));
-  rowFac[match(names(clf),rownames(counts))] <- clf
-  lvec <- colSumByFac(counts,rowFac)[-1,,drop=FALSE]
-  if(dist=='JS') {
-    lvec.dist <- jsDist(t(lvec/pmax(1,Matrix::rowSums(lvec))))
-  } else { # use correlation distance in log10 space
-    lvec.dist <- 1-cor(t(log10(lvec/pmax(1,Matrix::rowSums(lvec))+1)))
-  }
-  d <- as.dendrogram(stats::hclust(as.dist(lvec.dist),method='ward.D'))
-  # add cell info to the laves
-  addinfo <- function(l, env) {
-    v <- as.integer(mget("index",envir=env,ifnotfound=0)[[1]])+1;
-    attr(l,'nodeId') <- v
-    assign("index",v,envir=env)
-    attr(l,'nCells') <- sum(clf.size[as.integer(unlist(l))])
-    if(is.leaf(l)) {
-      attr(l,'cells') <- names(clf)[clf==attr(l,'label')]
-    }
-    attr(l,'root') <- FALSE
-    return(l);
-  }
-  d <- dendrapply(d,addinfo,env=environment())
-  attr(d,'root') <- TRUE
-  return(d)
-}
-
 #' Quick utility to check if given character vector is colors
 #' Thanks to Stackoverflow: http://stackoverflow.com/questions/13289009/check-if-character-string-is-a-valid-color-representation
 #'
@@ -77,7 +35,7 @@ areColors <- function(x) {
 #' @param mc.preschedule See ?parallel::mclapply (default=FALSE). If TRUE then the computation is first divided to (at most) as many jobs as there are cores and then the jobs are started, each job possibly covering more than one value. If FALSE, then one job is forked for each value of X. The former is better for short computations or large number of values in X, the latter is better for jobs that have high variance of completion time and not too many values of X compared to mc.cores.
 #' @return list, as returned by lapply
 #' @keywords internal
-papply <- function(..., n.cores=parallel::detectCores(), mc.preschedule=FALSE) {
+papply <- function(..., n.cores=parallel::detectCores(), mc.preschedule=FALSE) { # TODO: replace it with sccore::plapply
   if(n.cores>1) {
     if(requireNamespace("parallel", quietly = TRUE)) {
       return(mclapply(...,mc.cores=n.cores,mc.preschedule=mc.preschedule))

diff --git a/R/largeVis.R b/R/largeVis.R
@@ -71,18 +71,6 @@ buildWijMatrix <- function(x, threads = NULL, perplexity = 50) {
 #' is to use the edge weights, consistent with the reference implementation.
 #'
 #' @return A dense [N,D] matrix of the coordinates projecting the w_ij matrix into the lower-dimensional space.
-#' @examples
-#' \donttest{
-#' data(CO2)
-#' CO2$Plant <- as.integer(CO2$Plant)
-#' CO2$Type <- as.integer(CO2$Type)
-#' CO2$Treatment <- as.integer(CO2$Treatment)
-#' co <- scale(as.matrix(CO2))
-#' # Very small datasets often produce a warning regarding the alias table.  This is safely ignored.
-#' suppressWarnings(vis <- largeVis(t(co), K = 20, sgd_batches = 1, threads = 2))
-#' suppressWarnings(coords <- projectKNNs(vis$wij, threads = 2))
-#' plot(t(coords))
-#' }
 #' @export
 projectKNNs <- function(wij, # symmetric sparse matrix
                         dim = 2, # dimension of the projection space
@@ -98,10 +86,14 @@ projectKNNs <- function(wij, # symmetric sparse matrix
                         threads = NULL,
                         verbose = getOption("verbose", TRUE)) {
 
-  if (alpha < 0) stop("alpha < 0 is meaningless")
+  if (alpha < 0) {
+    stop("alpha < 0 is meaningless")
+  }
   N <-  (length(wij@p) - 1)
   js <- rep(0:(N - 1), diff(wij@p))
-  if (any(is.na(js))) stop("NAs in the index vector.")
+  if (any(is.na(js))) {
+    stop("NAs in the index vector")
+  }
   is <- wij@i
 
   ##############################################

diff --git a/R/pagoda2WebApp.R b/R/pagoda2WebApp.R
@@ -136,7 +136,7 @@ pagoda2WebApp$methods(
       lvec <- t(lvec/pmax(1,rowSums(lvec)))
       colnames(lvec) <- which(table(cl0)>0)
       rownames(lvec) <- colnames(r$misc[['rawCounts']])
-      ld <- jsDist(lvec)
+      ld <- sccore::jsDist(lvec)
       colnames(ld) <- rownames(ld) <- colnames(lvec)
 
       #hcGroup is a hclust object of whatever cell groupings we provided above

diff --git a/R/pipelineHelpers.R b/R/pipelineHelpers.R
@@ -19,22 +19,12 @@
 #' @param get.tsne boolean Whether to calculate tSNE embedding (default=TRUE) 
 #' @param make.geneknn boolean Whether to pre-calculate gene kNN (for gene search) (default=TRUE) 
 #' @return a new 'Pagoda2' object
-#' @examples
-#' \donttest{
-#' ## load count matrix
-#' cm <- p2data::sample_BM1
-#' ## perform basic p2 processing
-#' p2 <- basicP2proc(cm)
-#' }
 #' 
 #' @export 
 basicP2proc <- function(cd, n.cores=1, n.odgenes=3e3, nPcs=100, k=30, perplexity=50, 
   log.scale=TRUE, trim=10, keep.genes=NULL, min.cells.per.gene=0, min.transcripts.per.cell=100, 
   get.largevis=TRUE, get.tsne=TRUE, make.geneknn=TRUE) {
 
-  if (!requireNamespace("p2data", quietly = TRUE)) {
-    stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE)
-  }
   rownames(cd) <- make.unique(rownames(cd))
   ## Basic Processing
   p2 <- Pagoda2$new(cd, n.cores = n.cores, keep.genes = keep.genes, trim=trim, log.scale=log.scale, min.cells.per.gene=min.cells.per.gene, min.transcripts.per.cell=min.transcripts.per.cell)
@@ -77,10 +67,6 @@ basicP2proc <- function(cd, n.cores=1, n.odgenes=3e3, nPcs=100, k=30, perplexity
 #' @export 
 extendedP2proc <- function(p2, organism = 'hs') {
 
-  if (!requireNamespace("p2data", quietly = TRUE)) {
-    stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE)
-  }
-
   if (organism == 'hs') {
     go.env <- p2.generate.human.go(p2)
   } else if (organism == 'mm') {
@@ -201,19 +187,10 @@ webP2proc <- function(p2, additionalMetadata=NULL, title='Pagoda2',
 #'     annotated at that GO term or to one of its child nodes in the GO ontology (default=NULL)
 #' @param eg.alias2eg mappings between common gene symbol identifiers and entrez gene identifiers (default=NULL)
 #' @param min.env.length numeric Minimum environment length (default=5)
-#' @examples
-#' \donttest{
-#' cm <- p2data::sample_BM1
-#' p2 <- basicP2proc(cm)
-#' p2.generate.go(p2, organism='hs')
-#' }
 #' 
 #' @export 
 p2.generate.go <- function(r, organism=NULL, go2all.egs=NULL, eg.alias2eg=NULL, min.env.length=5) {
 
-  if (!requireNamespace("p2data", quietly = TRUE)) {
-    stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE)
-  }
 
   if (is.null(organism) && (is.null(go2all.egs) || is.null(eg.alias2eg))) {
     stop('Either organism or go2all.egs and eg.alias2eg must be specified');
@@ -265,18 +242,9 @@ p2.generate.go <- function(r, organism=NULL, go2all.egs=NULL, eg.alias2eg=NULL,
 #' 
 #' @param r a 'Pagoda2' object
 #' @return a GO environment object
-#' @examples
-#' \donttest{
-#' cm <- p2data::sample_BM1
-#' p2 <- basicP2proc(cm)
-#' p2.generate.dr.go(p2)
-#' }
 #' 
 #' @export
 p2.generate.dr.go <- function(r) {
-  if (!requireNamespace("p2data", quietly = TRUE)) {
-    stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE)
-  }
   p2.generate.go(r, "dr")
 }
 
@@ -285,37 +253,19 @@ p2.generate.dr.go <- function(r) {
 #' 
 #' @param r a 'Pagoda2' object
 #' @return a GO environment object
-#' @examples
-#' \donttest{
-#' cm <- p2data::sample_BM1
-#' p2 <- basicP2proc(cm)
-#' p2.generate.human.go(p2)
-#' }
 #' 
 #' @export
 p2.generate.human.go <- function(r) {
-  if (!requireNamespace("p2data", quietly = TRUE)) {
-    stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE)
-  }
   p2.generate.go(r, "hs")
 }
 
 #' Generate a GO environment for mouse for overdispersion analysis for the back end
 #' 
 #' @param r a 'Pagoda2' object
 #' @return a GO environment object
-#' @examples
-#' \donttest{
-#' cm <- p2data::sample_BM1
-#' p2 <- basicP2proc(cm)
-#' p2.generate.mouse.go(p2)
-#' }
 #' 
 #' @export 
 p2.generate.mouse.go <- function(r) {
-  if (!requireNamespace("p2data", quietly = TRUE)) {
-    stop("Package \"p2data\" needed for the Pagoda2 class to work. This can be installed via a drat repository, using \"install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')\". Please read the details provided within the README at https://github.com/kharchenkolab/pagoda2.", call. = FALSE)
-  }
   p2.generate.go(r, "mm")
 }
 

diff --git a/README.md b/README.md
@@ -53,32 +53,6 @@ install.packages('devtools')
 devtools::install_github('kharchenkolab/pagoda2', build_vignettes = TRUE)
 ```
 
-Please note that the package `pagoda2` depends on data in a data package (`p2data`) that is available through a `drat` repository on GitHub. To use the `pagoda2` package, you will need to install `p2data`. There are two equally valid options to install this package:
-
-A) Users could install `p2data` by adding the `drat` archive to the list of repositories your system will query when adding and updating R packages. Once you do this, you can install `p2data` with `install.packages()`, using the command:
-
-```r
-library(drat)
-addRepo("kharchenkolab")
-install.packages("p2data")
-```
-
-The following command is also a valid approach:
-
-```r
-install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')
-```
-
-Please see the [drat documentation](https://dirk.eddelbuettel.com/code/drat.html) for more comprehensive explanations and vignettes.
-
-
-B) Another way to install the package `p2data` is to use `devtools::install_github()`:
-
-```r
-library(devtools)
-install_github("kharchenkolab/p2data")
-```
-
 
 ### Installing Linux dependencies
 

diff --git a/doc/pagoda2.walkthrough.R b/doc/pagoda2.walkthrough.R
@@ -12,7 +12,7 @@ install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='
 ## load the dataset
 countMatrix <- p2data::sample_BM1
 ## all basic pagoda2 processing with basicP2proc()
-p2.processed <- basicP2proc(countMatrix, n.cores=2, min.cells.per.gene=10, 
+p2.processed <- basicP2proc(countMatrix, n.cores=1, min.cells.per.gene=10, 
                     n.odgenes=2e3, get.largevis=FALSE, make.geneknn=FALSE)
 
 ## -----------------------------------------------------------------------------

diff --git a/doc/pagoda2.walkthrough.Rmd b/doc/pagoda2.walkthrough.Rmd
@@ -31,14 +31,15 @@ library(dplyr)
 library(ggplot2)
 ```
 
-We have pre-generated a dataset of 3000 bone marrow cells that you can load as a matrix directly using the package `p2data` (See the README of pagoda2 for installation details).
+We have pre-generated a dataset of 3000 bone marrow cells that you can load as a matrix directly using the package `p2data`, which is available through a `drat` repository on GitHub. Note that the size of the 'p2data' package is approximately 6 MB. This package may be installed as follows:
 
 ```{r message=FALSE}
 install.packages('p2data', repos='https://kharchenkolab.github.io/drat/', type='source')
 ```
 
-The following command load the dataset of 3000 bone marrow cells as a sparse matrix:
+(Please see the [drat documentation](https://dirk.eddelbuettel.com/code/drat.html) for more comprehensive explanations and vignettes regarding `drat` repositories.)
 
+The following command loads the dataset of 3000 bone marrow cells as a sparse matrix:
 
 ```r
 countMatrix <- p2data::sample_BM1
@@ -52,7 +53,7 @@ Next we feed this input into the function `basicP2proc()`, which performs all ba
 ## load the dataset
 countMatrix <- p2data::sample_BM1
 ## all basic pagoda2 processing with basicP2proc()
-p2.processed <- basicP2proc(countMatrix, n.cores=2, min.cells.per.gene=10, 
+p2.processed <- basicP2proc(countMatrix, n.cores=1, min.cells.per.gene=10, 
                     n.odgenes=2e3, get.largevis=FALSE, make.geneknn=FALSE)
 ```