corceslab
diff --git a/Diff for: ‎DESCRIPTION
+2-2 b/Diff for: ‎DESCRIPTION
+2-2
diff --git a/Diff for: ‎NAMESPACE
+1 b/Diff for: ‎NAMESPACE
+1
diff --git a/Diff for: ‎R/CHOIR.R
+253-137 b/Diff for: ‎R/CHOIR.R
+253-137
diff --git a/Diff for: ‎R/HelperUtils.R
+1-2 b/Diff for: ‎R/HelperUtils.R
+1-2
diff --git a/Diff for: ‎R/PlottingUtils.R
+10-8 b/Diff for: ‎R/PlottingUtils.R
+10-8
diff --git a/Diff for: ‎R/buildTree.R
+230-128 b/Diff for: ‎R/buildTree.R
+230-128
diff --git a/Diff for: ‎R/combineTrees.R
+188-33 b/Diff for: ‎R/combineTrees.R
+188-33
@@ -1,8 +1,8 @@
 Package: CHOIR
-Title: CHOIR - Clustering Hierachy Optimization by Iterative Random forests
+Title: CHOIR - Cluster Hierarchy Optimization by Iterative Random forests
 Version: 0.2.0
 Authors@R: 
-	person("Cathrine", "Petersen", , "cathrine.petersen@gladstone.ucsf.edu", role = c("aut", "cre"), 
+	person("Cathrine", "Sant", , "cathrine.sant@gladstone.ucsf.edu", role = c("aut", "cre"), 
 		comment = c(ORCID = "0000-0002-5821-9828"))
 Description: CHOIR is a clustering algorithm for single-cell sequencing data. CHOIR applies a framework of permutation tests and random forest classifiers across a hierarchical clustering tree to statistically identify clusters that represent distinct populations.
 License: MIT + file LICENSE
 
@@ -5,6 +5,7 @@ export(CHOIR)
 export(CHOIRpalette)
 export(buildParentTree)
 export(buildTree)
+export(combineTrees)
 export(compareClusters)
 export(inferTree)
 export(plotCHOIR)
 
@@ -674,8 +674,7 @@
 
 .getNewLabels <- function(merge_groups,
                           level,
-                          compiled_labels,
-                          ) {
+                          compiled_labels) {
 
   # Create new list
   merge_group_labels <- vector(mode = "list", length(merge_groups))
 
@@ -59,17 +59,19 @@ CHOIRpalette <- function(n) {
 #' Simplifies running \code{Seurat::RunUMAP()} after CHOIR clustering by
 #' automatically fetching the pre-generated dimensionality reductions.
 #'
-#' @param object An object of class 'Seurat', 'SingleCellExperiment', or
-#' 'ArchRProject' that has undergone CHOIR clustering.
-#' @param key The name under which CHOIR-related data for this run is retrieved
-#' from the object. Defaults to 'CHOIR'.
+#' @param object An object of class \code{Seurat}, \code{SingleCellExperiment},
+#' or \code{ArchRProject} that has undergone CHOIR clustering. For multi-omic
+#' data, we recommend using \code{ArchRProject} objects.
+#' @param key The name under which CHOIR-related data for this run is stored in
+#' the object. Defaults to “CHOIR”.
 #' @param reduction A character vector indicating which CHOIR subtree
 #' dimensionality reduction to run UMAP on (e.g., 'P0_reduction',
 #' 'P1_reduction'). Default = \code{NULL} will run UMAP on all of the
-#' dimensionality reductions generated by CHOIR stored under the provided 'key'.
-#' @param verbose A boolean value indicating whether to use verbose output
-#' during the execution of this function. Can be set to \code{FALSE} for a
-#' cleaner output.
+#' dimensionality reductions generated by CHOIR stored under the provided
+#' \code{key}.
+#' @param verbose A Boolean value indicating whether to use verbose output
+#' during the execution of CHOIR. Defaults to \code{TRUE}, but can be set to
+#' \code{FALSE} for a cleaner output.
 #'
 #' @return Returns the object with the following added data stored under the
 #' provided key: \describe{
 
@@ -6,45 +6,200 @@
 #' into a single compiled tree, which is then further pruned to standardize
 #' thresholds across each subtree.
 #'
-#' @param object An object of class 'Seurat', 'SingleCellExperiment', or
-#' 'ArchRProject', output after running buildParentTree.
+#' @param object An object of class \code{Seurat}, \code{SingleCellExperiment},
+#' or \code{ArchRProject} that was output from function \code{buildParentTree}.
+#' For multi-omic data, we recommend using \code{ArchRProject} objects.
 #' @param subtree_list A list containing the CHOIR records from each subtree.
 #' @param key The name under which CHOIR-related data for this run is stored in
-#' the object. Defaults to 'CHOIR'.
+#' the object. Defaults to “CHOIR”.
 #' @param alpha A numerical value indicating the significance level used for
 #' permutation test comparisons of cluster distinguishability. Defaults to 0.05.
-#' @param p_adjust A string indicating which multiple comparison
-#' adjustment to use. Permitted values are 'fdr', 'bonferroni', and 'none'.
-#' Defaults to 'bonferroni'.
-#' @param feature_set
-#' @param exclude_features
-#' @param n_iterations
-#' @param n_trees
-#' @param use_variance
-#' @param min_accuracy
-#' @param min_connections
-#' @param max_repeat_errors
-#' @param distance_approx
-#' @param distance_awareness
-#' @param collect_all_metrics
-#' @param sample_max
-#' @param downsampling_rate
-#' @param normalization_method
-#' @param batch_correction_method
-#' @param batch_labels
-#' @param use_assay
-#' @param input_matrix
-#' @param nn_matrix
-#' @param dist_matrix
-#' @param reduction
-#' @param n_cores
-#' @param random_seed
-#' @param verbose
+#' Decreasing the alpha value will yield more conservative clusters (fewer
+#' clusters) and will often decrease the computational time required, because
+#' fewer cluster comparisons may be needed.
+#' @param p_adjust A string indicating which multiple comparison adjustment
+#' method to use. Permitted values are “bonferroni”, “fdr”, and “none”. Defaults
+#' to “bonferroni”. Other correction methods may be less conservative,
+#' identifying more clusters, as CHOIR applies filters that reduce the total
+#' number of tests performed.
+#' @param feature_set A string indicating whether to train random forest
+#' classifiers on “all” features or only variable (“var”) features. Defaults to
+#' “var”. Computational time and memory required may increase if more features
+#' are used. Using all features instead of variable features may result in more
+#' conservative cluster calls.
+#' @param exclude_features A character vector indicating features that should be
+#' excluded from input to the random forest classifier. Defaults to \code{NULL},
+#' which means that no features will be excluded. This parameter can be used,
+#' for example, to exclude features correlated with cell quality, such as
+#' mitochondrial genes. Failure to exclude problematic features could result in
+#' clusters driven by cell quality, while over-exclusion of features could
+#' reduce the ability of CHOIR to distinguish cell populations that differ by
+#' those features.
+#' @param n_iterations A numerical value indicating the number of iterations run
+#' for each permutation test comparison. Increasing the number of iterations
+#' will approximately linearly increase the computational time required but
+#' provide a more accurate estimation of the significance of the permutation
+#' test. Decreasing the number of iterations runs the risk of leading to
+#' underclustering due to lack of statistical power. The default value, 100
+#' iterations, was selected because it avoids underclustering, while minimizing
+#' computational time and the diminishing returns from running CHOIR with
+#' additional iterations.
+#' @param n_trees A numerical value indicating the number of trees in each
+#' random forest. Defaults to 50. Increasing the number of trees is likely to
+#' increase the computational time required. Though not entirely predictable,
+#' increasing the number of trees up to a point may enable more nuanced
+#' distinctions, but is likely to provide diminishing returns.
+#' @param use_variance A Boolean value indicating whether to use the variance of
+#' the random forest accuracy scores as part of the permutation test threshold.
+#' Defaults to \code{TRUE}. Setting this parameter to \code{FALSE} will make
+#' CHOIR considerably less conservative, identifying more clusters, particularly
+#' on large datasets.
+#' @param min_accuracy A numerical value indicating the minimum accuracy
+#' required of the random forest classifier, below which clusters will be
+#' automatically merged. Defaults to 0.5, representing the random chance
+#' probability of assigning correct cluster labels; therefore, decreasing the
+#' minimum accuracy is not recommended. Increasing the minimum accuracy will
+#' lead to more conservative cluster assignments and will often decrease the
+#' computational time required, because fewer cluster comparisons may be needed.
+#' @param min_connections A numerical value indicating the minimum number of
+#' nearest neighbors between two clusters for those clusters to be considered
+#' adjacent. Non-adjacent clusters will not be merged. Defaults to 1. This
+#' threshold allows CHOIR to avoid running the full permutation test comparison
+#' for clusters that are highly likely to be distinct, saving computational
+#' time. Therefore, setting this parameter to 0 will increase the number of
+#' permutation test comparisons run and, thus, the computational time. The
+#' intent of this parameter is only to avoid running permutation test
+#' comparisons between clusters that are so different that they should not be
+#' merged. Therefore, we do not recommend increasing this parameter value
+#' beyond 10, as higher values may result in instances of overclustering.
+#' @param max_repeat_errors A numerical value indicating the maximum number of
+#' repeatedly mislabeled cells that will be taken into account during the
+#' permutation tests. This parameter is used to account for situations in which
+#' random forest classifier errors are concentrated among a few cells that are
+#' repeatedly misassigned. If set to 0, such repeat errors will not be
+#' evaluated. Defaults to 20. These situations are relatively infrequent, but
+#' setting this parameter to lower values (especially 0) may result in
+#' underclustering due to a small number of intermediate cells. Setting this
+#' parameter to higher values may lead to instances of overclustering and is not
+#' recommended.
+#' @param distance_approx A Boolean value indicating whether or not to use
+#' approximate distance calculations. Defaults to \code{TRUE}, which will use
+#' centroid-based distances. Setting distance approximation to \code{FALSE} will
+#' substantially increase the computational time and memory required,
+#' particularly for large datasets. Using approximated distances (\code{TRUE})
+#' rather than absolute distances (\code{FALSE}) is unlikely to have a
+#' meaningful effect on the distance thresholds imposed by CHOIR.
+#' @param distance_awareness A numerical value representing the distance
+#' threshold above which a cluster will not merge with another cluster and
+#' significance testing will not be used. Specifically, this value is a
+#' multiplier applied to the distance between a cluster and its closest
+#' distinguishable neighbor based on random forest comparison. Defaults to 2,
+#' which sets this threshold at a two-fold increase in distance over the closest
+#' distinguishable neighbor. This threshold allows CHOIR to avoid running the
+#' full permutation test comparison for clusters that are highly likely to be
+#' distinct, saving computational time. To omit all distance calculations and
+#' perform permutation testing on all comparisons, set this parameter to
+#' \code{FALSE}. Setting this parameter to \code{FALSE} or increasing the input
+#' value will increase the number of permutation test comparisons run and, thus,
+#' the computational time. In rare cases, very small distant clusters may be
+#' erroneously merged when distance thresholds are not used. The intent of this
+#' parameter is only to avoid running permutation test comparisons between
+#' clusters that are so different that they should not be merged. We do not
+#' recommend decreasing this parameter value below 1.5, as lower values may
+#' result in instances of overclustering.
+#' @param collect_all_metrics A Boolean value indicating whether to collect and
+#' save additional metrics from the random forest classifiers, including feature
+#' importances for every comparison. Defaults to \code{FALSE}. Setting this
+#' parameter to \code{TRUE} will slightly increase the computational time
+#' required. This parameter has no effect on the final cluster calls.
+#' @param sample_max A numerical value indicating the maximum number of cells to
+#' be sampled per cluster to train/test each random forest classifier. Defaults
+#' to \code{Inf} (infinity), which does not cap the number of cells used, so all
+#' cells will be used in all comparisons. Decreasing this parameter may decrease
+#' the computational time required, but may result in instances of
+#' underclustering. If input is provided to both the \code{downsampling_rate}
+#' and \code{sample_max} parameters, the minimum resulting cell number is
+#' calculated and used for each comparison.
+#' @param downsampling_rate A numerical value indicating the proportion of cells
+#' to be sampled per cluster to train/test each random forest classifier. For
+#' efficiency, the default value, "auto", sets the downsampling rate according
+#' to the dataset size. Decreasing this parameter may decrease the computational
+#' time required, but may also make the final cluster calls more conservative.
+#' If input is provided to both the \code{downsampling_rate} and
+#' \code{sample_max} parameters, the minimum resulting cell number is calculated
+#' and used for each comparison.
+#' @param min_reads A numeric value used to filter out features prior to input
+#' to the random forest classifier. The default value, \code{NULL}, will filter
+#' out features with 0 counts for the current clusters being compared. Higher
+#' values should be used with caution, but may increase the signal-to-noise
+#' ratio encountered by the random forest classifiers.
+#' @param normalization_method A character string or vector indicating which
+#' normalization method to use. In general, input data should be supplied to
+#' CHOIR after normalization, except when the user wishes to use
+#' \code{Seurat SCTransform} normalization. Permitted values are “none” or
+#' “SCTransform”. Defaults to “none”. Because CHOIR has not been tested
+#' thoroughly with \code{SCTransform} normalization, we do not recommend this
+#' approach at this time. For multi-omic datasets, provide a vector with a value
+#' corresponding to each provided value of \code{use_assay} or
+#' \code{ArchR_matrix} in the same order.
+#' @param batch_correction_method A character string indicating which batch
+#' correction method to use. Permitted values are “Harmony” and “none”. Defaults
+#' to “none”. Batch correction should only be used when the different batches
+#' are not expected to also have unique cell types or cell states. Using batch
+#' correction would ensure that clusters do not originate from a single batch,
+#' thereby making the final cluster calls more conservative.
+#' @param batch_labels A character string that, if applying batch correction,
+#' specifies the name of the column in the input object metadata containing the
+#' batch labels. Defaults to \code{NULL}.
+#' @param use_assay For \code{Seurat} or \code{SingleCellExperiment} objects, a
+#' character string or vector indicating the assay(s) to use in the provided
+#' object. The default value, \code{NULL}, will choose the current active assay
+#' for \code{Seurat} objects and the \code{logcounts} assay for
+#' \code{SingleCellExperiment} objects.
+#' @param input_matrix An optional matrix containing the feature x cell data
+#' provided by the user, on which to train the random forest classifiers. By
+#' default, this parameter is set to \code{NULL}, and CHOIR will look for the
+#' feature x cell matri(ces) indicated by function \code{buildParentTree}.            ##### TRUE??? #####
+#' @param nn_matrix An optional matrix containing the nearest neighbor adjacency
+#' of the cells, provided by the user. By default, this parameter is set to
+#' \code{NULL}, and CHOIR will look for the adjacency matri(ces) generated by
+#' function \code{buildParentTree}.                                               ##### TRUE??? #####
+#' @param dist_matrix An optional distance matrix of cell to cell distances
+#' (based on dimensionality reduction cell embeddings), provided by the user. By
+#' default, this parameter is set to \code{NULL}, and CHOIR will look for the
+#' distance matri(ces) generated by function \code{buildParentTree}.              ##### TRUE??? #####
+#' @param reduction An optional matrix of dimensionality reduction cell
+#' embeddings provided by the user for subsequent clustering steps. By default,
+#' this parameter is set to \code{NULL}, and CHOIR will look for the
+#' dimensionality reductions generated by function \code{buildParentTree}.
+#' @param n_cores A numerical value indicating the number of cores to use for
+#' parallelization. By default, CHOIR will use the number of available cores
+#' minus 2. CHOIR is parallelized at the computation of permutation test
+#' iterations. Therefore, any number of cores up to the number of iterations
+#' will theoretically decrease the computational time required. In practice,
+#' 8–16 cores are recommended for datasets up to 500,000 cells.
+#' @param random_seed A numerical value indicating the random seed to be used.
+#' Defaults to 1. CHOIR uses randomization throughout the generation and pruning
+#' of the clustering tree. Therefore, changing the random seed may yield slight
+#' differences in the final cluster assignments.
+#' @param verbose A Boolean value indicating whether to use verbose output
+#' during the execution of CHOIR. Defaults to \code{TRUE}, but can be set to
+#' \code{FALSE} for a cleaner output.
 #'
-#' @return
-#' @export
 #'
-#' @examples
+#' ############ COUNTSPLIT??
+#'
+#' @return Returns the object with the following added data stored under the
+#' provided key: \describe{
+#'   \item{clusters}{Final clusters, full hierarchical cluster tree, and
+#'   stepwise cluster results for each progressive pruning step}
+#'   \item{parameters}{Record of parameter values used}
+#'   \item{records}{Metadata for decision points during hierarchical tree
+#'   construction, all recorded permutation test comparisons, and feature
+#'   importance scores from all comparisons}
+#'   }
+#'
+#' @export
 combineTrees <- function(object,
                          subtree_list,
                          key = "CHOIR",