biocore · gwarmstrong · Aug 28, 2019 · Jun 14, 2019 · Jun 14, 2019 · Jul 23, 2019
diff --git a/Icon b/Icon
diff --git a/gemelli/_ctf_defaults.py b/gemelli/_ctf_defaults.py
@@ -0,0 +1,44 @@
+# Configuration file where you can set the parameter default values and
+# descriptions.
+DEFAULT_COMP = 3
+DEFAULT_MSC = 0
+DEFAULT_MFC = 5
+DEFAULT_MAXITER = 50
+DEFAULT_FMETA = None
+DEFAULT_COND = None
+
+DESC_INIT = ("The number of initialization vectors. Larger values will"
+             "give more accurate factorization but will be more "
+             "computationally expensive (suggested to be below 100; beware of"
+             " overfitting) [minimum 1]")
+DESC_ITERATIONSALS = ("Max number of Alternating Least Square (ALS)"
+                      " optimization iterations (suggested to be below 100;"
+                      " beware of overfitting) [minimum 1]")
+DESC_ITERATIONSRTPM = ("Max number of Robust Tensor Power Method (RTPM)"
+                       " optimization iterations (suggested to be below 100;"
+                       " beware of overfitting) [minimum 1]")
+DESC_COMP = ("The underlying low-rank structure (suggested: 1 < rank < 10)"
+             " [minimum 2]")
+DESC_MSC = "Minimum sum cutoff of sample across all features"
+DESC_MFC = "Minimum sum cutoff of features across all samples"
+DESC_OUT = "Location of output files."
+DESC_FMETA = "Feature metadata file in QIIME2 formatting."
+DESC_BIN = "Input table in biom format."
+DESC_SMETA = "Sample metadata file in QIIME2 formatting."
+DESC_SUBJ = ("Metadata column containing subject IDs to"
+             " use for pairing samples. WARNING: if"
+             " replicates exist for an individual ID at"
+             " either state_1 to state_N, that subject will"
+             " be mean grouped by default.")
+DESC_COND = ("Metadata column containing state (e.g.,Time, BodySite)"
+             " across which samples are paired."
+             " At least one is required but up to four are allowed"
+             " by other state inputs.")
+QORD = ("Compositional biplot of subjects as points and features as arrows."
+        " Where the variation between subject groupings is explained by the"
+        " log-ratio between opposing arrows. WARNING: The % variance explained"
+        " is spread over n_components and can be inflated.")
+QDIST = (
+    "A sample-sample distance matrix generated from the euclidean distance"
+    " of the subject-state ordinations and itself.")
+QLOAD = "An ordinational loading of subjects over states."
diff --git a/gemelli/base.py b/gemelli/base.py
@@ -19,13 +19,10 @@ class _BaseImpute(object):
     def fit(self):
         """ Placeholder for fit this
         should be implemetned by sub-method"""
-
-    def transform(self):
-        """ return loadings
-        """
-        return self.sample_loading, \
-            self.feature_loading, \
-            self.conditional_loading
+    @abstractmethod
+    def label(self):
+        """ Placeholder for fit this
+        should be implemetned by sub-method"""
 
 
 class _BaseConstruct(object):
@@ -36,15 +33,15 @@ class _BaseConstruct(object):
     """
     @abstractmethod
     def construct(self):
-        """          
-        conditional_loading  : array-like or list of array-like 
-             The conditional loading vectors 
-             of shape (conditions, r) if there is 1 type 
-             of condition, and a list of such matrices if 
-             there are more than 1 type of condition 
-         feature_loading : array-like 
-             The feature loading vectors 
-             of shape (features, r) 
-         sample_loading : array-like 
-             The sample loading vectors 
+        """
+        conditional_loading  : array-like or list of array-like
+             The conditional loading vectors
+             of shape (conditions, r) if there is 1 type
+             of condition, and a list of such matrices if
+             there are more than 1 type of condition
+         feature_loading : array-like
+             The feature loading vectors
+             of shape (features, r)
+         sample_loading : array-like
+             The sample loading vectors
              of shape (samples, r) """
diff --git a/gemelli/citations.bib b/gemelli/citations.bib
@@ -0,0 +1,14 @@
+@article {Martino2019,
+	author = {Martino, Cameron and Morton, James T. and Marotz, Clarisse A. and Thompson, Luke R. and Tripathi, Anupriya and Knight, Rob and Zengler, Karsten},
+	editor = {Neufeld, Josh D.},
+	title = {A Novel Sparse Compositional Technique Reveals Microbial Perturbations},
+	volume = {4},
+	number = {1},
+	elocation-id = {e00016-19},
+	year = {2019},
+	doi = {10.1128/mSystems.00016-19},
+	publisher = {American Society for Microbiology Journals},
+	URL = {https://msystems.asm.org/content/4/1/e00016-19},
+	eprint = {https://msystems.asm.org/content/4/1/e00016-19.full.pdf},
+	journal = {mSystems}
+}
diff --git a/gemelli/ctf.py b/gemelli/ctf.py
@@ -0,0 +1,179 @@
+import biom
+import skbio
+import qiime2
+from pandas import DataFrame
+from qiime2 import Metadata
+from skbio import OrdinationResults, DistanceMatrix
+from gemelli.factorization import TensorFactorization
+from gemelli.preprocessing import build, rclr
+from gemelli._ctf_defaults import (DEFAULT_COMP, DEFAULT_MSC,
+                                   DEFAULT_MFC, DEFAULT_MAXITER,
+                                   DEFAULT_FMETA as DEFFM)
+
+
+def ctf(table: biom.Table,
+        sample_metadata: qiime2.Metadata,
+        individual_id_column: str,
+        state_column: str,
+        n_components: int = DEFAULT_COMP,
+        min_sample_count: int = DEFAULT_MSC,
+        min_feature_count: int = DEFAULT_MFC,
+        max_iterations_als: int = DEFAULT_MAXITER,
+        max_iterations_rptm: int = DEFAULT_MAXITER,
+        n_initializations: int = DEFAULT_MAXITER,
+        feature_metadata: Metadata = DEFFM) -> (OrdinationResults,
+                                                DistanceMatrix,
+                                                DataFrame,
+                                                DataFrame):
+    # run CTF helper and parse output for QIIME
+    ord_res, dists, straj, ftraj = ctf_helper(table,
+                                              sample_metadata,
+                                              individual_id_column,
+                                              [state_column],
+                                              n_components,
+                                              min_sample_count,
+                                              min_feature_count,
+                                              max_iterations_als,
+                                              max_iterations_rptm,
+                                              n_initializations,
+                                              feature_metadata)
+
+    PC_cols = ["PC%i" % (i + 1) for i in range(n_components)]
+    dists = list(dists.values())[0]
+    straj = list(straj.values())[0]
+    ftraj = list(ftraj.values())[0]
+    return ord_res, dists, straj, ftraj
+
+
+def ctf_helper(table: biom.Table,
+               sample_metadata: DataFrame,
+               individual_id_column: str,
+               state_columns: list,
+               n_components: int = DEFAULT_COMP,
+               min_sample_count: int = DEFAULT_MSC,
+               min_feature_count: int = DEFAULT_MFC,
+               max_iterations_als: int = DEFAULT_MAXITER,
+               max_iterations_rptm: int = DEFAULT_MAXITER,
+               n_initializations: int = DEFAULT_MAXITER,
+               feature_metadata: DataFrame = DEFFM) -> (OrdinationResults,
+                                                        dict,
+                                                        tuple):
+    """ Runs  Compositional Tensor Factorization CTF.
+    """
+
+    # validate the metadata using q2 as a wrapper
+    if isinstance(sample_metadata, DataFrame):
+        sample_metadata = sample_metadata[state_columns +
+                                          [individual_id_column]]
+        sample_metadata = qiime2.Metadata(sample_metadata).to_dataframe()
+    else:
+        sample_metadata = sample_metadata.to_dataframe()[
+            state_columns + [individual_id_column]]
+    # validate the metadata using q2 as a wrapper
+    if isinstance(feature_metadata, DataFrame):
+        feature_metadata = qiime2.Metadata(feature_metadata).to_dataframe()
+    elif feature_metadata is not None:
+        feature_metadata = feature_metadata.to_dataframe()
+
+    # match the data (borrowed in part from gneiss.util.match)
+    subtablefids = table.ids('observation')
+    subtablesids = table.ids('sample')
+    if len(subtablesids) != len(set(subtablesids)):
+        raise ValueError('Data-table contains duplicate sample IDs')
+    if len(subtablefids) != len(set(subtablefids)):
+        raise ValueError('Data-table contains duplicate feature IDs')
+    submetadataids = set(sample_metadata.index)
+    subtablesids = set(subtablesids)
+    subtablefids = set(subtablefids)
+    if feature_metadata is not None:
+        submetadatafeat = set(feature_metadata.index)
+        fidx = subtablefids & submetadatafeat
+        if len(fidx) == 0:
+            raise ValueError(("No more features left.  Check to make "
+                              "sure that the sample names between "
+                              "`feature-metadata` and `table` are "
+                              "consistent"))
+        feature_metadata = feature_metadata.loc[fidx]
+    sidx = subtablesids & submetadataids
+    if len(sidx) == 0:
+        raise ValueError(("No more features left.  Check to make sure that "
+                          "the sample names between `sample-metadata` and"
+                          " `table` are consistent"))
+    if feature_metadata is not None:
+        table.filter(list(fidx), axis='observation', inplace=True)
+    table.filter(list(sidx), axis='sample', inplace=True)
+    sample_metadata = sample_metadata.loc[sidx]
+
+    # filter and import table
+    for axis, min_sum in zip(['sample',
+                              'observation'],
+                             [min_sample_count,
+                              min_feature_count]):
+        table = table.filter(table.ids(axis)[table.sum(axis) >= min_sum],
+                             axis=axis, inplace=True)
+
+    # table to dataframe
+    table = DataFrame(table.matrix_data.toarray(),
+                      table.ids('observation'),
+                      table.ids('sample'))
+
+    # tensor building
+    tensor = build()
+    tensor.construct(table, sample_metadata,
+                     individual_id_column, state_columns)
+
+    # factorize
+    TF = TensorFactorization(
+        n_components=n_components,
+        max_als_iterations=max_iterations_als,
+        max_rtpm_iterations=max_iterations_rptm,
+        n_initializations=n_initializations).fit(rclr(tensor.counts))
+    # label tensor loadings
+    TF.label(tensor, taxonomy=feature_metadata)
+
+    # pickle the
+
+    # if the n_components is two add PC3 of zeros
+    # this is referenced as in issue in
+    # <https://github.com/biocore/emperor/commit
+    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
+    if n_components == 2:
+        TF.subjects['PC3'] = [0] * len(TF.subjects.index)
+        TF.features['PC3'] = [0] * len(TF.features.index)
+        TF.proportion_explained.loc['PC3'] = 0
+        TF.eigvals.loc['PC3'] = 0
+
+    # save ordination results
+    short_method_name = 'CTF_Biplot'
+    long_method_name = 'Compositional Tensor Factorization Biplot'
+    # only keep PC -- other tools merge metadata
+    keep_PC = [col for col in TF.features.columns if 'PC' in col]
+    ordination = OrdinationResults(
+        short_method_name,
+        long_method_name,
+        TF.eigvals,
+        samples=TF.subjects[keep_PC],
+        features=TF.features[keep_PC],
+        proportion_explained=TF.proportion_explained)
+
+    # save distance matrix for each condition
+    distances = {}
+    subject_trajectories = {}
+    feature_trajectories = {}
+    for condition, dist, straj, ftraj in zip(tensor.conditions,
+                                             TF.subject_distances,
+                                             TF.subject_trajectory,
+                                             TF.feature_trajectory):
+        # match distances to metadata
+        ids = straj.index
+        ind_dict = dict((ind, ind_i) for ind_i, ind in enumerate(ids))
+        inter = set(ind_dict).intersection(sample_metadata.index)
+        indices = sorted([ind_dict[ind] for ind in inter])
+        dist = dist[indices, :][:, indices]
+        distances[condition] = skbio.stats.distance.DistanceMatrix(
+            dist, ids=ids[indices])
+        # save traj.
+        subject_trajectories[condition] = straj
+        ftraj.index = ftraj.index.astype(str)
+        feature_trajectories[condition] = ftraj
+    return ordination, distances, subject_trajectories, feature_trajectories