Merge pull request #15 from cameronmartino/update_tensor_building

Update tensor building
biocore · Jun 13, 2019 · 770b7f0 · 770b7f0
2 parents 2b8edb5 + 2fc83d3
commit 770b7f0
Show file tree

Hide file tree

Showing 13 changed files with 646 additions and 631 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -9,11 +9,12 @@ branch = True
 include = */gemelli/*
 
 [report]
+show_missing = True
 exclude_lines =
     pragma: no cover
     raise NotImplementedError
     if __name__ == .__main__.:
 omit =
     */tests*
     */__init__.py
-
+    */base.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,46 @@
+(2019-06-12)
+
+### Features
+
+* Tensor Building and RCLR transformation in `preprocessing.rclr` and `preprocessing.build`
+    * N-mode tensor building and transformation
+    * Mean of counts for subject-conditional pairs with several samples
+
+### Backward-incompatible changes [stable]
+
+* In `preprocessing.build`:
+    * previous -> current
+    * build().sample_order -> build().subject_order
+    * build().temporal_order -> build().condition_orders
+        * as a list for N possible condition(s)
+    * build().tensor -> build().counts
+
+### Backward-incompatible changes [experimental]
+
+### Performance enhancements
+
+* tensor building and transformation
+
+### Bug fixes
+
+* line 359-360 in `factorization.tenals` caused np.nan(s) in the solution
+    * fixed by adding a pseudocount if there is any nan in the solution
+
+* line 178-179 in `factorization.TenAls`
+    * previously checked whether all entries were missing/zero, rather than whether no entries were missing/zero as intended
+
+### Deprecated functionality [stable]
+
+* In `preprocessing.rclr` and `preprocessing.build`:
+    * build().transform() -> `preprocessing.rclr` as standalone function
+
+### Deprecated functionality [experimental]
+
+### Miscellaneous
+
+* line 175 in `factorization.TenAls` now raises a ValueError if the input is not a numpy array
+
+
 (2019-05-17)
 
 ### Features

diff --git a/README.md b/README.md
@@ -6,6 +6,34 @@
 
 # gemelli
 
+## usage
+
+```python
+import numpy as np
+import pandas as pd
+from gemelli.factorization import TenAls
+from gemelli.preprocessing import build, rclr
+
+# construct and transform the tensor
+tensor = Build()
+tensor.construct(table, metadata, subjects,
+                 [condition_1, condition_2, ..., condition_n])
+tensor_rclr = rclr(tensor.counts)
+# factorize
+TF = TenAls().fit(tensor_rclr)
+# write loading files 
+PC = ['PC'+str(i+1) for i in range(rank)]
+# loadings as dataframes
+sample_loading = pd.DataFrame(abs(TF.sample_loading),
+                              tensor.subject_order)
+feature_loading = pd.DataFrame(TF.feature_loading,
+                               tensor.feature_order)
+temporal_loading = pd.DataFrame(TF.conditional_loading,
+                                tensor.condition_orders[0])
+```
+
+## resources
+
 Named after gemelli by alighiero boetti and also the pasta. 
 
 [TenAls translated from Sewoong Oh](http://swoh.web.engr.illinois.edu/software/optspace/code.html)
diff --git a/ci/pip_requirements.txt b/ci/pip_requirements.txt
@@ -1,2 +1 @@
 coveralls
-gneiss
diff --git a/gemelli/base.py b/gemelli/base.py
@@ -21,25 +21,30 @@ def fit(self):
         should be implemetned by sub-method"""
 
     def transform(self):
-        """ TODO
+        """ return loadings
         """
         return self.sample_loading, \
             self.feature_loading, \
             self.conditional_loading
 
 
-class _BaseTransform(object):
+class _BaseConstruct(object):
 
     """Base class for transformation/norm methods.
     Warning: This class should not be used directly.
     Use derived classes instead.
     """
     @abstractmethod
-    def fit(self):
-        """ Placeholder for fit this
-        should be implemetned by sub-method"""
-
-    def transform(self):
-        """ return transformed
-        """
-        return self.TRCLR
+    def construct(self):
+        """          
+        conditional_loading  : array-like or list of array-like 
+             The conditional loading vectors 
+             of shape (conditions, r) if there is 1 type 
+             of condition, and a list of such matrices if 
+             there are more than 1 type of condition 
+         feature_loading : array-like 
+             The feature loading vectors 
+             of shape (features, r) 
+         sample_loading : array-like 
+             The sample loading vectors 
+             of shape (samples, r) """
diff --git a/gemelli/factorization.py b/gemelli/factorization.py
@@ -15,7 +15,14 @@
 
 class TenAls(_BaseImpute):
 
-    def __init__(self, rank=3, iteration=50, ninit=50, tol=1e-8):
+    def __init__(
+            self,
+            rank=3,
+            iteration=50,
+            ninit=50,
+            nitr_RTPM=50,
+            tol=1e-8,
+            pseudocount=1.0):
         """
 
         This class performs a low-rank 3rd order
@@ -129,6 +136,8 @@ def __init__(self, rank=3, iteration=50, ninit=50, tol=1e-8):
         self.iteration = iteration
         self.ninit = ninit
         self.tol = tol
+        self.pseudocount = pseudocount
+        self.nitr_RTPM = nitr_RTPM
 
     def fit(self, Tensor):
         """
@@ -164,15 +173,10 @@ def _fit(self):
         sparse_tensor = self.sparse_tensor
 
         if not isinstance(sparse_tensor, np.ndarray):
-            sparse_tensor = np.array(sparse_tensor)
-            if not isinstance(sparse_tensor, np.ndarray):
-                raise ValueError('Input data is should be type numpy.ndarray')
-            if len(sparse_tensor.shape) < 3 or len(sparse_tensor.shape) > 3:
-                raise ValueError('Input data is should be 3rd-order tensor',
-                                 ' with shape (samples, features, time)')
-
-        if (np.count_nonzero(sparse_tensor) == 0 and
-                np.count_nonzero(~np.isnan(sparse_tensor)) == 0):
+            raise ValueError('Input data is should be type numpy.ndarray')
+
+        if (np.count_nonzero(sparse_tensor) == np.product(sparse_tensor.shape) and
+                np.count_nonzero(~np.isnan(sparse_tensor)) == np.product(sparse_tensor.shape)):
             raise ValueError('No missing data in the format np.nan or 0')
 
         if np.count_nonzero(np.isinf(sparse_tensor)) != 0:
@@ -189,7 +193,9 @@ def _fit(self):
                                     r=self.rank,
                                     ninit=self.ninit,
                                     nitr=self.iteration,
-                                    tol=self.tol)
+                                    nitr_RTPM=self.nitr_RTPM,
+                                    tol=self.tol,
+                                    pseudocount=self.pseudocount)
 
         self.loadings = loadings
         self.eigenvalues = np.diag(s_)
@@ -206,7 +212,15 @@ def _fit(self):
         self.dist = dist
 
 
-def tenals(TE, E, r=3, ninit=50, nitr=50, tol=1e-8):
+def tenals(
+        TE,
+        E,
+        r=3,
+        ninit=50,
+        nitr=50,
+        nitr_RTPM=50,
+        tol=1e-8,
+        pseudocount=1.0):
     """
     A low-rank 3rd order tensor factorization
     for partially observered non-symmetric
@@ -341,6 +355,8 @@ def tenals(TE, E, r=3, ninit=50, nitr=50, tol=1e-8):
                                                   0, 0))
 
                 v_alt[dim] = V_alt[dim][:, q] + v_dim.flatten()
+                # add pseudocount to prevent division by zero causing nan.
+                den[dim][den[dim] == 0] = pseudocount
                 v_alt[dim] = v_alt[dim] / den[dim]
 
                 if dim == len(dims) - 1:
@@ -453,7 +469,8 @@ def RTPM(TE, r, ninit, nitr):
             tS[init] = TenProjAlt(TE - CPcomp(S0, U),
                                   [tUn[:, [init]] for tUn in tU])
 
-        idx = np.argmax(tS, axis=0)[0]
+        idx = np.argmin(tS, axis=0)[0]
+
         for tUn, Un in zip(tU, U):
             Un[:, i] = tUn[:, idx] / norm(tUn[:, idx])
 
@@ -563,7 +580,7 @@ def CPcomp(S, U):
     """
 
     output_shape = tuple(u.shape[0] for u in U)
-    to_multiply = [S.T*u if i == 0 else u for i, u in enumerate(U)]
+    to_multiply = [S.T * u if i == 0 else u for i, u in enumerate(U)]
     product = khatri_rao(to_multiply)
     T = product.sum(1).reshape(output_shape)
 
@@ -598,7 +615,8 @@ def TenProjAlt(D, U_list):
 
 
 def khatri_rao(matrices):
-    """Returns the Khatri Rao product of a list of matrices
+    """
+    Returns the Khatri Rao product of a list of matrices
 
     Modified from TensorLy