Merge pull request #40 from biocore/two-time-bug

Fixes the two-subject / two-state tensor error
biocore · Mar 11, 2021 · 0fbe6ac · 0fbe6ac
2 parents c419b03 + 0db8e22
commit 0fbe6ac
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 24 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+v0.0.7 (2020-12-12)
+
+### Bug fixes
+
+* Fixes in `optspace.py` function `rank_estimate` and `factorization.py` function `tenals`
+    * fixes handling of tensors with only two subjects or two states
+    * added a test in `test_method.py` for a two-subject or two-state tensor
+
 v0.0.6 (2020-09-27)
 
 ### Features

diff --git a/gemelli/__init__.py b/gemelli/__init__.py
@@ -6,4 +6,4 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # ----------------------------------------------------------------------------
 
-__version__ = "0.0.6"
+__version__ = "0.0.7"
diff --git a/gemelli/factorization.py b/gemelli/factorization.py
@@ -531,6 +531,9 @@ def tenals(tensor,
             total_nonzeros = np.count_nonzero(mask[i, :, :].copy())
             n_, m_ = obs_tmp.shape
             eps_tmp = total_nonzeros / np.sqrt(n_ * m_)
+            if min(obs_tmp.shape) <= 2:
+                # two-subjects/time is already low-rank
+                continue
             if rank_estimate(obs_tmp, eps_tmp) >= (min(obs_tmp.shape) - 1):
                 warnings.warn('A component of your data may be high-rank.',
                               RuntimeWarning)

diff --git a/gemelli/optspace.py b/gemelli/optspace.py
@@ -406,7 +406,6 @@ def grassmann_manifold_two(U, step_size, n_components):
 
 def rank_estimate(obs, eps, k=20, lam=0.05,
                   min_rank=3, max_iter=5000):
-
     """
     This function estimates the rank of a
     sparse matrix (i.e. with missing values).
@@ -440,9 +439,11 @@ def rank_estimate(obs, eps, k=20, lam=0.05,
            Conference on Communication, Control,
            and Computing (Allerton) 1216–1222 (2009).
     """
-
     # dim. of the data
     n, m = obs.shape
+    # ensure rank worth estimating
+    if min(n, m) <= 2:
+        return min_rank
     # get N-singular values
     s = svds(obs,  min(k, n, m) - 1, which='LM',
              return_singular_vectors=False)[::-1]

diff --git a/gemelli/q2/tests/test_method.py b/gemelli/q2/tests/test_method.py
@@ -33,41 +33,54 @@ def setUp(self):
         self.in_table, self.in_meta = create_test_table()
         self.subj = 'host_subject_id'
         self.state = 'context'
+        self.biom_table = load_table(self.in_table)
+        self.meta_table = read_csv(self.in_meta, sep='\t', index_col=0)
+        # two-time-point metadata: 11 subjects, each at states 0 and 1
+        self.meta_table_two_time = self.meta_table.iloc[:22, :].copy()
+        self.meta_table_two_time[self.state] = [0] * 11 + [1] * 11
+        self.meta_table_two_time[self.subj] = [i for i in range(11)] * 2
 
     def test_ctf(self):
         """Tests the basic validity of the actual ctf() method's outputs."""
-        self.biom_table = load_table(self.in_table)
-        self.meta_table = read_csv(self.in_meta, sep='\t', index_col=0)
-        ord1, ord2, disttst, stst, ftst = ctf(table=self.biom_table,
-                                              sample_metadata=self.meta_table,
-                                              individual_id_column=self.subj,
-                                              state_column=self.state)
-        # Validate types of the ctf outputs
-        self.assertIsInstance(ord1, OrdinationResults)
-        self.assertIsInstance(ord2, OrdinationResults)
-        self.assertIsInstance(disttst, DistanceMatrix)
-        self.assertIsInstance(stst, pd.DataFrame)
-        self.assertIsInstance(ftst, pd.DataFrame)
-        # Ensure that no NaNs are in the OrdinationResults
-        # NOTE that we have to use the DataFrame .any() functions instead of
-        # python's built-in any() functions -- see #29 for details on this
-        self.assertFalse(np.isnan(ord1.features).any(axis=None))
-        self.assertFalse(np.isnan(ord1.samples).any(axis=None))
-        self.assertFalse(np.isnan(ord2.features).any(axis=None))
-        self.assertFalse(np.isnan(ord2.samples).any(axis=None))
+        for meta_classes in [self.meta_table, self.meta_table_two_time]:
+            res_tmp = ctf(table=self.biom_table,
+                          sample_metadata=meta_classes,
+                          individual_id_column=self.subj,
+                          state_column=self.state)
+            ord1, ord2, disttst, stst, ftst = res_tmp
+            # Validate types of the ctf outputs
+            self.assertIsInstance(ord1, OrdinationResults)
+            self.assertIsInstance(ord2, OrdinationResults)
+            self.assertIsInstance(disttst, DistanceMatrix)
+            self.assertIsInstance(stst, pd.DataFrame)
+            self.assertIsInstance(ftst, pd.DataFrame)
+            # Ensure that no NaNs are in the OrdinationResults,
+            # for both the full and the two-time-point metadata.
+            # NOTE: we have to use the DataFrame .any() method
+            # instead of Python's built-in any() function --
+            # see #29 for details on this.
+            self.assertFalse(np.isnan(ord1.features).any(axis=None))
+            self.assertFalse(np.isnan(ord1.samples).any(axis=None))
+            self.assertFalse(np.isnan(ord2.features).any(axis=None))
+            self.assertFalse(np.isnan(ord2.samples).any(axis=None))
 
 
 class Test_qiime2_ctf(unittest.TestCase):
 
     def setUp(self):
+        self.subj = 'host_subject_id'
+        self.state = 'context'
         self.in_table, self.in_meta = create_test_table()
         self.biom_table = load_table(self.in_table)
         self.q2table = Artifact.import_data("FeatureTable[Frequency]",
                                             self.biom_table)
         self.meta_table = read_csv(self.in_meta, sep='\t', index_col=0)
+        # two-time-point metadata: 11 subjects, each at states 0 and 1
+        self.meta_table_two_time = self.meta_table.iloc[:22, :].copy()
+        self.meta_table_two_time[self.state] = [0] * 11 + [1] * 11
+        self.meta_table_two_time[self.subj] = [i for i in range(11)] * 2
         self.q2meta = Metadata(self.meta_table)
-        self.subj = 'host_subject_id'
-        self.state = 'context'
+        self.q2meta_two_time = Metadata(self.meta_table_two_time)
         self.out_ = os_path_sep.join(self.in_table.split(os_path_sep)[:-1])
 
     def test_qiime2_ctf(self):