Warn users on too few contigs
Now warn users already when parsing contigs if there are fewer than 20,000
sequences, in order to prevent them from accidentally overfitting the VAE.
jakobnissen committed Nov 6, 2023
1 parent 9a8a66d commit 264b5a2
Showing 6 changed files with 66 additions and 21 deletions.
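
Sketch of the user-visible effect (the in-memory FASTA below is made up; Composition.from_file and the 20,000-sequence threshold are taken from the diff that follows):

import io
import warnings

from vamb.parsecontigs import Composition

# A FASTA with a single short contig - far below the new 20,000-sequence threshold.
fasta = io.BytesIO(b">contig1\n" + b"ACGT" * 100 + b"\n")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    Composition.from_file(fasta, minlength=100)

assert any("20,000 sequences" in str(w.message) for w in caught)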
4 changes: 0 additions & 4 deletions test/test_encode.py
@@ -157,10 +157,6 @@ def test_samples_too_small(self):
t = self.tnfs.copy()
l = self.lens.copy()

with self.assertWarns(UserWarning):
dl = vamb.encode.make_dataloader(r, t, l, batchsize=256)
vae.trainmodel(dl, batchsteps=None, nepochs=2)

def test_loss_falls(self):
vae = vamb.encode.VAE(self.rpkm.shape[1])
rpkm_copy = self.rpkm.copy()
35 changes: 30 additions & 5 deletions test/test_parsecontigs.py
@@ -2,13 +2,15 @@
import unittest
import random
import numpy as np
import warnings

import testtools
from vamb.parsecontigs import Composition, CompositionMetaData


class TestReadContigs(unittest.TestCase):
records = []
large_io = io.BytesIO()
io = io.BytesIO()

@classmethod
@@ -21,8 +23,14 @@ def setUpClass(cls):
cls.io.write(i.format().encode())
cls.io.write(b"\n")

for i in range(25_000):
record = testtools.make_randseq(rng, 250, 300)
cls.large_io.write(record.format().encode())
cls.large_io.write(b"\n")

def setUp(self):
self.io.seek(0)
self.large_io.seek(0)

def test_unique_names(self):
with self.assertRaises(ValueError):
@@ -33,9 +41,22 @@ def test_unique_names(self):
1000,
)

# Does not warn
def test_nowarn(self):
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
Composition.from_file(self.large_io, minlength=250)

def test_warns_n_contigs(self):
with self.assertWarns(UserWarning):
Composition.from_file(self.io, minlength=250)

def test_filter_minlength(self):
minlen = 500
composition = Composition.from_file(self.io, minlength=450)

with self.assertWarns(UserWarning):
composition = Composition.from_file(self.io, minlength=450)

md = composition.metadata
hash1 = md.refhash

@@ -74,7 +95,8 @@ def test_minlength(self):
Composition.from_file(self.io, minlength=3)

def test_properties(self):
composition = Composition.from_file(self.io, minlength=420)
with self.assertWarns(UserWarning):
composition = Composition.from_file(self.io, minlength=420)
passed = list(filter(lambda x: len(x.sequence) >= 420, self.records))

self.assertEqual(composition.nseqs, len(composition.metadata.identifiers))
@@ -96,7 +118,8 @@ def test_properties(self):

def test_save_load(self):
buf = io.BytesIO()
composition_1 = Composition.from_file(self.io)
with self.assertWarns(UserWarning):
composition_1 = Composition.from_file(self.io)
md1 = composition_1.metadata
composition_1.save(buf)
buf.seek(0)
@@ -126,8 +149,10 @@ def test_windows_newlines(self):

buf1.seek(0)
buf2.seek(0)
comp1 = Composition.from_file(buf1)
comp2 = Composition.from_file(buf2)
with self.assertWarns(UserWarning):
comp1 = Composition.from_file(buf1)
with self.assertWarns(UserWarning):
comp2 = Composition.from_file(buf2)

self.assertEqual(comp1.metadata.refhash, comp2.metadata.refhash)
self.assertTrue(np.all(comp1.matrix == comp2.matrix))
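
unittest has assertWarns but no assertNotWarns, so test_nowarn falls back on the stdlib idiom of promoting UserWarning to an error inside a catch_warnings block; a standalone sketch of the pattern (parse_quietly is a hypothetical stand-in for the code under test):

import unittest
import warnings


def parse_quietly():
    return 42  # stands in for e.g. Composition.from_file on a large input


class ExampleTest(unittest.TestCase):
    def test_emits_no_userwarning(self):
        with warnings.catch_warnings():
            # Turn UserWarning into an exception: if the code under test warns,
            # the test fails loudly instead of passing silently.
            warnings.simplefilter("error", UserWarning)
            parse_quietly()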
2 changes: 1 addition & 1 deletion vamb/__main__.py
@@ -524,7 +524,7 @@ def calc_tnf(
log(f"Loading data from FASTA file {path}", logfile, 1)
with vamb.vambtools.Reader(str(path)) as file:
composition = vamb.parsecontigs.Composition.from_file(
file, minlength=options.min_contig_length
file, minlength=options.min_contig_length, logfile=logfile
)
composition.save(outdir.joinpath("composition.npz"))

9 changes: 0 additions & 9 deletions vamb/encode.py
@@ -9,7 +9,6 @@
from torch import Tensor
from torch import nn as _nn
from math import log as _log
import warnings

__doc__ = """Encode a depths matrix and a tnf matrix to latent representation.
@@ -87,14 +86,6 @@ def make_dataloader(
if not (rpkm.dtype == tnf.dtype == _np.float32):
raise ValueError("TNF and RPKM must be Numpy arrays of dtype float32")

if len(rpkm) < 20000:
warnings.warn(
f"WARNING: Creating DataLoader with only {len(rpkm)} sequences. "
"We normally expect 20,000 sequences or more to prevent overfitting. "
"As a deep learning model, VAEs are prone to overfitting with too few sequences. "
"You may want to lower the beta parameter, or use a different binner altogether."
)

# Copy if not destroy - this way we can have all following operations in-place
# for simplicity
if not destroy:
25 changes: 23 additions & 2 deletions vamb/parsecontigs.py
@@ -5,12 +5,14 @@
... tnfs, contignames, lengths = read_contigs(filehandle)
"""

import sys
import os as _os
import numpy as _np
import vamb.vambtools as _vambtools
from collections.abc import Iterable, Sequence
from typing import IO, Union, TypeVar
from typing import IO, Union, TypeVar, Optional
from pathlib import Path
import warnings

# This kernel is created in src/create_kernel.py. See that file for explanation
_KERNEL: _np.ndarray = _vambtools.read_npz(
@@ -158,7 +160,12 @@ def _convert(raw: _vambtools.PushArray, projected: _vambtools.PushArray):
raw.clear()

@classmethod
def from_file(cls: type[C], filehandle: Iterable[bytes], minlength: int = 100) -> C:
def from_file(
cls: type[C],
filehandle: Iterable[bytes],
minlength: int = 100,
logfile: Optional[IO[str]] = None,
) -> C:
"""Parses a FASTA file open in binary reading mode, returning Composition.
Input:
@@ -206,4 +213,18 @@ def from_file(cls: type[C], filehandle: Iterable[bytes], minlength: int = 100) -> C:
_np.array(mask, dtype=bool),
minlength,
)

if len(metadata.lengths) < 20_000:
message = (
f"WARNING: Parsed only {len(metadata.lengths)} sequences from FASTA file. "
"We normally expect 20,000 sequences or more to prevent overfitting. "
"As a deep learning model, VAEs are prone to overfitting with too few sequences. "
"You may want to bin more samples as a time, lower the beta parameter, "
"or use a different binner altogether."
)
warnings.warn(message)
if logfile is not None:
print("\n", file=logfile)
print(message, file=logfile)

return cls(metadata, tnfs_arr)
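
Call-site usage, as a sketch (both file paths and the minlength value are hypothetical; Reader and the new logfile parameter are used exactly as in __main__.py above):

import vamb

with vamb.vambtools.Reader("contigs.fna") as file, open("log.txt", "w") as logfile:
    # Any warning about too few contigs is now also echoed to the log file.
    composition = vamb.parsecontigs.Composition.from_file(
        file, minlength=2000, logfile=logfile
    )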
12 changes: 12 additions & 0 deletions vamb/vambtools.py
@@ -13,6 +13,18 @@
from collections.abc import Iterable, Iterator, Generator
from typing import Optional, IO, Union
from pathlib import Path
import warnings


def showwarning_override(message, category, filename, lineno, file=None, line=None):
print(str(message) + "\n", file=file)


# It may seem horrifying to override a stdlib function, but this is the way recommended
# by the warnings documentation.
# We do it because it's the only way I know to prevent displaying file names, line numbers
# and source code to our users, which I think makes for a terrible user experience
warnings.showwarning = showwarning_override
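
The effect of the assignment above, as a sketch (the warning text is made up, and the default rendering shown in the comments is approximate):

import warnings
import vamb.vambtools  # importing the module installs the override as a side effect

warnings.warn("Parsed only 5,000 sequences")
# With the override, only the message itself is printed (followed by a blank line):
#   Parsed only 5,000 sequences
# The stdlib default would instead have rendered something like:
#   /path/to/script.py:4: UserWarning: Parsed only 5,000 sequences
#     warnings.warn("Parsed only 5,000 sequences")

Note that when file is None, the override's print call writes to stdout, whereas the stdlib default writes to sys.stderr.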


class PushArray:
