diff --git a/pyproject.toml b/pyproject.toml
index b4b95229..8ca3ee1d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,3 +31,10 @@ build-backend = "setuptools.build_meta"
 
 [tool.ruff]
 ignore = ["E501"]
+
+# pyproject.toml
+[tool.pytest.ini_options]
+filterwarnings = [
+    "error",
+    "ignore::UserWarning",
+]
diff --git a/test/test_encode.py b/test/test_encode.py
index 8dd47266..8163e533 100644
--- a/test/test_encode.py
+++ b/test/test_encode.py
@@ -46,10 +46,6 @@ def test_bad_args(self):
             self.rpkm.astype(np.float64), self.tnfs, self.lens, batchsize=32
         )
 
-        # Batchsize is too large
-        with self.assertRaises(ValueError):
-            vamb.encode.make_dataloader(self.rpkm, self.tnfs, self.lens, batchsize=256)
-
     def test_destroy(self):
         copy_rpkm = self.rpkm.copy()
         copy_tnfs = self.tnfs.copy()
@@ -155,6 +151,16 @@ def test_bad_args(self):
         with self.assertRaises(ValueError):
             vamb.encode.VAE(5, dropout=-0.001)
 
+    def test_samples_too_small(self):
+        vae = vamb.encode.VAE(self.rpkm.shape[1])
+        r = self.rpkm.copy()
+        t = self.tnfs.copy()
+        l = self.lens.copy()
+
+        with self.assertWarns(UserWarning):
+            dl = vamb.encode.make_dataloader(r, t, l, batchsize=64)
+            vae.trainmodel(dl, batchsteps=None, nepochs=2)
+
     def test_loss_falls(self):
         vae = vamb.encode.VAE(self.rpkm.shape[1])
         rpkm_copy = self.rpkm.copy()
diff --git a/vamb/encode.py b/vamb/encode.py
index 0753c7f1..3f30b127 100644
--- a/vamb/encode.py
+++ b/vamb/encode.py
@@ -9,6 +9,7 @@
 from torch import Tensor
 from torch import nn as _nn
 from math import log as _log
+import warnings
 
 __doc__ = """Encode a depths matrix and a tnf matrix to latent representation.
 
@@ -86,11 +87,12 @@ def make_dataloader(
     if not (rpkm.dtype == tnf.dtype == _np.float32):
         raise ValueError("TNF and RPKM must be Numpy arrays of dtype float32")
 
-    if len(lengths) < batchsize:
-        raise ValueError(
-            "Fewer sequences left after filtering than the batch size. "
-            + "This probably means you try to run on a too small dataset (below ~5k sequences), "
-            + "Check the log file, and verify BAM file content is sensible."
+    if len(rpkm) < 20000:
+        warnings.warn(
+            f"WARNING: Creating DataLoader with only {len(rpkm)} sequences. "
+            "We normally expect 20,000 sequences or more to prevent overfitting. "
+            "As a deep learning model, VAEs are prone to overfitting with too few sequences. "
+            "You may want to lower the beta parameter, or use a different binner altogether."
         )
 
     # Copy if not destroy - this way we can have all following operations in-place
@@ -579,17 +581,6 @@ def trainmodel(
                 raise ValueError("All elements of batchsteps must be integers")
             if max(batchsteps, default=0) >= nepochs:
                 raise ValueError("Max batchsteps must not equal or exceed nepochs")
-            last_batchsize = dataloader.batch_size * 2 ** len(batchsteps)
-            if len(dataloader.dataset) < last_batchsize:  # type: ignore
-                raise ValueError(
-                    f"Last batch size of {last_batchsize} exceeds dataset length "
-                    f"of {len(dataloader.dataset)}. "  # type: ignore
-                    "This means you have too few contigs left after filtering to train. "
-                    "It is not adviced to run Vamb with fewer than 10,000 sequences "
-                    "after filtering. "
-                    "Please check the Vamb log file to see where the sequences were "
-                    "filtered away, and verify BAM files has sensible content."
-                )
             batchsteps_set = set(batchsteps)
 
         # Get number of features
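
Note on the pyproject.toml change (not part of the patch): pytest's filterwarnings
ini option applies its entries so that later entries take precedence, so "error"
promotes every warning to a test failure while "ignore::UserWarning" then exempts
UserWarning — including the new warning from make_dataloader. A minimal stdlib-only
sketch of the equivalent filter behavior:

    import warnings

    with warnings.catch_warnings():
        # Equivalent of the "error" entry: raise every warning as an exception.
        warnings.simplefilter("error")
        # Equivalent of "ignore::UserWarning": inserted ahead of the "error"
        # filter, so it matches first and UserWarning stays silent.
        warnings.filterwarnings("ignore", category=UserWarning)

        warnings.warn("too few sequences", UserWarning)  # silently ignored
        try:
            warnings.warn("deprecated", DeprecationWarning)
        except DeprecationWarning:
            print("any other warning category is raised as an error")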
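
The new test_samples_too_small works despite that global ignore because
unittest's assertWarns installs its own catch-all warning filter for the duration
of the with block, so the UserWarning is still observed. A self-contained sketch
of the same pattern, with a hypothetical make_dataloader_stub standing in for
vamb.encode.make_dataloader:

    import unittest
    import warnings


    def make_dataloader_stub(n_sequences: int):
        """Stand-in for make_dataloader's small-dataset check in the patch."""
        if n_sequences < 20000:
            warnings.warn(
                f"Creating DataLoader with only {n_sequences} sequences.",
                UserWarning,
            )
        return object()  # the real function returns a DataLoader


    class TestSmallDataset(unittest.TestCase):
        def test_warns_below_threshold(self):
            # assertWarns resets filters to "always" inside the block,
            # so the warning is caught even if globally ignored.
            with self.assertWarns(UserWarning):
                make_dataloader_stub(111)


    if __name__ == "__main__":
        unittest.main()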
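
For context on the guard removed from trainmodel: judging from the deleted
expression, the batch size doubles once per entry in batchsteps, so the final
batch size is batch_size * 2 ** len(batchsteps), and the old code errored when
the dataset could not fill that final batch. A worked example with hypothetical
values (Vamb's historical default batchsteps list had four entries):

    # Hypothetical: initial batch size 64 and four batchsteps.
    batch_size = 64
    batchsteps = [25, 75, 150, 300]

    # The removed guard's arithmetic: one doubling per batchstep.
    last_batchsize = batch_size * 2 ** len(batchsteps)
    assert last_batchsize == 1024

    # The old ValueError fired when len(dataset) < last_batchsize,
    # e.g. a 900-contig dataset could never fill one final batch of 1024.
    # The patch drops this hard error in favor of the 20,000-sequence
    # warning at dataloader creation time.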