From d17db33a00a8d8548cccac24b2c9016ad801a5c9 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 6 Nov 2023 15:56:41 +0100 Subject: [PATCH] Set min contig size to 2000 and warn if smaller contig is seen We sometimes see users run Vamb with a lot of small contigs. In order to alert users that this will results in degraded binning performance, Vamb now warns users both with a runtime warning to stderr, and with a warning in the log file, if any contigs are shorter than the minimum contig size. Also, set the default minimum contig size to 2000. --- test/test_parsecontigs.py | 4 ++++ vamb/__main__.py | 4 ++-- vamb/parsecontigs.py | 26 ++++++++++++++++++++++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/test/test_parsecontigs.py b/test/test_parsecontigs.py index bd78a63a..68a6bd38 100644 --- a/test/test_parsecontigs.py +++ b/test/test_parsecontigs.py @@ -51,6 +51,10 @@ def test_warns_n_contigs(self): with self.assertWarns(UserWarning): Composition.from_file(self.io, minlength=250) + def test_warns_minlength(self): + with self.assertWarns(UserWarning): + Composition.from_file(self.large_io, minlength=275) + def test_filter_minlength(self): minlen = 500 diff --git a/vamb/__main__.py b/vamb/__main__.py index 69a5a02a..2b1a2da1 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -1790,8 +1790,8 @@ def add_input_output_arguments(subparser): dest="minlength", metavar="", type=int, - default=250, - help="ignore contigs shorter than this [250]", + default=2000, + help="ignore contigs shorter than this [2000]", ) inputos.add_argument( "-z", diff --git a/vamb/parsecontigs.py b/vamb/parsecontigs.py index 5cd3391b..c9d7d014 100644 --- a/vamb/parsecontigs.py +++ b/vamb/parsecontigs.py @@ -5,7 +5,6 @@ ... tnfs, contignames, lengths = read_contigs(filehandle) """ -import sys import os as _os import numpy as _np import vamb.vambtools as _vambtools @@ -163,14 +162,15 @@ def _convert(raw: _vambtools.PushArray, projected: _vambtools.PushArray): def from_file( cls: type[C], filehandle: Iterable[bytes], - minlength: int = 100, + minlength: int = 2000, logfile: Optional[IO[str]] = None, ) -> C: """Parses a FASTA file open in binary reading mode, returning Composition. Input: filehandle: Filehandle open in binary mode of a FASTA file - minlength: Ignore any references shorter than N bases [100] + minlength: Ignore any references shorter than N bases [2000] + logfile: Logfile to print warning to, if any """ if minlength < 4: @@ -181,11 +181,14 @@ def from_file( lengths = _vambtools.PushArray(_np.int32) mask = bytearray() # we convert to Numpy at end contignames: list[str] = list() + minimum_seen_length = 2_000_000_000 entries = _vambtools.byte_iterfasta(filehandle) for entry in entries: - skip = len(entry) < minlength + length = len(entry) + minimum_seen_length = min(minimum_seen_length, length) + skip = length < minlength mask.append(not skip) if skip: @@ -227,4 +230,19 @@ def from_file( print("\n", file=logfile) print(message, file=logfile) + + # Warn the user if any contigs have been observed, which is smaller + # than the threshold. + if not _np.all(metadata.mask): + message = ( + f"WARNING: The minimum sequence length has been set to {minlength}, but a contig with " + f"length {minimum_seen_length} was seen. " + "Better results are obtained if the sequence file is filtered to the minimum " + "sequence length before mapping." + ) + warnings.warn(message) + if logfile is not None: + print("\n", file=logfile) + print(message, file=logfile) + return cls(metadata, tnfs_arr)