Skip to content

Commit

Permalink
Set min contig size to 2000 and warn if smaller contig is seen
Browse files Browse the repository at this point in the history
We sometimes see users run Vamb with a lot of small contigs. In order to alert
users that this will results in degraded binning performance, Vamb now warns
users both with a runtime warning to stderr, and with a warning in the log file,
if any contigs are shorter than the minimum contig size.
Also, set the default minimum contig size to 2000.
  • Loading branch information
jakobnissen committed Nov 6, 2023
1 parent 4cf09e8 commit d17db33
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 6 deletions.
4 changes: 4 additions & 0 deletions test/test_parsecontigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ def test_warns_n_contigs(self):
with self.assertWarns(UserWarning):
Composition.from_file(self.io, minlength=250)

def test_warns_minlength(self):
with self.assertWarns(UserWarning):
Composition.from_file(self.large_io, minlength=275)

def test_filter_minlength(self):
minlen = 500

Expand Down
4 changes: 2 additions & 2 deletions vamb/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1790,8 +1790,8 @@ def add_input_output_arguments(subparser):
dest="minlength",
metavar="",
type=int,
default=250,
help="ignore contigs shorter than this [250]",
default=2000,
help="ignore contigs shorter than this [2000]",
)
inputos.add_argument(
"-z",
Expand Down
26 changes: 22 additions & 4 deletions vamb/parsecontigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
... tnfs, contignames, lengths = read_contigs(filehandle)
"""

import sys
import os as _os
import numpy as _np
import vamb.vambtools as _vambtools
Expand Down Expand Up @@ -163,14 +162,15 @@ def _convert(raw: _vambtools.PushArray, projected: _vambtools.PushArray):
def from_file(
cls: type[C],
filehandle: Iterable[bytes],
minlength: int = 100,
minlength: int = 2000,
logfile: Optional[IO[str]] = None,
) -> C:
"""Parses a FASTA file open in binary reading mode, returning Composition.
Input:
filehandle: Filehandle open in binary mode of a FASTA file
minlength: Ignore any references shorter than N bases [100]
minlength: Ignore any references shorter than N bases [2000]
logfile: Logfile to print warning to, if any
"""

if minlength < 4:
Expand All @@ -181,11 +181,14 @@ def from_file(
lengths = _vambtools.PushArray(_np.int32)
mask = bytearray() # we convert to Numpy at end
contignames: list[str] = list()
minimum_seen_length = 2_000_000_000

entries = _vambtools.byte_iterfasta(filehandle)

for entry in entries:
skip = len(entry) < minlength
length = len(entry)
minimum_seen_length = min(minimum_seen_length, length)
skip = length < minlength
mask.append(not skip)

if skip:
Expand Down Expand Up @@ -227,4 +230,19 @@ def from_file(
print("\n", file=logfile)
print(message, file=logfile)


# Warn the user if any contigs have been observed, which is smaller
# than the threshold.
if not _np.all(metadata.mask):
message = (
f"WARNING: The minimum sequence length has been set to {minlength}, but a contig with "
f"length {minimum_seen_length} was seen. "
"Better results are obtained if the sequence file is filtered to the minimum "
"sequence length before mapping."
)
warnings.warn(message)
if logfile is not None:
print("\n", file=logfile)
print(message, file=logfile)

return cls(metadata, tnfs_arr)

0 comments on commit d17db33

Please sign in to comment.