diff --git a/doc/how_to_run.md b/doc/how_to_run.md index 38755cf4..a8be2f36 100644 --- a/doc/how_to_run.md +++ b/doc/how_to_run.md @@ -21,8 +21,11 @@ $ for sample in 1 2 3; do strobealign -t 8 --aemb contigs.fna.gz ${sample}.{fw,rv}.fq.gz > aemb/${sample}.tsv; done +$ # Paste the aemb files together to make a TSV file with given header +$ cat <(echo -e "contigname\t1\t2\t3") paste aemb/1.tsv <(cut -f 2 aemb/2.tsv) <(cut -f 2 aemb/3.tsv) > abundance.tsv + $ # Run Vamb using the contigs and the directory with abundance files -$ vamb bin default --outdir vambout --fasta contigs.fna.gz --aemb aemb +$ vamb bin default --outdir vambout --fasta contigs.fna.gz --abundance_tsv abundance.tsv ``` #### TaxVamb @@ -58,14 +61,15 @@ Future runs can then instead use: #### Abundance The abundance may be computed from: -* [recommended ]A directory containing TSV files obtained by mapping reads from - each individual sample against the contig catalogue using `strobealign --aemb` * A directory of BAM files generated the same way, except using any aligner that produces a BAM file, e.g. `minimap2` +* A TSV file with the header being "contigname" followed by one samplename per sample, + and the values in the TSV file being precomputed abundances. + These may be derived from `paste`ing together outputs from the tool `strobealign --aemb` Thus, it can be specified as: ``` ---aemb dir_with_aemb_files +--abundance_tsv abundance.tsv ``` or ``` diff --git a/test/test_parsebam.py b/test/test_parsebam.py index 7dfe619c..30ff11e2 100644 --- a/test/test_parsebam.py +++ b/test/test_parsebam.py @@ -1,8 +1,8 @@ import unittest import io import numpy as np -import random import tempfile +from pathlib import Path import vamb import testtools @@ -105,59 +105,54 @@ def test_save_load(self): self.assertEqual(abundance2.refhash, self.abundance.refhash) self.assertEqual(abundance2.minid, self.abundance.minid) - def test_parse_from_aemb(self): - abundance = vamb.parsebam.Abundance.from_aemb( - testtools.AEMB_FILES, self.comp_metadata - ) - - self.assertTrue(abundance.refhash == self.comp_metadata.refhash) - self.assertTrue( - (abundance.samplenames == [str(i) for i in testtools.AEMB_FILES]).all() - ) - - manual_matrix = [] - for file in testtools.AEMB_FILES: - manual_matrix.append([]) - with open(file) as f: - for line in f: - (_, ab) = line.split() - manual_matrix[-1].append(float(ab)) - manual_matrix = list(map(list, zip(*manual_matrix))) - - self.assertTrue((np.abs(manual_matrix - abundance.matrix) < 1e-5).all()) - - # Good headers, but in other order - names = list(self.comp_metadata.identifiers) - random.shuffle(names) - files = [self.make_aemb_file(names) for i in range(2)] - a = vamb.parsebam.Abundance.from_aemb( - [f.name for f in files], self.comp_metadata - ) - self.assertIsInstance(a, vamb.parsebam.Abundance) - - def make_aemb_file(self, headers): - file = tempfile.NamedTemporaryFile(mode="w+") - for header in headers: - n = random.random() * 10 - print(header, "\t", str(n), file=file) - file.seek(0) - return file - - def test_bad_aemb(self): - # One header is wrong - names = list(self.comp_metadata.identifiers) - names[-4] = names[-4] + "a" - files = [self.make_aemb_file(names) for i in range(2)] - with self.assertRaises(ValueError): - vamb.parsebam.Abundance.from_aemb( - [f.name for f in files], self.comp_metadata + def test_parse_from_tsv(self): + # Check it parses + with open(testtools.AEMB_FILES[0]) as file: + lines = [s.rstrip() for s in file] + for path in testtools.AEMB_FILES[1:]: + with open(path) as file: + for i, existing in enumerate(file): + lines[i] += "\t" + existing.split("\t")[1].rstrip() + + with tempfile.NamedTemporaryFile(mode="w+") as file: + print("contigname\tfile1\tfile2\tfile3", file=file) + for line in lines: + print(line, file=file) + file.seek(0) + abundance = vamb.parsebam.Abundance.from_tsv( + Path(file.name), self.comp_metadata ) - # Too many headers - names = list(self.comp_metadata.identifiers) - names.append(names[0]) - files = [self.make_aemb_file(names) for i in range(2)] - with self.assertRaises(ValueError): - vamb.parsebam.Abundance.from_aemb( - [f.name for f in files], self.comp_metadata - ) + self.assertEqual(abundance.refhash, self.comp_metadata.refhash) + self.assertEqual(list(abundance.samplenames), ["file1", "file2", "file3"]) + + # Check values are alright + M = np.zeros_like(abundance.matrix) + for row, line in enumerate(lines): + for col, cell in enumerate(line.split("\t")[1:]): + M[row, col] = float(cell) + self.assertTrue((np.abs((M - abundance.matrix)) < 1e-6).all()) + + # Bad header order errors + lines[5], lines[4] = lines[4], lines[5] + + with tempfile.NamedTemporaryFile(mode="w+") as file: + print("contigname\tfile1\tfile2\tfile3", file=file) + for line in lines: + print(line, file=file) + file.seek(0) + with self.assertRaises(ValueError): + vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata) + + # Restore + lines[5], lines[4] = lines[4], lines[5] + + # Too many lines + with tempfile.NamedTemporaryFile(mode="w+") as file: + print("contigname\tfile1\tfile2\tfile3", file=file) + for line in lines: + print(line, file=file) + print(lines[-2], file=file) + file.seek(0) + with self.assertRaises(ValueError): + vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata) diff --git a/test/test_reclustering.py b/test/test_reclustering.py new file mode 100644 index 00000000..0118f74b --- /dev/null +++ b/test/test_reclustering.py @@ -0,0 +1,24 @@ +import unittest + +# Invariants: +# It produces disjoint clusters, a subset of the input points + +# For CAMI dataset, compute comp, abundance, taxonomy, markers +# Subset to e.g. 5 genera plus a few unclassified contigs + +# FASTA +# Comp +# Abundance +# Markers +# Taxonomy +# Latent +# Refined taxonomy + + +class TestKmeansReclustering(unittest.TestCase): + pass + # Make markers + lengths + # Make taxonomy + # Create latent + + # Initial clustering diff --git a/vamb/__main__.py b/vamb/__main__.py index 2a34286a..115d3f99 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -153,19 +153,13 @@ def __init__(self, paths: list[Path], min_alignment_id: Optional[float]): self.paths = paths -class AEMBPaths(list): - __slots__ = ["paths"] +class AbundanceTSVPath: + __slots__ = ["path"] - def __init__(self, dir: Path): - if not dir.is_dir(): - raise NotADirectoryError(dir) - paths = [b for b in dir.iterdir() if b.is_file() and b.suffix == ".tsv"] - if len(paths) == 0: - raise ValueError( - f'No `.tsv` files found in --aemb argument "{dir}". ' - "Make sure all TSV files in the directory ends with `.tsv`." - ) - self.paths = paths + def __init__(self, path: Path): + if not path.is_file(): + raise FileNotFoundError(path) + self.path = path class AbundanceOptions: @@ -176,7 +170,7 @@ def are_args_present(args: argparse.Namespace) -> bool: return ( isinstance(args.bampaths, list) or isinstance(args.bamdir, Path) - or isinstance(args.aemb, Path) + or isinstance(args.abundance_tsv, Path) or isinstance(args.abundancepath, Path) ) @@ -185,7 +179,7 @@ def from_args(cls, args: argparse.Namespace): return cls( typeasserted(args.bampaths, (list, type(None))), typeasserted(args.bamdir, (Path, type(None))), - typeasserted(args.aemb, (Path, type(None))), + typeasserted(args.abundance_tsv, (Path, type(None))), typeasserted(args.abundancepath, (Path, type(None))), typeasserted(args.min_alignment_id, (float, type(None))), ) @@ -194,25 +188,19 @@ def __init__( self, bampaths: Optional[list[Path]], bamdir: Optional[Path], - aemb: Optional[Path], + abundance_tsv: Optional[Path], abundancepath: Optional[Path], min_alignment_id: Optional[float], ): - assert isinstance(bampaths, (list, type(None))) - assert isinstance(bamdir, (Path, type(None))) - assert isinstance(aemb, (Path, type(None))) - assert isinstance(abundancepath, (Path, type(None))) - assert isinstance(min_alignment_id, (float, type(None))) - # Make sure only one abundance input is there if ( (bampaths is not None) + (abundancepath is not None) + (bamdir is not None) - + (aemb is not None) + + (abundance_tsv is not None) ) != 1: raise argparse.ArgumentTypeError( - "Must specify exactly one of BAM files, BAM dir, AEMB or abundance NPZ file input" + "Must specify exactly one of BAM files, BAM dir, TSV file or abundance NPZ file input" ) if abundancepath is not None: @@ -224,8 +212,8 @@ def __init__( "The --bamfiles argument is deprecated. It works, but might be removed in future versions of Vamb. Please use --bamdir instead" ) self.paths = BAMPaths(bampaths, min_alignment_id) - elif aemb is not None: - self.paths = AEMBPaths(aemb) + elif abundance_tsv is not None: + self.paths = AbundanceTSVPath(abundance_tsv) class BasicTrainingOptions: @@ -913,11 +901,10 @@ def calc_abundance( logger.info("\tOrder of columns is:") for i, samplename in enumerate(abundance.samplenames): logger.info(f"\t{i:>6}: {samplename}") - elif isinstance(paths, AEMBPaths): - paths = list(paths.paths) - logger.info(f"\tParsing {len(paths)} AEMB TSV files") - abundance = vamb.parsebam.Abundance.from_aemb( - paths, + elif isinstance(paths, AbundanceTSVPath): + logger.info(f'\tParsing abundance from TSV at "{paths.path}"') + abundance = vamb.parsebam.Abundance.from_tsv( + paths.path, comp_metadata, ) abundance.save(outdir.joinpath("abundance.npz")) @@ -1687,10 +1674,10 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser): help="Dir with .bam files to use", ) abundanceos.add_argument( - "--aemb", + "--abundance_tsv", metavar="", type=Path, - help="Dir with .tsv files from strobealign --aemb", + help='Path to TSV file of precomputed abundances with header being "contigname(\\t)*"', ) abundanceos.add_argument( "--abundance", diff --git a/vamb/parsebam.py b/vamb/parsebam.py index b4f77d93..2d213029 100644 --- a/vamb/parsebam.py +++ b/vamb/parsebam.py @@ -236,55 +236,53 @@ def run_pycoverm( return (coverage, refhash) @classmethod - def from_aemb( - cls: type[A], - paths: list[Path], - comp_metadata: CompositionMetaData, - ) -> A: - # We cannot guarantee strobealign outputs the contigs in the right order, - # so we must reorder them. - identifier_to_index: dict[str, int] = { - s: i for (i, s) in enumerate(comp_metadata.identifiers) - } - - result = _np.empty((len(identifier_to_index), len(paths)), dtype=_np.float32) - for col_num, path in enumerate(paths): - column = _np.full(len(identifier_to_index), float("nan"), dtype=_np.float32) - with vambtools.Reader(path) as file: - n_lines_seen = 0 - for line in file: - line = line.rstrip() - if len(line) == 0: - continue - n_lines_seen += 1 - if n_lines_seen > len(column): - raise ValueError( - f'Too many rows in AEMB file "{path}", expected {len(column)}' - ) - # strobealign reserves the right to add more columns in future versions - (identifier, depth_str, *_) = line.split(maxsplit=2) - index = identifier_to_index.get(identifier.decode()) - if index is None: - raise ValueError( - f'Identifier "{identifier}" seen in `strobealign --aemb` output file "{path}", ' - "but this identifier is not seen in the FASTA file.\n" - "Verify that the AEMB file was generated by mapping to the same FASTA file " - "as was input to Vamb." - ) - column[index] = float(depth_str) - - isnans = _np.isnan(column) - if isnans.any(): - first_missing_index = _np.nonzero(isnans)[0][0] - missing_identifier = comp_metadata.identifiers[first_missing_index] + def from_tsv(cls: type[A], path: Path, comp_metadata: CompositionMetaData) -> A: + seen_identifiers: list[str] = [] + with open(path) as file: + try: + header = next(file) + except StopIteration: + err = ValueError(f'Empty abundance TSV path at "{path}"') + raise err from None + columns = header.splitlines()[0].split("\t") + if len(columns) < 1: raise ValueError( - f'Identifier "{missing_identifier}" from the input FASTA file is missing from the AEMB file "{path}".' + f'Expected at least 1 column in abundance TSV file at "{path}"' + ) + if columns[0] != "contigname": + raise ValueError('First column in header must be "contigname"') + samples = columns[1:] + n_samples = len(samples) + matrix = _np.empty((comp_metadata.nseqs, n_samples), dtype=_np.float32) + # Since we read the header already and this is zero-indexed, we need to add 2 + for line_no_minus_two, line in enumerate(file): + if line_no_minus_two == comp_metadata.nseqs: + raise ValueError( + f'Too many rows in abundance TSV file "{path}", expected {comp_metadata.nseqs + 1}, got at least {line_no_minus_two + 1}' + ) + stripped = line.rstrip() + if not line: + continue + fields = stripped.split("\t") + if len(fields) != n_samples + 1: + raise ValueError( + f'In abundance TSV file "{path}", on line {line_no_minus_two + 2}, expected {n_samples + 1} columns, found {len(fields)}' + ) + for i in range(n_samples): + matrix[line_no_minus_two, i] = float(fields[i + 1]) + seen_identifiers.append(fields[0]) + + if line_no_minus_two + 1 != comp_metadata.nseqs: + raise ValueError( + f'Too few rows in abundance TSV file "{path}", expected {comp_metadata.nseqs + 1}, got {line_no_minus_two + 1}' ) - result[:, col_num] = column - return cls( - result, - [str(i) for i in paths], - 0.0, + vambtools.RefHasher.verify_refhash( + vambtools.RefHasher.hash_refnames(seen_identifiers), comp_metadata.refhash, + "abundance TSV", + "composition", + (seen_identifiers, comp_metadata.identifiers), ) + + return cls(matrix, samples, 0.0, comp_metadata.refhash)