Replace AEMB parser with generic TSV parser

In issue 355, a user requested to bring back the --jgi option, suc that users can pass in precomputed abundances. That's completely reasonable, and it's not great that the currently recommended way is to use the internal Python types to create an Abundance object. Instead of bringing back the JGI parser, I made a more generic TSV parser. TSV is a simple, straightforward format also used elsewhere in Vamb. Also, since it's easy to convert the output from --aemb to TSV format, I removed the --aemb option.
RasmussenLab · Sep 4, 2024 · 9810ef0 · 9810ef0
1 parent bb2388d
commit 9810ef0
Show file tree

Hide file tree

Showing 5 changed files with 146 additions and 138 deletions.
diff --git a/doc/how_to_run.md b/doc/how_to_run.md
@@ -21,8 +21,11 @@ $ for sample in 1 2 3; do
       strobealign -t 8 --aemb contigs.fna.gz ${sample}.{fw,rv}.fq.gz > aemb/${sample}.tsv;
   done
 
+$ # Paste the aemb files together to make a TSV file with given header
+$ cat <(echo -e "contigname\t1\t2\t3") paste aemb/1.tsv <(cut -f 2 aemb/2.tsv) <(cut -f 2 aemb/3.tsv) > abundance.tsv
+
 $ # Run Vamb using the contigs and the directory with abundance files
-$ vamb bin default --outdir vambout --fasta contigs.fna.gz --aemb aemb
+$ vamb bin default --outdir vambout --fasta contigs.fna.gz --abundance_tsv abundance.tsv
 ```
 
 #### TaxVamb
@@ -58,14 +61,15 @@ Future runs can then instead use:
 
 #### Abundance
 The abundance may be computed from:
-* [recommended ]A directory containing TSV files obtained by mapping reads from
-  each individual sample against the contig catalogue using `strobealign --aemb`
 * A directory of BAM files generated the same way, except using any aligner that
   produces a BAM file, e.g. `minimap2`
+* A TSV file with the header being "contigname" followed by one samplename per sample,
+  and the values in the TSV file being precomputed abundances.
+  These may be derived from `paste`ing together outputs from the tool `strobealign --aemb`
 
 Thus, it can be specified as:
 ```
---aemb dir_with_aemb_files
+--abundance_tsv abundance.tsv
 ```
 or
 ```

diff --git a/test/test_parsebam.py b/test/test_parsebam.py
@@ -1,8 +1,8 @@
 import unittest
 import io
 import numpy as np
-import random
 import tempfile
+from pathlib import Path
 
 import vamb
 import testtools
@@ -105,59 +105,54 @@ def test_save_load(self):
         self.assertEqual(abundance2.refhash, self.abundance.refhash)
         self.assertEqual(abundance2.minid, self.abundance.minid)
 
-    def test_parse_from_aemb(self):
-        abundance = vamb.parsebam.Abundance.from_aemb(
-            testtools.AEMB_FILES, self.comp_metadata
-        )
-
-        self.assertTrue(abundance.refhash == self.comp_metadata.refhash)
-        self.assertTrue(
-            (abundance.samplenames == [str(i) for i in testtools.AEMB_FILES]).all()
-        )
-
-        manual_matrix = []
-        for file in testtools.AEMB_FILES:
-            manual_matrix.append([])
-            with open(file) as f:
-                for line in f:
-                    (_, ab) = line.split()
-                    manual_matrix[-1].append(float(ab))
-        manual_matrix = list(map(list, zip(*manual_matrix)))
-
-        self.assertTrue((np.abs(manual_matrix - abundance.matrix) < 1e-5).all())
-
-        # Good headers, but in other order
-        names = list(self.comp_metadata.identifiers)
-        random.shuffle(names)
-        files = [self.make_aemb_file(names) for i in range(2)]
-        a = vamb.parsebam.Abundance.from_aemb(
-            [f.name for f in files], self.comp_metadata
-        )
-        self.assertIsInstance(a, vamb.parsebam.Abundance)
-
-    def make_aemb_file(self, headers):
-        file = tempfile.NamedTemporaryFile(mode="w+")
-        for header in headers:
-            n = random.random() * 10
-            print(header, "\t", str(n), file=file)
-        file.seek(0)
-        return file
-
-    def test_bad_aemb(self):
-        # One header is wrong
-        names = list(self.comp_metadata.identifiers)
-        names[-4] = names[-4] + "a"
-        files = [self.make_aemb_file(names) for i in range(2)]
-        with self.assertRaises(ValueError):
-            vamb.parsebam.Abundance.from_aemb(
-                [f.name for f in files], self.comp_metadata
+    def test_parse_from_tsv(self):
+        # Check it parses
+        with open(testtools.AEMB_FILES[0]) as file:
+            lines = [s.rstrip() for s in file]
+        for path in testtools.AEMB_FILES[1:]:
+            with open(path) as file:
+                for i, existing in enumerate(file):
+                    lines[i] += "\t" + existing.split("\t")[1].rstrip()
+
+        with tempfile.NamedTemporaryFile(mode="w+") as file:
+            print("contigname\tfile1\tfile2\tfile3", file=file)
+            for line in lines:
+                print(line, file=file)
+            file.seek(0)
+            abundance = vamb.parsebam.Abundance.from_tsv(
+                Path(file.name), self.comp_metadata
             )
 
-        # Too many headers
-        names = list(self.comp_metadata.identifiers)
-        names.append(names[0])
-        files = [self.make_aemb_file(names) for i in range(2)]
-        with self.assertRaises(ValueError):
-            vamb.parsebam.Abundance.from_aemb(
-                [f.name for f in files], self.comp_metadata
-            )
+        self.assertEqual(abundance.refhash, self.comp_metadata.refhash)
+        self.assertEqual(list(abundance.samplenames), ["file1", "file2", "file3"])
+
+        # Check values are alright
+        M = np.zeros_like(abundance.matrix)
+        for row, line in enumerate(lines):
+            for col, cell in enumerate(line.split("\t")[1:]):
+                M[row, col] = float(cell)
+        self.assertTrue((np.abs((M - abundance.matrix)) < 1e-6).all())
+
+        # Bad header order errors
+        lines[5], lines[4] = lines[4], lines[5]
+
+        with tempfile.NamedTemporaryFile(mode="w+") as file:
+            print("contigname\tfile1\tfile2\tfile3", file=file)
+            for line in lines:
+                print(line, file=file)
+            file.seek(0)
+            with self.assertRaises(ValueError):
+                vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata)
+
+        # Restore
+        lines[5], lines[4] = lines[4], lines[5]
+
+        # Too many lines
+        with tempfile.NamedTemporaryFile(mode="w+") as file:
+            print("contigname\tfile1\tfile2\tfile3", file=file)
+            for line in lines:
+                print(line, file=file)
+            print(lines[-2], file=file)
+            file.seek(0)
+            with self.assertRaises(ValueError):
+                vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata)
diff --git a/test/test_reclustering.py b/test/test_reclustering.py
@@ -0,0 +1,24 @@
+import unittest
+
+# Invariants:
+# It produces disjoint clusters, a subset of the input points
+
+# For CAMI dataset, compute comp, abundance, taxonomy, markers
+# Subset to e.g. 5 genera plus a few unclassified contigs
+
+# FASTA
+# Comp
+# Abundance
+# Markers
+# Taxonomy
+# Latent
+# Refined taxonomy
+
+
+class TestKmeansReclustering(unittest.TestCase):
+    pass
+    # Make markers + lengths
+    # Make taxonomy
+    # Create latent
+
+    # Initial clustering
diff --git a/vamb/__main__.py b/vamb/__main__.py
@@ -153,19 +153,13 @@ def __init__(self, paths: list[Path], min_alignment_id: Optional[float]):
         self.paths = paths
 
 
-class AEMBPaths(list):
-    __slots__ = ["paths"]
+class AbundanceTSVPath:
+    __slots__ = ["path"]
 
-    def __init__(self, dir: Path):
-        if not dir.is_dir():
-            raise NotADirectoryError(dir)
-        paths = [b for b in dir.iterdir() if b.is_file() and b.suffix == ".tsv"]
-        if len(paths) == 0:
-            raise ValueError(
-                f'No `.tsv` files found in --aemb argument "{dir}". '
-                "Make sure all TSV files in the directory ends with `.tsv`."
-            )
-        self.paths = paths
+    def __init__(self, path: Path):
+        if not path.is_file():
+            raise FileNotFoundError(path)
+        self.path = path
 
 
 class AbundanceOptions:
@@ -176,7 +170,7 @@ def are_args_present(args: argparse.Namespace) -> bool:
         return (
             isinstance(args.bampaths, list)
             or isinstance(args.bamdir, Path)
-            or isinstance(args.aemb, Path)
+            or isinstance(args.abundance_tsv, Path)
             or isinstance(args.abundancepath, Path)
         )
 
@@ -185,7 +179,7 @@ def from_args(cls, args: argparse.Namespace):
         return cls(
             typeasserted(args.bampaths, (list, type(None))),
             typeasserted(args.bamdir, (Path, type(None))),
-            typeasserted(args.aemb, (Path, type(None))),
+            typeasserted(args.abundance_tsv, (Path, type(None))),
             typeasserted(args.abundancepath, (Path, type(None))),
             typeasserted(args.min_alignment_id, (float, type(None))),
         )
@@ -194,25 +188,19 @@ def __init__(
         self,
         bampaths: Optional[list[Path]],
         bamdir: Optional[Path],
-        aemb: Optional[Path],
+        abundance_tsv: Optional[Path],
         abundancepath: Optional[Path],
         min_alignment_id: Optional[float],
     ):
-        assert isinstance(bampaths, (list, type(None)))
-        assert isinstance(bamdir, (Path, type(None)))
-        assert isinstance(aemb, (Path, type(None)))
-        assert isinstance(abundancepath, (Path, type(None)))
-        assert isinstance(min_alignment_id, (float, type(None)))
-
         # Make sure only one abundance input is there
         if (
             (bampaths is not None)
             + (abundancepath is not None)
             + (bamdir is not None)
-            + (aemb is not None)
+            + (abundance_tsv is not None)
         ) != 1:
             raise argparse.ArgumentTypeError(
-                "Must specify exactly one of BAM files, BAM dir, AEMB or abundance NPZ file input"
+                "Must specify exactly one of BAM files, BAM dir, TSV file or abundance NPZ file input"
             )
 
         if abundancepath is not None:
@@ -224,8 +212,8 @@ def __init__(
                 "The --bamfiles argument is deprecated. It works, but might be removed in future versions of Vamb. Please use --bamdir instead"
             )
             self.paths = BAMPaths(bampaths, min_alignment_id)
-        elif aemb is not None:
-            self.paths = AEMBPaths(aemb)
+        elif abundance_tsv is not None:
+            self.paths = AbundanceTSVPath(abundance_tsv)
 
 
 class BasicTrainingOptions:
@@ -913,11 +901,10 @@ def calc_abundance(
         logger.info("\tOrder of columns is:")
         for i, samplename in enumerate(abundance.samplenames):
             logger.info(f"\t{i:>6}: {samplename}")
-    elif isinstance(paths, AEMBPaths):
-        paths = list(paths.paths)
-        logger.info(f"\tParsing {len(paths)} AEMB TSV files")
-        abundance = vamb.parsebam.Abundance.from_aemb(
-            paths,
+    elif isinstance(paths, AbundanceTSVPath):
+        logger.info(f'\tParsing abundance from TSV at "{paths.path}"')
+        abundance = vamb.parsebam.Abundance.from_tsv(
+            paths.path,
             comp_metadata,
         )
         abundance.save(outdir.joinpath("abundance.npz"))
@@ -1687,10 +1674,10 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser):
         help="Dir with .bam files to use",
     )
     abundanceos.add_argument(
-        "--aemb",
+        "--abundance_tsv",
         metavar="",
         type=Path,
-        help="Dir with .tsv files from strobealign --aemb",
+        help='Path to TSV file of precomputed abundances with header being "contigname(\\t<samplename>)*"',
     )
     abundanceos.add_argument(
         "--abundance",

diff --git a/vamb/parsebam.py b/vamb/parsebam.py
@@ -236,55 +236,53 @@ def run_pycoverm(
         return (coverage, refhash)
 
     @classmethod
-    def from_aemb(
-        cls: type[A],
-        paths: list[Path],
-        comp_metadata: CompositionMetaData,
-    ) -> A:
-        # We cannot guarantee strobealign outputs the contigs in the right order,
-        # so we must reorder them.
-        identifier_to_index: dict[str, int] = {
-            s: i for (i, s) in enumerate(comp_metadata.identifiers)
-        }
-
-        result = _np.empty((len(identifier_to_index), len(paths)), dtype=_np.float32)
-        for col_num, path in enumerate(paths):
-            column = _np.full(len(identifier_to_index), float("nan"), dtype=_np.float32)
-            with vambtools.Reader(path) as file:
-                n_lines_seen = 0
-                for line in file:
-                    line = line.rstrip()
-                    if len(line) == 0:
-                        continue
-                    n_lines_seen += 1
-                    if n_lines_seen > len(column):
-                        raise ValueError(
-                            f'Too many rows in AEMB file "{path}", expected {len(column)}'
-                        )
-                    # strobealign reserves the right to add more columns in future versions
-                    (identifier, depth_str, *_) = line.split(maxsplit=2)
-                    index = identifier_to_index.get(identifier.decode())
-                    if index is None:
-                        raise ValueError(
-                            f'Identifier "{identifier}" seen in `strobealign --aemb` output file "{path}", '
-                            "but this identifier is not seen in the FASTA file.\n"
-                            "Verify that the AEMB file was generated by mapping to the same FASTA file "
-                            "as was input to Vamb."
-                        )
-                    column[index] = float(depth_str)
-
-            isnans = _np.isnan(column)
-            if isnans.any():
-                first_missing_index = _np.nonzero(isnans)[0][0]
-                missing_identifier = comp_metadata.identifiers[first_missing_index]
+    def from_tsv(cls: type[A], path: Path, comp_metadata: CompositionMetaData) -> A:
+        seen_identifiers: list[str] = []
+        with open(path) as file:
+            try:
+                header = next(file)
+            except StopIteration:
+                err = ValueError(f'Empty abundance TSV path at "{path}"')
+                raise err from None
+            columns = header.splitlines()[0].split("\t")
+            if len(columns) < 1:
                 raise ValueError(
-                    f'Identifier "{missing_identifier}" from the input FASTA file is missing from the AEMB file "{path}".'
+                    f'Expected at least 1 column in abundance TSV file at "{path}"'
+                )
+            if columns[0] != "contigname":
+                raise ValueError('First column in header must be "contigname"')
+            samples = columns[1:]
+            n_samples = len(samples)
+            matrix = _np.empty((comp_metadata.nseqs, n_samples), dtype=_np.float32)
+            # Since we read the header already and this is zero-indexed, we need to add 2
+            for line_no_minus_two, line in enumerate(file):
+                if line_no_minus_two == comp_metadata.nseqs:
+                    raise ValueError(
+                        f'Too many rows in abundance TSV file "{path}", expected {comp_metadata.nseqs + 1}, got at least {line_no_minus_two + 1}'
+                    )
+                stripped = line.rstrip()
+                if not line:
+                    continue
+                fields = stripped.split("\t")
+                if len(fields) != n_samples + 1:
+                    raise ValueError(
+                        f'In abundance TSV file "{path}", on line {line_no_minus_two + 2}, expected {n_samples + 1} columns, found {len(fields)}'
+                    )
+                for i in range(n_samples):
+                    matrix[line_no_minus_two, i] = float(fields[i + 1])
+                seen_identifiers.append(fields[0])
+
+            if line_no_minus_two + 1 != comp_metadata.nseqs:
+                raise ValueError(
+                    f'Too few rows in abundance TSV file "{path}", expected {comp_metadata.nseqs + 1}, got {line_no_minus_two + 1}'
                 )
-            result[:, col_num] = column
 
-        return cls(
-            result,
-            [str(i) for i in paths],
-            0.0,
+        vambtools.RefHasher.verify_refhash(
+            vambtools.RefHasher.hash_refnames(seen_identifiers),
             comp_metadata.refhash,
+            "abundance TSV",
+            "composition",
+            (seen_identifiers, comp_metadata.identifiers),
         )
+
+        return cls(matrix, samples, 0.0, comp_metadata.refhash)