Replace AEMB parser with generic TSV parser
In issue 355, a user requested bringing back the --jgi option, such that users
can pass in precomputed abundances. That's completely reasonable, and it's not
great that the currently recommended way is to use Vamb's internal Python types
to create an Abundance object.

Instead of bringing back the JGI parser, I made a more generic TSV parser.
TSV is a simple, straightforward format also used elsewhere in Vamb. Also, since
it's easy to convert the output from --aemb to TSV format, I removed the --aemb
option.
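As an illustration of that conversion, here is a minimal, hypothetical Python sketch (file names and sample names are placeholders, not part of this commit) that merges per-sample `strobealign --aemb` outputs, each a two-column TSV of contig name and abundance, into a single TSV with a `contigname` header:

```python
# Hypothetical sketch: merge per-sample `strobealign --aemb` outputs into one
# TSV with a "contigname" header row, suitable for --abundance_tsv.
# Assumes every aemb file lists the same contigs in the same order.
import csv
from pathlib import Path

sample_files = {"1": Path("aemb/1.tsv"), "2": Path("aemb/2.tsv"), "3": Path("aemb/3.tsv")}

columns = {}  # sample name -> list of [contig name, abundance] pairs, one per line
for sample, path in sample_files.items():
    with open(path) as file:
        columns[sample] = [line.rstrip("\n").split("\t")[:2] for line in file]

with open("abundance.tsv", "w", newline="") as out:
    writer = csv.writer(out, delimiter="\t")
    writer.writerow(["contigname", *sample_files])  # header: contigname + sample names
    for rows in zip(*columns.values()):
        contigname = rows[0][0]
        writer.writerow([contigname, *(abundance for (_, abundance) in rows)])
```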
jakobnissen committed Sep 4, 2024
1 parent c923347 commit 20327e6
Showing 7 changed files with 151 additions and 141 deletions.
12 changes: 8 additions & 4 deletions doc/how_to_run.md
@@ -21,8 +21,11 @@ $ for sample in 1 2 3; do
strobealign -t 8 --aemb contigs.fna.gz ${sample}.{fw,rv}.fq.gz > aemb/${sample}.tsv;
done

$ # Paste the aemb files together to make a TSV file with the required header
$ cat <(echo -e "contigname\t1\t2\t3") <(paste aemb/1.tsv <(cut -f 2 aemb/2.tsv) <(cut -f 2 aemb/3.tsv)) > abundance.tsv

$ # Run Vamb using the contigs and the directory with abundance files
$ vamb bin default --outdir vambout --fasta contigs.fna.gz --aemb aemb
$ vamb bin default --outdir vambout --fasta contigs.fna.gz --abundance_tsv abundance.tsv
```

#### TaxVamb
@@ -58,14 +61,15 @@ Future runs can then instead use:

#### Abundance
The abundance may be computed from:
* [recommended] A directory containing TSV files obtained by mapping reads from
each individual sample against the contig catalogue using `strobealign --aemb`
* A directory of BAM files generated the same way, except using any aligner that
produces a BAM file, e.g. `minimap2`
* A TSV file with the header being "contigname" followed by one samplename per sample,
and the values in the TSV file being precomputed abundances.
These may be derived from `paste`-ing together outputs from `strobealign --aemb` (see the example below)
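For illustration, a hypothetical `abundance.tsv` for two contigs and three samples (all names and values made up) could look like:

```
contigname	S1	S2	S3
contig_1	5.21	0.0	3.14
contig_2	0.87	2.55	0.0
```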

Thus, it can be specified as:
```
--aemb dir_with_aemb_files
--abundance_tsv abundance.tsv
```
or
```
105 changes: 50 additions & 55 deletions test/test_parsebam.py
@@ -1,8 +1,8 @@
import unittest
import io
import numpy as np
import random
import tempfile
from pathlib import Path

import vamb
import testtools
@@ -105,59 +105,54 @@ def test_save_load(self):
self.assertEqual(abundance2.refhash, self.abundance.refhash)
self.assertEqual(abundance2.minid, self.abundance.minid)

def test_parse_from_aemb(self):
abundance = vamb.parsebam.Abundance.from_aemb(
testtools.AEMB_FILES, self.comp_metadata
)

self.assertTrue(abundance.refhash == self.comp_metadata.refhash)
self.assertTrue(
(abundance.samplenames == [str(i) for i in testtools.AEMB_FILES]).all()
)

manual_matrix = []
for file in testtools.AEMB_FILES:
manual_matrix.append([])
with open(file) as f:
for line in f:
(_, ab) = line.split()
manual_matrix[-1].append(float(ab))
manual_matrix = list(map(list, zip(*manual_matrix)))

self.assertTrue((np.abs(manual_matrix - abundance.matrix) < 1e-5).all())

# Good headers, but in other order
names = list(self.comp_metadata.identifiers)
random.shuffle(names)
files = [self.make_aemb_file(names) for i in range(2)]
a = vamb.parsebam.Abundance.from_aemb(
[f.name for f in files], self.comp_metadata
)
self.assertIsInstance(a, vamb.parsebam.Abundance)

def make_aemb_file(self, headers):
file = tempfile.NamedTemporaryFile(mode="w+")
for header in headers:
n = random.random() * 10
print(header, "\t", str(n), file=file)
file.seek(0)
return file

def test_bad_aemb(self):
# One header is wrong
names = list(self.comp_metadata.identifiers)
names[-4] = names[-4] + "a"
files = [self.make_aemb_file(names) for i in range(2)]
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_aemb(
[f.name for f in files], self.comp_metadata
def test_parse_from_tsv(self):
# Check it parses
with open(testtools.AEMB_FILES[0]) as file:
lines = [s.rstrip() for s in file]
for path in testtools.AEMB_FILES[1:]:
with open(path) as file:
for i, existing in enumerate(file):
lines[i] += "\t" + existing.split("\t")[1].rstrip()

with tempfile.NamedTemporaryFile(mode="w+") as file:
print("contigname\tfile1\tfile2\tfile3", file=file)
for line in lines:
print(line, file=file)
file.seek(0)
abundance = vamb.parsebam.Abundance.from_tsv(
Path(file.name), self.comp_metadata
)

# Too many headers
names = list(self.comp_metadata.identifiers)
names.append(names[0])
files = [self.make_aemb_file(names) for i in range(2)]
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_aemb(
[f.name for f in files], self.comp_metadata
)
self.assertEqual(abundance.refhash, self.comp_metadata.refhash)
self.assertEqual(list(abundance.samplenames), ["file1", "file2", "file3"])

# Check values are alright
M = np.zeros_like(abundance.matrix)
for row, line in enumerate(lines):
for col, cell in enumerate(line.split("\t")[1:]):
M[row, col] = float(cell)
self.assertTrue((np.abs((M - abundance.matrix)) < 1e-6).all())

# Bad header order errors
lines[5], lines[4] = lines[4], lines[5]

with tempfile.NamedTemporaryFile(mode="w+") as file:
print("contigname\tfile1\tfile2\tfile3", file=file)
for line in lines:
print(line, file=file)
file.seek(0)
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata)

# Restore
lines[5], lines[4] = lines[4], lines[5]

# Too many lines
with tempfile.NamedTemporaryFile(mode="w+") as file:
print("contigname\tfile1\tfile2\tfile3", file=file)
for line in lines:
print(line, file=file)
print(lines[-2], file=file)
file.seek(0)
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata)
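As a condensed, hypothetical usage sketch of the parser exercised above (here `comp_metadata` is just a placeholder for the composition metadata object that Vamb builds from the contig catalogue; obtaining it is out of scope):

```python
# Hypothetical sketch of calling the new TSV parser, mirroring the test above.
from pathlib import Path

import vamb


def load_abundance(tsv_path: str, comp_metadata):
    abundance = vamb.parsebam.Abundance.from_tsv(Path(tsv_path), comp_metadata)
    # The TSV header supplies the sample names; the numeric columns become the matrix.
    print(list(abundance.samplenames))  # e.g. ["file1", "file2", "file3"]
    print(abundance.matrix.shape)  # (number of contigs, number of samples)
    return abundance
```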
24 changes: 24 additions & 0 deletions test/test_reclustering.py
@@ -0,0 +1,24 @@
import unittest

# Invariants:
# It produces disjoint clusters, a subset of the input points

# For CAMI dataset, compute comp, abundance, taxonomy, markers
# Subset to e.g. 5 genera plus a few unclassified contigs

# FASTA
# Comp
# Abundance
# Markers
# Taxonomy
# Latent
# Refined taxonomy


class TestKmeansReclustering(unittest.TestCase):
pass
# Make markers + lengths
# Make taxonomy
# Create latent

# Initial clustering
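A minimal sketch of how the disjointness and subset invariants noted above could be checked, assuming clusters are represented as sets of contig identifiers (the helper name and representation are hypothetical, not part of this commit):

```python
# Hypothetical helper for the invariants noted above: reclustering should yield
# disjoint clusters whose members are all drawn from the input points.
from collections.abc import Hashable, Iterable


def check_reclustering_invariants(
    clusters: Iterable[set[Hashable]], input_points: set[Hashable]
) -> None:
    seen: set[Hashable] = set()
    for cluster in clusters:
        assert not (cluster & seen), "clusters must be disjoint"
        assert cluster <= input_points, "clusters must be a subset of the input points"
        seen |= cluster
```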
53 changes: 20 additions & 33 deletions vamb/__main__.py
@@ -153,19 +153,13 @@ def __init__(self, paths: list[Path], min_alignment_id: Optional[float]):
self.paths = paths


class AEMBPaths(list):
__slots__ = ["paths"]
class AbundanceTSVPath:
__slots__ = ["path"]

def __init__(self, dir: Path):
if not dir.is_dir():
raise NotADirectoryError(dir)
paths = [b for b in dir.iterdir() if b.is_file() and b.suffix == ".tsv"]
if len(paths) == 0:
raise ValueError(
f'No `.tsv` files found in --aemb argument "{dir}". '
"Make sure all TSV files in the directory ends with `.tsv`."
)
self.paths = paths
def __init__(self, path: Path):
if not path.is_file():
raise FileNotFoundError(path)
self.path = path


class AbundanceOptions:
@@ -176,7 +170,7 @@ def are_args_present(args: argparse.Namespace) -> bool:
return (
isinstance(args.bampaths, list)
or isinstance(args.bamdir, Path)
or isinstance(args.aemb, Path)
or isinstance(args.abundance_tsv, Path)
or isinstance(args.abundancepath, Path)
)

@@ -185,7 +179,7 @@ def from_args(cls, args: argparse.Namespace):
return cls(
typeasserted(args.bampaths, (list, type(None))),
typeasserted(args.bamdir, (Path, type(None))),
typeasserted(args.aemb, (Path, type(None))),
typeasserted(args.abundance_tsv, (Path, type(None))),
typeasserted(args.abundancepath, (Path, type(None))),
typeasserted(args.min_alignment_id, (float, type(None))),
)
@@ -194,25 +188,19 @@ def __init__(
self,
bampaths: Optional[list[Path]],
bamdir: Optional[Path],
aemb: Optional[Path],
abundance_tsv: Optional[Path],
abundancepath: Optional[Path],
min_alignment_id: Optional[float],
):
assert isinstance(bampaths, (list, type(None)))
assert isinstance(bamdir, (Path, type(None)))
assert isinstance(aemb, (Path, type(None)))
assert isinstance(abundancepath, (Path, type(None)))
assert isinstance(min_alignment_id, (float, type(None)))

# Make sure only one abundance input is there
if (
(bampaths is not None)
+ (abundancepath is not None)
+ (bamdir is not None)
+ (aemb is not None)
+ (abundance_tsv is not None)
) != 1:
raise argparse.ArgumentTypeError(
"Must specify exactly one of BAM files, BAM dir, AEMB or abundance NPZ file input"
"Must specify exactly one of BAM files, BAM dir, TSV file or abundance NPZ file input"
)

if abundancepath is not None:
@@ -224,8 +212,8 @@ def __init__(
"The --bamfiles argument is deprecated. It works, but might be removed in future versions of Vamb. Please use --bamdir instead"
)
self.paths = BAMPaths(bampaths, min_alignment_id)
elif aemb is not None:
self.paths = AEMBPaths(aemb)
elif abundance_tsv is not None:
self.paths = AbundanceTSVPath(abundance_tsv)


class BasicTrainingOptions:
@@ -913,11 +901,10 @@ def calc_abundance(
logger.info("\tOrder of columns is:")
for i, samplename in enumerate(abundance.samplenames):
logger.info(f"\t{i:>6}: {samplename}")
elif isinstance(paths, AEMBPaths):
paths = list(paths.paths)
logger.info(f"\tParsing {len(paths)} AEMB TSV files")
abundance = vamb.parsebam.Abundance.from_aemb(
paths,
elif isinstance(paths, AbundanceTSVPath):
logger.info(f'\tParsing abundance from TSV at "{paths.path}"')
abundance = vamb.parsebam.Abundance.from_tsv(
paths.path,
comp_metadata,
)
abundance.save(outdir.joinpath("abundance.npz"))
@@ -1687,10 +1674,10 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser):
help="Dir with .bam files to use",
)
abundanceos.add_argument(
"--aemb",
"--abundance_tsv",
metavar="",
type=Path,
help="Dir with .tsv files from strobealign --aemb",
help='Path to TSV file of precomputed abundances with header being "contigname(\\t<samplename>)*"',
)
abundanceos.add_argument(
"--abundance",
@@ -1923,7 +1910,7 @@ def add_clustering_arguments(subparser):
dest="max_clusters",
metavar="",
type=int,
default=None, # meaning: do not stop
default=None, # meaning: do not stop
help=argparse.SUPPRESS,
)

4 changes: 3 additions & 1 deletion vamb/encode.py
@@ -81,7 +81,9 @@ def make_dataloader(
raise ValueError(f"Batch size must be minimum 1, not {batchsize}")

if len(abundance) != len(tnf) or len(tnf) != len(lengths):
raise ValueError("Lengths of abundance, TNF and lengths arrays must be the same")
raise ValueError(
"Lengths of abundance, TNF and lengths arrays must be the same"
)

if not (abundance.dtype == tnf.dtype == _np.float32):
raise ValueError("TNF and abundance must be Numpy arrays of dtype float32")