Replace AEMB parser with generic TSV parser
In issue 355, a user requested bringing back the --jgi option, such that users
can pass in precomputed abundances. That's completely reasonable, and it's not
great that the currently recommended way is to use Vamb's internal Python types
to create an Abundance object.

Instead of bringing back the JGI parser, I made a more generic TSV parser.
TSV is a simple, straightforward format also used elsewhere in Vamb. Also, since
it's easy to convert the output from --aemb to TSV format, I removed the --aemb
option.
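As an illustration of that conversion, here is a minimal, hypothetical Python sketch (file names and sample names are placeholders, not part of this commit) that merges per-sample `strobealign --aemb` outputs, each a two-column TSV of contig name and abundance, into a single TSV with a `contigname` header:

```python
# Hypothetical sketch: merge per-sample `strobealign --aemb` outputs into one
# TSV with a "contigname" header row, suitable for --abundance_tsv.
# Assumes every aemb file lists the same contigs in the same order.
import csv
from pathlib import Path

sample_files = {"1": Path("aemb/1.tsv"), "2": Path("aemb/2.tsv"), "3": Path("aemb/3.tsv")}

columns = {}  # sample name -> list of [contig name, abundance] pairs, one per line
for sample, path in sample_files.items():
    with open(path) as file:
        columns[sample] = [line.rstrip("\n").split("\t")[:2] for line in file]

with open("abundance.tsv", "w", newline="") as out:
    writer = csv.writer(out, delimiter="\t")
    writer.writerow(["contigname", *sample_files])  # header: contigname + sample names
    for rows in zip(*columns.values()):
        contigname = rows[0][0]
        writer.writerow([contigname, *(abundance for (_, abundance) in rows)])
```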
jakobnissen committed Sep 4, 2024
1 parent c923347 commit 20327e6
Showing 7 changed files with 151 additions and 141 deletions.
12 changes: 8 additions & 4 deletions doc/how_to_run.md
@@ -21,8 +21,11 @@ $ for sample in 1 2 3; do
strobealign -t 8 --aemb contigs.fna.gz ${sample}.{fw,rv}.fq.gz > aemb/${sample}.tsv;
done

$ # Paste the aemb files together to make a TSV file with the required header
$ cat <(echo -e "contigname\t1\t2\t3") <(paste aemb/1.tsv <(cut -f 2 aemb/2.tsv) <(cut -f 2 aemb/3.tsv)) > abundance.tsv

$ # Run Vamb using the contigs and the directory with abundance files
$ vamb bin default --outdir vambout --fasta contigs.fna.gz --aemb aemb
$ vamb bin default --outdir vambout --fasta contigs.fna.gz --abundance_tsv abundance.tsv
```

#### TaxVamb
@@ -58,14 +61,15 @@ Future runs can then instead use:

#### Abundance
The abundance may be computed from:
* [recommended] A directory containing TSV files obtained by mapping reads from
each individual sample against the contig catalogue using `strobealign --aemb`
* A directory of BAM files generated the same way, except using any aligner that
produces a BAM file, e.g. `minimap2`
* A TSV file with the header being "contigname" followed by one samplename per sample,
and the values in the TSV file being precomputed abundances.
These may be derived from `paste`-ing together outputs from `strobealign --aemb` (see the example below)
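For illustration, a hypothetical `abundance.tsv` for two contigs and three samples (all names and values made up) could look like:

```
contigname	S1	S2	S3
contig_1	5.21	0.0	3.14
contig_2	0.87	2.55	0.0
```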

Thus, it can be specified as:
```
--aemb dir_with_aemb_files
--abundance_tsv abundance.tsv
```
or
```
105 changes: 50 additions & 55 deletions test/test_parsebam.py
@@ -1,8 +1,8 @@
import unittest
import io
import numpy as np
import random
import tempfile
from pathlib import Path

import vamb
import testtools
@@ -105,59 +105,54 @@ def test_save_load(self):
self.assertEqual(abundance2.refhash, self.abundance.refhash)
self.assertEqual(abundance2.minid, self.abundance.minid)

def test_parse_from_aemb(self):
abundance = vamb.parsebam.Abundance.from_aemb(
testtools.AEMB_FILES, self.comp_metadata
)

self.assertTrue(abundance.refhash == self.comp_metadata.refhash)
self.assertTrue(
(abundance.samplenames == [str(i) for i in testtools.AEMB_FILES]).all()
)

manual_matrix = []
for file in testtools.AEMB_FILES:
manual_matrix.append([])
with open(file) as f:
for line in f:
(_, ab) = line.split()
manual_matrix[-1].append(float(ab))
manual_matrix = list(map(list, zip(*manual_matrix)))

self.assertTrue((np.abs(manual_matrix - abundance.matrix) < 1e-5).all())

# Good headers, but in other order
names = list(self.comp_metadata.identifiers)
random.shuffle(names)
files = [self.make_aemb_file(names) for i in range(2)]
a = vamb.parsebam.Abundance.from_aemb(
[f.name for f in files], self.comp_metadata
)
self.assertIsInstance(a, vamb.parsebam.Abundance)

def make_aemb_file(self, headers):
file = tempfile.NamedTemporaryFile(mode="w+")
for header in headers:
n = random.random() * 10
print(header, "\t", str(n), file=file)
file.seek(0)
return file

def test_bad_aemb(self):
# One header is wrong
names = list(self.comp_metadata.identifiers)
names[-4] = names[-4] + "a"
files = [self.make_aemb_file(names) for i in range(2)]
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_aemb(
[f.name for f in files], self.comp_metadata
def test_parse_from_tsv(self):
# Check it parses
with open(testtools.AEMB_FILES[0]) as file:
lines = [s.rstrip() for s in file]
for path in testtools.AEMB_FILES[1:]:
with open(path) as file:
for i, existing in enumerate(file):
lines[i] += "\t" + existing.split("\t")[1].rstrip()

with tempfile.NamedTemporaryFile(mode="w+") as file:
print("contigname\tfile1\tfile2\tfile3", file=file)
for line in lines:
print(line, file=file)
file.seek(0)
abundance = vamb.parsebam.Abundance.from_tsv(
Path(file.name), self.comp_metadata
)

# Too many headers
names = list(self.comp_metadata.identifiers)
names.append(names[0])
files = [self.make_aemb_file(names) for i in range(2)]
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_aemb(
[f.name for f in files], self.comp_metadata
)
self.assertEqual(abundance.refhash, self.comp_metadata.refhash)
self.assertEqual(list(abundance.samplenames), ["file1", "file2", "file3"])

# Check values are alright
M = np.zeros_like(abundance.matrix)
for row, line in enumerate(lines):
for col, cell in enumerate(line.split("\t")[1:]):
M[row, col] = float(cell)
self.assertTrue((np.abs((M - abundance.matrix)) < 1e-6).all())

# Bad header order errors
lines[5], lines[4] = lines[4], lines[5]

with tempfile.NamedTemporaryFile(mode="w+") as file:
print("contigname\tfile1\tfile2\tfile3", file=file)
for line in lines:
print(line, file=file)
file.seek(0)
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata)

# Restore
lines[5], lines[4] = lines[4], lines[5]

# Too many lines
with tempfile.NamedTemporaryFile(mode="w+") as file:
print("contigname\tfile1\tfile2\tfile3", file=file)
for line in lines:
print(line, file=file)
print(lines[-2], file=file)
file.seek(0)
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata)
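As a condensed, hypothetical usage sketch of the parser exercised above (here `comp_metadata` is just a placeholder for the composition metadata object that Vamb builds from the contig catalogue; obtaining it is out of scope):

```python
# Hypothetical sketch of calling the new TSV parser, mirroring the test above.
from pathlib import Path

import vamb


def load_abundance(tsv_path: str, comp_metadata):
    abundance = vamb.parsebam.Abundance.from_tsv(Path(tsv_path), comp_metadata)
    # The TSV header supplies the sample names; the numeric columns become the matrix.
    print(list(abundance.samplenames))  # e.g. ["file1", "file2", "file3"]
    print(abundance.matrix.shape)  # (number of contigs, number of samples)
    return abundance
```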
24 changes: 24 additions & 0 deletions test/test_reclustering.py
@@ -0,0 +1,24 @@
import unittest

# Invariants:
# It produces disjoint clusters, a subset of the input points

# For CAMI dataset, compute comp, abundance, taxonomy, markers
# Subset to e.g. 5 genera plus a few unclassified contigs

# FASTA
# Comp
# Abundance
# Markers
# Taxonomy
# Latent
# Refined taxonomy


class TestKmeansReclustering(unittest.TestCase):
pass
# Make markers + lengths
# Make taxonomy
# Create latent

# Initial clustering
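A minimal sketch of how the disjointness and subset invariants noted above could be checked, assuming clusters are represented as sets of contig identifiers (the helper name and representation are hypothetical, not part of this commit):

```python
# Hypothetical helper for the invariants noted above: reclustering should yield
# disjoint clusters whose members are all drawn from the input points.
from collections.abc import Hashable, Iterable


def check_reclustering_invariants(
    clusters: Iterable[set[Hashable]], input_points: set[Hashable]
) -> None:
    seen: set[Hashable] = set()
    for cluster in clusters:
        assert not (cluster & seen), "clusters must be disjoint"
        assert cluster <= input_points, "clusters must be a subset of the input points"
        seen |= cluster
```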
53 changes: 20 additions & 33 deletions vamb/__main__.py
@@ -153,19 +153,13 @@ def __init__(self, paths: list[Path], min_alignment_id: Optional[float]):
self.paths = paths


class AEMBPaths(list):
__slots__ = ["paths"]
class AbundanceTSVPath:
__slots__ = ["path"]

def __init__(self, dir: Path):
if not dir.is_dir():
raise NotADirectoryError(dir)
paths = [b for b in dir.iterdir() if b.is_file() and b.suffix == ".tsv"]
if len(paths) == 0:
raise ValueError(
f'No `.tsv` files found in --aemb argument "{dir}". '
"Make sure all TSV files in the directory ends with `.tsv`."
)
self.paths = paths
def __init__(self, path: Path):
if not path.is_file():
raise FileNotFoundError(path)
self.path = path


class AbundanceOptions:
@@ -176,7 +170,7 @@ def are_args_present(args: argparse.Namespace) -> bool:
return (
isinstance(args.bampaths, list)
or isinstance(args.bamdir, Path)
or isinstance(args.aemb, Path)
or isinstance(args.abundance_tsv, Path)
or isinstance(args.abundancepath, Path)
)

@@ -185,7 +179,7 @@ def from_args(cls, args: argparse.Namespace):
return cls(
typeasserted(args.bampaths, (list, type(None))),
typeasserted(args.bamdir, (Path, type(None))),
typeasserted(args.aemb, (Path, type(None))),
typeasserted(args.abundance_tsv, (Path, type(None))),
typeasserted(args.abundancepath, (Path, type(None))),
typeasserted(args.min_alignment_id, (float, type(None))),
)
@@ -194,25 +188,19 @@ def __init__(
self,
bampaths: Optional[list[Path]],
bamdir: Optional[Path],
aemb: Optional[Path],
abundance_tsv: Optional[Path],
abundancepath: Optional[Path],
min_alignment_id: Optional[float],
):
assert isinstance(bampaths, (list, type(None)))
assert isinstance(bamdir, (Path, type(None)))
assert isinstance(aemb, (Path, type(None)))
assert isinstance(abundancepath, (Path, type(None)))
assert isinstance(min_alignment_id, (float, type(None)))

# Make sure only one abundance input is there
if (
(bampaths is not None)
+ (abundancepath is not None)
+ (bamdir is not None)
+ (aemb is not None)
+ (abundance_tsv is not None)
) != 1:
raise argparse.ArgumentTypeError(
"Must specify exactly one of BAM files, BAM dir, AEMB or abundance NPZ file input"
"Must specify exactly one of BAM files, BAM dir, TSV file or abundance NPZ file input"
)

if abundancepath is not None:
@@ -224,8 +212,8 @@ def __init__(
"The --bamfiles argument is deprecated. It works, but might be removed in future versions of Vamb. Please use --bamdir instead"
)
self.paths = BAMPaths(bampaths, min_alignment_id)
elif aemb is not None:
self.paths = AEMBPaths(aemb)
elif abundance_tsv is not None:
self.paths = AbundanceTSVPath(abundance_tsv)


class BasicTrainingOptions:
@@ -913,11 +901,10 @@ def calc_abundance(
logger.info("\tOrder of columns is:")
for i, samplename in enumerate(abundance.samplenames):
logger.info(f"\t{i:>6}: {samplename}")
elif isinstance(paths, AEMBPaths):
paths = list(paths.paths)
logger.info(f"\tParsing {len(paths)} AEMB TSV files")
abundance = vamb.parsebam.Abundance.from_aemb(
paths,
elif isinstance(paths, AbundanceTSVPath):
logger.info(f'\tParsing abundance from TSV at "{paths.path}"')
abundance = vamb.parsebam.Abundance.from_tsv(
paths.path,
comp_metadata,
)
abundance.save(outdir.joinpath("abundance.npz"))
@@ -1687,10 +1674,10 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser):
help="Dir with .bam files to use",
)
abundanceos.add_argument(
"--aemb",
"--abundance_tsv",
metavar="",
type=Path,
help="Dir with .tsv files from strobealign --aemb",
help='Path to TSV file of precomputed abundances with header being "contigname(\\t<samplename>)*"',
)
abundanceos.add_argument(
"--abundance",
@@ -1923,7 +1910,7 @@ def add_clustering_arguments(subparser):
dest="max_clusters",
metavar="",
type=int,
default=None, # meaning: do not stop
default=None, # meaning: do not stop
help=argparse.SUPPRESS,
)

4 changes: 3 additions & 1 deletion vamb/encode.py
@@ -81,7 +81,9 @@ def make_dataloader(
raise ValueError(f"Batch size must be minimum 1, not {batchsize}")

if len(abundance) != len(tnf) or len(tnf) != len(lengths):
raise ValueError("Lengths of abundance, TNF and lengths arrays must be the same")
raise ValueError(
"Lengths of abundance, TNF and lengths arrays must be the same"
)

if not (abundance.dtype == tnf.dtype == _np.float32):
raise ValueError("TNF and abundance must be Numpy arrays of dtype float32")