Skip to content

Commit

Permalink
Replace AEMB parser with generic TSV parser
Browse files Browse the repository at this point in the history
In issue 355, a user requested to bring back the --jgi option, suc that users
can pass in precomputed abundances. That's completely reasonable, and it's not
great that the currently recommended way is to use the internal Python types
to create an Abundance object.

Instead of bringing back the JGI parser, I made a more generic TSV parser.
TSV is a simple, straightforward format also used elsewhere in Vamb. Also, since
it's easy to convert the output from --aemb to TSV format, I removed the --aemb
option.
  • Loading branch information
jakobnissen committed Sep 4, 2024
1 parent bb2388d commit 9810ef0
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 138 deletions.
12 changes: 8 additions & 4 deletions doc/how_to_run.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@ $ for sample in 1 2 3; do
strobealign -t 8 --aemb contigs.fna.gz ${sample}.{fw,rv}.fq.gz > aemb/${sample}.tsv;
done

$ # Paste the aemb files together to make a TSV file with given header
$ cat <(echo -e "contigname\t1\t2\t3") paste aemb/1.tsv <(cut -f 2 aemb/2.tsv) <(cut -f 2 aemb/3.tsv) > abundance.tsv

$ # Run Vamb using the contigs and the directory with abundance files
$ vamb bin default --outdir vambout --fasta contigs.fna.gz --aemb aemb
$ vamb bin default --outdir vambout --fasta contigs.fna.gz --abundance_tsv abundance.tsv
```

#### TaxVamb
Expand Down Expand Up @@ -58,14 +61,15 @@ Future runs can then instead use:

#### Abundance
The abundance may be computed from:
* [recommended ]A directory containing TSV files obtained by mapping reads from
each individual sample against the contig catalogue using `strobealign --aemb`
* A directory of BAM files generated the same way, except using any aligner that
produces a BAM file, e.g. `minimap2`
* A TSV file with the header being "contigname" followed by one samplename per sample,
and the values in the TSV file being precomputed abundances.
These may be derived from `paste`ing together outputs from the tool `strobealign --aemb`

Thus, it can be specified as:
```
--aemb dir_with_aemb_files
--abundance_tsv abundance.tsv
```
or
```
Expand Down
105 changes: 50 additions & 55 deletions test/test_parsebam.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import unittest
import io
import numpy as np
import random
import tempfile
from pathlib import Path

import vamb
import testtools
Expand Down Expand Up @@ -105,59 +105,54 @@ def test_save_load(self):
self.assertEqual(abundance2.refhash, self.abundance.refhash)
self.assertEqual(abundance2.minid, self.abundance.minid)

def test_parse_from_aemb(self):
abundance = vamb.parsebam.Abundance.from_aemb(
testtools.AEMB_FILES, self.comp_metadata
)

self.assertTrue(abundance.refhash == self.comp_metadata.refhash)
self.assertTrue(
(abundance.samplenames == [str(i) for i in testtools.AEMB_FILES]).all()
)

manual_matrix = []
for file in testtools.AEMB_FILES:
manual_matrix.append([])
with open(file) as f:
for line in f:
(_, ab) = line.split()
manual_matrix[-1].append(float(ab))
manual_matrix = list(map(list, zip(*manual_matrix)))

self.assertTrue((np.abs(manual_matrix - abundance.matrix) < 1e-5).all())

# Good headers, but in other order
names = list(self.comp_metadata.identifiers)
random.shuffle(names)
files = [self.make_aemb_file(names) for i in range(2)]
a = vamb.parsebam.Abundance.from_aemb(
[f.name for f in files], self.comp_metadata
)
self.assertIsInstance(a, vamb.parsebam.Abundance)

def make_aemb_file(self, headers):
file = tempfile.NamedTemporaryFile(mode="w+")
for header in headers:
n = random.random() * 10
print(header, "\t", str(n), file=file)
file.seek(0)
return file

def test_bad_aemb(self):
# One header is wrong
names = list(self.comp_metadata.identifiers)
names[-4] = names[-4] + "a"
files = [self.make_aemb_file(names) for i in range(2)]
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_aemb(
[f.name for f in files], self.comp_metadata
def test_parse_from_tsv(self):
# Check it parses
with open(testtools.AEMB_FILES[0]) as file:
lines = [s.rstrip() for s in file]
for path in testtools.AEMB_FILES[1:]:
with open(path) as file:
for i, existing in enumerate(file):
lines[i] += "\t" + existing.split("\t")[1].rstrip()

with tempfile.NamedTemporaryFile(mode="w+") as file:
print("contigname\tfile1\tfile2\tfile3", file=file)
for line in lines:
print(line, file=file)
file.seek(0)
abundance = vamb.parsebam.Abundance.from_tsv(
Path(file.name), self.comp_metadata
)

# Too many headers
names = list(self.comp_metadata.identifiers)
names.append(names[0])
files = [self.make_aemb_file(names) for i in range(2)]
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_aemb(
[f.name for f in files], self.comp_metadata
)
self.assertEqual(abundance.refhash, self.comp_metadata.refhash)
self.assertEqual(list(abundance.samplenames), ["file1", "file2", "file3"])

# Check values are alright
M = np.zeros_like(abundance.matrix)
for row, line in enumerate(lines):
for col, cell in enumerate(line.split("\t")[1:]):
M[row, col] = float(cell)
self.assertTrue((np.abs((M - abundance.matrix)) < 1e-6).all())

# Bad header order errors
lines[5], lines[4] = lines[4], lines[5]

with tempfile.NamedTemporaryFile(mode="w+") as file:
print("contigname\tfile1\tfile2\tfile3", file=file)
for line in lines:
print(line, file=file)
file.seek(0)
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata)

# Restore
lines[5], lines[4] = lines[4], lines[5]

# Too many lines
with tempfile.NamedTemporaryFile(mode="w+") as file:
print("contigname\tfile1\tfile2\tfile3", file=file)
for line in lines:
print(line, file=file)
print(lines[-2], file=file)
file.seek(0)
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_tsv(Path(file.name), self.comp_metadata)
24 changes: 24 additions & 0 deletions test/test_reclustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import unittest

# Invariants:
# It produces disjoint clusters, a subset of the input points

# For CAMI dataset, compute comp, abundance, taxonomy, markers
# Subset to e.g. 5 genera plus a few unclassified contigs

# FASTA
# Comp
# Abundance
# Markers
# Taxonomy
# Latent
# Refined taxonomy


class TestKmeansReclustering(unittest.TestCase):
pass
# Make markers + lengths
# Make taxonomy
# Create latent

# Initial clustering
51 changes: 19 additions & 32 deletions vamb/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,19 +153,13 @@ def __init__(self, paths: list[Path], min_alignment_id: Optional[float]):
self.paths = paths


class AEMBPaths(list):
__slots__ = ["paths"]
class AbundanceTSVPath:
__slots__ = ["path"]

def __init__(self, dir: Path):
if not dir.is_dir():
raise NotADirectoryError(dir)
paths = [b for b in dir.iterdir() if b.is_file() and b.suffix == ".tsv"]
if len(paths) == 0:
raise ValueError(
f'No `.tsv` files found in --aemb argument "{dir}". '
"Make sure all TSV files in the directory ends with `.tsv`."
)
self.paths = paths
def __init__(self, path: Path):
if not path.is_file():
raise FileNotFoundError(path)
self.path = path


class AbundanceOptions:
Expand All @@ -176,7 +170,7 @@ def are_args_present(args: argparse.Namespace) -> bool:
return (
isinstance(args.bampaths, list)
or isinstance(args.bamdir, Path)
or isinstance(args.aemb, Path)
or isinstance(args.abundance_tsv, Path)
or isinstance(args.abundancepath, Path)
)

Expand All @@ -185,7 +179,7 @@ def from_args(cls, args: argparse.Namespace):
return cls(
typeasserted(args.bampaths, (list, type(None))),
typeasserted(args.bamdir, (Path, type(None))),
typeasserted(args.aemb, (Path, type(None))),
typeasserted(args.abundance_tsv, (Path, type(None))),
typeasserted(args.abundancepath, (Path, type(None))),
typeasserted(args.min_alignment_id, (float, type(None))),
)
Expand All @@ -194,25 +188,19 @@ def __init__(
self,
bampaths: Optional[list[Path]],
bamdir: Optional[Path],
aemb: Optional[Path],
abundance_tsv: Optional[Path],
abundancepath: Optional[Path],
min_alignment_id: Optional[float],
):
assert isinstance(bampaths, (list, type(None)))
assert isinstance(bamdir, (Path, type(None)))
assert isinstance(aemb, (Path, type(None)))
assert isinstance(abundancepath, (Path, type(None)))
assert isinstance(min_alignment_id, (float, type(None)))

# Make sure only one abundance input is there
if (
(bampaths is not None)
+ (abundancepath is not None)
+ (bamdir is not None)
+ (aemb is not None)
+ (abundance_tsv is not None)
) != 1:
raise argparse.ArgumentTypeError(
"Must specify exactly one of BAM files, BAM dir, AEMB or abundance NPZ file input"
"Must specify exactly one of BAM files, BAM dir, TSV file or abundance NPZ file input"
)

if abundancepath is not None:
Expand All @@ -224,8 +212,8 @@ def __init__(
"The --bamfiles argument is deprecated. It works, but might be removed in future versions of Vamb. Please use --bamdir instead"
)
self.paths = BAMPaths(bampaths, min_alignment_id)
elif aemb is not None:
self.paths = AEMBPaths(aemb)
elif abundance_tsv is not None:
self.paths = AbundanceTSVPath(abundance_tsv)


class BasicTrainingOptions:
Expand Down Expand Up @@ -913,11 +901,10 @@ def calc_abundance(
logger.info("\tOrder of columns is:")
for i, samplename in enumerate(abundance.samplenames):
logger.info(f"\t{i:>6}: {samplename}")
elif isinstance(paths, AEMBPaths):
paths = list(paths.paths)
logger.info(f"\tParsing {len(paths)} AEMB TSV files")
abundance = vamb.parsebam.Abundance.from_aemb(
paths,
elif isinstance(paths, AbundanceTSVPath):
logger.info(f'\tParsing abundance from TSV at "{paths.path}"')
abundance = vamb.parsebam.Abundance.from_tsv(
paths.path,
comp_metadata,
)
abundance.save(outdir.joinpath("abundance.npz"))
Expand Down Expand Up @@ -1687,10 +1674,10 @@ def add_abundance_arguments(subparser: argparse.ArgumentParser):
help="Dir with .bam files to use",
)
abundanceos.add_argument(
"--aemb",
"--abundance_tsv",
metavar="",
type=Path,
help="Dir with .tsv files from strobealign --aemb",
help='Path to TSV file of precomputed abundances with header being "contigname(\\t<samplename>)*"',
)
abundanceos.add_argument(
"--abundance",
Expand Down
92 changes: 45 additions & 47 deletions vamb/parsebam.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,55 +236,53 @@ def run_pycoverm(
return (coverage, refhash)

@classmethod
def from_aemb(
cls: type[A],
paths: list[Path],
comp_metadata: CompositionMetaData,
) -> A:
# We cannot guarantee strobealign outputs the contigs in the right order,
# so we must reorder them.
identifier_to_index: dict[str, int] = {
s: i for (i, s) in enumerate(comp_metadata.identifiers)
}

result = _np.empty((len(identifier_to_index), len(paths)), dtype=_np.float32)
for col_num, path in enumerate(paths):
column = _np.full(len(identifier_to_index), float("nan"), dtype=_np.float32)
with vambtools.Reader(path) as file:
n_lines_seen = 0
for line in file:
line = line.rstrip()
if len(line) == 0:
continue
n_lines_seen += 1
if n_lines_seen > len(column):
raise ValueError(
f'Too many rows in AEMB file "{path}", expected {len(column)}'
)
# strobealign reserves the right to add more columns in future versions
(identifier, depth_str, *_) = line.split(maxsplit=2)
index = identifier_to_index.get(identifier.decode())
if index is None:
raise ValueError(
f'Identifier "{identifier}" seen in `strobealign --aemb` output file "{path}", '
"but this identifier is not seen in the FASTA file.\n"
"Verify that the AEMB file was generated by mapping to the same FASTA file "
"as was input to Vamb."
)
column[index] = float(depth_str)

isnans = _np.isnan(column)
if isnans.any():
first_missing_index = _np.nonzero(isnans)[0][0]
missing_identifier = comp_metadata.identifiers[first_missing_index]
def from_tsv(cls: type[A], path: Path, comp_metadata: CompositionMetaData) -> A:
seen_identifiers: list[str] = []
with open(path) as file:
try:
header = next(file)
except StopIteration:
err = ValueError(f'Empty abundance TSV path at "{path}"')
raise err from None
columns = header.splitlines()[0].split("\t")
if len(columns) < 1:
raise ValueError(
f'Identifier "{missing_identifier}" from the input FASTA file is missing from the AEMB file "{path}".'
f'Expected at least 1 column in abundance TSV file at "{path}"'
)
if columns[0] != "contigname":
raise ValueError('First column in header must be "contigname"')
samples = columns[1:]
n_samples = len(samples)
matrix = _np.empty((comp_metadata.nseqs, n_samples), dtype=_np.float32)
# Since we read the header already and this is zero-indexed, we need to add 2
for line_no_minus_two, line in enumerate(file):
if line_no_minus_two == comp_metadata.nseqs:
raise ValueError(
f'Too many rows in abundance TSV file "{path}", expected {comp_metadata.nseqs + 1}, got at least {line_no_minus_two + 1}'
)
stripped = line.rstrip()
if not line:
continue
fields = stripped.split("\t")
if len(fields) != n_samples + 1:
raise ValueError(
f'In abundance TSV file "{path}", on line {line_no_minus_two + 2}, expected {n_samples + 1} columns, found {len(fields)}'
)
for i in range(n_samples):
matrix[line_no_minus_two, i] = float(fields[i + 1])
seen_identifiers.append(fields[0])

if line_no_minus_two + 1 != comp_metadata.nseqs:
raise ValueError(
f'Too few rows in abundance TSV file "{path}", expected {comp_metadata.nseqs + 1}, got {line_no_minus_two + 1}'
)
result[:, col_num] = column

return cls(
result,
[str(i) for i in paths],
0.0,
vambtools.RefHasher.verify_refhash(
vambtools.RefHasher.hash_refnames(seen_identifiers),
comp_metadata.refhash,
"abundance TSV",
"composition",
(seen_identifiers, comp_metadata.identifiers),
)

return cls(matrix, samples, 0.0, comp_metadata.refhash)

0 comments on commit 9810ef0

Please sign in to comment.