Skip to content

Commit

Permalink
Improve error message on mismatching refhash
Browse files Browse the repository at this point in the history
Mismatching identifiers in FASTA and BAM files have caused a lot of issues for
our users. This is what the mechanism of refhashing was intended to detect
and prevent.
However, the error on a mismatching refhash is not that friendly, in that it
only shows the hash, but not the identifiers that caused the hash.
Improve the error message by showing the first mismatching identifier.
  • Loading branch information
jakobnissen committed Aug 14, 2023
1 parent ba96abe commit 37277e2
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 12 deletions.
5 changes: 4 additions & 1 deletion test/test_parsebam.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ def setUpClass(cls):
def test_refhash(self):
m = self.comp_metadata
cp = CompositionMetaData(m.identifiers, m.lengths, m.mask, m.minlength)
cp.refhash = b"a" * 32 # write bad refhash
# Change the refnames slightly
cp.identifiers = cp.identifiers.copy()
cp.identifiers[3] = cp.identifiers[3] + "w"
cp.refhash = vamb.vambtools.hash_refnames(cp.identifiers)
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_files(
testtools.BAM_FILES, None, cp, True, 0.97, 4
Expand Down
55 changes: 44 additions & 11 deletions vamb/parsebam.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from math import isfinite
from vamb.parsecontigs import CompositionMetaData
from vamb import vambtools
from typing import Optional, TypeVar, Union, IO, Sequence
from typing import Optional, TypeVar, Union, IO, Sequence, Iterable
from itertools import zip_longest
from pathlib import Path
import shutil

Expand Down Expand Up @@ -52,15 +53,37 @@ def nsamples(self) -> int:
return len(self.samplenames)

@staticmethod
def verify_refhash(refhash: bytes, target_refhash: bytes) -> None:
def verify_refhash(
refhash: bytes,
target_refhash: bytes,
identifiers: Optional[tuple[Iterable[str], Iterable[str]]],
) -> None:
if refhash != target_refhash:
raise ValueError(
f"At least one BAM file reference name hash to {refhash.hex()}, "
f"expected {target_refhash.hex()}. "
"Make sure all BAM and FASTA identifiers are identical "
"and in the same order. "
"Note that the identifier is the header before any whitespace."
)
if identifiers is not None:
for i, (fasta_id, bam_id) in enumerate(zip_longest(*identifiers)):
if fasta_id is None:
raise ValueError(
f"FASTA has only {i} identifier(s), which is fewer than BAM file"
)
elif bam_id is None:
raise ValueError(
f"BAM has only {i} identifier(s), which is fewer than FASTA file"
)
elif fasta_id != bam_id:
raise ValueError(
f"Identifier number {i+1} does not match for FASTA and BAM files:"
f'FASTA identifier: "{fasta_id}"'
f'BAM identifier: "{bam_id}"'
)
assert False
else:
raise ValueError(
f"At least one BAM file reference name hash to {refhash.hex()}, "
f"expected {target_refhash.hex()}. "
"Make sure all BAM and FASTA identifiers are identical "
"and in the same order. "
"Note that the identifier is the header before any whitespace."
)

def save(self, io: Union[Path, IO[bytes]]):
_np.savez_compressed(
Expand All @@ -83,7 +106,7 @@ def load(
arrs["refhash"].item(),
)
if refhash is not None:
cls.verify_refhash(abundance.refhash, refhash)
cls.verify_refhash(abundance.refhash, refhash, None)

return abundance

Expand Down Expand Up @@ -136,6 +159,7 @@ def from_files(
paths,
minid,
comp_metadata.refhash if verify_refhash else None,
comp_metadata.identifiers if verify_refhash else None,
comp_metadata.mask,
)
return cls(matrix, [str(p) for p in paths], minid, refhash)
Expand All @@ -151,6 +175,7 @@ def from_files(
chunksize,
minid,
comp_metadata.refhash if verify_refhash else None,
comp_metadata.identifiers if verify_refhash else None,
comp_metadata.mask,
)

Expand All @@ -162,6 +187,7 @@ def chunkwise_loading(
nthreads: int,
minid: float,
target_refhash: Optional[bytes],
target_identifiers: Optional[Iterable[str]],
mask: _np.ndarray,
) -> A:
_os.makedirs(cache_directory)
Expand All @@ -181,6 +207,7 @@ def chunkwise_loading(
paths[chunkstart:chunkstop],
minid,
target_refhash,
target_identifiers,
mask,
)
vambtools.write_npz(filename, matrix)
Expand All @@ -200,6 +227,7 @@ def run_pycoverm(
paths: list[Path],
minid: float,
target_refhash: Optional[bytes],
target_identifiers: Optional[Iterable[str]],
mask: _np.ndarray,
) -> tuple[_np.ndarray, bytes]:
(headers, coverage) = pycoverm.get_coverages_from_bam(
Expand All @@ -224,7 +252,12 @@ def run_pycoverm(
vambtools.numpy_inplace_maskarray(coverage, mask)
refhash = vambtools.hash_refnames(headers)

if target_identifiers is None:
identifier_pairs = None
else:
identifier_pairs = (target_identifiers, headers)

if target_refhash is not None:
Abundance.verify_refhash(refhash, target_refhash)
Abundance.verify_refhash(refhash, target_refhash, identifier_pairs)

return (coverage, refhash)

0 comments on commit 37277e2

Please sign in to comment.