diff --git a/test/test_parsebam.py b/test/test_parsebam.py index 7a4ca49f..77ec2ba3 100644 --- a/test/test_parsebam.py +++ b/test/test_parsebam.py @@ -30,7 +30,10 @@ def setUpClass(cls): def test_refhash(self): m = self.comp_metadata cp = CompositionMetaData(m.identifiers, m.lengths, m.mask, m.minlength) - cp.refhash = b"a" * 32 # write bad refhash + # Change the refnames slighty + cp.identifiers = cp.identifiers.copy() + cp.identifiers[3] = cp.identifiers[3] + "w" + cp.refhash = vamb.vambtools.hash_refnames(cp.identifiers) with self.assertRaises(ValueError): vamb.parsebam.Abundance.from_files( testtools.BAM_FILES, None, cp, True, 0.97, 4 diff --git a/vamb/parsebam.py b/vamb/parsebam.py index 32b3290c..54bf8ede 100644 --- a/vamb/parsebam.py +++ b/vamb/parsebam.py @@ -11,7 +11,8 @@ from math import isfinite from vamb.parsecontigs import CompositionMetaData from vamb import vambtools -from typing import Optional, TypeVar, Union, IO, Sequence +from typing import Optional, TypeVar, Union, IO, Sequence, Iterable +from itertools import zip_longest from pathlib import Path import shutil @@ -52,15 +53,37 @@ def nsamples(self) -> int: return len(self.samplenames) @staticmethod - def verify_refhash(refhash: bytes, target_refhash: bytes) -> None: + def verify_refhash( + refhash: bytes, + target_refhash: bytes, + identifiers: Optional[tuple[Iterable[str], Iterable[str]]], + ) -> None: if refhash != target_refhash: - raise ValueError( - f"At least one BAM file reference name hash to {refhash.hex()}, " - f"expected {target_refhash.hex()}. " - "Make sure all BAM and FASTA identifiers are identical " - "and in the same order. " - "Note that the identifier is the header before any whitespace." - ) + if identifiers is not None: + for i, (fasta_id, bam_id) in enumerate(zip_longest(*identifiers)): + if fasta_id is None: + raise ValueError( + f"FASTA has only {i} identifier(s), which is fewer than BAM file" + ) + elif bam_id is None: + raise ValueError( + f"BAM has only {i} identifier(s), which is fewer than FASTA file" + ) + elif fasta_id != bam_id: + raise ValueError( + f"Identifier number {i+1} does not match for FASTA and BAM files:" + f'FASTA identifier: "{fasta_id}"' + f'BAM identifier: "{bam_id}"' + ) + assert False + else: + raise ValueError( + f"At least one BAM file reference name hash to {refhash.hex()}, " + f"expected {target_refhash.hex()}. " + "Make sure all BAM and FASTA identifiers are identical " + "and in the same order. " + "Note that the identifier is the header before any whitespace." + ) def save(self, io: Union[Path, IO[bytes]]): _np.savez_compressed( @@ -83,7 +106,7 @@ def load( arrs["refhash"].item(), ) if refhash is not None: - cls.verify_refhash(abundance.refhash, refhash) + cls.verify_refhash(abundance.refhash, refhash, None) return abundance @@ -136,6 +159,7 @@ def from_files( paths, minid, comp_metadata.refhash if verify_refhash else None, + comp_metadata.identifiers if verify_refhash else None, comp_metadata.mask, ) return cls(matrix, [str(p) for p in paths], minid, refhash) @@ -151,6 +175,7 @@ def from_files( chunksize, minid, comp_metadata.refhash if verify_refhash else None, + comp_metadata.identifiers if verify_refhash else None, comp_metadata.mask, ) @@ -162,6 +187,7 @@ def chunkwise_loading( nthreads: int, minid: float, target_refhash: Optional[bytes], + target_identifiers: Optional[Iterable[str]], mask: _np.ndarray, ) -> A: _os.makedirs(cache_directory) @@ -181,6 +207,7 @@ def chunkwise_loading( paths[chunkstart:chunkstop], minid, target_refhash, + target_identifiers, mask, ) vambtools.write_npz(filename, matrix) @@ -200,6 +227,7 @@ def run_pycoverm( paths: list[Path], minid: float, target_refhash: Optional[bytes], + target_identifiers: Optional[Iterable[str]], mask: _np.ndarray, ) -> tuple[_np.ndarray, bytes]: (headers, coverage) = pycoverm.get_coverages_from_bam( @@ -224,7 +252,12 @@ def run_pycoverm( vambtools.numpy_inplace_maskarray(coverage, mask) refhash = vambtools.hash_refnames(headers) + if target_identifiers is None: + identifier_pairs = None + else: + identifier_pairs = (target_identifiers, headers) + if target_refhash is not None: - Abundance.verify_refhash(refhash, target_refhash) + Abundance.verify_refhash(refhash, target_refhash, identifier_pairs) return (coverage, refhash)