Skip to content

Commit

Permalink
Improve error message on mismatching refhash
Browse files Browse the repository at this point in the history
Mismatching identifiers in FASTA and BAM files have caused a lot of issues for
our users. This is what the mechanism of refhashing was intended to detect
and prevent.
However, the error on a mismatching refhash is not that friendly, in that it
only shows the hash, but not the identifiers that caused the hash.
Improve the error message by showing the first mismatching identifier.
  • Loading branch information
jakobnissen committed Aug 14, 2023
1 parent ba96abe commit 37277e2
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 12 deletions.
5 changes: 4 additions & 1 deletion test/test_parsebam.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ def setUpClass(cls):
def test_refhash(self):
m = self.comp_metadata
cp = CompositionMetaData(m.identifiers, m.lengths, m.mask, m.minlength)
cp.refhash = b"a" * 32 # write bad refhash
# Change the refnames slightly
cp.identifiers = cp.identifiers.copy()
cp.identifiers[3] = cp.identifiers[3] + "w"
cp.refhash = vamb.vambtools.hash_refnames(cp.identifiers)
with self.assertRaises(ValueError):
vamb.parsebam.Abundance.from_files(
testtools.BAM_FILES, None, cp, True, 0.97, 4
Expand Down
55 changes: 44 additions & 11 deletions vamb/parsebam.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from math import isfinite
from vamb.parsecontigs import CompositionMetaData
from vamb import vambtools
from typing import Optional, TypeVar, Union, IO, Sequence
from typing import Optional, TypeVar, Union, IO, Sequence, Iterable
from itertools import zip_longest
from pathlib import Path
import shutil

Expand Down Expand Up @@ -52,15 +53,37 @@ def nsamples(self) -> int:
return len(self.samplenames)

@staticmethod
def verify_refhash(refhash: bytes, target_refhash: bytes) -> None:
def verify_refhash(
refhash: bytes,
target_refhash: bytes,
identifiers: Optional[tuple[Iterable[str], Iterable[str]]],
) -> None:
if refhash != target_refhash:
raise ValueError(
f"At least one BAM file reference name hash to {refhash.hex()}, "
f"expected {target_refhash.hex()}. "
"Make sure all BAM and FASTA identifiers are identical "
"and in the same order. "
"Note that the identifier is the header before any whitespace."
)
if identifiers is not None:
for i, (fasta_id, bam_id) in enumerate(zip_longest(*identifiers)):
if fasta_id is None:
raise ValueError(
f"FASTA has only {i} identifier(s), which is fewer than BAM file"
)
elif bam_id is None:
raise ValueError(
f"BAM has only {i} identifier(s), which is fewer than FASTA file"
)
elif fasta_id != bam_id:
raise ValueError(
f"Identifier number {i+1} does not match for FASTA and BAM files:"
f'FASTA identifier: "{fasta_id}"'
f'BAM identifier: "{bam_id}"'
)
assert False
else:
raise ValueError(
f"At least one BAM file reference name hash to {refhash.hex()}, "
f"expected {target_refhash.hex()}. "
"Make sure all BAM and FASTA identifiers are identical "
"and in the same order. "
"Note that the identifier is the header before any whitespace."
)

def save(self, io: Union[Path, IO[bytes]]):
_np.savez_compressed(
Expand All @@ -83,7 +106,7 @@ def load(
arrs["refhash"].item(),
)
if refhash is not None:
cls.verify_refhash(abundance.refhash, refhash)
cls.verify_refhash(abundance.refhash, refhash, None)

return abundance

Expand Down Expand Up @@ -136,6 +159,7 @@ def from_files(
paths,
minid,
comp_metadata.refhash if verify_refhash else None,
comp_metadata.identifiers if verify_refhash else None,
comp_metadata.mask,
)
return cls(matrix, [str(p) for p in paths], minid, refhash)
Expand All @@ -151,6 +175,7 @@ def from_files(
chunksize,
minid,
comp_metadata.refhash if verify_refhash else None,
comp_metadata.identifiers if verify_refhash else None,
comp_metadata.mask,
)

Expand All @@ -162,6 +187,7 @@ def chunkwise_loading(
nthreads: int,
minid: float,
target_refhash: Optional[bytes],
target_identifiers: Optional[Iterable[str]],
mask: _np.ndarray,
) -> A:
_os.makedirs(cache_directory)
Expand All @@ -181,6 +207,7 @@ def chunkwise_loading(
paths[chunkstart:chunkstop],
minid,
target_refhash,
target_identifiers,
mask,
)
vambtools.write_npz(filename, matrix)
Expand All @@ -200,6 +227,7 @@ def run_pycoverm(
paths: list[Path],
minid: float,
target_refhash: Optional[bytes],
target_identifiers: Optional[Iterable[str]],
mask: _np.ndarray,
) -> tuple[_np.ndarray, bytes]:
(headers, coverage) = pycoverm.get_coverages_from_bam(
Expand All @@ -224,7 +252,12 @@ def run_pycoverm(
vambtools.numpy_inplace_maskarray(coverage, mask)
refhash = vambtools.hash_refnames(headers)

if target_identifiers is None:
identifier_pairs = None
else:
identifier_pairs = (target_identifiers, headers)

if target_refhash is not None:
Abundance.verify_refhash(refhash, target_refhash)
Abundance.verify_refhash(refhash, target_refhash, identifier_pairs)

return (coverage, refhash)

0 comments on commit 37277e2

Please sign in to comment.