diff --git a/prymer/api/picking.py b/prymer/api/picking.py index ce65059..020571e 100644 --- a/prymer/api/picking.py +++ b/prymer/api/picking.py @@ -165,13 +165,13 @@ def build_primer_pairs( # noqa: C901 # If the right primer isn't "to the right" of the left primer, move on if rp.span.start < lp.span.start or lp.span.end > rp.span.end: - first_right_primer_idx = max(first_right_primer_idx, j+1) + first_right_primer_idx = max(first_right_primer_idx, j + 1) continue amp_span = PrimerPair.calculate_amplicon_span(lp, rp) if amp_span.length < amplicon_sizes.min: - first_right_primer_idx = max(first_right_primer_idx, j+1) + first_right_primer_idx = max(first_right_primer_idx, j + 1) continue if amp_span.length > amplicon_sizes.max: diff --git a/prymer/offtarget/offtarget_detector.py b/prymer/offtarget/offtarget_detector.py index 592eb30..2a0eb4e 100644 --- a/prymer/offtarget/offtarget_detector.py +++ b/prymer/offtarget/offtarget_detector.py @@ -171,7 +171,9 @@ def __init__( # noqa: C901 max_gap_opens: int = 0, max_gap_extends: int = -1, max_amplicon_size: int = 1000, + min_amplicon_size: int = 1, min_primer_pair_hits: int = 1, + allow_overlapping_hits: bool = False, cache_results: bool = True, threads: Optional[int] = None, keep_spans: bool = True, @@ -190,7 +192,8 @@ def __init__( # noqa: C901 and `max_primer_hits`. 2. Filtering of individual primers: `max_primer_hits`. 3. Checking of primer pairs: `max_primer_hits`, `min_primer_pair_hits`, - `max_primer_pair_hits`, and `max_amplicon_size`. + `max_primer_pair_hits`, `allow_overlapping_hits`, `min_amplicon_size, and + `max_amplicon_size`. The `three_prime_region_length` parameter is used as the seed length for `bwa aln`. @@ -225,6 +228,13 @@ def __init__( # noqa: C901 "long gaps". Must be greater than or equal to -1. max_amplicon_size: the maximum amplicon size to consider amplifiable. Must be greater than 0. + min_amplicon_size: the minimum amplicon size to consider amplifiable. Must be between 1 + and `max_amplicon_size` inclusive. If `allow_overlapping_hits` is False, primer + pair hits that overlap each other will not be considered, even if they would create + an amplicon of at least `min_amplicon_size`. Default 1bp. + allow_overlapping_hits: if True, allow hits to the individual primers in a primer pair + to overlap with each other. If False, no overlap will be allowed, even if + `min_amplicon_size` would otherwise allow it. Default False. cache_results: if True, cache results for faster re-querying threads: the number of threads to use when invoking bwa keep_spans: if True, [[OffTargetResult]] objects will be reported with amplicon spans @@ -235,6 +245,8 @@ def __init__( # noqa: C901 Raises: ValueError: If `max_amplicon_size` is not greater than 0. + ValueError: If `min_amplicon_size` is outside the range 1 to `max_amplicon_size`, + inclusive. ValueError: If any of `max_primer_hits`, `max_primer_pair_hits`, or `min_primer_pair_hits` are not greater than or equal to 0. ValueError: If `three_prime_region_length` is not greater than or equal to 8. @@ -247,6 +259,11 @@ def __init__( # noqa: C901 errors: list[str] = [] if max_amplicon_size < 1: errors.append(f"'max_amplicon_size' must be greater than 0. Saw {max_amplicon_size}") + if min_amplicon_size < 1 or min_amplicon_size > max_amplicon_size: + errors.append( + f"'min_amplicon_size' must be between 1 and 'max_amplicon_size'={max_amplicon_size}" + f" inclusive. Saw {min_amplicon_size}" + ) if max_primer_hits < 0: errors.append( f"'max_primer_hits' must be greater than or equal to 0. Saw {max_primer_hits}" @@ -310,6 +327,8 @@ def __init__( # noqa: C901 self._max_primer_pair_hits: int = max_primer_pair_hits self._min_primer_pair_hits: int = min_primer_pair_hits self._max_amplicon_size: int = max_amplicon_size + self._min_amplicon_size: int = min_amplicon_size + self._allow_overlapping_hits: bool = allow_overlapping_hits self._cache_results: bool = cache_results self._keep_spans: bool = keep_spans self._keep_primer_spans: bool = keep_primer_spans @@ -357,7 +376,11 @@ def check_all(self, primer_pairs: list[PrimerPair]) -> dict[PrimerPair, OffTarge This method maps each primer to the specified reference with `bwa aln` to search for off-target hits. Possible amplicons are identified from the pairwise combinations of these - alignments, up to the specified `max_amplicon_size`. + alignments. + + If `allow_overlapping_hits=True`, amplicons in the size range `min_amplicon_size` to + `max_amplicon_size` will be returned. If `allow_overlapping_hits=False`, the size range will + still apply, but will exclude amplicons with overlapping hits. Primer pairs are marked as passing if both of the following are true: 1. Each primer has no more than `max_primer_hits` alignments to the genome. @@ -461,7 +484,9 @@ def _build_off_target_result( self._to_amplicons( positive_hits=left_positive_hits[refname], negative_hits=right_negative_hits[refname], + min_len=self._min_amplicon_size, max_len=self._max_amplicon_size, + allow_overlapping_hits=self._allow_overlapping_hits, strand=Strand.POSITIVE, ) ) @@ -469,7 +494,9 @@ def _build_off_target_result( self._to_amplicons( positive_hits=right_positive_hits[refname], negative_hits=left_negative_hits[refname], + min_len=self._min_amplicon_size, max_len=self._max_amplicon_size, + allow_overlapping_hits=self._allow_overlapping_hits, strand=Strand.NEGATIVE, ) ) @@ -546,14 +573,21 @@ def mappings_of(self, primers: list[PrimerType]) -> dict[str, BwaResult]: @staticmethod def _to_amplicons( - positive_hits: list[BwaHit], negative_hits: list[BwaHit], max_len: int, strand: Strand + positive_hits: list[BwaHit], + negative_hits: list[BwaHit], + max_len: int, + strand: Strand, + min_len: int = 1, + allow_overlapping_hits: bool = False, ) -> list[Span]: """Takes lists of positive strand hits and negative strand hits and constructs amplicon mappings anywhere a positive strand hit and a negative strand hit occur where the end of - the negative strand hit is no more than `max_len` from the start of the positive strand - hit. + the negative strand hit is at least `min_len` and no more than `max_len` from the start of + the positive strand hit. - Primers may not overlap. + Primer hits may overlap if `min_len` is shorter than the length of the left and right + primers together and `allow_overlapping_hits` is True. If `allow_overlapping_hits` is False, + these will not be returned. Args: positive_hits: List of hits on the positive strand for one of the primers in the pair. @@ -563,6 +597,12 @@ def _to_amplicons( `positive_hits` are for the left primer and `negative_hits` are for the right primer. Set to Strand.NEGATIVE if `positive_hits` are for the right primer and `negative_hits` are for the left primer. + min_len: Minimum length of amplicons to consider. If `allow_overlapping_hits` is False, + primer pair hits that overlap each other will not be considered, even if they would + create an amplicon of at least `min_len`. Default 1bp. + allow_overlapping_hits: if True, allow hits to the individual primers in a primer pair + to overlap with each other. If False, no overlap will be allowed, even if + `min_amplicon_size` would otherwise allow it. Default False. Raises: ValueError: If any of the positive hits are not on the positive strand, or any of the @@ -600,13 +640,15 @@ def _to_amplicons( negative_hits_sorted[prev_negative_hit_index:], start=prev_negative_hit_index, ): - # TODO: Consider allowing overlapping positive and negative hits. - if ( - negative_hit.start > positive_hit.end - and negative_hit.end - positive_hit.start + 1 <= max_len + # Hit coordinates are 1-based end-inclusive. + amplicon_length: int = negative_hit.end - positive_hit.start + 1 + if min_len <= amplicon_length <= max_len and ( + allow_overlapping_hits or negative_hit.start > positive_hit.end ): - # If the negative hit starts to the right of the positive hit, and the amplicon - # length is <= max_len, add it to the list of amplicon hits to be returned. + # If overlapping hits are allowed and the amplicon length is in the correct + # size range, -OR- if overlapping hits are not allowed, the negative hit starts + # after the positive hit ends, and the amplicon length is less than or equal + # to the allowed maximum, add it to the list of amplicon hits to be returned. amplicons.append( Span( refname=positive_hit.refname, diff --git a/tests/offtarget/test_offtarget.py b/tests/offtarget/test_offtarget.py index 7cd3ae9..bfdddc6 100644 --- a/tests/offtarget/test_offtarget.py +++ b/tests/offtarget/test_offtarget.py @@ -24,7 +24,9 @@ def _build_detector( three_prime_region_length: int = 20, max_mismatches_in_three_prime_region: int = 0, max_mismatches: int = 0, + min_amplicon_size: int = 1, max_amplicon_size: int = 250, + allow_overlapping_hits: bool = False, cache_results: bool = True, ) -> OffTargetDetector: """Builds an `OffTargetDetector` with strict defaults""" @@ -35,7 +37,9 @@ def _build_detector( three_prime_region_length=three_prime_region_length, max_mismatches_in_three_prime_region=max_mismatches_in_three_prime_region, max_mismatches=max_mismatches, + min_amplicon_size=min_amplicon_size, max_amplicon_size=max_amplicon_size, + allow_overlapping_hits=allow_overlapping_hits, cache_results=cache_results, keep_spans=True, keep_primer_spans=True, @@ -281,7 +285,75 @@ def test_to_amplicons( ) -> None: with _build_detector(ref_fasta=ref_fasta, cache_results=cache_results) as detector: actual = detector._to_amplicons( - positive_hits=[positive], negative_hits=[negative], max_len=250, strand=strand + positive_hits=[positive], + negative_hits=[negative], + min_len=200, + max_len=250, + strand=strand, + ) + assert actual == expected, test_id + + +# Test that using the cache (or not) does not affect the results +# NB: BwaHit and Span coordinates are 1-based end-inclusive +# +# One mapping - Overlapping primers (1bp overlap) +# >>>>>>>>>> +# <<<<<<<<<< +# 19bp amplicon [1,19] +# +# One mapping - Overlapping primers (1bp amplicon) +# >>>>>>>>>> +# <<<<<<<<<< +# 1bp amplicon [10,10] +# +# No mappings - amplicon length would be 0. +# >>>>>>>>>> +# <<<<<<<<<< +@pytest.mark.parametrize("cache_results", [True, False]) +@pytest.mark.parametrize( + "test_id, positive, negative, strand, expected", + [ + ( + "One mapping - Overlapping primers (1bp overlap)", + BwaHit.build("chr1", 1, False, "10M", 0), + BwaHit.build("chr1", 10, True, "10M", 0), + Strand.POSITIVE, + [Span(refname="chr1", start=1, end=19, strand=Strand.POSITIVE)], + ), + ( + "One mapping - Overlapping primers (1bp amplicon)", + BwaHit.build("chr1", 10, False, "10M", 0), + BwaHit.build("chr1", 1, True, "10M", 0), + Strand.POSITIVE, + [Span(refname="chr1", start=10, end=10, strand=Strand.POSITIVE)], + ), + ( + "No mappings", + BwaHit.build("chr1", 11, False, "10M", 0), + BwaHit.build("chr1", 1, True, "10M", 0), + Strand.POSITIVE, + [], + ), + ], +) +def test_to_amplicons_overlapping( + ref_fasta: Path, + test_id: str, + positive: BwaHit, + negative: BwaHit, + strand: Strand, + expected: list[Span], + cache_results: bool, +) -> None: + with _build_detector(ref_fasta=ref_fasta, cache_results=cache_results) as detector: + actual = detector._to_amplicons( + positive_hits=[positive], + negative_hits=[negative], + min_len=1, + max_len=250, + allow_overlapping_hits=True, + strand=strand, ) assert actual == expected, test_id @@ -322,7 +394,11 @@ def test_to_amplicons_value_error( pytest.raises(ValueError, match=expected_error), ): detector._to_amplicons( - positive_hits=[positive], negative_hits=[negative], max_len=250, strand=Strand.POSITIVE + positive_hits=[positive], + negative_hits=[negative], + min_len=200, + max_len=250, + strand=Strand.POSITIVE, ) @@ -356,20 +432,22 @@ class CustomPrimer(Oligo): @pytest.mark.parametrize( ( "max_primer_hits,max_primer_pair_hits,min_primer_pair_hits,three_prime_region_length," - "max_mismatches_in_three_prime_region,max_mismatches,max_amplicon_size," + "max_mismatches_in_three_prime_region,max_mismatches,max_amplicon_size,min_amplicon_size," "max_gap_opens,max_gap_extends,expected_error" ), [ - (-1, 1, 1, 20, 0, 0, 1, 0, 0, "'max_primer_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501 - (1, -1, 1, 20, 0, 0, 1, 0, 0, "'max_primer_pair_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501 - (1, 1, -1, 20, 0, 0, 1, 0, 0, "'min_primer_pair_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501 - (1, 1, 1, 5, 0, 0, 1, 0, 0, "'three_prime_region_length' must be greater than or equal to 8. Saw 5"), # noqa: E501 - (1, 1, 1, 20, -1, 0, 1, 0, 0, "'max_mismatches_in_three_prime_region' must be between 0 and 'three_prime_region_length'=20 inclusive. Saw -1"), # noqa: E501 - (1, 1, 1, 20, 21, 0, 1, 0, 0, "'max_mismatches_in_three_prime_region' must be between 0 and 'three_prime_region_length'=20 inclusive. Saw 21"), # noqa: E501 - (1, 1, 1, 20, 0, -1, 1, 0, 0, "'max_mismatches' must be greater than or equal to 0. Saw -1"), # noqa: E501 - (1, 1, 1, 20, 0, 0, 0, 0, 0, "'max_amplicon_size' must be greater than 0. Saw 0"), - (1, 1, 1, 20, 0, 0, 1, -1, 0, "'max_gap_opens' must be greater than or equal to 0. Saw -1"), - (1, 1, 1, 20, 0, 5, 1, 0, -2, re.escape("'max_gap_extends' must be -1 (for unlimited extensions up to 'max_mismatches'=5) or greater than or equal to 0. Saw -2")), #noqa: E501 + (-1, 1, 1, 20, 0, 0, 1, 1, 0, 0, "'max_primer_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501 + (1, -1, 1, 20, 0, 0, 1, 1, 0, 0, "'max_primer_pair_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501 + (1, 1, -1, 20, 0, 0, 1, 1, 0, 0, "'min_primer_pair_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501 + (1, 1, 1, 5, 0, 0, 1, 1, 0, 0, "'three_prime_region_length' must be greater than or equal to 8. Saw 5"), # noqa: E501 + (1, 1, 1, 20, -1, 0, 1, 1, 0, 0, "'max_mismatches_in_three_prime_region' must be between 0 and 'three_prime_region_length'=20 inclusive. Saw -1"), # noqa: E501 + (1, 1, 1, 20, 21, 0, 1, 1, 0, 0, "'max_mismatches_in_three_prime_region' must be between 0 and 'three_prime_region_length'=20 inclusive. Saw 21"), # noqa: E501 + (1, 1, 1, 20, 0, -1, 1, 1, 0, 0, "'max_mismatches' must be greater than or equal to 0. Saw -1"), # noqa: E501 + (1, 1, 1, 20, 0, 0, 0, 1, 0, 0, "'max_amplicon_size' must be greater than 0. Saw 0"), + (1, 1, 1, 20, 0, 0, 1, 1, -1, 0, "'max_gap_opens' must be greater than or equal to 0. Saw -1"), # noqa: E501 + (1, 1, 1, 20, 0, 5, 1, 1, 0, -2, re.escape("'max_gap_extends' must be -1 (for unlimited extensions up to 'max_mismatches'=5) or greater than or equal to 0. Saw -2")), #noqa: E501 + (1, 1, 1, 20, 0, 0, 10, 0, 0, 0, "'min_amplicon_size' must be between 1 and 'max_amplicon_size'=10 inclusive. Saw 0"), # noqa: E501 + (1, 1, 1, 20, 0, 0, 10, 11, 0, 0, "'min_amplicon_size' must be between 1 and 'max_amplicon_size'=10 inclusive. Saw 11"), # noqa: E501 ], ) # fmt: on @@ -382,6 +460,7 @@ def test_init( max_mismatches_in_three_prime_region: int, max_mismatches: int, max_amplicon_size: int, + min_amplicon_size: int, max_gap_opens: int, max_gap_extends: int, expected_error: str, @@ -396,6 +475,7 @@ def test_init( max_mismatches_in_three_prime_region=max_mismatches_in_three_prime_region, max_mismatches=max_mismatches, max_amplicon_size=max_amplicon_size, + min_amplicon_size=min_amplicon_size, max_gap_opens=max_gap_opens, max_gap_extends=max_gap_extends, )