From 81ace30e097e9277adbdd7d4ed1436247978af8f Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Sun, 7 May 2023 00:10:18 +0800
Subject: [PATCH 1/9] fix (svgraph): update data structure to improve
 performance by using dict instead of set to avoid hashing the entire object
 again and again

---
 moPepGen/svgraph/PVGNode.py            |  2 +-
 moPepGen/svgraph/VariantPeptideDict.py | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/moPepGen/svgraph/PVGNode.py b/moPepGen/svgraph/PVGNode.py
index 673ac7ce..fb33aaef 100644
--- a/moPepGen/svgraph/PVGNode.py
+++ b/moPepGen/svgraph/PVGNode.py
@@ -744,7 +744,7 @@ def get_downstream_stop_altering_variants(self) -> Set[VariantRecord]:
         """ Get downstream stop altering variants """
         final_variants = set()
         for out_node in self.out_nodes:
-            if out_node.seq.seq == '*':
+            if len(out_node.seq.seq) == 1 and out_node.seq.seq.startswith('*'):
                 stop_alts = set()
                 stop_alts.update([x.variant for x in out_node.variants
                     if x.is_stop_altering and not x.downstream_cleavage_altering])
diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py
index 7f579a1f..3a8931d4 100644
--- a/moPepGen/svgraph/VariantPeptideDict.py
+++ b/moPepGen/svgraph/VariantPeptideDict.py
@@ -104,11 +104,13 @@ def find_miscleaved_nodes(node:PVGNode, orfs:List[PVGOrf],
         while queue:
             cur_batch = queue.pop()
             cur_node = cur_batch[-1]
-            batch_vars:Set[VariantRecord] = set()
+            # Turn it into a doct of id to variant
+            batch_vars:Dict[str, VariantRecord] = {}
 
             for _node in cur_batch:
                 for var in _node.variants:
-                    batch_vars.add(var.variant)
+                    if var.variant.id not in batch_vars:
+                        batch_vars[var.variant.id] = var.variant
 
             n_cleavages = len([x for x in cur_batch if not x.cpop_collapsed]) - 1
 
@@ -138,7 +140,7 @@ def find_miscleaved_nodes(node:PVGNode, orfs:List[PVGOrf],
 
                 additional_variants = _node.get_downstream_stop_altering_variants()
 
-                cur_vars = {v.id for v in batch_vars}
+                cur_vars = set(batch_vars.keys())
                 for var in _node.variants:
                     cur_vars.add(var.variant.id)
                 cur_vars.update({v.id for v in additional_variants})
@@ -188,7 +190,7 @@ def join_miscleaved_peptides(self, check_variants:bool,
             metadata = VariantPeptideMetadata()
             seq:str = None
             variants:Set[VariantRecord] = set()
-            in_seq_variants:Set[VariantRecord] = set()
+            in_seq_variants:Dict[str,VariantRecord] = {}
 
             selenocysteines = []
 
@@ -211,7 +213,8 @@ def join_miscleaved_peptides(self, check_variants:bool,
                             continue
                         variants.add(variant.variant)
                         if not variant.variant.is_circ_rna():
-                            in_seq_variants.add(variant.variant)
+                            if variant.variant.id not in in_seq_variants:
+                                in_seq_variants[variant.variant.id] = variant.variant
                     if i < len(queue) - 1:
                         _node = self.leading_node if i == 0 else node
                         indels = queue[i + 1].upstream_indel_map.get(_node)
@@ -224,7 +227,7 @@ def join_miscleaved_peptides(self, check_variants:bool,
                 if any(v.is_circ_rna() for v in variants) \
                         or any(v.is_circ_rna() for v in orf.start_gain):
                     if orf.is_valid_orf_to_misc_nodes(queue, self.subgraphs, circ_rna):
-                        if any(n.is_missing_any_variant(in_seq_variants) for n in queue):
+                        if any(n.is_missing_any_variant(in_seq_variants.values()) for n in queue):
                             continue
                         metadata.orf = tuple(orf.orf)
                         valid_orf = orf

From 0e9b09dd1689dafb5bfd6e7a8d29e368ea191071 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Mon, 8 May 2023 20:09:42 +0800
Subject: [PATCH 2/9] fix (VariantPeptideDict): turn more set into dict

---
 moPepGen/svgraph/VariantPeptideDict.py | 38 ++++++++++++++++----------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py
index 3a8931d4..84ed9309 100644
--- a/moPepGen/svgraph/VariantPeptideDict.py
+++ b/moPepGen/svgraph/VariantPeptideDict.py
@@ -189,7 +189,7 @@ def join_miscleaved_peptides(self, check_variants:bool,
             queue = series.nodes
             metadata = VariantPeptideMetadata()
             seq:str = None
-            variants:Set[VariantRecord] = set()
+            variants:Dict[str,VariantRecord] = {}
             in_seq_variants:Dict[str,VariantRecord] = {}
 
             selenocysteines = []
@@ -211,7 +211,8 @@ def join_miscleaved_peptides(self, check_variants:bool,
                             continue
                         if variant.upstream_cleavage_altering:
                             continue
-                        variants.add(variant.variant)
+                        if variant.variant.id not in variants:
+                            variants[variant.variant.id] = variant.variant
                         if not variant.variant.is_circ_rna():
                             if variant.variant.id not in in_seq_variants:
                                 in_seq_variants[variant.variant.id] = variant.variant
@@ -220,11 +221,12 @@ def join_miscleaved_peptides(self, check_variants:bool,
                         indels = queue[i + 1].upstream_indel_map.get(_node)
                         if indels:
                             for variant in indels:
-                                variants.add(variant)
+                                if variant.id not in variants:
+                                    variants[variant.id] = variant
 
             valid_orf = None
             for orf in self.orfs:
-                if any(v.is_circ_rna() for v in variants) \
+                if any(v.is_circ_rna() for v in variants.values()) \
                         or any(v.is_circ_rna() for v in orf.start_gain):
                     if orf.is_valid_orf_to_misc_nodes(queue, self.subgraphs, circ_rna):
                         if any(n.is_missing_any_variant(in_seq_variants.values()) for n in queue):
@@ -241,16 +243,24 @@ def join_miscleaved_peptides(self, check_variants:bool,
                 continue
 
             cleavage_gain_down = queue[-1].get_cleavage_gain_from_downstream()
-            variants.update(cleavage_gain_down)
+            for v in cleavage_gain_down:
+                if v not in variants:
+                    variants[v.id] = v
 
             if check_variants:
-                variants.update(valid_orf.start_gain)
-                variants.update(additional_variants)
-                variants.update(series.additional_variants)
-
-            if any(v.is_circ_rna() for v in variants) \
+                for v in valid_orf.start_gain:
+                    if v.id not in variants:
+                        variants[v.id] = v
+                for v in additional_variants:
+                    if v.id not in variants:
+                        variants[v.id] = v
+                for v in series.additional_variants:
+                    if v.id not in variants:
+                        variants[v.id] = v
+
+            if any(v.is_circ_rna() for v in variants.values()) \
                     and queue[-1].any_unaccounted_downstream_cleavage_or_stop_altering(
-                        {x for x in variants if not x.is_circ_rna()}):
+                        {x for x in variants.values() if not x.is_circ_rna()}):
                 continue
 
             if not seq:
@@ -271,10 +281,10 @@ def join_miscleaved_peptides(self, check_variants:bool,
                         continue
 
             metadata.is_pure_circ_rna = len(variants) == 1 and \
-                list(variants)[0].is_circ_rna()
+                list(variants.values())[0].is_circ_rna()
 
             seqs = self.translational_modification(seq, metadata, denylist,
-                variants, is_start_codon, selenocysteines, substitutants,
+                variants.values(), is_start_codon, selenocysteines, substitutants,
                 check_variants, check_external_variants
             )
             for seq, metadata in seqs:
@@ -371,7 +381,7 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
 
                 if is_valid or is_valid_start:
                     cur_metadata = copy.copy(metadata)
-                    cur_variants = copy.copy(variants)
+                    cur_variants = set(variants)
                     cur_variants.update(comb)
                     label = vpi.create_variant_peptide_id(
                         transcript_id=self.tx_id, variants=cur_variants, orf_id=None,

From d76caf6ddca753e24985f57a876a940f0e4a4bd8 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Sun, 14 May 2023 16:03:11 +0800
Subject: [PATCH 3/9] fix (callVariant): skip peptides earlier if they are
 either too long or too short to significantly improve efficiency. #736

---
 moPepGen/cli/call_noncoding_peptide.py  |   3 +-
 moPepGen/svgraph/PVGNode.py             |   3 +-
 moPepGen/svgraph/PeptideVariantGraph.py |   5 +-
 moPepGen/svgraph/VariantPeptideDict.py  | 474 ++++++++++++++----------
 test/unit/test_peptide_variant_graph.py |   8 +
 test/unit/test_variant_peptide_dict.py  |   2 +-
 6 files changed, 293 insertions(+), 202 deletions(-)

diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py
index de54dc73..324267ed 100644
--- a/moPepGen/cli/call_noncoding_peptide.py
+++ b/moPepGen/cli/call_noncoding_peptide.py
@@ -200,7 +200,8 @@ def call_noncoding_peptide_main(tx_id:str, tx_model:TranscriptAnnotationModel,
         check_orf=True,
         denylist=canonical_peptides,
         orf_assignment=orf_assignment,
-        w2f=w2f_reassignment
+        w2f=w2f_reassignment,
+        check_external_variants=False
     )
     orfs = get_orf_sequences(pgraph, tx_id, tx_model.gene_id, tx_seq)
     return peptides, orfs
diff --git a/moPepGen/svgraph/PVGNode.py b/moPepGen/svgraph/PVGNode.py
index fb33aaef..523231af 100644
--- a/moPepGen/svgraph/PVGNode.py
+++ b/moPepGen/svgraph/PVGNode.py
@@ -305,8 +305,9 @@ def get_cleavage_gain_variants(self) -> List[seqvar.VariantRecord]:
     def get_cleavage_gain_from_downstream(self) -> List[seqvar.VariantRecord]:
         """ Get the variants that gains the cleavage by downstream nodes """
         cleavage_gain = []
+        seq_len = len(self.seq.seq)
         upstream_cleave_alts = [v.variant for v in self.variants
-            if v.location.end == len(self.seq.seq)]
+            if v.location.end == seq_len]
         for node in self.out_nodes:
             if not node.variants:
                 return []
diff --git a/moPepGen/svgraph/PeptideVariantGraph.py b/moPepGen/svgraph/PeptideVariantGraph.py
index 075279e9..7187a931 100644
--- a/moPepGen/svgraph/PeptideVariantGraph.py
+++ b/moPepGen/svgraph/PeptideVariantGraph.py
@@ -809,7 +809,8 @@ def call_variant_peptides(self, check_variants:bool=True,
             gene_id=self.gene_id,
             truncate_sec=truncate_sec,
             w2f=w2f,
-            check_external_variants=check_external_variants
+            check_external_variants=check_external_variants,
+            cleavage_params=self.cleavage_params
         )
         traversal = PVGTraversal(
             check_variants=check_variants, check_orf=check_orf,
@@ -855,6 +856,8 @@ def call_variant_peptides(self, check_variants:bool=True,
         if check_orf:
             self.create_orf_id_map()
 
+        peptide_pool.translational_modification(w2f, self.denylist)
+
         return peptide_pool.get_peptide_sequences(
             keep_all_occurrence=keep_all_occurrence, orf_id_map=self.orf_id_map
         )
diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py
index 84ed9309..37880c5a 100644
--- a/moPepGen/svgraph/VariantPeptideDict.py
+++ b/moPepGen/svgraph/VariantPeptideDict.py
@@ -19,6 +19,16 @@
     from moPepGen.seqvar.VariantRecord import VariantRecord
     from moPepGen import params
 
+class VariantPeptideMetadata():
+    """ Variant peptide metadata """
+    def __init__(self, label:str=None, orf:Tuple[int,int]=None,
+            is_pure_circ_rna:bool=False):
+        """  """
+        self.label = label
+        self.orf = orf
+        self.is_pure_circ_rna = is_pure_circ_rna
+
+T = Dict[aa.AminoAcidSeqRecord, Set[VariantPeptideMetadata]]
 
 class MiscleavedNodeSeries():
     """ Helper class when calling for miscleavage peptides. The nodes contained
@@ -27,6 +37,27 @@ def __init__(self, nodes:List[PVGNode], additional_variants:Set[VariantRecord]):
         """ """
         self.nodes = nodes
         self.additional_variants = additional_variants
+        self._len = None
+
+    def __len__(self) -> int:
+        """ Get length of node sequences """
+        if self._len is None:
+            self._len = sum([len(x.seq.seq) for x in self.nodes])
+        return self._len
+
+    def is_too_short(self, param:params.CleavageParams) -> bool:
+        """ Checks whether the sequence is too short """
+        return len(self) < param.min_length
+
+    def is_too_long(self, param:params.CleavageParams) -> bool:
+        """ Checks whether the sequence is too long """
+        return not(
+            len(self) <= param.max_length
+            or (
+                self.nodes[0].seq.seq.startswith('M')
+                and len(self) <= param.max_length + 1
+            )
+        )
 
 class MiscleavedNodes():
     """ Helper class for looking for peptides with miscleavages. This class
@@ -46,7 +77,8 @@ class MiscleavedNodes():
     def __init__(self, data:Deque[MiscleavedNodeSeries],
             cleavage_params:params.CleavageParams,
             orfs:List[PVGOrf]=None, tx_id:str=None, gene_id:str=None,
-            leading_node:PVGNode=None, subgraphs:SubgraphTree=None):
+            leading_node:PVGNode=None, subgraphs:SubgraphTree=None,
+            is_circ_rna:bool=False):
         """ constructor """
         self.data = data
         self.orfs = orfs or []
@@ -55,120 +87,30 @@ def __init__(self, data:Deque[MiscleavedNodeSeries],
         self.cleavage_params = cleavage_params
         self.leading_node = leading_node
         self.subgraphs = subgraphs
+        self.is_circ_rna = is_circ_rna
 
-    @staticmethod
-    def find_miscleaved_nodes(node:PVGNode, orfs:List[PVGOrf],
-            cleavage_params:params.CleavageParams, tx_id:str, gene_id:str,
-            leading_node:PVGNode, subgraphs:SubgraphTree, is_circ_rna:bool
-            ) -> MiscleavedNodes:
-        """ Find all miscleaved nodes.
-
-        node vs leading_node:
-            - When calling this function from a node within a PVG graph, a copy
-              of the node should be first created. This is because if the ORF
-              start site is located in the node, the peptide sequence should be
-              called form the ORF start site, rather than the beginning of the
-              sequence. So then a copy should be created from this node and
-              only keep the sequence after the ORF start.
-            - The `leading_node` must tbe the original copy of the node within
-              the graph, that the `node` is copied from. This is used to
-              retrieve INDELs from the downstream node.
-
-        Args:
-            - `node` (PVGNode): The node that miscleaved peptide sequences
-              starts from it should be called.
-            - `orfs` (List[PVGPOrf]): The ORF start and end locations.
-            - `cleavage_params` (CleavageParams): Cleavage related parameters.
-            - `tx_id` (str): Transcript ID.
-            - `leading_node` (PVGNode): The start node that the miscleaved
-              peptides are called from. This node must present in the PVG graph.
-        """
-        if not orfs:
-            raise ValueError('ORFs are empty')
-        queue = deque([[node]])
-        nodes = MiscleavedNodes(
-            data=deque([]),
-            cleavage_params=cleavage_params,
-            orfs=orfs,
-            tx_id=tx_id,
-            gene_id=gene_id,
-            leading_node=leading_node,
-            subgraphs=subgraphs
-        )
-
-        if not (node.cpop_collapsed or node.truncated):
-            additional_variants = leading_node.get_downstream_stop_altering_variants()
-            series = MiscleavedNodeSeries([node], additional_variants)
-            nodes.data.append(series)
-
-        while queue:
-            cur_batch = queue.pop()
-            cur_node = cur_batch[-1]
-            # Turn it into a doct of id to variant
-            batch_vars:Dict[str, VariantRecord] = {}
-
-            for _node in cur_batch:
-                for var in _node.variants:
-                    if var.variant.id not in batch_vars:
-                        batch_vars[var.variant.id] = var.variant
-
-            n_cleavages = len([x for x in cur_batch if not x.cpop_collapsed]) - 1
-
-            if n_cleavages >= cleavage_params.miscleavage:
-                continue
-
-            # This is done to reduce the complexity when the node has too
-            # many out nodes, and its out nodes also have too many out nodes.
-            if cleavage_params.additional_variants_per_misc == -1:
-                allowed_n_vars = float('Inf')
-            else:
-                allowed_n_vars = cleavage_params.max_variants_per_node
-                allowed_n_vars += (n_cleavages + 1) * cleavage_params.additional_variants_per_misc
-
-            for _node in cur_node.out_nodes:
-                if is_circ_rna and _node.is_hybrid_node(subgraphs):
-                    continue
-                if _node.truncated:
-                    continue
-                new_batch = copy.copy(cur_batch)
-
-                is_stop = _node.seq.seq == '*'
-                if is_stop:
-                    continue
-
-                new_batch.append(_node)
-
-                additional_variants = _node.get_downstream_stop_altering_variants()
-
-                cur_vars = set(batch_vars.keys())
-                for var in _node.variants:
-                    cur_vars.add(var.variant.id)
-                cur_vars.update({v.id for v in additional_variants})
-                if len(cur_vars) > allowed_n_vars:
-                    continue
-
-                if not _node.cpop_collapsed:
-                    series = MiscleavedNodeSeries(copy.copy(new_batch), additional_variants)
-                    nodes.data.append(series)
-                    if n_cleavages + 1 == cleavage_params.miscleavage:
-                        continue
-                    if n_cleavages + 1 > cleavage_params.miscleavage:
-                        raise ValueError('Something just went wrong')
-                queue.append(new_batch)
-        return nodes
-
-    def is_valid_seq(self, seq:Seq, denylist:Set[str]) -> bool:
+    def is_valid_seq(self, seq:Seq, pool:Set[Seq], denylist:Set[str]) -> bool:
         """ Check whether the seq is valid """
-        min_length = self.cleavage_params.min_length
-        max_length = self.cleavage_params.max_length
+        if seq in pool:
+            return True
         min_mw = self.cleavage_params.min_mw
-
-        return seq not in denylist \
-            and min_length <= len(seq) <= max_length \
+        return self.seq_has_valid_size(seq) \
+            and seq not in denylist \
             and 'X' not in seq \
             and SeqUtils.molecular_weight(seq, 'protein') >= min_mw
 
-    def join_miscleaved_peptides(self, check_variants:bool,
+    def seq_has_valid_size(self, seq:Seq=None, size:int=None) -> bool:
+        """ Check whether the seq has valid length """
+        if seq is None and size is None:
+            raise ValueError
+        min_length = self.cleavage_params.min_length
+        max_length = self.cleavage_params.max_length
+        if size is None:
+            size = len(seq)
+
+        return min_length <= size <= max_length
+
+    def join_miscleaved_peptides(self, pool:T, check_variants:bool,
             additional_variants:List[VariantRecord], denylist:Set[str],
             is_start_codon:bool=False, circ_rna:circ.CircRNAModel=None,
             truncate_sec:bool=False, w2f:bool=False,
@@ -188,7 +130,8 @@ def join_miscleaved_peptides(self, check_variants:bool,
         for series in self.data:
             queue = series.nodes
             metadata = VariantPeptideMetadata()
-            seq:str = None
+            seqs_to_join:List[Seq] = []
+            size:int = 0
             variants:Dict[str,VariantRecord] = {}
             in_seq_variants:Dict[str,VariantRecord] = {}
 
@@ -196,14 +139,13 @@ def join_miscleaved_peptides(self, check_variants:bool,
 
             for i, node in enumerate(queue):
                 other = str(node.seq.seq)
-                if seq is None:
-                    seq = other
-                    if truncate_sec:
-                        selenocysteines = copy.copy(node.selenocysteines)
-                else:
-                    if truncate_sec:
-                        selenocysteines += [x.shift(len(seq)) for x in node.selenocysteines]
-                    seq = seq + other
+                if truncate_sec:
+                    if size > 0:
+                        selenocysteines += [x.shift(size) for x in node.selenocysteines]
+                    else:
+                        selenocysteines +=  node.selenocysteines
+                seqs_to_join.append(other)
+                size += len(other)
 
                 if check_variants:
                     for variant in node.variants:
@@ -226,8 +168,9 @@ def join_miscleaved_peptides(self, check_variants:bool,
 
             valid_orf = None
             for orf in self.orfs:
-                if any(v.is_circ_rna() for v in variants.values()) \
-                        or any(v.is_circ_rna() for v in orf.start_gain):
+                if self.is_circ_rna \
+                        and (any(v.is_circ_rna() for v in variants.values()) \
+                        or any(v.is_circ_rna() for v in orf.start_gain)):
                     if orf.is_valid_orf_to_misc_nodes(queue, self.subgraphs, circ_rna):
                         if any(n.is_missing_any_variant(in_seq_variants.values()) for n in queue):
                             continue
@@ -258,58 +201,49 @@ def join_miscleaved_peptides(self, check_variants:bool,
                     if v.id not in variants:
                         variants[v.id] = v
 
-            if any(v.is_circ_rna() for v in variants.values()) \
+            if self.is_circ_rna \
+                    and any(v.is_circ_rna() for v in variants.values()) \
                     and queue[-1].any_unaccounted_downstream_cleavage_or_stop_altering(
                         {x for x in variants.values() if not x.is_circ_rna()}):
                 continue
 
-            if not seq:
-                continue
-
-            if seq in denylist:
+            if size == 0:
                 continue
 
-            seq = Seq(seq)
-            substitutants = self.find_codon_reassignments(seq, w2f)
-
             if check_variants:
                 if check_external_variants:
                     if not variants:
                         continue
                 else:
-                    if not (variants or selenocysteines or substitutants):
+                    if not (variants or selenocysteines):
                         continue
 
-            metadata.is_pure_circ_rna = len(variants) == 1 and \
-                list(variants.values())[0].is_circ_rna()
+            if not selenocysteines \
+                    and not self.seq_has_valid_size(size=size) \
+                    and not (seqs_to_join[0].startswith('M')
+                                and self.seq_has_valid_size(size=size-1)):
+                continue
+
+            seq = Seq(''.join(seqs_to_join))
+            if not seq in pool and seq in denylist:
+                continue
+
+            metadata.is_pure_circ_rna = self.is_circ_rna \
+                and len(variants) == 1 \
+                and list(variants.values())[0].is_circ_rna()
 
             seqs = self.translational_modification(seq, metadata, denylist,
-                variants.values(), is_start_codon, selenocysteines, substitutants,
-                check_variants, check_external_variants
+                variants.values(), is_start_codon, selenocysteines,
+                check_variants, check_external_variants, pool
             )
             for seq, metadata in seqs:
                 yield seq, metadata
 
-    def find_codon_reassignments(self, seq:Seq, w2f:bool=False) -> List[VariantRecord]:
-        """ Find potential codon reassignments from the given sequence. """
-        variants = []
-        if w2f:
-            i = 0
-            while i > -1:
-                i = seq.find('W', start=i)
-                if i > -1:
-                    w2f_var = create_variant_w2f(self.tx_id, i)
-                    variants.append(w2f_var)
-                    i += 1
-                    if i >= len(seq):
-                        break
-        return variants
 
     def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
             denylist:Set[str], variants:Set[VariantRecord], is_start_codon:bool,
             selenocysteines:List[seqvar.VariantRecordWithCoordinate],
-            reassignments:List[VariantRecord], check_variants:bool,
-            check_external_variants:bool
+            check_variants:bool, check_external_variants:bool, pool:Set[Seq]
             ) -> Iterable[Tuple[aa.AminoAcidSeqRecord,VariantPeptideMetadata]]:
         """ Apply any modification that could happen during translation. The
         kinds of modifications that could happen are:
@@ -317,9 +251,10 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
         2. Selenocysteine truncation.
         """
         if variants or not check_variants:
-            is_valid = self.is_valid_seq(seq, denylist)
-            is_valid_start = is_start_codon and seq.startswith('M') and\
-                self.is_valid_seq(seq[1:], denylist)
+            is_valid = self.is_valid_seq(seq, pool, denylist)
+
+            is_valid_start =  is_start_codon and seq.startswith('M') and\
+                self.is_valid_seq(seq[1:], pool, denylist)
 
             if is_valid or is_valid_start:
                 cur_metadata = copy.copy(metadata)
@@ -340,9 +275,9 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
         # Selenocysteine termination
         for sec in selenocysteines:
             seq_mod = seq[:sec.location.start]
-            is_valid = self.is_valid_seq(seq_mod, denylist)
+            is_valid = self.is_valid_seq(seq_mod, pool, denylist)
             is_valid_start = is_start_codon and seq_mod.startswith('M') and\
-                self.is_valid_seq(seq_mod[1:], denylist)
+                self.is_valid_seq(seq_mod[1:], pool, denylist)
 
             if is_valid or is_valid_start:
                 cur_metadata = copy.copy(metadata)
@@ -365,37 +300,13 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
                     aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod[1:])
                     yield aa_seq, cur_metadata
 
-        # W > F codon reassignments
-        for k in range(1, len(reassignments) + 1):
-            for comb in itertools.combinations(reassignments, k):
-                seq_mod = seq
-                for v in comb:
-                    seq_mod_new = seq_mod[:v.location.start] + v.alt
-                    if v.location.end < len(seq):
-                        seq_mod_new += seq_mod[v.location.end:]
-                    seq_mod = seq_mod_new
-
-                is_valid = self.is_valid_seq(seq_mod, denylist)
-                is_valid_start = is_start_codon and seq_mod.startswith('M') and\
-                    self.is_valid_seq(seq_mod[1:], denylist)
-
-                if is_valid or is_valid_start:
-                    cur_metadata = copy.copy(metadata)
-                    cur_variants = set(variants)
-                    cur_variants.update(comb)
-                    label = vpi.create_variant_peptide_id(
-                        transcript_id=self.tx_id, variants=cur_variants, orf_id=None,
-                        gene_id=self.gene_id
-                    )
-                    cur_metadata.label = label
+        # if check_external_variants and not variants:
+        #     return
+
+        # if not self.seq_has_valid_size(seq):
+        #     return
 
-                    if is_valid:
-                        aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod)
-                        yield aa_seq, cur_metadata
 
-                    if is_valid_start:
-                        aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod[1:])
-                        yield aa_seq, cur_metadata
 
     @staticmethod
     def any_overlaps_and_all_missing_variant(nodes:Iterable[PVGNode], variant:VariantRecord):
@@ -457,17 +368,6 @@ def update_peptide_pool(seq:aa.AminoAcidSeqRecord,
             + str(label_counter[label])
         peptide_pool.add(seq)
 
-class VariantPeptideMetadata():
-    """ Variant peptide metadata """
-    def __init__(self, label:str=None, orf:Tuple[int,int]=None,
-            is_pure_circ_rna:bool=False):
-        """  """
-        self.label = label
-        self.orf = orf
-        self.is_pure_circ_rna = is_pure_circ_rna
-
-T = Dict[aa.AminoAcidSeqRecord, Set[VariantPeptideMetadata]]
-
 class VariantPeptideDict():
     """ Variant peptide pool as dict.
 
@@ -483,18 +383,130 @@ class VariantPeptideDict():
           every node. This is useful for PVG derived from a circRNA, and this
           argument is the circRNA variant.
     """
-    def __init__(self, tx_id:str, peptides:T=None, labels:Dict[str,int]=None,
-            global_variant:VariantRecord=None, gene_id:str=None,
-            truncate_sec:bool=False, w2f:bool=False, check_external_variants:bool=True):
+    def __init__(self, tx_id:str, peptides:T=None, seqs:Set[Seq]=None,
+            labels:Dict[str,int]=None, global_variant:VariantRecord=None,
+            gene_id:str=None, truncate_sec:bool=False, w2f:bool=False,
+            check_external_variants:bool=True,
+            cleavage_params:params.CleavageParams=None):
         """ constructor """
         self.tx_id = tx_id
         self.peptides = peptides or {}
+        self.seqs = seqs or set()
         self.labels = labels or {}
         self.global_variant = global_variant
         self.gene_id = gene_id
         self.truncate_sec = truncate_sec
         self.w2f = w2f
         self.check_external_variants = check_external_variants
+        self.cleavage_params = cleavage_params
+
+    def find_miscleaved_nodes(self, node:PVGNode, orfs:List[PVGOrf],
+            cleavage_params:params.CleavageParams, tx_id:str, gene_id:str,
+            leading_node:PVGNode, subgraphs:SubgraphTree, is_circ_rna:bool
+            ) -> MiscleavedNodes:
+        """ Find all miscleaved nodes.
+
+        node vs leading_node:
+            - When calling this function from a node within a PVG graph, a copy
+              of the node should be first created. This is because if the ORF
+              start site is located in the node, the peptide sequence should be
+              called form the ORF start site, rather than the beginning of the
+              sequence. So then a copy should be created from this node and
+              only keep the sequence after the ORF start.
+            - The `leading_node` must tbe the original copy of the node within
+              the graph, that the `node` is copied from. This is used to
+              retrieve INDELs from the downstream node.
+
+        Args:
+            - `node` (PVGNode): The node that miscleaved peptide sequences
+              starts from it should be called.
+            - `orfs` (List[PVGPOrf]): The ORF start and end locations.
+            - `cleavage_params` (CleavageParams): Cleavage related parameters.
+            - `tx_id` (str): Transcript ID.
+            - `leading_node` (PVGNode): The start node that the miscleaved
+              peptides are called from. This node must present in the PVG graph.
+        """
+        if not orfs:
+            raise ValueError('ORFs are empty')
+        queue = deque([[node]])
+        nodes = MiscleavedNodes(
+            data=deque([]),
+            cleavage_params=cleavage_params,
+            orfs=orfs,
+            tx_id=tx_id,
+            gene_id=gene_id,
+            leading_node=leading_node,
+            subgraphs=subgraphs,
+            is_circ_rna=is_circ_rna
+        )
+
+        if not (node.cpop_collapsed or node.truncated):
+            additional_variants = leading_node.get_downstream_stop_altering_variants()
+            series = MiscleavedNodeSeries([node], additional_variants)
+            if series.is_too_long(self.cleavage_params) and not node.selenocysteines:
+                return nodes
+            if not series.is_too_short(self.cleavage_params):
+                nodes.data.append(series)
+
+        while queue:
+            cur_batch = queue.pop()
+            cur_node = cur_batch[-1]
+            # Turn it into a doct of id to variant
+            batch_vars:Dict[str, VariantRecord] = {}
+
+            for _node in cur_batch:
+                for var in _node.variants:
+                    if var.variant.id not in batch_vars:
+                        batch_vars[var.variant.id] = var.variant
+
+            n_cleavages = len([x for x in cur_batch if not x.cpop_collapsed]) - 1
+
+            if n_cleavages >= cleavage_params.miscleavage:
+                continue
+
+            # This is done to reduce the complexity when the node has too
+            # many out nodes, and its out nodes also have too many out nodes.
+            if cleavage_params.additional_variants_per_misc == -1:
+                allowed_n_vars = float('Inf')
+            else:
+                allowed_n_vars = cleavage_params.max_variants_per_node
+                allowed_n_vars += (n_cleavages + 1) * cleavage_params.additional_variants_per_misc
+
+            for _node in cur_node.out_nodes:
+                if is_circ_rna and _node.is_hybrid_node(subgraphs):
+                    continue
+                if _node.truncated:
+                    continue
+                new_batch = copy.copy(cur_batch)
+
+                is_stop = len(_node.seq.seq) == 1 and _node.seq.seq.startswith('*')
+                if is_stop:
+                    continue
+
+                new_batch.append(_node)
+
+                additional_variants = _node.get_downstream_stop_altering_variants()
+
+                cur_vars = set(batch_vars.keys())
+                for var in _node.variants:
+                    cur_vars.add(var.variant.id)
+                cur_vars.update({v.id for v in additional_variants})
+                if len(cur_vars) > allowed_n_vars:
+                    continue
+
+                if not _node.cpop_collapsed:
+                    series = MiscleavedNodeSeries(copy.copy(new_batch), additional_variants)
+                    if series.is_too_long(self.cleavage_params) \
+                            and not _node.selenocysteines:
+                        continue
+                    if not series.is_too_short(self.cleavage_params):
+                        nodes.data.append(series)
+                    if n_cleavages + 1 == cleavage_params.miscleavage:
+                        continue
+                    if n_cleavages + 1 > cleavage_params.miscleavage:
+                        raise ValueError('Something just went wrong')
+                queue.append(new_batch)
+        return nodes
 
     def add_miscleaved_sequences(self, node:PVGNode, orfs:List[PVGOrf],
             cleavage_params:params.CleavageParams,
@@ -522,14 +534,16 @@ def add_miscleaved_sequences(self, node:PVGNode, orfs:List[PVGOrf],
         """
         if leading_node is None:
             leading_node = node
-        miscleaved_nodes = MiscleavedNodes.find_miscleaved_nodes(
+        miscleaved_nodes = self.find_miscleaved_nodes(
             node=node, orfs=orfs, cleavage_params=cleavage_params,
             tx_id=self.tx_id, gene_id=self.gene_id, leading_node=leading_node,
             subgraphs=subgraphs, is_circ_rna=circ_rna is not None
         )
         if self.global_variant and self.global_variant not in additional_variants:
             additional_variants.append(self.global_variant)
+
         seqs = miscleaved_nodes.join_miscleaved_peptides(
+            pool=self.seqs,
             check_variants=check_variants,
             additional_variants=additional_variants,
             denylist=denylist,
@@ -546,6 +560,70 @@ def add_miscleaved_sequences(self, node:PVGNode, orfs:List[PVGOrf],
                 raise ValueError('Invalid amino acid symbol found in the sequence.')
             val = self.peptides.setdefault(seq, set())
             val.add(metadata)
+            self.seqs.add(seq.seq)
+
+
+    def find_codon_reassignments(self, seq:Seq, w2f:bool=False) -> List[VariantRecord]:
+        """ Find potential codon reassignments from the given sequence. """
+        variants = []
+        if w2f:
+            i = 0
+            while i > -1:
+                i = seq.find('W', start=i)
+                if i > -1:
+                    w2f_var = create_variant_w2f(self.tx_id, i)
+                    variants.append(w2f_var)
+                    i += 1
+                    if i >= len(seq):
+                        break
+        return variants
+
+    def is_valid_seq(self, seq:Seq, denylist:Set[str]) -> bool:
+        """ Check whether the seq is valid """
+        if seq in self.seqs:
+            return True
+        min_mw = self.cleavage_params.min_mw
+        return self.seq_has_valid_size(seq) \
+            and seq not in denylist \
+            and 'X' not in seq \
+            and SeqUtils.molecular_weight(seq, 'protein') >= min_mw
+
+    def seq_has_valid_size(self, seq:Seq) -> bool:
+        """ Check whether the seq has valid length """
+        min_length = self.cleavage_params.min_length
+        max_length = self.cleavage_params.max_length
+
+        return min_length <= len(seq) <= max_length
+
+    def translational_modification(self, w2f:bool, denylist:Set[str]):
+        """ Sequence level translational modification, e.g., W2F reassignment """
+        for seq in copy.copy(self.peptides):
+            reassignments:List[seqvar.VariantRecord] = []
+            reassignments += self.find_codon_reassignments(seq.seq, w2f)
+            if not reassignments:
+                continue
+
+            # W > F codon reassignments
+            for k in range(1, len(reassignments) + 1):
+                for comb in itertools.combinations(reassignments, k):
+                    seq_mod = seq.seq
+                    for v in comb:
+                        seq_mod_new = seq_mod[:v.location.start] + v.alt
+                        if v.location.end < len(seq):
+                            seq_mod_new += seq_mod[v.location.end:]
+                        seq_mod = seq_mod_new
+
+                    if not self.is_valid_seq(seq_mod, denylist):
+                        continue
+                    for metadata in self.peptides[seq]:
+                        cur_metadata = copy.copy(metadata)
+                        cur_metadata.label += '|'.join(v.id for v in comb)
+
+                        aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod)
+
+                        val = self.peptides.setdefault(aa_seq, set())
+                        val.add(cur_metadata)
+                        self.seqs.add(aa_seq.seq)
 
     def get_peptide_sequences(self, keep_all_occurrence:bool=True,
             orf_id_map:Dict[Tuple[int,int],str]=None
diff --git a/test/unit/test_peptide_variant_graph.py b/test/unit/test_peptide_variant_graph.py
index 67d46e2f..91f546b2 100644
--- a/test/unit/test_peptide_variant_graph.py
+++ b/test/unit/test_peptide_variant_graph.py
@@ -600,6 +600,7 @@ def test_call_and_stage_known_orf_incds(self):
         graph, nodes = create_pgraph(data, 'ENST0001')
         graph.known_orf = [0,30]
         pool = VariantPeptideDict(graph.id)
+        pool.cleavage_params = graph.cleavage_params
         traversal = PVGTraversal(True, False, pool, (0,30), (0,10))
         orf = PVGOrf([0, None])
         cursor = PVGCursor(nodes[1], nodes[2], True, [orf])
@@ -627,6 +628,7 @@ def test_call_and_stage_known_orf_start_and_frameshift(self):
         nodes[4].reading_frame_index = 2
         graph.known_orf = [18,60]
         pool = VariantPeptideDict(graph.id)
+        pool.cleavage_params = graph.cleavage_params
         traversal = PVGTraversal(True, False, pool, (18,60), (6,20))
         orf = PVGOrf([0, None], set())
         cursor = PVGCursor(nodes[1], nodes[2], False, [orf])
@@ -650,6 +652,7 @@ def test_call_and_stage_known_orf_multiple_methionine(self):
         graph, nodes = create_pgraph(data, 'ENST0001')
         graph.known_orf = [24,90]
         pool = VariantPeptideDict(graph.id)
+        pool.cleavage_params = graph.cleavage_params
         traversal = PVGTraversal(True, False, pool, (24,90), (8,30))
         cursor = PVGCursor(nodes[1], nodes[2], False, [0, None], [])
         graph.call_and_stage_known_orf(cursor,  traversal)
@@ -673,6 +676,7 @@ def test_call_and_stage_known_orf_start_altering(self):
         graph.cds_start_nf = True
         graph.known_orf = [6,90]
         pool = VariantPeptideDict(graph.id)
+        pool.cleavage_params = graph.cleavage_params
         traversal = PVGTraversal(True, False, pool, (6,90), (2,30))
         cursor = PVGCursor(graph.root, nodes[2], False, [0, None], [])
         graph.call_and_stage_known_orf(cursor,  traversal)
@@ -696,6 +700,7 @@ def test_call_and_stage_known_orf_cds_stop_gain(self):
         nodes[5].variants[0].is_stop_altering = True
         graph.known_orf = [0,90]
         pool = VariantPeptideDict(graph.id)
+        pool.cleavage_params = graph.cleavage_params
         traversal = PVGTraversal(True, False, pool, (0,90), (0,30))
         orf = PVGOrf([0, None])
         cursor = PVGCursor(nodes[2], nodes[4], True, [orf])
@@ -722,6 +727,7 @@ def test_call_and_stage_known_orf_cds_stop_lost(self):
         graph, nodes = create_pgraph(data, 'ENST0001')
         graph.known_orf = [0,39]
         pool = VariantPeptideDict(graph.id)
+        pool.cleavage_params = graph.cleavage_params
         traversal = PVGTraversal(True, False, pool, (0,42), (0,14))
         orf = PVGOrf([0,None])
         cursor = PVGCursor(nodes[4], nodes[5], False, [orf], [])
@@ -748,6 +754,7 @@ def test_call_and_stage_known_orf_not_in_cds_no_start_altering(self):
         graph, nodes = create_pgraph(data, 'ENST0001')
         graph.known_orf = [18,39]
         pool = VariantPeptideDict(graph.id)
+        pool.cleavage_params = graph.cleavage_params
         traversal = PVGTraversal(True, False, pool, (6,13), (18,39))
         cursor = PVGCursor(nodes[5], nodes[6], False, [0, None], [])
         graph.call_and_stage_known_orf(cursor,  traversal)
@@ -767,6 +774,7 @@ def test_call_and_stage_known_orf_cleavage_gain_downstream(self):
         graph, nodes = create_pgraph(data, 'ENST0001')
         graph.known_orf = [0,39]
         pool = VariantPeptideDict(graph.id)
+        pool.cleavage_params = graph.cleavage_params
         traversal = PVGTraversal(True, False, pool, (6,13), (18,39))
         orf = PVGOrf([0, None])
         cursor = PVGCursor(nodes[1], nodes[3], True, [orf])
diff --git a/test/unit/test_variant_peptide_dict.py b/test/unit/test_variant_peptide_dict.py
index 23a0f542..961a9522 100644
--- a/test/unit/test_variant_peptide_dict.py
+++ b/test/unit/test_variant_peptide_dict.py
@@ -75,4 +75,4 @@ def test_is_valid_x(self):
         invalid sequence. """
         cleavage_params = params.CleavageParams(enzyme='trypsin')
         misc_nodes = MiscleavedNodes([], cleavage_params)
-        self.assertFalse(misc_nodes.is_valid_seq('AAAAXAAA', set()))
+        self.assertFalse(misc_nodes.is_valid_seq('AAAAXAAA', set(), set()))

From 2d08a5110b2f6ed957a6a3950b7b4b973deb6173 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Sun, 14 May 2023 16:20:10 +0800
Subject: [PATCH 4/9] doc (CHANGELOG): changelog updated

---
 CHANGELOG.md                           | 4 +++-
 moPepGen/svgraph/VariantPeptideDict.py | 4 +---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c3c812d4..545b8cdd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,7 +44,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 - Fixed `callVariant` issue of altSplice insertion carries an intronic indel that goes back to the original reading frame. #726
 
-- Fixed `callVairant` to handle deletion that spans over an entire intron. #732
+- Fixed `callVariant` to handle deletion that spans over an entire intron. #732
+
+- Fixed `callVariant` to skip peptides earlier if they are either too long or too short to significantly improve efficiency. #736
 
 ## [0.11.5] - 2023-3-5
 
diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py
index 37880c5a..1869f30f 100644
--- a/moPepGen/svgraph/VariantPeptideDict.py
+++ b/moPepGen/svgraph/VariantPeptideDict.py
@@ -113,8 +113,7 @@ def seq_has_valid_size(self, seq:Seq=None, size:int=None) -> bool:
     def join_miscleaved_peptides(self, pool:T, check_variants:bool,
             additional_variants:List[VariantRecord], denylist:Set[str],
             is_start_codon:bool=False, circ_rna:circ.CircRNAModel=None,
-            truncate_sec:bool=False, w2f:bool=False,
-            check_external_variants:bool=True
+            truncate_sec:bool=False, check_external_variants:bool=True
             ) -> Iterable[Tuple[aa.AminoAcidSeqRecord, VariantPeptideMetadata]]:
         """ join miscleaved peptides and update the peptide pool.
 
@@ -550,7 +549,6 @@ def add_miscleaved_sequences(self, node:PVGNode, orfs:List[PVGOrf],
             is_start_codon=is_start_codon,
             circ_rna=circ_rna,
             truncate_sec=self.truncate_sec,
-            w2f=self.w2f,
             check_external_variants=self.check_external_variants
         )
         for seq, metadata in seqs:

From 8c538afb8b254625de07c14d7efbaa2f60b84dd2 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Sun, 14 May 2023 17:46:01 +0800
Subject: [PATCH 5/9] fix (bruteForce): sect peptides + w2f not considered

---
 moPepGen/svgraph/VariantPeptideDict.py |  2 +-
 moPepGen/util/brute_force.py           | 41 +++++++++++++-------------
 2 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py
index 1869f30f..e0348fa9 100644
--- a/moPepGen/svgraph/VariantPeptideDict.py
+++ b/moPepGen/svgraph/VariantPeptideDict.py
@@ -615,7 +615,7 @@ def translational_modification(self, w2f:bool, denylist:Set[str]):
                         continue
                     for metadata in self.peptides[seq]:
                         cur_metadata = copy.copy(metadata)
-                        cur_metadata.label += '|'.join(v.id for v in comb)
+                        cur_metadata.label += '|' + '|'.join(v.id for v in comb)
 
                         aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod)
 
diff --git a/moPepGen/util/brute_force.py b/moPepGen/util/brute_force.py
index e93a88be..065ef7ba 100644
--- a/moPepGen/util/brute_force.py
+++ b/moPepGen/util/brute_force.py
@@ -1050,27 +1050,26 @@ def translational_modification(self, seq:Seq, lhs:int, tx_lhs:int,
 
         # W > F
         if self.w2f:
-            w2f_indices = []
-            i = 0
-            while i > -1:
-                i = seq.find('W', start=i)
-                if i > -1:
-                    w2f_indices.append(i)
-                    i += 1
-                    if i > len(seq):
-                        break
-
-            for k in range(1, len(w2f_indices) + 1):
-                for comb in combinations(w2f_indices, k):
-                    seq_mod = seq
-                    for i in comb:
-                        seq_mod_new = seq_mod[:i] + 'F'
-                        if i + 1 < len(seq):
-                            seq_mod_new += seq_mod[i+1:]
-                        seq_mod = seq_mod_new
-                    candidates.append(seq_mod)
-                    if is_start:
-                        candidates.append(seq_mod[1:])
+            for candidate in copy.copy(candidates):
+                w2f_indices = []
+                i = 0
+                while i > -1:
+                    i = candidate.find('W', start=i)
+                    if i > -1:
+                        w2f_indices.append(i)
+                        i += 1
+                        if i > len(candidate):
+                            break
+
+                for k in range(1, len(w2f_indices) + 1):
+                    for comb in combinations(w2f_indices, k):
+                        seq_mod = candidate
+                        for i in comb:
+                            seq_mod_new = seq_mod[:i] + 'F'
+                            if i + 1 < len(candidate):
+                                seq_mod_new += seq_mod[i+1:]
+                            seq_mod = seq_mod_new
+                        candidates.append(seq_mod)
 
         for candidate in candidates:
             if self.peptide_is_valid(candidate, denylist, check_canonical):

From 45e4a6bd361aa907c6eeff869743b2bee2aea94a Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Wed, 17 May 2023 01:25:02 +0800
Subject: [PATCH 6/9] fix (ThreeFrameTVG): use a more dynamic cutoff of
 `max_in_bubble_variants` to limit the number of variants to consider as
 combinations in a node when alignment a variant bubble.

---
 moPepGen/svgraph/ThreeFrameTVG.py | 49 ++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py
index 44175eb7..4391e10f 100644
--- a/moPepGen/svgraph/ThreeFrameTVG.py
+++ b/moPepGen/svgraph/ThreeFrameTVG.py
@@ -1473,7 +1473,8 @@ def first_node_is_smaller(self, first:TVGNode, second:TVGNode) -> bool:
 
         return subgraph1.location < subgraph2.location
 
-    def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode]) -> bool:
+    def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode],
+            max_in_bubble_variants:int) -> bool:
         """ Check the total number of variants of given nodes """
         if self.cleavage_params.max_variants_per_node == -1:
             return False
@@ -1481,7 +1482,28 @@ def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode]) -> bool:
         for node in nodes:
             for variant in node.variants:
                 variants.add(variant.variant)
-        return len(variants) > self.cleavage_params.max_variants_per_node
+        return len(variants) > max_in_bubble_variants
+
+    @staticmethod
+    def get_max_in_bubble_variants(n:int) -> int:
+        """ Get the `max_in_bubble_variants` based on the total number of variants
+        in a variant bubble. The values are set so the number of combinations to
+        consider won't be too much more than 5,000. """
+        if n <= 12:
+            return -1
+        if n <= 14:
+            return 7
+        if n <= 15:
+            return 6
+        if n <= 16:
+            return 5
+        if n <= 21:
+            return 4
+        if n <= 34:
+            return 3
+        if n <= 100:
+            return 2
+        return 1
 
     def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
         r""" Aligns all variants at that overlaps to the same start and end
@@ -1510,6 +1532,18 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
             raise ValueError('reading_frame_index not found')
         end_node, members = self.find_variant_bubble(node)
 
+        member_variants = set()
+        for member in members:
+            member_variants.update([v.variant.id for v in member.variants])
+
+        max_in_bubble_variants = self.get_max_in_bubble_variants(len(member_variants))
+
+        if len(member_variants) >= 13:
+            err.warning(
+                f"Hypermutated region detected with {len(member_variants)} variants."
+                f" `max_in_bubble_variants` of {max_in_bubble_variants} is used."
+            )
+
         if not end_node:
             raise err.FailedToFindVariantBubbleError()
 
@@ -1601,14 +1635,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
 
                 trash.add(out_node)
 
-                if self.nodes_have_too_many_variants([cur, out_node]):
-                    if not self.hypermutated_region_warned:
-                        err.HypermutatedRegionWarning(
-                            self.id,
-                            self.cleavage_params.max_variants_per_node,
-                            self.cleavage_params.additional_variants_per_misc
-                        )
-                        self.hypermutated_region_warned = True
+                if self.nodes_have_too_many_variants([cur, out_node], max_in_bubble_variants):
                     continue
 
                 # create new node with the combined sequence
@@ -1810,6 +1837,8 @@ def fit_into_codons(self) -> None:
                 queue.appendleft(cur)
                 continue
 
+            cur_seq = str(cur.seq.seq)
+
             self.align_variants(cur)
 
             self.collapse_equivalent_nodes(cur)

From f0c51077437bbb31affd967a82f1d54a1cd4e4b9 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Wed, 17 May 2023 01:32:53 +0800
Subject: [PATCH 7/9] fix (ThreeFrameTVG): wrong variable was used

---
 moPepGen/svgraph/ThreeFrameTVG.py |  2 +-
 test/unit/test_three_frame_tvg.py | 43 -------------------------------
 2 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py
index 4391e10f..e8e4a97f 100644
--- a/moPepGen/svgraph/ThreeFrameTVG.py
+++ b/moPepGen/svgraph/ThreeFrameTVG.py
@@ -1476,7 +1476,7 @@ def first_node_is_smaller(self, first:TVGNode, second:TVGNode) -> bool:
     def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode],
             max_in_bubble_variants:int) -> bool:
         """ Check the total number of variants of given nodes """
-        if self.cleavage_params.max_variants_per_node == -1:
+        if max_in_bubble_variants == -1:
             return False
         variants = set()
         for node in nodes:
diff --git a/test/unit/test_three_frame_tvg.py b/test/unit/test_three_frame_tvg.py
index 9fab85d7..9df904a4 100644
--- a/test/unit/test_three_frame_tvg.py
+++ b/test/unit/test_three_frame_tvg.py
@@ -900,49 +900,6 @@ def test_align_variant_sub(self):
         expected = {'TA', 'TGCTGC'}
         self.assertEqual(out_node_seqs, expected)
 
-    def test_align_variantes_max_variants(self):
-        r""" Tests for aligning variants in heavily mutated local region
-
-            G
-           / \
-          | T |   T   A
-          |/ \|  / \ / \
-        AAT-A-C-C-C-T-T-G
-           \ /
-            C
-
-        """
-        var_data = [
-            (0, 'A', 'T', 'SNV', ''),
-            (0, 'A', 'G', 'SNV', ''),
-            (0, 'A', 'C', 'SNV', ''),
-            (0, 'C', 'G', 'SNV', ''),
-            (0, 'C', 'G', 'SNV', ''),
-            (0, 'T', 'A', 'SNV', ''),
-        ]
-        data = {
-            1:  ['AAT', ['RF0'], [], 1],
-            2:  ['A', [1], []],
-            3:  ['T', [1], [var_data[0]]],
-            4:  ['G', [1], [var_data[1]]],
-            5:  ['C', [1], [var_data[2]]],
-            6:  ['C', [2,3,4,5], []],
-            7:  ['C', [6], []],
-            8:  ['C', [6], [var_data[3]]],
-            9:  ['T', [7,8], []],
-            10: ['T', [9], []],
-            11: ['A', [9], [var_data[4]]],
-            12: ['G', [10,11], []]
-        }
-
-        graph, nodes = create_three_frame_tvg(data, 'AATACCTTG')
-        graph.cleavage_params.max_variants_per_node = 2
-        graph.align_variants(nodes[1])
-        for edge in nodes[1].out_edges:
-            out_node = edge.out_node
-            self.assertTrue(len(out_node.variants) <=
-                graph.cleavage_params.max_variants_per_node)
-
     def test_expand_alignments(self):
         r""" find_known_orf wihtout mutation
 

From 64d645facb33407c9ef8f59d066127a082d18059 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Wed, 17 May 2023 01:36:44 +0800
Subject: [PATCH 8/9] doc (CHANGELOG): changelog updated

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 545b8cdd..70ca8c9d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -48,6 +48,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 - Fixed `callVariant` to skip peptides earlier if they are either too long or too short to significantly improve efficiency. #736
 
+- Fixed `callVariant` to handle hypermutated region with a dynamic cutoff. #738
+
 ## [0.11.5] - 2023-3-5
 
 ### Fixed

From b59cc8f5bd4b48ce93e170b0e7f0676feefa3bf0 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Wed, 17 May 2023 13:13:25 +0800
Subject: [PATCH 9/9] style (ThreeFrameTVG): function to staticmethod

---
 moPepGen/svgraph/ThreeFrameTVG.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py
index e8e4a97f..94f5645b 100644
--- a/moPepGen/svgraph/ThreeFrameTVG.py
+++ b/moPepGen/svgraph/ThreeFrameTVG.py
@@ -1473,7 +1473,8 @@ def first_node_is_smaller(self, first:TVGNode, second:TVGNode) -> bool:
 
         return subgraph1.location < subgraph2.location
 
-    def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode],
+    @staticmethod
+    def nodes_have_too_many_variants(nodes:Iterable[TVGNode],
             max_in_bubble_variants:int) -> bool:
         """ Check the total number of variants of given nodes """
         if max_in_bubble_variants == -1:
@@ -1837,8 +1838,6 @@ def fit_into_codons(self) -> None:
                 queue.appendleft(cur)
                 continue
 
-            cur_seq = str(cur.seq.seq)
-
             self.align_variants(cur)
 
             self.collapse_equivalent_nodes(cur)