From 81ace30e097e9277adbdd7d4ed1436247978af8f Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Sun, 7 May 2023 00:10:18 +0800 Subject: [PATCH 1/9] fix (svgraph): update data structure to improve performance by using dict instead of set to avoid hashing the entire object again and again --- moPepGen/svgraph/PVGNode.py | 2 +- moPepGen/svgraph/VariantPeptideDict.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/moPepGen/svgraph/PVGNode.py b/moPepGen/svgraph/PVGNode.py index 673ac7ce..fb33aaef 100644 --- a/moPepGen/svgraph/PVGNode.py +++ b/moPepGen/svgraph/PVGNode.py @@ -744,7 +744,7 @@ def get_downstream_stop_altering_variants(self) -> Set[VariantRecord]: """ Get downstream stop altering variants """ final_variants = set() for out_node in self.out_nodes: - if out_node.seq.seq == '*': + if len(out_node.seq.seq) == 1 and out_node.seq.seq.startswith('*'): stop_alts = set() stop_alts.update([x.variant for x in out_node.variants if x.is_stop_altering and not x.downstream_cleavage_altering]) diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py index 7f579a1f..3a8931d4 100644 --- a/moPepGen/svgraph/VariantPeptideDict.py +++ b/moPepGen/svgraph/VariantPeptideDict.py @@ -104,11 +104,13 @@ def find_miscleaved_nodes(node:PVGNode, orfs:List[PVGOrf], while queue: cur_batch = queue.pop() cur_node = cur_batch[-1] - batch_vars:Set[VariantRecord] = set() + # Turn it into a doct of id to variant + batch_vars:Dict[str, VariantRecord] = {} for _node in cur_batch: for var in _node.variants: - batch_vars.add(var.variant) + if var.variant.id not in batch_vars: + batch_vars[var.variant.id] = var.variant n_cleavages = len([x for x in cur_batch if not x.cpop_collapsed]) - 1 @@ -138,7 +140,7 @@ def find_miscleaved_nodes(node:PVGNode, orfs:List[PVGOrf], additional_variants = _node.get_downstream_stop_altering_variants() - cur_vars = {v.id for v in batch_vars} + cur_vars = set(batch_vars.keys()) for var in _node.variants: cur_vars.add(var.variant.id) cur_vars.update({v.id for v in additional_variants}) @@ -188,7 +190,7 @@ def join_miscleaved_peptides(self, check_variants:bool, metadata = VariantPeptideMetadata() seq:str = None variants:Set[VariantRecord] = set() - in_seq_variants:Set[VariantRecord] = set() + in_seq_variants:Dict[str,VariantRecord] = {} selenocysteines = [] @@ -211,7 +213,8 @@ def join_miscleaved_peptides(self, check_variants:bool, continue variants.add(variant.variant) if not variant.variant.is_circ_rna(): - in_seq_variants.add(variant.variant) + if variant.variant.id not in in_seq_variants: + in_seq_variants[variant.variant.id] = variant.variant if i < len(queue) - 1: _node = self.leading_node if i == 0 else node indels = queue[i + 1].upstream_indel_map.get(_node) @@ -224,7 +227,7 @@ def join_miscleaved_peptides(self, check_variants:bool, if any(v.is_circ_rna() for v in variants) \ or any(v.is_circ_rna() for v in orf.start_gain): if orf.is_valid_orf_to_misc_nodes(queue, self.subgraphs, circ_rna): - if any(n.is_missing_any_variant(in_seq_variants) for n in queue): + if any(n.is_missing_any_variant(in_seq_variants.values()) for n in queue): continue metadata.orf = tuple(orf.orf) valid_orf = orf From 0e9b09dd1689dafb5bfd6e7a8d29e368ea191071 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Mon, 8 May 2023 20:09:42 +0800 Subject: [PATCH 2/9] fix (VariantPeptideDict): turn more set into dict --- moPepGen/svgraph/VariantPeptideDict.py | 38 ++++++++++++++++---------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py index 3a8931d4..84ed9309 100644 --- a/moPepGen/svgraph/VariantPeptideDict.py +++ b/moPepGen/svgraph/VariantPeptideDict.py @@ -189,7 +189,7 @@ def join_miscleaved_peptides(self, check_variants:bool, queue = series.nodes metadata = VariantPeptideMetadata() seq:str = None - variants:Set[VariantRecord] = set() + variants:Dict[str,VariantRecord] = {} in_seq_variants:Dict[str,VariantRecord] = {} selenocysteines = [] @@ -211,7 +211,8 @@ def join_miscleaved_peptides(self, check_variants:bool, continue if variant.upstream_cleavage_altering: continue - variants.add(variant.variant) + if variant.variant.id not in variants: + variants[variant.variant.id] = variant.variant if not variant.variant.is_circ_rna(): if variant.variant.id not in in_seq_variants: in_seq_variants[variant.variant.id] = variant.variant @@ -220,11 +221,12 @@ def join_miscleaved_peptides(self, check_variants:bool, indels = queue[i + 1].upstream_indel_map.get(_node) if indels: for variant in indels: - variants.add(variant) + if variant.id not in variants: + variants[variant.id] = variant valid_orf = None for orf in self.orfs: - if any(v.is_circ_rna() for v in variants) \ + if any(v.is_circ_rna() for v in variants.values()) \ or any(v.is_circ_rna() for v in orf.start_gain): if orf.is_valid_orf_to_misc_nodes(queue, self.subgraphs, circ_rna): if any(n.is_missing_any_variant(in_seq_variants.values()) for n in queue): @@ -241,16 +243,24 @@ def join_miscleaved_peptides(self, check_variants:bool, continue cleavage_gain_down = queue[-1].get_cleavage_gain_from_downstream() - variants.update(cleavage_gain_down) + for v in cleavage_gain_down: + if v not in variants: + variants[v.id] = v if check_variants: - variants.update(valid_orf.start_gain) - variants.update(additional_variants) - variants.update(series.additional_variants) - - if any(v.is_circ_rna() for v in variants) \ + for v in valid_orf.start_gain: + if v.id not in variants: + variants[v.id] = v + for v in additional_variants: + if v.id not in variants: + variants[v.id] = v + for v in series.additional_variants: + if v.id not in variants: + variants[v.id] = v + + if any(v.is_circ_rna() for v in variants.values()) \ and queue[-1].any_unaccounted_downstream_cleavage_or_stop_altering( - {x for x in variants if not x.is_circ_rna()}): + {x for x in variants.values() if not x.is_circ_rna()}): continue if not seq: @@ -271,10 +281,10 @@ def join_miscleaved_peptides(self, check_variants:bool, continue metadata.is_pure_circ_rna = len(variants) == 1 and \ - list(variants)[0].is_circ_rna() + list(variants.values())[0].is_circ_rna() seqs = self.translational_modification(seq, metadata, denylist, - variants, is_start_codon, selenocysteines, substitutants, + variants.values(), is_start_codon, selenocysteines, substitutants, check_variants, check_external_variants ) for seq, metadata in seqs: @@ -371,7 +381,7 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata, if is_valid or is_valid_start: cur_metadata = copy.copy(metadata) - cur_variants = copy.copy(variants) + cur_variants = set(variants) cur_variants.update(comb) label = vpi.create_variant_peptide_id( transcript_id=self.tx_id, variants=cur_variants, orf_id=None, From d76caf6ddca753e24985f57a876a940f0e4a4bd8 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Sun, 14 May 2023 16:03:11 +0800 Subject: [PATCH 3/9] fix (callVariant): skip peptides earlier if they are either too long or too short to significantly improve efficiency. #736 --- moPepGen/cli/call_noncoding_peptide.py | 3 +- moPepGen/svgraph/PVGNode.py | 3 +- moPepGen/svgraph/PeptideVariantGraph.py | 5 +- moPepGen/svgraph/VariantPeptideDict.py | 474 ++++++++++++++---------- test/unit/test_peptide_variant_graph.py | 8 + test/unit/test_variant_peptide_dict.py | 2 +- 6 files changed, 293 insertions(+), 202 deletions(-) diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py index de54dc73..324267ed 100644 --- a/moPepGen/cli/call_noncoding_peptide.py +++ b/moPepGen/cli/call_noncoding_peptide.py @@ -200,7 +200,8 @@ def call_noncoding_peptide_main(tx_id:str, tx_model:TranscriptAnnotationModel, check_orf=True, denylist=canonical_peptides, orf_assignment=orf_assignment, - w2f=w2f_reassignment + w2f=w2f_reassignment, + check_external_variants=False ) orfs = get_orf_sequences(pgraph, tx_id, tx_model.gene_id, tx_seq) return peptides, orfs diff --git a/moPepGen/svgraph/PVGNode.py b/moPepGen/svgraph/PVGNode.py index fb33aaef..523231af 100644 --- a/moPepGen/svgraph/PVGNode.py +++ b/moPepGen/svgraph/PVGNode.py @@ -305,8 +305,9 @@ def get_cleavage_gain_variants(self) -> List[seqvar.VariantRecord]: def get_cleavage_gain_from_downstream(self) -> List[seqvar.VariantRecord]: """ Get the variants that gains the cleavage by downstream nodes """ cleavage_gain = [] + seq_len = len(self.seq.seq) upstream_cleave_alts = [v.variant for v in self.variants - if v.location.end == len(self.seq.seq)] + if v.location.end == seq_len] for node in self.out_nodes: if not node.variants: return [] diff --git a/moPepGen/svgraph/PeptideVariantGraph.py b/moPepGen/svgraph/PeptideVariantGraph.py index 075279e9..7187a931 100644 --- a/moPepGen/svgraph/PeptideVariantGraph.py +++ b/moPepGen/svgraph/PeptideVariantGraph.py @@ -809,7 +809,8 @@ def call_variant_peptides(self, check_variants:bool=True, gene_id=self.gene_id, truncate_sec=truncate_sec, w2f=w2f, - check_external_variants=check_external_variants + check_external_variants=check_external_variants, + cleavage_params=self.cleavage_params ) traversal = PVGTraversal( check_variants=check_variants, check_orf=check_orf, @@ -855,6 +856,8 @@ def call_variant_peptides(self, check_variants:bool=True, if check_orf: self.create_orf_id_map() + peptide_pool.translational_modification(w2f, self.denylist) + return peptide_pool.get_peptide_sequences( keep_all_occurrence=keep_all_occurrence, orf_id_map=self.orf_id_map ) diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py index 84ed9309..37880c5a 100644 --- a/moPepGen/svgraph/VariantPeptideDict.py +++ b/moPepGen/svgraph/VariantPeptideDict.py @@ -19,6 +19,16 @@ from moPepGen.seqvar.VariantRecord import VariantRecord from moPepGen import params +class VariantPeptideMetadata(): + """ Variant peptide metadata """ + def __init__(self, label:str=None, orf:Tuple[int,int]=None, + is_pure_circ_rna:bool=False): + """ """ + self.label = label + self.orf = orf + self.is_pure_circ_rna = is_pure_circ_rna + +T = Dict[aa.AminoAcidSeqRecord, Set[VariantPeptideMetadata]] class MiscleavedNodeSeries(): """ Helper class when calling for miscleavage peptides. The nodes contained @@ -27,6 +37,27 @@ def __init__(self, nodes:List[PVGNode], additional_variants:Set[VariantRecord]): """ """ self.nodes = nodes self.additional_variants = additional_variants + self._len = None + + def __len__(self) -> int: + """ Get length of node sequences """ + if self._len is None: + self._len = sum([len(x.seq.seq) for x in self.nodes]) + return self._len + + def is_too_short(self, param:params.CleavageParams) -> bool: + """ Checks whether the sequence is too short """ + return len(self) < param.min_length + + def is_too_long(self, param:params.CleavageParams) -> bool: + """ Checks whether the sequence is too long """ + return not( + len(self) <= param.max_length + or ( + self.nodes[0].seq.seq.startswith('M') + and len(self) <= param.max_length + 1 + ) + ) class MiscleavedNodes(): """ Helper class for looking for peptides with miscleavages. This class @@ -46,7 +77,8 @@ class MiscleavedNodes(): def __init__(self, data:Deque[MiscleavedNodeSeries], cleavage_params:params.CleavageParams, orfs:List[PVGOrf]=None, tx_id:str=None, gene_id:str=None, - leading_node:PVGNode=None, subgraphs:SubgraphTree=None): + leading_node:PVGNode=None, subgraphs:SubgraphTree=None, + is_circ_rna:bool=False): """ constructor """ self.data = data self.orfs = orfs or [] @@ -55,120 +87,30 @@ def __init__(self, data:Deque[MiscleavedNodeSeries], self.cleavage_params = cleavage_params self.leading_node = leading_node self.subgraphs = subgraphs + self.is_circ_rna = is_circ_rna - @staticmethod - def find_miscleaved_nodes(node:PVGNode, orfs:List[PVGOrf], - cleavage_params:params.CleavageParams, tx_id:str, gene_id:str, - leading_node:PVGNode, subgraphs:SubgraphTree, is_circ_rna:bool - ) -> MiscleavedNodes: - """ Find all miscleaved nodes. - - node vs leading_node: - - When calling this function from a node within a PVG graph, a copy - of the node should be first created. This is because if the ORF - start site is located in the node, the peptide sequence should be - called form the ORF start site, rather than the beginning of the - sequence. So then a copy should be created from this node and - only keep the sequence after the ORF start. - - The `leading_node` must tbe the original copy of the node within - the graph, that the `node` is copied from. This is used to - retrieve INDELs from the downstream node. - - Args: - - `node` (PVGNode): The node that miscleaved peptide sequences - starts from it should be called. - - `orfs` (List[PVGPOrf]): The ORF start and end locations. - - `cleavage_params` (CleavageParams): Cleavage related parameters. - - `tx_id` (str): Transcript ID. - - `leading_node` (PVGNode): The start node that the miscleaved - peptides are called from. This node must present in the PVG graph. - """ - if not orfs: - raise ValueError('ORFs are empty') - queue = deque([[node]]) - nodes = MiscleavedNodes( - data=deque([]), - cleavage_params=cleavage_params, - orfs=orfs, - tx_id=tx_id, - gene_id=gene_id, - leading_node=leading_node, - subgraphs=subgraphs - ) - - if not (node.cpop_collapsed or node.truncated): - additional_variants = leading_node.get_downstream_stop_altering_variants() - series = MiscleavedNodeSeries([node], additional_variants) - nodes.data.append(series) - - while queue: - cur_batch = queue.pop() - cur_node = cur_batch[-1] - # Turn it into a doct of id to variant - batch_vars:Dict[str, VariantRecord] = {} - - for _node in cur_batch: - for var in _node.variants: - if var.variant.id not in batch_vars: - batch_vars[var.variant.id] = var.variant - - n_cleavages = len([x for x in cur_batch if not x.cpop_collapsed]) - 1 - - if n_cleavages >= cleavage_params.miscleavage: - continue - - # This is done to reduce the complexity when the node has too - # many out nodes, and its out nodes also have too many out nodes. - if cleavage_params.additional_variants_per_misc == -1: - allowed_n_vars = float('Inf') - else: - allowed_n_vars = cleavage_params.max_variants_per_node - allowed_n_vars += (n_cleavages + 1) * cleavage_params.additional_variants_per_misc - - for _node in cur_node.out_nodes: - if is_circ_rna and _node.is_hybrid_node(subgraphs): - continue - if _node.truncated: - continue - new_batch = copy.copy(cur_batch) - - is_stop = _node.seq.seq == '*' - if is_stop: - continue - - new_batch.append(_node) - - additional_variants = _node.get_downstream_stop_altering_variants() - - cur_vars = set(batch_vars.keys()) - for var in _node.variants: - cur_vars.add(var.variant.id) - cur_vars.update({v.id for v in additional_variants}) - if len(cur_vars) > allowed_n_vars: - continue - - if not _node.cpop_collapsed: - series = MiscleavedNodeSeries(copy.copy(new_batch), additional_variants) - nodes.data.append(series) - if n_cleavages + 1 == cleavage_params.miscleavage: - continue - if n_cleavages + 1 > cleavage_params.miscleavage: - raise ValueError('Something just went wrong') - queue.append(new_batch) - return nodes - - def is_valid_seq(self, seq:Seq, denylist:Set[str]) -> bool: + def is_valid_seq(self, seq:Seq, pool:Set[Seq], denylist:Set[str]) -> bool: """ Check whether the seq is valid """ - min_length = self.cleavage_params.min_length - max_length = self.cleavage_params.max_length + if seq in pool: + return True min_mw = self.cleavage_params.min_mw - - return seq not in denylist \ - and min_length <= len(seq) <= max_length \ + return self.seq_has_valid_size(seq) \ + and seq not in denylist \ and 'X' not in seq \ and SeqUtils.molecular_weight(seq, 'protein') >= min_mw - def join_miscleaved_peptides(self, check_variants:bool, + def seq_has_valid_size(self, seq:Seq=None, size:int=None) -> bool: + """ Check whether the seq has valid length """ + if seq is None and size is None: + raise ValueError + min_length = self.cleavage_params.min_length + max_length = self.cleavage_params.max_length + if size is None: + size = len(seq) + + return min_length <= size <= max_length + + def join_miscleaved_peptides(self, pool:T, check_variants:bool, additional_variants:List[VariantRecord], denylist:Set[str], is_start_codon:bool=False, circ_rna:circ.CircRNAModel=None, truncate_sec:bool=False, w2f:bool=False, @@ -188,7 +130,8 @@ def join_miscleaved_peptides(self, check_variants:bool, for series in self.data: queue = series.nodes metadata = VariantPeptideMetadata() - seq:str = None + seqs_to_join:List[Seq] = [] + size:int = 0 variants:Dict[str,VariantRecord] = {} in_seq_variants:Dict[str,VariantRecord] = {} @@ -196,14 +139,13 @@ def join_miscleaved_peptides(self, check_variants:bool, for i, node in enumerate(queue): other = str(node.seq.seq) - if seq is None: - seq = other - if truncate_sec: - selenocysteines = copy.copy(node.selenocysteines) - else: - if truncate_sec: - selenocysteines += [x.shift(len(seq)) for x in node.selenocysteines] - seq = seq + other + if truncate_sec: + if size > 0: + selenocysteines += [x.shift(size) for x in node.selenocysteines] + else: + selenocysteines += node.selenocysteines + seqs_to_join.append(other) + size += len(other) if check_variants: for variant in node.variants: @@ -226,8 +168,9 @@ def join_miscleaved_peptides(self, check_variants:bool, valid_orf = None for orf in self.orfs: - if any(v.is_circ_rna() for v in variants.values()) \ - or any(v.is_circ_rna() for v in orf.start_gain): + if self.is_circ_rna \ + and (any(v.is_circ_rna() for v in variants.values()) \ + or any(v.is_circ_rna() for v in orf.start_gain)): if orf.is_valid_orf_to_misc_nodes(queue, self.subgraphs, circ_rna): if any(n.is_missing_any_variant(in_seq_variants.values()) for n in queue): continue @@ -258,58 +201,49 @@ def join_miscleaved_peptides(self, check_variants:bool, if v.id not in variants: variants[v.id] = v - if any(v.is_circ_rna() for v in variants.values()) \ + if self.is_circ_rna \ + and any(v.is_circ_rna() for v in variants.values()) \ and queue[-1].any_unaccounted_downstream_cleavage_or_stop_altering( {x for x in variants.values() if not x.is_circ_rna()}): continue - if not seq: - continue - - if seq in denylist: + if size == 0: continue - seq = Seq(seq) - substitutants = self.find_codon_reassignments(seq, w2f) - if check_variants: if check_external_variants: if not variants: continue else: - if not (variants or selenocysteines or substitutants): + if not (variants or selenocysteines): continue - metadata.is_pure_circ_rna = len(variants) == 1 and \ - list(variants.values())[0].is_circ_rna() + if not selenocysteines \ + and not self.seq_has_valid_size(size=size) \ + and not (seqs_to_join[0].startswith('M') + and self.seq_has_valid_size(size=size-1)): + continue + + seq = Seq(''.join(seqs_to_join)) + if not seq in pool and seq in denylist: + continue + + metadata.is_pure_circ_rna = self.is_circ_rna \ + and len(variants) == 1 \ + and list(variants.values())[0].is_circ_rna() seqs = self.translational_modification(seq, metadata, denylist, - variants.values(), is_start_codon, selenocysteines, substitutants, - check_variants, check_external_variants + variants.values(), is_start_codon, selenocysteines, + check_variants, check_external_variants, pool ) for seq, metadata in seqs: yield seq, metadata - def find_codon_reassignments(self, seq:Seq, w2f:bool=False) -> List[VariantRecord]: - """ Find potential codon reassignments from the given sequence. """ - variants = [] - if w2f: - i = 0 - while i > -1: - i = seq.find('W', start=i) - if i > -1: - w2f_var = create_variant_w2f(self.tx_id, i) - variants.append(w2f_var) - i += 1 - if i >= len(seq): - break - return variants def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata, denylist:Set[str], variants:Set[VariantRecord], is_start_codon:bool, selenocysteines:List[seqvar.VariantRecordWithCoordinate], - reassignments:List[VariantRecord], check_variants:bool, - check_external_variants:bool + check_variants:bool, check_external_variants:bool, pool:Set[Seq] ) -> Iterable[Tuple[aa.AminoAcidSeqRecord,VariantPeptideMetadata]]: """ Apply any modification that could happen during translation. The kinds of modifications that could happen are: @@ -317,9 +251,10 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata, 2. Selenocysteine truncation. """ if variants or not check_variants: - is_valid = self.is_valid_seq(seq, denylist) - is_valid_start = is_start_codon and seq.startswith('M') and\ - self.is_valid_seq(seq[1:], denylist) + is_valid = self.is_valid_seq(seq, pool, denylist) + + is_valid_start = is_start_codon and seq.startswith('M') and\ + self.is_valid_seq(seq[1:], pool, denylist) if is_valid or is_valid_start: cur_metadata = copy.copy(metadata) @@ -340,9 +275,9 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata, # Selenocysteine termination for sec in selenocysteines: seq_mod = seq[:sec.location.start] - is_valid = self.is_valid_seq(seq_mod, denylist) + is_valid = self.is_valid_seq(seq_mod, pool, denylist) is_valid_start = is_start_codon and seq_mod.startswith('M') and\ - self.is_valid_seq(seq_mod[1:], denylist) + self.is_valid_seq(seq_mod[1:], pool, denylist) if is_valid or is_valid_start: cur_metadata = copy.copy(metadata) @@ -365,37 +300,13 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata, aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod[1:]) yield aa_seq, cur_metadata - # W > F codon reassignments - for k in range(1, len(reassignments) + 1): - for comb in itertools.combinations(reassignments, k): - seq_mod = seq - for v in comb: - seq_mod_new = seq_mod[:v.location.start] + v.alt - if v.location.end < len(seq): - seq_mod_new += seq_mod[v.location.end:] - seq_mod = seq_mod_new - - is_valid = self.is_valid_seq(seq_mod, denylist) - is_valid_start = is_start_codon and seq_mod.startswith('M') and\ - self.is_valid_seq(seq_mod[1:], denylist) - - if is_valid or is_valid_start: - cur_metadata = copy.copy(metadata) - cur_variants = set(variants) - cur_variants.update(comb) - label = vpi.create_variant_peptide_id( - transcript_id=self.tx_id, variants=cur_variants, orf_id=None, - gene_id=self.gene_id - ) - cur_metadata.label = label + # if check_external_variants and not variants: + # return + + # if not self.seq_has_valid_size(seq): + # return - if is_valid: - aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod) - yield aa_seq, cur_metadata - if is_valid_start: - aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod[1:]) - yield aa_seq, cur_metadata @staticmethod def any_overlaps_and_all_missing_variant(nodes:Iterable[PVGNode], variant:VariantRecord): @@ -457,17 +368,6 @@ def update_peptide_pool(seq:aa.AminoAcidSeqRecord, + str(label_counter[label]) peptide_pool.add(seq) -class VariantPeptideMetadata(): - """ Variant peptide metadata """ - def __init__(self, label:str=None, orf:Tuple[int,int]=None, - is_pure_circ_rna:bool=False): - """ """ - self.label = label - self.orf = orf - self.is_pure_circ_rna = is_pure_circ_rna - -T = Dict[aa.AminoAcidSeqRecord, Set[VariantPeptideMetadata]] - class VariantPeptideDict(): """ Variant peptide pool as dict. @@ -483,18 +383,130 @@ class VariantPeptideDict(): every node. This is useful for PVG derived from a circRNA, and this argument is the circRNA variant. """ - def __init__(self, tx_id:str, peptides:T=None, labels:Dict[str,int]=None, - global_variant:VariantRecord=None, gene_id:str=None, - truncate_sec:bool=False, w2f:bool=False, check_external_variants:bool=True): + def __init__(self, tx_id:str, peptides:T=None, seqs:Set[Seq]=None, + labels:Dict[str,int]=None, global_variant:VariantRecord=None, + gene_id:str=None, truncate_sec:bool=False, w2f:bool=False, + check_external_variants:bool=True, + cleavage_params:params.CleavageParams=None): """ constructor """ self.tx_id = tx_id self.peptides = peptides or {} + self.seqs = seqs or set() self.labels = labels or {} self.global_variant = global_variant self.gene_id = gene_id self.truncate_sec = truncate_sec self.w2f = w2f self.check_external_variants = check_external_variants + self.cleavage_params = cleavage_params + + def find_miscleaved_nodes(self, node:PVGNode, orfs:List[PVGOrf], + cleavage_params:params.CleavageParams, tx_id:str, gene_id:str, + leading_node:PVGNode, subgraphs:SubgraphTree, is_circ_rna:bool + ) -> MiscleavedNodes: + """ Find all miscleaved nodes. + + node vs leading_node: + - When calling this function from a node within a PVG graph, a copy + of the node should be first created. This is because if the ORF + start site is located in the node, the peptide sequence should be + called form the ORF start site, rather than the beginning of the + sequence. So then a copy should be created from this node and + only keep the sequence after the ORF start. + - The `leading_node` must tbe the original copy of the node within + the graph, that the `node` is copied from. This is used to + retrieve INDELs from the downstream node. + + Args: + - `node` (PVGNode): The node that miscleaved peptide sequences + starts from it should be called. + - `orfs` (List[PVGPOrf]): The ORF start and end locations. + - `cleavage_params` (CleavageParams): Cleavage related parameters. + - `tx_id` (str): Transcript ID. + - `leading_node` (PVGNode): The start node that the miscleaved + peptides are called from. This node must present in the PVG graph. + """ + if not orfs: + raise ValueError('ORFs are empty') + queue = deque([[node]]) + nodes = MiscleavedNodes( + data=deque([]), + cleavage_params=cleavage_params, + orfs=orfs, + tx_id=tx_id, + gene_id=gene_id, + leading_node=leading_node, + subgraphs=subgraphs, + is_circ_rna=is_circ_rna + ) + + if not (node.cpop_collapsed or node.truncated): + additional_variants = leading_node.get_downstream_stop_altering_variants() + series = MiscleavedNodeSeries([node], additional_variants) + if series.is_too_long(self.cleavage_params) and not node.selenocysteines: + return nodes + if not series.is_too_short(self.cleavage_params): + nodes.data.append(series) + + while queue: + cur_batch = queue.pop() + cur_node = cur_batch[-1] + # Turn it into a doct of id to variant + batch_vars:Dict[str, VariantRecord] = {} + + for _node in cur_batch: + for var in _node.variants: + if var.variant.id not in batch_vars: + batch_vars[var.variant.id] = var.variant + + n_cleavages = len([x for x in cur_batch if not x.cpop_collapsed]) - 1 + + if n_cleavages >= cleavage_params.miscleavage: + continue + + # This is done to reduce the complexity when the node has too + # many out nodes, and its out nodes also have too many out nodes. + if cleavage_params.additional_variants_per_misc == -1: + allowed_n_vars = float('Inf') + else: + allowed_n_vars = cleavage_params.max_variants_per_node + allowed_n_vars += (n_cleavages + 1) * cleavage_params.additional_variants_per_misc + + for _node in cur_node.out_nodes: + if is_circ_rna and _node.is_hybrid_node(subgraphs): + continue + if _node.truncated: + continue + new_batch = copy.copy(cur_batch) + + is_stop = len(_node.seq.seq) == 1 and _node.seq.seq.startswith('*') + if is_stop: + continue + + new_batch.append(_node) + + additional_variants = _node.get_downstream_stop_altering_variants() + + cur_vars = set(batch_vars.keys()) + for var in _node.variants: + cur_vars.add(var.variant.id) + cur_vars.update({v.id for v in additional_variants}) + if len(cur_vars) > allowed_n_vars: + continue + + if not _node.cpop_collapsed: + series = MiscleavedNodeSeries(copy.copy(new_batch), additional_variants) + if series.is_too_long(self.cleavage_params) \ + and not _node.selenocysteines: + continue + if not series.is_too_short(self.cleavage_params): + nodes.data.append(series) + if n_cleavages + 1 == cleavage_params.miscleavage: + continue + if n_cleavages + 1 > cleavage_params.miscleavage: + raise ValueError('Something just went wrong') + queue.append(new_batch) + return nodes def add_miscleaved_sequences(self, node:PVGNode, orfs:List[PVGOrf], cleavage_params:params.CleavageParams, @@ -522,14 +534,16 @@ def add_miscleaved_sequences(self, node:PVGNode, orfs:List[PVGOrf], """ if leading_node is None: leading_node = node - miscleaved_nodes = MiscleavedNodes.find_miscleaved_nodes( + miscleaved_nodes = self.find_miscleaved_nodes( node=node, orfs=orfs, cleavage_params=cleavage_params, tx_id=self.tx_id, gene_id=self.gene_id, leading_node=leading_node, subgraphs=subgraphs, is_circ_rna=circ_rna is not None ) if self.global_variant and self.global_variant not in additional_variants: additional_variants.append(self.global_variant) + seqs = miscleaved_nodes.join_miscleaved_peptides( + pool=self.seqs, check_variants=check_variants, additional_variants=additional_variants, denylist=denylist, @@ -546,6 +560,70 @@ def add_miscleaved_sequences(self, node:PVGNode, orfs:List[PVGOrf], raise ValueError('Invalid amino acid symbol found in the sequence.') val = self.peptides.setdefault(seq, set()) val.add(metadata) + self.seqs.add(seq.seq) + + + def find_codon_reassignments(self, seq:Seq, w2f:bool=False) -> List[VariantRecord]: + """ Find potential codon reassignments from the given sequence. """ + variants = [] + if w2f: + i = 0 + while i > -1: + i = seq.find('W', start=i) + if i > -1: + w2f_var = create_variant_w2f(self.tx_id, i) + variants.append(w2f_var) + i += 1 + if i >= len(seq): + break + return variants + + def is_valid_seq(self, seq:Seq, denylist:Set[str]) -> bool: + """ Check whether the seq is valid """ + if seq in self.seqs: + return True + min_mw = self.cleavage_params.min_mw + return self.seq_has_valid_size(seq) \ + and seq not in denylist \ + and 'X' not in seq \ + and SeqUtils.molecular_weight(seq, 'protein') >= min_mw + + def seq_has_valid_size(self, seq:Seq) -> bool: + """ Check whether the seq has valid length """ + min_length = self.cleavage_params.min_length + max_length = self.cleavage_params.max_length + + return min_length <= len(seq) <= max_length + + def translational_modification(self, w2f:bool, denylist:Set[str]): + """ Sequence level translational modification, e.g., W2F reassignment """ + for seq in copy.copy(self.peptides): + reassignments:List[seqvar.VariantRecord] = [] + reassignments += self.find_codon_reassignments(seq.seq, w2f) + if not reassignments: + continue + + # W > F codon reassignments + for k in range(1, len(reassignments) + 1): + for comb in itertools.combinations(reassignments, k): + seq_mod = seq.seq + for v in comb: + seq_mod_new = seq_mod[:v.location.start] + v.alt + if v.location.end < len(seq): + seq_mod_new += seq_mod[v.location.end:] + seq_mod = seq_mod_new + + if not self.is_valid_seq(seq_mod, denylist): + continue + for metadata in self.peptides[seq]: + cur_metadata = copy.copy(metadata) + cur_metadata.label += '|'.join(v.id for v in comb) + + aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod) + + val = self.peptides.setdefault(aa_seq, set()) + val.add(cur_metadata) + self.seqs.add(aa_seq.seq) def get_peptide_sequences(self, keep_all_occurrence:bool=True, orf_id_map:Dict[Tuple[int,int],str]=None diff --git a/test/unit/test_peptide_variant_graph.py b/test/unit/test_peptide_variant_graph.py index 67d46e2f..91f546b2 100644 --- a/test/unit/test_peptide_variant_graph.py +++ b/test/unit/test_peptide_variant_graph.py @@ -600,6 +600,7 @@ def test_call_and_stage_known_orf_incds(self): graph, nodes = create_pgraph(data, 'ENST0001') graph.known_orf = [0,30] pool = VariantPeptideDict(graph.id) + pool.cleavage_params = graph.cleavage_params traversal = PVGTraversal(True, False, pool, (0,30), (0,10)) orf = PVGOrf([0, None]) cursor = PVGCursor(nodes[1], nodes[2], True, [orf]) @@ -627,6 +628,7 @@ def test_call_and_stage_known_orf_start_and_frameshift(self): nodes[4].reading_frame_index = 2 graph.known_orf = [18,60] pool = VariantPeptideDict(graph.id) + pool.cleavage_params = graph.cleavage_params traversal = PVGTraversal(True, False, pool, (18,60), (6,20)) orf = PVGOrf([0, None], set()) cursor = PVGCursor(nodes[1], nodes[2], False, [orf]) @@ -650,6 +652,7 @@ def test_call_and_stage_known_orf_multiple_methionine(self): graph, nodes = create_pgraph(data, 'ENST0001') graph.known_orf = [24,90] pool = VariantPeptideDict(graph.id) + pool.cleavage_params = graph.cleavage_params traversal = PVGTraversal(True, False, pool, (24,90), (8,30)) cursor = PVGCursor(nodes[1], nodes[2], False, [0, None], []) graph.call_and_stage_known_orf(cursor, traversal) @@ -673,6 +676,7 @@ def test_call_and_stage_known_orf_start_altering(self): graph.cds_start_nf = True graph.known_orf = [6,90] pool = VariantPeptideDict(graph.id) + pool.cleavage_params = graph.cleavage_params traversal = PVGTraversal(True, False, pool, (6,90), (2,30)) cursor = PVGCursor(graph.root, nodes[2], False, [0, None], []) graph.call_and_stage_known_orf(cursor, traversal) @@ -696,6 +700,7 @@ def test_call_and_stage_known_orf_cds_stop_gain(self): nodes[5].variants[0].is_stop_altering = True graph.known_orf = [0,90] pool = VariantPeptideDict(graph.id) + pool.cleavage_params = graph.cleavage_params traversal = PVGTraversal(True, False, pool, (0,90), (0,30)) orf = PVGOrf([0, None]) cursor = PVGCursor(nodes[2], nodes[4], True, [orf]) @@ -722,6 +727,7 @@ def test_call_and_stage_known_orf_cds_stop_lost(self): graph, nodes = create_pgraph(data, 'ENST0001') graph.known_orf = [0,39] pool = VariantPeptideDict(graph.id) + pool.cleavage_params = graph.cleavage_params traversal = PVGTraversal(True, False, pool, (0,42), (0,14)) orf = PVGOrf([0,None]) cursor = PVGCursor(nodes[4], nodes[5], False, [orf], []) @@ -748,6 +754,7 @@ def test_call_and_stage_known_orf_not_in_cds_no_start_altering(self): graph, nodes = create_pgraph(data, 'ENST0001') graph.known_orf = [18,39] pool = VariantPeptideDict(graph.id) + pool.cleavage_params = graph.cleavage_params traversal = PVGTraversal(True, False, pool, (6,13), (18,39)) cursor = PVGCursor(nodes[5], nodes[6], False, [0, None], []) graph.call_and_stage_known_orf(cursor, traversal) @@ -767,6 +774,7 @@ def test_call_and_stage_known_orf_cleavage_gain_downstream(self): graph, nodes = create_pgraph(data, 'ENST0001') graph.known_orf = [0,39] pool = VariantPeptideDict(graph.id) + pool.cleavage_params = graph.cleavage_params traversal = PVGTraversal(True, False, pool, (6,13), (18,39)) orf = PVGOrf([0, None]) cursor = PVGCursor(nodes[1], nodes[3], True, [orf]) diff --git a/test/unit/test_variant_peptide_dict.py b/test/unit/test_variant_peptide_dict.py index 23a0f542..961a9522 100644 --- a/test/unit/test_variant_peptide_dict.py +++ b/test/unit/test_variant_peptide_dict.py @@ -75,4 +75,4 @@ def test_is_valid_x(self): invalid sequence. """ cleavage_params = params.CleavageParams(enzyme='trypsin') misc_nodes = MiscleavedNodes([], cleavage_params) - self.assertFalse(misc_nodes.is_valid_seq('AAAAXAAA', set())) + self.assertFalse(misc_nodes.is_valid_seq('AAAAXAAA', set(), set())) From 2d08a5110b2f6ed957a6a3950b7b4b973deb6173 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Sun, 14 May 2023 16:20:10 +0800 Subject: [PATCH 4/9] doc (CHANGELOG): changelog updated --- CHANGELOG.md | 4 +++- moPepGen/svgraph/VariantPeptideDict.py | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3c812d4..545b8cdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,7 +44,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Fixed `callVariant` issue of altSplice insertion carries an intronic indel that goes back to the original reading frame. #726 -- Fixed `callVairant` to handle deletion that spans over an entire intron. #732 +- Fixed `callVariant` to handle deletion that spans over an entire intron. #732 + +- Fixed `callVariant` to skip peptides earlier if they are either too long or too short to significantly improve efficiency. #736 ## [0.11.5] - 2023-3-5 diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py index 37880c5a..1869f30f 100644 --- a/moPepGen/svgraph/VariantPeptideDict.py +++ b/moPepGen/svgraph/VariantPeptideDict.py @@ -113,8 +113,7 @@ def seq_has_valid_size(self, seq:Seq=None, size:int=None) -> bool: def join_miscleaved_peptides(self, pool:T, check_variants:bool, additional_variants:List[VariantRecord], denylist:Set[str], is_start_codon:bool=False, circ_rna:circ.CircRNAModel=None, - truncate_sec:bool=False, w2f:bool=False, - check_external_variants:bool=True + truncate_sec:bool=False, check_external_variants:bool=True ) -> Iterable[Tuple[aa.AminoAcidSeqRecord, VariantPeptideMetadata]]: """ join miscleaved peptides and update the peptide pool. @@ -550,7 +549,6 @@ def add_miscleaved_sequences(self, node:PVGNode, orfs:List[PVGOrf], is_start_codon=is_start_codon, circ_rna=circ_rna, truncate_sec=self.truncate_sec, - w2f=self.w2f, check_external_variants=self.check_external_variants ) for seq, metadata in seqs: From 8c538afb8b254625de07c14d7efbaa2f60b84dd2 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Sun, 14 May 2023 17:46:01 +0800 Subject: [PATCH 5/9] fix (bruteForce): sect peptides + w2f not considered --- moPepGen/svgraph/VariantPeptideDict.py | 2 +- moPepGen/util/brute_force.py | 41 +++++++++++++------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py index 1869f30f..e0348fa9 100644 --- a/moPepGen/svgraph/VariantPeptideDict.py +++ b/moPepGen/svgraph/VariantPeptideDict.py @@ -615,7 +615,7 @@ def translational_modification(self, w2f:bool, denylist:Set[str]): continue for metadata in self.peptides[seq]: cur_metadata = copy.copy(metadata) - cur_metadata.label += '|'.join(v.id for v in comb) + cur_metadata.label += '|' + '|'.join(v.id for v in comb) aa_seq = aa.AminoAcidSeqRecord(seq=seq_mod) diff --git a/moPepGen/util/brute_force.py b/moPepGen/util/brute_force.py index e93a88be..065ef7ba 100644 --- a/moPepGen/util/brute_force.py +++ b/moPepGen/util/brute_force.py @@ -1050,27 +1050,26 @@ def translational_modification(self, seq:Seq, lhs:int, tx_lhs:int, # W > F if self.w2f: - w2f_indices = [] - i = 0 - while i > -1: - i = seq.find('W', start=i) - if i > -1: - w2f_indices.append(i) - i += 1 - if i > len(seq): - break - - for k in range(1, len(w2f_indices) + 1): - for comb in combinations(w2f_indices, k): - seq_mod = seq - for i in comb: - seq_mod_new = seq_mod[:i] + 'F' - if i + 1 < len(seq): - seq_mod_new += seq_mod[i+1:] - seq_mod = seq_mod_new - candidates.append(seq_mod) - if is_start: - candidates.append(seq_mod[1:]) + for candidate in copy.copy(candidates): + w2f_indices = [] + i = 0 + while i > -1: + i = candidate.find('W', start=i) + if i > -1: + w2f_indices.append(i) + i += 1 + if i > len(candidate): + break + + for k in range(1, len(w2f_indices) + 1): + for comb in combinations(w2f_indices, k): + seq_mod = candidate + for i in comb: + seq_mod_new = seq_mod[:i] + 'F' + if i + 1 < len(candidate): + seq_mod_new += seq_mod[i+1:] + seq_mod = seq_mod_new + candidates.append(seq_mod) for candidate in candidates: if self.peptide_is_valid(candidate, denylist, check_canonical): From 45e4a6bd361aa907c6eeff869743b2bee2aea94a Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Wed, 17 May 2023 01:25:02 +0800 Subject: [PATCH 6/9] fix (ThreeFrameTVG): use a more dynamic cutoff of `max_in_bubble_variants` to limit the number of variants to consider as combinations in a node when alignment a variant bubble. --- moPepGen/svgraph/ThreeFrameTVG.py | 49 ++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py index 44175eb7..4391e10f 100644 --- a/moPepGen/svgraph/ThreeFrameTVG.py +++ b/moPepGen/svgraph/ThreeFrameTVG.py @@ -1473,7 +1473,8 @@ def first_node_is_smaller(self, first:TVGNode, second:TVGNode) -> bool: return subgraph1.location < subgraph2.location - def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode]) -> bool: + def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode], + max_in_bubble_variants:int) -> bool: """ Check the total number of variants of given nodes """ if self.cleavage_params.max_variants_per_node == -1: return False @@ -1481,7 +1482,28 @@ def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode]) -> bool: for node in nodes: for variant in node.variants: variants.add(variant.variant) - return len(variants) > self.cleavage_params.max_variants_per_node + return len(variants) > max_in_bubble_variants + + @staticmethod + def get_max_in_bubble_variants(n:int) -> int: + """ Get the `max_in_bubble_variants` based on the total number of variants + in a variant bubble. The values are set so the number of combinations to + consider won't be too much more than 5,000. """ + if n <= 12: + return -1 + if n <= 14: + return 7 + if n <= 15: + return 6 + if n <= 16: + return 5 + if n <= 21: + return 4 + if n <= 34: + return 3 + if n <= 100: + return 2 + return 1 def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: r""" Aligns all variants at that overlaps to the same start and end @@ -1510,6 +1532,18 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: raise ValueError('reading_frame_index not found') end_node, members = self.find_variant_bubble(node) + member_variants = set() + for member in members: + member_variants.update([v.variant.id for v in member.variants]) + + max_in_bubble_variants = self.get_max_in_bubble_variants(len(member_variants)) + + if len(member_variants) >= 13: + err.warning( + f"Hypermutated region detected with {len(member_variants)} variants." + f" `max_in_bubble_variants` of {max_in_bubble_variants} is used." + ) + if not end_node: raise err.FailedToFindVariantBubbleError() @@ -1601,14 +1635,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]: trash.add(out_node) - if self.nodes_have_too_many_variants([cur, out_node]): - if not self.hypermutated_region_warned: - err.HypermutatedRegionWarning( - self.id, - self.cleavage_params.max_variants_per_node, - self.cleavage_params.additional_variants_per_misc - ) - self.hypermutated_region_warned = True + if self.nodes_have_too_many_variants([cur, out_node], max_in_bubble_variants): continue # create new node with the combined sequence @@ -1810,6 +1837,8 @@ def fit_into_codons(self) -> None: queue.appendleft(cur) continue + cur_seq = str(cur.seq.seq) + self.align_variants(cur) self.collapse_equivalent_nodes(cur) From f0c51077437bbb31affd967a82f1d54a1cd4e4b9 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Wed, 17 May 2023 01:32:53 +0800 Subject: [PATCH 7/9] fix (ThreeFrameTVG): wrong variable was used --- moPepGen/svgraph/ThreeFrameTVG.py | 2 +- test/unit/test_three_frame_tvg.py | 43 ------------------------------- 2 files changed, 1 insertion(+), 44 deletions(-) diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py index 4391e10f..e8e4a97f 100644 --- a/moPepGen/svgraph/ThreeFrameTVG.py +++ b/moPepGen/svgraph/ThreeFrameTVG.py @@ -1476,7 +1476,7 @@ def first_node_is_smaller(self, first:TVGNode, second:TVGNode) -> bool: def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode], max_in_bubble_variants:int) -> bool: """ Check the total number of variants of given nodes """ - if self.cleavage_params.max_variants_per_node == -1: + if max_in_bubble_variants == -1: return False variants = set() for node in nodes: diff --git a/test/unit/test_three_frame_tvg.py b/test/unit/test_three_frame_tvg.py index 9fab85d7..9df904a4 100644 --- a/test/unit/test_three_frame_tvg.py +++ b/test/unit/test_three_frame_tvg.py @@ -900,49 +900,6 @@ def test_align_variant_sub(self): expected = {'TA', 'TGCTGC'} self.assertEqual(out_node_seqs, expected) - def test_align_variantes_max_variants(self): - r""" Tests for aligning variants in heavily mutated local region - - G - / \ - | T | T A - |/ \| / \ / \ - AAT-A-C-C-C-T-T-G - \ / - C - - """ - var_data = [ - (0, 'A', 'T', 'SNV', ''), - (0, 'A', 'G', 'SNV', ''), - (0, 'A', 'C', 'SNV', ''), - (0, 'C', 'G', 'SNV', ''), - (0, 'C', 'G', 'SNV', ''), - (0, 'T', 'A', 'SNV', ''), - ] - data = { - 1: ['AAT', ['RF0'], [], 1], - 2: ['A', [1], []], - 3: ['T', [1], [var_data[0]]], - 4: ['G', [1], [var_data[1]]], - 5: ['C', [1], [var_data[2]]], - 6: ['C', [2,3,4,5], []], - 7: ['C', [6], []], - 8: ['C', [6], [var_data[3]]], - 9: ['T', [7,8], []], - 10: ['T', [9], []], - 11: ['A', [9], [var_data[4]]], - 12: ['G', [10,11], []] - } - - graph, nodes = create_three_frame_tvg(data, 'AATACCTTG') - graph.cleavage_params.max_variants_per_node = 2 - graph.align_variants(nodes[1]) - for edge in nodes[1].out_edges: - out_node = edge.out_node - self.assertTrue(len(out_node.variants) <= - graph.cleavage_params.max_variants_per_node) - def test_expand_alignments(self): r""" find_known_orf wihtout mutation From 64d645facb33407c9ef8f59d066127a082d18059 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Wed, 17 May 2023 01:36:44 +0800 Subject: [PATCH 8/9] doc (CHANGELOG): changelog updated --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 545b8cdd..70ca8c9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Fixed `callVariant` to skip peptides earlier if they are either too long or too short to significantly improve efficiency. #736 +- Fixed `callVariant` to handle hypermutated region with a dynamic cutoff. #738 + ## [0.11.5] - 2023-3-5 ### Fixed From b59cc8f5bd4b48ce93e170b0e7f0676feefa3bf0 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Wed, 17 May 2023 13:13:25 +0800 Subject: [PATCH 9/9] style (ThreeFrameTVG): function to staticmethod --- moPepGen/svgraph/ThreeFrameTVG.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py index e8e4a97f..94f5645b 100644 --- a/moPepGen/svgraph/ThreeFrameTVG.py +++ b/moPepGen/svgraph/ThreeFrameTVG.py @@ -1473,7 +1473,8 @@ def first_node_is_smaller(self, first:TVGNode, second:TVGNode) -> bool: return subgraph1.location < subgraph2.location - def nodes_have_too_many_variants(self, nodes:Iterable[TVGNode], + @staticmethod + def nodes_have_too_many_variants(nodes:Iterable[TVGNode], max_in_bubble_variants:int) -> bool: """ Check the total number of variants of given nodes """ if max_in_bubble_variants == -1: @@ -1837,8 +1838,6 @@ def fit_into_codons(self) -> None: queue.appendleft(cur) continue - cur_seq = str(cur.seq.seq) - self.align_variants(cur) self.collapse_equivalent_nodes(cur)