Skip to content

Commit

Permalink
Merge pull request #859 from uclahs-cds/czhu-ass
Browse files Browse the repository at this point in the history
`--backsplicing-only` added to only call backsplicing site spanning peptides
  • Loading branch information
zhuchcn authored Mar 18, 2024
2 parents 32cf179 + cec4d26 commit c5a510b
Show file tree
Hide file tree
Showing 12 changed files with 274 additions and 104 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [1.3.1]

### Added:

- Flag `--backsplicing-only` added to `callVariant` to allow only calling noncanonical peptides spanning backsplicing site from circRNA events. #858

## [1.3.0] - 2024-3-11

### Fixed:
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from . import constant


__version__ = '1.3.0'
__version__ = '1.3.1'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
21 changes: 16 additions & 5 deletions moPepGen/cli/call_variant_peptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ def add_subparser_call_variant(subparsers:argparse._SubParsersAction):
help='Include peptides with W > F (Tryptophan to Phenylalanine) '
'reassignment.'
)
p.add_argument(
'--backsplicing-only',
action='store_true',
help='For circRNA, only keep noncanonical peptides spaning the backsplicing site.'
)
p.add_argument(
'--max-variants-per-node',
type=int,
Expand Down Expand Up @@ -201,6 +206,7 @@ def __init__(self, args:argparse.Namespace):
)
self.max_adjacent_as_mnv = args.max_adjacent_as_mnv
self.truncate_sec:bool = args.selenocysteine_termination
self.backsplicing_only:bool = args.backsplicing_only
self.w2f_reassignment = args.w2f_reassignment
self.noncanonical_transcripts = args.noncanonical_transcripts
self.invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding
Expand Down Expand Up @@ -312,6 +318,7 @@ def call_variant_peptides_wrapper(tx_id:str,
max_adjacent_as_mnv:bool,
truncate_sec:bool,
w2f_reassignment:bool,
backsplicing_only:bool,
save_graph:bool,
**kwargs
) -> Tuple[Dict[Seq, List[AnnotatedPeptideLabel]], str, TypeDGraphs, TypePGraphs]:
Expand Down Expand Up @@ -399,7 +406,7 @@ def add_peptide_anno(x:Dict[Seq, List[AnnotatedPeptideLabel]]):
cleavage_params=cleavage_params,
max_adjacent_as_mnv=max_adjacent_as_mnv,
w2f_reassignment=w2f_reassignment, denylist=denylist,
save_graph=save_graph
save_graph=save_graph, backsplicing_only=backsplicing_only
)

except:
Expand Down Expand Up @@ -551,7 +558,8 @@ def call_variant_peptide(args:argparse.Namespace) -> None:
'save_graph': caller.graph_output_dir is not None,
'timeout': args.timeout_seconds,
'max_variants_per_node': tuple(args.max_variants_per_node),
'additional_variants_per_misc': tuple(args.additional_variants_per_misc)
'additional_variants_per_misc': tuple(args.additional_variants_per_misc),
'backsplicing_only': args.backsplicing_only
}
dispatches.append(dispatch)

Expand Down Expand Up @@ -735,7 +743,8 @@ def call_peptide_circ_rna(record:circ.CircRNAModel,
variant_pool:seqvar.VariantRecordPool,
gene_seqs:Dict[str, dna.DNASeqRecordWithCoordinates],
cleavage_params:params.CleavageParams, max_adjacent_as_mnv:bool,
w2f_reassignment:bool, denylist:Set[str], save_graph:bool
backsplicing_only:bool, w2f_reassignment:bool, denylist:Set[str],
save_graph:bool
)-> TypeCallPeptideReturnData:
""" Call variant peptides from a given circRNA """
gene_id = record.gene_id
Expand Down Expand Up @@ -770,7 +779,9 @@ def call_peptide_circ_rna(record:circ.CircRNAModel,
cgraph = svgraph.ThreeFrameCVG(
circ_seq, _id=record.id, circ_record=record,
cleavage_params=cleavage_params,
max_adjacent_as_mnv=max_adjacent_as_mnv
max_adjacent_as_mnv=max_adjacent_as_mnv,
coordinate_feature_type='gene',
coordinate_feature_id=gene_id
)
cgraph.init_three_frames()
cgraph.create_variant_circ_graph(variant_records)
Expand All @@ -780,7 +791,7 @@ def call_peptide_circ_rna(record:circ.CircRNAModel,
pgraph = cgraph.translate()
pgraph.create_cleavage_graph()
peptide_map = pgraph.call_variant_peptides(
denylist=denylist, circ_rna=record,
denylist=denylist, circ_rna=record, backsplicing_only=backsplicing_only,
w2f=w2f_reassignment, check_external_variants=True
)
if not save_graph:
Expand Down
13 changes: 13 additions & 0 deletions moPepGen/svgraph/PVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -825,6 +825,19 @@ def get_orf_start(self, i:int=0) -> int:
# raise ValueError('Can not find ORF')
return -1

def get_subgraph_id_set(self) -> Set[str]:
""" Get all subgraph ID as set """
ids = set()
for loc in self.seq.locations:
if len(loc) == 0:
continue
ids.add(loc.ref.seqname)
for v in self.variants:
if v.variant.is_circ_rna():
continue
ids.add(v.location.seqname)
return ids

def get_subgraph_id_at(self, i:int) -> str:
""" Get the subgraph ID at the given position """
for loc in self.seq.locations:
Expand Down
72 changes: 46 additions & 26 deletions moPepGen/svgraph/PeptideVariantGraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from collections import deque
from functools import cmp_to_key
from Bio.Seq import Seq
from moPepGen import aa, circ, seqvar, params
from moPepGen import aa, params
from moPepGen.seqvar.VariantRecord import VariantRecord
from moPepGen.svgraph.SubgraphTree import SubgraphTree
from moPepGen.svgraph.VariantPeptideDict import VariantPeptideDict
Expand All @@ -16,6 +16,8 @@

if TYPE_CHECKING:
from .VariantPeptideDict import AnnotatedPeptideLabel
from moPepGen.circ import CircRNAModel
from moPepGen.params import CleavageParams

T = Tuple[Set[PVGNode],Dict[PVGNode,List[PVGNode]]]

Expand All @@ -41,7 +43,7 @@ class PeptideVariantGraph():
position.
"""
def __init__(self, root:PVGNode, _id:str,
known_orf:List[int,int], cleavage_params:params.CleavageParams=None,
known_orf:List[int,int], cleavage_params:CleavageParams=None,
orfs:Set[Tuple[int,int]]=None, reading_frames:List[PVGNode]=None,
orf_id_map:Dict[int,str]=None, cds_start_nf:bool=False,
hypermutated_region_warned:bool=False, denylist:Set[str]=None,
Expand Down Expand Up @@ -641,8 +643,8 @@ def cross_join(self, node:PVGNode, site:int, cleavage_range:Tuple[int, int]=None

def create_cleavage_graph(self) -> None:
""" Form a cleavage graph from a variant graph. After calling this
method, every each in the graph should represent a cleavage in the
sequence of the pull peptide of reference and variated.
method, each in the graph should represent a cleavage in the
sequence of the pull peptide of reference and variant.
Args:
rule (str): The rule for enzymatic cleavage, e.g., trypsin.
Expand Down Expand Up @@ -784,30 +786,43 @@ def update_orf(self, orf:List[int,int]) -> None:
self.orfs.add(tuple(orf))

def create_orf_id_map(self) -> None:
""" Creates and map for ORF start site and its ID (ORF1, ORF2, etc) """
""" Creates a map for ORF start site and its ID (ORF1, ORF2, etc) """
orfs = list(self.orfs)
orfs.sort()
self.orf_id_map = {v:f"ORF{i+1}" for i,v in enumerate(orfs)}

def call_variant_peptides(self, check_variants:bool=True,
check_orf:bool=False, keep_all_occurrence:bool=True, denylist:Set[str]=None,
circ_rna:circ.CircRNAModel=None, orf_assignment:str='max',
truncate_sec:bool=False, w2f:bool=False, check_external_variants:bool=True
circ_rna:CircRNAModel=None, orf_assignment:str='max',
backsplicing_only:bool=False, truncate_sec:bool=False, w2f:bool=False,
check_external_variants:bool=True
) -> Dict[Seq, List[AnnotatedPeptideLabel]]:
""" Walk through the graph and find all variated peptides.
""" Walk through the graph and find all noncanonical peptides.
Args:
miscleavage (int): Number of miscleavages allowed.
check_variants (bool): When true, only peptides that carries at
least 1 variant are kept. And when false, all unique peptides
are reported.
check_orf (bool): When true, the ORF ID will be added to the
variant peptide label.
keep_all_occurrence: Whether to keep all occurance of the peptide
within the graph/transcript.
Return:
A set of aa.AminoAcidSeqRecord.
- miscleavage (int): Number of miscleavages allowed.
- check_variants (bool): When true, only peptides that carry at
least 1 variant are kept. When false, all unique peptides
are reported.
- check_orf (bool): When true, the ORF ID will be added to the
variant peptide label.
- keep_all_occurrence (bool): Whether to keep all occurences of the
peptide within the graph/transcript.
- denylist (Set[str]): Denylist of peptides to be called as variant
peptides.
- circ_rna (CircRNAModel): The circRNA from which variant peptides are
called.
- orf_assignment (str): ORF assignment strategy. Options: ['max', 'min']
- backsplicing_only (bool): Whether to only output variant peptides
spanning the backsplicing site.
- truncate_sec (bool): Whether to consider selenocysteine termination
events.
- w2f (bool): Whether to consider W>F (tryptophan to phenylalanine)
substituants.
- check_external_variants (bool): When set to `False`, peptides not
harbouring any external variants will also be outputted. This can
be used when calling noncanonical peptides from noncoding
transcripts.
"""
if self.is_circ_rna() and not circ_rna:
raise ValueError('`circ_rna` must be given.')
Expand All @@ -828,7 +843,8 @@ def call_variant_peptides(self, check_variants:bool=True,
traversal = PVGTraversal(
check_variants=check_external_variants,
check_orf=check_orf, queue=queue, pool=peptide_pool,
circ_rna=circ_rna, orf_assignment=orf_assignment
circ_rna=circ_rna, orf_assignment=orf_assignment,
backsplicing_only=backsplicing_only
)

if self.has_known_orf():
Expand Down Expand Up @@ -919,7 +935,8 @@ def call_and_stage_known_orf_in_cds(self, cursor:PVGCursor,
additional_variants=additional_variants,
denylist=self.denylist,
leading_node=target_node,
subgraphs=self.subgraphs
subgraphs=self.subgraphs,
backsplicing_only=traversal.backsplicing_only
)
self.remove_node(node_copy)

Expand Down Expand Up @@ -1002,7 +1019,8 @@ def call_and_stage_known_orf_not_in_cds(self, cursor:PVGCursor,
additional_variants=additional_variants,
denylist=self.denylist,
leading_node=target_node,
subgraphs=self.subgraphs
subgraphs=self.subgraphs,
backsplicing_only=traversal.backsplicing_only
)
cleavage_gain = target_node.get_cleavage_gain_variants()
for out_node in target_node.out_nodes:
Expand Down Expand Up @@ -1199,7 +1217,8 @@ def call_and_stage_unknown_orf(self, cursor:PVGCursor,
denylist=self.denylist,
leading_node=target_node,
subgraphs=self.subgraphs,
circ_rna=traversal.circ_rna
circ_rna=traversal.circ_rna,
backsplicing_only=traversal.backsplicing_only
)
for node in trash:
self.remove_node(node)
Expand Down Expand Up @@ -1288,7 +1307,7 @@ def jsonfy(self):
class PVGCursor():
""" Helper class for cursors when graph traversal to call peptides. """
def __init__(self, in_node:PVGNode, out_node:PVGNode, in_cds:bool,
orfs:List[PVGOrf]=None, cleavage_gain:List[seqvar.VariantRecord]=None,
orfs:List[PVGOrf]=None, cleavage_gain:List[VariantRecord]=None,
finding_start_site:bool=True):
""" constructor """
self.in_node = in_node
Expand All @@ -1306,10 +1325,10 @@ class PVGTraversal():
"""
def __init__(self, check_variants:bool, check_orf:bool,
pool:VariantPeptideDict, known_orf_tx:Tuple[int,int]=None,
known_orf_aa:Tuple[int,int]=None, circ_rna:circ.CircRNAModel=None,
known_orf_aa:Tuple[int,int]=None, circ_rna:CircRNAModel=None,
queue:Deque[PVGCursor]=None,
stack:Dict[PVGNode, Dict[PVGNode, PVGCursor]]=None,
orf_assignment:str='max'):
orf_assignment:str='max', backsplicing_only:bool=False):
""" constructor """
self.check_variants = check_variants
self.check_orf = check_orf
Expand All @@ -1320,6 +1339,7 @@ def __init__(self, check_variants:bool, check_orf:bool,
self.pool = pool
self.stack = stack or {}
self.orf_assignment = orf_assignment
self.backsplicing_only = backsplicing_only

def is_done(self) -> bool:
""" Check if the traversal is done """
Expand Down
23 changes: 15 additions & 8 deletions moPepGen/svgraph/ThreeFrameCVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@
import copy
from typing import Dict, Union, List, TYPE_CHECKING
from moPepGen.SeqFeature import FeatureLocation
from moPepGen import seqvar, params
from moPepGen import seqvar
from .ThreeFrameTVG import ThreeFrameTVG
from .TVGNode import TVGNode


if TYPE_CHECKING:
from moPepGen import circ
from moPepGen.circ import CircRNAModel
from moPepGen.dna import DNASeqRecordWithCoordinates
from .SubgraphTree import SubgraphTree
from moPepGen.params import CleavageParams
from moPepGen.seqvar import VariantRecordWithCoordinate, VariantRecord

class ThreeFrameCVG(ThreeFrameTVG):
""" Defines a directed cyclic graph for circular nucleotide molecules such
Expand All @@ -28,8 +30,10 @@ class ThreeFrameCVG(ThreeFrameTVG):
def __init__(self, seq:Union[DNASeqRecordWithCoordinates,None],
_id:str, root:TVGNode=None, reading_frames:List[TVGNode]=None,
cds_start_nf:bool=False, has_known_orf:bool=False,
circ_record:circ.CircRNAModel=None, attrs:dict=None,
subgraphs:SubgraphTree=None, cleavage_params:params.CleavageParams=None,
circ_record:CircRNAModel=None, attrs:dict=None,
coordinate_feature_type:str=None, coordinate_feature_id:str=None,
subgraphs:SubgraphTree=None, hypermutated_region_warned:bool=False,
cleavage_params:CleavageParams=None,
max_adjacent_as_mnv:int=2):
""" Construct a CircularVariantGraph
Expand Down Expand Up @@ -64,10 +68,13 @@ def __init__(self, seq:Union[DNASeqRecordWithCoordinates,None],
seq=seq, _id=_id, root=root, reading_frames=reading_frames,
cds_start_nf=cds_start_nf, has_known_orf=has_known_orf,
global_variant=circ_variant, subgraphs=subgraphs,
cleavage_params=cleavage_params, max_adjacent_as_mnv=max_adjacent_as_mnv
cleavage_params=cleavage_params, max_adjacent_as_mnv=max_adjacent_as_mnv,
coordinate_feature_type=coordinate_feature_type,
coordinate_feature_id=coordinate_feature_id,
hypermutated_region_warned=hypermutated_region_warned
)

def get_circ_variant_with_coordinate(self) -> seqvar.VariantRecordWithCoordinate:
def get_circ_variant_with_coordinate(self) -> VariantRecordWithCoordinate:
""" Add a variant record to the frameshifting of the root node. This
will treat all peptides as variant peptide in the later steps. """
location = FeatureLocation(
Expand All @@ -90,7 +97,7 @@ def init_three_frames(self, truncate_head:bool=False):
node.variants.append(var_i)
self.add_edge(node, root, 'reference')

def create_variant_circ_graph(self, variants:List[seqvar.VariantRecord]):
def create_variant_circ_graph(self, variants:List[VariantRecord]):
""" Apply a list of variants to the graph. Variants not in the
range are ignored. Variants at the first nucleotide of each fragment
of the sequence are also ignored, because it causes the exon splice
Expand Down Expand Up @@ -149,7 +156,7 @@ def extend_loop(self):
self.subgraphs.add_subgraph(
child_id=sub.id, parent_id=cur.id, level=sub.root.level,
start=self.seq.locations[-1].ref.end, end=self.seq.locations[-1].ref.end,
variant=self.global_variant, feature_type='gene', feature_id=self.gene_id
variant=self.global_variant, feature_type='gene', feature_id=self.circ.gene_id
)
for edge in sub.root.out_edges:
root = edge.out_node
Expand Down
Loading

0 comments on commit c5a510b

Please sign in to comment.