Skip to content

Commit

Permalink
Merge pull request #455 from uclahs-cds/czhu-fix-call-variant
Browse files Browse the repository at this point in the history
Fix `callVariant` with fusion
  • Loading branch information
lydiayliu authored May 13, 2022
2 parents 1c6c03d + 07734e6 commit ce68ca6
Show file tree
Hide file tree
Showing 12 changed files with 57 additions and 12 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [0.5.0] - 2022-05-13

- Fixed `callVariant` that it failed when the breakpoint of a fusion is right at the end of the start codon because it tried convert to end inclusion. #454

- Fixed `callVariant` that no variant peptides are called when the first variant is a fusion. #454

## [0.4.2] - 2022-05-11

### Changed
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Iterable, IO


__version__ = '0.4.2'
__version__ = '0.5.0'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
8 changes: 6 additions & 2 deletions moPepGen/aa/AminoAcidSeqRecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,9 @@ def __add__(self, other:AminoAcidSeqRecordWithCoordinates
rhs = other.locations[0].shift(len(self))
if lhs.ref.end == rhs.ref.start and lhs.query.end == rhs.query.start:
query = FeatureLocation(start=lhs.query.start, end=rhs.query.end)
ref = FeatureLocation(start=lhs.ref.start, end=rhs.ref.end)
ref = FeatureLocation(
start=lhs.ref.start, end=rhs.ref.end, seqname=lhs.ref.seqname
)
new_loc = MatchedLocation(query=query, ref=ref)
right_locs.pop(0)
left_locs[-1] = new_loc
Expand Down Expand Up @@ -333,9 +335,11 @@ def __hash__(self):
"""hash"""
return hash(self.seq)

def get_query_index(self, ref_index:int) -> int:
def get_query_index(self, ref_index:int, seqname:str) -> int:
""" Returns the query index wiht a given reference index """
for location in self.locations:
if location.ref.seqname != seqname:
continue
if ref_index in location.ref:
return location.query.start + ref_index - location.ref.start
return -1
4 changes: 4 additions & 0 deletions moPepGen/seqvar/VariantRecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,10 @@ def is_stop_lost(self, stop:int):

def to_end_inclusion(self, seq:DNASeqRecord):
""" Convert the variant to start exlusion and end inclusion format """
if self.alt.startswith('<'):
raise ValueError(
f'This variant should not be converted to end inclusion: {self}'
)
location = FeatureLocation(
seqname=self.location.seqname,
start=self.location.start + 1,
Expand Down
4 changes: 4 additions & 0 deletions moPepGen/svgraph/PVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,10 @@ def remove_out_edges(self) -> None:
for node in copy.copy(self.out_nodes):
self.remove_out_edge(node)

def is_inbond_of(self, node:PVGNode) -> bool:
""" Checks if self is the inbond node of a given node. """
return node in self.out_nodes

def is_orphan(self) -> bool:
""" Checks if the node is orphan (no inbond or outbond node) """
return not self.in_nodes and not self.out_nodes
Expand Down
7 changes: 4 additions & 3 deletions moPepGen/svgraph/PeptideVariantGraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,15 +774,16 @@ def call_and_stage_known_orf_not_in_cds(self, cursor:PVGCursor,
in_cds = cursor.in_cds
orf = cursor.orf
start_gain = cursor.start_gain
if target_node.reading_frame_index != self.known_reading_frame_index() or\
target_node.subgraph_id != self.id:
if target_node.reading_frame_index != self.known_reading_frame_index():
for out_node in target_node.out_nodes:
cur = PVGCursor(target_node, out_node, False, orf, [])
traversal.stage(target_node, out_node, cur)
return

start_index = target_node.seq.get_query_index(
traversal.known_orf_aa[0])
ref_index=traversal.known_orf_aa[0],
seqname=self.id
)
if start_index == -1:
for out_node in target_node.out_nodes:
cur = PVGCursor(
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/svgraph/TVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ def translate(self) -> svgraph.PVGNode:
dna_ref_codon_start = loc.ref.start + dna_query_codon_start - query.start
ref_start = math.ceil((dna_ref_codon_start - self.reading_frame_index) / 3)
ref_end = ref_start + len(query)
ref = FeatureLocation(start=ref_start, end=ref_end)
ref = FeatureLocation(start=ref_start, end=ref_end, seqname=loc.ref.seqname)
locations.append(MatchedLocation(query=query, ref=ref))

seq.__class__ = aa.AminoAcidSeqRecordWithCoordinates
Expand Down
1 change: 1 addition & 0 deletions moPepGen/svgraph/ThreeFrameCVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(self, seq:Union[DNASeqRecordWithCoordinates,None],
"""
self.seq = seq
self.id = _id
if self.seq and not self.seq.locations:
self.add_default_sequence_locations()
self.attrs = attrs
Expand Down
14 changes: 10 additions & 4 deletions moPepGen/svgraph/ThreeFrameTVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ def __init__(self, seq:Union[dna.DNASeqRecordWithCoordinates,None],
transcript_id (str)
"""
self.seq = seq
self.id = _id
if self.seq and not self.seq.locations:
self.add_default_sequence_locations()
self.id = _id
self.root = root or self.create_node(seq=None)
self.reading_frames = reading_frames or [None, None, None]
if reading_frames and len(reading_frames) != 3:
Expand All @@ -74,7 +74,7 @@ def add_default_sequence_locations(self):
""" Add default sequence locations """
self.seq.locations = [MatchedLocation(
query=FeatureLocation(start=0, end=len(self.seq)),
ref=FeatureLocation(start=0, end=len(self.seq))
ref=FeatureLocation(start=0, end=len(self.seq), seqname=self.id)
)]

def update_node_level(self, level:int):
Expand Down Expand Up @@ -902,10 +902,16 @@ def create_variant_graph(self, variants:List[seqvar.VariantRecord],
# skipping start lost mutations
start_index = self.seq.orf.start + 3 if self.has_known_orf else 3

if variant.location.start == start_index - 1 and variant.is_insertion():
if variant.location.start == start_index - 1 and variant.is_insertion() and \
not variant.is_fusion():
variant.to_end_inclusion(self.seq)

if variant.location.start < start_index:
# Skip variants that the position is smaller than the first NT
# after start codon. Exception for fusion, that if the donor
# breakpoint is at the last NT of the start codon it is retained
# because it won't break the start codon.
if variant.location.start < start_index and not \
(variant.is_fusion() and variant.location.start == start_index - 1):
variant = next(variant_iter, None)
continue

Expand Down
2 changes: 2 additions & 0 deletions test/files/comb/CPCG0100_ENST00000265138.4_expected.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
DILIGHERGNCHSHLNLQQTPP
GNCHSHLNLQQTPP
17 changes: 17 additions & 0 deletions test/integration/test_call_variant_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,23 @@ def test_call_variant_peptide_all_sources(self):
expected = {'vep_moPepGen.fasta'}
self.assertEqual(files, expected)

def test_call_variant_peptide_fusion_only(self):
""" Test variant peptide calling with fusion and circRNA """
args = create_base_args()
args.input_path = [
self.data_dir/'fusion'/'fusion.gvf'
]
args.output_path = self.work_dir/'vep_moPepGen.fasta'
args.genome_fasta = self.data_dir/'genome.fasta'
args.annotation_gtf = self.data_dir/'annotation.gtf'
args.proteome_fasta = self.data_dir/'translate.fasta'
cli.call_variant_peptide(args)
files = {str(file.name) for file in self.work_dir.glob('*')}
expected = {'vep_moPepGen.fasta'}
self.assertEqual(files, expected)
seqs = list(SeqIO.parse(args.output_path, 'fasta'))
self.assertTrue(len(seqs) > 0)

def test_call_variant_peptide_case4(self):
""" Test variant peptide calling with alternative splicing """
args = create_base_args()
Expand Down
2 changes: 1 addition & 1 deletion test/unit/test_peptide_variant_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def create_pgraph(data:dict, _id:str, known_orf:List[int]=None,
for (query_start, query_end), (ref_start, ref_end) in val[3]:
loc = MatchedLocation(
query=FeatureLocation(start=query_start, end=query_end),
ref=FeatureLocation(start=ref_start, end=ref_end)
ref=FeatureLocation(start=ref_start, end=ref_end, seqname=_id)
)
locs.append(loc)

Expand Down

0 comments on commit ce68ca6

Please sign in to comment.