Skip to content

Commit

Permalink
Merge pull request #471 from uclahs-cds/czhu-fix-variant-record
Browse files Browse the repository at this point in the history
Fix issue that large insertions were failed to be converted to end inclusion
  • Loading branch information
zhuchcn authored Jun 2, 2022
2 parents 17226c9 + d4af82d commit 4efd22a
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 8 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

### Fixed

- Fixed the issue that large insertions, such as retained introns, that start right after the start codon were not converted to end inclusion successfully. #470

## [0.6.1] - 2022-05-31

### Fixed

- Deletion was mistreated as insertion when trying to convert to end-inclusion format. This only affects deletions that start on the third nucleotide of CDS. #468

---
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Iterable, IO


__version__ = '0.6.1'
__version__ = '0.6.2'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
30 changes: 24 additions & 6 deletions moPepGen/seqvar/VariantRecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,18 @@ def is_frameshifting(self) -> bool:
alt_len = len(self.alt)
return (ref_len - alt_len) % 3 != 0

def set_end_inclusion(self):
""" Set end inclusion to True """
self.attrs['END_INCLUSION'] = True

def unset_end_inclusion(self):
""" Set end inclusion to False """
self.attrs['END_INCLUSION'] = False

def is_end_inclusion(self) -> bool:
""" Is end inclusion """
return self.attrs.get('END_INCLUSION') is True

def frames_shifted(self):
""" get number of nucleotide shifted
TODO: tests needed """
Expand Down Expand Up @@ -403,7 +415,7 @@ def is_stop_lost(self, stop:int):

def to_end_inclusion(self, seq:DNASeqRecord):
""" Convert the variant to start exlusion and end inclusion format """
if self.alt.startswith('<'):
if self.type != 'Insertion' and self.alt.startswith('<'):
raise ValueError(
f'This variant should not be converted to end inclusion: {self}'
)
Expand All @@ -412,11 +424,17 @@ def to_end_inclusion(self, seq:DNASeqRecord):
start=self.location.start + 1,
end=self.location.end + 1
)
ref = self.ref[1:] + str(seq.seq[self.location.end])
alt = self.alt[1:] + str(seq.seq[self.location.end])
self.location = location
self.ref = ref
self.alt = alt
self.set_end_inclusion()
if self.type == 'Insertion':
ref = self.ref[1:] + str(seq.seq[self.location.end])
self.location = location
self.ref = ref
else:
ref = self.ref[1:] + str(seq.seq[self.location.end])
alt = self.alt[1:] + str(seq.seq[self.location.end])
self.location = location
self.ref = ref
self.alt = alt

def shift_breakpoint_to_closest_exon(self, anno:GenomicAnnotation):
""" Shift fusion breakpoints to the closest exon. Donor breakpoint will
Expand Down
5 changes: 4 additions & 1 deletion moPepGen/svgraph/ThreeFrameTVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,10 @@ def apply_insertion(self, cursors:List[TVGNode],
Seq(variant.ref),
locations=[ref_seq_location]
)
insert_seq = ref_seq + insert_seq
if variant.is_end_inclusion():
insert_seq = insert_seq + ref_seq
else:
insert_seq = ref_seq + insert_seq
var = seqvar.VariantRecordWithCoordinate(
variant=variant,
location=FeatureLocation(start=1, end=len(insert_seq.seq))
Expand Down
36 changes: 36 additions & 0 deletions test/unit/test_three_frame_tvg.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,42 @@ def filter_func(x):
node_seqs = {str(edge.out_node.seq.seq) for edge in node.out_edges}
self.assertEqual(node_seqs, {'T', 'A'})

def test_apply_insertion_case3(self):
""" apply_insertion with end_inclusion """
anno = create_genomic_annotation(ANNOTATION_DATA)
genome = create_dna_record_dict(GENOME_DATA)
data = {
1: ['AAATAAATAAAT', ['RF0'], [], 0],
2: ['AATAAATAAAT', ['RF1'], [], 1],
3: ['ATAAATAAAT', ['RF2'], [], 2]
}
gene_id = 'ENSG0001'
ins_attrs = {
'DONOR_GENE_ID': 'ENSG0001',
'DONOR_START': 17,
'DONOR_END': 23
}
ins_var = create_variant(2,3,'T','<INS>','Insertion','',ins_attrs)
gene_seq = anno.genes[gene_id].get_gene_sequence(genome['chr1'])
ins_var.to_end_inclusion(gene_seq)

var_pool = seqvar.VariantRecordPool(anno=anno)
graph, nodes = create_three_frame_tvg(data, 'AAATAAATAAAT')
cursors = [nodes[1], nodes[2], nodes[3]]
graph.apply_insertion(cursors, ins_var, var_pool, genome, anno)

received = {str(list(n.out_edges)[0].out_node.seq.seq)
for n in graph.reading_frames}
self.assertEqual(received, {'A', 'AA', 'AAA'})

for root in graph.reading_frames:
node = root.get_out_nodes()[0]
for x in node.get_out_nodes():
if x.variants:
self.assertEqual(x.seq.seq, 'CCTATGT')
else:
self.assertEqual(x.seq.seq, 'T')

def test_apply_substitution_case1(self):
r""" T
/ \
Expand Down
38 changes: 38 additions & 0 deletions test/unit/test_variant_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from test.unit import create_variant, create_genomic_annotation, \
create_dna_record_dict, create_variants
from moPepGen import seqvar
from moPepGen.dna.DNASeqRecord import DNASeqRecord


GENOME_DATA = {
Expand Down Expand Up @@ -77,6 +78,43 @@ def test_is_spanning_over_splicing_site(self):
]
variant = create_variant(*variant_data)
self.assertTrue(variant.is_spanning_over_splicing_site(anno, 'ENST0001'))

def test_to_end_inclusion(self):
""" Test that variant record is converted to end inclusion """
tx_id = 'ENST0001'
variant_data = [
5, 6, 'C', 'CAAAA', 'INDEL', 'INDEL-5-C-CAAAA',
{'TRANSCRIPT_ID': tx_id}, 'ENSG0001'
]
variant = create_variant(*variant_data)
seq = DNASeqRecord('C' * 10, id=tx_id, name=tx_id, description=tx_id)
variant.to_end_inclusion(seq)
self.assertEqual(variant.ref, 'C')
self.assertEqual(variant.alt, 'AAAAC')
self.assertEqual(variant.location.start, 6)

# Insertion
variant2 = create_variant(*variant_data)
variant2.alt = '<Insertion>'
variant2.type = 'Insertion'
seq = DNASeqRecord('C' * 10, id=tx_id, name=tx_id, description=tx_id)
variant2.to_end_inclusion(seq)
self.assertEqual(variant2.ref, 'C')
self.assertEqual(variant2.location.start, 6)
self.assertTrue(variant2.is_end_inclusion())

def test_to_end_inclusion_fail(self):
""" Should fail when converting non insertion variants to end inclusion """
tx_id = 'ENST0001'
variant_data = [
5, 6, 'C', '<Deletion>', 'Deletion', 'RI-5',
{'TRANSCRIPT_ID': tx_id}, 'ENSG0001'
]
variant = create_variant(*variant_data)
seq = DNASeqRecord('C' * 10, id=tx_id, name=tx_id, description=tx_id)
with self.assertRaises(ValueError):
variant.to_end_inclusion(seq)

class TestTranscriptionalVariantSeries(unittest.TestCase):
""" Test cases for VariantRecordSeries """
def test_highest_hypermutated_region_complexity(self):
Expand Down

0 comments on commit 4efd22a

Please sign in to comment.