Skip to content

Commit

Permalink
Merge pull request #787 from uclahs-cds/czhu-fix-call-variant
Browse files Browse the repository at this point in the history
fix (callVariant): Fixed callVariant that `fit_into_codon` terminated…
  • Loading branch information
zhuchcn authored Aug 2, 2023
2 parents ebee918 + 7f930bb commit 1c21a9a
Show file tree
Hide file tree
Showing 9 changed files with 127 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

- Fixed that in circRNA, cleavage gain from upstream node is added to the wrong ORF. #783

- Fixed callVariant that `fit_into_codon` terminated early in fusion transcripts when the donor has a frameshift and the accepter has a variant right after the breakpoint. #786

## [1.2.0] - 2023-07-04

### Fixed
Expand Down
3 changes: 3 additions & 0 deletions docs/files/fuzz_test_history.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ version commit submit_date variant_type n_match n_mismatch n_fail avg_time_call_
v1.1.0 6114857 2023-07-30 snv 10462 0 0 0:00:00.135468 0.3421855855832773 0:00:58.542349 119.97907892654416
v1.1.0 6114857 2023-07-30 indel 10504 0 0 0:00:00.179782 0.38469558675845283 0:00:42.905253 99.21484311919225
v1.1.0 6114857 2023-07-30 comprehensive 19582 0 0 0:00:00.306811 1.0169307177655384 0:00:38.741720 155.60354415410606
v1.1.0 6114857 2023-08-01 snv 7595 0 0 0:00:00.131920 0.33841692362010367 0:00:55.542089 116.61952386946663
v1.1.0 6114857 2023-08-01 indel 7435 0 0 0:00:00.175896 0.3814751285653182 0:00:40.585989 96.301218262223
v1.1.0 6114857 2023-08-01 comprehensive 14448 1 0 0:00:00.325781 4.520085501579521 0:00:39.910831 174.95797059352583
4 changes: 4 additions & 0 deletions moPepGen/svgraph/ThreeFrameTVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -1757,6 +1757,10 @@ def expand_alignments(self, start:TVGNode) -> List[TVGNode]:
# This is when the left or right intronic insertion of a fusion
# is smaller than 3. The `end` should contain a unique out node
end = self.merge_with_outbonds(end)[0]
elif end.global_variant and end.global_variant.is_fusion() \
and ref_node.has_exclusive_outbond_node():
end = self.merge_with_outbonds(ref_node)[0]
return [end]
else:
# Similar to above but here for AltSplice.
self.merge_into_inbonds(end)
Expand Down
21 changes: 21 additions & 0 deletions test/files/fuzz/43/annotation.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
chrF . gene 1 1214 . + . gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . transcript 1 1214 . + . gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF; is_protein_coding true;
chrF . selenocysteine 369 371 . + . gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . selenocysteine 1146 1148 . + . gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . selenocysteine 1170 1172 . + . gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . exon 1 223 . + . gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . CDS 106 223 . + 2 gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . CDS 353 582 . + 1 gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . exon 353 582 . + . gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . CDS 925 1214 . + 2 gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . exon 925 1214 . + . gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . UTR 1 105 . + . gene_id FAKEG00000381; transcript_id FAKET00000381; protein_id FAKEP00000381; tag mrna_end_NF;
chrF . gene 1239 3059 . + . gene_id FAKEG00000398; transcript_id FAKET00000398; protein_id FAKEP00000398;
chrF . transcript 1239 3059 . + . gene_id FAKEG00000398; transcript_id FAKET00000398; protein_id FAKEP00000398; is_protein_coding false;
chrF . exon 1239 1254 . + . gene_id FAKEG00000398; transcript_id FAKET00000398; protein_id FAKEP00000398;
chrF . exon 1663 1709 . + . gene_id FAKEG00000398; transcript_id FAKET00000398; protein_id FAKEP00000398;
chrF . exon 1748 1979 . + . gene_id FAKEG00000398; transcript_id FAKET00000398; protein_id FAKEP00000398;
chrF . exon 2020 2199 . + . gene_id FAKEG00000398; transcript_id FAKET00000398; protein_id FAKEP00000398;
chrF . exon 2325 2561 . + . gene_id FAKEG00000398; transcript_id FAKET00000398; protein_id FAKEP00000398;
chrF . exon 2754 2917 . + . gene_id FAKEG00000398; transcript_id FAKET00000398; protein_id FAKEP00000398;
chrF . exon 2989 3059 . + . gene_id FAKEG00000398; transcript_id FAKET00000398; protein_id FAKEP00000398;
2 changes: 2 additions & 0 deletions test/files/fuzz/43/brute_force.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
CLITLESLRV
VWPQWCSLPRSCLT
26 changes: 26 additions & 0 deletions test/files/fuzz/43/fake_variants.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
##fileformat=VCFv4.2
##mopepgen_version=1.2.1
##parser=parseVEP
##reference_index=
##genome_fasta=
##annotation_gtf=
##source=
##CHROM=<Description="Gene ID">
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
##INFO=<ID=ACCEPTER_GENE_ID,Number=1,Type=String,Description="3' Accepter Transcript's Gene ID">
##INFO=<ID=ACCEPTER_TRANSCRIPT_ID,Number=1,Type=String,Description="3' Accepter Transcript's Transcript ID">
##INFO=<ID=ACCEPTER_POSITION,Number=1,Type=Integer,Description="Position of the break point of the 3' accepter transcript">
##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
##INFO=<ID=OFFSET,Number=+,Type=Integer,Description="Offsets of fragments (exons or introns)">
##INFO=<ID=LENGTH,Number=+,Type=Integer,Description="Lengths of fragments (exons or introns)">
##INFO=<ID=INTRON,Number=+,Type=Integer,Description="Indices of fragments that are introns">
#CHROM POS ID REF ALT QUAL FILTER INFO
FAKEG00000381 538 FAKEG00000381-537-TAGAGGGAAATGGG-T TAGAGGGAAATGGG T . . TRANSCRIPT_ID=FAKET00000381;GENOMIC_POSITION=chrF-537:551;GENE_SYMBOL=
FAKEG00000381 955 FUSION-FAKET00000381:954-FAKET00000398:843 T <FUSION> . . TRANSCRIPT_ID=FAKET00000381;GENE_SYMBOL=None;GENOMIC_POSITION=954;ACCEPTER_GENE_ID=FAKEG00000398;ACCEPTER_TRANSCRIPT_ID=FAKET00000398;ACCEPTER_SYMBOL=None;ACCEPTER_POSITION=844;ACCEPTER_GENOMIC_POSITION=2081
FAKEG00000398 845 FAKEG00000398-844-T-G T G . . TRANSCRIPT_ID=FAKET00000398;GENOMIC_POSITION=chrF-2082:2083;GENE_SYMBOL=
52 changes: 52 additions & 0 deletions test/files/fuzz/43/genome.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
>chrF
ATACACTCTGGACCAAAACTCCGTAGAATGTGGAACTTGATACGTCACGGTTATAATGTC
GATTACCACGGCCGTAGGCTCTACAGGCCTGGCGCCCAACACGTTTAATGATCAATGCCG
CGGAGTCCTCAGGAAGGACTGAGGTCAATGACCGGCCCAAGGGTCATTCGGCAAGGGGCG
GCTCCCCTTCGACCTTGCCTATTATGGTGCATATACGTCACGCTGCAGATGACTTTCTCT
GTAGTGGACGCCTAGCCCAACAAAACACTCCGACATGGCGACGTTCGGAAGCTGCATTAA
TTCAGACATGACCGAGCTAAGGCTCACCGAGGGCGATGAGAGGTCGCTCTGGTCATTGCC
CCTTGCAATGAAGCGGATTTACATCTTCAGCCCACACCCTTCCCTTTCGGAAGCCAATAG
GCGAAAGAAAGGAGTACGACACTTCTCTAGTGATAGAGAAAGAGGGCCAATGCTGCGAAG
GTCTTCGGCCCGAACTACTTGCCCCGAAGTGTCTTATTACCTTGGAGAGCTTGAGGGTAG
AGGGAAATGGGATAGCCAGTTCCTGTCTTTGTGTTGCTCGTGGGGCATGCTTTCCCGTAT
CTCCCAGCCGGAGAGTACGAGCGCGAAGAATTTATCCGTCTAAGCGTGGTCATCCCCACA
GGCCGAATAACATGTGATCCCGTTGACCTCGTGCATAAGATGGTGCACCCATATCTAGCC
GGGCATTAAATTTCGGGGGTTGACAATCAACGCCTTAGAAATTGTGCATTCGAGTTGACT
CTTTTTGATCGTAGTAAGCCGCGATAGACCACTACCGTGAGCGACAATTGGGTCATAACC
ATCAAGCTTTTGCTCCTCTCCGCCCATGGAAACCATCAGTGGGATCTATTTCATTCGCAA
TAGCCCCTGCAGATCGCGGCACACTGTGGCCTCAATGGTGTAGCCTACCCAGATTTCGTA
TCACGGGTGGGGCGGGCTTTTCAGGTCTCATTGATAACCTTGCAGCCCCCAACACCAAGC
CTAGGGCTTTTCCTGGCCAGCATGTCTCATTGACACGTCTGCTCCAAACACATCGAGTAC
CGGATATACGCCTGGTGTACAGTTTCGCGCATTATCTGCTTGTCCGTGATTGGCGCCAAC
CTCAGTGAAGGATGCCGCAAAAGTCCAGGTGAATAAATGGAGACTTAACTCTGTATGCTG
CCCACCTGGTCGGGCACCAGATAATGTTTTTGGTTCACTTTTTAGTATTCGGCTCGTACC
CTTACAAGAACTTCGGTTCTTAACCGAAGCAACCAATATGCACAGCTGATTCCCCGACCC
GAGGTACAGCCCCACGTTGCAGAGCCACTACCTACCGCGCCACGCCGGTAAAACGCTTTC
GGTTCAATGCGCAGACCGCACAATTTTGGATGCGCCATTAATTCACAGCTAAACCTATGT
AGTTAAGCCAATTATTGGGAGGGACAGCTCATGAGACGTTCAATCTCTATTGATGCAGGG
GGCAAGAAGTTTAATAGTTCCGGGCAATTCTCGTGTTCGACACCGTCTAAGTCAAGTGTA
TGACAGGAACAGTATTTGCGCATACCAGCCTTCACGGGGTTGAGAGGCACTGATTGATAG
GTTTACTCTCTCAAATCTTCCGTGTGAAGTGCAAAGTCGGGCTAAGATGTCTATCTGACC
TGTTATCTTTACTAAACTTATGTATGTACTGTTCACAGAATAACAACCCATGACATCAGA
TGTTTTGATGGTATGTTTTGGTTTCGCTCGGCAAGACTGAGCACCGCTGCGAACATCCTA
AAGCGTCGGGGCGTCGACGAATAGTCATAGCTCGAAATAGAAGATTGATACCCTGAATCT
AGGAAATCAGATCACTGCAAGTTCCCCAGTTCCTTGTTAGCCCGCTGGCTTATTGATCTT
GTGTGGGAGCGACACGCGTTCAGGCTGGCCTCAATACCCCGTCTTAAACAGGGAGTCACT
GTCGATAGTGTCATGATGCCGGGATCAAAACGACTCCTCAGCACTGCATTAGTGCGGTAC
CCCTCAGATCGCCACGGGTTCGGTCGAGTGCCGCTTGTAGGCTTGTCTCACCTAACTAAG
AGAATGGTAGCATCGCCCTTGTGAACAAATATGAGAGTGCTTGCCGTCTTGGCGGATTAT
GTCCCAACAGTGCTGACTCGGCCAAATAGACTTGCTACAGTTACCGTCCCTTCTTTACAG
CGGGGTTTCATGACCTACCTAAGCGTACGTATACTAAAGGGATCCTATGACCTCTGGAAT
AATTATTCGGGTTAACAATTCGTCCGCGCTGCACCGTTGGTTTAAAGTTAATCAGCTGTT
GGGGGGAAGTAGCCCCATATAATTTAACCTTCCTAGGTGTCAGGTGAATCATTCAGATCG
TAGGCGATATAGTCGATCATGTACGACCCATCTTTTTGCCAGCGTGGTATACCCAAAACG
TTGTGCCATTTGAAGGGCGCTATATAGGCGTTTGATCTTATCAGTCTTGATGTCCTCACT
ACCAGCATCTCGCACAGTAGCCATCAAGAACGAAGTAGATCTTATGTTCTTGGAGGGCAT
CACGAGTTGCTAGGTGGGTCGGCTTGCTCAGGATGAGGCATTAGCGGTCTGCCCATAGGG
CTATAATGCATAACAGTGTGGTGAACAGAAACGAGCTTCTACGAGATACCGATTCAGCAT
GAGAGCTCCCAGTCGGTTCGGGACGTCGTTCAACCCTTCTGGTAGCTGGTCCGAATAATT
ATAGACTTTCTACATAAATGTACGCAGCGATAGCCGACTGTTGGCATGTCCTCGTAGTCA
GTCGCGGTCGTGAGTAGCGGTCAGCAGTGAGAGCGCGGCTCTACTAAGCCTAATGGCGTC
AAACGGTTTCAATGAGTACGTTCGAATTGGGGCATGTATGCAGGCTTACGCAAGATAATA
CACTATCCGATGACAGCCCGACCGCGGTCCCAGCTGTTCGCGTGACTACACGAAGGCGTT
GCTATGTCACTACCAGGCCCAAGAAGACTCGCCATGAAGAACGCTATAGAACCACATCC
5 changes: 5 additions & 0 deletions test/files/fuzz/43/proteome.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>FAKEP00000381|FAKET00000381|FAKEG00000381|XXX
MINAAESSGRTEVNDRPKGHSARGGSPSTLPIMVHIRHAHCPLQUSGFTSSAHTLPFRKP
IGERKEYDTSLVIEKEGQCCEGLRPELLAPKCLITLESLRVEGNGIASSCLCVARVWPQW
CSLPRFRITGGAGFSGLIDNLAAPNTKPRAFPGQHVSLTRLLQTHRVPDIRLVYSFAHYL
LVRDWRQPQURMPQKSRUINGDLTLYAAHLVG
12 changes: 12 additions & 0 deletions test/integration/test_call_variant_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -1072,3 +1072,15 @@ def test_call_variant_peptide_case71(self):
expected = self.data_dir/'fuzz/42/brute_force.txt'
reference = self.data_dir/'fuzz/42'
self.default_test_case(gvf, reference, expected)

def test_call_variant_peptide_case72(self):
""" In this test case, the fusion accepter transcript has a variant
right after the accepter breakpoint, and the donor transcript has a
indel, causing the first node in the fusion subgraph has only 1 nucleotide.
"""
gvf = [
self.data_dir/'fuzz/43/fake_variants.gvf'
]
expected = self.data_dir/'fuzz/43/brute_force.txt'
reference = self.data_dir/'fuzz/43'
self.default_test_case(gvf, reference, expected)

0 comments on commit 1c21a9a

Please sign in to comment.