-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#494 - Start writing new importer, test conversion code
- Loading branch information
Showing
2 changed files
with
206 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import os | ||
import re | ||
from collections import Counter, defaultdict | ||
from typing import Tuple, Optional, Dict | ||
|
||
from django.core.management.base import BaseCommand | ||
from pyhgvs.utils import read_genepred | ||
|
||
from genes.cached_web_resource.refseq import retrieve_refseq_gene_summaries | ||
from genes.gene_matching import GeneMatcher | ||
from genes.models import GeneAnnotationImport, HGNC, \ | ||
GeneSymbol, Gene, GeneVersion, Transcript, TranscriptVersion, GeneAnnotationRelease, ReleaseGeneVersion, \ | ||
ReleaseTranscriptVersion | ||
from genes.models_enums import AnnotationConsortium | ||
from library.django_utils import highest_pk | ||
from library.file_utils import open_handle_gzip | ||
from library.utils import invert_dict | ||
from snpdb.models.models_enums import SequenceRole | ||
from snpdb.models.models_genome import GenomeBuild, GenomeFasta | ||
|
||
|
||
class Command(BaseCommand): | ||
BATCH_SIZE = 2000 | ||
|
||
def __init__(self, *args, **kwargs): | ||
super().__init__(*args, **kwargs) | ||
self.annotation_consortium = None | ||
self.genome_build = None | ||
self.contig_id_to_fasta = None | ||
self.hgnc_ids = set(HGNC.objects.values_list("pk", flat=True)) | ||
# Known objects containers are updated with new inserts | ||
self.known_gene_symbols = set(GeneSymbol.objects.all().values_list("pk", flat=True)) | ||
self.known_gene_versions_by_gene_id = defaultdict(dict) | ||
self.known_transcript_versions_by_transcript_id = defaultdict(dict) | ||
|
||
def add_arguments(self, parser): | ||
consortia = [ac[1] for ac in AnnotationConsortium.choices] | ||
builds = [gb.name for gb in GenomeBuild.builds_with_annotation()] | ||
|
||
parser.add_argument('--dry-run', action='store_true', help="Don't actually modify anything") | ||
parser.add_argument('--genome-build', choices=builds, required=True) | ||
parser.add_argument('--annotation-consortium', choices=consortia, required=True) | ||
parser.add_argument('--release', required=False, | ||
help="Make a release (to match VEP) store all gene/transcript versions") | ||
parser.add_argument('filename', help="PyReference JSON.gz") | ||
|
||
def handle(self, *args, **options): | ||
filename = options["filename"] | ||
|
||
|
||
def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: | ||
start = transcript_data["start"] | ||
end = transcript_data["stop"] | ||
strand = transcript_data["strand"] | ||
# PyHGVS has cds_start/cds_end be equal to start/end for non-coding transcripts | ||
cds_start = transcript_data.get("cds_start", start) | ||
cds_end = transcript_data.get("cds_end", end) | ||
# PyHGVS exons are in genomic order, pyhgvs are in stranded | ||
features = transcript_data["features_by_type"] | ||
exons = [[ed["start"], ed["stop"]] for ed in features["exon"]] | ||
if strand == '-': | ||
exons.reverse() | ||
|
||
return { | ||
'chrom': transcript_data["chr"], | ||
'start': start, | ||
'end': end, | ||
'strand': strand, | ||
'cds_start': cds_start, | ||
'cds_end': cds_end, | ||
'exons': exons, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
from unittest import TestCase | ||
|
||
from genes.management.commands.import_gene_annotation2 import convert_transcript_pyreference_to_pyhgvs | ||
|
||
|
||
class TestAnnotationVCF(TestCase): | ||
""" These are all from GCF_000001405.25_GRCh37.p13_genomic.105.20201022.gff.gz """ | ||
|
||
def test_convert_transcript_pyreference_to_pyhgvs_positive_strand(self): | ||
PYREFERENCE_DATA_NM_030751_6 = { | ||
'biotype': ['protein_coding'], | ||
'cds_end': 31816192, | ||
'cds_start': 31608163, | ||
'chr': 'NC_000010.10', | ||
'features_by_type': {'3PUTR': [{'start': 31816192, 'stop': 31818732}], | ||
'5PUTR': [{'start': 31608144, 'stop': 31608163}], | ||
'CDS': [{'start': 31608163, 'stop': 31608221}, | ||
{'start': 31749965, 'stop': 31750166}, | ||
{'start': 31784707, 'stop': 31784767}, | ||
{'start': 31791275, 'stop': 31791437}, | ||
{'start': 31799600, 'stop': 31799803}, | ||
{'start': 31803530, 'stop': 31803636}, | ||
{'start': 31809053, 'stop': 31810864}, | ||
{'start': 31812860, 'stop': 31813041}, | ||
{'start': 31815599, 'stop': 31816192}], | ||
'exon': [{'start': 31608144, 'stop': 31608221}, | ||
{'start': 31749965, 'stop': 31750166}, | ||
{'start': 31784707, 'stop': 31784767}, | ||
{'start': 31791275, 'stop': 31791437}, | ||
{'start': 31799600, 'stop': 31799803}, | ||
{'start': 31803530, 'stop': 31803636}, | ||
{'start': 31809053, 'stop': 31810864}, | ||
{'start': 31812860, 'stop': 31813041}, | ||
{'start': 31815599, 'stop': 31818732}]}, | ||
'is_coding': 1, | ||
'start': 31608144, | ||
'stop': 31818732, | ||
'strand': '+' | ||
} | ||
|
||
EXPECTED_DATA = { | ||
"end": 31818732, | ||
"chrom": "NC_000010.10", | ||
"exons": [[31608144, 31608221], [31749965, 31750166], [31784707, 31784767], [31791275, 31791437], | ||
[31799600, 31799803], [31803530, 31803636], [31809053, 31810864], [31812860, 31813041], | ||
[31815599, 31818732]], | ||
"start": 31608144, # We originally had it as 31608145 but this matches exon | ||
"strand": "+", | ||
"cds_end": 31816192, | ||
"cds_start": 31608163, | ||
} | ||
|
||
output = convert_transcript_pyreference_to_pyhgvs(PYREFERENCE_DATA_NM_030751_6) | ||
self.maxDiff = None | ||
self.assertEquals(EXPECTED_DATA, output) | ||
|
||
def test_convert_transcript_pyreference_to_pyhgvs_negative_strand(self): | ||
PYREFERENCE_DATA_NM_004656_4 = { | ||
'biotype': ['protein_coding'], | ||
'cds_end': 52443894, | ||
'cds_start': 52436303, | ||
'chr': 'NC_000003.11', | ||
'features_by_type': {'3PUTR': [{'start': 52435023, 'stop': 52436303}], | ||
'5PUTR': [{'start': 52443894, 'stop': 52444024}], | ||
'CDS': [{'start': 52443857, 'stop': 52443894}, | ||
{'start': 52443729, 'stop': 52443759}, | ||
{'start': 52443569, 'stop': 52443624}, | ||
{'start': 52442489, 'stop': 52442622}, | ||
{'start': 52441973, 'stop': 52442093}, | ||
{'start': 52441414, 'stop': 52441476}, | ||
{'start': 52441189, 'stop': 52441332}, | ||
{'start': 52440844, 'stop': 52440923}, | ||
{'start': 52440268, 'stop': 52440392}, | ||
{'start': 52439780, 'stop': 52439928}, | ||
{'start': 52439125, 'stop': 52439310}, | ||
{'start': 52438468, 'stop': 52438602}, | ||
{'start': 52437431, 'stop': 52437910}, | ||
{'start': 52437153, 'stop': 52437314}, | ||
{'start': 52436794, 'stop': 52436887}, | ||
{'start': 52436617, 'stop': 52436690}, | ||
{'start': 52436303, 'stop': 52436437}], | ||
'exon': [{'start': 52443857, 'stop': 52444024}, | ||
{'start': 52443729, 'stop': 52443759}, | ||
{'start': 52443569, 'stop': 52443624}, | ||
{'start': 52442489, 'stop': 52442622}, | ||
{'start': 52441973, 'stop': 52442093}, | ||
{'start': 52441414, 'stop': 52441476}, | ||
{'start': 52441189, 'stop': 52441332}, | ||
{'start': 52440844, 'stop': 52440923}, | ||
{'start': 52440268, 'stop': 52440392}, | ||
{'start': 52439780, 'stop': 52439928}, | ||
{'start': 52439125, 'stop': 52439310}, | ||
{'start': 52438468, 'stop': 52438602}, | ||
{'start': 52437431, 'stop': 52437910}, | ||
{'start': 52437153, 'stop': 52437314}, | ||
{'start': 52436794, 'stop': 52436887}, | ||
{'start': 52436617, 'stop': 52436690}, | ||
{'start': 52435023, 'stop': 52436437}]}, | ||
'is_coding': 1, | ||
'start': 52435023, | ||
'stop': 52444024, | ||
'strand': '-', | ||
} | ||
|
||
EXPECTED_DATA = { | ||
"end": 52444024, | ||
"chrom": "NC_000003.11", | ||
"exons": [ | ||
[52435023, 52436437], | ||
[52436617, 52436690], | ||
[52436794, 52436887], | ||
[52437153, 52437314], | ||
[52437431, 52437910], | ||
[52438468, 52438602], | ||
[52439125, 52439310], | ||
[52439780, 52439928], | ||
[52440268, 52440392], | ||
[52440844, 52440923], | ||
[52441189, 52441332], | ||
[52441414, 52441476], | ||
[52441973, 52442093], | ||
[52442489, 52442622], | ||
[52443569, 52443624], | ||
[52443729, 52443759], | ||
[52443857, 52444024], | ||
], | ||
"start": 52435023, | ||
"strand": "-", | ||
"cds_end": 52443894, | ||
"cds_start": 52436303 | ||
} | ||
|
||
output = convert_transcript_pyreference_to_pyhgvs(PYREFERENCE_DATA_NM_004656_4) | ||
self.assertEquals(EXPECTED_DATA, output) |