#494 - Start writing new importer, test conversion code

SACGF · Sep 29, 2021 · 2156cd4 · 2156cd4
1 parent a0b599d
commit 2156cd4
Show file tree

Hide file tree

Showing 2 changed files with 206 additions and 0 deletions.
diff --git a/genes/management/commands/import_gene_annotation2.py b/genes/management/commands/import_gene_annotation2.py
@@ -0,0 +1,72 @@
+import os
+import re
+from collections import Counter, defaultdict
+from typing import Tuple, Optional, Dict
+
+from django.core.management.base import BaseCommand
+from pyhgvs.utils import read_genepred
+
+from genes.cached_web_resource.refseq import retrieve_refseq_gene_summaries
+from genes.gene_matching import GeneMatcher
+from genes.models import GeneAnnotationImport, HGNC, \
+    GeneSymbol, Gene, GeneVersion, Transcript, TranscriptVersion, GeneAnnotationRelease, ReleaseGeneVersion, \
+    ReleaseTranscriptVersion
+from genes.models_enums import AnnotationConsortium
+from library.django_utils import highest_pk
+from library.file_utils import open_handle_gzip
+from library.utils import invert_dict
+from snpdb.models.models_enums import SequenceRole
+from snpdb.models.models_genome import GenomeBuild, GenomeFasta
+
+
+class Command(BaseCommand):
+    BATCH_SIZE = 2000
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.annotation_consortium = None
+        self.genome_build = None
+        self.contig_id_to_fasta = None
+        self.hgnc_ids = set(HGNC.objects.values_list("pk", flat=True))
+        # Known objects containers are updated with new inserts
+        self.known_gene_symbols = set(GeneSymbol.objects.all().values_list("pk", flat=True))
+        self.known_gene_versions_by_gene_id = defaultdict(dict)
+        self.known_transcript_versions_by_transcript_id = defaultdict(dict)
+
+    def add_arguments(self, parser):
+        consortia = [ac[1] for ac in AnnotationConsortium.choices]
+        builds = [gb.name for gb in GenomeBuild.builds_with_annotation()]
+
+        parser.add_argument('--dry-run', action='store_true', help="Don't actually modify anything")
+        parser.add_argument('--genome-build', choices=builds, required=True)
+        parser.add_argument('--annotation-consortium', choices=consortia, required=True)
+        parser.add_argument('--release', required=False,
+                            help="Make a release (to match VEP) store all gene/transcript versions")
+        parser.add_argument('filename', help="PyReference JSON.gz")
+
+    def handle(self, *args, **options):
+        filename = options["filename"]
+
+
+def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
+    start = transcript_data["start"]
+    end = transcript_data["stop"]
+    strand = transcript_data["strand"]
+    # PyHGVS has cds_start/cds_end be equal to start/end for non-coding transcripts
+    cds_start = transcript_data.get("cds_start", start)
+    cds_end = transcript_data.get("cds_end", end)
+    # PyHGVS exons are in genomic order, pyhgvs are in stranded
+    features = transcript_data["features_by_type"]
+    exons = [[ed["start"], ed["stop"]] for ed in features["exon"]]
+    if strand == '-':
+        exons.reverse()
+
+    return {
+        'chrom': transcript_data["chr"],
+        'start': start,
+        'end': end,
+        'strand': strand,
+        'cds_start': cds_start,
+        'cds_end': cds_end,
+        'exons': exons,
+    }
diff --git a/genes/tests/test_import_gene_annotation.py b/genes/tests/test_import_gene_annotation.py
@@ -0,0 +1,134 @@
+from unittest import TestCase
+
+from genes.management.commands.import_gene_annotation2 import convert_transcript_pyreference_to_pyhgvs
+
+
+class TestAnnotationVCF(TestCase):
+    """ These are all from GCF_000001405.25_GRCh37.p13_genomic.105.20201022.gff.gz """
+
+    def test_convert_transcript_pyreference_to_pyhgvs_positive_strand(self):
+        PYREFERENCE_DATA_NM_030751_6 = {
+            'biotype': ['protein_coding'],
+            'cds_end': 31816192,
+            'cds_start': 31608163,
+            'chr': 'NC_000010.10',
+            'features_by_type': {'3PUTR': [{'start': 31816192, 'stop': 31818732}],
+                                 '5PUTR': [{'start': 31608144, 'stop': 31608163}],
+                                 'CDS': [{'start': 31608163, 'stop': 31608221},
+                                         {'start': 31749965, 'stop': 31750166},
+                                         {'start': 31784707, 'stop': 31784767},
+                                         {'start': 31791275, 'stop': 31791437},
+                                         {'start': 31799600, 'stop': 31799803},
+                                         {'start': 31803530, 'stop': 31803636},
+                                         {'start': 31809053, 'stop': 31810864},
+                                         {'start': 31812860, 'stop': 31813041},
+                                         {'start': 31815599, 'stop': 31816192}],
+                                 'exon': [{'start': 31608144, 'stop': 31608221},
+                                          {'start': 31749965, 'stop': 31750166},
+                                          {'start': 31784707, 'stop': 31784767},
+                                          {'start': 31791275, 'stop': 31791437},
+                                          {'start': 31799600, 'stop': 31799803},
+                                          {'start': 31803530, 'stop': 31803636},
+                                          {'start': 31809053, 'stop': 31810864},
+                                          {'start': 31812860, 'stop': 31813041},
+                                          {'start': 31815599, 'stop': 31818732}]},
+            'is_coding': 1,
+            'start': 31608144,
+            'stop': 31818732,
+            'strand': '+'
+        }
+
+        EXPECTED_DATA = {
+            "end": 31818732,
+            "chrom": "NC_000010.10",
+            "exons": [[31608144, 31608221], [31749965, 31750166], [31784707, 31784767], [31791275, 31791437],
+                      [31799600, 31799803], [31803530, 31803636], [31809053, 31810864], [31812860, 31813041],
+                      [31815599, 31818732]],
+            "start": 31608144,  # We originally had it as 31608145 but this matches exon
+            "strand": "+",
+            "cds_end": 31816192,
+            "cds_start": 31608163,
+        }
+
+        output = convert_transcript_pyreference_to_pyhgvs(PYREFERENCE_DATA_NM_030751_6)
+        self.maxDiff = None
+        self.assertEquals(EXPECTED_DATA, output)
+
+    def test_convert_transcript_pyreference_to_pyhgvs_negative_strand(self):
+        PYREFERENCE_DATA_NM_004656_4 = {
+            'biotype': ['protein_coding'],
+            'cds_end': 52443894,
+            'cds_start': 52436303,
+            'chr': 'NC_000003.11',
+            'features_by_type': {'3PUTR': [{'start': 52435023, 'stop': 52436303}],
+                                 '5PUTR': [{'start': 52443894, 'stop': 52444024}],
+                                 'CDS': [{'start': 52443857, 'stop': 52443894},
+                                         {'start': 52443729, 'stop': 52443759},
+                                         {'start': 52443569, 'stop': 52443624},
+                                         {'start': 52442489, 'stop': 52442622},
+                                         {'start': 52441973, 'stop': 52442093},
+                                         {'start': 52441414, 'stop': 52441476},
+                                         {'start': 52441189, 'stop': 52441332},
+                                         {'start': 52440844, 'stop': 52440923},
+                                         {'start': 52440268, 'stop': 52440392},
+                                         {'start': 52439780, 'stop': 52439928},
+                                         {'start': 52439125, 'stop': 52439310},
+                                         {'start': 52438468, 'stop': 52438602},
+                                         {'start': 52437431, 'stop': 52437910},
+                                         {'start': 52437153, 'stop': 52437314},
+                                         {'start': 52436794, 'stop': 52436887},
+                                         {'start': 52436617, 'stop': 52436690},
+                                         {'start': 52436303, 'stop': 52436437}],
+                                 'exon': [{'start': 52443857, 'stop': 52444024},
+                                          {'start': 52443729, 'stop': 52443759},
+                                          {'start': 52443569, 'stop': 52443624},
+                                          {'start': 52442489, 'stop': 52442622},
+                                          {'start': 52441973, 'stop': 52442093},
+                                          {'start': 52441414, 'stop': 52441476},
+                                          {'start': 52441189, 'stop': 52441332},
+                                          {'start': 52440844, 'stop': 52440923},
+                                          {'start': 52440268, 'stop': 52440392},
+                                          {'start': 52439780, 'stop': 52439928},
+                                          {'start': 52439125, 'stop': 52439310},
+                                          {'start': 52438468, 'stop': 52438602},
+                                          {'start': 52437431, 'stop': 52437910},
+                                          {'start': 52437153, 'stop': 52437314},
+                                          {'start': 52436794, 'stop': 52436887},
+                                          {'start': 52436617, 'stop': 52436690},
+                                          {'start': 52435023, 'stop': 52436437}]},
+            'is_coding': 1,
+            'start': 52435023,
+            'stop': 52444024,
+            'strand': '-',
+        }
+
+        EXPECTED_DATA = {
+            "end": 52444024,
+            "chrom": "NC_000003.11",
+            "exons": [
+                [52435023, 52436437],
+                [52436617, 52436690],
+                [52436794, 52436887],
+                [52437153, 52437314],
+                [52437431, 52437910],
+                [52438468, 52438602],
+                [52439125, 52439310],
+                [52439780, 52439928],
+                [52440268, 52440392],
+                [52440844, 52440923],
+                [52441189, 52441332],
+                [52441414, 52441476],
+                [52441973, 52442093],
+                [52442489, 52442622],
+                [52443569, 52443624],
+                [52443729, 52443759],
+                [52443857, 52444024],
+            ],
+            "start": 52435023,
+            "strand": "-",
+            "cds_end": 52443894,
+            "cds_start": 52436303
+        }
+
+        output = convert_transcript_pyreference_to_pyhgvs(PYREFERENCE_DATA_NM_004656_4)
+        self.assertEquals(EXPECTED_DATA, output)