Skip to content

Commit

Permalink
#494 - Remove alignment gap, new fields, testing
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Sep 30, 2021
1 parent 316a4f4 commit beb7131
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 32 deletions.
131 changes: 100 additions & 31 deletions genes/management/commands/import_gene_annotation2.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,13 @@
import gzip
import json
import os
import re
from collections import Counter, defaultdict
from typing import Tuple, Optional, Dict
import logging
from typing import Dict, List, Set

from django.core.management.base import BaseCommand
from pyhgvs.utils import read_genepred

from genes.cached_web_resource.refseq import retrieve_refseq_gene_summaries
from genes.gene_matching import GeneMatcher
from genes.models import GeneAnnotationImport, HGNC, \
GeneSymbol, Gene, GeneVersion, Transcript, TranscriptVersion, GeneAnnotationRelease, ReleaseGeneVersion, \
ReleaseTranscriptVersion
from genes.models import HGNC, GeneSymbol, GeneAnnotationImport
from genes.models_enums import AnnotationConsortium
from library.django_utils import highest_pk
from library.file_utils import open_handle_gzip
from library.utils import invert_dict
from snpdb.models.models_enums import SequenceRole
from snpdb.models.models_genome import GenomeBuild, GenomeFasta
from snpdb.models.models_genome import GenomeBuild


class Command(BaseCommand):
Expand Down Expand Up @@ -60,38 +50,117 @@ def add_arguments(self, parser):
parser.add_argument('--annotation-consortium', choices=consortia, required=True)
parser.add_argument('--release', required=False,
help="Make a release (to match VEP) store all gene/transcript versions")
group = parser.add_mutually_exclusive_group()
group.add_argument('--pyreference-json', help='PyReference JSON.gz')
group.add_argument('--merged-json', help='Merged JSON (from multiple PyReference files)')
parser.add_argument('--save-merged-file', help="Write a file (requires pyreference-json)")
# group = parser.add_mutually_exclusive_group()
parser.add_argument('--pyreference-json', nargs="+", action="extend", help='PyReference JSON.gz')
parser.add_argument('--merged-json', help='Merged JSON (from multiple PyReference files)')

def handle(self, *args, **options):
if pyreference_json := options["merged_json"]:
with open_handle_gzip(pyreference_json) as f:
pyreference_data = json.load(f)
if pyreference_json := options["pyreference_json"]:
pyreference_data = []
for prj_filename in pyreference_json:
logging.info("Loading %s", prj_filename)
with open_handle_gzip(prj_filename) as f:
pyreference_data.append(json.load(f))
merged_data = self._convert_to_merged_data(pyreference_data)
if save_merged_file := options["save_merged_file"]:
with gzip.open(save_merged_file, 'w') as outfile:
json_str = json.dumps(merged_data)
outfile.write(json_str.encode('ascii'))
else:
merged_json = options["merged_json"]
with open_handle_gzip(merged_json) as f:
merged_data = json.load(f)

self._import_merged_data(merged_data)

def _convert_to_merged_data(self, pyreference_data: Dict) -> Dict:
pass

def _import_merged_data(self, data: Dict):
@staticmethod
def _get_most_recent_transcripts(pyreference_data) -> List[Set]:
transcripts_in_files = [set(prd["transcripts_by_id"]) for prd in pyreference_data]
most_recent_transcripts = []
all_transcripts = set()
for tif in reversed(transcripts_in_files):
unique_transcripts = tif - all_transcripts
most_recent_transcripts.append(unique_transcripts)
all_transcripts |= unique_transcripts
most_recent_transcripts.reverse()
return most_recent_transcripts

def _convert_to_merged_data(self, pyreference_data: List[Dict]) -> List[Dict]:
""" We want to make the minimal amount of data to insert - so only keep the last copy of transcripts """
print("_import_merged_data")
most_recent_transcripts = self._get_most_recent_transcripts(pyreference_data)

merged_data = []
for prd, transcripts in zip(pyreference_data, most_recent_transcripts):
gene_version = {}
transcript_versions = {}

for gene_id, gene in prd["genes_by_id"].items():
for transcript_accession in gene["transcripts"]:
if transcript_accession in transcripts:
gene_version[gene_id] = convert_gene_pyreference_to_gene_version_data(gene)

for transcript_accession in transcripts:
transcript = prd["transcripts_by_id"][transcript_accession]
transcript_versions[transcript_accession] = convert_transcript_pyreference_to_pyhgvs(transcript)

data = {
"gene_annotation_import": prd["reference_gtf"],
"gene_version": gene_version,
"transcript_versions": transcript_versions,
}
merged_data.append(data)

return merged_data


def _import_merged_data(self, merged_data: List[Dict]):
"""
[{
"gene_annotation_import": {"filename": "", "url": "", "file_md5sum": ""},
"gene_symbols": [],
"gene": [],
"gene_version": [],
"transcripts": [],
"transcript_versions": [],
"gene_version": {},
"transcript_versions": {},
},
}
"""
pass
print("_import_merged_data")

gene_symbols = []
genes = []
gene_versions = []
transcripts = []
transcript_versions = []

for data in merged_data:
import_data = data["gene_annotation_import"]
import_source = GeneAnnotationImport.objects.create(annotation_consortium=self.annotation_consortium,
genome_build=self.genome_build,
filename=import_data["filename"],
url=import_data["url"],
file_md5sum=import_data["file_md5sum"])

# Go through and create gene_symbols etc, assigning import_source

# Now, break up into new / old?

# bulk_create
# bulk update (with import source)




def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict:
gene_version_data = {
'biotype': ",".join(gene_data["biotype"]),
'description': gene_data["description"],
'gene_symbol': gene_data["name"],
}

if hgnc_str := gene_data.get("HGNC"):
# Has HGNC: (5 characters) at start of it
gene_version_data["hgnc"] = hgnc_str[5:]
return gene_version_data


def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
Expand Down
25 changes: 24 additions & 1 deletion genes/tests/test_import_gene_annotation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from unittest import TestCase

from genes.management.commands.import_gene_annotation2 import convert_transcript_pyreference_to_pyhgvs
from genes.management.commands.import_gene_annotation2 import convert_transcript_pyreference_to_pyhgvs, \
convert_gene_pyreference_to_gene_version_data


class TestAnnotationVCF(TestCase):
Expand Down Expand Up @@ -273,3 +274,25 @@ def test_convert_transcript_pyreference_to_pyhgvs_gaps(self):

output = convert_transcript_pyreference_to_pyhgvs(PYREFERENCE_DATA_NM_015120_4)
self.assertEquals(EXPECTED_DATA, output)

def test_convert_gene(self):
PYREFERENCE_GENE = {
'HGNC': 'HGNC:1100',
'biotype': ['protein_coding'],
'chr': 'NC_000017.11',
'description': 'BRCA1 DNA repair associated',
'name': 'BRCA1',
'start': 43044294,
'stop': 43125364,
'strand': '-',
'transcripts': ['NM_007294.4']
}

EXPECTED_DATA = {
'hgnc': '1100',
'biotype': 'protein_coding',
'description': 'BRCA1 DNA repair associated',
'gene_symbol': 'BRCA1',
}
output = convert_gene_pyreference_to_gene_version_data(PYREFERENCE_GENE)
self.assertEquals(EXPECTED_DATA, output)

0 comments on commit beb7131

Please sign in to comment.