diff --git a/genes/management/commands/import_gene_annotation2.py b/genes/management/commands/import_gene_annotation2.py index ae370603d..7bef7f2c9 100644 --- a/genes/management/commands/import_gene_annotation2.py +++ b/genes/management/commands/import_gene_annotation2.py @@ -1,23 +1,13 @@ +import gzip import json -import os -import re -from collections import Counter, defaultdict -from typing import Tuple, Optional, Dict +import logging +from typing import Dict, List, Set from django.core.management.base import BaseCommand -from pyhgvs.utils import read_genepred - -from genes.cached_web_resource.refseq import retrieve_refseq_gene_summaries -from genes.gene_matching import GeneMatcher -from genes.models import GeneAnnotationImport, HGNC, \ - GeneSymbol, Gene, GeneVersion, Transcript, TranscriptVersion, GeneAnnotationRelease, ReleaseGeneVersion, \ - ReleaseTranscriptVersion +from genes.models import HGNC, GeneSymbol, GeneAnnotationImport from genes.models_enums import AnnotationConsortium -from library.django_utils import highest_pk from library.file_utils import open_handle_gzip -from library.utils import invert_dict -from snpdb.models.models_enums import SequenceRole -from snpdb.models.models_genome import GenomeBuild, GenomeFasta +from snpdb.models.models_genome import GenomeBuild class Command(BaseCommand): @@ -60,15 +50,23 @@ def add_arguments(self, parser): parser.add_argument('--annotation-consortium', choices=consortia, required=True) parser.add_argument('--release', required=False, help="Make a release (to match VEP) store all gene/transcript versions") - group = parser.add_mutually_exclusive_group() - group.add_argument('--pyreference-json', help='PyReference JSON.gz') - group.add_argument('--merged-json', help='Merged JSON (from multiple PyReference files)') + parser.add_argument('--save-merged-file', help="Write a file (requires pyreference-json)") + # group = parser.add_mutually_exclusive_group() + parser.add_argument('--pyreference-json', nargs="+", action="extend", help='PyReference JSON.gz') + parser.add_argument('--merged-json', help='Merged JSON (from multiple PyReference files)') def handle(self, *args, **options): - if pyreference_json := options["merged_json"]: - with open_handle_gzip(pyreference_json) as f: - pyreference_data = json.load(f) + if pyreference_json := options["pyreference_json"]: + pyreference_data = [] + for prj_filename in pyreference_json: + logging.info("Loading %s", prj_filename) + with open_handle_gzip(prj_filename) as f: + pyreference_data.append(json.load(f)) merged_data = self._convert_to_merged_data(pyreference_data) + if save_merged_file := options["save_merged_file"]: + with gzip.open(save_merged_file, 'w') as outfile: + json_str = json.dumps(merged_data) + outfile.write(json_str.encode('ascii')) else: merged_json = options["merged_json"] with open_handle_gzip(merged_json) as f: @@ -76,22 +74,93 @@ def handle(self, *args, **options): self._import_merged_data(merged_data) - def _convert_to_merged_data(self, pyreference_data: Dict) -> Dict: - pass - - def _import_merged_data(self, data: Dict): + @staticmethod + def _get_most_recent_transcripts(pyreference_data) -> List[Set]: + transcripts_in_files = [set(prd["transcripts_by_id"]) for prd in pyreference_data] + most_recent_transcripts = [] + all_transcripts = set() + for tif in reversed(transcripts_in_files): + unique_transcripts = tif - all_transcripts + most_recent_transcripts.append(unique_transcripts) + all_transcripts |= unique_transcripts + most_recent_transcripts.reverse() + return most_recent_transcripts + + def _convert_to_merged_data(self, pyreference_data: List[Dict]) -> List[Dict]: + """ We want to make the minimal amount of data to insert - so only keep the last copy of transcripts """ + print("_import_merged_data") + most_recent_transcripts = self._get_most_recent_transcripts(pyreference_data) + + merged_data = [] + for prd, transcripts in zip(pyreference_data, most_recent_transcripts): + gene_version = {} + transcript_versions = {} + + for gene_id, gene in prd["genes_by_id"].items(): + for transcript_accession in gene["transcripts"]: + if transcript_accession in transcripts: + gene_version[gene_id] = convert_gene_pyreference_to_gene_version_data(gene) + + for transcript_accession in transcripts: + transcript = prd["transcripts_by_id"][transcript_accession] + transcript_versions[transcript_accession] = convert_transcript_pyreference_to_pyhgvs(transcript) + + data = { + "gene_annotation_import": prd["reference_gtf"], + "gene_version": gene_version, + "transcript_versions": transcript_versions, + } + merged_data.append(data) + + return merged_data + + + def _import_merged_data(self, merged_data: List[Dict]): """ [{ "gene_annotation_import": {"filename": "", "url": "", "file_md5sum": ""}, - "gene_symbols": [], - "gene": [], - "gene_version": [], - "transcripts": [], - "transcript_versions": [], + "gene_version": {}, + "transcript_versions": {}, }, } """ - pass + print("_import_merged_data") + + gene_symbols = [] + genes = [] + gene_versions = [] + transcripts = [] + transcript_versions = [] + + for data in merged_data: + import_data = data["gene_annotation_import"] + import_source = GeneAnnotationImport.objects.create(annotation_consortium=self.annotation_consortium, + genome_build=self.genome_build, + filename=import_data["filename"], + url=import_data["url"], + file_md5sum=import_data["file_md5sum"]) + + # Go through and create gene_symbols etc, assigning import_source + + # Now, break up into new / old? + + # bulk_create + # bulk update (with import source) + + + + +def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict: + gene_version_data = { + 'biotype': ",".join(gene_data["biotype"]), + 'description': gene_data["description"], + 'gene_symbol': gene_data["name"], + } + + if hgnc_str := gene_data.get("HGNC"): + # Has HGNC: (5 characters) at start of it + gene_version_data["hgnc"] = hgnc_str[5:] + return gene_version_data def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: diff --git a/genes/tests/test_import_gene_annotation.py b/genes/tests/test_import_gene_annotation.py index 58d0b02a4..85885b310 100644 --- a/genes/tests/test_import_gene_annotation.py +++ b/genes/tests/test_import_gene_annotation.py @@ -1,6 +1,7 @@ from unittest import TestCase -from genes.management.commands.import_gene_annotation2 import convert_transcript_pyreference_to_pyhgvs +from genes.management.commands.import_gene_annotation2 import convert_transcript_pyreference_to_pyhgvs, \ + convert_gene_pyreference_to_gene_version_data class TestAnnotationVCF(TestCase): @@ -273,3 +274,25 @@ def test_convert_transcript_pyreference_to_pyhgvs_gaps(self): output = convert_transcript_pyreference_to_pyhgvs(PYREFERENCE_DATA_NM_015120_4) self.assertEquals(EXPECTED_DATA, output) + + def test_convert_gene(self): + PYREFERENCE_GENE = { + 'HGNC': 'HGNC:1100', + 'biotype': ['protein_coding'], + 'chr': 'NC_000017.11', + 'description': 'BRCA1 DNA repair associated', + 'name': 'BRCA1', + 'start': 43044294, + 'stop': 43125364, + 'strand': '-', + 'transcripts': ['NM_007294.4'] + } + + EXPECTED_DATA = { + 'hgnc': '1100', + 'biotype': 'protein_coding', + 'description': 'BRCA1 DNA repair associated', + 'gene_symbol': 'BRCA1', + } + output = convert_gene_pyreference_to_gene_version_data(PYREFERENCE_GENE) + self.assertEquals(EXPECTED_DATA, output)