Skip to content

Commit

Permalink
#494 - Insert data
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Sep 30, 2021
1 parent b98eb41 commit c66fd3e
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 55 deletions.
193 changes: 140 additions & 53 deletions genes/management/commands/import_gene_annotation2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,13 @@
from typing import Dict, List, Set

from django.core.management.base import BaseCommand
from genes.models import HGNC, GeneSymbol, GeneAnnotationImport
from genes.models import GeneSymbol, GeneAnnotationImport, Gene, GeneVersion, TranscriptVersion, Transcript
from genes.models_enums import AnnotationConsortium
from library.file_utils import open_handle_gzip
from snpdb.models.models_genome import GenomeBuild


class Command(BaseCommand):
"""
* Insert new gene symbols
* Insert new Gene
* Insert new GeneVerison
* Insert new Transcripts
* Insert new TranscriptVersions
UPDATES
GeneVersion - symbol + import
TranscriptVersion - data, gene_version + import
-------------
ReleaseGeneVersion
ReleaseTranscriptVersion
"""
BATCH_SIZE = 2000

def add_arguments(self, parser):
Expand All @@ -55,6 +36,7 @@ def handle(self, *args, **options):
pyreference_data.append(json.load(f))
merged_data = self._convert_to_merged_data(pyreference_data)
if save_merged_file := options["save_merged_file"]:
logging.info("Writing '%s'", save_merged_file)
with gzip.open(save_merged_file, 'w') as outfile:
json_str = json.dumps(merged_data)
outfile.write(json_str.encode('ascii'))
Expand All @@ -81,61 +63,161 @@ def _get_most_recent_transcripts(pyreference_data) -> List[Set]:

def _convert_to_merged_data(self, pyreference_data: List[Dict]) -> List[Dict]:
""" We want to make the minimal amount of data to insert - so only keep the last copy of transcripts """
print("_import_merged_data")
print("_convert_to_merged_data")
most_recent_transcripts = self._get_most_recent_transcripts(pyreference_data)

merged_data = []
for prd, transcripts in zip(pyreference_data, most_recent_transcripts):
gene_version = {}
transcript_versions = {}
transcript_gene_version = {}

for gene_id, gene in prd["genes_by_id"].items():
version = gene.get("version", 0)
gv_accession = f"{gene_id}.{version}"
need_gene = False
for transcript_accession in gene["transcripts"]:
if transcript_accession in transcripts:
gene_version[gene_id] = convert_gene_pyreference_to_gene_version_data(gene)
transcript_gene_version[transcript_accession] = gv_accession

if need_gene:
gene_version[gene_id] = convert_gene_pyreference_to_gene_version_data(gene)

for transcript_accession in transcripts:
transcript = prd["transcripts_by_id"][transcript_accession]
transcript_versions[transcript_accession] = convert_transcript_pyreference_to_pyhgvs(transcript)

data = {
"gene_annotation_import": prd["reference_gtf"],
"gene_version": gene_version,
"transcript_versions": transcript_versions,
}
merged_data.append(data)
tv_data = {
"biotype": transcript["biotype"],
"gene_version": transcript_gene_version[transcript_accession],
"data": convert_transcript_pyreference_to_pyhgvs(transcript),
}
transcript_versions[transcript_accession] = tv_data

if transcript_versions:
data = {
"gene_annotation_import": prd["reference_gtf"],
"gene_version": gene_version,
"transcript_versions": transcript_versions,
}
merged_data.append(data)

return merged_data


def _import_merged_data(self, merged_data: List[Dict]):
"""
"""
def _import_merged_data(self, genome_build: GenomeBuild, annotation_consortium, merged_data: List[Dict]):
""" """
print("_import_merged_data")

gene_symbols = []
genes = []
gene_versions = []
transcripts = []
transcript_versions = []
known_gene_symbols = set(GeneSymbol.objects.all().values_list("pk", flat=True))
genes_qs = Gene.objects.filter(annotation_consortium=annotation_consortium)
known_genes_ids = {genes_qs.values_list("identifier", flat=True)}
transcripts_qs = Transcript.objects.filter(annotation_consortium=annotation_consortium)
known_transcript_ids = {transcripts_qs.values_list("identifier", flat=True)}

gene_version_qs = GeneVersion.objects.filter(genome_build=genome_build,
annotation_consortium=annotation_consortium)
known_gene_version_ids_by_accession = {f"{gene_id}.{version}": pk for (pk, gene_id, version) in gene_version_qs.values_list("pk", "gene_id", "version")}
transcript_version_qs = TranscriptVersion.objects.filter(genome_build=genome_build,
annotation_consortium=annotation_consortium)
known_transcript_version_ids_by_accession = {f"{transcript_id}.{version}": pk for (pk, transcript_id, version) in transcript_version_qs.values_list("pk", "transcript_id", "version")}

for data in merged_data:
import_data = data["gene_annotation_import"]
logging.info("%s has %d transcripts", import_data, len(data["transcript_versions"]))
# import_source = GeneAnnotationImport.objects.create(annotation_consortium=self.annotation_consortium,
# genome_build=self.genome_build,
# filename=import_data["path"],
# url=import_data["url"],
# file_md5sum=import_data["md5sum"])

# Go through and create gene_symbols etc, assigning import_source

# Now, break up into new / old?

# bulk_create
# bulk update (with import source)


import_source = GeneAnnotationImport.objects.create(annotation_consortium=annotation_consortium,
genome_build=genome_build,
filename=import_data["path"],
url=import_data["url"],
file_md5sum=import_data["md5sum"])

new_gene_symbols = []
new_genes = []
new_gene_versions = []
modified_gene_versions = []

for gene_id, gv_data in data["gene_version"].items():
if gene_id not in known_genes_ids:
new_genes.append(Gene(identifier=gene_id,
annotation_consortium=annotation_consortium))

if symbol := gv_data["gene_symbol"]:
if symbol not in known_gene_symbols:
new_gene_symbols.append(GeneSymbol(symbol=symbol))
# RefSeq have no version, set as 0 if missing
version = gv_data.get("version", 0)

gene_version = GeneVersion(gene_id=gene_id,
version=version,
gene_symbol_id=symbol,
hgnc_id=gv_data.get("hgnc"),
description=gv_data.get("description"),
biotype=gv_data.get("biotype"),
genome_build=genome_build,
import_source=import_source)
gv_accession = f"{gene_id}.{version}"
if pk := known_gene_version_ids_by_accession.get(gv_accession):
gene_version.pk = pk
modified_gene_versions.append(gene_version)
else:
new_gene_versions.append(gene_version)

if new_gene_symbols:
logging.info("Creating %d new gene symbols", len(new_gene_symbols))
GeneSymbol.objects.bulk_create(new_gene_symbols, batch_size=self.BATCH_SIZE)

if new_genes:
logging.info("Creating %d new genes", len(new_genes))
Gene.objects.bulk_create(new_genes, batch_size=self.BATCH_SIZE)

if new_gene_versions:
logging.info("Creating %d new gene versions", len(new_gene_versions))
GeneVersion.objects.bulk_create(new_gene_versions, batch_size=self.BATCH_SIZE)

# Update with newly inserted gene versions - so that we have a PK to use below
known_gene_version_ids_by_accession.update({f"{gv.gene_id}.{gv.version}" for gv in new_gene_versions})

# Could potentially be duplicate gene versions (diff transcript versions from diff GFFs w/same GeneVersion)
if modified_gene_versions:
logging.info("Updating %d gene versions", len(modified_gene_versions))
GeneVersion.objects.bulk_update(modified_gene_versions,
["gene_symbol_id", "hgnc_id", "description", "biotype", "import_source"],
batch_size=self.BATCH_SIZE)

new_transcripts = []
new_transcript_versions = []
modified_transcript_versions = []

for transcript_accession, tv_data in data["transcript_version"].items():
transcript_id, version = TranscriptVersion.get_transcript_id_and_version(transcript_accession)
if transcript_id not in known_transcript_ids:
new_transcripts.append(Transcript(identifier=transcript_id,
annotation_consortium=annotation_consortium))

gene_version_id = known_gene_version_ids_by_accession[tv_data["gene_version"]]
transcript_version = TranscriptVersion(transcript_id=transcript_id,
version=version,
gene_version_id=gene_version_id,
genome_build=genome_build,
import_source=import_source,
biotype=tv_data["biotype"],
data=tv_data["data"])
if pk := known_transcript_version_ids_by_accession.get(transcript_accession):
transcript_version.pk = pk
modified_transcript_versions.append(transcript_version)
else:
new_transcript_versions.append(transcript_version)

if new_transcripts:
logging.info("Creating %d new transcripts", len(new_transcripts))
Transcript.objects.bulk_create(new_transcripts, batch_size=self.BATCH_SIZE)

if new_transcript_versions:
logging.info("Creating %d new transcript versions", len(new_transcript_versions))
TranscriptVersion.objects.bulk_create(new_transcript_versions, batch_size=self.BATCH_SIZE)

if modified_transcript_versions:
logging.info("Updating %d transcript versions", len(modified_transcript_versions))
TranscriptVersion.objects.bulk_update(modified_transcript_versions,
["gene_version_id", "import_source", "biotype", "data"],
batch_size=self.BATCH_SIZE)


def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict:
Expand All @@ -148,6 +230,11 @@ def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict:
if hgnc_str := gene_data.get("HGNC"):
# Has HGNC: (5 characters) at start of it
gene_version_data["hgnc"] = hgnc_str[5:]

# Only Ensembl Genes have versions
if version := gene_data.get("version"):
gene_data["version"] = version

return gene_version_data


Expand Down
20 changes: 20 additions & 0 deletions genes/migrations/0042_one_off_refseq_gene_version_zero.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Generated by Django 3.2.6 on 2021-09-30 12:51

from django.db import migrations


def _one_off_refseq_gene_version_zero(apps, schema_editor):
# Make RefSeq gene versions 0 instead of 1, so we know they're not real
GeneVersion = apps.get_model("genes", "GeneVersion")
GeneVersion.objects.filter(gene__annotation_consortium='R').update(version=0)


class Migration(migrations.Migration):

dependencies = [
('genes', '0041_auto_20210930_1415'),
]

operations = [
migrations.RunPython(_one_off_refseq_gene_version_zero)
]
4 changes: 2 additions & 2 deletions genes/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ class GeneVersion(models.Model):
""" A specific version of a Gene for a particular version/genome build
Genes/TranscriptVersion needs to be able to represent both RefSeq and Ensembl """
gene = models.ForeignKey(Gene, on_delete=CASCADE)
version = models.IntegerField() # RefSeq GeneIDs are always 1 (not versioned)
version = models.IntegerField() # RefSeq GeneIDs are always 0 (not versioned) need non-null for unique_together
# symbol can be null as Ensembl has genes w/o symbols, eg ENSG00000238009 (lncRNA)
gene_symbol = models.ForeignKey(GeneSymbol, null=True, on_delete=CASCADE)
hgnc = models.ForeignKey(HGNC, null=True, on_delete=CASCADE)
Expand All @@ -431,7 +431,7 @@ class Meta:

@lazy
def accession(self):
if self.version is not None and self.gene.has_versions():
if self.gene.has_versions():
acc = f"{self.gene_id}.{self.version}"
else:
acc = self.gene_id
Expand Down

0 comments on commit c66fd3e

Please sign in to comment.