Skip to content

Commit

Permalink
#494 - Insert data fixes - case insensitive genes, biotype formatting, include gene versions
Browse files (browse the repository at this point in the history)
  • Loading branch information
davmlaw committed Oct 1, 2021
1 parent 7a42077 commit 30c4174
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 14 deletions.
29 changes: 16 additions & 13 deletions genes/management/commands/import_gene_annotation2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from typing import Dict, List, Set

from django.core.management.base import BaseCommand
from django.db.models.functions import Upper

from genes.models import GeneSymbol, GeneAnnotationImport, Gene, GeneVersion, TranscriptVersion, Transcript
from genes.models_enums import AnnotationConsortium
from library.file_utils import open_handle_gzip
Expand Down Expand Up @@ -86,14 +88,15 @@ def _convert_to_merged_data(self, pyreference_data: List[Dict]) -> List[Dict]:
for transcript_accession in gene["transcripts"]:
if transcript_accession in transcripts:
transcript_gene_version[transcript_accession] = gv_accession
need_gene = True

if need_gene:
gene_version[gene_id] = convert_gene_pyreference_to_gene_version_data(gene)

for transcript_accession in transcripts:
transcript = prd["transcripts_by_id"][transcript_accession]
tv_data = {
"biotype": transcript["biotype"],
"biotype": ",".join(transcript["biotype"]),
"gene_version": transcript_gene_version[transcript_accession],
"data": convert_transcript_pyreference_to_pyhgvs(transcript),
}
Expand All @@ -113,7 +116,7 @@ def _import_merged_data(self, genome_build: GenomeBuild, annotation_consortium,
""" """
print("_import_merged_data")

known_gene_symbols = set(GeneSymbol.objects.all().values_list("pk", flat=True))
known_uc_gene_symbols = set(GeneSymbol.objects.annotate(uc_symbol=Upper("symbol")).values_list("uc_symbol", flat=True))
genes_qs = Gene.objects.filter(annotation_consortium=annotation_consortium)
known_genes_ids = set(genes_qs.values_list("identifier", flat=True))
transcripts_qs = Transcript.objects.filter(annotation_consortium=annotation_consortium)
Expand All @@ -131,13 +134,12 @@ def _import_merged_data(self, genome_build: GenomeBuild, annotation_consortium,
for data in merged_data:
import_data = data["gene_annotation_import"]
logging.info("%s has %d transcripts", import_data, len(data["transcript_version"]))
import_source = GeneAnnotationImport.objects.create(annotation_consortium=annotation_consortium,
genome_build=genome_build,
filename=import_data["path"],
url=import_data["url"],
file_md5sum=import_data["md5sum"])

new_gene_symbols = []
import_source = GeneAnnotationImport.objects.get_or_create(annotation_consortium=annotation_consortium,
genome_build=genome_build,
filename=import_data["path"],
url=import_data["url"],
file_md5sum=import_data["md5sum"])[0]
new_gene_symbols = set()
new_genes = []
new_gene_versions = []
modified_gene_versions = []
Expand All @@ -148,8 +150,8 @@ def _import_merged_data(self, genome_build: GenomeBuild, annotation_consortium,
annotation_consortium=annotation_consortium))

if symbol := gv_data["gene_symbol"]:
if symbol not in known_gene_symbols:
new_gene_symbols.append(GeneSymbol(symbol=symbol))
if symbol.upper() not in known_uc_gene_symbols:
new_gene_symbols.add(symbol)
# RefSeq have no version, set as 0 if missing
version = gv_data.get("version", 0)

Expand All @@ -170,8 +172,9 @@ def _import_merged_data(self, genome_build: GenomeBuild, annotation_consortium,

if new_gene_symbols:
logging.info("Creating %d new gene symbols", len(new_gene_symbols))
GeneSymbol.objects.bulk_create(new_gene_symbols, batch_size=self.BATCH_SIZE)
known_gene_symbols.update({gene_symbol.symbol for gene_symbol in new_gene_symbols})
GeneSymbol.objects.bulk_create([GeneSymbol(symbol=symbol) for symbol in new_gene_symbols],
batch_size=self.BATCH_SIZE)
known_uc_gene_symbols.update((s.upper() for s in new_gene_symbols))

if new_genes:
logging.info("Creating %d new genes", len(new_genes))
Expand Down
4 changes: 3 additions & 1 deletion genes/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,8 @@ class GeneAnnotationImport(TimeStampedModel):
file_md5sum = models.TextField()

def __str__(self):
if self.url:
return self.url
return os.path.basename(self.filename)


Expand Down Expand Up @@ -672,7 +674,7 @@ def alignment_gap(self):
if "cdna_match" in self.data or "partial" in self.data:
return True
tvsi = TranscriptVersionSequenceInfo.get(self.accession)
return tvsi.length == self.length
return tvsi.length != self.length

# Ensembl transcripts use genomic sequence so there is never any gap
return False
Expand Down

0 comments on commit 30c4174

Please sign in to comment.