diff --git a/genes/management/commands/import_gene_annotation2.py b/genes/management/commands/import_gene_annotation2.py index 49192ffcc..9d505140f 100644 --- a/genes/management/commands/import_gene_annotation2.py +++ b/genes/management/commands/import_gene_annotation2.py @@ -4,6 +4,8 @@ from typing import Dict, List, Set from django.core.management.base import BaseCommand +from django.db.models.functions import Upper + from genes.models import GeneSymbol, GeneAnnotationImport, Gene, GeneVersion, TranscriptVersion, Transcript from genes.models_enums import AnnotationConsortium from library.file_utils import open_handle_gzip @@ -86,6 +88,7 @@ def _convert_to_merged_data(self, pyreference_data: List[Dict]) -> List[Dict]: for transcript_accession in gene["transcripts"]: if transcript_accession in transcripts: transcript_gene_version[transcript_accession] = gv_accession + need_gene = True if need_gene: gene_version[gene_id] = convert_gene_pyreference_to_gene_version_data(gene) @@ -93,7 +96,7 @@ def _convert_to_merged_data(self, pyreference_data: List[Dict]) -> List[Dict]: for transcript_accession in transcripts: transcript = prd["transcripts_by_id"][transcript_accession] tv_data = { - "biotype": transcript["biotype"], + "biotype": ",".join(transcript["biotype"]), "gene_version": transcript_gene_version[transcript_accession], "data": convert_transcript_pyreference_to_pyhgvs(transcript), } @@ -113,7 +116,7 @@ def _import_merged_data(self, genome_build: GenomeBuild, annotation_consortium, """ """ print("_import_merged_data") - known_gene_symbols = set(GeneSymbol.objects.all().values_list("pk", flat=True)) + known_uc_gene_symbols = set(GeneSymbol.objects.annotate(uc_symbol=Upper("symbol")).values_list("uc_symbol", flat=True)) genes_qs = Gene.objects.filter(annotation_consortium=annotation_consortium) known_genes_ids = set(genes_qs.values_list("identifier", flat=True)) transcripts_qs = Transcript.objects.filter(annotation_consortium=annotation_consortium) @@ -131,13 +134,12 @@ def _import_merged_data(self, genome_build: GenomeBuild, annotation_consortium, for data in merged_data: import_data = data["gene_annotation_import"] logging.info("%s has %d transcripts", import_data, len(data["transcript_version"])) - import_source = GeneAnnotationImport.objects.create(annotation_consortium=annotation_consortium, - genome_build=genome_build, - filename=import_data["path"], - url=import_data["url"], - file_md5sum=import_data["md5sum"]) - - new_gene_symbols = [] + import_source = GeneAnnotationImport.objects.get_or_create(annotation_consortium=annotation_consortium, + genome_build=genome_build, + filename=import_data["path"], + url=import_data["url"], + file_md5sum=import_data["md5sum"])[0] + new_gene_symbols = set() new_genes = [] new_gene_versions = [] modified_gene_versions = [] @@ -148,8 +150,8 @@ def _import_merged_data(self, genome_build: GenomeBuild, annotation_consortium, annotation_consortium=annotation_consortium)) if symbol := gv_data["gene_symbol"]: - if symbol not in known_gene_symbols: - new_gene_symbols.append(GeneSymbol(symbol=symbol)) + if symbol.upper() not in known_uc_gene_symbols: + new_gene_symbols.add(symbol) # RefSeq have no version, set as 0 if missing version = gv_data.get("version", 0) @@ -170,8 +172,9 @@ def _import_merged_data(self, genome_build: GenomeBuild, annotation_consortium, if new_gene_symbols: logging.info("Creating %d new gene symbols", len(new_gene_symbols)) - GeneSymbol.objects.bulk_create(new_gene_symbols, batch_size=self.BATCH_SIZE) - known_gene_symbols.update({gene_symbol.symbol for gene_symbol in new_gene_symbols}) + GeneSymbol.objects.bulk_create([GeneSymbol(symbol=symbol) for symbol in new_gene_symbols], + batch_size=self.BATCH_SIZE) + known_uc_gene_symbols.update((s.upper() for s in new_gene_symbols)) if new_genes: logging.info("Creating %d new genes", len(new_genes)) diff --git a/genes/models.py b/genes/models.py index 2712e22a2..197cf9e9c 100644 --- a/genes/models.py +++ b/genes/models.py @@ -341,6 +341,8 @@ class GeneAnnotationImport(TimeStampedModel): file_md5sum = models.TextField() def __str__(self): + if self.url: + return self.url return os.path.basename(self.filename) @@ -672,7 +674,7 @@ def alignment_gap(self): if "cdna_match" in self.data or "partial" in self.data: return True tvsi = TranscriptVersionSequenceInfo.get(self.accession) - return tvsi.length == self.length + return tvsi.length != self.length # Ensembl transcripts use genomic sequence so there is never any gap return False