Skip to content

Commit

Permalink
#480 - store API responses so we can check lengths
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Sep 14, 2021
1 parent 686df0c commit a377329
Show file tree
Hide file tree
Showing 5 changed files with 312 additions and 87 deletions.
73 changes: 0 additions & 73 deletions genes/annotation_consortium_api.py

This file was deleted.

2 changes: 1 addition & 1 deletion genes/hgvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,7 @@ def _variant_to_hgvs(self, variant: Variant, transcript_name=None) -> Tuple[HGVS
raise ValueError(f"Could not convert {variant} to HGVS - tried: {attempts}")
else:
# No methods tried, mustn't have had any transcripts
raise TranscriptVersion.raise_bad_or_missing_transcript(self.genome_build, transcript_name)
raise TranscriptVersion.raise_bad_or_missing_transcript(transcript_name)
else:
hgvs_name = pyhgvs.variant_to_hgvs_name(chrom, offset, ref, alt, self.genome_build.genome_fasta.fasta,
transcript=None, max_allele_length=sys.maxsize)
Expand Down
59 changes: 59 additions & 0 deletions genes/management/commands/import_refseq_transcript_fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import logging
from collections import defaultdict

from Bio import SeqIO
from django.core.management import BaseCommand
from django.db.models import QuerySet

from genes.models import TranscriptVersionInfo, TranscriptVersionInfoFastaFileImport, TranscriptVersion, Transcript
from genes.models_enums import AnnotationConsortium
from library.file_utils import file_md5sum, open_handle_gzip




class Command(BaseCommand):
    """Import transcript sequences from a RefSeq FASTA file.

    Each FASTA record whose transcript identifier is already present in the
    ``Transcript`` table is stored as a ``TranscriptVersionInfo`` (sequence and
    length), linked to a ``TranscriptVersionInfoFastaFileImport`` row keyed by
    the file's md5sum.
    """
    help = "Import RefSeq transcript sequences from a fasta file as TranscriptVersionInfo records"

    def add_arguments(self, parser):
        """Register the ``--overwrite`` flag and the positional ``filename``."""
        parser.add_argument('--overwrite', action='store_true', help='Delete and replace fasta import with same md5sum')
        parser.add_argument('filename')

    def handle(self, *args, **options):
        """Run the import.

        Options read:
            filename: path of the FASTA file (opened via ``open_handle_gzip``).
            overwrite: when true, an existing import with the same md5sum is
                deleted and replaced.

        Raises:
            ValueError: if an import with the same md5sum exists and
                ``--overwrite`` was not given, or if there are no
                ``Transcript`` rows to match against.
        """
        # Function-scope import keeps this block self-contained.
        from django.db import transaction

        filename = options["filename"]
        overwrite = options["overwrite"]

        md5_hash = file_md5sum(filename)

        # All database work happens in one transaction: if anything below fails
        # (including after --overwrite has deleted the old import) the database
        # is rolled back, so we never leave a half-finished import whose md5sum
        # would block a plain re-run.
        with transaction.atomic():
            if existing_import := TranscriptVersionInfoFastaFileImport.objects.filter(md5_hash=md5_hash).first():
                if overwrite:
                    print(f"Deleting existing TranscriptVersionInfos for fasta import {md5_hash}")
                    existing_import.delete()
                else:
                    raise ValueError(f"Fasta import {md5_hash} exists, use --overwrite to delete old data")

            # Set for O(1) membership tests while streaming the FASTA file
            known_transcripts = set(Transcript.objects.all().values_list("identifier", flat=True))
            if not known_transcripts:
                raise ValueError("No transcripts! Insert them first!")

            fasta_import = TranscriptVersionInfoFastaFileImport.objects.create(
                md5_hash=md5_hash,
                annotation_consortium=AnnotationConsortium.REFSEQ,
                filename=filename)
            skipped_transcripts = 0
            records = []
            with open_handle_gzip(filename, "rt") as f:
                for record in SeqIO.parse(f, "fasta"):
                    transcript_id, version = TranscriptVersion.get_transcript_id_and_version(record.id)
                    if transcript_id not in known_transcripts:
                        skipped_transcripts += 1
                        continue

                    tvi = TranscriptVersionInfo(transcript_id=transcript_id, version=version,
                                                fasta_import=fasta_import,
                                                sequence=str(record.seq), length=len(record.seq))
                    records.append(tvi)

            print(f"Skipped {skipped_transcripts} transcripts not in our database")
            if num_records := len(records):
                print(f"Inserting {num_records} TranscriptVersionInfo records")
                # ignore_conflicts: rows that clash with an existing record are skipped
                TranscriptVersionInfo.objects.bulk_create(records, ignore_conflicts=True, batch_size=2000)

            TranscriptVersionInfo.set_transcript_version_alignment_gap_if_length_different(records)
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Generated by Django 3.2.6 on 2021-09-14 02:01

from django.db import migrations, models
import django.db.models.deletion
import django_extensions.db.fields


class Migration(migrations.Migration):
    # Adds two models: TranscriptVersionInfoFastaFileImport (one row per imported
    # fasta file) and TranscriptVersionInfo (per transcript/version data).

    dependencies = [
        ('genes', '0038_lrgrefseqgene'),
    ]

    operations = [
        # One row per imported fasta file
        migrations.CreateModel(
            name='TranscriptVersionInfoFastaFileImport',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('created', django_extensions.db.fields.CreationDateTimeField(auto_now_add=True, verbose_name='created')),
                ('modified', django_extensions.db.fields.ModificationDateTimeField(auto_now=True, verbose_name='modified')),
                # unique=True: the same md5 hash cannot be stored twice
                ('md5_hash', models.CharField(max_length=32, unique=True)),
                ('annotation_consortium', models.CharField(choices=[('R', 'RefSeq'), ('E', 'Ensembl')], max_length=1)),
                ('filename', models.TextField()),
            ],
            options={
                'get_latest_by': 'modified',
                'abstract': False,
            },
        ),
        # Per transcript/version record holding the sequence and its length
        migrations.CreateModel(
            name='TranscriptVersionInfo',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('created', django_extensions.db.fields.CreationDateTimeField(auto_now_add=True, verbose_name='created')),
                ('modified', django_extensions.db.fields.ModificationDateTimeField(auto_now=True, verbose_name='modified')),
                ('version', models.IntegerField()),
                # Nullable - presumably only set for records obtained from an API; verify against the model
                ('api_response', models.TextField(null=True)),
                ('sequence', models.TextField()),
                ('length', models.IntegerField()),
                # Nullable link to the fasta import; CASCADE removes these rows when the import row is deleted
                ('fasta_import', models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='genes.transcriptversioninfofastafileimport')),
                ('transcript', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='genes.transcript')),
            ],
            options={
                # At most one info record per (transcript, version) pair
                'unique_together': {('transcript', 'version')},
            },
        ),
    ]
Loading

0 comments on commit a377329

Please sign in to comment.