Skip to content

Commit

Permalink
Merge pull request #144 from cancervariants/issue-140-main
Browse files Browse the repository at this point in the history
refactor: how vrs chromosome/sequence locations are stored in dynamodb
  • Loading branch information
korikuzma committed Oct 19, 2022
2 parents 89a5149 + 0340a2c commit df72bd9
Show file tree
Hide file tree
Showing 11 changed files with 138 additions and 95 deletions.
2 changes: 1 addition & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ uvicorn = "*"
click = "*"
boto3 = "*"
"ga4gh.vrsatile.pydantic" = "==0.0.11"
"ga4gh.vrs" = {version = "==0.8.0dev0"}

[dev-packages]
gene = {editable = true, path = "."}
gffutils = "*"
"biocommons.seqrepo" = "*"
"ga4gh.vrs" = {version = "==0.8.0dev0", extras = ["extras"]}
psycopg2-binary = "*"
pytest = "*"
pre-commit = "*"
Expand Down
37 changes: 10 additions & 27 deletions gene/etl/vrs_locations/chromosome_location.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""This module defines GA4GH Chromosome Location."""
from ga4gh.vrs import models
from ga4gh.core import ga4gh_identify
import re
import logging
import python_jsonschema_objects

from pydantic.error_wrappers import ValidationError

from gene.schemas import GeneChromosomeLocation


logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)
Expand All @@ -12,25 +14,6 @@
class ChromosomeLocation:
"""The class for GA4GH Chromosome Location."""

def add_location(self, location):
    """Build the GA4GH VRS ChromosomeLocation for a gene.

    :param dict location: A gene's location; the ``chr``, ``start`` and
        ``end`` keys are read.
    :return: A dictionary of a GA4GH VRS ChromosomeLocation.
    """
    chromosome = location['chr']
    cytoband = models.CytobandInterval(
        start=location['start'],
        end=location['end'],
        type="CytobandInterval",
    )
    vrs_location = models.ChromosomeLocation(
        species_id="taxonomy:9606",
        chr=chromosome,
        interval=cytoband,
        type="ChromosomeLocation",
    )
    # Attach the computed GA4GH identifier before serializing to a dict.
    vrs_location._id = ga4gh_identify(vrs_location)
    return vrs_location.as_dict()

def get_location(self, location, gene):
"""Transform a gene's location into a Chromosome Location.
Expand All @@ -50,11 +33,11 @@ def get_location(self, location, gene):
location['start'] = 'cen'
location['end'] = 'qter'
try:
chr_location = \
self.add_location(
location)
except python_jsonschema_objects.validators. \
ValidationError as e:
chr_location = GeneChromosomeLocation(
chr=location["chr"],
start=location["start"],
end=location["end"]).dict()
except ValidationError as e:
logger.info(f"{e} for {gene['symbol']}")
else:
return chr_location
Expand Down
19 changes: 5 additions & 14 deletions gene/etl/vrs_locations/sequence_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
from typing import List
import logging

from ga4gh.vrs import models
from ga4gh.core import ga4gh_identify
from gene.schemas import GeneSequenceLocation

logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)
Expand Down Expand Up @@ -44,18 +43,10 @@ def add_location(self, seqid, gene, params, sr):

if gene.start != '.' and gene.end != '.' and sequence_id:
if 0 <= gene.start <= gene.end:
seq_location = models.SequenceLocation(
sequence_id=sequence_id,
interval=models.SequenceInterval(
start=models.Number(value=gene.start - 1,
type="Number"),
end=models.Number(value=gene.end, type="Number"),
type="SequenceInterval"
),
type="SequenceLocation"
)
seq_location._id = ga4gh_identify(seq_location)
location = seq_location.as_dict()
location = GeneSequenceLocation(
start=gene.start - 1,
end=gene.end,
sequence_id=sequence_id).dict()
else:
logger.info(f"{params['concept_id']} has invalid interval:"
f"start={gene.start - 1} end={gene.end}")
Expand Down
77 changes: 55 additions & 22 deletions gene/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,23 @@
import re
from typing import List, Dict, Set, Any, TypeVar, Callable, Optional
from urllib.parse import quote
from .version import __version__
from datetime import datetime

from ga4gh.vrsatile.pydantic.vrs_models import VRSTypes
from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, Extension
from botocore.exceptions import ClientError
from boto3.dynamodb.conditions import Key
from ga4gh.vrs import models
from ga4gh.core import ga4gh_identify

from gene import logger
from gene import NAMESPACE_LOOKUP, PREFIX_LOOKUP, ITEM_TYPES
from gene.database import Database
from gene.schemas import BaseGene, Gene, SourceMeta, MatchType, SourceName, \
ServiceMeta, SourcePriority, NormalizeService, SearchService, \
GeneTypeFieldName, UnmergedNormalizationService, MatchesNormalized, \
BaseNormalizationService
from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, Extension
from botocore.exceptions import ClientError
from boto3.dynamodb.conditions import Key
from datetime import datetime
from gene import logger

from gene.version import __version__

NormService = TypeVar("NormService", bound=BaseNormalizationService)

Expand Down Expand Up @@ -78,20 +82,47 @@ def fetch_meta(self, src_name: str) -> SourceMeta:
logger.error(e.response['Error']['Message'])

@staticmethod
def _cast_location_ints(record: Dict) -> Dict:
"""Ensure Locations are formatted correctly -- interval start and end need to
be recast to ints from how they're structured in DynamoDB
def _transform_chromosome_location(loc: Dict) -> models.ChromosomeLocation:
    """Build a VRS chromosome location from a flat chromosome location record.

    :param Dict loc: Chromosome location; the ``species_id``, ``chr``,
        ``start`` and ``end`` keys are read
    :return: VRS chromosome location
    """
    species_id = loc["species_id"]
    chromosome = loc["chr"]
    cytoband_interval = models.CytobandInterval(
        type="CytobandInterval",
        start=loc["start"],
        end=loc["end"],
    )
    return models.ChromosomeLocation(
        type="ChromosomeLocation",
        species_id=species_id,
        chr=chromosome,
        interval=cytoband_interval,
    )

def _transform_locations(self, record: Dict) -> Dict:
"""Transform gene locations to VRS Chromosome/Sequence Locations
:param Dict record: original record
:return: record with corrected locations attributes, if applicable
:return: record with transformed locations attributes, if applicable
"""
if 'locations' in record:
for loc in record['locations']:
if loc['interval']['type'] == "SequenceInterval":
loc['interval']['start']['value'] = \
int(loc['interval']['start']['value'])
loc['interval']['end']['value'] = \
int(loc['interval']['end']['value'])
record_locations = list()
if "locations" in record:
for loc in record["locations"]:
if loc["type"] == VRSTypes.SEQUENCE_LOCATION:
transformed_loc = models.SequenceLocation(
type="SequenceLocation",
sequence_id=loc["sequence_id"],
interval=models.SequenceInterval(
type="SequenceInterval",
start=models.Number(value=int(loc["start"]), type="Number"),
end=models.Number(value=int(loc["end"]), type="Number")))
else:
transformed_loc = self._transform_chromosome_location(loc)

transformed_loc._id = ga4gh_identify(transformed_loc)
transformed_loc = transformed_loc.as_dict()
record_locations.append(transformed_loc)

record["locations"] = record_locations
return record

def add_record(self,
Expand All @@ -109,7 +140,7 @@ def add_record(self,
"""
del item['label_and_type']
# DynamoDB Numbers get converted to Decimal
item = self._cast_location_ints(item)
item = self._transform_locations(item)
item["match_type"] = match_type
gene = Gene(**item)
src_name = item['src_name']
Expand Down Expand Up @@ -413,7 +444,9 @@ def add_gene_descriptor(
for ext_label, record_label in extension_and_record_labels:
if record_label in record and record[record_label]:
if ext_label == 'chromosome_location':
record[record_label] = record[record_label][0]
loc = self._transform_chromosome_location(record[record_label][0])
loc._id = ga4gh_identify(loc)
record[record_label] = loc.as_dict()
extensions.append(Extension(
name=ext_label,
value=record[record_label]
Expand Down Expand Up @@ -607,7 +640,7 @@ def _add_normalized_records(
if normalized_record["item_type"] == "identity":
record_source = SourceName[normalized_record["src_name"].upper()]
response.source_matches[record_source] = MatchesNormalized(
records=[BaseGene(**self._cast_location_ints(normalized_record))],
records=[BaseGene(**self._transform_locations(normalized_record))],
source_meta_=self.fetch_meta(record_source.value)
)
else:
Expand All @@ -618,7 +651,7 @@ def _add_normalized_records(
if not record:
continue
record_source = SourceName[record["src_name"].upper()]
gene = BaseGene(**self._cast_location_ints(record))
gene = BaseGene(**self._transform_locations(record))
if record_source in response.source_matches:
response.source_matches[record_source].records.append(gene)
else:
Expand Down
32 changes: 27 additions & 5 deletions gene/schemas.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
"""This module contains data models for representing VICC normalized
gene records.
"""
from typing import Type, List, Optional, Dict, Union, Any
from typing import Literal, Type, List, Optional, Dict, Union, Any
from pydantic import BaseModel, StrictBool, validator
from enum import Enum, IntEnum
from ga4gh.vrsatile.pydantic import return_value
from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation, \
ChromosomeLocation, CURIE
from ga4gh.vrsatile.pydantic.vrs_models import CURIE, VRSTypes, SequenceLocation, \
ChromosomeLocation
from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor
from pydantic.types import StrictStr
from pydantic.types import StrictStr, StrictInt


class SymbolStatus(str, Enum):
Expand Down Expand Up @@ -56,6 +56,25 @@ class MatchType(IntEnum):
NO_MATCH = 0


class GeneSequenceLocation(BaseModel):
    """Sequence Location model when storing in DynamoDB."""

    # Type tag; the only accepted value (and the default) is the VRS
    # sequence location type.
    type: Literal[VRSTypes.SEQUENCE_LOCATION] = VRSTypes.SEQUENCE_LOCATION
    # Interval bounds, held directly on the model as strict integers.
    start: StrictInt
    end: StrictInt
    # CURIE of the sequence this location refers to.
    sequence_id: CURIE


class GeneChromosomeLocation(BaseModel):
    """Chromosome Location model when storing in DynamoDB."""

    # Type tag; the only accepted value (and the default) is the VRS
    # chromosome location type.
    type: Literal[VRSTypes.CHROMOSOME_LOCATION] = VRSTypes.CHROMOSOME_LOCATION
    # Species is pinned to a single taxonomy CURIE.
    species_id: Literal["taxonomy:9606"] = "taxonomy:9606"
    # Chromosome name and the start/end of the interval, all strict strings.
    chr: StrictStr
    start: StrictStr
    end: StrictStr


class BaseGene(BaseModel):
"""Base gene model. Provide shared resources for records produced by
/search and /normalize_unmerged.
Expand All @@ -67,7 +86,10 @@ class BaseGene(BaseModel):
label: Optional[StrictStr]
strand: Optional[Strand]
location_annotations: Optional[List[StrictStr]] = []
locations: Optional[List[Union[SequenceLocation, ChromosomeLocation]]] = []
# Accepts either full VRS location models or the flattened Gene*Location
# models used when storing records in DynamoDB.
# NOTE: the default must not be followed by a trailing comma -- `= [],` is
# parsed as the one-element tuple `([],)`, not an empty list.
locations: Optional[Union[
    List[Union[SequenceLocation, ChromosomeLocation]],
    List[Union[GeneSequenceLocation, GeneChromosomeLocation]]  # dynamodb
]] = []
aliases: Optional[List[StrictStr]] = []
previous_symbols: Optional[List[StrictStr]] = []
xrefs: Optional[List[CURIE]] = []
Expand Down
30 changes: 13 additions & 17 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-i https://pypi.org/simple
anyio==3.6.1 ; python_full_version >= '3.6.2'
anyio==3.6.2 ; python_full_version >= '3.6.2'
appdirs==1.4.4
appnope==0.1.3 ; sys_platform == 'darwin'
argcomplete==2.0.0 ; python_version >= '3.6'
Expand All @@ -14,8 +14,8 @@ beautifulsoup4==4.11.1 ; python_full_version >= '3.6.0'
biocommons.seqrepo==0.6.5
bioutils==0.5.7 ; python_version >= '3.6'
bleach==5.0.1 ; python_version >= '3.7'
boto3==1.24.92
botocore==1.27.92 ; python_version >= '3.7'
boto3==1.24.93
botocore==1.27.93 ; python_version >= '3.7'
bs4==0.0.1
canonicaljson==1.6.3 ; python_version >= '3.7'
certifi==2022.9.24 ; python_version >= '3.6'
Expand All @@ -24,7 +24,6 @@ cfgv==3.3.1 ; python_full_version >= '3.6.1'
charset-normalizer==2.1.1 ; python_full_version >= '3.6.0'
click==8.1.3
coloredlogs==15.0.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
configparser==5.3.0 ; python_version >= '3.7'
coverage==6.5.0
coveralls==3.3.1
cssselect==1.1.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
Expand All @@ -41,12 +40,11 @@ fastjsonschema==2.16.2
filelock==3.8.0 ; python_version >= '3.7'
flake8==5.0.4
flake8-docstrings==1.6.0
ga4gh.vrs[extras]==0.8.0dev0
ga4gh.vrs==0.8.0dev0
ga4gh.vrsatile.pydantic==0.0.11
-e .
gffutils==0.11.1
h11==0.14.0 ; python_version >= '3.7'
hgvs==1.5.2
humanfriendly==10.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
identify==2.5.6 ; python_version >= '3.7'
idna==3.4 ; python_version >= '3.5'
Expand Down Expand Up @@ -86,7 +84,6 @@ numpy==1.23.4 ; python_version >= '3.8'
packaging==21.3 ; python_version >= '3.6'
pandocfilters==1.5.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
parse==1.19.0
parsley==1.3
parso==0.8.3 ; python_version >= '3.6'
pexpect==4.8.0 ; sys_platform != 'win32'
pickleshare==0.7.5
Expand All @@ -95,8 +92,7 @@ pluggy==1.0.0 ; python_version >= '3.6'
pre-commit==2.20.0
prometheus-client==0.15.0 ; python_version >= '3.6'
prompt-toolkit==3.0.31 ; python_full_version >= '3.6.2'
psutil==5.9.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
psycopg2==2.9.4 ; python_version >= '3.6'
psutil==5.9.3 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
psycopg2-binary==2.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
Expand All @@ -116,18 +112,18 @@ pyrsistent==0.18.1 ; python_version >= '3.7'
pysam==0.19.1
pytest==7.1.3
pytest-cov==4.0.0
python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'
python-jsonschema-objects==0.4.1
pytz==2022.4
pytz==2022.5
pyyaml==6.0 ; python_version >= '3.6'
pyzmq==24.0.1 ; python_version >= '3.6'
requests==2.28.1
requests==2.28.1 ; python_version >= '3.7' and python_version < '4'
requests-html==0.10.0 ; python_full_version >= '3.6.0'
s3transfer==0.6.0 ; python_version >= '3.7'
send2trash==1.8.0
setuptools==65.5.0 ; python_version >= '3.7'
simplejson==3.17.6 ; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3'
six==1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
simplejson==3.17.6 ; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2'
six==1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'
sniffio==1.3.0 ; python_version >= '3.7'
snowballstemmer==2.2.0
soupsieve==2.3.2.post1 ; python_version >= '3.6'
Expand All @@ -136,12 +132,12 @@ stack-data==0.5.1
starlette==0.20.4 ; python_version >= '3.7'
tabulate==0.9.0 ; python_version >= '3.7'
terminado==0.16.0 ; python_version >= '3.7'
tinycss2==1.2.0 ; python_version >= '3.7'
toml==0.10.2 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'
tinycss2==1.2.1 ; python_version >= '3.7'
toml==0.10.2 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'
tomli==2.0.1 ; python_version >= '3.7'
tornado==6.2 ; python_version >= '3.7'
tqdm==4.64.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
traitlets==5.4.0 ; python_version >= '3.7'
traitlets==5.5.0 ; python_version >= '3.7'
typing-extensions==4.4.0 ; python_version >= '3.7'
urllib3==1.26.12 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'
uvicorn==0.18.3
Expand Down
Loading

0 comments on commit df72bd9

Please sign in to comment.