diff --git a/Pipfile b/Pipfile index c7f0cf2e..7c33f6ba 100644 --- a/Pipfile +++ b/Pipfile @@ -10,12 +10,12 @@ uvicorn = "*" click = "*" boto3 = "*" "ga4gh.vrsatile.pydantic" = "==0.0.11" +"ga4gh.vrs" = {version = "==0.8.0dev0"} [dev-packages] gene = {editable = true, path = "."} gffutils = "*" "biocommons.seqrepo" = "*" -"ga4gh.vrs" = {version = "==0.8.0dev0", extras = ["extras"]} psycopg2-binary = "*" pytest = "*" pre-commit = "*" diff --git a/gene/etl/vrs_locations/chromosome_location.py b/gene/etl/vrs_locations/chromosome_location.py index e2c36e02..c407582b 100644 --- a/gene/etl/vrs_locations/chromosome_location.py +++ b/gene/etl/vrs_locations/chromosome_location.py @@ -1,9 +1,11 @@ """This module defines GA4GH Chromosome Location.""" -from ga4gh.vrs import models -from ga4gh.core import ga4gh_identify import re import logging -import python_jsonschema_objects + +from pydantic.error_wrappers import ValidationError + +from gene.schemas import GeneChromosomeLocation + logger = logging.getLogger('gene') logger.setLevel(logging.DEBUG) @@ -12,25 +14,6 @@ class ChromosomeLocation: """The class for GA4GH Chromosome Location.""" - def add_location(self, location): - """Get a gene's Chromosome Location. - - :param dict location: A gene's location. - :return: A dictionary of a GA4GH VRS ChromosomeLocation. - """ - chr_location = models.ChromosomeLocation( - species_id="taxonomy:9606", - chr=location['chr'], - interval=models.CytobandInterval( - start=location['start'], - end=location['end'], - type="CytobandInterval" - ), - type="ChromosomeLocation" - ) - chr_location._id = ga4gh_identify(chr_location) - return chr_location.as_dict() - def get_location(self, location, gene): """Transform a gene's location into a Chromosome Location. @@ -50,11 +33,11 @@ def get_location(self, location, gene): location['start'] = 'cen' location['end'] = 'qter' try: - chr_location = \ - self.add_location( - location) - except python_jsonschema_objects.validators. 
\ - ValidationError as e: + chr_location = GeneChromosomeLocation( + chr=location["chr"], + start=location["start"], + end=location["end"]).dict() + except ValidationError as e: logger.info(f"{e} for {gene['symbol']}") else: return chr_location diff --git a/gene/etl/vrs_locations/sequence_location.py b/gene/etl/vrs_locations/sequence_location.py index d33ecd11..26219c87 100644 --- a/gene/etl/vrs_locations/sequence_location.py +++ b/gene/etl/vrs_locations/sequence_location.py @@ -2,8 +2,7 @@ from typing import List import logging -from ga4gh.vrs import models -from ga4gh.core import ga4gh_identify +from gene.schemas import GeneSequenceLocation logger = logging.getLogger('gene') logger.setLevel(logging.DEBUG) @@ -44,18 +43,10 @@ def add_location(self, seqid, gene, params, sr): if gene.start != '.' and gene.end != '.' and sequence_id: if 0 <= gene.start <= gene.end: - seq_location = models.SequenceLocation( - sequence_id=sequence_id, - interval=models.SequenceInterval( - start=models.Number(value=gene.start - 1, - type="Number"), - end=models.Number(value=gene.end, type="Number"), - type="SequenceInterval" - ), - type="SequenceLocation" - ) - seq_location._id = ga4gh_identify(seq_location) - location = seq_location.as_dict() + location = GeneSequenceLocation( + start=gene.start - 1, + end=gene.end, + sequence_id=sequence_id).dict() else: logger.info(f"{params['concept_id']} has invalid interval:" f"start={gene.start - 1} end={gene.end}") diff --git a/gene/query.py b/gene/query.py index 32ec2e6c..6da3a7bc 100644 --- a/gene/query.py +++ b/gene/query.py @@ -2,19 +2,23 @@ import re from typing import List, Dict, Set, Any, TypeVar, Callable, Optional from urllib.parse import quote -from .version import __version__ +from datetime import datetime + +from ga4gh.vrsatile.pydantic.vrs_models import VRSTypes +from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, Extension +from botocore.exceptions import ClientError +from boto3.dynamodb.conditions import Key +from 
ga4gh.vrs import models +from ga4gh.core import ga4gh_identify + +from gene import logger from gene import NAMESPACE_LOOKUP, PREFIX_LOOKUP, ITEM_TYPES from gene.database import Database from gene.schemas import BaseGene, Gene, SourceMeta, MatchType, SourceName, \ ServiceMeta, SourcePriority, NormalizeService, SearchService, \ GeneTypeFieldName, UnmergedNormalizationService, MatchesNormalized, \ BaseNormalizationService -from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, Extension -from botocore.exceptions import ClientError -from boto3.dynamodb.conditions import Key -from datetime import datetime -from gene import logger - +from gene.version import __version__ NormService = TypeVar("NormService", bound=BaseNormalizationService) @@ -78,20 +82,47 @@ def fetch_meta(self, src_name: str) -> SourceMeta: logger.error(e.response['Error']['Message']) @staticmethod - def _cast_location_ints(record: Dict) -> Dict: - """Ensure Locations are formatted correctly -- interval start and end need to - be recast to ints from how they're structured in DynamoDB + def _transform_chromosome_location(loc: Dict) -> models.ChromosomeLocation: + """Transform a chromosome location to VRS chromosome location + + :param Dict loc: Chromosome location + :return: VRS chromosome location + """ + transformed_loc = models.ChromosomeLocation( + type="ChromosomeLocation", + species_id=loc["species_id"], + chr=loc["chr"], + interval=models.CytobandInterval( + type="CytobandInterval", + start=loc["start"], + end=loc["end"])) + return transformed_loc + + def _transform_locations(self, record: Dict) -> Dict: + """Transform gene locations to VRS Chromosome/Sequence Locations :param Dict record: original record - :return: record with corrected locations attributes, if applicable + :return: record with transformed locations attributes, if applicable """ - if 'locations' in record: - for loc in record['locations']: - if loc['interval']['type'] == "SequenceInterval": - 
loc['interval']['start']['value'] = \ - int(loc['interval']['start']['value']) - loc['interval']['end']['value'] = \ - int(loc['interval']['end']['value']) + record_locations = list() + if "locations" in record: + for loc in record["locations"]: + if loc["type"] == VRSTypes.SEQUENCE_LOCATION: + transformed_loc = models.SequenceLocation( + type="SequenceLocation", + sequence_id=loc["sequence_id"], + interval=models.SequenceInterval( + type="SequenceInterval", + start=models.Number(value=int(loc["start"]), type="Number"), + end=models.Number(value=int(loc["end"]), type="Number"))) + else: + transformed_loc = self._transform_chromosome_location(loc) + + transformed_loc._id = ga4gh_identify(transformed_loc) + transformed_loc = transformed_loc.as_dict() + record_locations.append(transformed_loc) + + record["locations"] = record_locations return record def add_record(self, @@ -109,7 +140,7 @@ def add_record(self, """ del item['label_and_type'] # DynamoDB Numbers get converted to Decimal - item = self._cast_location_ints(item) + item = self._transform_locations(item) item["match_type"] = match_type gene = Gene(**item) src_name = item['src_name'] @@ -413,7 +444,9 @@ def add_gene_descriptor( for ext_label, record_label in extension_and_record_labels: if record_label in record and record[record_label]: if ext_label == 'chromosome_location': - record[record_label] = record[record_label][0] + loc = self._transform_chromosome_location(record[record_label][0]) + loc._id = ga4gh_identify(loc) + record[record_label] = loc.as_dict() extensions.append(Extension( name=ext_label, value=record[record_label] @@ -607,7 +640,7 @@ def _add_normalized_records( if normalized_record["item_type"] == "identity": record_source = SourceName[normalized_record["src_name"].upper()] response.source_matches[record_source] = MatchesNormalized( - records=[BaseGene(**self._cast_location_ints(normalized_record))], + records=[BaseGene(**self._transform_locations(normalized_record))], 
source_meta_=self.fetch_meta(record_source.value) ) else: @@ -618,7 +651,7 @@ def _add_normalized_records( if not record: continue record_source = SourceName[record["src_name"].upper()] - gene = BaseGene(**self._cast_location_ints(record)) + gene = BaseGene(**self._transform_locations(record)) if record_source in response.source_matches: response.source_matches[record_source].records.append(gene) else: diff --git a/gene/schemas.py b/gene/schemas.py index 41b86239..6b178c4d 100644 --- a/gene/schemas.py +++ b/gene/schemas.py @@ -1,14 +1,14 @@ """This module contains data models for representing VICC normalized gene records. """ -from typing import Type, List, Optional, Dict, Union, Any +from typing import Literal, Type, List, Optional, Dict, Union, Any from pydantic import BaseModel, StrictBool, validator from enum import Enum, IntEnum from ga4gh.vrsatile.pydantic import return_value -from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation, \ - ChromosomeLocation, CURIE +from ga4gh.vrsatile.pydantic.vrs_models import CURIE, VRSTypes, SequenceLocation, \ + ChromosomeLocation from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor -from pydantic.types import StrictStr +from pydantic.types import StrictStr, StrictInt class SymbolStatus(str, Enum): @@ -56,6 +56,25 @@ class MatchType(IntEnum): NO_MATCH = 0 +class GeneSequenceLocation(BaseModel): + """Sequence Location model when storing in DynamoDB.""" + + type: Literal[VRSTypes.SEQUENCE_LOCATION] = VRSTypes.SEQUENCE_LOCATION + start: StrictInt + end: StrictInt + sequence_id: CURIE + + +class GeneChromosomeLocation(BaseModel): + """Chromosome Location model when storing in DynamoDB.""" + + type: Literal[VRSTypes.CHROMOSOME_LOCATION] = VRSTypes.CHROMOSOME_LOCATION + species_id: Literal["taxonomy:9606"] = "taxonomy:9606" + chr: StrictStr + start: StrictStr + end: StrictStr + + class BaseGene(BaseModel): """Base gene model. Provide shared resources for records produced by /search and /normalize_unmerged. 
@@ -67,7 +86,10 @@ class BaseGene(BaseModel): label: Optional[StrictStr] strand: Optional[Strand] location_annotations: Optional[List[StrictStr]] = [] - locations: Optional[List[Union[SequenceLocation, ChromosomeLocation]]] = [] + locations: Optional[Union[ + List[Union[SequenceLocation, ChromosomeLocation]], + List[Union[GeneSequenceLocation, GeneChromosomeLocation]] # dynamodb + ]] = [] aliases: Optional[List[StrictStr]] = [] previous_symbols: Optional[List[StrictStr]] = [] xrefs: Optional[List[CURIE]] = [] diff --git a/requirements-dev.txt b/requirements-dev.txt index fbc0854f..8f55fa74 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ -i https://pypi.org/simple -anyio==3.6.1 ; python_full_version >= '3.6.2' +anyio==3.6.2 ; python_full_version >= '3.6.2' appdirs==1.4.4 appnope==0.1.3 ; sys_platform == 'darwin' argcomplete==2.0.0 ; python_version >= '3.6' @@ -14,8 +14,8 @@ beautifulsoup4==4.11.1 ; python_full_version >= '3.6.0' biocommons.seqrepo==0.6.5 bioutils==0.5.7 ; python_version >= '3.6' bleach==5.0.1 ; python_version >= '3.7' -boto3==1.24.92 -botocore==1.27.92 ; python_version >= '3.7' +boto3==1.24.93 +botocore==1.27.93 ; python_version >= '3.7' bs4==0.0.1 canonicaljson==1.6.3 ; python_version >= '3.7' certifi==2022.9.24 ; python_version >= '3.6' @@ -24,7 +24,6 @@ cfgv==3.3.1 ; python_full_version >= '3.6.1' charset-normalizer==2.1.1 ; python_full_version >= '3.6.0' click==8.1.3 coloredlogs==15.0.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -configparser==5.3.0 ; python_version >= '3.7' coverage==6.5.0 coveralls==3.3.1 cssselect==1.1.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' @@ -41,12 +40,11 @@ fastjsonschema==2.16.2 filelock==3.8.0 ; python_version >= '3.7' flake8==5.0.4 flake8-docstrings==1.6.0 -ga4gh.vrs[extras]==0.8.0dev0 +ga4gh.vrs==0.8.0dev0 ga4gh.vrsatile.pydantic==0.0.11 -e . 
gffutils==0.11.1 h11==0.14.0 ; python_version >= '3.7' -hgvs==1.5.2 humanfriendly==10.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' identify==2.5.6 ; python_version >= '3.7' idna==3.4 ; python_version >= '3.5' @@ -86,7 +84,6 @@ numpy==1.23.4 ; python_version >= '3.8' packaging==21.3 ; python_version >= '3.6' pandocfilters==1.5.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' parse==1.19.0 -parsley==1.3 parso==0.8.3 ; python_version >= '3.6' pexpect==4.8.0 ; sys_platform != 'win32' pickleshare==0.7.5 @@ -95,8 +92,7 @@ pluggy==1.0.0 ; python_version >= '3.6' pre-commit==2.20.0 prometheus-client==0.15.0 ; python_version >= '3.6' prompt-toolkit==3.0.31 ; python_full_version >= '3.6.2' -psutil==5.9.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -psycopg2==2.9.4 ; python_version >= '3.6' +psutil==5.9.3 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' psycopg2-binary==2.9.4 ptyprocess==0.7.0 pure-eval==0.2.2 @@ -116,18 +112,18 @@ pyrsistent==0.18.1 ; python_version >= '3.7' pysam==0.19.1 pytest==7.1.3 pytest-cov==4.0.0 -python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2' python-jsonschema-objects==0.4.1 -pytz==2022.4 +pytz==2022.5 pyyaml==6.0 ; python_version >= '3.6' pyzmq==24.0.1 ; python_version >= '3.6' -requests==2.28.1 +requests==2.28.1 ; python_version >= '3.7' and python_version < '4' requests-html==0.10.0 ; python_full_version >= '3.6.0' s3transfer==0.6.0 ; python_version >= '3.7' send2trash==1.8.0 setuptools==65.5.0 ; python_version >= '3.7' -simplejson==3.17.6 ; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' -six==1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +simplejson==3.17.6 ; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2' +six==1.16.0 ; 
python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2' sniffio==1.3.0 ; python_version >= '3.7' snowballstemmer==2.2.0 soupsieve==2.3.2.post1 ; python_version >= '3.6' @@ -136,12 +132,12 @@ stack-data==0.5.1 starlette==0.20.4 ; python_version >= '3.7' tabulate==0.9.0 ; python_version >= '3.7' terminado==0.16.0 ; python_version >= '3.7' -tinycss2==1.2.0 ; python_version >= '3.7' -toml==0.10.2 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' +tinycss2==1.2.1 ; python_version >= '3.7' +toml==0.10.2 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2' tomli==2.0.1 ; python_version >= '3.7' tornado==6.2 ; python_version >= '3.7' tqdm==4.64.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -traitlets==5.4.0 ; python_version >= '3.7' +traitlets==5.5.0 ; python_version >= '3.7' typing-extensions==4.4.0 ; python_version >= '3.7' urllib3==1.26.12 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4' uvicorn==0.18.3 diff --git a/requirements.txt b/requirements.txt index bcf61a84..0be2c461 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,35 @@ -i https://pypi.org/simple -anyio==3.6.1 ; python_full_version >= '3.6.2' -boto3==1.24.92 -botocore==1.27.92 ; python_version >= '3.7' +anyio==3.6.2 ; python_full_version >= '3.6.2' +attrs==22.1.0 ; python_version >= '3.5' +bioutils==0.5.7 ; python_version >= '3.6' +boto3==1.24.93 +botocore==1.27.93 ; python_version >= '3.7' +canonicaljson==1.6.3 ; python_version >= '3.7' +certifi==2022.9.24 ; python_version >= '3.6' +charset-normalizer==2.1.1 ; python_full_version >= '3.6.0' click==8.1.3 +coloredlogs==15.0.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' fastapi==0.85.1 +ga4gh.vrs==0.8.0dev0 ga4gh.vrsatile.pydantic==0.0.11 h11==0.14.0 ; python_version >= '3.7' +humanfriendly==10.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' 
idna==3.4 ; python_version >= '3.5' +inflection==0.5.1 ; python_version >= '3.5' jmespath==1.0.1 ; python_version >= '3.7' +jsonschema==3.2.0 +markdown==3.4.1 ; python_version >= '3.7' +numpy==1.23.4 ; python_version >= '3.8' pydantic==1.10.2 -python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +pyrsistent==0.18.1 ; python_version >= '3.7' +python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2' +python-jsonschema-objects==0.4.1 +pyyaml==6.0 ; python_version >= '3.6' +requests==2.28.1 ; python_version >= '3.7' and python_version < '4' s3transfer==0.6.0 ; python_version >= '3.7' -six==1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +setuptools==65.5.0 ; python_version >= '3.7' +simplejson==3.17.6 ; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2' +six==1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2' sniffio==1.3.0 ; python_version >= '3.7' starlette==0.20.4 ; python_version >= '3.7' typing-extensions==4.4.0 ; python_version >= '3.7' diff --git a/setup.cfg b/setup.cfg index 9b99d78c..755ce114 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,7 @@ install_requires = click boto3 ga4gh.vrsatile.pydantic == 0.0.11 + ga4gh.vrs == 0.8.0dev0 tests_require = pytest @@ -40,7 +41,6 @@ tests_require = dev = gffutils biocommons.seqrepo - ga4gh.vrs[extras] == 0.8.0dev0 psycopg2-binary [tool:pytest] diff --git a/tests/unit/data/etl_data/ensembl_107.gff3 b/tests/unit/data/etl_data/ensembl_108.gff3 similarity index 100% rename from tests/unit/data/etl_data/ensembl_107.gff3 rename to tests/unit/data/etl_data/ensembl_108.gff3 diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index 8b7d88fe..4ed03f71 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -91,7 +91,7 @@ def test_ensembl_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, 
shutil.rmtree(e.src_data_dir) e._sequence_location.get_aliases = _get_aliases - e._data_src = etl_data_path / 'ensembl_107.gff3' + e._data_src = etl_data_path / 'ensembl_108.gff3' e._transform_data() e._add_meta() processed_ids += e._processed_ids diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 8a27eb03..aca97885 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -304,9 +304,9 @@ def test_meta_info(ensembl): assert resp.source_meta_.data_license == "custom" assert resp.source_meta_.data_license_url == \ "https://useast.ensembl.org/info/about/legal/disclaimer.html" - assert resp.source_meta_.version == "107" + assert resp.source_meta_.version == "108" assert resp.source_meta_.data_url == \ - "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.107.gff3.gz" # noqa: E501 + "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.108.gff3.gz" # noqa: E501 assert resp.source_meta_.rdp_url is None assert resp.source_meta_.genome_assemblies == ["GRCh38"] assert resp.source_meta_.data_license_attributes == {