Skip to content

Commit

Permalink
Merge pull request #453 from Ensembl/lcampbell/meta_getter
Browse files Browse the repository at this point in the history
Refactor genome_metadata dump module
  • Loading branch information
JAlvarezJarreta authored Nov 15, 2024
2 parents d8f752a + 07f756d commit 7fa1ab8
Show file tree
Hide file tree
Showing 5 changed files with 415 additions and 52 deletions.
11 changes: 10 additions & 1 deletion src/python/ensembl/io/genomio/data/schemas/genome.json
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,14 @@
{ "type" : "array", "items" : { "type" : "string" } }
]
}
},
"database_info" : {
"type": "object",
"additionalProperties": false,
"description" : "Optional name of target database where meta data was retrieved.",
"properties" : {
"name": { "type" : "string" }
}
}
},

Expand All @@ -123,7 +131,8 @@
"genebuild" : { "$ref" : "#/definitions/genebuild_info" },
"provider" : { "$ref" : "#/definitions/provider_info" },
"BRC4" : { "$ref" : "#/definitions/BRC4_info" },
"added_seq" : { "$ref" : "#/definitions/added_sequence_info" }
"added_seq" : { "$ref" : "#/definitions/added_sequence_info" },
"database" : { "$ref" : "#/definitions/database_info" }
},
"required" : [
"species",
Expand Down
2 changes: 1 addition & 1 deletion src/python/ensembl/io/genomio/database/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def main(arg_list: list[str] | None = None) -> None:
"""Main script entry-point.
Args:
arg_list: TODO
arg_list: Arguments to parse passing list to parse_args().
"""
args = parse_args(arg_list)
Expand Down
7 changes: 3 additions & 4 deletions src/python/ensembl/io/genomio/database/meta_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A simple helper script to connect to a core database and retrieve a single meta_value
or multiple meta_value and dump meta_key/value pairs to stdout / JSON."""
"""Connect to a core database and retrieve a meta_key:meta_value pair(s)
and dump meta_key/value pairs to stdout / JSON."""

__all__ = ["get_meta_values"]

Expand Down Expand Up @@ -91,7 +91,7 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
"""
parser = ArgumentParser(description=__doc__)
parser.add_server_arguments(include_database=True, help="core database")
parser.add_server_arguments(include_database=True, help="server url and core database")
parser.add_argument_src_path(
"--meta_keys_list", help="Input File | List with >=2 meta_keys to query target database."
)
Expand All @@ -104,7 +104,6 @@ def main(arg_list: list[str] | None = None) -> None:
Args:
arg_list: Arguments to parse passing list to parse_args().
"""
args = parse_args(arg_list)
init_logging_with_args(args)
Expand Down
157 changes: 130 additions & 27 deletions src/python/ensembl/io/genomio/genome_metadata/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,29 @@
"filter_genome_meta",
"check_assembly_version",
"check_genebuild_version",
"metadata_dump_setup",
]

import argparse
import json
from typing import Any, Dict, Type
from typing import Any, Type
import logging
from pydoc import locate

from sqlalchemy import select
from sqlalchemy.orm import Session
from sqlalchemy.engine import URL

from ensembl.core.models import Meta
from ensembl.io.genomio.utils.json_utils import get_json
from ensembl.io.genomio.database import DBConnectionLite
from ensembl.utils.argparse import ArgumentParser
from ensembl.utils import StrPath
from ensembl.utils.logging import init_logging_with_args


METADATA_FILTER: Dict[str, Dict[str, Type]] = {
DEFAULT_FILTER: dict[str, dict[str, Type]] = {
"database": {"name": str},
"added_seq": {"region_name": str},
"annotation": {"provider_name": str, "provider_url": str},
"assembly": {
Expand All @@ -60,14 +67,15 @@
}


def get_genome_metadata(session: Session) -> Dict[str, Any]:
def get_genome_metadata(session: Session, db_name: str | None) -> dict[str, Any]:
"""Returns the meta table content from the core database in a nested dictionary.
Args:
session: Session for the current core.
db_name: Target database name
"""
genome_metadata: Dict[str, Any] = {}
genome_metadata: dict[str, Any] = {}

meta_statement = select(Meta)
for row in session.execute(meta_statement).unique().all():
meta_key = row[0].meta_key
Expand All @@ -81,6 +89,10 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]:
genome_metadata[main_key][subkey] = [meta_value]
else:
genome_metadata[main_key] = {subkey: [meta_value]}

if db_name:
genome_metadata["database"] = {"name": f"{db_name}"}

# Parse genome metadata to simplify dictionary and check data consistency
for main_key, subkeys_dict in genome_metadata.items():
# Replace single-value lists by the value itself
Expand All @@ -96,45 +108,73 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]:
return genome_metadata


def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]:
def filter_genome_meta(
genome_metadata: dict[str, Any], metafilter: dict | None, meta_update: bool
) -> dict[str, Any]:
"""Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER.
Also converts to expected data types (to follow the genome JSON schema).
Args:
genome_metadata: Nested metadata key values from the core metadata table.
metafilter: Input JSON containing subset of meta table values to filter on.
meta_update: Deactivates additional meta updating.
"""
filtered_metadata: Dict[str, Any] = {}
for key, subfilter in METADATA_FILTER.items():
filtered_metadata: dict[str, Any] = {}

if metafilter:
metadata_filter: dict[str, dict[str, type]] = metafilter
else:
metadata_filter = DEFAULT_FILTER

for key, subfilter in metadata_filter.items():
if key in genome_metadata:
filtered_metadata[key] = {}
for subkey, value_type in subfilter.items():
if isinstance(value_type, str):
value_type = type(value_type)
if isinstance(value_type, int):
value_type = type(value_type)
if subkey in genome_metadata[key]:
value = genome_metadata[key][subkey]
if isinstance(value, list):
value = [value_type(x) for x in value]
else:
value = value_type(value)
filtered_metadata[key][subkey] = value
# Check assembly and genebuild versions
check_assembly_refseq(filtered_metadata)
check_assembly_version(filtered_metadata)
check_genebuild_version(filtered_metadata)

# Optional assembly and genebuild based filtering:
if meta_update:
# Check assembly and genebuild versions
check_assembly_refseq(filtered_metadata)
check_assembly_version(filtered_metadata)
check_genebuild_version(filtered_metadata)

return filtered_metadata


def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None:
    """Update the GCA accession to use GCF if it is from RefSeq.

    The metadata is modified in place. Nothing is changed if the assembly provider name or the
    assembly accession is not available (e.g. a user-defined filter left them out).

    Args:
        gmeta_out: Nested metadata key values from the core metadata table.
    """
    assembly = gmeta_out.get("assembly", {})
    provider_name = assembly.get("provider_name")
    if not provider_name:
        logging.debug(
            "Meta filter update to RefSeq accession not done: user meta filter missing: "
            "'assembly.provider_name'"
        )
        return

    if provider_name != "RefSeq":
        logging.info(f"Meta check 'assembly is RefSeq': Asm provider = {provider_name}")
        return

    # A user-defined filter may keep the provider name but drop the accession: skip the update
    # instead of failing with a KeyError
    accession = assembly.get("accession")
    if accession is None:
        logging.debug(
            "Meta filter update to RefSeq accession not done: user meta filter missing: "
            "'assembly.accession'"
        )
        return

    assembly["accession"] = accession.replace("GCA", "GCF")
    logging.info("GCA accession updated to RefSeq GCF accession.")


def check_assembly_version(genome_metadata: Dict[str, Any]) -> None:
def check_assembly_version(genome_metadata: dict[str, Any]) -> None:
"""Updates the assembly version of the genome metadata provided.
If `version` meta key is not an integer or it is not available, the assembly accession's version
Expand Down Expand Up @@ -164,7 +204,7 @@ def check_assembly_version(genome_metadata: Dict[str, Any]) -> None:
logging.info(f'Located version [v{assembly["version"]}] info from meta data.')


def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None:
def check_genebuild_version(genome_metadata: dict[str, Any]) -> None:
"""Updates the genebuild version (if not present) from the genebuild ID, removing the latter.
Args:
Expand All @@ -188,19 +228,82 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None:
genome_metadata["genebuild"].pop("id", None)


def main() -> None:
"""Main script entry-point."""
parser = ArgumentParser(
description="Fetch the genome metadata from a core database and print it in JSON format."
def convert_dict(meta_dict: dict) -> dict:
    """Converts a nested dictionary of type names into one holding the corresponding objects.

    Nested dictionaries are converted recursively. Every other (leaf) value must be a string
    naming a type (e.g. `"str"` or `"int"`), which is resolved with `pydoc.locate()`. The input
    dictionary is left untouched.

    Args:
        meta_dict: User meta dictionary with literal string typing to be converted.

    Returns:
        New dictionary with the same keys where each type name is replaced by the located object.

    Raises:
        ValueError: If a leaf value is not a string or it cannot be resolved by `pydoc.locate()`.
    """
    new_dict: dict = {}
    for key, value in meta_dict.items():
        if isinstance(value, dict):
            new_dict[key] = convert_dict(value)
            continue
        if not isinstance(value, str):
            raise ValueError(
                f"Meta filter key {key!r}: expected a type name string, got {type(value).__name__}"
            )
        # NOTE(review): locate() imports and returns whatever object the dotted path names, so the
        # filter file must come from a trusted source
        located = locate(value)
        if located is None:
            # Fail here with the offending key rather than later with "'NoneType' is not callable"
            raise ValueError(f"Meta filter key {key!r}: cannot resolve type name {value!r}")
        new_dict[key] = located
    return new_dict


def metadata_dump_setup(
    db_url: URL, input_filter: StrPath | None, meta_update: bool, append_db: bool
) -> dict[str, Any]:
    """Runs the main stages of the genome meta dump from the user input arguments provided.

    Connects to the core database, loads its metadata and filters it, using the user-provided
    filter when one is given.

    Args:
        db_url: Target core database URL.
        input_filter: Input JSON containing subset of meta table values to filter on.
        meta_update: Passed on to `filter_genome_meta()`; if true, the assembly and genebuild
            checks and updates are performed on the filtered metadata.
        append_db: Append target core database name to output JSON.

    Returns:
        Filtered genome metadata.
    """
    connection = DBConnectionLite(db_url)

    # The database name is only handed over when it has to be part of the output
    target_db_name = db_url.database if append_db else None

    # Type names in the user filter are plain JSON strings that need to be turned into types
    user_filter: dict = convert_dict(get_json(input_filter)) if input_filter else {}

    with connection.session_scope() as session:
        raw_metadata = get_genome_metadata(session, target_db_name)
        genome_meta = filter_genome_meta(raw_metadata, user_filter, meta_update)

    return genome_meta


def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
    """Builds the command line parser of this script and returns the parsed arguments.

    Args:
        arg_list: List of arguments to parse. If `None`, grab them from the command line.

    Returns:
        Namespace populated with the parsed arguments.
    """
    arg_parser = ArgumentParser(description=__doc__)
    arg_parser.add_server_arguments(include_database=True, help="server url and core database")
    arg_parser.add_argument_src_path(
        "--metafilter",
        default=None,
        help="JSON file of nested meta_key:meta_value to filter dump output.",
    )
    # Boolean switches, registered in the order they are listed
    switches = (
        ("--meta_update", "Perform assembly and genebuild 'version' metadata checks & update if needed."),
        ("--append_db", "Append core database name to output JSON."),
    )
    for switch, switch_help in switches:
        arg_parser.add_argument(switch, action="store_true", help=switch_help)
    arg_parser.add_log_arguments(add_log_file=True)

    namespace = arg_parser.parse_args(arg_list)
    return namespace


def main(arg_list: list[str] | None = None) -> None:
    """Main script entry-point.

    Parses the arguments, sets up logging, collects the filtered genome metadata and prints it to
    the standard output as JSON (2-space indentation, keys sorted).

    Args:
        arg_list: Arguments to parse passing list to parse_args().
    """
    args = parse_args(arg_list)
    init_logging_with_args(args)

    dump_options = {
        "db_url": args.url,
        "input_filter": args.metafilter,
        "meta_update": args.meta_update,
        "append_db": args.append_db,
    }
    genome_meta = metadata_dump_setup(**dump_options)

    json_output = json.dumps(genome_meta, indent=2, sort_keys=True)
    print(json_output)
Loading

0 comments on commit 7fa1ab8

Please sign in to comment.