Skip to content

Commit

Permalink
added config file, code cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
ens-ftricomi committed Oct 4, 2024
1 parent fcb900f commit b3ed3e0
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 53 deletions.
12 changes: 12 additions & 0 deletions conf/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"server_details" : {
"db_name" : "gb_transcriptomic_registry",
"db_host" : "mysql-ens-genebuild-prod-1",
"db_user" : "ensadmin",
"db_port" : 4527,
"db_pass" : ""
},
"tissue_types" : {
"prioritise" : ["heart", "lung", "brain", "ovary", "ovaries", "testes", "testis", "gonad", "gonads"]
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,39 +16,74 @@
"""Select the best subset of short-read transcriptomic data to align to the genome"""

import argparse
import pymysql
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import json
import random
from collections import Counter
import pymysql


def mysql_fetch_data(
    query: str, database: str, host: str, port: int, user: str, password: str
) -> Optional[List[Tuple]]:
    """
    Fetch data from MySQL database based on the provided query.

    Args:
        query: SQL query to be executed.
        database: Name of the database (surrounding whitespace is stripped).
        host: Database host.
        port: Port number for the connection.
        user: Username for the database connection.
        password: Password for the database connection.

    Returns:
        A list of tuples representing the rows fetched from the database,
        or None if a pymysql error occurs while connecting or querying.
    """
    # Bind the name up front so the ``finally`` clause is safe even when
    # ``pymysql.connect`` itself raises and no connection was ever created.
    conn = None
    try:
        conn = pymysql.connect(
            host=host, user=user, passwd=password, port=port, database=database.strip()
        )
        with conn.cursor() as cursor:
            cursor.execute(query)
            # Materialise as a list so the result matches the declared return type.
            return list(cursor.fetchall())
    except pymysql.Error as err:
        # Best-effort behaviour: report the error and signal failure with None
        # rather than propagating, so callers can test the result for truthiness.
        print(err)
        return None
    finally:
        if conn is not None:
            conn.close()

def select_data(taxon_id, reads_mapped_cutoff, prioritise_tissues, max_num_runs):
"""Select the best data to align to the genome."""

selected_runs = []
#select the runs from the database that have passed QC and the percent_mapped reads is greater than reads_mapped_cutoff
def select_data(
taxon_id: str,
reads_mapped_cutoff: float,
prioritise_tissues: bool,
max_num_runs: int,
config: Dict[str, Any],
) -> List[str]:
"""
Select the best data to align to the genome.
Args:
taxon_id: The taxon ID to filter by.
reads_mapped_cutoff: Minimum percent of reads mapped to pass.
prioritise_tissues: Whether to prioritise certain tissue types.
max_num_runs: Maximum number of runs to select.
config: Configuration details including database connection info and tissue prioritisation.
Returns:
A list of selected run accession IDs.
"""

selected_runs: List[str] = []
# select the runs from the database that have passed QC and the percent_mapped reads
# is greater than reads_mapped_cutoff
data_query = (
"SELECT run.run_accession, run.tissue, align.percent_mapped FROM run INNER JOIN align on run.run_id=align.run_id WHERE run.qc_status='qc_pass' AND align.percent_mapped>=" + str(reads_mapped_cutoff) + " and run.taxon_id=" + taxon_id + ";"
f"SELECT run.run_accession, run.tissue, align.uniquely_mapped_reads_percentage FROM "
f"run INNER JOIN align ON run.run_id=align.run_id WHERE run.qc_status='ALIGNED' AND "
f"align.uniquely_mapped_reads_percentage>={reads_mapped_cutoff} AND run.taxon_id={taxon_id};"
)
data_fetch = mysql_fetch_data(
data_query,
Expand All @@ -58,16 +93,18 @@ def select_data(taxon_id, reads_mapped_cutoff, prioritise_tissues, max_num_runs)
config["server_details"]["db_user"],
config["server_details"]["db_pass"],
)
run_dict = {}
for tuple in data_fetch:
run_dict[tuple[0]] = {"tissue":tuple[1],
"percent_mapped":tuple[2]}
if not data_fetch:
print("No data fetched or an error occurred.")
return []

# Prepare a dictionary to store the fetched run data
run_dict: Dict[str, Dict[str, Any]] = {
row[0]: {"tissue": row[1], "percent_mapped": row[2]} for row in data_fetch
}
if prioritise_tissues:
prioritised_tissues = config["tissue_types"]["prioritise"]

tissue_counter = Counter()
prioritised_runs = {}

prioritised_tissues: List[str] = config["tissue_types"]["prioritise"]
tissue_counter: Counter = Counter()
prioritised_runs: Dict[str, Dict[str, Any]] = {}
for key, value in run_dict.items():
if value["tissue"] in prioritised_tissues:
least_common_tissue, _ = tissue_counter.most_common()[-1] if tissue_counter else (None, None)
Expand All @@ -76,37 +113,32 @@ def select_data(taxon_id, reads_mapped_cutoff, prioritise_tissues, max_num_runs)
tissue_counter[value["tissue"]] += 1
if len(prioritised_runs) == max_num_runs:
break
#fill up remaining slots
# Fill up remaining slots
if len(prioritised_runs) < max_num_runs:
remaining_runs = {k: v for k, v in run_dict.items() if k not in prioritised_runs}
sample_size = min(max_num_runs - len(prioritised_runs), len(remaining_runs))
additional_runs = random.sample(remaining_runs.items(), sample_size)
prioritised_runs.update(additional_runs)

selected_runs = list(prioritised_runs)[:max_num_runs]

remaining_runs: Dict[str, Dict[str, Any]] = {
k: v for k, v in run_dict.items() if k not in prioritised_runs
}
sample_size: int = min(max_num_runs - len(prioritised_runs), len(remaining_runs))
additional_runs = random.sample(list(remaining_runs.items()), sample_size)
prioritised_runs.update(dict(additional_runs))

selected_runs = list(prioritised_runs.keys())[:max_num_runs]
else:
sorted_runs = sorted(run_dict,key=lambda x:run_dict[x]['percent_mapped'])
sorted_runs: List[str] = sorted(run_dict, key=lambda x: run_dict[x]["percent_mapped"], reverse=True)
selected_runs = sorted_runs[:max_num_runs]

return selected_runs
return selected_runs


def parse_args():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description="Parameters")
parser.add_argument(
"-t",
"--taxon_id",
required=True,
help="Taxon ID"
)
parser.add_argument("-t", "--taxon_id", required=True, help="Taxon ID")

parser.add_argument(
"--reads_mapped_cutoff",
type=int, default=50, help="The minimum allowed for percent_mapped reads."
"--reads_mapped_cutoff", type=int, default=50, help="The minimum allowed for percent_mapped reads."
)

parser.add_argument(
"-p",
"--prioritise_tissues",
Expand All @@ -117,16 +149,32 @@ def parse_args():

parser.add_argument(
"--max_num_runs",
type=int, default=100, help="The maximum number of runs to be included in the output."
type=int,
default=100,
help="The maximum number of runs to be included in the output.",
)

return parser.parse_args()


def main() -> None:
    """
    Entrypoint: parse the command line, load the JSON config and print the selected runs.

    The config file is looked up at
    ``<ancestor>/ensembl-genes-metadata/conf/config.json`` where ``<ancestor>``
    is the fourth parent of the directory containing this script.
    """
    args = parse_args()

    # Get the directory where the current script is located
    script_dir = Path(__file__).parent.resolve()

    # NOTE(review): ``parents[3]`` assumes the script directory sits four levels
    # below the directory that contains the 'ensembl-genes-metadata' checkout;
    # a shallower location raises IndexError - confirm against the repo layout.
    config_path = script_dir.parents[3] / "ensembl-genes-metadata" / "conf" / "config.json"

    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    # select_data needs the loaded config for both the database connection
    # details and the list of prioritised tissues.
    runs_to_use = select_data(
        args.taxon_id, args.reads_mapped_cutoff, args.prioritise_tissues, args.max_num_runs, config
    )
    print(runs_to_use)


if __name__ == "__main__":
    main()

0 comments on commit b3ed3e0

Please sign in to comment.