forked from popsim-consortium/stdpopsim
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_ensembl_data.py
89 lines (76 loc) · 3.23 KB
/
update_ensembl_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Script used to update genome data from Ensembl.
# TODO: this should be moved into the maintenance package,
# ideally where we turn it into a CLI that we run with
# python -m maintenance update-ensembl-data. That way we
# can keep all the code for various scripts in one place,
# and we don't clutter up the top-level directory with
# scripts like this one.
import pathlib
import subprocess
import contextlib
import stdpopsim
import maintenance
import argparse
import os
class DataWriter:
    """
    Writes data obtained from upstream sources into the stdpopsim
    package hierarchy.

    All output paths are relative (``stdpopsim/catalog/...``), so they are
    resolved against the current working directory.
    """

    def __init__(self):
        self.ensembl_client = maintenance.EnsemblRestClient()

    @contextlib.contextmanager
    def write(self, path):
        """
        Context manager that opens ``path`` for writing, emits the
        "autogenerated" header line and yields the open file object.

        Once the ``with`` body finishes normally and the file is closed,
        ``black -q`` is run on ``path``. If the body raises, the exception
        propagates and the formatter is not run.

        :param path: Location of the Python file to (over)write.
        :raises subprocess.CalledProcessError: If ``black`` exits non-zero.
        """
        # Explicit UTF-8 so the generated Python source does not depend on
        # the platform's locale encoding.
        with open(path, "w", encoding="utf-8") as f:
            print("# File autogenerated from Ensembl REST API. Do not edit.", file=f)
            yield f
        # TODO we can remove this once we've moved to using pre-commit and
        # Black, as these will be autoformatted.
        # use pip install black if this command fails
        subprocess.check_call(["black", "-q", path])

    def write_genome_data(self, ensembl_id):
        """
        Fetch the genome data for ``ensembl_id`` from the Ensembl client and
        write it to ``stdpopsim/catalog/<id>/genome_data.py``.

        The six character ``<id>`` is built from the first three letters of
        each of the first two underscore-separated parts of ``ensembl_id``,
        each capitalized (e.g. ``homo_sapiens`` -> ``HomSap``).

        :param str ensembl_id: Ensembl species id, e.g. ``homo_sapiens``.
        :raises ValueError: If a six character id cannot be derived, or if
            the corresponding catalog directory does not exist.
        """
        name_parts = ensembl_id.split("_")[:2]
        species_id = "".join(part[0:3].capitalize() for part in name_parts)
        if len(species_id) != 6:
            raise ValueError(f"Cannot extract six character id from {ensembl_id}")
        catalog_dir = f"stdpopsim/catalog/{species_id}"
        if not os.path.exists(catalog_dir):
            raise ValueError(
                f"Directory {species_id} corresponding to {ensembl_id} does "
                "not exist"
            )
        print("Writing genome data for", species_id, ensembl_id)
        path = f"{catalog_dir}/genome_data.py"
        data = self.ensembl_client.get_genome_data(ensembl_id)
        with self.write(path) as f:
            print("data = ", data, file=f)

    def write_ensembl_release(self):
        """
        Query the Ensembl client for the current release and record it in
        ``stdpopsim/catalog/ensembl_info.py``.
        """
        release = self.ensembl_client.get_release()
        print(f"Using Ensembl release {release}")
        path = os.path.join("stdpopsim", "catalog", "ensembl_info.py")
        with self.write(path) as f:
            print("release = ", release, file=f)
def main():
    """
    Command-line entry point.

    With ``--list-species``, print the Ensembl id of every species known to
    stdpopsim and stop. Otherwise write the Ensembl release file and then the
    genome data for each requested species; when no species are given on the
    command line, all species known to stdpopsim are updated.
    """
    cli = argparse.ArgumentParser(
        description="Query ensembl API for chromosome lengths"
    )
    cli.add_argument(
        "species",
        type=str,
        nargs="*",
        help=(
            "Ensembl ids for species to update with underscores in place of\n"
            "spaces (e.g. homo_sapiens)"
        ),
    )
    cli.add_argument(
        "--list-species",
        dest="list_species",
        action="store_true",
        help="list all species defined in stdpopsim",
    )
    options = cli.parse_args()

    # Listing mode: show the known ids and do nothing else.
    if options.list_species:
        for entry in stdpopsim.all_species():
            print(entry.ensembl_id)
        return

    # Explicitly named species are lower-cased; with none named, fall back
    # to every species in the catalog.
    if options.species:
        targets = [name.lower() for name in options.species]
    else:
        targets = [entry.ensembl_id for entry in stdpopsim.all_species()]

    # Record the release first, then write each species' genome data.
    updater = DataWriter()
    updater.write_ensembl_release()
    for target in targets:
        updater.write_genome_data(target)
# Run the updater only when executed as a script, not when imported.
if __name__ == "__main__":
    main()