-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathprofiling_pipeline.py
102 lines (89 loc) · 3.15 KB
/
profiling_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Perform the profiling pipeline (defined in profile.py) given all plates of a batch.
Each plate is processed by launching profile_cells.py in a subprocess.
"""
import os
import pathlib
import subprocess
import pandas as pd
from profile_utils import get_pipeline_args, find_incomplete_plates
# Parse the command line options that configure this pipeline run
args = get_pipeline_args()

# Batch and plate selection
batch = args.batch  # default: 2016_04_01_a549_48hr_batch1
plate_prefix = args.plate_prefix  # default: "SQ"
overwrite = args.overwrite  # default: False

# Metadata column names and cell line handling
well_col = args.well_col  # default: "Image_Metadata_Well"
plate_col = args.plate_col  # default: "Image_Metadata_Plate"
extract_cell_line = args.extract_cell_line  # default: False
# Fixed names and locations used by the rest of the script
project = "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad"

# Backend workspace directory of the project
profile_dir = (
    pathlib.PurePath("/home/ubuntu/bucket/projects/") / project / "workspace/backend"
)

# Platemap metadata for the selected batch
barcode_platemap_dir = pathlib.PurePath(f"../metadata/platemaps/{batch}")

# Outputs are written below a directory named after the batch
output_base_dir = pathlib.PurePath(batch)

# File name used when looking for plates that were already processed
completed_file_match = "normalized_feature_select.csv.gz"
# Read the barcode platemap table for this batch
barcode_platemap_file = barcode_platemap_dir / "barcode_platemap.csv"
barcode_platemap_df = pd.read_csv(barcode_platemap_file)

# Directory holding the individual platemap files
platemap_dir = barcode_platemap_dir / "platemap"

# Keep the entries of the batch directory whose names start with the prefix
plate_dir = profile_dir / batch
plates = [name for name in os.listdir(plate_dir) if name.startswith(plate_prefix)]

if not overwrite:
    # Without --overwrite, drop the plates that were already fully processed
    plates = find_incomplete_plates(
        plates=plates,
        output_dir=output_base_dir,
        file_match=completed_file_match,
    )
# Load and check MOA information
moa_file = pathlib.PurePath(
    "../metadata/moa/repurposing_info_external_moa_map_resolved.tsv"
)

# Check for the file before reading it. pd.read_csv either returns a DataFrame
# or raises, so testing the type of its result afterwards could never report a
# missing file; assert statements are also removed when Python runs with -O.
if not pathlib.Path(moa_file).exists():
    raise FileNotFoundError(
        "Error, MOA file does not exist. Is the path updated? "
        f"Looked for: {moa_file}"
    )

moa_df = pd.read_csv(moa_file, sep="\t")
# Process every plate by launching profile_cells.py once per plate
for plate in plates:
    print(f"Now processing... Plate: {plate}")

    # Make sure the per-plate output and cell count directories exist
    output_dir = pathlib.Path(output_base_dir, plate)
    output_dir.mkdir(parents=True, exist_ok=True)

    cell_count_dir = pathlib.Path("cell_count", batch, plate)
    cell_count_dir.mkdir(parents=True, exist_ok=True)

    # Look up the platemap name recorded for this plate barcode
    platemap_names = barcode_platemap_df.query(
        "Assay_Plate_Barcode == @plate"
    ).Plate_Map_Name.values
    if platemap_names.size == 0:
        # Same exception type as the bare [0] lookup, but with context
        raise IndexError(
            f"Plate {plate} has no Assay_Plate_Barcode entry in "
            f"{barcode_platemap_file}"
        )
    platemap_id = platemap_names[0]

    # The cell line is the second "_"-separated token of the platemap name
    # when extraction is requested; otherwise it is fixed to A549
    cell_id = platemap_id.split("_")[1] if extract_cell_line else "A549"

    platemap_file = pathlib.PurePath(platemap_dir, f"{platemap_id}.txt")
    sql_base = pathlib.PurePath(profile_dir, batch, plate, f"{plate}.sqlite")
    # NOTE(review): presumably a SQLite connection URL understood by
    # profile_cells.py - confirm the expected number of slashes there
    sql_file = f"sqlite:////{sql_base}"

    cmd = [
        "python",
        "profile_cells.py",
        "--sql_file",
        sql_file,
        "--batch",
        batch,
        "--plate_name",
        plate,
        "--platemap_file",
        platemap_file,
        "--barcode_platemap_file",
        barcode_platemap_file,
        "--moa_file",
        moa_file,
        "--output_dir",
        output_dir,
        "--cell_id",
        cell_id,
        "--cell_count_dir",
        cell_count_dir,
        "--well_col",
        well_col,
        "--plate_col",
        plate_col,
    ]
    # Pass plain strings so the argument list does not depend on the
    # subprocess module's handling of path objects
    cmd = [str(arg) for arg in cmd]

    # Report a failing plate instead of silently ignoring the exit status;
    # keep going so the remaining plates are still processed
    exit_code = subprocess.call(cmd)
    if exit_code != 0:
        print(
            f"Warning: profile_cells.py exited with status {exit_code} "
            f"for plate {plate}"
        )