forked from huhrichard/DEEP_extraction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprofile_table.py
144 lines (113 loc) · 5.51 KB
/
profile_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import matplotlib
import matplotlib.pyplot as plt
import os, fnmatch
import numpy as np
import pandas as pd
from statsmodels.stats.multitest import fdrcorrection
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import Workbook
import argparse
# Font-size presets used when configuring matplotlib.
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 18

# The two threshold operators looked for when splitting a profile term.
sign_pair = ['<', '>=']

# Maps an inequality sign to the two-argument comparison it denotes.
inequality_operators = {
    '<': lambda x, y: x < y,
    '<=': lambda x, y: x <= y,
    '>': lambda x, y: x > y,
    '>=': lambda x, y: x >= y,
}
# Make the largest preset the global default font size for every figure.
plt.rcParams['font.size'] = BIGGER_SIZE
def find(pattern, path):
    """Collect every file below ``path`` whose base name matches ``pattern``.

    ``pattern`` is a shell-style wildcard (as understood by ``fnmatch``) that
    is tested against file names only, never against directory names.  The
    returned list holds each match joined onto the directory it was found in,
    in the order ``os.walk`` visits them.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(path)
        for filename in fnmatch.filter(filenames, pattern)
    ]
# p_val_cont = fdr_df.loc[fdr_df.apply(lambda x: '(Cont.)' in x['outcome'], axis=1), 'p_val'].values
# p_val_bin = fdr_df.loc[fdr_df.apply(lambda x: '(Binary)' in x['outcome'], axis=1), 'p_val'].values
#
# _, fdr_cont = fdrcorrection(p_val_cont)
# _, fdr_bin = fdrcorrection(p_val_bin)
#
# fdr_df.loc[fdr_df.apply(lambda x: '(Cont.)' in x['outcome'], axis=1), 'fdr'] = fdr_cont
# fdr_df.loc[fdr_df.apply(lambda x: '(Binary)' in x['outcome'], axis=1), 'fdr'] = fdr_bin
# Numeric code assigned to each relation label.
# Labels currently switched off: 'mixed_sign_profile' (0.0) and 'na' (0).
# An alternative labelling that was tried with the same codes:
#   'Positive Coef.' -> 2.0, 'Negative Coef.' -> 1.0
relation_dict = {
    'pos_correlate': 2.0,
    'neg_correlate': 1.0,
}

# Relation labels in dictionary order, with the 'NA' placeholder last.
relation_list = list(relation_dict) + ['NA']

# Reverse lookup: numeric code -> relation label.
relation_inv_dict = dict(zip(relation_dict.values(), relation_dict.keys()))
def _pollutant_name(term):
    """Return the display name for one term of a profile string.

    Everything from the first threshold operator (see ``sign_pair``) onwards
    is discarded, the remainder is title-cased, and anything from the first
    '(' onwards is dropped.
    """
    name = term.split(sign_pair[1])[0].split(sign_pair[0])[0]
    return name.title().split('(')[0]


def _format_fdr(fdr):
    """Format an FDR value to two decimals, or '<0.01' when below 0.01."""
    return '{:.2f}'.format(fdr) if fdr >= 0.01 else '<0.01'


def _build_category_table(cat_df):
    """Turn the rows of one profile category into the table written for R.

    ``cat_df`` must provide the columns 'mean', 'coef_95CI_lower', 'freq',
    'fdr', 'outcome' and 'profile'.  It is not modified; a new frame is
    returned with the columns, in order: Modelnum, mean, se, freq, outcome,
    profile, pol1, fdr, pol2..polN, out_first_only.
    """
    # Work on a private copy so the caller's (filtered) frame is never
    # written through and pandas raises no chained-assignment warnings.
    table = cat_df.copy()

    # Standard error is taken from the distance to the lower bound *before*
    # the coefficient is replaced by its magnitude.
    # NOTE(review): dividing by 1.96 presumes 'coef_95CI_lower' is the lower
    # bound of a normal-approximation 95% interval - confirm upstream.
    table['se'] = np.abs(table['mean'] - table['coef_95CI_lower']) / 1.96
    table['mean'] = np.abs(table['mean'])

    table = table.sort_values(by=['outcome', 'mean'], ascending=[True, False])
    table = table.reset_index(drop=True)
    table['Modelnum'] = table.index
    table = table[['Modelnum', 'mean', 'se', 'freq', 'fdr', 'outcome', 'profile']]

    # A profile is one or more terms separated by a double tab.
    terms_per_row = [profile.split('\t\t') for profile in table['profile']]
    # 'pol1' is always emitted, even when there are no rows at all.
    widest = max((len(terms) for terms in terms_per_row), default=1)

    fdr_labels = pd.Series([_format_fdr(value) for value in table['fdr']],
                           index=table.index, dtype=object)
    table = table.drop(columns=['fdr'])

    for pos in range(widest):
        names = [_pollutant_name(terms[pos]) if pos < len(terms) else np.nan
                 for terms in terms_per_row]
        # Object dtype from the start: these columns hold strings, with NaN
        # only marking "this profile has fewer terms".
        table['pol{}'.format(pos + 1)] = pd.Series(names, index=table.index,
                                                   dtype=object)
        if pos == 0:
            # The textual FDR column sits directly after 'pol1'.
            table['fdr'] = fdr_labels

    # Show the outcome label only on the first row of each outcome group.
    first_of_outcome = ~table['outcome'].duplicated()
    table['out_first_only'] = table['outcome'].where(first_of_outcome)
    return table


def summarize_plot(result_dir='', pollutant_suffix='', method_suffix=''):
    """Write per-category summary tables and run the R table script.

    Reads ``merged_fdr.csv`` from ``result_dir``, keeps the rows with
    ``fdr < 0.05`` and writes one ``r_summary_<category>.csv`` back into
    ``result_dir`` for each of these categories:

    * ``all_greater_combination`` - profiles containing more than one '>'
      and no '<'
    * ``individual_pollutant`` - profiles without a double-tab separator

    Afterwards ``Rscript/profile_table.R`` is run through ``R CMD BATCH``
    with ``result_dir`` passed as an argument.  A missing R executable or a
    non-zero exit status is reported on stdout and does not raise.

    Args:
        result_dir: Directory holding ``merged_fdr.csv``; the summary CSVs
            are written to the same directory.
        pollutant_suffix: Unused; kept for backward compatibility.
        method_suffix: Unused; kept for backward compatibility.

    Raises:
        FileNotFoundError: If ``merged_fdr.csv`` does not exist.
        KeyError: If a required column ('fdr', 'profile', 'coef',
            'coef_95CI_lower', 'freq', 'outcome') is missing.
    """
    # Imported locally so this function does not depend on a change to the
    # module's import block.
    import subprocess

    fdr_path = os.path.join(result_dir, 'merged_fdr.csv')
    fdr_df = pd.read_csv(fdr_path, sep=',')
    # Copy so the flag columns below are added to an independent frame.
    fdr_df = fdr_df.loc[fdr_df['fdr'] < 0.05].copy()

    profile_str = fdr_df['profile'].str
    n_greater = profile_str.count('>')
    n_less = profile_str.count('<')
    fdr_df['all_greater'] = (n_greater > 1) & (n_less == 0)
    fdr_df['all_less'] = (n_less > 1) & (n_greater == 0)
    fdr_df['multi_pollutants'] = profile_str.count('\t\t') > 0
    fdr_df = fdr_df.rename(columns={'coef': 'mean'})

    category_outcome = {
        'all_greater_combination': fdr_df['all_greater'],
        'individual_pollutant': ~fdr_df['multi_pollutants'],
    }
    for profile_cat, profile_bool in category_outcome.items():
        table = _build_category_table(fdr_df[profile_bool])
        # os.path.join keeps the output next to the input even when
        # result_dir is '' (plain string formatting would yield '/r_summary_...').
        out_path = os.path.join(result_dir,
                                'r_summary_{}.csv'.format(profile_cat))
        table.to_csv(out_path)

    # NOTE(review): run once, after all category CSVs have been written -
    # confirm this matches what Rscript/profile_table.R expects.
    # An argument list (no shell) keeps quotes or spaces in result_dir from
    # breaking or altering the command.
    r_cmd = ['R', 'CMD', 'BATCH', '--no-save', '--no-restore',
             '--args result_dir="{}"'.format(result_dir),
             'Rscript/profile_table.R']
    try:
        completed = subprocess.run(r_cmd, check=False)
    except OSError as err:
        print('Could not launch R ({}); Rscript/profile_table.R was not run'.format(err))
    else:
        if completed.returncode != 0:
            print('R CMD BATCH exited with status {}'.format(completed.returncode))
if __name__ == "__main__":
    # Command-line entry point: the result directory is the only option and
    # it is mandatory.
    cli = argparse.ArgumentParser(description='Parameters of DEEP extraction')
    cli.add_argument(
        '--result_dir', '-rd',
        type=str,
        required=True,
        help='Directory of DEEP result files',
    )
    cli_args = cli.parse_args()
    summarize_plot(result_dir=cli_args.result_dir)