col_comparison_subset.py
"""Re-runs the comparisons that were below the bonferroni alpha on a sub set of the individuals in the data set"""
from sys import argv
from os import listdir, mkdir
from os.path import join, isdir
from pickle import load, dump
from time import time

from pandas import read_csv, DataFrame
from tqdm import tqdm

from utils.utils import compare, get_col_types, SUBSET_PATH, get_comp_key, SUBSET_COMP_DICTS_PATH
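
# Example invocation (hypothetical index, subset name, and directory; not taken from the repo):
#     python col_comparison_subset.py 0 some-subset filtered-comparisons/
# argv[1] is the index of the pickled comparison dict to process, argv[2] is the subset name
# substituted into SUBSET_PATH and SUBSET_COMP_DICTS_PATH, and argv[3] is the directory
# containing the filtered comparison pickles (one '.p' file per index).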


def main():
    """Main method"""
    idx: int = int(argv[1])
    subset: str = argv[2]
    comp_dir: str = argv[3]
    comp_dicts_path: str = SUBSET_COMP_DICTS_PATH.format(subset)
    if not isdir(comp_dicts_path):
        mkdir(comp_dicts_path)
    col_types: dict = get_col_types()
    # Keep only the pickled comparison dictionaries in the comparison directory, sorted by file name
    new_comps: list = sorted(f for f in listdir(comp_dir) if f.endswith('.p'))
    new_comps: str = new_comps[idx]
    assert new_comps.endswith('.p')
    new_comps: str = join(comp_dir, new_comps)
    print('Loading Filtered Comparisons at:', new_comps)
    new_comps: dict = load(open(new_comps, 'rb'))
    original_len: int = len(new_comps)
    print('Number Of Filtered Comparisons (Original Length):', original_len)
    t1: float = time()
    dataset_cols: dict = get_dataset_cols(subset=subset, filtered_comps=new_comps)
    print('Time Extracting The Data Set Columns: {:.2f} Minutes'.format((time() - t1) / 60))
    print('Number Of Features To Re-Analyze:', len(dataset_cols))
    n_skipped: int = 0
    t1: float = time()
    # Iterate over a copy of the items so entries can be deleted from the dict while looping
    for (feat1, feat2), p in tqdm(list(new_comps.items())):
        key: tuple = get_comp_key(feat1=feat1, feat2=feat2)
        if dataset_cols[feat1] is None or dataset_cols[feat2] is None:
            # We can't compare features that have only one unique value as a result of the subsetting
            n_skipped += 1
            del new_comps[key]
        else:
            p: float = compare(header1=feat1, header2=feat2, dataset_cols=dataset_cols, col_types=col_types)
            if p == float('inf'):
                n_skipped += 1
                del new_comps[key]
            else:
                new_comps[key] = p
    new_len: int = len(new_comps)
    assert new_len + n_skipped == original_len
    print('Time Re-Analyzing On The Subset: {:.2f} Minutes'.format((time() - t1) / 60))
    print('Number Of Comparisons Skipped Due To One Unique Value In Subset:', n_skipped)
    print('Number Of Comparisons Left (New Length):', new_len)
    new_comps_path: str = '{}.p'.format(idx)
    new_comps_path: str = join(comp_dicts_path, new_comps_path)
    dump(new_comps, open(new_comps_path, 'wb'))


def get_dataset_cols(subset: str, filtered_comps: dict) -> dict:
    """Gets the columns from the subset to be used in the new comparisons"""
    # Collect every feature that still appears in at least one filtered comparison
    filtered_headers: set = set()
    for feat1, feat2 in filtered_comps.keys():
        filtered_headers.add(feat1)
        filtered_headers.add(feat2)
    subset_path: str = SUBSET_PATH.format(subset)
    subset: DataFrame = read_csv(subset_path, usecols=filtered_headers)
    dataset_cols: dict = {}
    for header in filtered_headers:
        col: list = list(subset[header])
        if len(set(col)) == 1:
            # We can't compare features that have only one unique value as a result of the subsetting
            dataset_cols[header] = None
        else:
            dataset_cols[header] = col
    return dataset_cols


if __name__ == '__main__':
    main()