-
Notifications
You must be signed in to change notification settings - Fork 3
/
config.py
161 lines (132 loc) · 5.42 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import pathlib
# absolute path to the parent of the directory that contains this file
repo_root = pathlib.Path(__file__).resolve().parent.parent

# important subdirectories of the repository
data_dir = repo_root.joinpath('data')
results_dir = repo_root.joinpath('results')

# where the saved expression data lives
pancan_data = data_dir.joinpath('pancancer_data.pkl')
rnaseq_data = data_dir.joinpath('tcga_expression_matrix_processed.tsv.gz')
sample_counts = data_dir.joinpath('tcga_sample_counts.tsv')
sample_info = data_dir.joinpath('tcga_sample_identifiers.tsv')

# where the test data lives
test_data_dir = repo_root.joinpath('tests', 'data')
test_expression = test_data_dir.joinpath('expression_subsampled.tsv.gz')
test_pancan_data = test_data_dir.joinpath('pancancer_data_subsampled.pkl')

# unlike the paths above, these two are plain strings containing '{}'
# placeholders (presumably filled in with str.format by the tests)
test_stratified_results = str(
    test_data_dir.joinpath('stratified_results_{}.tsv')
)
test_cancer_type_results = str(
    test_data_dir.joinpath('cancer_type_results_{}_{}.tsv')
)
# parameter for classification using raw gene expression
num_features_raw = 8000

# hyperparameters for the classification experiments
shuffle_by_cancer_type = True
hyper_filter = 2
filter_prop = 0.05
filter_count = 15
folds = 3
max_iter = 200

# candidate values for the model hyperparameters
# (the last two entries of alphas and the last four of ridge_c_values
# are ints, the rest are floats)
alphas = [0.0001, 1e-3, 1e-2, 1e-1, 1, 10]
l1_ratios = [
    0.0, 0.05, 0.1, 0.3,
    0.5, 0.7, 0.9, 1.0,
]
ridge_c_values = [
    0.000001, 0.00001, 0.0001,
    1e-3, 1e-2, 1e-1,
    1, 10, 100, 1000,
]

# default seed for the random number generator
default_seed = 42
# genes whose mutation info is used in tests
test_genes = [
    'TP53',
    'KRAS',
    'ARID1A',
]

# (gene, classification) combos for stratified CV model tests
# NOTE: ('TP53', 'TSG') is deliberately left out of this list. As of
# 2/24/2023 its unit test failed on GitHub Actions but not locally; the
# cause was never tracked down, so the entry is disabled for now.
stratified_gene_info = [
    ('KRAS', 'Oncogene'),
    ('ARID1A', 'TSG'),
]

# (gene, classification, cancer type) combos for model tests
cancer_type_gene_info = [
    ('TP53', 'TSG', 'BRCA'),
    ('TP53', 'TSG', 'LGG'),
    ('KRAS', 'Oncogene', 'COAD'),
    ('KRAS', 'Oncogene', 'READ'),
    ('ARID1A', 'TSG', 'UCEC'),
]

# genes for the cross-cancer POC test
cross_cancer_genes = [
    # RAS pathway genes
    'KRAS', 'HRAS', 'NRAS', 'BRAF', 'NF1',
    # TSGs/DDR genes
    'TP53', 'CDKN2A', 'ATM', 'PTEN', 'RB1',
    # control gene
    'TTN',
]

# cancer types for the cross-cancer POC test
cross_cancer_types = ['THCA', 'COAD', 'GBM', 'LGG', 'SKCM']
# parameters for the "add cancer" experiments:
# each entry is a number of cancer types to add to the target cancer
num_train_cancer_types = [
    0,   # just use the target cancer
    1,
    2,
    4,
    -1,  # use all cancers (pan-cancer model)
]

# similarity matrix used by the 'similarity' addition option
similarity_matrix_file = data_dir.joinpath('expression_confusion_matrix.tsv')
# repository URL and pinned commit used to retrieve precomputed
# cancer gene information
genes_base_url = (
    'https://raw.githubusercontent.com/cognoma/genes/'
)
genes_commit = (
    'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee'
)

# names of the methods compared in the feature selection experiments
fs_methods = [
    'mad',
    'pancan_f_test',
    'median_f_test',
    'mad_f_test',
    'random',
]
# where feature selection results for cancer type classification are kept
cancer_type_fs_plots_dir = repo_root.joinpath(
    '02_cancer_type_classification', 'fs_plots'
)
cancer_type_lasso_range_dir = repo_root.joinpath(
    '02_cancer_type_classification', 'lasso_range_plots'
)

# tumor purity data file, and the plot directory for purity prediction
tumor_purity_data = data_dir.joinpath(
    'TCGA_mastercalls.abs_tables_JSedit.fixed.txt'
)
purity_fs_plots_dir = repo_root.joinpath(
    '07_purity_prediction', 'fs_plots'
)
# CCLE data files; everything lives under the 'ccle' subdirectory of data_dir
ccle_sample_info = data_dir.joinpath('ccle', 'ccle_sample_info.csv')
ccle_expression = data_dir.joinpath('ccle', 'ccle_expression.csv')

# CCLE mutation data
ccle_mutation = data_dir.joinpath('ccle', 'ccle_mutations_maf.csv')
ccle_mutation_binary = data_dir.joinpath('ccle', 'ccle_mutations_binary.csv')
ccle_mutation_burden = data_dir.joinpath('ccle', 'ccle_mutation_burden.csv')

# CCLE copy number data
ccle_cnv_ratios = data_dir.joinpath('ccle', 'ccle_gene_cnv.csv')
ccle_cnv_loss = data_dir.joinpath('ccle', 'ccle_cnv_loss_binary.tsv')
ccle_cnv_gain = data_dir.joinpath('ccle', 'ccle_cnv_gain_binary.tsv')

# cell line drug response data; the first path has no file extension
# (presumably a directory), the others are .tsv files
cell_line_drug_response = data_dir.joinpath('ccle', 'drug_response')
cell_line_drug_response_matrix = data_dir.joinpath(
    'ccle', 'ccle_drug_response_ic50.tsv'
)
cell_line_drug_response_matrix_binary = data_dir.joinpath(
    'ccle', 'ccle_drug_response_binary.tsv'
)
cell_line_drug_response_egfri_binary = data_dir.joinpath(
    'ccle', 'ccle_drug_response_egfri_binary.tsv'
)
# parameters for the CCLE experiments
ccle_filter_count = 5
ccle_filter_prop = 0.1

# liquid cancer types in CCLE; these are sometimes used to
# stratify train/test sets
ccle_liquid_cancer_types = ['Leukemia', 'Lymphoma', 'Myeloma']
# "merged" gene set taken from the mpmp repo
merged_cancer_genes = data_dir.joinpath('merged_with_annotations.tsv')

# gene aliases for the Vogelstein dataset, mapping one gene symbol
# to another
gene_aliases = dict(
    MLL2='KMT2D',
    MLL3='KMT2C',
    FAM123B='AMER1',
)

# plot directory for the cell line prediction feature selection results
ccle_fs_plots_dir = repo_root.joinpath('08_cell_line_prediction', 'fs_plots')
# clinical data from TCGA
clinical_data = data_dir.joinpath('TCGA-CDR-SupplementalTableS1.xlsx')

# data directory and cancer types for microsatellite instability prediction
msi_data_dir = data_dir.joinpath('msi_data')
msi_cancer_types = [
    'COAD',
    'READ',
    'STAD',
    'UCEC',
]
# switch between optimizers for the lasso experiments:
#   True  -> SGD-based (SGDClassifier/SGDRegressor)
#   False -> LogisticRegression/Lasso
lasso_sgd = False

# settings for the MLP experiments
# NOTE(review): the meaning of mlp_folds = -1 is not documented in this
# file -- confirm against the code that reads it
mlp_folds = -1
mlp_search_n_iter = 10
mlp_max_iter = 200

# presumably candidate learning rates ("lr") for a search -- verify
# against the caller
constant_search_lr = [5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 1e-2]
# 'generalization_data' directory, located directly under the repo root
generalization_data_dir = repo_root.joinpath('generalization_data')