-
Notifications
You must be signed in to change notification settings - Fork 0
/
unit_tests.py
237 lines (192 loc) · 15.1 KB
/
unit_tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
from ptm_pose import pose_config, project, annotate, analyze
from ptm_pose import plots as pose_plots
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
database_dir = '../../Database_Information/'
psp_regulatory_site_file = database_dir + 'PhosphoSitePlus/Regulatory_sites.gz'
psp_ks_file = database_dir + 'PhosphoSitePlus/Kinase_Substrate_Dataset.gz'
psp_disease_file = database_dir + 'PhosphoSitePlus/Disease-associated_sites.gz'
elm_interactions = database_dir + '/ELM/elm_interactions.tsv'
elm_motifs = database_dir + '/ELM/elm_classes.tsv'
PTMint = database_dir + '/PTMInt/PTM_experimental_evidence.csv'
PTMcode_interprotein = database_dir + '/PTMcode/PTMcode2_associations_between_proteins.txt.gz'
DEPOD = True
RegPhos = database_dir + '/RegPhos/RegPhos_Phos_human.txt'
ptmsigdb_file = '../../Database_Information/PTMsigDB/data_PTMsigDB_all_sites_v2.0.0.xlsx'
file_dict = {'PhosphoSitePlus': {'Function':psp_regulatory_site_file, 'Process':psp_regulatory_site_file, 'Disease':psp_disease_file, 'Kinase':psp_ks_file, 'Interactions': psp_regulatory_site_file, 'Perturbation': ptmsigdb_file},
'ELM': {'Interactions':elm_interactions, 'Motif Match':elm_motifs},
'PTMcode': {'Interactions':PTMcode_interprotein},
'PTMInt': {'Interactions':PTMint},
'RegPhos': {'Kinase':RegPhos},
'DEPOD': {'Phosphatase':DEPOD},
'PTMsigDB':{'WikiPathway':ptmsigdb_file, 'NetPath':ptmsigdb_file,'mSigDB':ptmsigdb_file, 'Perturbation (DIA2)':ptmsigdb_file, 'Perturbation (DIA)': ptmsigdb_file, 'Perturbation (PRM)':ptmsigdb_file,'Kinase':ptmsigdb_file}}
def configure_unit_test():
pose_config.download_translator()
pose_config.download_ptm_coordinates()
def projection_test(dataset = 'Yang2016', check_type = 'no changes', splicegraph_file = None):
"""
Test the projection of PTMs onto splicing events to identify differentially included PTMs and PTMs with altered flanking sequences using the project model. Compare to see if results changed from previous runs.
Parameters
----------
dataset : str
Dataset to test. Options include 'Yang2016' and 'TCGA_PRAD' (default is 'Yang2016')
check_type : str
Type of check to perform. Options include 'no changes', which will raise an error if there are differences from previous results, and 'report changes', which will print PTMs that are different (default is 'no changes')
splicegraph_file : str
Path to splicegraph file for TCGA PRAD dataset, which is only needed for splicegraph (default is None)
"""
#load dataset of interest and project PTMs
if dataset == 'Yang2016':
#load validation results for comparison
spliced_ptms_true = pd.read_csv('./Test_Datasets/Yang2016/spliced_ptms.csv')
altered_flanks_true = pd.read_csv('./Test_Datasets/Yang2016/altered_flanks.csv')
#### Yang 2016 (hg19) ######
siRNA_data = pd.read_excel('./Test_datasets/Yang2016/esrp1_knockdown_data_Yang2016.xlsx', sheet_name='rMATS ESRP KD', header = 2)
SE_events = pd.read_excel('./Test_datasets/Yang2016/esrp1_knockdown_data_Yang2016.xlsx', sheet_name='rMATS ESRP KD', header = 2).iloc[0:179]
MXE_events = pd.read_excel('./Test_datasets/Yang2016/esrp1_knockdown_data_Yang2016.xlsx', sheet_name='rMATS ESRP KD', header = 183).iloc[0:37]
fiveASS_events = pd.read_excel('./Test_datasets/Yang2016/esrp1_knockdown_data_Yang2016.xlsx', sheet_name='rMATS ESRP KD', header = 222).iloc[0:6]
threeASS_events = pd.read_excel('./Test_datasets/Yang2016/esrp1_knockdown_data_Yang2016.xlsx', sheet_name='rMATS ESRP KD', header = 230).iloc[0:7]
RI_events = pd.read_excel('./Test_datasets/Yang2016/esrp1_knockdown_data_Yang2016.xlsx', sheet_name='rMATS ESRP KD', header = 239)
#project ptms and identify altered flanking sequences using MATS function
results = project.project_ptms_onto_MATS(SE_events = SE_events, MXE_events = MXE_events, RI_events = RI_events, threeASS_events = threeASS_events, fiveASS_events = fiveASS_events, coordinate_type = 'hg19', PROCESSES = 1, identify_flanking_sequences=True)
splice_data, spliced_ptms, altered_flanks = results
#check if TSC2 S981 is in the spliced PTMs
spliced_ptms['Label'] = spliced_ptms['Gene'] + '_' + spliced_ptms['Residue'] + spliced_ptms['PTM Position in Canonical Isoform'].astype(str)
assert 'TSC2_S981' in spliced_ptms['Label'].values or 'TSC2_S981.0' in spliced_ptms['Label'].values
elif dataset == 'TCGA_PRAD':
if splicegraph_file is None:
raise ValueError('splicegraph_dir must be specified for TCGA PRAD dataset')
#load validation results for comparison
spliced_ptms_true = pd.read_csv('./Test_Datasets/TCGA_PRAD/spliced_ptms.csv')
altered_flanks_true = pd.read_csv('./Test_Datasets/TCGA_PRAD/altered_flanks.csv')
#### TCGA PRAD (hg19) ######
psi_data = pd.read_csv('./Test_datasets/TCGA_PRAD/sig_events_PRAD.csv')
splicegraph = pd.read_csv(splicegraph_file, sep = '\t')
results = project.project_ptms_onto_SpliceSeq(psi_data = psi_data, splicegraph = splicegraph, coordinate_type = 'hg19', PROCESSES = 1, identify_flanking_sequences=True, dPSI_col = 'deltaPSI_MW', sig_col = 'p-adj_MW', extra_cols = ['Effect Size_MW'])
psi_data, spliced_ptms, altered_flanks = results
#check if TSC2 S981 is in the spliced PTMs
spliced_ptms['Label'] = spliced_ptms['Gene'] + '_' + spliced_ptms['Residue'] + spliced_ptms['PTM Position in Canonical Isoform'].astype(str)
assert 'TSC2_S981' in spliced_ptms['Label'].values or 'TSC2_S981.0' in spliced_ptms['Label'].values
else:
raise ValueError('Dataset not recognized. Options include Yang2016 and TCGA_PRAD')
#compare results to expected values, either raising an error or printing differences
if check_type == 'no changes':
spliced_ptms_true = np.unique(spliced_ptms_true['UniProtKB Accession'] + '_' + spliced_ptms_true['Residue'] + spliced_ptms_true['PTM Position in Canonical Isoform'].astype(float).astype(int).astype(str))
spliced_ptms_new = np.unique(spliced_ptms['UniProtKB Accession'] + '_' + spliced_ptms['Residue'] + spliced_ptms['PTM Position in Canonical Isoform'].astype(float).astype(int).astype(str))
intersection = set(spliced_ptms_true).intersection(set(spliced_ptms_new))
assert len(intersection) == len(spliced_ptms_true)
altered_flanks_true = np.unique(altered_flanks_true['UniProtKB Accession'] + '_' + altered_flanks_true['Residue'] + altered_flanks_true['PTM Position in Canonical Isoform'].astype(float).astype(int).astype(str))
altered_flanks_new = np.unique(altered_flanks['UniProtKB Accession'] + '_' + altered_flanks['Residue'] + altered_flanks['PTM Position in Canonical Isoform'].astype(float).astype(int).astype(str))
intersection = set(altered_flanks_true).intersection(set(altered_flanks_new))
assert len(intersection) == len(altered_flanks_true)
print('No changes found! Projection test passed!')
elif check_type == 'report changes':
spliced_ptms_true = np.unique(spliced_ptms_true['UniProtKB Accession'] + '_' + spliced_ptms_true['Residue'] + spliced_ptms_true['PTM Position in Canonical Isoform'].astype(float).astype(int).astype(str))
spliced_ptms_new = np.unique(spliced_ptms['UniProtKB Accession'] + '_' + spliced_ptms['Residue'] + spliced_ptms['PTM Position in Canonical Isoform'].astype(float).astype(int).astype(str))
only_in_reference = set(spliced_ptms_true).difference(set(spliced_ptms_new))
only_in_new = set(spliced_ptms_new).difference(set(spliced_ptms_true))
print('Differentially included PTMs:')
if len(only_in_reference) > 0 or len(only_in_new) > 0:
print('Only in reference: {}'.format(', '.join(only_in_reference)))
print('Only in new: {}'.format(', '.join(only_in_new)))
print('\n')
else:
print('No differences found')
print('\n')
altered_flanks_true = np.unique(altered_flanks_true['UniProtKB Accession'] + '_' + altered_flanks_true['Residue'] + altered_flanks_true['PTM Position in Canonical Isoform'].astype(float).astype(int).astype(str))
altered_flanks_new = np.unique(altered_flanks['UniProtKB Accession'] + '_' + altered_flanks['Residue'] + altered_flanks['PTM Position in Canonical Isoform'].astype(float).astype(int).astype(str))
only_in_reference = set(altered_flanks_true).difference(set(altered_flanks_new))
only_in_new = set(altered_flanks_new).difference(set(altered_flanks_true))
print('Altered flanking sequences:')
if len(only_in_reference) > 0 or len(only_in_new) > 0:
print('Only in reference: {}'.format(', '.join(only_in_reference)))
print('Only in new: {}'.format(', '.join(only_in_new)))
print('\n')
else:
print('No differences found')
print('\n')
return spliced_ptms, altered_flanks
def add_annotations_test(spliced_ptms, altered_flanks):
#save previous annotation data and then remove any previous annotations
annotation_cols = [col for col in spliced_ptms.columns if ':' in col]
spliced_annotation_counts = spliced_ptms[annotation_cols].count()
spliced_ptms = spliced_ptms.drop(columns = annotation_cols)
annotation_cols = [col for col in altered_flanks.columns if ':' in col]
altered_annotation_counts = altered_flanks[annotation_cols].count()
altered_flanks = altered_flanks.drop(columns = annotation_cols)
print('Annotation differentially spliced PTMs')
#differentially included ptms
spliced_ptms = annotate.annotate_ptms(spliced_ptms, psp_regulatory_site_file=psp_regulatory_site_file, psp_ks_file=psp_ks_file, psp_disease_file=psp_disease_file, elm_interactions=elm_interactions, elm_motifs = elm_motifs, PTMint=PTMint,PTMcode_interprotein=PTMcode_interprotein, DEPOD=DEPOD, RegPhos=RegPhos, ptmsigdb_file=ptmsigdb_file)
#check if annotations are the same, if not, print differences
for annot in spliced_annotation_counts.index:
if annot in spliced_ptms.columns:
if spliced_annotation_counts[annot] != spliced_ptms[annot].count():
print(f'Annotation {annot} has changed from {spliced_annotation_counts[annot]} to {spliced_ptms[annot].count()}')
print('Annotation altered flanking sequences')
#altered flanking sequences
altered_flanks = annotate.annotate_ptms(altered_flanks, psp_regulatory_site_file=psp_regulatory_site_file, psp_ks_file=psp_ks_file, psp_disease_file=psp_disease_file, elm_interactions=elm_interactions, elm_motifs = elm_motifs, PTMint=PTMint, PTMcode_interprotein=PTMcode_interprotein, DEPOD=DEPOD, RegPhos=RegPhos, ptmsigdb_file=ptmsigdb_file)
#check if annotations are the same, if not, print differences
for annot in spliced_annotation_counts.index:
if annot in spliced_ptms.columns:
if spliced_annotation_counts[annot] != spliced_ptms[annot].count():
print(f'Annotation {annot} has changed from {spliced_annotation_counts[annot]} to {spliced_ptms[annot].count()}')
return spliced_ptms, altered_flanks
def get_file(database = 'PhosphoSitePlus', annotation_type = 'Function'):
return file_dict[database][annotation_type]
def annotations_analysis_test(spliced_ptms, altered_flanks):
print('Combining Interaction Information')
combined = analyze.combine_outputs(spliced_ptms = spliced_ptms, altered_flanks = altered_flanks)
print('Getting annotation summaries')
print(analyze.get_annotation_categories(combined))
pose_plots.show_available_annotations(spliced_ptms)
plt.show()
for database in pose_config.annotation_col_dict.keys():
for annot_type in pose_config.annotation_col_dict[database]:
if not (database == 'ELM' and annot_type == 'Motif Match') and not (database == 'PTMcode' and annot_type == 'Intraprotein'):
print(f'Testing analysis on {database} {annot_type} annotation type')
for collapse_annotations in [True, False]:
annotations, annotation_counts = analyze.get_ptm_annotations(combined, annotation_type = annot_type, database = database)
print('Calculating enrichment with precomputed background')
file = get_file(database = database, annotation_type = annot_type)
enrichment = analyze.annotation_enrichment(combined, annotation_type = annot_type, database = database, annotation_file = file, collapse_on_similar = collapse_annotations, save_background = True)
if annot_type == 'Function' and database == 'PhosphoSitePlus':
print('Calculating enrichment by significance')
enrichment = analyze.annotation_enrichment(combined, annotation_type = annot_type, database = database, collapse_on_similar = collapse_annotations, annotation_file=file, background_type = 'significance', alpha = 0.01, min_dPSI = 0.2)
if annot_type == 'Function' and database == 'PhosphoSitePlus':
print('Calculating enrichment for specific modifications')
enrichment = analyze.annotation_enrichment(combined, annotation_type = annot_type, database = database, collapse_on_similar = collapse_annotations, annotation_file = file, mod_class='Phosphorylation')
pose_plots.plot_annotations(combined, database=database, annot_type=annot_type, collapse_on_similar = collapse_annotations)
plt.show()
print('Annotation unit test passed!')
def analysis_unit_tests(spliced_ptms, altered_flanks, kstar_network_dir):
print('Testing overview analysis')
pose_plots.modification_breakdown(spliced_ptms = spliced_ptms, altered_flanks = altered_flanks)
plt.show()
print('Testing gene set enrichment')
enr_results = analyze.gene_set_enrichment(spliced_ptms = spliced_ptms, altered_flanks=altered_flanks, min_dPSI = 0.1)
pose_plots.plot_EnrichR_pies(enr_results)
plt.show()
print('Testing interaction network functions')
protein_interactions = analyze.protein_interactions(spliced_ptms)
protein_interactions.get_interaction_network()
protein_interactions.get_interaction_stats()
protein_interactions.plot_interaction_network()
plt.show()
protein_interactions.plot_network_centrality()
plt.show()
print('Testing altered flanking sequence analysis')
altered_flanks = analyze.compare_flanking_sequences(altered_flanks)
altered_flanks = analyze.compare_inclusion_motifs(altered_flanks)
pose_plots.location_of_altered_flanking_residues(altered_flanks, modification_class = 'Phosphorylation')
plt.show()
pose_plots.alterations_matrix(altered_flanks.head(10))
plt.show()
print("Testing KSTAR analysis")
kstar = analyze.kstar_enrichment(spliced_ptms, network_dir = kstar_network_dir, phospho_type = 'Y')
print(kstar.return_enriched_kinases())
kstar = analyze.kstar_enrichment(spliced_ptms, network_dir=kstar_network_dir, phospho_type = 'ST')
print(kstar.return_enriched_kinases())
print('Analysis unit test passed!')
def comprehensive_unit_test():
pass