-
Notifications
You must be signed in to change notification settings - Fork 0
/
proteomeScoutAPI.py
836 lines (632 loc) · 25.8 KB
/
proteomeScoutAPI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
# ProteomeScoutAPI
#
# ===============================================================================
# ABOUT
# ===============================================================================
# Version 2.0
#
# June 2020
#
# Oritinally By Alex Holehouse, Washington University in St. Louis
# Contact [email protected]
# This version maintained by Naegle Lab
# Contact Kristen Naegle at [email protected]
#
#
#
# ===============================================================================
# OVERVIEW
# ===============================================================================
# ProteomeScoutAPI is a Python module which can be used to connect to and parse
# ProteomeScout flatfiles.
#
# Specifically, the goal of this module is to allow anyone to interact with
# ProteomeScout data without the need to
#
# 1) Repeatedly query the ProteomeScout sever
#
# 2) Have any knowledge of SQL, or use an SQL-Python ORM
#
# 3) Facilitate rapid exploration of the ProteomeScout dataset
#
# ===============================================================================
# Usage
# ===============================================================================
# The general approach to usage is as follows
#
# 0) Import the proteomeScoutAPI module
#
# 1) Create the API object by loading a flat file. ProteomeScout flat files can
# be obtained from https://proteomescout.wustl.edu/compendia
#
# PTM_API = ProteomeScoutAPI('<flat filename goes here>')
#
# 2) Query the API object using the functions. Available functions are
#
# PTM_API.get_phosphosites(ID)
# PTM_API.get_PTMs(ID)
# PTM_API.get_mutations(ID)
#
# For more information on these functions I suggest reading the rest of the
# source code, or once you've loaded an API object type
#
# help(PTM_API.get_mutations)
#
# 3) The PTM_APU.uniqueKeys is a list of the unique accession numbers to provide
# an easy way to loop over all the unique entries. NOTE THAT IDS HAVE REDUNDANCY
# which is deliberate (and makes interfacing easy) but cycling over all the IDs
# in the API object would be incorrect and lead to double counting
#
# ===============================================================================
# EXAMPLES
# ===============================================================================
# from proteomeScoutAPI import ProteomeScoutAPI
#
#
# ID = "O46631"
#
# PTM_API = ProteomeScoutAPI("proteomescout_mammalia_20140831.tsv")
#
# PTM_API.get_mutations(ID)
#
# PTM_API.get_PTMs(ID)
#
# # the following loop prints all the phosphosites in all the proteins
# for ID in PTM_API.uniqueKeys():
# print PTM_API.get_phosphosites(ID)
#
# ===============================================================================
# Exception if file is bad
class BadProteomeScoutFile(Exception):
pass
class ProteomeScoutAPI:
def __init__(self,filename):
"""
filename should be a ProteomeScout flatfile
"""
self.database={}
self.uniqueKeys=[]
# this will throw and exception if there's a problem with the file
self.__checkFile(filename)
self.__buildAPI(filename)
def __checkFile(self, filename):
"""
Internal function to check we (apparently) have a valid
proteomeScout file
"""
try:
with open(filename, 'r') as f:
first_line = f.readline()
if not len(first_line.split("\t")) == 19:
raise BadProteomeScoutFile("N/A")
except:
BadProteomeScoutFile("Invalid ProteomeScout flat file %s.\nFile is invalid or corrupted" % str(filename))
def __buildAPI(self, datafile):
# read in the file line by line
with open(datafile) as f:
content = f.readlines()
# read the flatfile headers and parse
# the contents
headers_raw = content[0].split('\t')
headers=[]
for i in headers_raw:
headers.append(i.strip())
# remove the header line from the read in data
content.pop(0)
# for each line in the data
for line in content:
# split the record and get the canonical ID
record = line.split('\t')
IDlist_raw = record[1].split(";")
ProteomeScoutID = record[0]
IDlist = []
for ID in IDlist_raw:
IDlist.append(ID.strip())
# add the first ID to the list of unique keys
self.uniqueKeys.append(ProteomeScoutID)
# now construct the object dictionary. Note we
# have hardcoded the number of header columns here
# though if in future versions of the ProteomeScout
# dataset more columns are added you coulded extend this
# here
OBJ={}
for i in range(1,19):
OBJ[headers[i]] = record[i]
# ALWAYS add the record via the ProteomeScout uniquekey
self.database[ProteomeScoutID] = OBJ
# for each other ID associated with the record, assign that accession-record
# mapping assuming the record is not overwriting a more complete record which
# already exists.
#
# This means users can use whatever accession type they want to interact
# with the ProteomeScoutAPI, while recognizing that many different accessions
# will be referring to the same protein
for ID in IDlist:
# A possible issue is the possibility that the same accession
# points to several ProteomeScout records. Rather than merging records,
# which can introduce problems, we've chosen to deal with this by defaulting
# to the record with the largest number of modifications associated with it
#
# In most examples of record duplicaion there is clearly a 'major' and 'minor'
# record. This occurs through insufficient cross-referencing in databases
# outside of ProteomeScout. The 'minor' record is typically a subset of
# the 'major' record
if ID in self.database:
# if the newly found record has more PTMs associated with it than the
# original record then overwrite
if len(OBJ['modifications'].split(";")) > len(self.database[ID]['modifications'].split(";")):
self.database[ID] = OBJ
else:
self.database[ID] = OBJ
def get_PTMs(self, ID):
"""
Return all PTMs associated with the ID in question.
POSTCONDITIONS:
Returns a list of tuples of modifications
[(position, residue, modification-type),...,]
Returns -1 if unable to find the ID
Returns [] (empty list) if no modifications
"""
try:
record = self.database[ID]
except KeyError:
return -1
mods = record["modifications"]
mods_raw=mods.split(";")
mods_clean =[]
for i in mods_raw:
tmp = i.strip()
tmp = tmp.split("-")
# append a tuple of (position, residue, type)
mods_clean.append((tmp[0][1:], tmp[0][0], "-".join(tmp[1:])))
return mods_clean
def get_structure(self, ID):
"""
Return all structures associated with the ID in question.
POSTCONDITIONS:
Returns a list of tuples of structure
if there is a problem with the start and end position, these will be
returned as -1
[(domain_name, start_position, end_position),...,]
Returns -1 if unable to find the ID
Returns [] (empty list) if no structures
"""
try:
record = self.database[ID]
except KeyError:
return -1
structs = record["structure"]
structs_raw=structs.split(";")
structs_clean =[]
for i in structs_raw:
if i:
tmp = i.strip()
tmp = tmp.split(":")
if len(tmp)>=2:
name, sites = tmp
tmp = sites.split("-")
structs_clean.append((name, tmp[0],tmp[1]))
else:
print("ERROR: the structure did not match expected format %s"%(i))
#doms_clean.append((tmp, -1, -1))
return structs_clean
def get_domains(self, ID, domain_type):
"""
Return all domains associated with the ID in question.
For pfam domains domain_type is 'pfam'
For UniProt domains domain_type is 'uniprot'
POSTCONDITIONS:
Returns a list of tuples of domains
if there is a problem with the start and end position, these will be
returned as -1
[(domain_name, start_position, end_position),...,]
Returns -1 if unable to find the ID
Returns [] (empty list) if no modifications
"""
try:
record = self.database[ID]
except KeyError:
return -1
domain_type = domain_type.lower()
if domain_type == 'pfam':
doms = record["pfam_domains"]
elif domain_type == 'uniprot':
doms = record["uniprot_domains"]
else:
print("%s is an unrecognized domain type. Use 'pfam' or 'uniprot'"%(domain_type))
return -2
#check for atypical records and replace the extra ; to avoid errors in parsing
doms = doms.replace("; atypical", " atypical")
doms = doms.replace("; truncated", " truncated")
doms = doms.replace("; second part", " second part")
doms = doms.replace("; first part", " first part")
doms_raw=doms.split(";")
doms_clean =[]
for i in doms_raw:
if i:
tmp = i.strip()
tmp = tmp.split(":")
if len(tmp)>=2:
name, sites = tmp
tmp = sites.split("-")
doms_clean.append((name, tmp[0], tmp[1]))
else:
print("ERROR: the domain did not match expected format %s"%(i))
#if this happens, it's likely due to an extra ; where atypical occurs, so replace ; atypical before the split.
#doms_clean.append((tmp, -1, -1))
return doms_clean
def get_domains_harmonized(self, ID):
"""
This will harmonize pfam and uniprot domains into one tuple output
with the following rules:
1. Use the pfam domain name (since it does not append numbers if more than one domain of that type)
2. Use the Uniprot domain boundaries (since they are typically more expansive)
3. If uniprot does not have a pfam domain, use the pfam domain as it is
POSTCONDITIONS:
Returns a list of tuples of domains
if there is a problem with the start and end position, these will be
returned as -1
[(domain_name, start_position, end_position),...,]
Returns -1 if unable to find the ID
Returns [] (empty list) if no domains
"""
#First find which domains overlap from pfam to uniprot, keeping those
# they overlap if the start position or end positions are
pfam = self.get_domains(ID, 'pfam')
uniprot = self.get_domains(ID, 'uniprot')
if pfam == -1:
return -1
harmonized = []
source = [] #keeping track of source, but not currently returning it
#create a dictionary of matches from pfam to uniprot and vice versa
pfamDict = {}
uniprotDict = {}
for p in pfam:
p_name, p_start, p_stop = p
pfamSpan = set(range(int(p_start), int(p_stop)))
pfamDict[p] = []
for uni in uniprot:
uni_name, uni_start, uni_stop = uni
uniSpan = set(range(int(uni_start), int(uni_stop)))
if uniSpan.intersection(pfamSpan):
pfamDict[p] = uni
uniprotDict[uni] = p
break
#repeat from uniprot for uniprot domains not yet matched
for uni in uniprot:
found = 0
if uni not in uniprotDict:
uni_name, uni_start, uni_stop = uni
uniSpan = set(range(int(uni_start), int(uni_stop)))
uniprotDict[uni] = []
for p in pfam:
p_name, p_start, p_stop = p
pfamSpan = set(range(int(p_start), int(p_stop)))
if uniSpan.intersection(pfamSpan):
pfamDict[p] = uni
uniprotDict[uni] = p
found = 1
break
#here, if uni does not exist in uniprot, then add it (a uniprot domain not in pfam)
if not found:
pfamDict[uni] = uni
#now take all uniprot domains and any in pfam that have no match to uniprot.
# Use the pfam domain name (which doesn't add numbers if there are tandem domains, e.g.)
for p in pfamDict:
if not pfamDict[p]: #there is no matching uniprot domain
harmonized.append(p)
source.append('pfam')
else: #there is, but we want to change the uniprot name to have the pfam domain
uni_name, uni_start, uni_stop = pfamDict[p]
p_name = p[0]
harmonized.append((p_name, uni_start, uni_stop))
source.append('uniprot')
return harmonized
def get_Scansite(self, ID):
"""
Return all Scansite annotations that exist for a protein record (ID)
POSTCONDITIONS:
Returns a dict of tuples of scansite predictions. dictionary keys are the type of Scansite prediction (bind, kinase)
Tuples in each are
[(residuePosition, kinase/bind name, score),...,]
Returns -1 if unable to find the ID
Returns [] (empty list) if no scansite predictions
"""
try:
record = self.database[ID]
except KeyError:
return -1
scansite = record["scansite_predictions"]
if not scansite:
return [] #no scansite records found
#scansite line is <residue><position>-scansite_<type>-<name>:<score>
# multiple entries are separated by semicolons
scansiteDict = {}
for record in scansite.split(';'):
arr = record.split('-')
res_pos = arr[0].strip(' ')
scan_type = arr[1]
nameScore = arr[2]
if scan_type not in scansiteDict:
scansiteDict[scan_type] = [] #send array of tuples
name, score = nameScore.split(':')
info = [res_pos, name, score]
scansiteDict[scan_type].append(info)
return scansiteDict
def get_Scansite_byPos(self, ID, res_pos):
"""
Return all Scansite annotations that exist at a specific residue position, given as the AminoAcidPos, e.g. T183
POSTCONDITIONS:
Returns a list of tuples of modifications
[(position, residue, modification-type),...,]
Returns -1 if unable to find the ID
Returns empty dictionary if no Scansite. Returns emtpy lists in Dictionary items
if no Scansite predictions found at that site
"""
scansiteDict = self.get_Scansite(ID)
subsetDict = {}
for typeScan in scansiteDict:
subsetDict[typeScan] = []
for prediction in scansiteDict[typeScan]:
if prediction[0] == res_pos:
subsetDict[typeScan].append(prediction)
return subsetDict
def get_nearbyPTMs(self,ID,pos, window):
"""
Return all PTMs associated with the ID in question that reside within
+/-window, relative to the designated position (pos)
POSTCONDITIONS:
Returns a list of tuples of modifications
[(position, residue, modification-type),...,]
Returns -1 if unable to find the ID
Returns [] (empty list) if no modifications
"""
mods = self.get_PTMs(ID)
modsInWin = []
if mods == -1:
return -1
elif len(mods)==0:
return []
else:
for m in mods:
site = int(m[0])
if site >= pos-window and site <= pos+window:
modsInWin.append(m)
return modsInWin
def get_species(self,ID):
"""
Return the species associated with the ID in question.
POSTCONDITIONS:
Returns a string of the species name
Returns '-1' if unable to find the ID
Returns '' (empty list) if no species
"""
try:
record = self.database[ID]
except KeyError:
return '-1'
species = record["species"]
return species
def get_sequence(self, ID):
"""
Return the sequence associated with the ID in question.
POSTCONDITIONS:
Returns a string of the sequence
Returns '-1' if unable to find the ID
Returns '' (empty list) if no sequence
"""
try:
record = self.database[ID]
except KeyError:
return '-1'
sequence = record["sequence"]
return sequence
def get_acc_gene(self, ID):
"""
Return the gene name associated with the ID in question.
POSTCONDITIONS:
Returns a string of the gene name
Returns '-1' if unable to find the ID
Returns '' (empty list) if no gene name
"""
try:
record = self.database[ID]
except KeyError:
return '-1'
acc_gene = record["acc_gene"]
return acc_gene
def get_phosphosites(self,ID):
"""
Return all phosphosites associated with the ID in question.
POSTCONDITIONS:
Returns a list of tuples of phosphosites
[(position, residue, phosphosite-type),...,]
Returns -1 if unable to find the ID
Returns [] (empty list) if no modifications
"""
mods = self.get_PTMs(ID)
if mods == -1:
return -1
phospho=[]
for mod in mods:
if mod[2].find("Phospho") >= 0:
phospho.append(mod)
return phospho
def get_evidence(self, ID):
"""
Return all evidence associated with modifications for the ID in question.
POSTCONDITIONS:
Returns a list of tuples of evidence
[(numEvidences, [ArrayOfEvidences], (numEvidences ScecondSite, [ArrayOfEvidences], etc.]
Returns -1 if unable to find the ID
Returns [] (empty list) if no modifications/evidences
"""
try:
record = self.database[ID]
except KeyError:
return -1
evidence = record["evidence"]
evidence_raw=evidence.split(";")
evidence_clean = []
for site in evidence_raw: #an array of evidences, corresponding in position to each site.
indEvidences = site.split(",")
evidencesArrTemp = []
for i in indEvidences:
i.strip()
evidencesArrTemp.append(int(i))
numEvidences = len(evidencesArrTemp)
# append a tuple of length of evidences, and a list of evidences
evidence_clean.append([numEvidences, evidencesArrTemp])
return evidence_clean
def get_mutations(self, ID):
"""
Return all mutations associated with the ID in question.
mutations = PTM_API.get_mutations(ID)
POSTCONDITIONS:
Returns a list of tuples of mutations
[(original residue, position, new residue, annotation),...,]
Returns -1 if unable to find the ID
Returns -2 if the number of mutations and annotations do not match
Returns [] (empty list) if no mutations
"""
try:
record = self.database[ID]
except KeyError:
return -1
mutations = record["mutations"]
mutations_ann = record["mutation_annotations"]
if len(mutations) == 0:
return []
mutations_raw=mutations.split(";")
mutations_clean=[]
mutations_ann_raw = mutations_ann.split("|")
if len(mutations_raw) != len(mutations_ann_raw):
print("Error: Not the same number of annotations (%d) and mutations (%d)\n"%(len(mutations_ann_raw), len(mutations_raw)))
return -2
for idx, i in enumerate(mutations_raw):
tmp = i.strip()
tmpArr = tmp.split(":")
tmp = tmpArr[0];
if len(tmpArr) == 2:
label = tmpArr[1]
else:
label = ''
# append a tuple of (position, residue, type)
mutations_clean.append((tmp[1:-1], tmp[0], tmp[-1], label,
mutations_ann_raw[idx].strip()))
return mutations_clean
def get_GO(self, ID):
"""
Return all GO terms associated with the ID in question
POSTCONDITIONS:
Returns a list of GO Terms
Returns a -1 if unable to find the ID
Returns a [] (empty list) if no GO terms
"""
try:
record = self.database[ID]
except KeyError:
return -1
GO_terms = record["GO_terms"]
if len(GO_terms)==0:
return []
GO_termsArr = GO_terms.split(";")
GO_terms_clean = []
for i in GO_termsArr:
GO_terms_clean.append(i.strip())
return GO_terms_clean
def get_kinaseLoops(self, ID):
"""
Return kinase activation loop with the ID in question
POSTCONDITIONS:
Returns a tuple of (loop/predicted, start, stop)
Returns a -1 if unable to find the ID
Returns a [] (empty list) if no kinase activation loops
"""
try:
record = self.database[ID]
except KeyError:
return -1
finalLoops = []
kinase_loops = record['kinase_loops']
if kinase_loops:
loops = kinase_loops.split(';')
for loop in loops:
info = loop.split(':')
loopType = info[0]
start, stop = info[1].split('-')
finalLoops.append((loopType, start, stop))
else:
return []
return finalLoops
def get_accessions(self,ID):
"""
Return a list of accessions associated with the protein
POSTCONDITIONS:
Returns a list of the protein's accessions
Returns a -1 if unable to find the ID
"""
try:
record = self.database[ID]
except KeyError:
return -1
raw_accessions = record['accessions']
split_accessions = raw_accessions.split(';')
acc_list = []
for acc in split_accessions:
acc_list.append(acc.strip())
return(acc_list)
def get_PTMs_withEvidenceThreshold(self, ID, evidenceThreshold):
"""
Return all PTMs associated with the ID in question that have at least evidenceThreshold or more pieces
of experimental or database evidence
POSTCONDITIONS:
Returns a list of tuples of modifications
[(position, residue, modification-type),...,]
Returns -1 if unable to find the ID
Returns -2 if the modifications have a mismatched evidence array
Returns [] (empty list) if no modifications fit that threshold
"""
evidences = self.get_evidence(ID)
mods = self.get_PTMs(ID)
modsMeetThreshold = []
if mods == -1:
return -1
elif len(mods)==0:
return []
else:
if len(evidences) != len(mods):
return -2
for i in range(0, len(mods)):
numEvidences, evidenceArr = evidences[i]
if numEvidences >= evidenceThreshold:
modsMeetThreshold.append(mods[i])
return modsMeetThreshold
def get_PTMs_withEvidence(self, ID):
"""
Return PTMs as a list of dictionaries with both the modification information and the evidence associated with that.
POSTCONDITIONS:
Returns a list of dictionaries, keys are 'mod' and 'evidence'.
dictionary[0]['mod'] is an array with (position, residue, modification-type) of the first PTM in the ID record
dictionary[0]['evidence'] is a list of experiment IDs for that PTM
Returns -1 if unable to find ID
Returns -2 if the modifications have a mimatched evidence array
Returns [] (empty list) if no modifications
"""
modList = []
evidences = self.get_evidence(ID)
mods = self.get_PTMs(ID)
if mods == -1:
return -1
elif len(mods)==0:
return []
else:
if len(evidences) != len(mods):
return -2
for i in range(0, len(mods)):
dictTemp = {}
dictTemp['mod'] = mods[i]
numEvidences, evidenceArr = evidences[i]
dictTemp['evidence'] = evidenceArr
modList.append(dictTemp)
return modList