-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_ptm_coordinates.py
93 lines (66 loc) · 4.99 KB
/
generate_ptm_coordinates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import numpy as np
import pandas as pd
from ExonPTMapper import mapping
import datetime
import pyliftover
from tqdm import tqdm
mapper = mapping.PTM_mapper()
mapper.find_ptms_all(phosphositeplus_file = '../../Database_Information/PhosphoSitePlus/reformatted_sites/phosphositeplus_data.csv')
mapper.mapPTMs_all()
#get coordinate info
ptm_coordinates = mapper.ptm_coordinates.copy()
#annotate with ptm position in canonical isoform
ptm_coordinates['UniProtKB Accession'] = ptm_coordinates['Source of PTM'].apply(lambda x: x.split(';'))
ptm_coordinates['Residue'] = ptm_coordinates['UniProtKB Accession'].apply(lambda x: x[0].split('_')[1][0])
#get location of PTM in canonical isoform, if found in canonical isoform
ptm_coordinates['PTM Position in Canonical Isoform'] = ptm_coordinates['UniProtKB Accession'].apply(lambda x: [ptm.split('_')[1][1:] for ptm in x if ptm.split('_')[0] in config.canonical_isoIDs.values()])
ptm_coordinates['PTM Position in Canonical Isoform'] = ptm_coordinates['PTM Position in Canonical Isoform'].apply(lambda x: ';'.join(x) if len(x) > 0 else np.nan)
ptm_coordinates['Found in Canonical'] = ptm_coordinates['PTM Position in Canonical Isoform'].apply(lambda x: x == x)
#set accession to use: if canonical use the non-isoform id, else use the first isoform id in list
ptm_coordinates['UniProtKB Accession'] = ptm_coordinates.apply(lambda x: ';'.join([ptm.split('-')[0]for ptm in x['UniProtKB Accession'] if ptm.split('_')[0] in config.canonical_isoIDs.values()]) if x['Found in Canonical'] else x['UniProtKB Accession'][0].split('_')[0], axis = 1)
#update location of PTM to be from first isoform in list if not found in canonical isoform
ptm_coordinates['PTM Position in Canonical Isoform'] = ptm_coordinates.apply(lambda x: x['PTM Position in Canonical Isoform'] if x['Found in Canonical'] else x['Source of PTM'].split(';')[0].split('_')[1][1:], axis = 1)
new_coords = []
liftover_object = pyliftover.LiftOver('hg38','hg19')
for i, row in tqdm(ptm_coordinates.iterrows(), total = ptm_coordinates.shape[0], desc = 'Converting from hg38 to hg19 coordinates'):
new_coords.append(mapping.convert_genomic_coordinates(row['Gene Location (hg38)'], row['Chromosome/scaffold name'], row['Strand'], from_type = 'hg38', to_type = 'hg19', liftover_object = liftover_object))
ptm_coordinates['Gene Location (hg19)'] = new_coords
new_coords = []
liftover_object = pyliftover.LiftOver('hg19','hg18')
for i, row in tqdm(ptm_coordinates.iterrows(), total = ptm_coordinates.shape[0], desc = 'Converting from hg19 to hg18 coordinates'):
new_coords.append(mapping.convert_genomic_coordinates(row['Gene Location (hg19)'], row['Chromosome/scaffold name'], row['Strand'], from_type = 'hg19', to_type = 'hg18', liftover_object = liftover_object))
ptm_coordinates['Gene Location (hg18)'] = new_coords
ptm_coordinates['PTM Position in Canonical Isoform'] = ptm_coordinates['PTM Position in Canonical Isoform'].str.split(';')
ptm_coordinates['UniProtKB Accession'] = ptm_coordinates['UniProtKB Accession'].str.split(';')
ptm_coordinates = ptm_coordinates.explode(['UniProtKB Accession', 'PTM Position in Canonical Isoform']).reset_index()
#add whether or not the PTM is constitutive
mapper = mapping.PTM_mapper() #reload original mapper object to get nonconstitutive list
#grab nonconstitutive list from ptm_info object
ptm_info = mapper.ptm_info.copy()
nonconstitutive_list = ptm_info[ptm_info['PTM Conservation Score'] != 1].index.values
def check_constitutive(ptm, nonconstitutive_list):
"""
For a given list of ptms, check if any of the ptms are found in the list of nonconstitutive ptms, meaning that they have previously been found to be missing from isoforms in Ensembl
Parameters
----------
ptm : list
List of PTMs to check (each ptm should be in the form of "UniProtID_ResiduePosition" (e.g. "P12345-1_Y100"))
nonconstitutive_list : list
List of PTMs that have been found to be nonconstitutive (based on data from ptm_info object generated by ExonPTMapper)
"""
for p in ptm:
if p in nonconstitutive_list:
return False
ptm_coordinates['Split PTMs'] = ptm_coordinates['Source of PTM'].apply(lambda x: x.split(';'))
const_list = []
for i, row in tqdm(ptm_coordinates.iterrows(), total = ptm_coordinates.shape[0], desc = 'Checking for constitutive PTMs'):
const_list.append(check_constitutive(row['Split PTMs'], nonconstitutive_list))
ptm_coordinates['Constitutive'] = const_list
#add flanking sequence information
ptm_coordinates['Expected Flanking Sequence'] = ptm_coordinates['Split PTMs'].apply(lambda x: ';'.join(set([mapper.ptm_info.loc[p, 'Flanking Sequence'] for p in x])))
#drop unnecessary columns and save
ptm_coordinates = ptm_coordinates.drop(columns = 'Split PTMs')
ptm_coordinates.to_csv('../PTM_POSE/ptm_pose/Resource_Files/ptm_coordinates.csv', index = False)
#write to text file indicating when the data was last updated
with open('./ptm_pose/Resource_Files/last_updated.txt', 'w') as f:
f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))