-
Notifications
You must be signed in to change notification settings - Fork 0
/
GO_list.py
105 lines (84 loc) · 3.17 KB
/
GO_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pandas as pd
import os
# Folder holding the input workbook (placeholder value; set before running).
# main() derives the output folder by swapping "/uniprot_details/" for
# "/GO_analysis/" in this string, which only matches when that folder name is
# followed by a "/".
path = 'path/to/folder'
# Name of the input Excel workbook inside `path`; the name without its
# extension is reused as the prefix of the exported result files.
file = "interactors_stringDB_ID_all.xlsx"
# function to subset df to the needed columns and factors
def format_df(df):
    """Return the rows and columns of ``df`` needed for the GO analysis.

    Rows whose ``uniqueness`` value is ``'both'`` are excluded, because only
    "bArr1 only" versus "bArr2 only" interactors are tested afterwards.
    The input frame is left untouched; a new frame is returned.

    Args:
        df: DataFrame containing at least the columns listed in ``get_cols``.

    Returns:
        A new DataFrame restricted to ``get_cols`` (in that order) and to the
        rows where ``uniqueness`` is not ``'both'``. The original index
        labels of the kept rows are preserved.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    get_cols = [
        'preferredName_B',
        'uniqueness',
        'uniprot_ID_proteinB',
        'Gene Name',
        'Protein Name',
        'GO IDs',
        'GO terms',
    ]
    # A boolean mask is used instead of drop(index, inplace=True): it does not
    # modify the caller's frame, and it cannot remove unrelated rows that
    # happen to share an index label with a 'both' row.
    keep = df['uniqueness'] != 'both'
    return df.loc[keep, get_cols].copy()
# for each category get GO term and IDs, returns df
def get_category_id(GO_term_list, GO_id_list, category):
    """Collect the GO terms of one category together with their GO IDs.

    Args:
        GO_term_list: List of GO term strings. A term is taken to belong to
            ``category`` when it starts with that prefix.
        GO_id_list: List of GO IDs, positionally parallel to
            ``GO_term_list`` (entry ``k`` is the ID of term ``k``).
        category: Category prefix to select, e.g. ``'C:'``, ``'F:'`` or
            ``'P:'``.

    Returns:
        DataFrame with the columns ``GO_terms`` and ``GO_ids``, one row per
        matching term. When nothing matches, a single row holding NaN in
        both columns is returned so the protein still yields one row.

    Raises:
        IndexError: If a matching term has no counterpart in ``GO_id_list``.
    """
    # Prefix test rather than substring test: a term such as
    # "F:ATP:ADP antiporter activity" contains "P:" but is not a 'P:' term.
    positions = [
        position
        for position, term in enumerate(GO_term_list)
        if term.startswith(category)
    ]
    if not positions:
        # Placeholder row for proteins without any term of this category.
        GO_terms = [float("NaN")]
        GO_ids = [float("NaN")]
    else:
        GO_terms = [GO_term_list[position] for position in positions]
        GO_ids = [GO_id_list[position] for position in positions]
    return pd.DataFrame({'GO_terms': GO_terms, 'GO_ids': GO_ids})
def complete_category_df(df, row, category_df):
    """Attach the annotation columns of one ``df`` row to ``category_df``.

    Every column of ``df`` except the last two is read at position ``row``
    and repeated once per row of ``category_df``; the resulting constant
    columns are appended to the right of ``category_df``.

    Args:
        df: Source DataFrame; its last two columns are skipped.
        row: Positional (``iloc``) index of the row to copy values from.
        category_df: DataFrame to extend.

    Returns:
        ``category_df`` followed by one constant column per copied ``df``
        column, in the column order of ``df``.
    """
    repeat = len(category_df)
    constant_columns = [
        pd.Series([df[name].iloc[row]] * repeat, name=name)
        for name in df.columns[:-2]
    ]
    return pd.concat([category_df, *constant_columns], axis='columns')
def create_category_df(df, row, category):
    """Build the GO table of one category for a single row of ``df``.

    The ``'GO terms'`` and ``'GO IDs'`` cells of the row are converted to
    strings and split on ``'; '`` into individual entries. The entries of
    ``category`` are selected with ``get_category_id`` and the remaining
    row values are attached with ``complete_category_df``.

    Args:
        df: DataFrame with the columns ``'GO terms'`` and ``'GO IDs'``.
        row: Positional (``iloc``) index of the row to process.
        category: Category marker forwarded to ``get_category_id``.

    Returns:
        The DataFrame produced by ``complete_category_df`` for this row.
    """
    separator = r'; '
    record = df.iloc[row]
    # str() also turns a missing (NaN) cell into text, so split() never fails.
    term_entries = str(record['GO terms']).split(separator)
    id_entries = str(record['GO IDs']).split(separator)
    selected = get_category_id(term_entries, id_entries, category)
    return complete_category_df(df, row, selected)
# create the GO df of one category by processing every row (protein) of df
def GO_by_row(df, category):
    """Build the GO table of one category across all rows of ``df``.

    Each row of ``df`` represents one interacting protein. For every row
    the protein name (column ``'preferredName_B'``) is printed as progress
    output and ``create_category_df`` supplies that protein's table; all
    tables are then stacked into one frame with a fresh 0..n-1 index.

    Args:
        df: DataFrame with one protein per row.
        category: Category marker forwarded to ``create_category_df``.

    Returns:
        The row-wise concatenation of all per-protein tables, or an empty
        DataFrame when ``df`` has no rows.
    """
    print("Getting GOs for " + str(len(df)) + "x proteins")
    # Collect the per-protein tables and concatenate once at the end.
    # Concatenating inside the loop copies every accumulated row again on
    # each iteration and starts from an empty frame.
    frames = []
    for row in range(len(df)):
        print(df['preferredName_B'].iloc[row])
        frames.append(create_category_df(df, row, category))
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames, axis='index', ignore_index=True)
def main(path_to_folder, filename):
    """Split the GO annotations of an interactor workbook by GO category.

    Reads ``filename`` from ``path_to_folder``, filters it with
    ``format_df`` and builds one table per category marker (``'C:'``,
    ``'F:'``, ``'P:'``) with ``GO_by_row``. Each table is written to its own
    workbook named ``<input name without extension>_GO_<letter>.xlsx``.

    The output folder is ``path_to_folder`` with ``"/uniprot_details/"``
    replaced by ``"/GO_analysis/"``. When that fragment does not occur, the
    results are written next to the input file.

    Args:
        path_to_folder: Folder containing the input workbook, with or
            without a trailing path separator.
        filename: File name of the input workbook (read with openpyxl).
    """
    # os.path.join works whether or not the folder ends with a separator;
    # plain string concatenation silently built a wrong path without one.
    df = pd.read_excel(io=os.path.join(path_to_folder, filename),
                       engine="openpyxl")
    df_sub = format_df(df)

    categories = (
        ("cellular component", "C"),
        ("molecular function", "F"),
        ("biological process", "P"),
    )
    results = {}
    for label, letter in categories:
        print("Category: " + label)
        results[letter] = GO_by_row(df_sub, letter + ":")

    print("Exporting results...")
    new_folder = path_to_folder.replace("/uniprot_details/", "/GO_analysis/")
    # Creates missing parent folders too and tolerates an existing folder.
    os.makedirs(new_folder, exist_ok=True)
    # Strip the real extension instead of assuming it is five characters.
    stem = os.path.splitext(filename)[0]
    for letter, table in results.items():
        table.to_excel(os.path.join(new_folder, stem + "_GO_" + letter + ".xlsx"))
    print("Finished")
# Run the pipeline on the folder and workbook configured at the top of the
# file. NOTE(review): this executes on import as well, since there is no
# `if __name__ == "__main__":` guard.
main(path_to_folder=path, filename=file)