-
Notifications
You must be signed in to change notification settings - Fork 0
/
reformat_stringPPIdf.py
58 lines (46 loc) · 1.8 KB
/
reformat_stringPPIdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from pathlib import Path

import pandas as pd
# Folder that holds the input workbook and receives every exported workbook.
# A trailing slash is optional: file names are joined with pathlib below.
path = 'path/to/folder'


def label_uniqueness(all_df):
    """Return a copy of *all_df* with an added "uniqueness" column.

    A row is labelled "both" when its "stringId_B" value occurs more than
    once anywhere in the table; otherwise it is labelled with the row's own
    "preferredName_A" value.  The input frame is not modified.

    Args:
        all_df: DataFrame with at least the columns "stringId_B" and
            "preferredName_A".

    Returns:
        A new DataFrame with "uniqueness" appended as the last column.
    """
    # keep=False flags every member of a duplicate group, not just repeats.
    shared = all_df.duplicated(subset="stringId_B", keep=False)
    labelled = all_df.copy()
    # Vectorised and index-aligned, so it is correct for any row index
    # (not only the default 0..n-1 one).
    labelled["uniqueness"] = all_df["preferredName_A"].where(~shared, "both")
    return labelled


def split_by_uniqueness(all_df_uniq):
    """Split a labelled table into (ARRB1-only, ARRB2-only, both) frames.

    Args:
        all_df_uniq: DataFrame carrying a "uniqueness" column as produced by
            ``label_uniqueness``.

    Returns:
        A 3-tuple of DataFrames holding the rows whose "uniqueness" equals
        "ARRB1", "ARRB2" and "both", in that order.
    """
    labels = all_df_uniq["uniqueness"]
    return (
        all_df_uniq[labels == "ARRB1"],
        all_df_uniq[labels == "ARRB2"],
        all_df_uniq[labels == "both"],
    )


def main():
    """Read the interactor workbook, label and split it, export, print stats."""
    folder = Path(path)

    all_df = pd.read_excel(folder / "interactors_stringDB.xlsx")
    # The first column is renamed to "index" whatever its header was
    # (presumably a saved row index -- confirm against the input file).
    all_df.rename(columns={all_df.columns[0]: "index"}, inplace=True)

    print(all_df.columns)
    print(all_df.shape)

    # Identify interactors that appear more than once and label every row.
    all_df_uniq = label_uniqueness(all_df)
    arrb1_only, arrb2_only, both_arrb = split_by_uniqueness(all_df_uniq)

    # Export each table exactly once.
    all_df_uniq.to_excel(folder / "all_interactors.xlsx")
    arrb1_only.to_excel(folder / "ARRB1_interactors.xlsx")
    arrb2_only.to_excel(folder / "ARRB2_interactors.xlsx")
    both_arrb.to_excel(folder / "interactors_both_ARRB.xlsx")

    # Print dataset stats.
    print("-----------------------")
    print("barr1 only interactors")
    print(len(arrb1_only))
    print("barr2 only interactors")
    print(len(arrb2_only))
    print("interactors of both barr")
    print(len(both_arrb))
    print("-----------------------")
    print("barr1 interactors")
    print(len(all_df[all_df["preferredName_A"] == "ARRB1"]))
    print("barr2 interactors")
    print(len(all_df[all_df["preferredName_A"] == "ARRB2"]))


if __name__ == "__main__":
    main()