-
Notifications
You must be signed in to change notification settings - Fork 3
/
assign_complex_names_from_corum.py
90 lines (73 loc) · 2.95 KB
/
assign_complex_names_from_corum.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 10 11:42:23 2021
@author: Meghana
"""
import pickle
import re
def jaccard_coeff(set1, set2):
ls1 = len(set1)
ls2 = len(set2)
if ls1 == 0 and ls2 == 0:
return 1
inter = len(set1.intersection(set2))
return float(inter) / (ls1 + ls2 - inter)
import pandas as pd
df = pd.read_excel('./humap/CORUM_latest/corum_2019.xlsx')
# some names are different in df['subunits(Gene name)'], so get from converted names instead
with open('./humap/corum_namesoriginal.txt') as f:
orig_corum_complexes_gene_names_converted = f.readlines()
comp2name = list(zip(df['ComplexName'][:2916],orig_corum_complexes_gene_names_converted))
comp2name = dict([(frozenset(elt_set[1].rstrip().split(' ')),elt_set[0]) for elt_set in comp2name])
results_folder = "./humap/results_73_neg_unif_10xisa_e0.01_T01.75_a0.005_qi_o0.375"
results_file = results_folder + "/res_pred_names.out"
with open(results_file) as f:
complexes = [frozenset(line.rstrip().split()[:-1]) for line in f.readlines()]
complexes_with_names = dict()
name_list = []
comp_num2name = dict()
perfect_matches_num = 0
name_duplicate_nums = dict()
for i,comp in enumerate(complexes):
max_jc = 0
closest_match = None
name = None
# get closest match
for corum_comp in comp2name:
jc = jaccard_coeff(comp, corum_comp)
if jc > max_jc:
max_jc = jc
closest_match = corum_comp
if closest_match:
if float("{:.1f}".format(max_jc)) == 1.0:
name = comp2name[closest_match]
perfect_matches_num += 1
else:
if float("{:.1f}".format(max_jc)) == 0.0:
name = "{:.2f}".format(0.05) + ' ' + comp2name[closest_match]
else:
name = "{:.1f}".format(max_jc) + ' ' + comp2name[closest_match]
if not name:
name = ' '.join(comp)
if name+'\n' in name_list: # Ensuring unique names
if name not in name_duplicate_nums:
name_duplicate_nums[name] = 1
else:
name_duplicate_nums[name] += 1
# get last number in list
num = name_duplicate_nums[name]+1
if num == '':
num=1
name = name + ' match '+ str(num)
complexes_with_names[comp] = name
comp_num2name['Complex'+str(i+1)]=name
name_list.append(name+'\n')
with open(results_folder+'/res_pred_complex_num2name.pkl','wb') as f:
pickle.dump(comp_num2name,f)
with open(results_folder+'/res_pred_complex_set2name.pkl','wb') as f:
pickle.dump(complexes_with_names,f)
with open(results_folder+'/res_metrics.out','a') as f:
print('No. of perfectly recalled original CORUM matches = ',perfect_matches_num,file = f) # 10
print('No. of duplicate complexes (match with corum)= ',sum(name_duplicate_nums.values()),file = f) # 10
with open(results_folder+'/res_pred_complex_names.txt','w') as f:
f.writelines(name_list)