forked from mdahasan/mClass---Multiple-cancer-classification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3_gene_clustering.py
149 lines (112 loc) · 4.28 KB
/
3_gene_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#
# When computing U, the number of comparisons equals the product of the number of values in group A times the number of values in group B.
# If the null hypothesis is true, then the value of U should be about half that value. If the value of U is much smaller than that, the P
# value will be small. The smallest possible value of U is zero. The largest possible value is half the product of the number of values in
# group A times the number of values in group B.
#
import numpy as np
import pandas as pd
import sys
import operator
from sklearn.metrics import pairwise
# import matplotlib.pyplot as plt
from scipy import stats
check_dict = dict()
def find_similar_genes(sim_list):
threshold = 0.55
similar_genes = list()
for i in range(len(sim_list)):
if(sim_list[i] > threshold):
similar_genes.append(i)
check_dict[i] = True
return similar_genes
def similarity_measure_form_cluster(X, gene_name_list):
row, col = X.shape
gene_cluster_dict = dict()
cluster_index = 0
# THIS IS HOW TO COMPUTE COSINE SIMILARIRY!!!
sim_matrix = pairwise.cosine_similarity(X.T)
x, y = sim_matrix.shape
gene_similairty_cluster = dict()
for i in range(x):
similar_genes = list()
similar_genes.append(i)
check_dict[i] = True
sim_list = sim_matrix[:, i]
similar_genes.extend(find_similar_genes(sim_list))
gene_similairty_cluster[i] = similar_genes
cluster_index = 1
for key in gene_similairty_cluster:
fout = open('Cluster_' + str(cluster_index) + '.clstr', 'w')
gene_list = gene_similairty_cluster[key]
for i in range(len(gene_list)):
fout.write(str(gene_name_list[gene_list[i]]) + '\n')
print('Building cluster: ' + str(cluster_index))
cluster_index += 1
fout.close()
def pairwise_ttest(X):
row, col = X.shape
print(row, col)
gene_cluster_dict = dict()
cluster_index = 0
while(row > 1):
gene_name_list = X.columns.values
i = 0
gene1 = X[gene_name_list[i]]
gene_list = list()
gene_list.append(gene_name_list[i])
for j in range(i + 1, row):
gene2 = X[gene_name_list[j]]
(t, p) = stats.ttest_ind(gene1, gene2, equal_var = False)
if(t < 0 and p < 0.05):
gene_list.append(gene_name_list[j])
gene_cluster_dict[cluster_index] = gene_list
cluster_index += 1
print(gene_list)
X.drop(gene_list, axis = 1, inplace = True)
row, col = X.shape
print(row, col)
for key in gene_cluster_dict:
print(key, gene_cluster_dict[key])
for key in gene_cluster_dict:
print(key, len(gene_cluster_dict[key]))
def gene_filter_modifying_dataset(X):
samples, genes = X.shape
gene_name_list = X.columns.values
np_X = X.to_numpy()
gene_snps_sum = np_X.sum(axis = 0)
gene_snp_dict = dict()
for i in range(len(gene_name_list)):
gene_snp_dict[gene_name_list[i]] = gene_snps_sum[i]
sorted_gene_snp_list_pair = sorted(gene_snp_dict.items(), key = operator.itemgetter(1), reverse = True)
# setting threshold to cutoff genes without minimum number of snps across samples (1% of the samples has snps)
snp_count_threshold = int((samples * 1)/ 100)
# write the gene snp frequency (FOR ALL SAMPLES!!!)
fout = open('gene_snp_frequency_down.txt', 'w')
qualified_gene_list = list()
for i in range(len(sorted_gene_snp_list_pair)):
if(sorted_gene_snp_list_pair[i][1] >= snp_count_threshold):
qualified_gene_list.append(sorted_gene_snp_list_pair[i][0])
fout.write(str(sorted_gene_snp_list_pair[i][0]) + '\t' + str(sorted_gene_snp_list_pair[i][1]) + '\n')
fout.close()
# rebuild the dataframe with only the qualified genes with enough snps across samples
qualified_gene_df = pd.DataFrame()
for i in range(len(qualified_gene_list)):
qualified_gene_df[qualified_gene_list[i]] = X[qualified_gene_list[i]].values
return qualified_gene_df
def main():
data_file = sys.argv[1] # read the all_data.txt
X = pd.read_table(data_file, sep = '\t', header = 'infer')
y = X['Cancer_type']
X.drop('Cancer_type', axis = 1, inplace = True)
# filter genes with enough number of snps across samples
print('Filtering genes... ')
filtered_df = gene_filter_modifying_dataset(X)
# run cosine similarity test to cluster genes
gene_name_list = filtered_df.columns.values
# for i in range(len(gene_name_list)):
# print(gene_name_list[i])
print('Pairwise similarity measurement between genes... ')
similarity_measure_form_cluster(filtered_df, gene_name_list)
if __name__ == "__main__":
main()