-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclusters_histogram.py
51 lines (36 loc) · 1.27 KB
/
clusters_histogram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import argparse
import matplotlib.pyplot as plt
def parse_args():
    """Parse the command-line arguments of the histogram script.

    Arguments are read from ``sys.argv`` by argparse. ``-i/--input`` is
    mandatory; when it is missing argparse prints a usage message and exits.

    Returns:
        argparse.Namespace: namespace whose ``input`` attribute holds the
        path passed with ``-i/--input``.
    """
    parser = argparse.ArgumentParser(
        description="Histogram of sequences distribution in the clusters"
    )
    # The option is required, so no default is declared: argparse would
    # never fall back to one.
    parser.add_argument(
        "-i",
        "--input",
        required=True,
        help="path to .clstr file from CD-HIT output",
    )
    return parser.parse_args()
def count_lines_between_clusters(file_path):
    """Count the member lines of every cluster in a ``.clstr`` file.

    A line starting with ``>Cluster`` opens a new cluster; every following
    non-blank line up to the next header is counted as one hit of that
    cluster.

    Args:
        file_path: path to the ``.clstr`` file to read.

    Returns:
        list[int]: one hit count per cluster, in file order. An empty list
        is returned when the file contains no ``>Cluster`` header.

    Raises:
        OSError: if the file cannot be opened.
    """
    counts = []
    # ``None`` means "no cluster opened yet", so an empty file (or junk
    # before the first header) does not produce a phantom cluster.
    current = None
    with open(file_path) as handle:
        # Stream line by line instead of loading the whole file in memory.
        for line in handle:
            if line.startswith(">Cluster"):
                if current is not None:
                    counts.append(current)
                current = 0
            elif current is not None and line.strip():
                # Blank lines are not members and must not inflate the count.
                current += 1
    # Flush the last cluster, which has no following header to close it.
    if current is not None:
        counts.append(current)
    return counts
def plot_clusters_histogram(file_path):
    """Display a histogram of the number of hits per cluster.

    The per-cluster counts come from ``count_lines_between_clusters``; they
    are binned into 100 bins and drawn with a logarithmic count axis, then
    the figure is shown with ``plt.show()``.

    Args:
        file_path: path to the ``.clstr`` file whose clusters are plotted.
    """
    cluster_sizes = count_lines_between_clusters(file_path)

    # Appearance of the histogram, gathered in one place.
    hist_options = {
        'bins': 100,
        'log': True,
        'color': 'deepskyblue',
        'edgecolor': 'black',
    }
    plt.hist(cluster_sizes, **hist_options)

    # Axis labels and title.
    plt.xlabel('Number of hits')
    plt.ylabel('Number of clusters')
    plt.title('Distribution of hits in a clusters')

    plt.show()
def main():
    """Script entry point: parse the CLI arguments and plot the histogram."""
    input_path = parse_args().input
    plot_clusters_histogram(input_path)


# Run the script only when executed directly, not when imported.
if __name__ == '__main__':
    main()