-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_peaks.py
executable file
·132 lines (116 loc) · 4.68 KB
/
filter_peaks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python
"""
Filter peaks by score value using config file.
Usage:
filter_peaks.py -d <input_directory> -c <config_file> -o <output_directory>
Options:
-h --help Show this screen.
--version Show version.
-d <input_directory> Directory with BED files containing peaks for all marks to be processed.
-c <config_file> Text file with chosen calls and score thresholds.
-o <output_directory> Output directory name.
"""
import sys
print
modules = ["docopt", "os"]
exit_flag = False
for module in modules:
try:
__import__(module)
except ImportError:
exit_flag = True
sys.stderr.write("Error: Python module " + module + " is not installed.\n")
if exit_flag:
sys.stderr.write("You can install these modules with a command: pip install <module>\n")
sys.stderr.write("(Administrator privileges may be required.)\n")
sys.exit(1)
from docopt import docopt
from os.path import basename
from os.path import splitext
from os.path import join
from os.path import exists
from os.path import isfile
from os.path import isdir
from os import makedirs
from os import listdir
from os import system
from os import devnull
from os import getcwd
from sys import stdout
def filter_peaks(bed_file_full, output_directory, score_thr, caller_name):
if caller_name == 'SICER':
bed_file_full_filtered = (splitext(bed_file_full))[0] + (splitext(bed_file_full))[1] + \
'_greater' + str(int(score_thr)) + '.bed'
else:
bed_file_full_filtered = (splitext(bed_file_full))[0] + '_greater' + \
str(int(score_thr)) + '.bed'
with open(bed_file_full_filtered, 'w') as dst, open(bed_file_full, 'r') as src:
for line in src:
fields_list = line.rstrip('\n').split()
if caller_name == 'SICER':
score_str = fields_list[3]
else:
score_str = fields_list[4]
score = float(score_str)
if score > score_thr:
dst.write(line)
if __name__ == '__main__':
arguments = docopt(__doc__, version='filter_peaks 0.1')
input_directory = arguments["-d"].rstrip('/')
if not exists(input_directory):
print "Error: Can't find input directory: no such directory '" + \
input_directory + "'. Exit.\n"
sys.exit(1)
if not isdir(input_directory):
print "Error: Input directory must be a directory:). Something else given. Exit.\n"
sys.exit(1)
config_file = arguments["-c"]
if not exists(config_file):
print "Error: Can't find config file: no such file '" + \
config_file + "'. Exit.\n"
sys.exit(1)
if not isfile(config_file):
print "Error: Config file must be a regular file. Something else given. Exit.\n"
sys.exit(1)
output_directory = arguments["-o"].rstrip('/')
if exists(output_directory) and not isdir(output_directory):
print "Error: Output directory must be a directory:). Something else given. Exit.\n"
sys.exit(1)
if not exists(output_directory):
makedirs(output_directory)
with open(config_file, 'r') as src:
for line in src:
field_list = line.rstrip('\n').split()
mark_name = field_list[0]
caller_name = field_list[1]
if caller_name == 'SICER':
gap_size = int(field_list[2])
score_thr_str = field_list[3]
elif caller_name == 'MACS':
score_thr_str = field_list[2]
else:
print "Error: Undefined caller name found in the config file: " + caller_name + \
". It must be SICER or MACS. Exit.\n"
sys.exit(1)
if score_thr_str != '-':
score_thr = float(score_thr_str)
else:
score_thr = 0
for file in listdir(input_directory):
if mark_name in file:
if caller_name == 'SICER':
if 'scoreisland' in file and 'G' + str(gap_size) in file:
bed_file_full = join(input_directory, file)
break
else:
if 'MACS' in file:
bed_file_full = join(input_directory, file)
break
print
if score_thr == 0:
print 'Pass ' + bed_file_full
stdout.flush()
continue
print 'Filter ' + bed_file_full + ' by score >= ' + str(score_thr) + ' ...'
stdout.flush()
filter_peaks(bed_file_full, output_directory, score_thr, caller_name)