-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathncrf_sort.py
executable file
·153 lines (125 loc) · 4.18 KB
/
ncrf_sort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python
"""
Sort the alignments output by Noise Cancelling Repeat Finder.
"""
from sys import argv,stdin,stdout,stderr,exit
from os import path as os_path
from ncrf_parse import alignments
def usage(s=None):
    """Exit the program, reporting the command-line synopsis.

    The synopsis (optionally preceded by the error message *s* on its own
    line) is passed to exit(), which raises SystemExit with that text.

    s: optional message describing what was wrong with the command line;
       when None, only the synopsis is reported.
    """
    message = """
usage: ncrf_cat <output_from_NCRF> | ncrf_sort [options]
--sortby=mratio[-|+] sort by decreasing or increasing match ratio
(by default we sort by decreasing match ratio)
--sortby=score[-|+] sort by decreasing or increasing alignment score
--sortby=match[-|+] sort by decreasing or increasing alignment match
count
--sortby=length[-|+] sort by decreasing or increasing length; length is
the number of sequence bases aligned
--sortby=name sort by sequence name (and position)
--sortby=position[-|+] sort by sequence name (and decreasing or increasing
position)"""

    # identity test: None is a singleton, and "==" could be fooled by an
    # argument that overrides equality
    if s is None:
        exit(message)
    exit("%s\n%s" % (s, message))
def _comparable_name(name):
    """Return the particles of a sequence name in a form that always sorts.

    name_particle() yields a mix of ints and strings; two names can differ by
    having a number where the other has text, and such a pair cannot be
    ordered on Python 3.  Each particle is tagged so that numbers sort before
    text (the order Python 2 gives for the untagged particles).
    """
    return tuple((1, p) if isinstance(p, str) else (0, p)
                 for p in name_particle(name))


def main():
    """Read alignments from stdin, sort them, and write them to stdout.

    The sort criterion comes from a --sortby=<criterion> option on the command
    line (default: decreasing match ratio).  Alignments that tie on the
    criterion are written in the order they were read.  Unless one of the
    unadvertised --noendmark/--noeof/--nomark options is given, the input is
    read with the end-of-file requirement enabled and an
    "# ncrf end-of-file" line is written after the last alignment.
    """

    # parse the command line

    sortAliases = {
        "mratio":    "mratio-",   "mratio-":   "mratio-",   "mratio+":   "mratio+",
        "score":     "score-",    "score-":    "score-",    "score+":    "score+",
        "match":     "match-",    "match-":    "match-",    "match+":    "match+",
        "length":    "length-",   "length-":   "length-",   "length+":   "length+",
        "name":      "name,pos+", "name+":     "name,pos+",
        "position":  "name,pos+", "position+": "name,pos+",
        "pos":       "name,pos+", "pos+":      "name,pos+",
        "position-": "name,pos-", "pos-":      "name,pos-",
    }
    sortPrefix = "--sortby="

    sortBy = "mratio-"
    requireEof = True

    for arg in argv[1:]:
        if arg.startswith(sortPrefix) and (arg[len(sortPrefix):] in sortAliases):
            sortBy = sortAliases[arg[len(sortPrefix):]]
        elif arg in ("--noendmark", "--noeof", "--nomark"):  # (unadvertised)
            requireEof = False
        else:
            usage("unrecognized option: %s" % arg)

    # choose the function that maps an alignment to its sort key;  "decreasing"
    # orders are obtained by negating the numeric value

    keyFuncs = {
        "mratio-":   lambda a: -a.mRatio,
        "mratio+":   lambda a: a.mRatio,
        "score-":    lambda a: -a.score,
        "score+":    lambda a: a.score,
        "match-":    lambda a: -a.nMatch,
        "match+":    lambda a: a.nMatch,
        "length-":   lambda a: -a.seqBaseCount,
        "length+":   lambda a: a.seqBaseCount,
        "name,pos+": lambda a: (_comparable_name(a.seqName), a.start, a.end),
        "name,pos-": lambda a: (_comparable_name(a.seqName), -a.start, -a.end),
    }

    if sortBy not in keyFuncs:
        exit("%s: internal error: unknown key \"%s\""
             % (os_path.basename(argv[0]), sortBy))

    # collect the alignments, and sort them;  sorting on the key alone (rather
    # than on (key,alignment) pairs) means alignment objects are never compared
    # with each other, and, the sort being stable, ties keep their input order

    alignmentList = list(alignments(stdin, requireEof))
    alignmentList.sort(key=keyFuncs[sortBy])

    # print them, separated by blank lines;  single-argument print(...) is
    # used because it writes the same text under Python 2 and Python 3

    for (ix, a) in enumerate(alignmentList):
        if ix > 0:
            print("")
        print("\n".join(a.lines))

    if requireEof:
        print("# ncrf end-of-file")
# name_particle--
#	Break a sequence name into a tuple of "particles", each a maximal run of
#	characters of one kind:  ASCII digits, ASCII letters, or anything else.
#	Runs of digits are converted to integers, so that names with numeric
#	parts sort by numeric value rather than character by character.
#
# For example, "SRR2036394.36267" is returned as ("SRR",2036394,".",36267).

digits = "0123456789"
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
letters += letters.lower()


def name_particle(s):
    # gather maximal runs of same-kind characters, as (kind,chars) pairs

    runs = []
    for ch in s:
        if ch in digits:
            kind = "number"
        elif ch in letters:
            kind = "letters"
        else:
            kind = "other"

        if runs and (runs[-1][0] == kind):
            runs[-1][1].append(ch)
        else:
            runs.append((kind, [ch]))

    # convert each run to its particle;  digit runs become ints

    converted = []
    for (kind, chars) in runs:
        text = "".join(chars)
        converted.append(int(text) if kind == "number" else text)

    return tuple(converted)
# run main() only when this file is executed as a script, not when imported
if __name__ == "__main__": main()