-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimple_exonerate2gff3.pl
183 lines (157 loc) · 5.27 KB
/
simple_exonerate2gff3.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/perl
#
# Convert exonerate output (with gff enabled) to gff3 format
#
# usage: simple_exonerate2gff3.pl [options] file1 file2 file3 > outfile.gff3
#
# options:
# -p Parent feature
# Defaults to 'gene'
# e.g. 'similarity'
# -r Regex for ID capture
# Defaults to 'Query (.*?) ;'
# e.g. '\tgene\t.*sequence (.*?) ;'
# -P Process RYO entry? boolean (Have to manually edit this program to process it how you wish)
# no error handling on this part - user discretion is advised
# -R RYO string for capture - instead of ending after GFF dump, pickup until specified string
# e.g. '^---END RYO---'
# no error handling on this part - user discretion is advised
#
# 6-30-2016 Travis Saari
use warnings;
use strict;
use Getopt::Std;
my %options = ();
getopts('p:r:R:P', \%options);
my $entry_count = 0; #Initialize count of GFF entries
my @entrylines; #Initialize array for entry lines
my $startCapture = '^# --- START OF GFF DUMP ---';
my $endCapture = '^# --- END OF GFF DUMP ---';
if ($options{'R'}) {
$endCapture = $options{'R'};
}
while(<>) {
#Use flip-flop operator to push lines between & including the regular expressions
if (my $rc = /$startCapture/../$endCapture/) {
push @entrylines, $_;
if ($rc =~ /E\d+/) {
#flip-flop operator trick. Do the following after last entryline
$entry_count++;
my $gff3_entry = process_entry(@entrylines);
print "$gff3_entry\n###\n" if $gff3_entry;
@entrylines=(); #Clear array for next entry
}
}
}
################################
# Subroutines
################################
sub process_entry {
my @lines = @_;
my @other_lines; #Initialize array for other lines (e.g. RYO)
my @lines_new; #Initialize array for processed lines
#Set up for extracting ID by regex
my $joined_lines = join('', @lines);
my $Regex_ID = 'Query (.*?) ;' ; # Default regex
if ( $options{'r'} ) {
$Regex_ID = $options{'r'}; # User-def regex
}
#Extract ID from joined lines using regex
my $ID;
$joined_lines =~ m/$Regex_ID/ ;
if (not (defined $1)) {
print STDERR "Regex string failed to match an ID in entry $entry_count. Defaulting to \'entry_$entry_count\'\n";
$ID = "entry_$entry_count";
} else {
$ID = $1; #Captured ID
}
#Set up for determining parent-child relationship
my $parent = 'gene'; # Default parent
if ( $options{'p'} ) {
$parent = $options{'p'}; # User-def parent
}
#Convert GFF lines
my $parent_found = 0; #Keep track of parent features found in gff lines
#Keep track of GFF line number where parent feature is found
my $current_gff_line = 0;
my $parent_line_no;
foreach my $line (@lines) {
my @attr = split("\t", $line);
if ($line !~ /^#/ && scalar(@attr) == 9) { #Only GFF lines
my $prepend; # Initialize variable for if/else result
if ($attr[2] eq $parent) {
$parent_found++; #Parent found
$parent_line_no = $current_gff_line;
$prepend = "ID=" . $ID . ";";
} else {
$prepend = "Parent=" . $ID . ";";
}
#Modify 9th column, then push result to @lines_new
$attr[8] = process_gff_tags($prepend, $attr[8]);
push(@lines_new, join("\t", @attr));
$current_gff_line++;
}
else {
push(@other_lines, $line);
}
}
#If boolean flag to process RYO
if ($options{'P'}) {
my $newTags = process_RYO_to_tags(@other_lines);
#append new tags to parent line
$lines_new[$parent_line_no] .= ";$newTags";
}
#If exactly one parent feature found in entry, return processed entry
if ($parent_found == 1) {
my $processed_entry = join("\n", @lines_new);
return $processed_entry;
} else { #Return empty otherwise
print STDERR "Parent feature \'$parent\' found $parent_found times in entry: $ID. Skipping...\n";
return;
}
}
# Trim whitespace, use equal sign for tag-value pairing,
# and prepend something onto the tag string (e.g. ID=Foo;)
sub process_gff_tags {
my ($prepend, $tag_str) = @_;
my @tags_orig = split(";", $tag_str);
my @tags_new = ();
foreach my $tag (@tags_orig) {
$tag = trim($tag);
$tag =~ s/ /=/;
push(@tags_new, $tag);
}
$tag_str = join(';', @tags_new);
$tag_str = $prepend . $tag_str;
return $tag_str;
}
# This is specific to my RYO string and my desired use case
# It is kind of hack-y
# Edit this subroutine to change the output
sub process_RYO_to_tags {
my @allLines = @_;
my @RYOlines; #Initialize variable for RYOlines
foreach(@allLines) {
push (@RYOlines, $_) if $_ =~ /^>/;
}
my $subjectLine = $RYOlines[0];
my @attr = split(' ', $subjectLine);
my @keepThese = splice(@attr,1,2);
my $len = $keepThese[0];
$len =~ s/.*?=//;
my $alnlen = $keepThese[1];
$alnlen =~ s/.*?=//;
my $pct_aln = $alnlen/$len*100;
$pct_aln = sprintf("%.1f", $pct_aln);
substr($pct_aln, 0, 0) = 'aln_pct='; #prepend 'aln-pct' to value
$pct_aln .= ';'; #append semicolon
push(@keepThese, $pct_aln);
my $newTags = join(';', @keepThese);
return $newTags;
}
# Simple regex trim whitespace
sub trim {
my $s = shift;
$s =~ s/^\s+|\s+$//g;
return $s;
}