-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathblast2fasta.pl
executable file
·136 lines (131 loc) · 5.14 KB
/
blast2fasta.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/perl
use strict;
# Script to parse BLAST results as fasta file. Required: BLAST database as fasta file, tabular BLAST output (BLAST+: '-outfmt 6', BLAST: '-m 8').
# Output will be directed to STDOUT. Sequence names in output will contain information on BLAST search (query name, length of hit, % identity, evalue).
# The complete fasta and BLAST files will be read into memory, so this script will not work on extremely large files!
my $blastfile = $ARGV[0];
my $fastafile = $ARGV[1];
# define input files
if(!defined $ARGV[0] && !defined $ARGV[1]){
print "This script will parse BLAST hits from a fasta file.\nUSAGE:\tperl blast2fasta_1.1.pl [BLAST output] [fasta file]\n";
}
# Check if input files were provided, else show usage information
open (BLAST , '<' , $blastfile) or die "\n>FILE MISSING<\n\nplease specify path to tabular BLAST output!\n\n";
# open BLAST file or die with error message
my @querynames;
my @blastline;
my @seqnames;
my @startseq;
my @endseq;
my @evalue;
my @length;
my @identity;
my @reversenumbers;
my $linecounter=-1;
# define variables
while (my $line = <BLAST>) {
# read through BLAST output line by line
chomp $line;
$linecounter++;
@blastline=undef;
@blastline = split /\t/, $line;
# split at tabs, store each of the columns in array
if(scalar(@blastline)!=12){
die "BADLY FORMATTED BLAST FILE!\nUse '-outfmt 6' option in BLAST+!\n";}
# check if BLAST output contains 12 columns- if not, print error message and die
else{
push(@querynames,$blastline[0]);
push(@seqnames,$blastline[1]);
push(@evalue,$blastline[10]);
push(@length,$blastline[3]);
push(@identity,$blastline[2]);
# store query names, subject sequence names, evalue of hits, length of hits and % identity in separate arrays
if($blastline[8]<=$blastline[9]){
push(@startseq,$blastline[8]);
push(@endseq,$blastline[9]);
if($blastline[7]<=$blastline[6]){
push(@reversenumbers, $linecounter);
}
}
else{
push(@startseq,$blastline[9]);
push(@endseq,$blastline[8]);
if($blastline[6]<=$blastline[7]){
push(@reversenumbers, $linecounter);
}
}
# finally, store start and end postion of hit in separate arrays
# if hit is on minus strand, revert start and end!
}
}
close(BLAST);
open (FASTA , '<' , $fastafile) or die "\n>FILE MISSING<\n\nplease specify path to fasta file!\n\n";
# open FASTA file or die with error message
my $linecount = -1;
my @names;
my @seqs;
my $sequence;
# define variables
while (my $line = <FASTA>) {
chomp $line;
# chomp through fasta file line by line
if ($line=~ /\>/g){
$linecount++;
$names[$linecount]=$line;
$sequence=undef;
# search for > with regexpr, if present, write name into array
}
else{
$seqs[$linecount] = $sequence.= uc($line);
# other lines = sequences, write into another array
}
}
close(FASTA);
# Subroutine for reverse complementing a sequence
sub revcom {
my($dna) = @_;
# First reverse the sequence
my $revcom = reverse $dna;
# Next, complement the sequence, dealing with upper and lower case
# A->T, T->A, C->G, G->C
$revcom =~ tr/ACGTacgt/TGCAtgca/;
return $revcom;
}
# now, use BLAST arrays and sequence arrays to write out the desired parts of the fasta file:
my $seqnamecounter=-1;
my $namecounter;
my $substring;
my $revstring;
# define variables
foreach my $seqname(@seqnames){
# do for each of the subject sequences from the BLAST output:
$seqnamecounter++;
# count each loop
$namecounter=-1;
# reset counter for next loop
foreach my $name(@names){
# do for each sequence of original fasta file:
$name=~s/>//;
# get rid of '>'
$namecounter++;
# count each loop
if($name eq $seqname){
if ( grep( /^$seqnamecounter$/, @reversenumbers ) ) {
# if one of the sequence names corresponds to the subject sequence in the BLAST output:
$substring = substr($seqs[$namecounter], $startseq[$seqnamecounter]-1, $endseq[$seqnamecounter]-$startseq[$seqnamecounter]+1);
$revstring = revcom($substring);
# use a substring of the corresponding fasta sequence
# structure: [which sequence], [offset (=stored start position)], [length (=end position minus start position)]
print ">".$seqname."_query:".$querynames[$seqnamecounter]."_length:".$length[$seqnamecounter]."_pid:".$identity[$seqnamecounter]."_evalue:".$evalue[$seqnamecounter],"\n$revstring\n"
# finally, print name, information on BLAST search and sequence to STDOUT
}
else{# if one of the sequence names corresponds to the subject sequence in the BLAST output:
$substring = substr($seqs[$namecounter], $startseq[$seqnamecounter]-1, $endseq[$seqnamecounter]-$startseq[$seqnamecounter]+1);
# use a substring of the corresponding fasta sequence
# structure: [which sequence], [offset (=stored start position)], [length (=end position minus start position)]
print ">".$seqname."_query:".$querynames[$seqnamecounter]."_length:".$length[$seqnamecounter]."_pid:".$identity[$seqnamecounter]."_evalue:".$evalue[$seqnamecounter],"\n$substring\n"
# finally, print name, information on BLAST search and sequence to STDOUT
}
}
}
}