-
Notifications
You must be signed in to change notification settings - Fork 0
/
DNAtester
152 lines (120 loc) · 3.54 KB
/
DNAtester
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/perl
use warnings;
use strict;
use English;
use File::Temp qw/ tempdir /;
use Cwd;
# Augustus tester and annotation
#USAGE:
# perl perlAugustus.pl FASTA BED
### INPUT -------------------------------------------------------------
my $FASTA=$ARGV[0];
my $GFF=$ARGV[1];
my $dir = getcwd . "/";
my $AUGUSTUSbin = "augustus";
### Read in Genome
## read input sequence inside an HASH (geneID "\t" fasta)
# unzip if needed
if ($FASTA =~ /.gz$/) {
open(IN, "gunzip -c $FASTA |") or die "can't open $FASTA";
}
else {
open(IN, $FASTA) or die "can't open $FASTA";
}
## read input sequence inside an hash
my %seqs = ();
my $header = '';
while (my $line = <IN>){
chomp $line;
$line =~ s/^\s*$//;
if ($line =~ m/^>(.*)$/){
# remove sequence description from header
$header = (split ' ', $1)[0];
} else {
$seqs{"$header"} .= $line;
}
}
close (IN);
## Print out temp chromosome files
my $TEMPchromoFasta = tempdir( DIR => $dir, CLEANUP => 1 );
my $NewChomosomeName;
my $NewChromosomeSequence;
while ( ($NewChomosomeName, $NewChromosomeSequence) = each %seqs ) {
my $FILE = $TEMPchromoFasta . '/ChromosomeFile_' . $NewChomosomeName . ".fa" ;
open(FH, '>>', $FILE) or die $!;
print FH ">$NewChomosomeName\n$NewChromosomeSequence\n";
close(FH);
}
### Read in Bed (array) + sort
my @BEDgene;
my $OUTFILE;
my %AAsequence = ();
open(IN, $GFF) or die "can't open $GFF";
while(<IN>){
chomp;
push @BEDgene, $_;
}
close(IN);
@BEDgene = sort @BEDgene;
# Augustus
foreach (@BEDgene){
my @bedRow = (split ' ', $_, 4);
my $Chromosome = $bedRow[0];
my $start = $bedRow[1]-500;
my $end = $bedRow[2]+500;
# test if chromosome is available (aka annotation and fasta use same names)
if(!exists($seqs{$Chromosome})) {
print "ERROR: Make sure that chromosomes have the same name in annotation and fasta.\n";
exit;
}
# Select chromosome file
my $ChromoFile = $TEMPchromoFasta . '/ChromosomeFile_' . $Chromosome . ".fa";
# Prepare Output file name
my $TEMPaugustusOUT = tempdir( DIR => $dir, CLEANUP => 1 );
$OUTFILE = $TEMPaugustusOUT . '/Augustus_' . $Chromosome . '_' . $start ;
# Run Augustus
system "$AUGUSTUSbin --genemodel=complete --singlestrand=true --predictionStart=$start --predictionEnd=$end --gff3=on --UTR=off --species=arabidopsis $ChromoFile --outfile=$OUTFILE" ;
# Parse Output file
open(OUT, $OUTFILE) or die "can't open $OUTFILE";
while(<OUT>){
chomp;
# don't keep lines with comments
if ($_ !~ m/^#/){
# keep only gene, transcript and CDS
if ($_ =~ m/(gene|transcript|CDS)/){
my @gffRow = (split ' ', $_, 9);
# New Names
my $Description ;
my $NAME = $Chromosome . '_' . $start . '_' . $end;
$Description = $gffRow[8];
my $Replacement = $NAME . '_var';
$Description =~ s/g/$Replacement/ ;
# New gff line
my $NEWline = $gffRow[0] . "\tmwyler\t" . $gffRow[2] . "\t" . $gffRow[3] . "\t" . $gffRow[4] . "\t" . $gffRow[5] . "\t" . $gffRow[6] . "\t" . $gffRow[7] . "\t" . $Description;
print "$NEWline\n";
}
}
# get AA sequence
my $AAheader = $Chromosome . '_' . $start . '_' . $end;
if ($_ =~ m/^#/){
chomp ;
$AAsequence{"$AAheader"} .= $_;
}
}
close (OUT);
# Print Out Fasta with AA sequence
my $FASTAout = $GFF;
$FASTAout =~ s/\..*$/.fa/;
open(FH, '>', $FASTAout) or die $!;
while ( my ($k,$v) = each %AAsequence ) {
# test if any sequence is present
if ($v !~ m/(none)/){
$v =~ s/^.*\[//g;
$v =~ s/\].*//g;
$v =~ s/#//g;
$v =~ s/ //g;
print FH ">$k\n$v\n";
}
}
close(FH);
}