-
Notifications
You must be signed in to change notification settings - Fork 3
/
NEXT-RNAi.pl
9192 lines (8603 loc) · 336 KB
/
NEXT-RNAi.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl -w
#####################################
#####################################
#####################################
### ###
### NEXT-RNAi program description ###
### ###
#####################################
#####################################
#####################################
=head1 NAME
NEXT-RNAi Version 1.41 (10/19/2011) - Designing and evaluating genome-wide libraries for RNAi screens
=head1 DESCRIPTION
NEXT-RNAi is a software for the design and evaluation of genome-wide RNAi libraries and performs all steps
from the prediction of specific and efficient RNAi target sites to the visualization of designed reagents
in their genomic context. The software enables the design and evaluation of siRNAs and long dsRNAs and was
implemented in an organism-independent manner allowing designs for all sequenced and annotated genomes.
Please visit http://www.nextrnai.org/ for complete documentation of NEXT-RNAi.
=head1 SYNOPSIS
perl nextrnai.pl -i <inputfile> [-s <split inputfile>] -r <reagent> -d <Bowtie database> -e <evaluation> [-o <optionsfile>] [-n <probe name>] [-h help] [-p interactive mode]
=head1 OPTIONS
=over 2
-i <inputfile>
=over 1
Inputfile containing target sequences (in FASTA format)
=back 1
-s <int>
=over 1
<int> number of features (FASTA sequences) from input file that are processed at once (optional, default=4000)
=back 1
-r <reagent>
=over 1
Reagent type (d = long dsRNA, s = short interfering RNA) designed or evaluated
=back 1
-d <Bowtie database/index>
=over 1
Location of Bowtie database/index file (pre-build with bowtie-build), multiple inputs are allowed (separated by '+')(optional, if set to 'nodb' NEXT-RNAi will run without 'off-target' evaluation)
=back 1
-e <evaluation>
=over 1
NO: de novo design of RNAi reagents
OLIGO: evaluation of primers for long dsRNAs (-r d) or siRNAs (-r s)
DSRNA: evaluation of long dsRNAs (-r d)
DSRNA+OLIGO: evaluation of long dsRNAs and underlying primers (-r d)
=back 1
-o <optionsfile>
=over 1
File containing further settings for RNAi reagent design/evaluation in a TAG=VALUE format (optional)
=back 1
-n <probe name>
=over 1
Name tag for files generated by NEXT-RNAi (optional, default=Probe)
=back 1
-h <help>
=over 1
Show help (optional)
=back 1
-p <interactive mode>
=over 1
Start interactive setting of NEXT-RNAi options (optional)
=head1 PARAMETERS FOR OPTIONS FILE
=over 2
=head2 Program locations (NEXT-RNAi dependencies)
=back 4
=over 3
PRIMER3
Set location of primer3_core script required for primer designs during the design and evaluation of long dsRNAs (default = /usr/bin/). Primer3 settings can be influenced in an additional options file (see PRIMER3OPT below).
BOWTIE
Set location of bowtie script required by NEXT-RNAi (default = /usr/bin/). Bowtie is used for mappings to determine the specificity of an RNAi reagent (against the database defined with -d) and for mappings to determine the location of an RNAi reagent in the genome. The mapping of RNAi reagents is a prerequisite for generation of GFF and AFF output files and for the calculation of FEATURE contents and requires the definition of a mapping database (GENOMEBOWTIE).
LOWCOMPEVAL
Set location of mdust program for the evaluation of low-complexity regions in the input sequences (default = disabled).
BLAT
Set location of blat program for mapping RNAi reagents to the genome (default = disabled). By default BOWTIE is used for mappings. However, if reagents were designed on CDS (SOURCE=CDS) Blat is required to allow for gapped alignments to the genome. BLAT mapping can be influenced by a set of further options (see GENOMEFASTA, BLATALIGN, BLATSPLIT, BLATPROGRAM, BLATHOST, BLATPORT below). The mapping of RNAi reagents is a prerequisite for generation of GFF and AFF output files and for the calculation of FEATURE contents.
HOMOLOGY
Set location of blastall program, the FASTA database used to determine homology (needs prior formatting to a FASTA database with formatdb command included in Blast package) and the e-value cutoff for homology (e.g. 1e-10). These three input parameters are separated by comma (default = disabled).
VIENNA
Set location of RNAfold.pl script that belongs to the Vienna RNA package (default = /usr/bin/). This program is required only for efficiency predictions using the RATIONAL method (see EFFICIENCY below).
=back 4
=over 2
=head2 Design settings
=back 4
=over 3
SIRNALENGTH
Set length [nt] of siRNAs used for off-target evaluation (default = 19).
DESIGNWINDOW
Set minimal and maximal length of desired RNAi reagents (default = '80,500') separated by comma.
DESIGNNUM
Set number of RNAi reagents to be designed for each identified specific region in the queried target sequences (default = 50).
OUTPUTNUM
Set number of RNAi reagents to be returned for each queried target sequence (default = 1).
PRIMER3OPT
Set location of file with options for PRIMER3 program in 'TAG=VALUE' format (visit Primer3 documentation for help), default settings are used otherwise (default = disabled).
PRIMERTAG
Set sequence to be added 5' to both, forward and reverse primer sequences (for the design of long dsRNAs), e.g. a T7- or SP6-tag for in vitro transcription (default = disabled).
EFFICIENCY
Set efficiency calculation method and efficiency cutoff score separated by comma (e.g. 'EFFICIENCY=SIR,50'). Available calculation methods are 'RATIONAL' for calculations according to Reynolds et al. (requires VIENNA software), or 'SIR' according to Shah et al. (default = 'SIR'). The efficiency cutoff defines the minimal required efficiency for a siRNA to be selected (only for de novo designs, -e NO). Further documentation about efficiency prediction is available here.
TARGETSEQ
If set to 'FULL' NEXT-RNAi is forced to use the complete input target sequences as design template, otherwise only calculated specific regions are considered (default = CALC for de novo designs, default = FULL for evaluations).
FEATURE
Set location of file containing feature location information. A tab-delimited file with headers 'FeatureName', 'FeatureLoc' (location in GENOMEBOWTIE / GENOMEFASTA database), 'FeatureStart' (start of feature in GENOMEBOWTIE / GENOMEFASTA) and 'FeatureEnd' (end of feature in GENOMEBOWTIE / GENOMEFASTA database). Requires mapping of reagents to GENOMEBOWTIE or GENOMEFASTA databases (default = disabled).
LOWCOMPEVAL
Set location of mdust program for the evaluation of low-complexity regions in the input sequences (default = disabled).
CANEVAL
Option for calculation of CA[ACGT] tandem trinucleotide repeats in target or reagent sequences. This option is enabled by setting the minimal number of CAN repeats (e.g. 6) to be detected (default = disabled).
SEEDMATCH
Calculation of seed matches from siRNA sense strand (starting at position 2) to a defined FASTA file OR a Bowtie database/index file (if a FASTA file was provided, NEXT-RNAi expects the bowtie-build script for building the bowtie index in the BOWTIE folder). This option requires setting the length of the seed region (between 6 and 8), the maximal seed complement frequency allowed (for filtering of target sequences) and the location of the FASTA file or Bowtie database/index (pre-build with bowtie-build) separated by comma (default = disabled).
MIRSEED
Calculation of (e.g. miRNA-) seeds within a long dsRNA or siRNA from a given FASTA file containing miRNA sequences. Requires length of seed region
(between 6 and 8, starting from position 2 in miRNA sense sequences) and location of FASTA file (separated by comma), siRNAs containing seeds will be excluded from designs (default = disabled).
POOL
Results for siRNA evaluations can be summarized for pools of sequences. This option requires setting of the location of a tab-delimited file containing the headers 'siRNAID' and 'POOLID' to define connections between query siRNA identifiers and corresponding siRNA-pool identifiers. This options is only available for the evaluation of siRNAs (default = disabled).
INDEPENDENT
Set location of FASTA file or Bowtie database/index containing sequences that should be avoided for independent reagent designs (file is appended to the off-target database) (default = disabled). In case a FASTA file was provided the 'bowtie-build' script is required in the location defined for BOWTIE (to build the Bowtie index).
INTRON
Percentage nt of a long dsRNA allowed to target intronic regions (default = 25).
RANKD
Long dsRNA designs are by default ranked for percent specificity in first place and number of contained siRNAs predicted to be efficient in second place. NEXT-RNAi can be forced to rank designs for the absolute number of specific siRNAs contained in the long dsRNAs in second place (RANKD = SPEC), which maximized the length of long dsRNA designs (default = EFF for efficiency ranking in second place).
TARGETTYPE
For the design of RNAi reagents against non-annotated genes this parameter should be set to 'NA' (Not Annotated). This affects the specificity ranking of designs in a way to avoid targeting any annotated gene (for designs against annotated genes (default) the ranking maximizes the specificity for the intended target gene).
REDESIGN
Define whether NEXT-RNAi is allowed to enter a (re-)design method (REDESIGN = ON) to enable the design of RNAi reagents for input sequences that do not meet the user-defined quality measures (specificity (SIRNALENGTH), EFFICIENCY, LOWCOMPEVAL, CANEVAL, SEEDMATCH and MIRSEED) (default = OFF).
OTEEVAL
For evaluation of designed RNAi reagents for 'off-target' effects in additional databases. This options requires the location of a Bowtie database/index; the siRNA length [nt] for mappings; whether off-target effects should be evaluated by positional information ('pos', database has to be the same as in GENOMEBOWTIE / GENOMEFASTA) or by target information ('target' uses targetgroups defined in TARGETGROUPS). Database, siRNA length and evaluation option are separated by comma. Multiple evaluations can be queried (default = disabled).
=back 4
=over 2
=head2 Mapping reagents to the 'off-target' (-d) database
=back 4
=over 3
TARGETGROUPS
Location of file defining which sequences in the database file (-d option) belong to one group (e.g. splice variants of a gene) (default = disabled). A tab-delimited file containing the headers 'Target' (e.g. transcripts) and 'TargetGroup' (e.g. the gene the transcript belongs to) is required. NEXT-RNAi will then consider e.g. siRNAs that target multiple transcripts of the same gene as specific for this gene. Multiple files containing targetgroups can be defined in the options file.
EXCLUDED
Location of file containing identifiers from the off-target database (-d option) that should be excluded as target sites, but not considered as real off-targets in case they were hit (e.g. UTR regions). A text file with the header 'Exclude' listing identifiers to be excluded is required. Multiple 'EXCLUDED' files can be queried (default = disabled).
INTENDED
Location of file containing sequence identifiers from the input file connected to their intended target (same as 'TargetGroup' identifier in TARGETGROUPS file) that forces NEXT-RNAi always to output this gene as the primary, intended target of the reagent. A tab-delimited file with the headers 'Query' and 'Intended' listing the identifiers is required. Multiple 'INTENDED' files can be queried (default = disabled).
=back 4
=over 2
=head2 Mapping reagents to the genome using Bowtie
=back 4
=over 3
GENOMEBOWTIE
Set location of mapping database/index for Bowtie. Bowtie needs mapping databases (indices) that were build with the bowtie-build script from FASTA files. The mapping of RNAi reagents is a prerequisite for generation of GFF and AFF output files and for the calculation of FEATURE contents.
=back 4
=over 2
=head2 Mapping reagents to the genome using blat or gfClient
=back 4
=over 3
SOURCE
Set type of source where target sequences were retrieved from ('GENOMIC' for genomic (unspliced) sources, 'CDS' for spliced sources). It affects the type of mapping: for 'CDS' sources BLAT is required, for 'GENOMIC' sources BOWTIE is used (default = GENOMIC)
BLATPROGRAM
Set either to 'blat' for local Blat alignments or to 'gfClient' for alignments using a running Blat server (default = blat). The 'blat' option requires setting of a FASTA database with the GENOMEFASTA option, the 'gfClient' option requires BLATHOST and BLATPORT settings to connect to the Blat server.
GENOMEFASTA
Set location of FASTA mapping database for Blat. The mapping of RNAi reagents is a prerequisite for generation of GFF and AFF output files and for the calculation of FEATURE contents.
BLATHOST
Name of server that runs the Blat server (gfServer), required to run Blat mappings using the BLATPROGRAM gfClient.
BLATPORT
Port to connect to a particular instance (database) of the Blat server defined in BLATHOST. Required to run Blat mappings using the BLATPROGRAM gfClient.
BLATSPLIT
Split parameter for a large FASTA database defined in GENOMEFASTA (using blat as BLATPROGRAM). The FASTA database will be splitted in parts only containing the defined number of sequences (default = 0, means no splitting).
BLATALIGN
If set to 'PERFECT', NEXT-RNAi only allows perfect matches during mapping of sequences to the genome with blat or gfClient. If set to 'PARTIAL', also partial mappings are evaluated (default = PERFECT).
TXNFASTA
Set location of off-target database (as used for -d option) in FASTA format. This option is required for gapped alignments using Blat (e.g. to map siRNAs spanning exon-exon boundaries). NEXT-RNAi can use the mapping information from the off-target database to extend the reagent's sequence and re-map it to the genome (default = disabled).
=back 4
=over 2
=head2 Output settings
=back 4
=over 3
OUTPUT
Set output folder for files created by NEXT-RNAi (default location is input file location)
GFF
Enables output of general feature format (GFF) file by choosing either 'GFF2' or 'GFF3' format and requires prior mapping of reagents (see BOWTIE and BLAT options) (default = disabled).
GBROWSEBASE
Set URL to a generic genome browser (GBrowse) instance for visualization of designed reagents in their genomic context (default = disabled). The URL needs to be a link to the 'gbrowse_img' script of the GBrowse instance, e.g. for accessing our Drosophila melanogaster genome browser use http://www.dkfz.de/signaling/cgi-bin/gbrowse_img/flybase/. The visualization requires prior mapping of reagents (see BOWTIE and BLAT options) and further tracks can be added by setting of the GBROWSETRACK option.
GBROWSETRACK
Set generic genome browser (GBrowse) tracks to be visualized with the designed RNAi reagents (default = disabled). Multiple tracks can be enabled by '+' concatenation (e.g. 'GENE+TXN' for showing genes and transcripts in our Drosophila melanogaster GBrowse, see GBROWSEBASE option). The visualization requires prior mapping of reagents (see BOWTIE and BLAT options) and setting of the GBROWSEBASE URL.
AFF
Set to 'YES' for generation of an annotations file that allows for the direct upload of design results to GBrowse (default = disabled). This requires prior mapping of reagents (see BOWTIE and BLAT options).
=back 2
=head1 AUTHOR
Thomas Horn ([email protected]) and Michael Boutros ([email protected])
=cut
use warnings;
use diagnostics;
use strict;
use Getopt::Long;
use Pod::Usage;
########################
########################
########################
### ###
### Global variables ###
### ###
########################
########################
########################
# store input/output/report file locations
our %fileLocs = ();
###################
###################
###################
### ###
### Subroutines ###
### ###
###################
###################
###################
##
## Newlines of various operation systems are removed
##
sub cleanLine {
my $line = $_[0];
$line =~s/\n//g;
$line =~s/\r//g;
$line =~s/\r\n//g;
return $line;
}
##
## Administrate queried options / parameters and check for valid input
##
sub options {
my ($option,$value,$options,$UserOptions,$error) = @_;
# valid option
if (exists $$options{$option}){
my @optionLen = scalar(@{ $$options{$option} });
# options were only single import is allowed
if ((scalar(@optionLen) eq 1) && (!exists $$UserOptions{$option})){
$$options{$option}[0] = $value;
$$UserOptions{$option} = 1;
}
else {
# options with multiple inputs allowed
if (($option eq "TARGETGROUPS") || ($option eq "EXCLUDED") || ($option eq "INTENDED") || ($option eq "GENOMEBOWTIE") || ($option eq "GENOMEFASTA") || ($option eq "OTEEVAL") || ($option eq "INDEPENDENT")){
push (@{ $$options{$option} }, $value);
$$UserOptions{$option}++;
}
else {
print $error "$option\tMultiple callings in optionsfile not allowed for this parameter (only first one used)\n";
}
}
}
else {
# invalid option
print $error "$option\tInvalid option\n";
}
}
##
## Organization of input/output files
##
sub fileLoc{
my ($option,$loc,$name) = @_;
if (($option eq 'Unlink') || ($option eq 'Looped')){
# collect files to unlink
if (!exists $fileLocs{$option}){
$fileLocs{$option} = [$loc, ];
}
else {
push (@{ $fileLocs{$option} }, $loc);
}
}
else {
# save file locations for output
$fileLocs{$option}{$name} = $loc;
}
}
##
## Split FASTA DB files
##
sub splitDB {
my ($split,$databaseFile,$splitfiles,$error) = @_;
# count sequences (FASTA headers) in DB file with grep
my $counter = `grep \">\" $databaseFile -c`;
$counter = &cleanLine($counter);
if ($counter > $split){
# define output file name
my $index = 1;
my $dbOut = $databaseFile.'_'.$index;
while (-e $dbOut){
$index++;
$dbOut = $databaseFile.'_'.$index;
}
# write queried number of features per file
open (DBOUT, ">$dbOut") || die "Cannot open DBOUT: $!\n";
push (@$splitfiles, $dbOut);
&fileLoc('Unlink',$dbOut);
open (DBFILE, "<$databaseFile") || die "Cannot open DBFILE: $!\n";
my $featcount = 0;
while (my $line = <DBFILE>){
$line = &cleanLine($line);
if ($line=~/^>.*/){
if ($featcount < $split){
print DBOUT "$line\n";
$featcount++;
}
else {
close DBOUT;
$featcount = 0;
$index++;
my $dbOut = $databaseFile.'_'.$index;
while (-e $dbOut){
$index++;
$dbOut = $databaseFile.'_'.$index;
}
open (DBOUT, ">$dbOut") || die "Cannot open DBOUT: $!\n";
push (@$splitfiles, $dbOut);
&fileLoc('Unlink',$dbOut);
print DBOUT "$line\n";
$featcount++;
}
}
else {
print DBOUT "$line\n";
}
}
close DBFILE;
close DBOUT;
}
}
##
## Read identifiers and sequences from FASTA file
##
sub readFASTA {
my ($input,$IDSeq,$error,$option) = @_;
open (FASTA, "<$input") || die "Cannot open FASTA: $!\n";
my $ID = "";
while (my $line = <FASTA>){
$line = &cleanLine($line);
# read input until first space only
if ($line=~/^>(\S+)/){
$ID = $1;
if (!exists $$IDSeq{$ID}){
$$IDSeq{$ID} = "";
}
else {
delete $$IDSeq{$ID};
print $error "$ID\tIdentifier is not unique in file $input and is not considered for further calculations\n";
}
}
else {
$line=~s/\s//g;
if (exists $$IDSeq{$ID}){
# replace 'U'/'u' with 'T' in case RNA sequence was queried
$line=~s/u/T/gi;
# sequence must only contain ACGTacgt if option is 'strict'
if ($option eq 'strict'){
if ($line=~/^[ACGTNacgtn]+$/){
$$IDSeq{$ID}.= $line;
}
else {
delete $$IDSeq{$ID};
print $error "$ID\t$input: invalid sequence (only A,C,G,T are allowed), not considered for further calculations\n";
}
}
elsif ($option eq 'permissive'){
$$IDSeq{$ID}.= $line;
}
}
}
}
close FASTA;
}
##
## In silico DICER to generate siRNAs of desired length from target sequences
##
sub edicer {
my ($dicer,$fraglength,$IDSeq,$IDSeqKeys) = @_;
open (EDICER, ">$dicer") || die "Cannot open EDICER: $!\n";
for (my $z=0;$z<scalar(@$IDSeqKeys);$z++){
for (my $i=0;$i+($fraglength-1)<length($$IDSeq{$$IDSeqKeys[$z]});$i++){
my $j = $i+1;
my $siRNA = substr($$IDSeq{$$IDSeqKeys[$z]},$i,$fraglength);
print EDICER ">$$IDSeqKeys[$z]\_$j\n$siRNA\n";
}
}
close EDICER;
}
##
## Build index from a file
##
sub build_index {
my ($data_file,$index_file,$input,$name,$IDindex) = @_;
my $offset = 0;
my $lineNum = 1;
while (my $line = <$data_file>){
$line = &cleanLine($line);
my @columns = split(/\t/,$line);
if ($input eq 'bowtie'){
if (!exists $$IDindex{$columns[0]}{$name}{$lineNum}){
$$IDindex{$columns[0]}{$name}{$lineNum} = "";
}
}
elsif ($input eq 'blat'){
if ((scalar(@columns) eq 21) && ($columns[0]=~/^\d+$/)){
if (!exists $$IDindex{$columns[9]}{$name}{$lineNum}){
$$IDindex{$columns[9]}{$name}{$lineNum} = "";
}
}
}
elsif ($input eq 'mapped'){
if (!exists $$IDindex{$columns[0]}{$name}{$lineNum}){
$$IDindex{$columns[0]}{$name}{$lineNum} = "";
}
}
elsif ($input eq 'primer-mapped'){
my $id = $columns[0].'_1';
if (!exists $$IDindex{$id}{$name}{$lineNum}){
$$IDindex{$id}{$name}{$lineNum} = "";
}
}
my $pack = pack("N", $offset);
print $index_file pack("N", $offset);
$offset = tell($data_file);
$lineNum++;
}
}
##
## Get lines from a file via index
##
sub line_with_index {
my ($data_file,$index_file,$line_number) = @_;
my $size; # size of an index entry
my $i_offset; # offset into the index of the entry
my $entry; # index entry
my $d_offset; # offset into the data file
$size = length(pack("N", 0));
$i_offset = $size * ($line_number-1);
seek($index_file, $i_offset, 0) or return;
read($index_file, $entry, $size);
$d_offset = unpack("N", $entry);
seek($data_file, $d_offset, 0);
return scalar(<$data_file>);
}
##
## Bowtie parsing, count hits of siRNAs in off-target database
##
sub BowtieTarget {
my ($KeysRef,$bowtie,$TargetExclude,$InputTarget,$InputsiRNATarget,$siRNATargetExclude,$reagent,$evaluation,$siRNAPos) = @_;
my $input = "";
my $inputsiRNA = "";
if (-e $bowtie){
open (BOWTIE, "<$bowtie") || die "Cannot open $bowtie: $!\n";
while (my $line = <BOWTIE>){
$line = &cleanLine($line);
my (@columns) = ();
@columns = split(/\t/, $line);
if (scalar(@columns) eq 7){
if ($columns[0] =~/(\S+)(_\d+)/){
$input = $1;
$inputsiRNA = $1.$2;
}
# number of matches of complete input sequence to a certain sequence in the off-target database
if ((!defined %$TargetExclude) || (!exists $$TargetExclude{$columns[2]})){
if (!exists $$InputTarget{$input}{$columns[2]}){
$$InputTarget{$input}{$columns[2]} = 1;
}
else {
$$InputTarget{$input}{$columns[2]}++;
}
# number of matches of each single siRNA to a certain sequence in the off-target database
if (!exists $$InputsiRNATarget{$input}{$inputsiRNA}{$columns[2]}){
$$InputsiRNATarget{$input}{$inputsiRNA}{$columns[2]} = 1;
}
else {
$$InputsiRNATarget{$input}{$inputsiRNA}{$columns[2]}++;
}
}
else {
if (!exists $$siRNATargetExclude{$inputsiRNA}{$columns[2]}){
$$siRNATargetExclude{$inputsiRNA}{$columns[2]} = 1;
}
else {
$$siRNATargetExclude{$inputsiRNA}{$columns[2]}++;
}
}
# save target position in off-target database for evaluation of siRNAs
if (((defined $reagent) && ($reagent eq 's')) && ((defined $evaluation) && ($evaluation eq 'OLIGO'))){
if (!exists $$siRNAPos{$inputsiRNA}{$columns[2]}){
$$siRNAPos{$inputsiRNA}{$columns[2]} = [ $columns[3]+1, ];
}
else {
push (@{ $$siRNAPos{$inputsiRNA}{$columns[2]} },$columns[3]+1);
}
}
}
}
close BOWTIE;
}
# input sequences with no target are assigned to 'NA' target
for (my $i=0;$i<scalar(@$KeysRef);$i++){
if (!exists $$InputTarget{$$KeysRef[$i]}){
$$InputTarget{$$KeysRef[$i]}{"NA"} = 0;
}
}
}
##
## Connect target groups to input sequences
##
sub targetGroups {
my ($KeysRef,$targetGroups,$InputTarget,$InputtargetGroups,$error) = @_;
for (my $i=0;$i<scalar(@$KeysRef);$i++){
my @TargetKeys = keys %{ $$InputTarget{$$KeysRef[$i]} };
# input sequences with no target are defined as own target group, number of hits is set to 0
if ((scalar(@TargetKeys) eq 1) && ($TargetKeys[0] eq "NA")){
undef @TargetKeys;
push (@TargetKeys,$$KeysRef[$i]);
$$InputTarget{$$KeysRef[$i]}{$$KeysRef[$i]} = 0;
}
# sort targets according to number of hits to identify the 'real' target
my @TargetHits = ();
for (my $j=0;$j<scalar(@TargetKeys);$j++){
push (@TargetHits,$$InputTarget{$$KeysRef[$i]}{$TargetKeys[$j]});
}
@TargetKeys = @TargetKeys[ sort {$TargetHits[$b] <=> $TargetHits[$a]} 0 .. $#TargetKeys];
@TargetHits = sort {$b <=> $a} (@TargetHits);
# set target group for best hit, important for identification of off-target effects
if (exists $$targetGroups{$TargetKeys[0]}){
$$InputtargetGroups{$$KeysRef[$i]} = $$targetGroups{$TargetKeys[0]};
}
else {
$$InputtargetGroups{$$KeysRef[$i]} = $$KeysRef[$i];
if ($TargetHits[0] > 0){
print $error "$$KeysRef[$i]\tNo target group defined for $TargetKeys[0], target group set to $$KeysRef[$i]\n";
}
}
}
}
##
## Parse siRNA features (efficiency, seed matches, low-complexity, CAN repeats)
##
sub featsiRNA {
my ($input,$pos,$FilterPos) = @_;
my @unspec = split(/\|/, $$FilterPos{$input}{$pos});
my %unspec = ();
my $unspec = '';
for (my $i=0;$i<scalar(@unspec);$i++){
if (!exists $unspec{$unspec[$i]}){
$unspec{$unspec[$i]} = '';
}
}
# write signature for each siRNA in binary mode |efficiency|seedmatch|lowcomplexity|CANrepeat|mirseed
if (exists $unspec{'eff'}){
$unspec.= '|1';
}
else {
$unspec.= '|0';
}
if (exists $unspec{'seed'}){
$unspec.= '|1';
}
else {
$unspec.= '|0';
}
if (exists $unspec{'low'}){
$unspec.= '|1';
}
else {
$unspec.= '|0';
}
if (exists $unspec{'can'}){
$unspec.= '|1';
}
else {
$unspec.= '|0';
}
if (exists $unspec{'mirseed'}){
$unspec.= '|1';
}
else {
$unspec.= '|0';
}
return $unspec;
}
##
## Find specific regions in input sequences
##
sub SpecRegion {
my ($IDSeq,$IDSeqKeys,$InputsiRNATarget,$targetGroups,$InputtargetGroups,$siRNATargetExclude,$InputSpecRegion,$InputSpecsiRNA,$fraglength,$evaluation,$specreg,$FilterPos) = @_;
# get query keys
my @input = keys %{ $InputsiRNATarget };
for (my $i=0;$i<scalar(@input);$i++){
# get siRNA keys
my @inputsiRNA = keys %{ $$InputsiRNATarget{$input[$i]}};
my @num = ();
for (my $j=0;$j<scalar(@inputsiRNA);$j++){
if ($inputsiRNA[$j]=~/_(\d+)$/){
push (@num, $1);
}
}
# sort siRNA for their position in query sequence
@inputsiRNA = @inputsiRNA[ sort {$num[$a] <=> $num[$b]} 0 .. $#inputsiRNA ];
@num = sort {$a<=>$b}(@num);
for (my $j=0;$j<scalar(@inputsiRNA);$j++){
# fill NAs for non-targeting regions
if (((!exists $$InputSpecsiRNA{$input[$i]}) && ($num[$j] ne 1)) || ((exists $$InputSpecsiRNA{$input[$i]}) && (($num[$j]-1) > scalar(@{ $$InputSpecsiRNA{$input[$i]} })))){
my $diff = 0;
if (exists $$InputSpecsiRNA{$input[$i]}){
$diff = $num[$j] - 1 - scalar(@{ $$InputSpecsiRNA{$input[$i]} });
}
else {
$$InputSpecsiRNA{$input[$i]} = [];
$diff = $num[$j] - 1;
}
my $position = 0;
for (my $k=0;$k<$diff;$k++){
if (($k eq 0) && (exists $$InputSpecsiRNA{$input[$i]})){
$position = scalar(@{ $$InputSpecsiRNA{$input[$i]} }) + 1;
}
else {
$position++;
}
my $unspec2 = 'NA';
if (exists $$FilterPos{$input[$i]}{$position}){
$unspec2.= &featsiRNA($input[$i],$position,$FilterPos);
}
else {
$unspec2.= '|0|0|0|0|0';
}
push (@{ $$InputSpecsiRNA{$input[$i]} }, $unspec2);
}
}
# get target keys
my @targets = keys %{ $$InputsiRNATarget{$input[$i]}{$inputsiRNA[$j]} };
my $specpointer = 0;
my $filterpointer = 0;
my $excludepointer = 0;
SPECCHECK:
for (my $k=0;$k<scalar(@targets);$k++){
# set target group if defined, otherwise target is target group
my $ref = "";
if (exists $$targetGroups{$targets[$k]}){
$ref = $$targetGroups{$targets[$k]};
}
else {
$ref = $targets[$k];
}
# compare targetgroup of actual target with targetgroup of query, define queried filters as off-targets
if ($ref ne $$InputtargetGroups{$input[$i]}){
$specpointer = 1;
last SPECCHECK;
}
}
# check whether siRNA contains any unwanted feature
if (exists $$FilterPos{$input[$i]}{$num[$j]}){
$filterpointer = 1;
}
# check whether siRNA has any unwanted target
if (exists $$siRNATargetExclude{$inputsiRNA[$j]}){
$excludepointer = 1;
}
if (($specpointer eq 0) && ($filterpointer eq 0) && ($excludepointer eq 0)){
# create new entry for 'specific' siRNA
if (!exists $$InputSpecRegion{$input[$i]}){
$$InputSpecRegion{$input[$i]} = [ [ $num[$j], $num[$j] ], ];
# new entry in first position
if ($num[$j] eq 1){
$$InputSpecsiRNA{$input[$i]} = [ 1, ];
}
else {
push (@{ $$InputSpecsiRNA{$input[$i]} }, 1);
}
}
else {
# create new specific region if the region before is unspecific, otherwise expand existsing specific region
if ($$InputSpecRegion{$input[$i]}[-1][1] eq "un"){
$$InputSpecRegion{$input[$i]}[-1][0] = $num[$j];
$$InputSpecRegion{$input[$i]}[-1][1] = $num[$j];
push (@{ $$InputSpecsiRNA{$input[$i]} }, 1);
}
else {
$$InputSpecRegion{$input[$i]}[-1][1] = $num[$j];
push (@{ $$InputSpecsiRNA{$input[$i]} }, 1);
}
}
}
else {
my $unspec = "";
# although region is in FilterPos, it can be specific for target
if ($specpointer eq 0){
$unspec = 1;
}
else {
$unspec = 0;
}
if (exists $$FilterPos{$input[$i]}{$num[$j]}){
$unspec.= &featsiRNA($input[$i],$num[$j],$FilterPos);
}
else {
$unspec.= '|0|0|0|0|0';
}
# create new entry for 'unspecific' siRNA
if (!exists $$InputSpecRegion{$input[$i]}){
$$InputSpecRegion{$input[$i]} = [ [ $num[$j], "un" ], ];
# new entry in first position
if ($num[$j] eq 1){
$$InputSpecsiRNA{$input[$i]} = [ $unspec, ];
}
else {
push (@{ $$InputSpecsiRNA{$input[$i]} }, $unspec);
}
}
else {
# create new unspecific region if the region before is specific, otherwise expand existsing unspecific region
if ($$InputSpecRegion{$input[$i]}[-1][1] ne "un"){
# new unspecific region created
push (@{ $$InputSpecRegion{$input[$i]} }, [ $num[$j], "un" ]);
push (@{ $$InputSpecsiRNA{$input[$i]} }, $unspec);
}
else {
# unspecific region expanded
$$InputSpecRegion{$input[$i]}[-1][0] = $num[$j];
push (@{ $$InputSpecsiRNA{$input[$i]} }, $unspec);
}
}
}
}
}
# evaluate calculations for each input sequence
for (my $i=0;$i<scalar(@$IDSeqKeys);$i++){
if (!exists $$InputSpecRegion{$$IDSeqKeys[$i]}){
# define whole sequence as specific region, so that primer design is possible
my $end = length($$IDSeq{$$IDSeqKeys[$i]}) - $fraglength;
my $siRNA = $end + 1;
$$InputSpecRegion{$$IDSeqKeys[$i]} = [ [1,$siRNA], ];
# add NAs to specificity array
$$InputSpecsiRNA{$$IDSeqKeys[$i]} = [];
for (my $j=0;$j<$end;$j++){
my $position = $j + 1;
my $unspec2 = 'NA';
if (exists $$FilterPos{$$IDSeqKeys[$i]}{$position}){
$unspec2.= &featsiRNA($$IDSeqKeys[$i],$position,$FilterPos);
}
else {
$unspec2.= '|0|0|0|0|0';
}
push (@{ $$InputSpecsiRNA{$$IDSeqKeys[$i]} }, $unspec2);
}
}
else {
# remove 'unspecific ends'
if ($$InputSpecRegion{$$IDSeqKeys[$i]}[-1][1] eq "un"){
if (scalar @{ $$InputSpecRegion{$$IDSeqKeys[$i]} } eq 1){
my $end = length($$IDSeq{$$IDSeqKeys[$i]}) - $fraglength + 1;
$$InputSpecRegion{$$IDSeqKeys[$i]} = [ [1,$end], ];
}
else {
pop(@{ $$InputSpecRegion{$$IDSeqKeys[$i]} });
}
}
}
# add last 'NAs' if last part of target region is located within a region with no target
my $siRNA = length($$IDSeq{$$IDSeqKeys[$i]}) - $fraglength + 1;
if (scalar(@{ $$InputSpecsiRNA{$$IDSeqKeys[$i]} }) < $siRNA){
my $diff = $siRNA - scalar(@{ $$InputSpecsiRNA{$$IDSeqKeys[$i]} });
my $position = 0;
for (my $k=0;$k<$diff;$k++){
if ($k eq 0){
$position = scalar(@{ $$InputSpecsiRNA{$$IDSeqKeys[$i]} }) + 1;
}
else {
$position++;
}
my $unspec2 = 'NA';
if (exists $$FilterPos{$$IDSeqKeys[$i]}{$position}){
$unspec2.= &featsiRNA($$IDSeqKeys[$i],$position,$FilterPos);
}
else {
$unspec2.= '|0|0|0|0|0';
}
push (@{ $$InputSpecsiRNA{$$IDSeqKeys[$i]} }, $unspec2);
}
}
# if evaluation was queried or designs on full target sequence is forced with specreg, specific region is set to complete sequence
if (($evaluation ne "NO") || ($specreg eq "FULL")){
$$InputSpecRegion{$$IDSeqKeys[$i]} = [ [1,$siRNA], ];
}
}
}
##
## Calculate local foldings of siRNAs with Vienna RNAfold perl script
##
sub ViennaRNA {
# create siRNA file in input format for Vienna RNAfold
my ($identifier,$sequence,$fraglength,$outfolder,$Viennaloc,$sirnas,$hairpins) = @_;
my $outsiRNA = $outfolder.'NEXT-RNAi_'.$identifier.'.siRNA';
open (SIRNA, ">$outsiRNA") || die "Cannot open SIRNA: $!\n";
for (my $i=0;$i+($fraglength-1)<length($sequence);$i++){
my $siRNA = substr($sequence,$i,$fraglength);
print SIRNA "$siRNA\n";
}
close SIRNA;
# run Vienna RNAfold.pl
my $outVienna = $outfolder.'NEXT-RNAi_'.$identifier.'.RNAfold';
system ("$Viennaloc\\RNAfold.pl $outsiRNA >$outVienna") eq 0 || die "Failed to open RNAfold.pl: $?\n";
# parse RNAfold.pl output
open (HAIRPINS, "<$outVienna") || die "Cannot open HAIRPINS: $!\n";
while (my $line = <HAIRPINS>){
$line = &cleanLine($line);