-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy path8_getAnnotated.pl
120 lines (108 loc) · 3.44 KB
/
8_getAnnotated.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/perl
# -----------------------------------------
# Updated Date: 2014/03/24
# Input:
# (1) Files were generated by getTranscript.pl.
# (2) Annotated data from GENCODE. (or the format must be the same with the GENCODE)
# (3) The package implemented by the pointer to access GENCODE database easily.
# Output: The potential items with their annotated data.
# Environment: Linux or Windows
# Description: Try to add more information to the potential items, significant expression difference on both gene level
# and transcript level.
# -----------------------------------------
use strict;
use annotated;
# C:\>perl ./getAnnotated.pl ./gencode.v19.long_noncoding_RNAs.gtf ./bothGeTr.csv ./output.csv
# Three positional arguments are required: the annotation GTF, the CSV of
# significant gene/transcript items, and the path of the CSV to write.
if (scalar(@ARGV) < 3) {
    die("Usage: perl ./getAnnotated.pl <annotated.gtf> <geneAndTranscript.txt> <output.csv>");
}
# --------------------------------------------------------------------------
# deal with the annotated file
# Three-argument open: the file name is taken literally, so a name that starts
# or ends with characters such as '>', '<' or '|' can never be interpreted as
# an open mode or a command. The bareword handle `fin` is kept because the
# reading loop further down refers to it by that name.
open(fin, '<', $ARGV[0]) or die("Error: Make sure that file $ARGV[0] exist.\n");
# File-level working storage used by the sections below.
my @all_annotated = ();    # one `annotated` object per parsed GTF line
my @tmp = ();              # scratch: columns of the current line
my @attribute = ();        # scratch: ';'-separated GTF attributes
my @eachComponent = ();    # scratch: fields extracted from the current line
# Extract the text found between the first pair of double quotes.
#
# Parameters:
#   $getString - text to search (e.g. a GTF attribute such as
#                ` gene_id "X"`, or a quoted CSV cell such as `"X"`).
#   $getOption - 0: pattern that tolerates leading text/whitespace before the
#                   quoted value; 1: pattern that looks for the quotes only.
#                An omitted option is treated as 0.
# Returns:
#   Always exactly one scalar: the captured text, or undef when the input is
#   undefined, the option is neither 0 nor 1, or no quoted value is present.
sub regular {
    # the parentheses give list assignment, so both passed values are unpacked
    my ($getString, $getOption) = @_;

    # Stays undef unless a pattern really matches. Reading $1 after a failed
    # match would hand back whatever an earlier, unrelated match captured.
    my $captured;

    if (defined $getString) {
        my $mode = defined $getOption ? $getOption : 0;
        if ($mode == 0) {
            # list-context match yields the capture on success, () on failure
            ($captured) = $getString =~ m/\s*?.*?\s*?"(.*?)"/;
        }
        elsif ($mode == 1) {
            ($captured) = $getString =~ m/"(.*?)"/;
        }
    }

    return $captured;
}
# Parse the annotation file opened on `fin`, one line at a time, and collect
# one `annotated` object per feature line in @all_annotated.
# Reading with while(<fin>) streams the file instead of loading every line
# into memory first, which matters for genome-scale GTF files.
while (my $line = <fin>) {
    chomp($line);
    # deal with the comment
    next if $line =~ m/^#/;

    my @columns = split("\t", $line);
    # The attribute text is read from the ninth tab-separated column; blank or
    # truncated lines have no such column and would only yield empty objects.
    next if scalar(@columns) < 9;

    # Fields 0-3: columns 1, 2, 3 and 7 of the line (written out later under
    # the headings chr, source, type, strand).
    my @fields = @columns[0, 1, 2, 6];

    # Fields 4-8: quoted values of attributes 0, 1, 2, 4 and 7.
    # NOTE(review): these positions presumably correspond to gene_id,
    # transcript_id, gene_type, gene_name and transcript_name in the GENCODE
    # attribute order - confirm against the GTF release actually used.
    my @attributes = split(";", $columns[8]);
    foreach my $position (0, 1, 2, 4, 7) {
        # scalar() guarantees exactly one slot per attribute
        push(@fields, scalar(regular($attributes[$position], 0)));
    }

    # Direct method call instead of indirect-object `new annotated(...)`.
    push(@all_annotated, annotated->new(@fields));
}
close(fin);
print "Total annotated data count is: ", scalar(@all_annotated), "\n";
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
# deal with the significant file storing gene and transcript name
# Join every row of the significant gene/transcript CSV with its annotation
# record and write the merged rows to the output CSV.
#
# Three-argument open with lexical handles: file names are taken literally
# (no mode or pipe characters interpreted) and the handles cannot collide
# with other barewords.
open(my $sig_in, '<', $ARGV[1]) or die("Error: Make sure that file $ARGV[1] exist.\n");
my $merged_out;
if (!open($merged_out, '>', $ARGV[2])) {
    close($sig_in);
    die("Error: Output file $ARGV[2] went error.\n");
}

# Index the annotations once by (field 4, field 5) so each CSV row is a hash
# lookup rather than a scan over every annotation. Only the first record per
# pair is kept, which is the record a front-to-back scan would stop at.
my %annotation_for = ();
foreach my $obj (@all_annotated) {
    bless($obj, "annotated");
    my $field4 = $obj->getContent(4);
    my $field5 = $obj->getContent(5);
    my $key = join("\0", map { defined $_ ? $_ : "" } ($field4, $field5));
    $annotation_for{$key} = $obj unless exists $annotation_for{$key};
}

# Annotation fields appended to each matched row; 4 and 5 are the join keys
# and are already present in the input row.
my @appended_fields = (0, 1, 2, 3, 6, 7, 8);

# The first line is the column header: extend it with the new column names.
my $header = <$sig_in>;
if (defined $header) {
    chomp($header);
    $header =~ s/\r\z//;    # CRLF input read on Linux leaves a trailing CR
    my $colLabelName = $header.",\"chr\",\"source\",\"type\",\"strand\",\"geneType\",\"geneName\",\"transcriptName\"";
    print {$merged_out} "$colLabelName\n";
}

while (my $line = <$sig_in>) {
    chomp($line);
    $line =~ s/\r\z//;      # otherwise the CR would land mid-row in the output

    # NOTE: a plain split cannot handle commas inside quoted cells; only the
    # first two cells are used and they are expected to be simple quoted ids.
    my @cells = split(",", $line);
    my $first_id  = regular($cells[0], 1);    # compared with annotation field 5
    my $second_id = regular($cells[1], 1);    # compared with annotation field 4

    # Blank rows or rows without two quoted identifiers cannot be joined.
    next unless defined $first_id and defined $second_id;

    my $obj = $annotation_for{ join("\0", $second_id, $first_id) };
    # Rows without a matching annotation are not written.
    next unless defined $obj;

    my $catOutput = join(",", map { "\"" . $obj->getContent($_) . "\"" } @appended_fields);
    print {$merged_out} "$line,$catOutput\n";
}

# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close($merged_out) or die("Error: Output file $ARGV[2] went error.\n");
close($sig_in);
print "State: Output file $ARGV[2] was finished.\n";
# --------------------------------------------------------------------------