| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# -*-CPerl-*- |
|
2
|
|
|
|
|
|
|
# Last changed Time-stamp: <2014-12-20 00:34:19 mtw> |
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Bio::ViennaNGS::Util; |
|
5
|
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
1402
|
use 5.12.0; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
38
|
|
|
7
|
1
|
|
|
1
|
|
5
|
use Exporter; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
29
|
|
|
8
|
1
|
|
|
1
|
|
4
|
use version; our $VERSION = qv('0.12_07'); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
5
|
|
|
9
|
1
|
|
|
1
|
|
73
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
32
|
|
|
10
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
26
|
|
|
11
|
1
|
|
|
1
|
|
207
|
use Bio::Perl 1.00690001; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use Bio::DB::Sam 1.39; |
|
13
|
|
|
|
|
|
|
use Data::Dumper; |
|
14
|
|
|
|
|
|
|
use File::Basename qw(basename fileparse); |
|
15
|
|
|
|
|
|
|
use File::Temp qw(tempfile); |
|
16
|
|
|
|
|
|
|
use IPC::Cmd qw(can_run run); |
|
17
|
|
|
|
|
|
|
use Path::Class; |
|
18
|
|
|
|
|
|
|
use Math::Round; |
|
19
|
|
|
|
|
|
|
use Carp; |
|
20
|
|
|
|
|
|
|
use Bio::ViennaNGS::FeatureChain; |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
23
|
|
|
|
|
|
|
our @EXPORT = (); |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
our @EXPORT_OK = qw ( bed_or_bam2bw sortbed bed2bigBed computeTPM |
|
26
|
|
|
|
|
|
|
featCount_data parse_multicov write_multicov |
|
27
|
|
|
|
|
|
|
unique_array kmer_enrichment extend_chain |
|
28
|
|
|
|
|
|
|
parse_bed6 fetch_chrom_sizes); |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
|
32
|
|
|
|
|
|
|
#^^^^^^^^^^ Variables ^^^^^^^^^^^# |
|
33
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
our @featCount = (); |
|
36
|
|
|
|
|
|
|
my %unique = (); |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
|
39
|
|
|
|
|
|
|
#^^^^^^^^^^^ Subroutines ^^^^^^^^^^# |
|
40
|
|
|
|
|
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
sub featCount_data { |
|
43
|
|
|
|
|
|
|
return \@featCount; |
|
44
|
|
|
|
|
|
|
} |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
sub bed_or_bam2bw { |
|
47
|
|
|
|
|
|
|
my ($type,$infile,$chromsizes,$strand,$dest,$want_norm,$size,$scale,$log) = @_; |
|
48
|
|
|
|
|
|
|
my ($fn_bg_tmp,$fn_bg,$fn_bw); |
|
49
|
|
|
|
|
|
|
my ($bn,$path,$ext,$cmd); |
|
50
|
|
|
|
|
|
|
my @processed_files = (); |
|
51
|
|
|
|
|
|
|
my $factor = 1.; |
|
52
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
croak "ERROR [$this_function] \$type is '$type', however it is expected to be either 'bam' or 'bed'\n" |
|
55
|
|
|
|
|
|
|
unless ($type eq "bam") || ($type eq "bed"); |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
my $genomeCoverageBed = can_run('genomeCoverageBed') or |
|
58
|
|
|
|
|
|
|
croak "ERROR [$this_function] genomeCoverageBed utility not found"; |
|
59
|
|
|
|
|
|
|
my $bedGraphToBigWig = can_run('bedGraphToBigWig') or |
|
60
|
|
|
|
|
|
|
croak "ERROR [$this_function] bedGraphToBigWig utility not found"; |
|
61
|
|
|
|
|
|
|
my $awk = can_run('awk') or |
|
62
|
|
|
|
|
|
|
croak "ERROR [$this_function] awk utility not found"; |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
if(defined $log){ |
|
65
|
|
|
|
|
|
|
open(LOG, ">>", $log) or croak $!; |
|
66
|
|
|
|
|
|
|
print LOG "LOG [$this_function] \$infile: $infile\n"; |
|
67
|
|
|
|
|
|
|
print LOG "LOG [$this_function] \$dest: $dest\n"; |
|
68
|
|
|
|
|
|
|
print LOG "LOG [$this_function] \$chromsizes: $chromsizes\n"; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot find $infile\n" |
|
72
|
|
|
|
|
|
|
unless (-e $infile); |
|
73
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest does not exist\n" |
|
74
|
|
|
|
|
|
|
unless (-d $dest); |
|
75
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot find $chromsizes\n" |
|
76
|
|
|
|
|
|
|
unless (-e $chromsizes); |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
if ($want_norm == 1){ |
|
79
|
|
|
|
|
|
|
$factor = $scale/$size; |
|
80
|
|
|
|
|
|
|
print LOG "LOG [$this_function] normalization: $factor = ($scale/$size)\n" |
|
81
|
|
|
|
|
|
|
if(defined $log); |
|
82
|
|
|
|
|
|
|
} |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
($bn,$path,$ext) = fileparse($infile, qr /\..*/); |
|
85
|
|
|
|
|
|
|
$fn_bg_tmp = file($dest,$bn.".tmp.bg"); |
|
86
|
|
|
|
|
|
|
$fn_bg = file($dest,$bn.".bg"); |
|
87
|
|
|
|
|
|
|
if($strand eq "+"){ |
|
88
|
|
|
|
|
|
|
$fn_bw = file($dest,$bn.".pos.bw"); |
|
89
|
|
|
|
|
|
|
} |
|
90
|
|
|
|
|
|
|
else { |
|
91
|
|
|
|
|
|
|
$fn_bw = file($dest,$bn.".neg.bw"); |
|
92
|
|
|
|
|
|
|
} |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
$cmd = "$genomeCoverageBed -bg -scale $factor -split "; |
|
95
|
|
|
|
|
|
|
if ($type eq "bed"){ $cmd .= "-i $infile -g $chromsizes"; } # chrom.sizes only required for processing BED |
|
96
|
|
|
|
|
|
|
else { $cmd .= "-ibam $infile "; } |
|
97
|
|
|
|
|
|
|
$cmd .= " > $fn_bg_tmp"; |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
if($strand eq "-"){ |
|
100
|
|
|
|
|
|
|
$cmd .= " && cat $fn_bg_tmp | $awk \'{ \$4 = - \$4 ; print \$0 }\' > $fn_bg"; |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
else{ |
|
103
|
|
|
|
|
|
|
$fn_bg = $fn_bg_tmp; |
|
104
|
|
|
|
|
|
|
} |
|
105
|
|
|
|
|
|
|
$cmd .= " && $bedGraphToBigWig $fn_bg $chromsizes $fn_bw"; |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
if (defined $log){ print LOG "LOG [$this_function] $cmd\n";} |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) = |
|
110
|
|
|
|
|
|
|
run( command => $cmd, verbose => 0 ); |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
if( !$success ) { |
|
113
|
|
|
|
|
|
|
print STDERR "ERROR [$this_function] External command call unsuccessful\n"; |
|
114
|
|
|
|
|
|
|
print STDERR "ERROR: this is what the command printed:\n"; |
|
115
|
|
|
|
|
|
|
print join "", @$full_buf; |
|
116
|
|
|
|
|
|
|
croak $!; |
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
if (defined $log){ close(LOG); } |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
unlink ($fn_bg_tmp); |
|
121
|
|
|
|
|
|
|
unlink ($fn_bg); |
|
122
|
|
|
|
|
|
|
return $fn_bw; |
|
123
|
|
|
|
|
|
|
} |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
sub bed2bigBed { |
|
126
|
|
|
|
|
|
|
my ($infile,$chromsizes,$dest,$log) = @_; |
|
127
|
|
|
|
|
|
|
my ($bn,$path,$ext,$cmd,$outfile); |
|
128
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
|
129
|
|
|
|
|
|
|
my $bed2bigBed = can_run('bedToBigBed') or |
|
130
|
|
|
|
|
|
|
croak "ERROR [$this_function] bedToBigBed utility not found"; |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
if (defined $log){ |
|
133
|
|
|
|
|
|
|
open(LOG, ">>", $log) or croak $!; |
|
134
|
|
|
|
|
|
|
print LOG "LOG [$this_function] \$infile: $infile -- \$chromsizes: $chromsizes --\$dest: $dest\n"; |
|
135
|
|
|
|
|
|
|
} |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot find $infile" |
|
138
|
|
|
|
|
|
|
unless (-e $infile); |
|
139
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot find $chromsizes" |
|
140
|
|
|
|
|
|
|
unless (-e $chromsizes); |
|
141
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest does not exist" |
|
142
|
|
|
|
|
|
|
unless (-d $dest); |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# .bed6 .bed12 extensions are replaced by .bb |
|
145
|
|
|
|
|
|
|
($bn,$path,$ext) = fileparse($infile, qr /\.bed[126]?/); |
|
146
|
|
|
|
|
|
|
$outfile = file($dest, "$bn.bb"); |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
$cmd = "$bed2bigBed $infile -extraIndex=name -tab $chromsizes $outfile"; |
|
149
|
|
|
|
|
|
|
if (defined $log){ print LOG "LOG [$this_function] $cmd\n"; } |
|
150
|
|
|
|
|
|
|
my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) = |
|
151
|
|
|
|
|
|
|
run( command => $cmd, verbose => 0 ); |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
if( !$success ) { |
|
154
|
|
|
|
|
|
|
print STDERR "ERROR [$this_function] Call to $bed2bigBed unsuccessful\n"; |
|
155
|
|
|
|
|
|
|
print STDERR "ERROR: this is what the command printed:\n"; |
|
156
|
|
|
|
|
|
|
print join "", @$full_buf; |
|
157
|
|
|
|
|
|
|
croak $!; |
|
158
|
|
|
|
|
|
|
} |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
if (defined $log){ close(LOG); } |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
return $outfile; |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub sortbed { |
|
166
|
|
|
|
|
|
|
my ($infile,$dest,$outfile,$rm_orig,$log) = @_; |
|
167
|
|
|
|
|
|
|
my ($cmd,$out); |
|
168
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
|
169
|
|
|
|
|
|
|
my $bedtools = can_run('bedtools') or |
|
170
|
|
|
|
|
|
|
croak "ERROR [$this_function bedtools utility not found"; |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
croak "ERROR [$this_function] Cannot find $infile" |
|
173
|
|
|
|
|
|
|
unless (-e $infile); |
|
174
|
|
|
|
|
|
|
croak "ERROR [$this_function] $dest does not exist" |
|
175
|
|
|
|
|
|
|
unless (-d $dest); |
|
176
|
|
|
|
|
|
|
if (defined $log){open(LOG, ">>", $log) or croak $!;} |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
$out = file($dest,$outfile); |
|
179
|
|
|
|
|
|
|
$cmd = "$bedtools sort -i $infile > $out"; |
|
180
|
|
|
|
|
|
|
if (defined $log){ print LOG "LOG [$this_function] $cmd\n"; } |
|
181
|
|
|
|
|
|
|
my ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) = |
|
182
|
|
|
|
|
|
|
run( command => $cmd, verbose => 0 ); |
|
183
|
|
|
|
|
|
|
if( !$success ) { |
|
184
|
|
|
|
|
|
|
print STDERR "ERROR [$this_function] Call to $bedtools unsuccessful\n"; |
|
185
|
|
|
|
|
|
|
print STDERR "ERROR: this is what the command printed:\n"; |
|
186
|
|
|
|
|
|
|
print join "", @$full_buf; |
|
187
|
|
|
|
|
|
|
croak $!; |
|
188
|
|
|
|
|
|
|
} |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
if($rm_orig){ |
|
191
|
|
|
|
|
|
|
unlink($infile) or |
|
192
|
|
|
|
|
|
|
carp "WARN [$this_function] Could not unlink $infile"; |
|
193
|
|
|
|
|
|
|
if (defined $log){ |
|
194
|
|
|
|
|
|
|
print LOG "[$this_function] removed $infile $!\n"; |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
} |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
if (defined $log){ close(LOG); } |
|
199
|
|
|
|
|
|
|
} |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
sub computeTPM { |
|
202
|
|
|
|
|
|
|
my ($featCount_sample,$rl) = @_; |
|
203
|
|
|
|
|
|
|
my ($TPM,$T,$totalTPM) = (0)x3; |
|
204
|
|
|
|
|
|
|
my ($i,$features,$meanTPM); |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
$features = keys %$featCount_sample; # of of features in hash |
|
207
|
|
|
|
|
|
|
print Dumper(\%$featCount_sample);print ">>$rl<<\n"; |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# iterate through $featCount_sample twice: |
|
210
|
|
|
|
|
|
|
# 1. for computing T (denominator in TPM formula) |
|
211
|
|
|
|
|
|
|
foreach $i (keys %$featCount_sample){ |
|
212
|
|
|
|
|
|
|
$T += ($$featCount_sample{$i}{count} * $rl)/($$featCount_sample{$i}{len}); |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
# 2. for computng actual TPM values |
|
215
|
|
|
|
|
|
|
foreach $i (keys %$featCount_sample){ |
|
216
|
|
|
|
|
|
|
$TPM = 1000000 * $$featCount_sample{$i}{count} * $rl/($$featCount_sample{$i}{len} * $T); |
|
217
|
|
|
|
|
|
|
$$featCount_sample{$i}{TPM} = $TPM; |
|
218
|
|
|
|
|
|
|
$totalTPM += $TPM; |
|
219
|
|
|
|
|
|
|
} |
|
220
|
|
|
|
|
|
|
$meanTPM = $totalTPM/$features; |
|
221
|
|
|
|
|
|
|
# print "totalTPM=$totalTPM | meanTPM=$meanTPM\n"; |
|
222
|
|
|
|
|
|
|
return $meanTPM; |
|
223
|
|
|
|
|
|
|
} |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
sub parse_multicov { |
|
226
|
|
|
|
|
|
|
my ($file) = @_; |
|
227
|
|
|
|
|
|
|
my @mcData = (); |
|
228
|
|
|
|
|
|
|
my ($mcSamples,$i); |
|
229
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
croak "ERROR [$this_function] multicov file $file not available\n" |
|
232
|
|
|
|
|
|
|
unless (-e $file); |
|
233
|
|
|
|
|
|
|
open (MULTICOV_IN, "< $file") or croak $!; |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
while (<MULTICOV_IN>){ |
|
236
|
|
|
|
|
|
|
chomp; |
|
237
|
|
|
|
|
|
|
@mcData = split(/\t/); # 0:chr|1:start|2:end|3:name|4:score|5:strand |
|
238
|
|
|
|
|
|
|
$mcSamples = (scalar @mcData)-6; # multicov extends BED6 |
|
239
|
|
|
|
|
|
|
#print "$_\n"; |
|
240
|
|
|
|
|
|
|
for ($i=0;$i<$mcSamples;$i++){ |
|
241
|
|
|
|
|
|
|
$featCount[$i]{$mcData[3]} = { |
|
242
|
|
|
|
|
|
|
chr => $mcData[0], |
|
243
|
|
|
|
|
|
|
start => $mcData[1], |
|
244
|
|
|
|
|
|
|
end => $mcData[2], |
|
245
|
|
|
|
|
|
|
name => $mcData[3], |
|
246
|
|
|
|
|
|
|
score => $mcData[4], |
|
247
|
|
|
|
|
|
|
strand => $mcData[5], |
|
248
|
|
|
|
|
|
|
len => eval($mcData[2]-$mcData[1]), |
|
249
|
|
|
|
|
|
|
count => $mcData[eval(6+$i)], |
|
250
|
|
|
|
|
|
|
} |
|
251
|
|
|
|
|
|
|
} |
|
252
|
|
|
|
|
|
|
#print Dumper(@mcData); |
|
253
|
|
|
|
|
|
|
} |
|
254
|
|
|
|
|
|
|
close(MULTICOV_IN); |
|
255
|
|
|
|
|
|
|
return $mcSamples; |
|
256
|
|
|
|
|
|
|
} |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub write_multicov { |
|
259
|
|
|
|
|
|
|
my ($item,$dest,$base_name) = @_; |
|
260
|
|
|
|
|
|
|
my ($outfile,$mcSamples,$nrFeatures,$feat,$i); |
|
261
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
croak "ERROR [$this_function]: $dest does not exist\n" |
|
264
|
|
|
|
|
|
|
unless (-d $dest); |
|
265
|
|
|
|
|
|
|
$outfile = file($dest,$base_name.".".$item.".multicov.csv"); |
|
266
|
|
|
|
|
|
|
open (MULTICOV_OUT, "> $outfile") or croak $!; |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
$mcSamples = scalar @featCount; # of samples in %{$featCount} |
|
269
|
|
|
|
|
|
|
$nrFeatures = scalar keys %{$featCount[1]}; # of keys in %{$featCount}[1] |
|
270
|
|
|
|
|
|
|
#print "=====> write_multicov: writing multicov file $outfile with $nrFeatures lines and $mcSamples conditions\n"; |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# check whether each column in %$featCount has the same number of entries |
|
273
|
|
|
|
|
|
|
for($i=0;$i<$mcSamples;$i++){ |
|
274
|
|
|
|
|
|
|
my $fc = scalar keys %{$featCount[$i]}; # of keys in %{$featCount} |
|
275
|
|
|
|
|
|
|
#print "condition $i => $fc keys\n"; |
|
276
|
|
|
|
|
|
|
unless($nrFeatures == $fc){ |
|
277
|
|
|
|
|
|
|
croak "ERROR [$this_function]: unequal element count in \%\$featCount\nExpected $nrFeatures have $fc in condition $i\n"; |
|
278
|
|
|
|
|
|
|
} |
|
279
|
|
|
|
|
|
|
} |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
foreach $feat (keys %{$featCount[1]}){ |
|
282
|
|
|
|
|
|
|
my @mcLine = (); |
|
283
|
|
|
|
|
|
|
# process standard BED6 fields first |
|
284
|
|
|
|
|
|
|
push @mcLine, (${$featCount[1]}{$feat}->{chr}, |
|
285
|
|
|
|
|
|
|
${$featCount[1]}{$feat}->{start}, |
|
286
|
|
|
|
|
|
|
${$featCount[1]}{$feat}->{end}, |
|
287
|
|
|
|
|
|
|
${$featCount[1]}{$feat}->{name}, |
|
288
|
|
|
|
|
|
|
${$featCount[1]}{$feat}->{score}, |
|
289
|
|
|
|
|
|
|
${$featCount[1]}{$feat}->{strand}); |
|
290
|
|
|
|
|
|
|
# process multicov values for all samples |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
for($i=0;$i<$mcSamples;$i++){ |
|
293
|
|
|
|
|
|
|
# print "------------> "; print "processing $i th condition "; print "<-----------\n"; |
|
294
|
|
|
|
|
|
|
unless (defined ${$featCount[$i]}{$feat}){ |
|
295
|
|
|
|
|
|
|
croak "Could not find item $feat in mcSample $i\n"; |
|
296
|
|
|
|
|
|
|
} |
|
297
|
|
|
|
|
|
|
push @mcLine, ${$featCount[$i]}{$feat}->{$item}; |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
} |
|
300
|
|
|
|
|
|
|
#print Dumper(\@mcLine); |
|
301
|
|
|
|
|
|
|
print MULTICOV_OUT join("\t",@mcLine)."\n"; |
|
302
|
|
|
|
|
|
|
} |
|
303
|
|
|
|
|
|
|
close(MULTICOV_OUT); |
|
304
|
|
|
|
|
|
|
} |
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
sub unique_array{ |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
my $arrayref = shift; |
|
309
|
|
|
|
|
|
|
my @array = @{$arrayref}; |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
foreach my $item (@array) |
|
312
|
|
|
|
|
|
|
{ |
|
313
|
|
|
|
|
|
|
$unique{$item} ++; |
|
314
|
|
|
|
|
|
|
} |
|
315
|
|
|
|
|
|
|
my @arrayuid = sort {$a cmp $b} keys %unique; |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
return(\@arrayuid); |
|
318
|
|
|
|
|
|
|
} |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
sub kmer_enrichment{ |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
my @seqs = @{$_[0]}; |
|
323
|
|
|
|
|
|
|
my $klen = $_[1]; |
|
324
|
|
|
|
|
|
|
# my @seq = split( //, $read_tmp ); |
|
325
|
|
|
|
|
|
|
my $kstring =''; |
|
326
|
|
|
|
|
|
|
#return variables |
|
327
|
|
|
|
|
|
|
my %km; |
|
328
|
|
|
|
|
|
|
foreach my $sequences (@seqs){ |
|
329
|
|
|
|
|
|
|
# print STDERR $sequences,"\n"; |
|
330
|
|
|
|
|
|
|
my @seq = split( //, $sequences ); |
|
331
|
|
|
|
|
|
|
for ( my $seq_pos = 0; $seq_pos <= $#seq-$klen ; $seq_pos++ ) { |
|
332
|
|
|
|
|
|
|
for (my $i=$seq_pos;$i<=$seq_pos+($klen-1);$i++){ |
|
333
|
|
|
|
|
|
|
$kstring .= $seq[$i]; |
|
334
|
|
|
|
|
|
|
} |
|
335
|
|
|
|
|
|
|
$km{$kstring}++; |
|
336
|
|
|
|
|
|
|
$kstring = ""; |
|
337
|
|
|
|
|
|
|
} |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
return( \%km ); |
|
340
|
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
sub extend_chain{ |
|
343
|
|
|
|
|
|
|
my %sizes = %{$_[0]}; |
|
344
|
|
|
|
|
|
|
my $chain = $_[1]; |
|
345
|
|
|
|
|
|
|
my $l = $_[2]; |
|
346
|
|
|
|
|
|
|
my $r = $_[3]; |
|
347
|
|
|
|
|
|
|
my $u = $_[4]; |
|
348
|
|
|
|
|
|
|
my $d = $_[5]; |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
##return a new chain with extended coordinates |
|
351
|
|
|
|
|
|
|
my $extendchain = $chain -> clone(); |
|
352
|
|
|
|
|
|
|
## got through all features in original chain, calculate new start and end and safe in extendchain |
|
353
|
|
|
|
|
|
|
my @featarray = @{$extendchain->chain}; |
|
354
|
|
|
|
|
|
|
foreach my $feature (@featarray){ |
|
355
|
|
|
|
|
|
|
my $chrom = $feature->chromosome; |
|
356
|
|
|
|
|
|
|
my $start = $feature->start; |
|
357
|
|
|
|
|
|
|
my $end = $feature->end; |
|
358
|
|
|
|
|
|
|
my $strand = $feature->strand; |
|
359
|
|
|
|
|
|
|
my $right = 0; |
|
360
|
|
|
|
|
|
|
my $left = 0; |
|
361
|
|
|
|
|
|
|
my $width = nearest(1,($end-$start)/2); |
|
362
|
|
|
|
|
|
|
$width = 0 if ($d > 0 || $u > 0); |
|
363
|
|
|
|
|
|
|
if ($strand eq "+"){ |
|
364
|
|
|
|
|
|
|
if ($d > 0){ |
|
365
|
|
|
|
|
|
|
$start = $end; |
|
366
|
|
|
|
|
|
|
$r = $d; |
|
367
|
|
|
|
|
|
|
} |
|
368
|
|
|
|
|
|
|
if ($u > 0){ |
|
369
|
|
|
|
|
|
|
$end = $start; |
|
370
|
|
|
|
|
|
|
$l = $u; |
|
371
|
|
|
|
|
|
|
} |
|
372
|
|
|
|
|
|
|
$right=$r; |
|
373
|
|
|
|
|
|
|
$left=$l; |
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
elsif ($strand eq "-"){ |
|
376
|
|
|
|
|
|
|
if ($u > 0){ |
|
377
|
|
|
|
|
|
|
$start = $end; |
|
378
|
|
|
|
|
|
|
$l = $u; |
|
379
|
|
|
|
|
|
|
} |
|
380
|
|
|
|
|
|
|
if ($d > 0){ |
|
381
|
|
|
|
|
|
|
$end = $start; |
|
382
|
|
|
|
|
|
|
$r = $d; |
|
383
|
|
|
|
|
|
|
} |
|
384
|
|
|
|
|
|
|
$right=$l; |
|
385
|
|
|
|
|
|
|
$left=$r; |
|
386
|
|
|
|
|
|
|
} |
|
387
|
|
|
|
|
|
|
if (($right-$width) <= 0){ |
|
388
|
|
|
|
|
|
|
$right = 0; |
|
389
|
|
|
|
|
|
|
} |
|
390
|
|
|
|
|
|
|
else{ |
|
391
|
|
|
|
|
|
|
$right-=$width; |
|
392
|
|
|
|
|
|
|
} |
|
393
|
|
|
|
|
|
|
if (($left-$width) <= 0 ){ |
|
394
|
|
|
|
|
|
|
$left = 0; |
|
395
|
|
|
|
|
|
|
} |
|
396
|
|
|
|
|
|
|
else{ |
|
397
|
|
|
|
|
|
|
$left-=$width; |
|
398
|
|
|
|
|
|
|
} |
|
399
|
|
|
|
|
|
|
if ( $start-$left >= 1 ){ |
|
400
|
|
|
|
|
|
|
if ($end+$right >= $sizes{"chr".$chrom}){ |
|
401
|
|
|
|
|
|
|
$end = $sizes{"chr".$chrom}; |
|
402
|
|
|
|
|
|
|
} |
|
403
|
|
|
|
|
|
|
else{ |
|
404
|
|
|
|
|
|
|
$end += $right; |
|
405
|
|
|
|
|
|
|
} |
|
406
|
|
|
|
|
|
|
$start -= $left; |
|
407
|
|
|
|
|
|
|
} |
|
408
|
|
|
|
|
|
|
elsif ( $start-$left <= 0 ){ |
|
409
|
|
|
|
|
|
|
$start = 0; |
|
410
|
|
|
|
|
|
|
if ($end+$right >= $sizes{"chr".$chrom}){ |
|
411
|
|
|
|
|
|
|
$end = $sizes{"chr".$chrom}; |
|
412
|
|
|
|
|
|
|
} |
|
413
|
|
|
|
|
|
|
else{ |
|
414
|
|
|
|
|
|
|
$end = $end+$right; |
|
415
|
|
|
|
|
|
|
} |
|
416
|
|
|
|
|
|
|
} |
|
417
|
|
|
|
|
|
|
else{ |
|
418
|
|
|
|
|
|
|
die "Something wrong here!\n"; |
|
419
|
|
|
|
|
|
|
} |
|
420
|
|
|
|
|
|
|
$feature->start($start); |
|
421
|
|
|
|
|
|
|
$feature->end($end); |
|
422
|
|
|
|
|
|
|
} |
|
423
|
|
|
|
|
|
|
$extendchain->type('extended'); |
|
424
|
|
|
|
|
|
|
return($extendchain); |
|
425
|
|
|
|
|
|
|
} |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
sub parse_bed6{ |
|
428
|
|
|
|
|
|
|
my $bedfile = shift; |
|
429
|
|
|
|
|
|
|
open (my $Bed, "<:gzip(autopop)",$bedfile) or die "$!"; |
|
430
|
|
|
|
|
|
|
my @featurelist; ## This will become a FeatureChain |
|
431
|
|
|
|
|
|
|
while(<$Bed>){ |
|
432
|
|
|
|
|
|
|
### This should be done by FeatureIO |
|
433
|
|
|
|
|
|
|
chomp (my $raw = $_); |
|
434
|
|
|
|
|
|
|
push my @line , split (/\t/,$raw); |
|
435
|
|
|
|
|
|
|
push @line, "\." if ( !$line[5] ); |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
(my $chromosome = $line[0])=~ s/chr//g; |
|
438
|
|
|
|
|
|
|
my $start = $line[1]; |
|
439
|
|
|
|
|
|
|
my $end = $line[2]; |
|
440
|
|
|
|
|
|
|
my $name = $line[3]; |
|
441
|
|
|
|
|
|
|
my $score = $line[4]; |
|
442
|
|
|
|
|
|
|
my $strand = $line[5]; |
|
443
|
|
|
|
|
|
|
my $extension = ''; |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
if ($line[6]){ |
|
446
|
|
|
|
|
|
|
for (6..$#line){ |
|
447
|
|
|
|
|
|
|
$extension .= $line[$_]."\t"; |
|
448
|
|
|
|
|
|
|
} |
|
449
|
|
|
|
|
|
|
$extension = substr($extension,0,-1); |
|
450
|
|
|
|
|
|
|
} |
|
451
|
|
|
|
|
|
|
my $feat = Bio::ViennaNGS::Feature->new(chromosome=>$chromosome, |
|
452
|
|
|
|
|
|
|
start=>$start, |
|
453
|
|
|
|
|
|
|
end=>$end, |
|
454
|
|
|
|
|
|
|
name=>$name, |
|
455
|
|
|
|
|
|
|
score=>$score, |
|
456
|
|
|
|
|
|
|
strand=>$strand, |
|
457
|
|
|
|
|
|
|
extension=>$extension); |
|
458
|
|
|
|
|
|
|
push @featurelist, $feat; |
|
459
|
|
|
|
|
|
|
} |
|
460
|
|
|
|
|
|
|
return (\@featurelist); |
|
461
|
|
|
|
|
|
|
} |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
sub fetch_chrom_sizes{ |
|
464
|
|
|
|
|
|
|
my $species = shift; |
|
465
|
|
|
|
|
|
|
my %sizes; |
|
466
|
|
|
|
|
|
|
my @chromsize; |
|
467
|
|
|
|
|
|
|
my $this_function = (caller(0))[3]; |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
my $test_fetchChromSizes = can_run('fetchChromSizes') or |
|
470
|
|
|
|
|
|
|
say "ERROR [$this_function] fetchChromSizes utility not found"; |
|
471
|
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
my $cmd = "fetchChromSizes $species"; |
|
473
|
|
|
|
|
|
|
my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) = run(command => $cmd, verbose => 0); |
|
474
|
|
|
|
|
|
|
if ($success){ |
|
475
|
|
|
|
|
|
|
@chromsize = @{$stdout_buf}; |
|
476
|
|
|
|
|
|
|
} |
|
477
|
|
|
|
|
|
|
else{ |
|
478
|
|
|
|
|
|
|
print STDERR "Using UCSCs fetchChromSizes failed, trying alternative mysql fetch!\n"; |
|
479
|
|
|
|
|
|
|
my $test_fetchChromSizes = can_run('mysql') or |
|
480
|
|
|
|
|
|
|
die "ERROR [$this_function] mysql utility not found"; |
|
481
|
|
|
|
|
|
|
$cmd = "mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \"select chrom, size from $species.chromInfo\""; ### Alternative to UCSC fetchChromSizes, has mysql dependency |
|
482
|
|
|
|
|
|
|
my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) = run(command => $cmd, verbose => 0); |
|
483
|
|
|
|
|
|
|
if ($success){ |
|
484
|
|
|
|
|
|
|
@chromsize = @{$stdout_buf}; |
|
485
|
|
|
|
|
|
|
} |
|
486
|
|
|
|
|
|
|
else{ |
|
487
|
|
|
|
|
|
|
print STDERR "ERROR [$this_function] External command call unsuccessful\n"; |
|
488
|
|
|
|
|
|
|
print STDERR "ERROR: this is what the command printed:\n"; |
|
489
|
|
|
|
|
|
|
print join "", @$full_buf; |
|
490
|
|
|
|
|
|
|
croak $!; |
|
491
|
|
|
|
|
|
|
die "Fetching of chromosome sizes failed, please either download fetchChromSizes from the UCSC script collection, or install mysql!\n"; |
|
492
|
|
|
|
|
|
|
} |
|
493
|
|
|
|
|
|
|
} |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
foreach (@chromsize){ |
|
496
|
|
|
|
|
|
|
chomp($_); |
|
497
|
|
|
|
|
|
|
foreach (split(/\n/,$_)){ |
|
498
|
|
|
|
|
|
|
my ($chr,$size)=split (/\t/,$_); |
|
499
|
|
|
|
|
|
|
$sizes{$chr}=$size; |
|
500
|
|
|
|
|
|
|
} |
|
501
|
|
|
|
|
|
|
} |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
return(\%sizes); |
|
504
|
|
|
|
|
|
|
} |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
1; |
|
507
|
|
|
|
|
|
|
__END__ |
|
508
|
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=head1 NAME |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
Bio::ViennaNGS::Util - Utility routines for Next-Generation Sequencing data |
|
513
|
|
|
|
|
|
|
analysis |
|
514
|
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
use Bio::ViennaNGS::Util; |
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
# make bigWig from BED or BAM |
|
520
|
|
|
|
|
|
|
$type = "bam"; |
|
521
|
|
|
|
|
|
|
$strand = "+"; |
|
522
|
|
|
|
|
|
|
$bwfile = bed_or_bam2bw($type,$infile,$cs_in,$strand,$destdir,$wantnorm,$size_p,$scale,$logfile); |
|
523
|
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
# make bigBed from BED |
|
525
|
|
|
|
|
|
|
my $bb = bed2bigBed($bed_in,$cs_in,$destdir,$logfile); |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
# sort a BED file |
|
528
|
|
|
|
|
|
|
sortbed($bed_in,$destdir,$bed_out,$rm_orig,$logfile) |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
# compute transcript abundance in TPM |
|
531
|
|
|
|
|
|
|
$meanTPM = computeTPM($sample,$readlength); |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
# parse a bedtools multicov compatible file |
|
534
|
|
|
|
|
|
|
$conds = parse_multicov($infile); |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
# write bedtools multicov compatible file |
|
537
|
|
|
|
|
|
|
write_multicov("TPM",$destdir,$basename); |
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
Bio::ViennaNGS::Util is a collection of utility subroutines for |
|
542
|
|
|
|
|
|
|
building efficient Next-Generation Sequencing (NGS) data analysis |
|
543
|
|
|
|
|
|
|
pipelines. |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=head2 ROUTINES |
|
546
|
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=over |
|
548
|
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
=item bed_or_bam2bw($type,$infile,$chromsizes,$strand,$dest,$want_norm,$size,$scale,$log) |
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
Creates stranded, normalized BigWig coverage profiles from |
|
552
|
|
|
|
|
|
|
strand-specific BAM or BED files (provided via C<$infile>). The |
|
553
|
|
|
|
|
|
|
routine expects a file type 'bam' or 'bed' via the C<$type> |
|
554
|
|
|
|
|
|
|
argument. C<$chromsizes> is the chromosome.sizes files, C<$strand> is |
|
555
|
|
|
|
|
|
|
either "+" or "-" and C<$dest> contains the output path for |
|
556
|
|
|
|
|
|
|
results. For normlization of bigWig profiles, additional attributes |
|
557
|
|
|
|
|
|
|
are required: C<$want_norm> triggers normalization with values 0 or |
|
558
|
|
|
|
|
|
|
1. C<$size> is the number of fragments/elements in the BAM or BED file |
|
559
|
|
|
|
|
|
|
and C<$scale> gives the number to which data is normalized (ie. every |
|
560
|
|
|
|
|
|
|
bedGraph entry is multiplied by a factor (C<$scale>/C<$size>). C<$log> |
|
561
|
|
|
|
|
|
|
is expected to contain either the full path and file name of log file |
|
562
|
|
|
|
|
|
|
or 'undef'. The routine returns the full file name of the newly |
|
563
|
|
|
|
|
|
|
generated bigWig file. |
|
564
|
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
While this routine can handle non-straned BAM/BED files (in which case |
|
566
|
|
|
|
|
|
|
C<$strand> should be set to "+" and hence all coverage profiles will |
|
567
|
|
|
|
|
|
|
be created with a positive sign, even if they map to the negative |
|
568
|
|
|
|
|
|
|
strand), usage of strand-specific data is highly recommended. For BAM |
|
569
|
|
|
|
|
|
|
file, this is easily achieved by calling the L<bam_split> routine (see |
|
570
|
|
|
|
|
|
|
above) prior to this one, thus creating dedicated BAM files containing |
|
571
|
|
|
|
|
|
|
exclusively reads mapped to the positive or negative strand, |
|
572
|
|
|
|
|
|
|
respectively. |
|
573
|
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
It is important to know that this routine B<does not extract> reads |
|
575
|
|
|
|
|
|
|
mapped to either strand from a non-stranded BAM/BED file if the |
|
576
|
|
|
|
|
|
|
C<$strand> argument is given. It rather adjusts the sign of B<all> |
|
577
|
|
|
|
|
|
|
mapped reads/features in a BAM/BED file and then creates bigWig |
|
578
|
|
|
|
|
|
|
files. See the L<split_bam> routine for extracting reads mapped to |
|
579
|
|
|
|
|
|
|
either strand. |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
Stranded bigWigs can easily be visualized via |
|
582
|
|
|
|
|
|
|
L<TrackHubs|http://genome.ucsc.edu/goldenPath/help/hgTrackHubHelp.html> |
|
583
|
|
|
|
|
|
|
in the UCSC Genome Browser. Internally, the conversion from BAM/BED to |
|
584
|
|
|
|
|
|
|
bigWig is accomplished via two third-party applications: |
|
585
|
|
|
|
|
|
|
L<genomeCoverageBed|http://bedtools.readthedocs.org/en/latest/content/tools/genomecov.html> |
|
586
|
|
|
|
|
|
|
and |
|
587
|
|
|
|
|
|
|
L<bedGraphToBigWig|http://hgdownload.cse.ucsc.edu/admin/exe/>. Intermediate |
|
588
|
|
|
|
|
|
|
bedGraph files are removed automatically once the bigWig files are |
|
589
|
|
|
|
|
|
|
ready. |
|
590
|
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
=item sortbed($infile,$dest,$outfile,$rm_orig,$log) |
|
592
|
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
Sorts BED file C<$infile> with F<bedtools sortt>. C<$dest> and |
|
594
|
|
|
|
|
|
|
C<outfile> name path and filename of the resulting sorted BED |
|
595
|
|
|
|
|
|
|
file. C<$rm_infile> is either 1 or 0 and indicated whether the |
|
596
|
|
|
|
|
|
|
original C<$infile> should be deleted. C<$log> holds path and name of |
|
597
|
|
|
|
|
|
|
log file. |
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
=item bed2bigBed($infile,$chromsizes,$dest,$log) |
|
600
|
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
Creates an indexed bigBed file from a BED file. C<$infile> is the BED |
|
602
|
|
|
|
|
|
|
file to be transformed, C<$chromsizes> is the chromosome.sizes file |
|
603
|
|
|
|
|
|
|
and C<$dest> contains the output path for results. C<$log> is the name |
|
604
|
|
|
|
|
|
|
of a log file, or undef if no logging is reuqired. A '.bed', '.bed6' |
|
605
|
|
|
|
|
|
|
or '.bed12' suffix in C<$infile> will be replace by '.bb' in the |
|
606
|
|
|
|
|
|
|
output. Else, the name of the output bigBed file will be the value of |
|
607
|
|
|
|
|
|
|
C<$infile> plus '.bb' appended. |
|
608
|
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
The conversion from BED to bigBed is done by a third-party utility |
|
610
|
|
|
|
|
|
|
(bedToBigBed), which is executed by IPC::Cmd. |
|
611
|
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=item computeTPM($featCount_sample,$rl) |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
Computes expression in Transcript per Million (TPM) [Wagner |
|
615
|
|
|
|
|
|
|
et.al. Theory Biosci. (2012)]. C<$featCount_sample> is a reference to |
|
616
|
|
|
|
|
|
|
a Hash of Hashes data straucture where keys are feature names and |
|
617
|
|
|
|
|
|
|
values hold a hash that must at least contain length and raw read |
|
618
|
|
|
|
|
|
|
counts. Practically, C<$featCount_sample> is represented by _one_ |
|
619
|
|
|
|
|
|
|
element of C<@featCount>, which is populated from a multicov file by |
|
620
|
|
|
|
|
|
|
C<parse_multicov()>. C<$rl> is the read length of the sequencing run. |
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
Returns the mean TPM of the processed sample, which is invariant among |
|
623
|
|
|
|
|
|
|
samples. (TPM models relative molar concentration and thus fulfills |
|
624
|
|
|
|
|
|
|
the invariant average criterion.) |
|
625
|
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
=item parse_multicov($file) |
|
627
|
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
Parse a bedtools multicov (multiBamCov) file, i.e. an extended BED6 |
|
629
|
|
|
|
|
|
|
file, into an Array of Hash of Hashes data structure |
|
630
|
|
|
|
|
|
|
(C<@featCount>). C<$file> is the input file. Returns the number of |
|
631
|
|
|
|
|
|
|
samples present in the multicov file, ie. the numner of columns |
|
632
|
|
|
|
|
|
|
extending the canonical BED6 columns in the input multicov file. |
|
633
|
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
=item write_multicov($item,$dest,$base_name) |
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
Write C<@featCount> data to a bedtools multicov (multiBamCov)-type |
|
637
|
|
|
|
|
|
|
file. C<$item> specifies the type of information from C<@featCount> |
|
638
|
|
|
|
|
|
|
HoH entries, e.g. TPM or RPKM. These values must have been computed |
|
639
|
|
|
|
|
|
|
and inserted into C<@featCount> beforehand by |
|
640
|
|
|
|
|
|
|
e.g. C<computeTPM()>. C<$dest> gives the absolute path and |
|
641
|
|
|
|
|
|
|
C<$base_name> the basename (will be extended by C<$item>.csv) of the |
|
642
|
|
|
|
|
|
|
output file. |
|
643
|
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
=back |
|
645
|
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=over 7 |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=item L<Bio::Perl> >= 1.00690001 |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
=item L<BIO::DB::Sam> >= 1.39 |
|
653
|
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
=item L<File::Basename> |
|
655
|
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
=item L<File::Temp> |
|
657
|
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
=item L<Path::Class> |
|
659
|
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
=item L<IPC::Cmd> |
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=item L<Carp> |
|
663
|
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
=back |
|
665
|
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
L<Bio::ViennaNGS> uses third-party tools for computing intersections |
|
667
|
|
|
|
|
|
|
of BED files: F<bedtools intersect> from the |
|
668
|
|
|
|
|
|
|
L<BEDtools|http://bedtools.readthedocs.org/en/latest/content/tools/intersect.html> |
|
669
|
|
|
|
|
|
|
suite is used to compute overlaps and F<bedtools sort> is used to sort |
|
670
|
|
|
|
|
|
|
BED output files. Make sure that those third-party utilities are |
|
671
|
|
|
|
|
|
|
available on your system, and that hey can be found and executed by |
|
672
|
|
|
|
|
|
|
the Perl interpreter. We recommend installing the latest version of |
|
673
|
|
|
|
|
|
|
L<BEDtools|https://github.com/arq5x/bedtools2> on your system. |
|
674
|
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
=head1 AUTHORS |
|
676
|
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
=over |
|
678
|
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
=item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
=item Jörg Fallmann E<lt>fall@tbi.univie.ac.atE<gt> |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
=back |
|
684
|
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
Copyright (C) 2014 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
690
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.12.4 or, |
|
691
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
|
692
|
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
This software is distributed in the hope that it will be useful, but |
|
694
|
|
|
|
|
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of |
|
695
|
|
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
|
696
|
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
=cut |