File Coverage

blib/lib/Bio/ViennaNGS/Util.pm
Criterion Covered Total %
statement 36 284 12.6
branch 0 126 0.0
condition 0 9 0.0
subroutine 12 22 54.5
pod 3 10 30.0
total 51 451 11.3


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2015-02-09 09:19:47 mtw>
3              
4              
5             package Bio::ViennaNGS::Util;
6              
7 1     1   3 use Exporter;
  1         2  
  1         35  
8 1     1   7 use version; our $VERSION = qv('0.12_16');
  1         1  
  1         4  
9 1     1   52 use strict;
  1         1  
  1         21  
10 1     1   3 use warnings;
  1         1  
  1         20  
11 1     1   3 use Data::Dumper;
  1         1  
  1         77  
12 1     1   4 use File::Basename qw(fileparse);
  1         1  
  1         74  
13 1     1   734 use IPC::Cmd qw(can_run run);
  1         72350  
  1         61  
14 1     1   468 use Path::Class qw(dir file);
  1         24858  
  1         138  
15 1     1   8 use File::Path qw(make_path remove_tree);
  1         2  
  1         41  
16 1     1   438 use Math::Round;
  1         814  
  1         50  
17 1     1   6 use Carp;
  1         1  
  1         47  
18 1     1   347 use Bio::ViennaNGS::FeatureChain;
  1         2  
  1         2504  
19              
20             our @ISA = qw(Exporter);
21             our @EXPORT = ();
22              
23             our @EXPORT_OK = qw ( bed_or_bam2bw sortbed bed2bigBed unique_array
24             kmer_enrichment extend_chain parse_bed6
25             fetch_chrom_sizes mkdircheck rmdircheck);
26              
27              
28             #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
29             #^^^^^^^^^^ Variables ^^^^^^^^^^^#
30             #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
31              
32             my %unique = ();
33              
34             #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
35             #^^^^^^^^^^^ Subroutines ^^^^^^^^^^#
36             #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
37              
38             sub bed_or_bam2bw {
39 0     0 1   my ($type,$infile,$chromsizes,$strand,$dest,$want_norm,$size,$scale,$log) = @_;
40 0           my ($fn_bg_tmp,$fn_bg,$fn_bw);
41 0           my ($bn,$path,$ext,$cmd);
42 0           my @processed_files = ();
43 0           my $factor = 1.;
44 0           my $this_function = (caller(0))[3];
45              
46 0 0 0       croak "ERROR [$this_function] \$type is '$type', however it is expected to be either 'bam' or 'bed'\n"
47             unless ($type eq "bam") || ($type eq "bed");
48              
49 0 0         my $genomeCoverageBed = can_run('genomeCoverageBed') or
50             croak "ERROR [$this_function] genomeCoverageBed utility not found";
51 0 0         my $bedGraphToBigWig = can_run('bedGraphToBigWig') or
52             croak "ERROR [$this_function] bedGraphToBigWig utility not found";
53 0 0         my $awk = can_run('awk') or
54             croak "ERROR [$this_function] awk utility not found";
55              
56 0 0         if(defined $log){
57 0 0         open(LOG, ">>", $log) or croak $!;
58 0           print LOG "LOG [$this_function] \$infile: $infile\n";
59 0           print LOG "LOG [$this_function] \$dest: $dest\n";
60 0           print LOG "LOG [$this_function] \$chromsizes: $chromsizes\n";
61             }
62              
63 0 0         croak "ERROR [$this_function] Cannot find $infile\n"
64             unless (-e $infile);
65 0 0         croak "ERROR [$this_function] $dest does not exist\n"
66             unless (-d $dest);
67 0 0         croak "ERROR [$this_function] Cannot find $chromsizes\n"
68             unless (-e $chromsizes);
69              
70 0 0         if ($want_norm == 1){
71 0           $factor = $scale/$size;
72 0 0         print LOG "LOG [$this_function] normalization: $factor = ($scale/$size)\n"
73             if(defined $log);
74             }
75              
76 0           ($bn,$path,$ext) = fileparse($infile, qr /\..*/);
77 0           $fn_bg_tmp = file($dest,$bn.".tmp.bg");
78 0           $fn_bg = file($dest,$bn.".bg");
79 0 0         if($strand eq "+"){
80 0           $fn_bw = file($dest,$bn.".pos.bw");
81             }
82             else {
83 0           $fn_bw = file($dest,$bn.".neg.bw");
84             }
85              
86 0           $cmd = "$genomeCoverageBed -bg -scale $factor -split ";
87 0 0         if ($type eq "bed"){ $cmd .= "-i $infile -g $chromsizes"; } # chrom.sizes only required for processing BED
  0            
88 0           else { $cmd .= "-ibam $infile "; }
89 0           $cmd .= " > $fn_bg_tmp";
90              
91 0 0         if($strand eq "-"){
92 0           $cmd .= " && cat $fn_bg_tmp | $awk \'{ \$4 = - \$4 ; print \$0 }\' > $fn_bg";
93             }
94             else{
95 0           $fn_bg = $fn_bg_tmp;
96             }
97 0           $cmd .= " && $bedGraphToBigWig $fn_bg $chromsizes $fn_bw";
98              
99 0 0         if (defined $log){ print LOG "LOG [$this_function] $cmd\n";}
  0            
100              
101 0           my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
102             run( command => $cmd, verbose => 0 );
103              
104 0 0         if( !$success ) {
105 0           print STDERR "ERROR [$this_function] External command call unsuccessful\n";
106 0           print STDERR "ERROR: this is what the command printed:\n";
107 0           print join "", @$full_buf;
108 0           croak $!;
109             }
110 0 0         if (defined $log){ close(LOG); }
  0            
111              
112 0           unlink ($fn_bg_tmp);
113 0           unlink ($fn_bg);
114 0           return $fn_bw;
115             }
116              
117             sub bed2bigBed {
118 0     0 1   my ($infile,$chromsizes,$dest,$log) = @_;
119 0           my ($bn,$path,$ext,$cmd,$outfile);
120 0           my $this_function = (caller(0))[3];
121 0 0         my $bed2bigBed = can_run('bedToBigBed') or
122             croak "ERROR [$this_function] bedToBigBed utility not found";
123              
124 0 0         if (defined $log){
125 0 0         open(LOG, ">>", $log) or croak $!;
126 0           print LOG "LOG [$this_function] \$infile: $infile -- \$chromsizes: $chromsizes --\$dest: $dest\n";
127             }
128              
129 0 0         croak "ERROR [$this_function] Cannot find $infile"
130             unless (-e $infile);
131 0 0         croak "ERROR [$this_function] Cannot find $chromsizes"
132             unless (-e $chromsizes);
133 0 0         croak "ERROR [$this_function] $dest does not exist"
134             unless (-d $dest);
135              
136             # .bed6 .bed12 extensions are replaced by .bb
137 0           ($bn,$path,$ext) = fileparse($infile, qr /\.bed[126]?/);
138 0           $outfile = file($dest, "$bn.bb");
139              
140 0           $cmd = "$bed2bigBed $infile -extraIndex=name -tab $chromsizes $outfile";
141 0 0         if (defined $log){ print LOG "LOG [$this_function] $cmd\n"; }
  0            
142 0           my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
143             run( command => $cmd, verbose => 0 );
144              
145 0 0         if( !$success ) {
146 0           print STDERR "ERROR [$this_function] Call to $bed2bigBed unsuccessful\n";
147 0           print STDERR "ERROR: this is what the command printed:\n";
148 0           print join "", @$full_buf;
149 0           croak $!;
150             }
151              
152 0 0         if (defined $log){ close(LOG); }
  0            
153              
154 0           return $outfile;
155             }
156              
157             sub sortbed {
158 0     0 1   my ($infile,$dest,$outfile,$rm_orig,$log) = @_;
159 0           my ($cmd,$out);
160 0           my $this_function = (caller(0))[3];
161 0 0         my $bedtools = can_run('bedtools') or
162             croak "ERROR [$this_function] bedtools utility not found";
163              
164 0 0         croak "ERROR [$this_function] Cannot find $infile"
165             unless (-e $infile);
166 0 0         croak "ERROR [$this_function] $dest does not exist"
167             unless (-d $dest);
168 0 0         if (defined $log){open(LOG, ">>", $log) or croak $!;}
  0 0          
169              
170 0           $out = file($dest,$outfile);
171 0           $cmd = "$bedtools sort -i $infile > $out";
172 0 0         if (defined $log){ print LOG "LOG [$this_function] $cmd\n"; }
  0            
173 0           my ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
174             run( command => $cmd, verbose => 0 );
175 0 0         if( !$success ) {
176 0           print STDERR "ERROR [$this_function] Call to $bedtools unsuccessful\n";
177 0           print STDERR "ERROR: this is what the command printed:\n";
178 0           print join "", @$full_buf;
179 0           croak $!;
180             }
181              
182 0 0         if($rm_orig){
183 0 0         unlink($infile) or
184             carp "WARN [$this_function] Could not unlink $infile";
185 0 0         if (defined $log){
186 0           print LOG "[$this_function] removed $infile $!\n";
187             }
188             }
189              
190 0 0         if (defined $log){ close(LOG); }
  0            
191             }
192              
193             sub unique_array{
194              
195 0     0 0   my $arrayref = shift;
196 0           my @array = @{$arrayref};
  0            
197              
198 0           foreach my $item (@array)
199             {
200 0           $unique{$item} ++;
201             }
202 0           my @arrayuid = sort {$a cmp $b} keys %unique;
  0            
203              
204 0           return(\@arrayuid);
205             }
206              
207             sub kmer_enrichment{
208              
209 0     0 0   my @seqs = @{$_[0]};
  0            
210 0           my $klen = $_[1];
211             # my @seq = split( //, $read_tmp );
212 0           my $kstring ='';
213             #return variables
214 0           my %km;
215 0           foreach my $sequences (@seqs){
216             # print STDERR $sequences,"\n";
217 0           my @seq = split( //, $sequences );
218 0           for ( my $seq_pos = 0; $seq_pos <= $#seq-$klen ; $seq_pos++ ) {
219 0           for (my $i=$seq_pos;$i<=$seq_pos+($klen-1);$i++){
220 0           $kstring .= $seq[$i];
221             }
222 0           $km{$kstring}++;
223 0           $kstring = "";
224             }
225             }
226 0           return( \%km );
227             }
228              
229             sub extend_chain{
230 0     0 0   my %sizes = %{$_[0]};
  0            
231 0           my $chain = $_[1];
232 0           my $l = $_[2];
233 0           my $r = $_[3];
234 0           my $u = $_[4];
235 0           my $d = $_[5];
236 0           my $e = $_[6];
237 0           my $this_function = (caller(0))[3];
238              
239             ##return a new chain with extended coordinates
240 0           my $extendchain = $chain -> clone();
241             ## got through all features in original chain, calculate new start and end and safe in extendchain
242 0           my @featarray = @{$extendchain->chain};
  0            
243 0           foreach my $feature (@featarray){
244 0           my $chrom = $feature->chromosome;
245 0           my $start = $feature->start;
246 0           my $end = $feature->end;
247 0           my $strand = $feature->strand;
248 0           my $right = 0;
249 0           my $left = 0;
250 0           my $width = nearest(1,($end-$start)/2);
251 0 0 0       $width = 0 if ($d > 0 || $u > 0 || !$e);
      0        
252 0 0         if ($strand eq "+"){
    0          
253 0 0         if ($d > 0){
254 0           $start = $end;
255 0           $r = $d;
256             }
257 0 0         if ($u > 0){
258 0           $end = $start;
259 0           $l = $u;
260             }
261 0           $right=$r;
262 0           $left=$l;
263             }
264             elsif ($strand eq "-"){
265 0 0         if ($u > 0){
266 0           $start = $end;
267 0           $l = $u;
268             }
269 0 0         if ($d > 0){
270 0           $end = $start;
271 0           $r = $d;
272             }
273 0           $right=$l;
274 0           $left=$r;
275             }
276 0 0         if (($right-$width) <= 0){
277 0           $right = 0;
278             }
279             else{
280 0           $right-=$width;
281             }
282 0 0         if (($left-$width) <= 0 ){
283 0           $left = 0;
284             }
285             else{
286 0           $left-=$width;
287             }
288 0 0         if ( $start-$left >= 1 ){
    0          
289 0 0         if ($end+$right >= $sizes{"chr".$chrom}){
290 0           $end = $sizes{"chr".$chrom};
291             }
292             else{
293 0           $end += $right;
294             }
295 0           $start -= $left;
296             }
297             elsif ( $start-$left <= 0 ){
298 0           $start = 0;
299 0 0         if ($end+$right >= $sizes{"chr".$chrom}){
300 0           $end = $sizes{"chr".$chrom};
301             }
302             else{
303 0           $end = $end+$right;
304             }
305             }
306             else{
307 0           croak "ERROR [$this_function] Something wrong here!\n";
308             }
309 0           $feature->start($start);
310 0           $feature->end($end);
311             }
312 0           $extendchain->type('extended');
313 0           return($extendchain);
314             }
315              
316             sub parse_bed6{
317 0     0 0   my $bedfile = shift;
318 0           my $this_function = (caller(0))[3];
319 0 0         open (my $Bed, "<:gzip(autopop)",$bedfile) or
320             croak "ERROR [$this_function] $!";
321 0           my @featurelist; ## This will become a FeatureChain
322 0           while(<$Bed>){
323             ### This should be done by FeatureIO
324 0           chomp (my $raw = $_);
325 0           push my @line , split (/\t/,$raw);
326 0 0         push @line, "\." if ( !$line[5] );
327              
328 0           (my $chromosome = $line[0])=~ s/chr//g;
329 0           my $start = $line[1];
330 0           my $end = $line[2];
331 0           my $name = $line[3];
332 0           my $score = $line[4];
333 0           my $strand = $line[5];
334 0           my $extension = '';
335              
336 0 0         if ($line[6]){
337 0           for (6..$#line){
338 0           $extension .= $line[$_]."\t";
339             }
340 0           $extension = substr($extension,0,-1);
341             }
342 0           my $feat = Bio::ViennaNGS::Feature->new(chromosome=>$chromosome,
343             start=>$start,
344             end=>$end,
345             name=>$name,
346             score=>$score,
347             strand=>$strand,
348             extension=>$extension);
349 0           push @featurelist, $feat;
350             }
351 0           return (\@featurelist);
352             }
353              
354             sub fetch_chrom_sizes{
355 0     0 0   my $species = shift;
356 0           my %sizes;
357             my @chromsize;
358 0           my $this_function = (caller(0))[3];
359              
360 0 0         my $test_fetchChromSizes = can_run('fetchChromSizes') or
361             croak "ERROR [$this_function] fetchChromSizes utility not found";
362              
363 0           my $cmd = "fetchChromSizes $species";
364 0           my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
365             run(command => $cmd, verbose => 0);
366 0 0         if ($success){
367 0           @chromsize = @{$stdout_buf};
  0            
368             }
369             else{
370 0           carp "WARN [$this_function] Using UCSCs fetchChromSizes failed, trying alternative mysql fetch!\n";
371 0 0         my $test_fetchChromSizes = can_run('mysql') or
372             croak "ERROR [$this_function] mysql utility not found";
373 0           $cmd = "mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \"select chrom, size from $species.chromInfo\""; ### Alternative to UCSC fetchChromSizes, has mysql dependency
374 0           my( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
375             run(command => $cmd, verbose => 0);
376 0 0         if ($success){
377 0           @chromsize = @{$stdout_buf};
  0            
378             }
379             else{
380 0           carp "ERROR [$this_function] External command call unsuccessful\n";
381 0           carp "ERROR [$this_function] this is what the command printed:\n";
382 0           print join "", @$full_buf;
383 0           confess "Fetching of chromosome sizes failed, please either download fetchChromSizes from the UCSC script collection, or install mysql!\n";
384             }
385             }
386              
387 0           foreach (@chromsize){
388 0           chomp($_);
389 0           foreach (split(/\n/,$_)){
390 0           my ($chr,$size)=split (/\t/,$_);
391 0           $sizes{$chr}=$size;
392             }
393             }
394              
395 0           return(\%sizes);
396             }
397              
398             sub mkdircheck {
399 0     0 0   my @dirstocreate=();
400 0           my $this_function = (caller(0))[3];
401 0           while(@_){
402 0           push @dirstocreate, shift(@_);
403             }
404 0           foreach (@dirstocreate){
405 0           my @total = split(/[\/\\]/,$_);
406 0           my $dir;
407 0           while(@total){
408 0 0         $dir = dir(shift(@total)) unless (defined $dir);
409 0           $dir = dir($dir,shift(@total));
410             }
411 0 0         return if (-d $dir);
412 0 0         make_path($dir,{ verbose => 1 }) or croak "Error creating directory: $dir\t$, in $this_function!";
413             }
414             }
415              
416             sub rmdircheck {
417 0     0 0   my @dirstorm=();
418 0           my $this_function = (caller(0))[3];
419 0           while(@_){
420 0           push @dirstorm, shift(@_);
421             }
422 0           foreach (@dirstorm){
423 0           my @total = split(/[\/\\]/,$_);
424 0           my $dir='';
425 0           while(@total){
426 0 0         $dir = dir(shift(@total)) unless (defined $dir);
427 0           $dir = dir($dir,shift(@total));
428             }
429 0 0         return if (!-d $dir);
430 0 0         remove_tree($dir,{ verbose => 1 }) or croak "Error deleting directory: $dir, in $this_function!";
431             }
432             }
433              
434             1;
435             __END__
436              
437              
438             =head1 NAME
439              
440             Bio::ViennaNGS::Util - Utility routines for Next-Generation Sequencing data
441             analysis
442              
443             =head1 SYNOPSIS
444              
445             use Bio::ViennaNGS::Util;
446              
447             # make bigWig from BED or BAM
448             $type = "bam";
449             $strand = "+";
450             $bwfile = bed_or_bam2bw($type,$infile,$cs_in,$strand,$destdir,$wantnorm,$size_p,$scale,$logfile);
451              
452             # make bigBed from BED
453             my $bb = bed2bigBed($bed_in,$cs_in,$destdir,$logfile);
454              
455             # sort a BED file
456             sortbed($bed_in,$destdir,$bed_out,$rm_orig,$logfile)
457              
458             =head1 DESCRIPTION
459              
460             Bio::ViennaNGS::Util is a collection of utility subroutines for
461             building efficient Next-Generation Sequencing (NGS) data analysis
462             pipelines.
463              
464             =head2 ROUTINES
465              
466             =over
467              
468             =item bed_or_bam2bw($type,$infile,$chromsizes,$strand,$dest,$want_norm,$size,$scale,$log)
469              
470             Creates stranded, normalized BigWig coverage profiles from
471             strand-specific BAM or BED files (provided via C<$infile>). The
472             routine expects a file type 'bam' or 'bed' via the C<$type>
473             argument. C<$chromsizes> is the chromosome.sizes files, C<$strand> is
474             either "+" or "-" and C<$dest> contains the output path for
475             results. For normlization of bigWig profiles, additional attributes
476             are required: C<$want_norm> triggers normalization with values 0 or
477             1. C<$size> is the number of fragments/elements in the BAM or BED file
478             and C<$scale> gives the number to which data is normalized (ie. every
479             bedGraph entry is multiplied by a factor (C<$scale>/C<$size>). C<$log>
480             is expected to contain either the full path and file name of log file
481             or 'undef'. The routine returns the full file name of the newly
482             generated bigWig file.
483              
484             While this routine can handle non-straned BAM/BED files (in which case
485             C<$strand> should be set to "+" and hence all coverage profiles will
486             be created with a positive sign, even if they map to the negative
487             strand), usage of strand-specific data is highly recommended. For BAM
488             file, this is easily achieved by calling the L<bam_split> routine (see
489             above) prior to this one, thus creating dedicated BAM files containing
490             exclusively reads mapped to the positive or negative strand,
491             respectively.
492              
493             It is important to know that this routine B<does not extract> reads
494             mapped to either strand from a non-stranded BAM/BED file if the
495             C<$strand> argument is given. It rather adjusts the sign of B<all>
496             mapped reads/features in a BAM/BED file and then creates bigWig
497             files. See the L<split_bam> routine for extracting reads mapped to
498             either strand.
499              
500             Stranded bigWigs can easily be visualized via
501             L<TrackHubs|http://genome.ucsc.edu/goldenPath/help/hgTrackHubHelp.html>
502             in the UCSC Genome Browser. Internally, the conversion from BAM/BED to
503             bigWig is accomplished via two third-party applications:
504             L<genomeCoverageBed|http://bedtools.readthedocs.org/en/latest/content/tools/genomecov.html>
505             and
506             L<bedGraphToBigWig|http://hgdownload.cse.ucsc.edu/admin/exe/>. Intermediate
507             bedGraph files are removed automatically once the bigWig files are
508             ready.
509              
510             =item sortbed($infile,$dest,$outfile,$rm_orig,$log)
511              
512             Sorts BED file C<$infile> with F<bedtools sort>. C<$dest> and
513             C<outfile> name path and filename of the resulting sorted BED
514             file. C<$rm_infile> is either 1 or 0 and indicated whether the
515             original C<$infile> should be deleted. C<$log> holds path and name of
516             log file.
517              
518             =item bed2bigBed($infile,$chromsizes,$dest,$log)
519              
520             Creates an indexed bigBed file from a BED file. C<$infile> is the BED
521             file to be transformed, C<$chromsizes> is the chromosome.sizes file
522             and C<$dest> contains the output path for results. C<$log> is the name
523             of a log file, or undef if no logging is reuqired. A '.bed', '.bed6'
524             or '.bed12' suffix in C<$infile> will be replaced by '.bb' in the
525             output. Else, the name of the output bigBed file will be the value of
526             C<$infile> plus '.bb' appended.
527              
528             The conversion from BED to bigBed is done by a third-party utility
529             (bedToBigBed), which is executed by L<IPC::Cmd>.
530              
531              
532             =back
533              
534             =head1 DEPENDENCIES
535              
536             =over
537              
538             =item L<Bio::ViennaNGS::FeatureChain>
539              
540             =item L<Bio::Perl> >= 1.00690001
541              
542             =item L<File::Basename>
543              
544             =item L<File::Path>
545              
546             =item L<Path::Class>
547              
548             =item L<IPC::Cmd>
549              
550             =item L<Math::Round>
551              
552             =item L<Carp>
553              
554             =back
555              
556             L<Bio::ViennaNGS> uses third-party tools for computing intersections
557             of BED files: F<bedtools intersect> from the
558             L<BEDtools|http://bedtools.readthedocs.org/en/latest/content/tools/intersect.html>
559             suite is used to compute overlaps and F<bedtools sort> is used to sort
560             BED output files. Make sure that those third-party utilities are
561             available on your system, and that hey can be found and executed by
562             the Perl interpreter. We recommend installing the latest version of
563             L<BEDtools|https://github.com/arq5x/bedtools2> on your system.
564              
565             =head1 AUTHORS
566              
567             =over
568              
569             =item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
570              
571             =item Joerg Fallmann E<lt>fall@tbi.univie.ac.atE<gt>
572              
573             =back
574              
575             =head1 COPYRIGHT AND LICENSE
576              
577             Copyright (C) 2015 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
578              
579             This library is free software; you can redistribute it and/or modify
580             it under the same terms as Perl itself, either Perl version 5.10.0 or,
581             at your option, any later version of Perl 5 you may have available.
582              
583             This software is distributed in the hope that it will be useful, but
584             WITHOUT ANY WARRANTY; without even the implied warranty of
585             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
586              
587             =cut