File Coverage

blib/lib/Bio/ViennaNGS/BamStatSummary.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2015-02-06 16:28:21 mtw>
3              
4             package Bio::ViennaNGS::BamStatSummary;
5              
6 1     1   1303 use version; our $VERSION = qv('0.12_15');
  1         1  
  1         5  
7 1     1   63 use Moose;
  1         1  
  1         9  
8 1     1   5568 use Carp;
  1         2  
  1         60  
9 1     1   5 use POSIX qw(floor);
  1         2  
  1         8  
10 1     1   265 use Statistics::R;
  0            
  0            
11             use Data::Dumper;
12             use Path::Class;
13             use namespace::autoclean;
14             use Bio::ViennaNGS::BamStat;
15             use Tie::Hash::Indexed;
16             use File::Basename;
17              
18             has 'data' => (
19             is => 'ro',
20             isa => 'ArrayRef [Bio::ViennaNGS::BamStat]',
21             default => sub { [] },
22             );
23              
24             has 'countStat' => (
25             is => 'rw',
26             isa => 'HashRef',
27             predicate => 'has_countStat',
28             default => sub { {} },
29             # auto_deref => '1',
30             );
31              
32             has 'outpath' => (
33             is => 'rw',
34             isa => 'Str',
35             required => '1',
36             );
37              
38             has 'rlib' => (
39             is => 'rw',
40             isa => 'Str',
41             required => '1',
42             );
43              
44             has 'files' => (
45             is => 'rw',
46             isa => 'ArrayRef',
47             required => 1,
48             predicate => 'has_files',
49             );
50              
51             has 'control_match' => ( # provides stats how many mapped bases match the reference genome
52             is => 'rw',
53             isa => 'Bool',
54             default => '1',
55             predicate => 'has_control_match',
56             );
57              
58             has 'control_clip' => ( # provides stats how many bases are soft or hard clipped
59             is => 'rw',
60             isa => 'Bool',
61             default => '1',
62             predicate => 'has_control_clip',
63             );
64              
65             has 'control_split' => ( # provides stats how many/often mapped reads are split
66             is => 'rw',
67             isa => 'Bool',
68             default => '1',
69             predicate => 'has_control_split',
70             );
71              
72             has 'control_qual' => ( # provides stats on quality of the match
73             is => 'rw',
74             isa => 'Bool',
75             default => '1',
76             predicate => 'has_control_qual',
77             );
78              
79             has 'control_edit' => ( # provides stats on the edit distance between read and mapped reference
80             is => 'rw',
81             isa => 'Bool',
82             default => '1',
83             predicate => 'has_control_edit',
84             );
85              
86             has 'control_flag' => ( # analyses the sam bit flag for qual/strands/pair_vs_single reads
87             is => 'rw',
88             isa => 'Bool',
89             default => '1',
90             predicate => 'has_control_flag',
91             );
92              
93             has 'control_score' => ( # provides stats on per-base quality scores
94             is => 'rw',
95             isa => 'Bool',
96             default => '1',
97             predicate => 'has_control_score',
98             );
99              
100             has 'control_uniq' => ( # gives number and stats of multiplicity of readaligments
101             is => 'rw',
102             isa => 'Bool',
103             default => '1',
104             predicate => 'has_control_uniq',
105             );
106              
107             has 'is_segemehl' => ( # toggles to consider segemehl specific bam feature
108             is => 'rw',
109             isa => 'Bool',
110             default => '0',
111             predicate => 'has_is_segemehl',
112             );
113              
114             sub populate_data {
115             my ($self) = @_;
116             foreach my $bamfile (@{$self->files}){
117             #carp ">> processing $bamfile\n";
118             my $bo = Bio::ViennaNGS::BamStat->new(bam => $bamfile);
119             $bo->stat_singleBam();
120             push (@{$self->data}, $bo);
121             }
122             }
123              
124              
125             sub populate_countStat {
126             my ($self) = @_;
127              
128             tie my %hdr, 'Tie::Hash::Indexed';
129             %hdr = (
130             "sample" => "Sample",
131             "total_alignments" => "# Total alignments",
132             "mapped_reads" => "# Mapped reads",
133             "umapped_reads" => "# Unique mapped reads",
134             "mmapped_reads" => "# Multi mapped reads",
135             "aligned_pairs" => "# Aligned in pairs",
136             "aligned_mm" => "# Aligned mate missing",
137             "aligned_se" => "# Aligned single end",
138             "aligned_fwd" => "# Aligned forward strand",
139             "aligned_rev" => "# Aligned reverse strand");
140             ${$self->countStat}{'header'} = \%hdr;
141              
142             foreach my $sample (@{$self->data}){
143             my ($basename,$dir,$ext) = fileparse($$sample{'bam'},qr/\.[^.]*/);
144             ${$self->countStat}{$basename}{'total_alignments'} = floor(0.5 + $$sample{'data_out'}->{'aln_count'}->{'total'} );
145             ${$self->countStat}{$basename}{'mapped_reads'} = floor(0.5 + $$sample{'data_out'}->{'uniq'}->{'mapped_reads'} );
146             ${$self->countStat}{$basename}{'umapped_reads'} = floor(0.5 + $$sample{'data_out'}->{'uniq'}->{'uniq_mapped_reads'} );
147             ${$self->countStat}{$basename}{'mmapped_reads'} = floor(0.5 + $$sample{'data_out'}->{'uniq'}->{'mapped_reads'} - $$sample{'data_out'}->{'uniq'}->{'uniq_mapped_reads'} );
148             ${$self->countStat}{$basename}{'aligned_pairs'} = floor(0.5 + ($$sample{'data_out'}->{'aln_count'}->{'mapped_pair'})/2 );
149             ${$self->countStat}{$basename}{'aligned_mm'} = floor(0.5 + $$sample{'data_out'}->{'aln_count'}->{'unmapped_pair'} );
150             ${$self->countStat}{$basename}{'aligned_se'} = floor(0.5 + $$sample{'data_out'}->{'aln_count'}->{'mapped_single'} );
151             ${$self->countStat}{$basename}{'aligned_fwd'} = floor(0.5 + $$sample{'data_out'}->{'strand'}->{'forward'} );
152             ${$self->countStat}{$basename}{'aligned_rev'} = floor(0.5 + $$sample{'data_out'}->{'strand'}->{'reverse'} );
153             }
154             }
155              
156             sub dump_countStat {
157             my ($self,$how) = @_;
158             my $mn = "mapping_stats.csv";
159             my $fn = file($self->outpath,$mn);
160            
161             open(OUT, "> $fn") or croak "cannot open OUT $!";
162             print OUT join ("\t", values %{$self->countStat->{'header'} })."\n";
163              
164             foreach my $sample (keys %{$self->countStat} ){
165             next if ($sample eq 'header');
166             my @line = ();
167             foreach my $key ( keys %{$self->countStat->{'header'}} ) {
168             if ($key eq 'sample'){
169             push (@line, $sample);
170             next;
171             }
172             push @line, $self->countStat->{$sample}->{$key};
173             }
174             print OUT join ("\t", @line)."\n";
175             }
176              
177             # print Dumper($self->countStat);
178             close (OUT);
179             }
180              
181             sub make_BarPlot{
182             my ($self) = @_;
183             my @Rstat_data_count = ();
184              
185             ## collect data for read.table string
186             push @Rstat_data_count, 'Samples', grep {!/header/} keys %{$self->countStat}; # first line with sample names
187             $Rstat_data_count[-1]="$Rstat_data_count[-1]\n"; # end 1st line
188              
189             push @Rstat_data_count, 'aligned_se';
190             foreach my $sample ( grep {!/header/} keys %{$self->countStat} ) { # 2nd line with single end aln counts
191             push @Rstat_data_count, $self->countStat->{$sample}{'aligned_se'};
192             }
193             $Rstat_data_count[-1]="$Rstat_data_count[-1]\n"; # end 2nd line
194              
195             push @Rstat_data_count, 'aligned_pairs';
196             foreach my $sample ( grep {!/header/} keys %{$self->countStat} ) { # 3rd line with aligned pairs count
197             push @Rstat_data_count, $self->countStat->{$sample}{'aligned_pairs'};
198             }
199             $Rstat_data_count[-1]="$Rstat_data_count[-1]\n"; # end 3rd line
200              
201             push @Rstat_data_count, 'aligned_mm';
202             foreach my $sample ( grep {!/header/} keys %{$self->countStat} ) { # 4th line with incomplete aligned pairs
203             push @Rstat_data_count, $self->countStat->{$sample}{'aligned_mm'};
204             }
205             $Rstat_data_count[-1]="$Rstat_data_count[-1]\n"; # end 4th line
206              
207             ## produce bar plot
208             my $mn = "mapping_stats.pdf";
209             my $fn = file($self->outpath,$mn);
210             my $datastring = join(" ", @Rstat_data_count);
211             $self->plot_barplot($fn, "Mapped reads", $datastring); # produce plot with read.table string input
212             }
213              
214             sub plot_barplot { #plot barplot read.table text string
215             my ($self, $filename, $ylab, $data_string) = @_;
216             my ($bn,$odir,$ext) = fileparse($filename, qr /\..*/);
217             #my $rlibpath = '/usr/bin/R';
218             my $rlibpath = $self->rlib;
219              
220             $filename .= '.pdf' unless ($ext eq '.pdf');
221              
222             my $R = Statistics::R->new();
223             $R->startR;
224             $R->set('rlib', $rlibpath);
225             $R->set('log_dir', $odir);
226             $R->run("pdf('${filename}')") ;
227             $R->run("dat<-read.table(text = \"$data_string\", header = TRUE, row.names=1)") ;
228             $R->run("dat_m<-as.matrix(dat)") ;
229             ##$R->run("colors<-terrain.colors(nrow(dat_m), alpha = 1)") ;
230             $R->run("colors<-c('lightblue','lightgreen','lightcoral', terrain.colors(nrow(dat_m)-3, alpha = 1))") ;
231             $R->run("types<-row.names(dat_m)") ;
232             $R->run("par(mar = c(15,3,5,5), oma = c(1, 1, 4, 1))") ;
233             # $R->run("barplot(dat_m, xlim=c(0,ncol(dat_m)+2), col=colors, legend.text = TRUE, args.legend = list(x = ncol(dat_m) + 2, y=max(colSums(dat_m)), bty = 'n' ), ylab='$ylab', xlab='Samples')") ;
234             # $R->run("barplot(dat_m, xlim=c(0,ncol(dat_m)), col=colors, legend.text = TRUE, args.legend = list(x = ncol(dat_m) + 5, y=-5, bty = 'o' ), ylab='$ylab', xlab='Samples', las=3)") ;
235             # $R->run("barplot(dat_m, xlim=c(0,ncol(dat_m)), col=colors, legend.text = TRUE, args.legend = list(\"topright\", horiz = TRUE, bty = 'o' ), ylab='$ylab', xlab='', las=3)") ;
236             $R->run("barplot(dat_m, xlim=c(0,ncol(dat_m)), col=colors, ylab='$ylab', xlab='', las=3)") ;
237             $R->run("par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0),mar = c(0, 0, 0, 0), new = TRUE)") ;
238             $R->run("legend('top', types, horiz = TRUE, inset = c(0,0), bty = 'n', fill = colors, cex = 1.2 )") ;
239             $R->run("dev.off()") ;
240             $R->stopR;
241             }
242              
243              
244             sub make_BoxPlot{
245             my ($self, $whattodo) = @_;
246             my @Rstat_data = ();
247             my @Rstat_length = ();
248             my @Rstat_names = ();
249            
250             ## collect data for read.table string
251             foreach my $sample (@{$self->data}){
252             my ($basename,$dir,$ext) = fileparse($$sample{'bam'},qr/\.[^.]*/);
253            
254             push @Rstat_data, statsstring(@{$$sample{$whattodo}});
255             push @Rstat_length, scalar(@{$$sample{$whattodo}});
256             push @Rstat_names, "'$basename'",
257            
258             }
259              
260             my $data_string="summarydata<-list(stats=matrix(c(".join(",",@Rstat_data)."),5,".scalar(@Rstat_names)."), n=c(".join(",",@Rstat_length)."), names=c(".join(",",@Rstat_names)."))";
261              
262             ## produce box plot
263             if(@Rstat_data){
264            
265             my $mn = "${whattodo}_stats.pdf";
266             my $fn = file($self->outpath,$mn);
267            
268             $self->plot_bxplot($fn, $whattodo, $data_string);
269             }
270             }
271              
272             sub plot_bxplot{
273             my ($self, $filename, $ylab, $datacommand_string) = @_;
274             my ($bn,$odir,$ext) = fileparse($filename, qr /\..*/);
275             my $rlibpath = $self->rlib;
276             #my $rlibpath = '/usr/bin/R';
277             $filename .= '.pdf' unless ($ext eq '.pdf');
278            
279             my $R = Statistics::R->new();
280             $R->startR;
281             $R->set('rlib', $rlibpath);
282             $R->set('log_dir', $odir);
283             $R->run("pdf('${filename}')") ;
284             $R->run("$datacommand_string") ;
285             $R->run("bxp(summarydata, medcol = 'red', ylab='$ylab', xlab='',las=3)") ;
286             $R->run("dev.off()") ;
287             $R->stopR;
288             }
289              
290             sub statsstring{
291             # usage: %h = %{stats(@a)};
292             my @vals = sort {$a <=> $b} @_;
293             my %stats = ();
294             my @statstring = ();
295              
296             if(@vals){
297             push @statstring, sprintf("%.2f", &min(\@vals)); ## min
298             push @statstring, sprintf("%.2f", $vals[int(@vals/4)]); ## 1.quartile
299             if(@vals%2){
300             push @statstring, $vals[int(@vals/2)]; ## odd median
301             }
302             else{
303             push @statstring, ($vals[int(@vals/2)-1] + $vals[int(@vals/2)])/2; ## even median
304             }
305             push @statstring, sprintf("%.2f", $vals[int((@vals*3)/4)]); ## 3.quartile
306             push @statstring, sprintf("%.2f", &max(\@vals)); ## max
307             }
308             else{
309             @statstring=qw/0 0 0 0 0/;
310             }
311             return(@statstring);
312             }
313              
314             sub max { # usage: $h = %{max(\@a)};
315             my ($arrayref) = @_;
316             my $max = $arrayref->[0];
317             foreach (@$arrayref) {$max = $_ if $_ > $max}
318             return $max;
319             }
320              
321             sub min { # usage: $h = %{min(\@a)};
322             my ($arrayref) = @_;
323             my $min = $arrayref->[0];
324             foreach (@$arrayref) {$min = $_ if $_ < $min}
325             return $min;
326             }
327              
328             1;
329              
330             __END__
331              
332              
333             =head1 NAME
334              
335             Bio::ViennaNGS::BamStatSummary - Moose interface to analyze, summarize
336             and compare BAM mapping statistics data structure produced by
337             Bio::ViennaNGS::BamStat
338              
339             =head1 SYNOPSIS
340              
341             use Bio::ViennaNGS::BamStatSummary;
342              
343             $bamsummary->populate_data();
344             $bamsummary->populate_countStat();
345             $bamsummary->dump_countStat("csv");
346             $bamsummary->make_BarPlot();
347              
348             $bamsummary->make_BoxPlot("data_edit" ) if( $bamsummary->has_control_edit );
349             $bamsummary->make_BoxPlot("data_clip" ) if( $bamsummary->has_control_clip );
350             $bamsummary->make_BoxPlot("data_match") if( $$bamsummary->has_control_match );
351             $bamsummary->make_BoxPlot("data_qual" ) if( $bamsummary->has_control_qual );
352              
353              
354             =head1 DESCRIPTION
355              
356             This module provides a L<Moose> interface to process the mapping
357             statistics of single BAM file. It uses the data structure as produced
358             by L<Bio::ViennaNGS::BamStat>, summarizes the data and compares
359             different BAM files. Output is written both as CSV files and graphical
360             representation of the results. Internally, this modules build on
361             L<Statistics::R>.
362              
363              
364             =head1 DEPENDENCIES
365              
366             =over
367              
368             =item L<Statistics::R>
369              
370             =item L<Path::Class>
371              
372             =item L<Tie::Hash::Indexed>
373              
374             =item L<Moose>
375              
376             =back
377              
378             =head1 SEE ALSO
379              
380             =over
381              
382             =item L<Bio::ViennaNGS>
383              
384             =item L<Bio::ViennaNGS::BamStat>
385              
386             =back
387              
388             =head1 AUTHORS
389              
390             =over
391              
392             =item Fabian Amman E<lt>fabian@tbi.univie.ac.atE<gt>
393              
394             =item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
395              
396             =back
397              
398             =head1 COPYRIGHT AND LICENSE
399              
400             Copyright (C) 2015 by Michael T. Wolfinger
401              
402             This library is free software; you can redistribute it and/or modify
403             it under the same terms as Perl itself, either Perl version 5.10.0 or,
404             at your option, any later version of Perl 5 you may have available.
405              
406             This software is distributed in the hope that it will be useful, but
407             WITHOUT ANY WARRANTY; without even the implied warranty of
408             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
409              
410             =cut