File Coverage

blib/lib/Bio/ViennaNGS/Bam.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2015-02-09 09:20:43 mtw>
3              
4             package Bio::ViennaNGS::Bam;
5              
6 1     1   1330 use Exporter;
  1         3  
  1         64  
7 1     1   35 use version; our $VERSION = qv('0.12_16');
  1         2  
  1         10  
8 1     1   98 use strict;
  1         1  
  1         39  
9 1     1   4 use warnings;
  1         2  
  1         38  
10 1     1   497 use Bio::Perl 1.00690001;
  1         83321  
  1         95  
11 1     1   206 use Bio::DB::Sam 1.37;
  0            
  0            
12             use Data::Dumper;
13             use File::Basename qw(fileparse);
14             use File::Temp qw(tempfile);
15             use Path::Class;
16             use Carp;
17              
18             our @ISA = qw(Exporter);
19             our @EXPORT = ();
20              
21             our @EXPORT_OK = qw ( split_bam uniquify_bam );
22              
23             #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
24             #^^^^^^^^^^^ Subroutines ^^^^^^^^^^#
25             #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
26              
27             sub split_bam {
28             my %data = ();
29             my @NHval = ();
30             my @processed_files = ();
31             my $verbose = 0;
32             my ($bamfile,$reverse,$want_uniq,$want_bed,$dest_dir,$log) = @_;
33             my ($bam,$sam,$bn,$path,$ext,$header,$flag,$NH,$eff_strand,$tmp);
34             my ($bam_pos,$bam_neg,$tmp_bam_pos,$tmp_bam_neg,$bamname_pos,$bamname_neg);
35             my ($bed_pos,$bed_neg,$bedname_pos,$bedname_neg);
36             my ($seq_id,$start,$stop,$strand,$target_names,$id,$score);
37             my $this_function = (caller(0))[3];
38             my %count_entries = (
39             total => 0,
40             uniq => 0,
41             pos => 0,
42             neg => 0,
43             skip => 0,
44             cur => 0,
45             mult => 0,
46             se_alis => 0,
47             pe_alis => 0,
48             flag => 0,
49             );
50             $data{count} = \%count_entries;
51             $data{flag} = ();
52              
53             croak "ERROR [$this_function] $bamfile does not exist\n"
54             unless (-e $bamfile);
55             croak "ERROR [$this_function] $dest_dir does not exist\n"
56             unless (-d $dest_dir);
57              
58             open(LOG, ">", $log) or croak $!;
59              
60             (undef,$tmp_bam_pos) = tempfile('BAM_POS_XXXXXXX',UNLINK=>0);
61             (undef,$tmp_bam_neg) = tempfile('BAM_NEG_XXXXXXX',UNLINK=>0);
62              
63             $bam = Bio::DB::Bam->open($bamfile, "r");
64             $header = $bam->header;
65             $target_names = $header->target_name;
66              
67             ($bn,$path,$ext) = fileparse($bamfile, qr /\..*/);
68             unless ($dest_dir =~ /\/$/){$dest_dir .= "/";}
69             $bamname_pos = $dest_dir.$bn.".pos".$ext;
70             $bamname_neg = $dest_dir.$bn.".neg".$ext;
71             $bam_pos = Bio::DB::Bam->open($tmp_bam_pos,'w')
72             or croak "ERROR [$this_function] Could not open bam_pos file for writing: $!";
73             $bam_neg = Bio::DB::Bam->open($tmp_bam_neg,'w')
74             or croak "ERROR [$this_function] Could not open bam_neg file for writing: $!";
75              
76             if ($want_bed == 1){
77             $bedname_pos = $dest_dir.$bn.".pos.bed";
78             $bedname_neg = $dest_dir.$bn.".neg.bed";
79             open($bed_pos, ">", $bedname_pos); open($bed_neg, ">", $bedname_neg);
80             }
81              
82             $bam_pos->header_write($header);$bam_neg->header_write($header);
83             if($reverse == 1) { # switch +/- strand mapping
84             $tmp = $bam_pos;$bam_pos = $bam_neg;$bam_neg = $tmp;
85             $tmp = $bed_pos;$bed_pos = $bed_neg;$bed_neg = $tmp;
86             }
87              
88             while (my $read= $bam->read1() ) {
89             @NHval = ();
90             $data{count}{total}++;
91             if($verbose == 1){print STDERR $read->query->name."\t";}
92              
93             # check if NH (the SAM tag used to indicate multiple mappings) is set
94             if ($read->has_tag("NH")) {
95             @NHval = $read->get_tag_values("NH");
96             $NH = $NHval[0];
97             if ($NH == 1) {
98             $data{count}{uniq}++;
99             if ($verbose == 1) {print STDERR "NH:i:1\t";}
100             }
101             else {
102             $data{count}{mult}++;
103             if ($verbose == 1) {print STDERR "NH:i:".$NH."\t";}
104             if ($want_uniq == 1) { # skip processing this read if it is a mutli-mapper
105             $data{count}{skip}++;
106             next;
107             }
108             }
109             $data{count}{cur}++;
110             }
111             else{ carp "WARN [$this_function] Read ".$read->query->name." does not have NH attribute\n";}
112              
113             # print Dumper ($read->query);
114             $strand = $read->strand;
115             $seq_id = $target_names->[$read->tid];
116             $start = $read->start;
117             $stop = $read->end;
118             $id = "x"; # $read->qname;
119             $score = 100;
120              
121             if ( $read->get_tag_values('PAIRED') ) { # paired-end
122             if($verbose == 1) {print STDERR "pe\t";}
123             $data{count}{pe_alis}++;
124             if ( $read->get_tag_values('FIRST_MATE') ){ # 1st mate; take its strand as granted
125             if($verbose == 1) {print STDERR "FIRST_MATE\t".$strand." ";}
126             if ( $strand eq "1" ){
127             $bam_pos->write1($read);
128             if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
129             if ($reverse == 0){
130             $data{count}{pos}++; $eff_strand=$strand;
131             if ($want_bed){printf $bed_pos "%s\n", "+";}
132             }
133             else {
134             $data{count}{neg}++; $eff_strand=-1*$strand;
135             if ($want_bed){printf $bed_pos "%s\n", "-";}
136             }
137             }
138             elsif ($strand eq "-1") {
139             $bam_neg->write1($read);
140             if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
141             if ($reverse == 0){
142             $data{count}{neg}++; $eff_strand=$strand;
143             if ($want_bed){printf $bed_neg "%s\n", "-";}
144             }
145             else {
146             $data{count}{pos}++;$eff_strand=-1*$strand;
147             if ($want_bed){printf $bed_neg "%s\n", "+";}
148             }
149             }
150             else {croak "Strand neither + nor - ...exiting!\n";}
151             }
152             else{ # 2nd mate; reverse strand since the fragment it belongs to is ALWAYS located
153             # on the other strand
154             if($verbose == 1) {print STDERR "SECOND_MATE\t".$strand." ";}
155             if ( $strand eq "1" ) {
156             $bam_neg->write1($read);
157             if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
158             if ($reverse == 0){
159             $data{count}{neg}++;$eff_strand=$strand;
160             if ($want_bed){printf $bed_neg "%s\n", "-";}
161             }
162             else {
163             $data{count}{pos}++; $eff_strand=-1*$strand;
164             if ($want_bed){printf $bed_neg "%s\n", "+";}
165             }
166             }
167             elsif ( $strand eq "-1" ) {
168             $bam_pos->write1($read);
169             if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
170             if ($reverse == 0){
171             $data{count}{pos}++;$eff_strand=$strand;
172             if ($want_bed){printf $bed_pos "%s\n", "+";}
173             }
174             else {
175             $data{count}{neg}++;$eff_strand=-1*$strand;
176             if ($want_bed){printf $bed_pos "%s\n", "-";}
177             }
178             }
179             else {croak "Strand neither + nor - ...exiting!\n";}
180             }
181             }
182             else { # single-end
183             if($verbose == 1) {print STDERR "se\t";}
184             $data{count}{se_alis}++;
185             if ( $read->strand eq "1" ){
186             $bam_pos->write1($read);
187             if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
188             if ($reverse == 0){
189             $data{count}{pos}++;$eff_strand=$strand;
190             if ($want_bed){printf $bed_pos "%s\n", "+";}
191             }
192             else {
193             $data{count}{neg}++;$eff_strand=-1*$strand;
194             if ($want_bed){printf $bed_pos "%s\n", "-";}
195             }
196             }
197             elsif ($read->strand eq "-1") {
198             $bam_neg->write1($read);
199             if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
200             if ($reverse == 0){
201             $data{count}{neg}++;$eff_strand=$strand;
202             if ($want_bed){printf $bed_neg "%s\n", "-";}
203             }
204             else {
205             $data{count}{pos}++;$eff_strand=-1*$strand;
206             if ($want_bed){printf $bed_neg "%s\n", "+";}
207             }
208             }
209             else {croak "Strand neither + nor - ...exiting!\n";}
210             }
211             if($verbose == 1) {print STDERR "--> ".$eff_strand."\t";}
212              
213             # collect statistics of SAM flags
214             $flag = $read->flag;
215             unless (exists $data{flag}{$flag}){
216             $data{flag}{$flag} = 0;
217             }
218             $data{flag}{$flag}++;
219             if ($verbose == 1) {print STDERR "\n";}
220             } # end while
221              
222             rename ($tmp_bam_pos, $bamname_pos);
223             rename ($tmp_bam_neg, $bamname_neg);
224             push (@processed_files, ($bamname_pos,$bamname_neg));
225             push (@processed_files, ($data{count}{pos},$data{count}{neg}));
226             if ($want_bed){
227             push (@processed_files, ($bedname_pos,$bedname_neg))
228             }
229              
230             # error checks
231             unless ($data{count}{pe_alis} + $data{count}{se_alis} == $data{count}{cur}) {
232             printf "ERROR: paired-end + single-end alignments != total alignment count\n";
233             print Dumper(\%data);
234             croak $!;
235             }
236             unless ($data{count}{pos} + $data{count}{neg} == $data{count}{cur}) {
237             printf STDERR "%20d fragments on [+] strand\n",$data{count}{pos};
238             printf STDERR "%20d fragments on [-] strand\n",$data{count}{neg};
239             printf STDERR "%20d sum\n",eval($data{count}{pos}+$data{count}{neg});
240             printf STDERR "%20d cur_count (should be)\n",$data{count}{cur};
241             printf STDERR "ERROR: pos alignments + neg alignments != total alignments\n";
242             print Dumper(\%data);
243             croak $!;
244             }
245             foreach (keys %{$data{flag}}){
246             $data{count}{flag} += $data{flag}{$_};
247             }
248             unless ($data{count}{flag} == $data{count}{cur}){
249             printf STDERR "%20d alignments considered\n",$data{count}{cur};
250             printf STDERR "%20d alignments found in flag statistics\n",$data{count}{flag};
251             printf STDERR "ERROR: #considered alignments != #alignments from flag stat\n";
252             print Dumper(\%data);
253             croak $!;
254             }
255              
256             # logging output
257             printf LOG "# bam_split.pl log for file \'$bamfile\'\n";
258             printf LOG "#-----------------------------------------------------------------\n";
259             printf LOG "%20d total alignments (unique & multi-mapper)\n",$data{count}{total};
260             printf LOG "%20d unique-mappers (%6.2f%% of total)\n",
261             $data{count}{uniq},eval(100*$data{count}{uniq}/$data{count}{total}) ;
262             printf LOG "%20d multi-mappers (%6.2f%% of total)\n",
263             $data{count}{mult},eval(100*$data{count}{mult}/$data{count}{total});
264             printf LOG "%20d multi-mappers skipped\n", $data{count}{skip};
265             printf LOG "%20d alignments considered\n", $data{count}{cur};
266             printf LOG "%20d paired-end\n", $data{count}{pe_alis};
267             printf LOG "%20s single-end\n", $data{count}{se_alis};
268             printf LOG "%20d fragments on [+] strand (%6.2f%% of considered)\n",
269             $data{count}{pos},eval(100*$data{count}{pos}/$data{count}{cur});
270             printf LOG "%20d fragments on [-] strand (%6.2f%% of considered)\n",
271             $data{count}{neg},eval(100*$data{count}{neg}/$data{count}{cur});
272             printf LOG "#-----------------------------------------------------------------\n";
273             printf LOG "Dumper output:\n". Dumper(\%data);
274             close(LOG);
275             return @processed_files;
276             }
277              
278             sub uniquify_bam {
279             my ($bamfile,$dest,$log) = @_;
280             my ($bam, $bn,$path,$ext,$read,$header);
281             my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult);
282             my ($count_all,$count_uniq,$count_mult) = (0)x3;
283             my @processed_files = ();
284             my $this_function = (caller(0))[3];
285              
286             croak "ERROR [$this_function] Cannot find $bamfile\n"
287             unless (-e $bamfile);
288             croak "ERROR [$this_function] $dest does not exist\n"
289             unless (-d $dest);
290              
291             ($bn,$path,$ext) = fileparse($bamfile, qr /\..*/);
292              
293             (undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0);
294             (undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0);
295              
296             $bam = Bio::DB::Bam->open($bamfile, "r");
297             $fn_uniq = file($dest,$bn.".uniq".$ext);
298             $fn_mult = file($dest,$bn.".mult".$ext);
299             $header = $bam->header; # TODO: modify header, leave traces ...
300              
301             $bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w')
302             or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
303             $bam_mult = Bio::DB::Bam->open($tmp_mult,'w')
304             or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
305             $bam_uniq->header_write($header);
306             $bam_mult->header_write($header);
307              
308             while ($read = $bam->read1() ) {
309             $count_all++;
310             if ($read->aux_get("NH") == 1){ # uniquely mapped reads
311             $bam_uniq->write1($read);
312             $count_uniq++;
313             }
314             else { # multiply mapped reads
315             $bam_mult->write1($read);
316             $count_mult++;
317             }
318             }
319              
320             croak "ERROR [$this_function] Read counts don't match\n"
321             unless ($count_uniq + $count_mult == $count_all);
322              
323             rename ($tmp_uniq, $fn_uniq);
324             rename ($tmp_mult, $fn_mult);
325             push (@processed_files, ($fn_uniq,$fn_mult));
326              
327             if (defined $log){
328             my $lf = file($dest,$log);
329             open(LOG, ">>", $lf) or croak $!;
330             printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n",
331             $count_all,$count_uniq,$count_mult;
332             close(LOG);
333             }
334             }
335              
336              
337             1;
338             __END__
339              
340              
341             =head1 NAME
342              
343             Bio::ViennaNGS::Bam - High-level access to BAM files
344              
345             =head1 SYNOPSIS
346              
347             use Bio::ViennaNGS::Bam;
348              
349             # split a single-end or paired-end BAM file by strands
350             @result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile);
351              
352             # extract unique and multi mappers from a BAM file
353             @result = uniquify_bam($bam_in,$outdir,$logfile);
354              
355             =head1 DESCRIPTION
356              
357             Bio::ViennaNGS::BAM provides high-level access to BAM file. Building
358             on L<Bio::DB::Sam>, it provides code to extract specific portions from
359             BAM files. It comes with routines for splitting BAM files by strand
360             (which is often required for visualization of NGS data) and extracting
361             uniquely and multiply aligned reads from BAM files.
362              
363             =head2 ROUTINES
364              
365             =over
366              
367             =item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log)
368              
369             Splits BAM file $bam according to [+] and [-] strand. C<$reverse>,
370             C<$want_uniq> and C<$want_bed> are switches with values of 0 or 1,
371             triggering forced reversion of strand mapping (due to RNA-seq protocol
372             constraints), filtering of unique mappers (identified via NH:i:1 SAM
373             argument), and forced output of a BED file corresponding to
374             strand-specific mapping, respectively. C<$log> holds name and path of
375             the log file.
376              
377             Strand-splitting is done in a way that in paired-end alignments, FIRST
378             and SECOND mates (reads) are treated as _one_ fragment, ie FIRST_MATE
379             reads determine the strand, while SECOND_MATE reads are assigned the
380             opposite strand I<per definitionem>. This also holds if the reads are
381             not mapped in proper pairs and even if there is no mapping partner at
382             all.
383              
384             Sometimes the library preparation protocol causes inversion of the
385             read assignment (with respect to the underlying annotation). In those
386             cases, the natural mapping of the reads can be obtained by the
387             C<$reverse> flag.
388              
389             This routine returns an array whose fist two elements are the file
390             names of the newly generate BAM files with reads mapped to the
391             positive, and negative strand, respectively. Elements three and four
392             are the number of fragments mapped to the positive and negative
393             strand. If the C<$want_bed> option was given elements fiveand six are
394             the file names of the output BED files for positive and negative
395             strand, respectively.
396              
397             NOTE: Filtering of unique mappers is only safe for single-end
398             experiments; In paired-end experiments, read and mate are treated
399             separately, thus allowing for scenarios where eg. one read is a
400             multi-mapper, whereas its associate mate is a unique mapper, resulting
401             in an ambiguous alignment of the entire fragment.
402              
403             =item uniquify_bam($bam,$dest,$log)
404              
405             Extract I<uniquely> and I<multiply> aligned reads from BAM file
406             C<$bam> by means of the I<NH:i:> SAM attribute. New BAM files for
407             unique and multi mappers are created in the output folder C<$dest>,
408             which are named B<basename.uniq.bam> and B<basename.mult.bam>,
409             respectively. If defined, a logfile named C<$log> is created in the
410             output folder.
411              
412             This routine returns an array holding file names of the newly created
413             BAM files for unique and multi mappers, respectively.
414              
415             NOTE: Not all short read mappers use the I<NH:i:> SAM attribute to
416             decorate unique and multi mappers. As such, this routine will not work
417             unless your BAM file has these attributes.
418              
419             =head1 DEPENDENCIES
420              
421             =over
422              
423             =item L<Bio::Perl> >= 1.00690001
424              
425             =item L<BIO::DB::Sam> >= 1.37
426              
427             =item L<File::Basename>
428              
429             =item L<File::Temp>
430              
431             =item L<Path::Class>
432              
433             =item L<Carp>
434              
435             =back
436              
437             =head1 AUTHORS
438              
439             =over
440              
441             =item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
442              
443             =back
444              
445             =head1 COPYRIGHT AND LICENSE
446              
447             Copyright (C) 2015 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
448              
449             This library is free software; you can redistribute it and/or modify
450             it under the same terms as Perl itself, either Perl version 5.10.0 or,
451             at your option, any later version of Perl 5 you may have available.
452              
453             This software is distributed in the hope that it will be useful, but
454             WITHOUT ANY WARRANTY; without even the implied warranty of
455             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
456              
457             =cut