File Coverage

blib/lib/Bio/ViennaNGS/Bam.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2014-12-20 00:31:03 mtw>
3              
4             package Bio::ViennaNGS::Bam;
5              
6 1     1   1177 use 5.12.0;
  1         3  
  1         38  
7 1     1   4 use Exporter;
  1         1  
  1         40  
8 1     1   3 use version; our $VERSION = qv('0.12_07');
  1         1  
  1         5  
9 1     1   63 use strict;
  1         2  
  1         24  
10 1     1   7 use warnings;
  1         1  
  1         32  
11 1     1   202 use Bio::Perl 1.00690001;
  0            
  0            
12             use Bio::DB::Sam 1.39;
13             use Data::Dumper;
14             use File::Basename qw(fileparse);
15             use File::Temp qw(tempfile);
16             use Path::Class;
17             use Carp;
18              
19             our @ISA = qw(Exporter);
20             our @EXPORT = ();
21              
22             our @EXPORT_OK = qw ( split_bam uniquify_bam );
23              
24             #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
25             #^^^^^^^^^^^ Subroutines ^^^^^^^^^^#
26             #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
27              
28             sub split_bam {
29             my %data = ();
30             my @NHval = ();
31             my @processed_files = ();
32             my $verbose = 0;
33             my ($bamfile,$reverse,$want_uniq,$want_bed,$dest_dir,$log) = @_;
34             my ($bam,$sam,$bn,$path,$ext,$header,$flag,$NH,$eff_strand,$tmp);
35             my ($bam_pos,$bam_neg,$tmp_bam_pos,$tmp_bam_neg,$bamname_pos,$bamname_neg);
36             my ($bed_pos,$bed_neg,$bedname_pos,$bedname_neg);
37             my ($seq_id,$start,$stop,$strand,$target_names,$id,$score);
38             my $this_function = (caller(0))[3];
39             my %count_entries = (
40             total => 0,
41             uniq => 0,
42             pos => 0,
43             neg => 0,
44             skip => 0,
45             cur => 0,
46             mult => 0,
47             se_alis => 0,
48             pe_alis => 0,
49             flag => 0,
50             );
51             $data{count} = \%count_entries;
52             $data{flag} = ();
53              
54             croak "ERROR [$this_function] $bamfile does not exist\n"
55             unless (-e $bamfile);
56             croak "ERROR [$this_function] $dest_dir does not exist\n"
57             unless (-d $dest_dir);
58              
59             open(LOG, ">", $log) or croak $!;
60              
61             (undef,$tmp_bam_pos) = tempfile('BAM_POS_XXXXXXX',UNLINK=>0);
62             (undef,$tmp_bam_neg) = tempfile('BAM_NEG_XXXXXXX',UNLINK=>0);
63              
64             $bam = Bio::DB::Bam->open($bamfile, "r");
65             $header = $bam->header;
66             $target_names = $header->target_name;
67              
68             ($bn,$path,$ext) = fileparse($bamfile, qr /\..*/);
69             unless ($dest_dir =~ /\/$/){$dest_dir .= "/";}
70             $bamname_pos = $dest_dir.$bn.".pos".$ext;
71             $bamname_neg = $dest_dir.$bn.".neg".$ext;
72             $bam_pos = Bio::DB::Bam->open($tmp_bam_pos,'w')
73             or croak "ERROR [$this_function] Could not open bam_pos file for writing: $!";
74             $bam_neg = Bio::DB::Bam->open($tmp_bam_neg,'w')
75             or croak "ERROR [$this_function] Could not open bam_neg file for writing: $!";
76              
77             if ($want_bed == 1){
78             $bedname_pos = $dest_dir.$bn.".pos.bed";
79             $bedname_neg = $dest_dir.$bn.".neg.bed";
80             open($bed_pos, ">", $bedname_pos); open($bed_neg, ">", $bedname_neg);
81             }
82              
83             $bam_pos->header_write($header);$bam_neg->header_write($header);
84             if($reverse == 1) { # switch +/- strand mapping
85             $tmp = $bam_pos;$bam_pos = $bam_neg;$bam_neg = $tmp;
86             $tmp = $bed_pos;$bed_pos = $bed_neg;$bed_neg = $tmp;
87             }
88              
89             while (my $read= $bam->read1() ) {
90             @NHval = ();
91             $data{count}{total}++;
92             if($verbose == 1){print STDERR $read->query->name."\t";}
93              
94             # check if NH (the SAM tag used to indicate multiple mappings) is set
95             if ($read->has_tag("NH")) {
96             @NHval = $read->get_tag_values("NH");
97             $NH = $NHval[0];
98             if ($NH == 1) {
99             $data{count}{uniq}++;
100             if ($verbose == 1) {print STDERR "NH:i:1\t";}
101             }
102             else {
103             $data{count}{mult}++;
104             if ($verbose == 1) {print STDERR "NH:i:".$NH."\t";}
105             if ($want_uniq == 1) { # skip processing this read if it is a mutli-mapper
106             $data{count}{skip}++;
107             next;
108             }
109             }
110             $data{count}{cur}++;
111             }
112             else{ carp "WARN [$this_function] Read ".$read->query->name." does not have NH attribute\n";}
113              
114             # print Dumper ($read->query);
115             $strand = $read->strand;
116             $seq_id = $target_names->[$read->tid];
117             $start = $read->start;
118             $stop = $read->end;
119             $id = "x"; # $read->qname;
120             $score = 100;
121              
122             if ( $read->get_tag_values('PAIRED') ) { # paired-end
123             if($verbose == 1) {print STDERR "pe\t";}
124             $data{count}{pe_alis}++;
125             if ( $read->get_tag_values('FIRST_MATE') ){ # 1st mate; take its strand as granted
126             if($verbose == 1) {print STDERR "FIRST_MATE\t".$strand." ";}
127             if ( $strand eq "1" ){
128             $bam_pos->write1($read);
129             if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
130             if ($reverse == 0){
131             $data{count}{pos}++; $eff_strand=$strand;
132             if ($want_bed){printf $bed_pos "%s\n", "+";}
133             }
134             else {
135             $data{count}{neg}++; $eff_strand=-1*$strand;
136             if ($want_bed){printf $bed_pos "%s\n", "-";}
137             }
138             }
139             elsif ($strand eq "-1") {
140             $bam_neg->write1($read);
141             if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
142             if ($reverse == 0){
143             $data{count}{neg}++; $eff_strand=$strand;
144             if ($want_bed){printf $bed_neg "%s\n", "-";}
145             }
146             else {
147             $data{count}{pos}++;$eff_strand=-1*$strand;
148             if ($want_bed){printf $bed_neg "%s\n", "+";}
149             }
150             }
151             else {croak "Strand neither + nor - ...exiting!\n";}
152             }
153             else{ # 2nd mate; reverse strand since the fragment it belongs to is ALWAYS located
154             # on the other strand
155             if($verbose == 1) {print STDERR "SECOND_MATE\t".$strand." ";}
156             if ( $strand eq "1" ) {
157             $bam_neg->write1($read);
158             if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
159             if ($reverse == 0){
160             $data{count}{neg}++;$eff_strand=$strand;
161             if ($want_bed){printf $bed_neg "%s\n", "-";}
162             }
163             else {
164             $data{count}{pos}++; $eff_strand=-1*$strand;
165             if ($want_bed){printf $bed_neg "%s\n", "+";}
166             }
167             }
168             elsif ( $strand eq "-1" ) {
169             $bam_pos->write1($read);
170             if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
171             if ($reverse == 0){
172             $data{count}{pos}++;$eff_strand=$strand;
173             if ($want_bed){printf $bed_pos "%s\n", "+";}
174             }
175             else {
176             $data{count}{neg}++;$eff_strand=-1*$strand;
177             if ($want_bed){printf $bed_pos "%s\n", "-";}
178             }
179             }
180             else {croak "Strand neither + nor - ...exiting!\n";}
181             }
182             }
183             else { # single-end
184             if($verbose == 1) {print STDERR "se\t";}
185             $data{count}{se_alis}++;
186             if ( $read->strand eq "1" ){
187             $bam_pos->write1($read);
188             if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
189             if ($reverse == 0){
190             $data{count}{pos}++;$eff_strand=$strand;
191             if ($want_bed){printf $bed_pos "%s\n", "+";}
192             }
193             else {
194             $data{count}{neg}++;$eff_strand=-1*$strand;
195             if ($want_bed){printf $bed_pos "%s\n", "-";}
196             }
197             }
198             elsif ($read->strand eq "-1") {
199             $bam_neg->write1($read);
200             if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;}
201             if ($reverse == 0){
202             $data{count}{neg}++;$eff_strand=$strand;
203             if ($want_bed){printf $bed_neg "%s\n", "-";}
204             }
205             else {
206             $data{count}{pos}++;$eff_strand=-1*$strand;
207             if ($want_bed){printf $bed_neg "%s\n", "+";}
208             }
209             }
210             else {croak "Strand neither + nor - ...exiting!\n";}
211             }
212             if($verbose == 1) {print STDERR "--> ".$eff_strand."\t";}
213              
214             # collect statistics of SAM flags
215             $flag = $read->flag;
216             unless (exists $data{flag}{$flag}){
217             $data{flag}{$flag} = 0;
218             }
219             $data{flag}{$flag}++;
220             if ($verbose == 1) {print STDERR "\n";}
221             } # end while
222              
223             rename ($tmp_bam_pos, $bamname_pos);
224             rename ($tmp_bam_neg, $bamname_neg);
225             push (@processed_files, ($bamname_pos,$bamname_neg));
226             push (@processed_files, ($data{count}{pos},$data{count}{neg}));
227             if ($want_bed){
228             push (@processed_files, ($bedname_pos,$bedname_neg))
229             }
230              
231             # error checks
232             unless ($data{count}{pe_alis} + $data{count}{se_alis} == $data{count}{cur}) {
233             printf "ERROR: paired-end + single-end alignments != total alignment count\n";
234             print Dumper(\%data);
235             croak $!;
236             }
237             unless ($data{count}{pos} + $data{count}{neg} == $data{count}{cur}) {
238             printf STDERR "%20d fragments on [+] strand\n",$data{count}{pos};
239             printf STDERR "%20d fragments on [-] strand\n",$data{count}{neg};
240             printf STDERR "%20d sum\n",eval($data{count}{pos}+$data{count}{neg});
241             printf STDERR "%20d cur_count (should be)\n",$data{count}{cur};
242             printf STDERR "ERROR: pos alignments + neg alignments != total alignments\n";
243             print Dumper(\%data);
244             croak $!;
245             }
246             foreach (keys %{$data{flag}}){
247             $data{count}{flag} += $data{flag}{$_};
248             }
249             unless ($data{count}{flag} == $data{count}{cur}){
250             printf STDERR "%20d alignments considered\n",$data{count}{cur};
251             printf STDERR "%20d alignments found in flag statistics\n",$data{count}{flag};
252             printf STDERR "ERROR: #considered alignments != #alignments from flag stat\n";
253             print Dumper(\%data);
254             croak $!;
255             }
256              
257             # logging output
258             printf LOG "# bam_split.pl log for file \'$bamfile\'\n";
259             printf LOG "#-----------------------------------------------------------------\n";
260             printf LOG "%20d total alignments (unique & multi-mapper)\n",$data{count}{total};
261             printf LOG "%20d unique-mappers (%6.2f%% of total)\n",
262             $data{count}{uniq},eval(100*$data{count}{uniq}/$data{count}{total}) ;
263             printf LOG "%20d multi-mappers (%6.2f%% of total)\n",
264             $data{count}{mult},eval(100*$data{count}{mult}/$data{count}{total});
265             printf LOG "%20d multi-mappers skipped\n", $data{count}{skip};
266             printf LOG "%20d alignments considered\n", $data{count}{cur};
267             printf LOG "%20d paired-end\n", $data{count}{pe_alis};
268             printf LOG "%20s single-end\n", $data{count}{se_alis};
269             printf LOG "%20d fragments on [+] strand (%6.2f%% of considered)\n",
270             $data{count}{pos},eval(100*$data{count}{pos}/$data{count}{cur});
271             printf LOG "%20d fragments on [-] strand (%6.2f%% of considered)\n",
272             $data{count}{neg},eval(100*$data{count}{neg}/$data{count}{cur});
273             printf LOG "#-----------------------------------------------------------------\n";
274             printf LOG "Dumper output:\n". Dumper(\%data);
275             close(LOG);
276             return @processed_files;
277             }
278              
279             sub uniquify_bam {
280             my ($bamfile,$dest,$log) = @_;
281             my ($bam, $bn,$path,$ext,$read,$header);
282             my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult);
283             my ($count_all,$count_uniq,$count_mult) = (0)x3;
284             my @processed_files = ();
285             my $this_function = (caller(0))[3];
286              
287             croak "ERROR [$this_function] Cannot find $bamfile\n"
288             unless (-e $bamfile);
289             croak "ERROR [$this_function] $dest does not exist\n"
290             unless (-d $dest);
291              
292             ($bn,$path,$ext) = fileparse($bamfile, qr /\..*/);
293              
294             (undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0);
295             (undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0);
296              
297             $bam = Bio::DB::Bam->open($bamfile, "r");
298             $fn_uniq = file($dest,$bn.".uniq".$ext);
299             $fn_mult = file($dest,$bn.".mult".$ext);
300             $header = $bam->header; # TODO: modify header, leave traces ...
301              
302             $bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w')
303             or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
304             $bam_mult = Bio::DB::Bam->open($tmp_mult,'w')
305             or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
306             $bam_uniq->header_write($header);
307             $bam_mult->header_write($header);
308              
309             while ($read = $bam->read1() ) {
310             $count_all++;
311             if ($read->aux_get("NH") == 1){ # uniquely mapped reads
312             $bam_uniq->write1($read);
313             $count_uniq++;
314             }
315             else { # multiply mapped reads
316             $bam_mult->write1($read);
317             $count_mult++;
318             }
319             }
320              
321             croak "ERROR [$this_function] Read counts don't match\n"
322             unless ($count_uniq + $count_mult == $count_all);
323              
324             rename ($tmp_uniq, $fn_uniq);
325             rename ($tmp_mult, $fn_mult);
326             push (@processed_files, ($fn_uniq,$fn_mult));
327              
328             if (defined $log){
329             my $lf = file($dest,$log);
330             open(LOG, ">>", $lf) or croak $!;
331             printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n",
332             $count_all,$count_uniq,$count_mult;
333             close(LOG);
334             }
335             }
336              
337              
338             1;
339             __END__
340              
341              
342             =head1 NAME
343              
344             Bio::ViennaNGS::Bam - High-level access to BAM files
345              
346             =head1 SYNOPSIS
347              
348             use Bio::ViennaNGS::Bam;
349              
350             # split a single-end or paired-end BAM file by strands
351             @result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile);
352              
353             # extract unique and multi mappers from a BAM file
354             @result = uniquify_bam($bam_in,$outdir,$logfile);
355              
356             =head1 DESCRIPTION
357              
358             Bio::ViennaNGS::BAM provides high-level access to BAM file. Building
359             on L<Bio::DB::Sam>, it provides code to extract specific portions from
360             BAM files. It comes with routines for splitting BAM files by strand
361             (which is often required for visualization of NGS data) and extracting
362             uniquely and multiply aligned reads from BAM files.
363              
364             =head2 ROUTINES
365              
366             =over
367              
368             =item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log)
369              
370             Splits BAM file $bam according to [+] and [-] strand. C<$reverse>,
371             C<$want_uniq> and C<$want_bed> are switches with values of 0 or 1,
372             triggering forced reversion of strand mapping (due to RNA-seq protocol
373             constraints), filtering of unique mappers (identified via NH:i:1 SAM
374             argument), and forced output of a BED file corresponding to
375             strand-specific mapping, respectively. C<$log> holds name and path of
376             the log file.
377              
378             Strand-splitting is done in a way that in paired-end alignments, FIRST
379             and SECOND mates (reads) are treated as _one_ fragment, ie FIRST_MATE
380             reads determine the strand, while SECOND_MATE reads are assigned the
381             opposite strand I<per definitionem>. This also holds if the reads are
382             not mapped in proper pairs and even if there is no mapping partner at
383             all.
384              
385             Sometimes the library preparation protocol causes inversion of the
386             read assignment (with respect to the underlying annotation). In those
387             cases, the natural mapping of the reads can be obtained by the
388             C<$reverse> flag.
389              
390             This routine returns an array whose fist two elements are the file
391             names of the newly generate BAM files with reads mapped to the
392             positive, and negative strand, respectively. Elements three and four
393             are the number of fragments mapped to the positive and negative
394             strand. If the C<$want_bed> option was given elements fiveand six are
395             the file names of the output BED files for positive and negative
396             strand, respectively.
397              
398             NOTE: Filtering of unique mappers is only safe for single-end
399             experiments; In paired-end experiments, read and mate are treated
400             separately, thus allowing for scenarios where eg. one read is a
401             multi-mapper, whereas its associate mate is a unique mapper, resulting
402             in an ambiguous alignment of the entire fragment.
403              
404             =item uniquify_bam($bam,$dest,$log)
405              
406             Extract I<uniquely> and I<multiply> aligned reads from BAM file
407             C<$bam> by means of the I<NH:i:> SAM attribute. New BAM files for
408             unique and multi mappers are created in the output folder C<$dest>,
409             which are named B<basename.uniq.bam> and B<basename.mult.bam>,
410             respectively. If defined, a logfile named C<$log> is created in the
411             output folder.
412              
413             This routine returns an array holding file names of the newly created
414             BAM files for unique and multi mappers, respectively.
415              
416             NOTE: Not all short read mappers use the I<NH:i:> SAM attribute to
417             decorate unique and multi mappers. As such, this routine will not work
418             unless your BAM file has these attributes.
419              
420             =head1 DEPENDENCIES
421              
422             =over 6
423              
424             =item L<Bio::Perl> >= 1.00690001
425              
426             =item L<BIO::DB::Sam> >= 1.39
427              
428             =item L<File::Basename>
429              
430             =item L<File::Temp>
431              
432             =item L<Path::Class>
433              
434             =item L<Carp>
435              
436             =back
437              
438             =head1 AUTHORS
439              
440             =over
441              
442             =item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
443              
444             =back
445              
446             =head1 COPYRIGHT AND LICENSE
447              
448             Copyright (C) 2014 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
449              
450             This library is free software; you can redistribute it and/or modify
451             it under the same terms as Perl itself, either Perl version 5.12.4 or,
452             at your option, any later version of Perl 5 you may have available.
453              
454             This software is distributed in the hope that it will be useful, but
455             WITHOUT ANY WARRANTY; without even the implied warranty of
456             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
457              
458             =cut