| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # -*-CPerl-*- | 
| 2 |  |  |  |  |  |  | # Last changed Time-stamp: <2015-02-09 09:20:43 mtw> | 
| 3 |  |  |  |  |  |  |  | 
| 4 |  |  |  |  |  |  | package Bio::ViennaNGS::Bam; | 
| 5 |  |  |  |  |  |  |  | 
| 6 | 1 |  |  | 1 |  | 1330 | use Exporter; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 64 |  | 
| 7 | 1 |  |  | 1 |  | 35 | use version; our $VERSION = qv('0.12_16'); | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 10 |  | 
| 8 | 1 |  |  | 1 |  | 98 | use strict; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 39 |  | 
| 9 | 1 |  |  | 1 |  | 4 | use warnings; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 38 |  | 
| 10 | 1 |  |  | 1 |  | 497 | use Bio::Perl 1.00690001; | 
|  | 1 |  |  |  |  | 83321 |  | 
|  | 1 |  |  |  |  | 95 |  | 
| 11 | 1 |  |  | 1 |  | 206 | use Bio::DB::Sam 1.37; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | use Data::Dumper; | 
| 13 |  |  |  |  |  |  | use File::Basename qw(fileparse); | 
| 14 |  |  |  |  |  |  | use File::Temp qw(tempfile); | 
| 15 |  |  |  |  |  |  | use Path::Class; | 
| 16 |  |  |  |  |  |  | use Carp; | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | our @ISA = qw(Exporter); | 
| 19 |  |  |  |  |  |  | our @EXPORT = (); | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | our @EXPORT_OK = qw ( split_bam uniquify_bam ); | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# | 
| 24 |  |  |  |  |  |  | #^^^^^^^^^^^ Subroutines ^^^^^^^^^^# | 
| 25 |  |  |  |  |  |  | #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | sub split_bam { | 
| 28 |  |  |  |  |  |  | my %data = (); | 
| 29 |  |  |  |  |  |  | my @NHval = (); | 
| 30 |  |  |  |  |  |  | my @processed_files = (); | 
| 31 |  |  |  |  |  |  | my $verbose = 0; | 
| 32 |  |  |  |  |  |  | my ($bamfile,$reverse,$want_uniq,$want_bed,$dest_dir,$log) = @_; | 
| 33 |  |  |  |  |  |  | my ($bam,$sam,$bn,$path,$ext,$header,$flag,$NH,$eff_strand,$tmp); | 
| 34 |  |  |  |  |  |  | my ($bam_pos,$bam_neg,$tmp_bam_pos,$tmp_bam_neg,$bamname_pos,$bamname_neg); | 
| 35 |  |  |  |  |  |  | my ($bed_pos,$bed_neg,$bedname_pos,$bedname_neg); | 
| 36 |  |  |  |  |  |  | my ($seq_id,$start,$stop,$strand,$target_names,$id,$score); | 
| 37 |  |  |  |  |  |  | my $this_function = (caller(0))[3]; | 
| 38 |  |  |  |  |  |  | my %count_entries = ( | 
| 39 |  |  |  |  |  |  | total     => 0, | 
| 40 |  |  |  |  |  |  | uniq      => 0, | 
| 41 |  |  |  |  |  |  | pos       => 0, | 
| 42 |  |  |  |  |  |  | neg       => 0, | 
| 43 |  |  |  |  |  |  | skip      => 0, | 
| 44 |  |  |  |  |  |  | cur       => 0, | 
| 45 |  |  |  |  |  |  | mult      => 0, | 
| 46 |  |  |  |  |  |  | se_alis   => 0, | 
| 47 |  |  |  |  |  |  | pe_alis   => 0, | 
| 48 |  |  |  |  |  |  | flag      => 0, | 
| 49 |  |  |  |  |  |  | ); | 
| 50 |  |  |  |  |  |  | $data{count} = \%count_entries; | 
| 51 |  |  |  |  |  |  | $data{flag} = (); | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | croak "ERROR [$this_function] $bamfile does not exist\n" | 
| 54 |  |  |  |  |  |  | unless (-e $bamfile); | 
| 55 |  |  |  |  |  |  | croak "ERROR [$this_function] $dest_dir does not exist\n" | 
| 56 |  |  |  |  |  |  | unless (-d $dest_dir); | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | open(LOG, ">", $log) or croak $!; | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  | (undef,$tmp_bam_pos) = tempfile('BAM_POS_XXXXXXX',UNLINK=>0); | 
| 61 |  |  |  |  |  |  | (undef,$tmp_bam_neg) = tempfile('BAM_NEG_XXXXXXX',UNLINK=>0); | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | $bam = Bio::DB::Bam->open($bamfile, "r"); | 
| 64 |  |  |  |  |  |  | $header = $bam->header; | 
| 65 |  |  |  |  |  |  | $target_names = $header->target_name; | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | ($bn,$path,$ext) = fileparse($bamfile, qr /\..*/); | 
| 68 |  |  |  |  |  |  | unless ($dest_dir =~ /\/$/){$dest_dir .= "/";} | 
| 69 |  |  |  |  |  |  | $bamname_pos = $dest_dir.$bn.".pos".$ext; | 
| 70 |  |  |  |  |  |  | $bamname_neg = $dest_dir.$bn.".neg".$ext; | 
| 71 |  |  |  |  |  |  | $bam_pos = Bio::DB::Bam->open($tmp_bam_pos,'w') | 
| 72 |  |  |  |  |  |  | or croak "ERROR [$this_function] Could not open bam_pos file for writing: $!"; | 
| 73 |  |  |  |  |  |  | $bam_neg = Bio::DB::Bam->open($tmp_bam_neg,'w') | 
| 74 |  |  |  |  |  |  | or croak "ERROR [$this_function] Could not open bam_neg file for writing: $!"; | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | if ($want_bed == 1){ | 
| 77 |  |  |  |  |  |  | $bedname_pos = $dest_dir.$bn.".pos.bed"; | 
| 78 |  |  |  |  |  |  | $bedname_neg = $dest_dir.$bn.".neg.bed"; | 
| 79 |  |  |  |  |  |  | open($bed_pos, ">", $bedname_pos); open($bed_neg, ">", $bedname_neg); | 
| 80 |  |  |  |  |  |  | } | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | $bam_pos->header_write($header);$bam_neg->header_write($header); | 
| 83 |  |  |  |  |  |  | if($reverse == 1) {  # switch +/- strand mapping | 
| 84 |  |  |  |  |  |  | $tmp = $bam_pos;$bam_pos = $bam_neg;$bam_neg = $tmp; | 
| 85 |  |  |  |  |  |  | $tmp = $bed_pos;$bed_pos = $bed_neg;$bed_neg = $tmp; | 
| 86 |  |  |  |  |  |  | } | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | while (my $read= $bam->read1() ) { | 
| 89 |  |  |  |  |  |  | @NHval = (); | 
| 90 |  |  |  |  |  |  | $data{count}{total}++; | 
| 91 |  |  |  |  |  |  | if($verbose == 1){print STDERR $read->query->name."\t";} | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | # check if NH (the SAM tag used to indicate multiple mappings) is set | 
| 94 |  |  |  |  |  |  | if ($read->has_tag("NH")) { | 
| 95 |  |  |  |  |  |  | @NHval = $read->get_tag_values("NH"); | 
| 96 |  |  |  |  |  |  | $NH = $NHval[0]; | 
| 97 |  |  |  |  |  |  | if ($NH == 1) { | 
| 98 |  |  |  |  |  |  | $data{count}{uniq}++; | 
| 99 |  |  |  |  |  |  | if ($verbose == 1) {print STDERR "NH:i:1\t";} | 
| 100 |  |  |  |  |  |  | } | 
| 101 |  |  |  |  |  |  | else { | 
| 102 |  |  |  |  |  |  | $data{count}{mult}++; | 
| 103 |  |  |  |  |  |  | if ($verbose == 1) {print STDERR "NH:i:".$NH."\t";} | 
| 104 |  |  |  |  |  |  | if ($want_uniq == 1) { # skip processing this read if it is a mutli-mapper | 
| 105 |  |  |  |  |  |  | $data{count}{skip}++; | 
| 106 |  |  |  |  |  |  | next; | 
| 107 |  |  |  |  |  |  | } | 
| 108 |  |  |  |  |  |  | } | 
| 109 |  |  |  |  |  |  | $data{count}{cur}++; | 
| 110 |  |  |  |  |  |  | } | 
| 111 |  |  |  |  |  |  | else{ carp "WARN [$this_function] Read ".$read->query->name." does not have NH attribute\n";} | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | #  print Dumper ($read->query); | 
| 114 |  |  |  |  |  |  | $strand = $read->strand; | 
| 115 |  |  |  |  |  |  | $seq_id = $target_names->[$read->tid]; | 
| 116 |  |  |  |  |  |  | $start  = $read->start; | 
| 117 |  |  |  |  |  |  | $stop   = $read->end; | 
| 118 |  |  |  |  |  |  | $id     = "x"; # $read->qname; | 
| 119 |  |  |  |  |  |  | $score  = 100; | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | if ( $read->get_tag_values('PAIRED') ) { # paired-end | 
| 122 |  |  |  |  |  |  | if($verbose == 1) {print STDERR "pe\t";} | 
| 123 |  |  |  |  |  |  | $data{count}{pe_alis}++; | 
| 124 |  |  |  |  |  |  | if ( $read->get_tag_values('FIRST_MATE') ){ # 1st mate; take its strand as granted | 
| 125 |  |  |  |  |  |  | if($verbose == 1) {print STDERR "FIRST_MATE\t".$strand." ";} | 
| 126 |  |  |  |  |  |  | if ( $strand eq "1" ){ | 
| 127 |  |  |  |  |  |  | $bam_pos->write1($read); | 
| 128 |  |  |  |  |  |  | if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} | 
| 129 |  |  |  |  |  |  | if ($reverse == 0){ | 
| 130 |  |  |  |  |  |  | $data{count}{pos}++; $eff_strand=$strand; | 
| 131 |  |  |  |  |  |  | if ($want_bed){printf $bed_pos "%s\n", "+";} | 
| 132 |  |  |  |  |  |  | } | 
| 133 |  |  |  |  |  |  | else { | 
| 134 |  |  |  |  |  |  | $data{count}{neg}++; $eff_strand=-1*$strand; | 
| 135 |  |  |  |  |  |  | if ($want_bed){printf $bed_pos "%s\n", "-";} | 
| 136 |  |  |  |  |  |  | } | 
| 137 |  |  |  |  |  |  | } | 
| 138 |  |  |  |  |  |  | elsif ($strand eq "-1") { | 
| 139 |  |  |  |  |  |  | $bam_neg->write1($read); | 
| 140 |  |  |  |  |  |  | if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} | 
| 141 |  |  |  |  |  |  | if ($reverse == 0){ | 
| 142 |  |  |  |  |  |  | $data{count}{neg}++; $eff_strand=$strand; | 
| 143 |  |  |  |  |  |  | if ($want_bed){printf $bed_neg "%s\n", "-";} | 
| 144 |  |  |  |  |  |  | } | 
| 145 |  |  |  |  |  |  | else { | 
| 146 |  |  |  |  |  |  | $data{count}{pos}++;$eff_strand=-1*$strand; | 
| 147 |  |  |  |  |  |  | if ($want_bed){printf $bed_neg "%s\n", "+";} | 
| 148 |  |  |  |  |  |  | } | 
| 149 |  |  |  |  |  |  | } | 
| 150 |  |  |  |  |  |  | else {croak "Strand neither + nor - ...exiting!\n";} | 
| 151 |  |  |  |  |  |  | } | 
| 152 |  |  |  |  |  |  | else{ # 2nd mate; reverse strand since the fragment it belongs to is ALWAYS located | 
| 153 |  |  |  |  |  |  | # on the other strand | 
| 154 |  |  |  |  |  |  | if($verbose == 1) {print STDERR "SECOND_MATE\t".$strand." ";} | 
| 155 |  |  |  |  |  |  | if ( $strand eq "1" ) { | 
| 156 |  |  |  |  |  |  | $bam_neg->write1($read); | 
| 157 |  |  |  |  |  |  | if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} | 
| 158 |  |  |  |  |  |  | if ($reverse == 0){ | 
| 159 |  |  |  |  |  |  | $data{count}{neg}++;$eff_strand=$strand; | 
| 160 |  |  |  |  |  |  | if ($want_bed){printf $bed_neg "%s\n", "-";} | 
| 161 |  |  |  |  |  |  | } | 
| 162 |  |  |  |  |  |  | else { | 
| 163 |  |  |  |  |  |  | $data{count}{pos}++; $eff_strand=-1*$strand; | 
| 164 |  |  |  |  |  |  | if ($want_bed){printf $bed_neg "%s\n", "+";} | 
| 165 |  |  |  |  |  |  | } | 
| 166 |  |  |  |  |  |  | } | 
| 167 |  |  |  |  |  |  | elsif ( $strand eq "-1" ) { | 
| 168 |  |  |  |  |  |  | $bam_pos->write1($read); | 
| 169 |  |  |  |  |  |  | if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} | 
| 170 |  |  |  |  |  |  | if ($reverse == 0){ | 
| 171 |  |  |  |  |  |  | $data{count}{pos}++;$eff_strand=$strand; | 
| 172 |  |  |  |  |  |  | if ($want_bed){printf $bed_pos "%s\n", "+";} | 
| 173 |  |  |  |  |  |  | } | 
| 174 |  |  |  |  |  |  | else { | 
| 175 |  |  |  |  |  |  | $data{count}{neg}++;$eff_strand=-1*$strand; | 
| 176 |  |  |  |  |  |  | if ($want_bed){printf $bed_pos "%s\n", "-";} | 
| 177 |  |  |  |  |  |  | } | 
| 178 |  |  |  |  |  |  | } | 
| 179 |  |  |  |  |  |  | else {croak "Strand neither + nor - ...exiting!\n";} | 
| 180 |  |  |  |  |  |  | } | 
| 181 |  |  |  |  |  |  | } | 
| 182 |  |  |  |  |  |  | else { # single-end | 
| 183 |  |  |  |  |  |  | if($verbose == 1) {print STDERR "se\t";} | 
| 184 |  |  |  |  |  |  | $data{count}{se_alis}++; | 
| 185 |  |  |  |  |  |  | if ( $read->strand eq "1" ){ | 
| 186 |  |  |  |  |  |  | $bam_pos->write1($read); | 
| 187 |  |  |  |  |  |  | if ($want_bed){printf $bed_pos "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} | 
| 188 |  |  |  |  |  |  | if ($reverse == 0){ | 
| 189 |  |  |  |  |  |  | $data{count}{pos}++;$eff_strand=$strand; | 
| 190 |  |  |  |  |  |  | if ($want_bed){printf $bed_pos "%s\n", "+";} | 
| 191 |  |  |  |  |  |  | } | 
| 192 |  |  |  |  |  |  | else { | 
| 193 |  |  |  |  |  |  | $data{count}{neg}++;$eff_strand=-1*$strand; | 
| 194 |  |  |  |  |  |  | if ($want_bed){printf $bed_pos "%s\n", "-";} | 
| 195 |  |  |  |  |  |  | } | 
| 196 |  |  |  |  |  |  | } | 
| 197 |  |  |  |  |  |  | elsif ($read->strand eq "-1") { | 
| 198 |  |  |  |  |  |  | $bam_neg->write1($read); | 
| 199 |  |  |  |  |  |  | if ($want_bed){printf $bed_neg "%s\t%d\t%d\t%s\t%d\t",$seq_id,eval($start-1),$stop,$id,$score;} | 
| 200 |  |  |  |  |  |  | if ($reverse == 0){ | 
| 201 |  |  |  |  |  |  | $data{count}{neg}++;$eff_strand=$strand; | 
| 202 |  |  |  |  |  |  | if ($want_bed){printf $bed_neg "%s\n", "-";} | 
| 203 |  |  |  |  |  |  | } | 
| 204 |  |  |  |  |  |  | else { | 
| 205 |  |  |  |  |  |  | $data{count}{pos}++;$eff_strand=-1*$strand; | 
| 206 |  |  |  |  |  |  | if ($want_bed){printf $bed_neg "%s\n", "+";} | 
| 207 |  |  |  |  |  |  | } | 
| 208 |  |  |  |  |  |  | } | 
| 209 |  |  |  |  |  |  | else {croak "Strand neither + nor - ...exiting!\n";} | 
| 210 |  |  |  |  |  |  | } | 
| 211 |  |  |  |  |  |  | if($verbose == 1) {print STDERR "--> ".$eff_strand."\t";} | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | # collect statistics of SAM flags | 
| 214 |  |  |  |  |  |  | $flag = $read->flag; | 
| 215 |  |  |  |  |  |  | unless (exists $data{flag}{$flag}){ | 
| 216 |  |  |  |  |  |  | $data{flag}{$flag} = 0; | 
| 217 |  |  |  |  |  |  | } | 
| 218 |  |  |  |  |  |  | $data{flag}{$flag}++; | 
| 219 |  |  |  |  |  |  | if ($verbose == 1) {print STDERR "\n";} | 
| 220 |  |  |  |  |  |  | } # end while | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | rename ($tmp_bam_pos, $bamname_pos); | 
| 223 |  |  |  |  |  |  | rename ($tmp_bam_neg, $bamname_neg); | 
| 224 |  |  |  |  |  |  | push (@processed_files, ($bamname_pos,$bamname_neg)); | 
| 225 |  |  |  |  |  |  | push (@processed_files, ($data{count}{pos},$data{count}{neg})); | 
| 226 |  |  |  |  |  |  | if ($want_bed){ | 
| 227 |  |  |  |  |  |  | push (@processed_files, ($bedname_pos,$bedname_neg)) | 
| 228 |  |  |  |  |  |  | } | 
| 229 |  |  |  |  |  |  |  | 
| 230 |  |  |  |  |  |  | # error checks | 
| 231 |  |  |  |  |  |  | unless ($data{count}{pe_alis} + $data{count}{se_alis} == $data{count}{cur}) { | 
| 232 |  |  |  |  |  |  | printf "ERROR:  paired-end + single-end alignments != total alignment count\n"; | 
| 233 |  |  |  |  |  |  | print Dumper(\%data); | 
| 234 |  |  |  |  |  |  | croak $!; | 
| 235 |  |  |  |  |  |  | } | 
| 236 |  |  |  |  |  |  | unless ($data{count}{pos} + $data{count}{neg} == $data{count}{cur}) { | 
| 237 |  |  |  |  |  |  | printf STDERR "%20d fragments on [+] strand\n",$data{count}{pos}; | 
| 238 |  |  |  |  |  |  | printf STDERR "%20d fragments on [-] strand\n",$data{count}{neg}; | 
| 239 |  |  |  |  |  |  | printf STDERR "%20d sum\n",eval($data{count}{pos}+$data{count}{neg}); | 
| 240 |  |  |  |  |  |  | printf STDERR "%20d cur_count (should be)\n",$data{count}{cur}; | 
| 241 |  |  |  |  |  |  | printf STDERR "ERROR: pos alignments + neg alignments != total alignments\n"; | 
| 242 |  |  |  |  |  |  | print Dumper(\%data); | 
| 243 |  |  |  |  |  |  | croak $!; | 
| 244 |  |  |  |  |  |  | } | 
| 245 |  |  |  |  |  |  | foreach (keys %{$data{flag}}){ | 
| 246 |  |  |  |  |  |  | $data{count}{flag} += $data{flag}{$_}; | 
| 247 |  |  |  |  |  |  | } | 
| 248 |  |  |  |  |  |  | unless ($data{count}{flag} == $data{count}{cur}){ | 
| 249 |  |  |  |  |  |  | printf STDERR "%20d alignments considered\n",$data{count}{cur}; | 
| 250 |  |  |  |  |  |  | printf STDERR "%20d alignments found in flag statistics\n",$data{count}{flag}; | 
| 251 |  |  |  |  |  |  | printf STDERR "ERROR: #considered alignments != #alignments from flag stat\n"; | 
| 252 |  |  |  |  |  |  | print Dumper(\%data); | 
| 253 |  |  |  |  |  |  | croak $!; | 
| 254 |  |  |  |  |  |  | } | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | # logging output | 
| 257 |  |  |  |  |  |  | printf LOG "# bam_split.pl log for file \'$bamfile\'\n"; | 
| 258 |  |  |  |  |  |  | printf LOG "#-----------------------------------------------------------------\n"; | 
| 259 |  |  |  |  |  |  | printf LOG "%20d total alignments (unique & multi-mapper)\n",$data{count}{total}; | 
| 260 |  |  |  |  |  |  | printf LOG "%20d unique-mappers (%6.2f%% of total)\n", | 
| 261 |  |  |  |  |  |  | $data{count}{uniq},eval(100*$data{count}{uniq}/$data{count}{total}) ; | 
| 262 |  |  |  |  |  |  | printf LOG "%20d multi-mappers  (%6.2f%% of total)\n", | 
| 263 |  |  |  |  |  |  | $data{count}{mult},eval(100*$data{count}{mult}/$data{count}{total}); | 
| 264 |  |  |  |  |  |  | printf LOG "%20d multi-mappers skipped\n", $data{count}{skip}; | 
| 265 |  |  |  |  |  |  | printf LOG "%20d alignments considered\n", $data{count}{cur}; | 
| 266 |  |  |  |  |  |  | printf LOG "%20d paired-end\n", $data{count}{pe_alis}; | 
| 267 |  |  |  |  |  |  | printf LOG "%20s single-end\n", $data{count}{se_alis}; | 
| 268 |  |  |  |  |  |  | printf LOG "%20d fragments on [+] strand  (%6.2f%% of considered)\n", | 
| 269 |  |  |  |  |  |  | $data{count}{pos},eval(100*$data{count}{pos}/$data{count}{cur}); | 
| 270 |  |  |  |  |  |  | printf LOG "%20d fragments on [-] strand  (%6.2f%% of considered)\n", | 
| 271 |  |  |  |  |  |  | $data{count}{neg},eval(100*$data{count}{neg}/$data{count}{cur}); | 
| 272 |  |  |  |  |  |  | printf LOG "#-----------------------------------------------------------------\n"; | 
| 273 |  |  |  |  |  |  | printf LOG "Dumper output:\n". Dumper(\%data); | 
| 274 |  |  |  |  |  |  | close(LOG); | 
| 275 |  |  |  |  |  |  | return @processed_files; | 
| 276 |  |  |  |  |  |  | } | 
| 277 |  |  |  |  |  |  |  | 
| 278 |  |  |  |  |  |  | sub uniquify_bam { | 
| 279 |  |  |  |  |  |  | my ($bamfile,$dest,$log) = @_; | 
| 280 |  |  |  |  |  |  | my ($bam, $bn,$path,$ext,$read,$header); | 
| 281 |  |  |  |  |  |  | my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult); | 
| 282 |  |  |  |  |  |  | my ($count_all,$count_uniq,$count_mult) = (0)x3; | 
| 283 |  |  |  |  |  |  | my @processed_files = (); | 
| 284 |  |  |  |  |  |  | my $this_function = (caller(0))[3]; | 
| 285 |  |  |  |  |  |  |  | 
| 286 |  |  |  |  |  |  | croak "ERROR [$this_function] Cannot find $bamfile\n" | 
| 287 |  |  |  |  |  |  | unless (-e $bamfile); | 
| 288 |  |  |  |  |  |  | croak "ERROR [$this_function] $dest does not exist\n" | 
| 289 |  |  |  |  |  |  | unless (-d $dest); | 
| 290 |  |  |  |  |  |  |  | 
| 291 |  |  |  |  |  |  | ($bn,$path,$ext) = fileparse($bamfile, qr /\..*/); | 
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  | (undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0); | 
| 294 |  |  |  |  |  |  | (undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0); | 
| 295 |  |  |  |  |  |  |  | 
| 296 |  |  |  |  |  |  | $bam     = Bio::DB::Bam->open($bamfile, "r"); | 
| 297 |  |  |  |  |  |  | $fn_uniq = file($dest,$bn.".uniq".$ext); | 
| 298 |  |  |  |  |  |  | $fn_mult = file($dest,$bn.".mult".$ext); | 
| 299 |  |  |  |  |  |  | $header  = $bam->header; # TODO: modify header, leave traces ... | 
| 300 |  |  |  |  |  |  |  | 
| 301 |  |  |  |  |  |  | $bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w') | 
| 302 |  |  |  |  |  |  | or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; | 
| 303 |  |  |  |  |  |  | $bam_mult = Bio::DB::Bam->open($tmp_mult,'w') | 
| 304 |  |  |  |  |  |  | or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; | 
| 305 |  |  |  |  |  |  | $bam_uniq->header_write($header); | 
| 306 |  |  |  |  |  |  | $bam_mult->header_write($header); | 
| 307 |  |  |  |  |  |  |  | 
| 308 |  |  |  |  |  |  | while ($read = $bam->read1() ) { | 
| 309 |  |  |  |  |  |  | $count_all++; | 
| 310 |  |  |  |  |  |  | if ($read->aux_get("NH") == 1){ # uniquely mapped reads | 
| 311 |  |  |  |  |  |  | $bam_uniq->write1($read); | 
| 312 |  |  |  |  |  |  | $count_uniq++; | 
| 313 |  |  |  |  |  |  | } | 
| 314 |  |  |  |  |  |  | else { # multiply mapped reads | 
| 315 |  |  |  |  |  |  | $bam_mult->write1($read); | 
| 316 |  |  |  |  |  |  | $count_mult++; | 
| 317 |  |  |  |  |  |  | } | 
| 318 |  |  |  |  |  |  | } | 
| 319 |  |  |  |  |  |  |  | 
| 320 |  |  |  |  |  |  | croak "ERROR [$this_function] Read counts don't match\n" | 
| 321 |  |  |  |  |  |  | unless ($count_uniq + $count_mult == $count_all); | 
| 322 |  |  |  |  |  |  |  | 
| 323 |  |  |  |  |  |  | rename ($tmp_uniq, $fn_uniq); | 
| 324 |  |  |  |  |  |  | rename ($tmp_mult, $fn_mult); | 
| 325 |  |  |  |  |  |  | push (@processed_files, ($fn_uniq,$fn_mult)); | 
| 326 |  |  |  |  |  |  |  | 
| 327 |  |  |  |  |  |  | if (defined $log){ | 
| 328 |  |  |  |  |  |  | my $lf = file($dest,$log); | 
| 329 |  |  |  |  |  |  | open(LOG, ">>", $lf) or croak $!; | 
| 330 |  |  |  |  |  |  | printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n", | 
| 331 |  |  |  |  |  |  | $count_all,$count_uniq,$count_mult; | 
| 332 |  |  |  |  |  |  | close(LOG); | 
| 333 |  |  |  |  |  |  | } | 
| 334 |  |  |  |  |  |  | } | 
| 335 |  |  |  |  |  |  |  | 
| 336 |  |  |  |  |  |  |  | 
| 337 |  |  |  |  |  |  | 1; | 
| 338 |  |  |  |  |  |  | __END__ | 
| 339 |  |  |  |  |  |  |  | 
| 340 |  |  |  |  |  |  |  | 
| 341 |  |  |  |  |  |  | =head1 NAME | 
| 342 |  |  |  |  |  |  |  | 
| 343 |  |  |  |  |  |  | Bio::ViennaNGS::Bam - High-level access to BAM files | 
| 344 |  |  |  |  |  |  |  | 
| 345 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | use Bio::ViennaNGS::Bam; | 
| 348 |  |  |  |  |  |  |  | 
| 349 |  |  |  |  |  |  | # split a single-end  or paired-end BAM file by strands | 
| 350 |  |  |  |  |  |  | @result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile); | 
| 351 |  |  |  |  |  |  |  | 
| 352 |  |  |  |  |  |  | # extract unique and multi mappers from a BAM file | 
| 353 |  |  |  |  |  |  | @result = uniquify_bam($bam_in,$outdir,$logfile); | 
| 354 |  |  |  |  |  |  |  | 
| 355 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 356 |  |  |  |  |  |  |  | 
| 357 |  |  |  |  |  |  | Bio::ViennaNGS::BAM provides high-level access to BAM file. Building | 
| 358 |  |  |  |  |  |  | on L<Bio::DB::Sam>, it provides code to extract specific portions from | 
| 359 |  |  |  |  |  |  | BAM files. It comes with routines for splitting BAM files by strand | 
| 360 |  |  |  |  |  |  | (which is often required for visualization of NGS data) and extracting | 
| 361 |  |  |  |  |  |  | uniquely and multiply aligned reads from BAM files. | 
| 362 |  |  |  |  |  |  |  | 
| 363 |  |  |  |  |  |  | =head2 ROUTINES | 
| 364 |  |  |  |  |  |  |  | 
| 365 |  |  |  |  |  |  | =over | 
| 366 |  |  |  |  |  |  |  | 
| 367 |  |  |  |  |  |  | =item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log) | 
| 368 |  |  |  |  |  |  |  | 
| 369 |  |  |  |  |  |  | Splits BAM file $bam according to [+] and [-] strand. C<$reverse>, | 
| 370 |  |  |  |  |  |  | C<$want_uniq> and C<$want_bed> are switches with values of 0 or 1, | 
| 371 |  |  |  |  |  |  | triggering forced reversion of strand mapping (due to RNA-seq protocol | 
| 372 |  |  |  |  |  |  | constraints), filtering of unique mappers (identified via NH:i:1 SAM | 
| 373 |  |  |  |  |  |  | argument), and forced output of a BED file corresponding to | 
| 374 |  |  |  |  |  |  | strand-specific mapping, respectively. C<$log> holds name and path of | 
| 375 |  |  |  |  |  |  | the log file. | 
| 376 |  |  |  |  |  |  |  | 
| 377 |  |  |  |  |  |  | Strand-splitting is done in a way that in paired-end alignments, FIRST | 
| 378 |  |  |  |  |  |  | and SECOND mates (reads) are treated as _one_ fragment, ie FIRST_MATE | 
| 379 |  |  |  |  |  |  | reads determine the strand, while SECOND_MATE reads are assigned the | 
| 380 |  |  |  |  |  |  | opposite strand I<per definitionem>. This also holds if the reads are | 
| 381 |  |  |  |  |  |  | not mapped in proper pairs and even if there is no mapping partner at | 
| 382 |  |  |  |  |  |  | all. | 
| 383 |  |  |  |  |  |  |  | 
| 384 |  |  |  |  |  |  | Sometimes the library preparation protocol causes inversion of the | 
| 385 |  |  |  |  |  |  | read assignment (with respect to the underlying annotation). In those | 
| 386 |  |  |  |  |  |  | cases, the natural mapping of the reads can be obtained by the | 
| 387 |  |  |  |  |  |  | C<$reverse> flag. | 
| 388 |  |  |  |  |  |  |  | 
| 389 |  |  |  |  |  |  | This routine returns an array whose fist two elements are the file | 
| 390 |  |  |  |  |  |  | names of the newly generate BAM files with reads mapped to the | 
| 391 |  |  |  |  |  |  | positive, and negative strand, respectively. Elements three and four | 
| 392 |  |  |  |  |  |  | are the number of fragments mapped to the positive and negative | 
| 393 |  |  |  |  |  |  | strand. If the C<$want_bed> option was given elements fiveand six are | 
| 394 |  |  |  |  |  |  | the file names of the output BED files for positive and negative | 
| 395 |  |  |  |  |  |  | strand, respectively. | 
| 396 |  |  |  |  |  |  |  | 
| 397 |  |  |  |  |  |  | NOTE: Filtering of unique mappers is only safe for single-end | 
| 398 |  |  |  |  |  |  | experiments; In paired-end experiments, read and mate are treated | 
| 399 |  |  |  |  |  |  | separately, thus allowing for scenarios where eg. one read is a | 
| 400 |  |  |  |  |  |  | multi-mapper, whereas its associate mate is a unique mapper, resulting | 
| 401 |  |  |  |  |  |  | in an ambiguous alignment of the entire fragment. | 
| 402 |  |  |  |  |  |  |  | 
| 403 |  |  |  |  |  |  | =item uniquify_bam($bam,$dest,$log) | 
| 404 |  |  |  |  |  |  |  | 
| 405 |  |  |  |  |  |  | Extract I<uniquely> and I<multiply> aligned reads from BAM file | 
| 406 |  |  |  |  |  |  | C<$bam> by means of the I<NH:i:> SAM attribute. New BAM files for | 
| 407 |  |  |  |  |  |  | unique and multi mappers are created in the output folder C<$dest>, | 
| 408 |  |  |  |  |  |  | which are named B<basename.uniq.bam> and B<basename.mult.bam>, | 
| 409 |  |  |  |  |  |  | respectively. If defined, a logfile named C<$log> is created in the | 
| 410 |  |  |  |  |  |  | output folder. | 
| 411 |  |  |  |  |  |  |  | 
| 412 |  |  |  |  |  |  | This routine returns an array holding file names of the newly created | 
| 413 |  |  |  |  |  |  | BAM files for unique and multi mappers, respectively. | 
| 414 |  |  |  |  |  |  |  | 
| 415 |  |  |  |  |  |  | NOTE: Not all short read mappers use the I<NH:i:> SAM attribute to | 
| 416 |  |  |  |  |  |  | decorate unique and multi mappers. As such, this routine will not work | 
| 417 |  |  |  |  |  |  | unless your BAM file has these attributes. | 
| 418 |  |  |  |  |  |  |  | 
| 419 |  |  |  |  |  |  | =head1 DEPENDENCIES | 
| 420 |  |  |  |  |  |  |  | 
| 421 |  |  |  |  |  |  | =over | 
| 422 |  |  |  |  |  |  |  | 
| 423 |  |  |  |  |  |  | =item  L<Bio::Perl> >= 1.00690001 | 
| 424 |  |  |  |  |  |  |  | 
| 425 |  |  |  |  |  |  | =item  L<BIO::DB::Sam> >= 1.37 | 
| 426 |  |  |  |  |  |  |  | 
| 427 |  |  |  |  |  |  | =item  L<File::Basename> | 
| 428 |  |  |  |  |  |  |  | 
| 429 |  |  |  |  |  |  | =item  L<File::Temp> | 
| 430 |  |  |  |  |  |  |  | 
| 431 |  |  |  |  |  |  | =item  L<Path::Class> | 
| 432 |  |  |  |  |  |  |  | 
| 433 |  |  |  |  |  |  | =item  L<Carp> | 
| 434 |  |  |  |  |  |  |  | 
| 435 |  |  |  |  |  |  | =back | 
| 436 |  |  |  |  |  |  |  | 
| 437 |  |  |  |  |  |  | =head1 AUTHORS | 
| 438 |  |  |  |  |  |  |  | 
| 439 |  |  |  |  |  |  | =over | 
| 440 |  |  |  |  |  |  |  | 
| 441 |  |  |  |  |  |  | =item Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> | 
| 442 |  |  |  |  |  |  |  | 
| 443 |  |  |  |  |  |  | =back | 
| 444 |  |  |  |  |  |  |  | 
| 445 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 446 |  |  |  |  |  |  |  | 
| 447 |  |  |  |  |  |  | Copyright (C) 2015 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt> | 
| 448 |  |  |  |  |  |  |  | 
| 449 |  |  |  |  |  |  | This library is free software; you can redistribute it and/or modify | 
| 450 |  |  |  |  |  |  | it under the same terms as Perl itself, either Perl version 5.10.0 or, | 
| 451 |  |  |  |  |  |  | at your option, any later version of Perl 5 you may have available. | 
| 452 |  |  |  |  |  |  |  | 
| 453 |  |  |  |  |  |  | This software is distributed in the hope that it will be useful, but | 
| 454 |  |  |  |  |  |  | WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 455 |  |  |  |  |  |  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | 
| 456 |  |  |  |  |  |  |  | 
| 457 |  |  |  |  |  |  | =cut |