File Coverage

blib/lib/Bio/ViennaNGS/SpliceJunc.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2014-10-03 00:09:06 mtw>
3              
4             package Bio::ViennaNGS::SpliceJunc;
5              
6 1     1   84055 use Exporter;
  1         2  
  1         51  
7 1     1   6 use version; our $VERSION = qv('0.04');
  1         2  
  1         8  
8 1     1   79 use strict;
  1         6  
  1         31  
9 1     1   5 use warnings;
  1         2  
  1         26  
10 1     1   775 use Data::Dumper;
  1         9838  
  1         79  
11 1     1   304 use Bio::ViennaNGS;
  0            
  0            
12             use IPC::Cmd qw(can_run run);
13             use Path::Class;
14             use Carp;
15              
16             our @ISA = qw(Exporter);
17             our @EXPORT = ();
18             our @EXPORT_OK = qw(bed6_ss_from_bed12 bed6_ss_from_rnaseq
19             intersect_sj ss_isCanonical);
20              
21             # bed6_ss_from_bed12( $bed12,$dest,$window,$can,$fastaobjR )
22             #
23             # Extracts splice junctions from BED12 annotation.
24             #
25             # Writes a BED6 file for each transcript found in the BED12, listing
26             # all splice sites of this transcript, optionally flanking it with a
27             # window of +/-$window nt.
28             sub bed6_ss_from_bed12{
29             my ($bed12,$dest,$window,$can,$fastaobjR) = @_;
30             my ($i,$tr_name,$pos5,$pos3);
31             my $splicesites = 0;
32             my $c = 0;
33             my @bedline = ();
34             my $this_function = (caller(0))[3];
35              
36             croak "ERROR [$this_function] $bed12 does not exists\n"
37             unless (-e $bed12);
38             croak "ERROR [$this_function] $dest does not exist"
39             unless (-d $dest);
40              
41             open(BED12IN, "< $bed12") or croak $!;
42             while(){
43             chomp;
44             my ($chr,$chromStart,$chromEnd,$name,$score,$strand,$thickStart,
45             $thickEnd,$itemRgb,$blockCount,$blockSizes,$blockStarts) = split("\t");
46             my @blockSize = split(/,/,$blockSizes);
47             my @blockStart = split(/,/,$blockStarts);
48             unless (scalar @blockSize == scalar @blockStart){
49             croak "ERROR: unequal element count in blockStarts and blockSizes\n";
50             }
51             my $fn = sprintf("%s_%d-%d_%s.annotatedSS.bed6",$chr,$chromStart,$chromEnd,$name);
52             my $bed6_fn = file($dest,$fn);
53             my $tr_count = 1;
54              
55             if ($blockCount >1){ # only transcripts with 2 or more exons (!!!)
56              
57             open(BED6OUT, "> $bed6_fn") or croak "cannot open BED6OUT $!";
58              
59             for ($i=0;$i<$blockCount-1;$i++){
60             $pos5 = $chromStart+$blockStart[$i]+$blockSize[$i];
61             $pos3 = $chromStart+$blockStart[$i+1];
62             if($can){$c = ss_isCanonical($chr,$pos5,$pos3,$fastaobjR);}
63             $tr_name = sprintf("%s.%02d",$name,$tr_count++);
64             @bedline = join("\t",$chr,eval($pos5-$window),
65             eval($pos5+$window),$tr_name,$c,$strand);
66             print BED6OUT "@bedline\n";
67             @bedline = join("\t",$chr,eval($pos3-$window),
68             eval($pos3+$window),$tr_name,$c,$strand);
69             print BED6OUT "@bedline\n";
70             } # end for
71              
72             close(BED6OUT);
73             } # end if
74             } # end while
75             close(BED12IN);
76             }
77              
78             # bed6_ss_from_rnaseq ($bed_in,$dest,$window,$mincov,$can,$fastaobjR)
79             #
80             # Extracts splice junctions from mapped RNA-seq data
81             #
82             # Writes a BED6 file for each splice junction present in the input,
83             # optionally flanking it with a window of +/-$window nt. Only splice
84             # junctions supported by at least $mcov reads are considered.
85             sub bed6_ss_from_rnaseq{
86             my ($bed_in,$dest,$window,$mcov,$can,$fastaobjR) = @_;
87             my ($reads,$proper,$passed,$pos5,$pos3);
88             my $c = 0;
89             my @bedline = ();
90             my $this_function = (caller(0))[3];
91              
92             croak "ERROR [$this_function] $bed_in does not exist\n"
93             unless (-e $bed_in);
94             croak "ERROR [$this_function] $dest does not exist"
95             unless (-d $dest);
96              
97             open(INBED, "< $bed_in") or croak $!;
98             while(){
99             chomp;
100             my ($chr, $start, $end, $info, $score, $strand) = split("\t");
101             $end = $end-2; # required for segemehl's BED6 files
102              
103             if ($info =~ /^splits\:(\d+)\:(\d+)\:(\d+):(\w):(\w)/){
104             $reads = $1;
105             $proper = $4;
106             $passed = $5;
107             next unless ($proper eq 'N');
108             next unless ($passed =~ /[PFM]$/);
109             next if($reads < $mcov);
110             }
111             else {
112             croak "ERROR [$this_function] unsupported INFO field in input BED:\n$info\n";
113             }
114             $pos5 = $start;
115             $pos3 = $end;
116             if($can){$c = ss_isCanonical($chr,$pos5,$pos3,$fastaobjR);}
117             my $fn = sprintf("%s_%d-%d.mappedSS.bed6",$chr,$start,$end);
118             my $bed6_fn = file($dest,$fn);
119             open(BED6OUT, "> $bed6_fn");
120             @bedline = join("\t",$chr,eval($start-$window),
121             eval($start+$window),$info,$c,$strand);
122             print BED6OUT "@bedline\n";
123             @bedline = join("\t",$chr,eval($end-$window),
124             eval($end+$window),$info,$c,$strand);
125             print BED6OUT "@bedline\n";
126             close(BED6OUT);
127             }
128             close(INBED);
129             }
130              
131             # intersect_sj($p_annot,$p_mapped,$dest,$prefix,$window,$mil)
132             #
133             # Intersect splice junctions determined by RNA-seq with annotated
134             # splice junctions. Determine novel and existing splice junctions.
135             #
136             # Writes BED6 files for existing and novel splice junctions to $dest
137             # and reurns an array with the absolute path to the two resulting BED
138             # files
139             sub intersect_sj{
140             my ($p_annot,$p_mapped,$dest,$prefix,$window,$mil) = @_;
141             my ($dha,$dhm);
142             my $processed = 0;
143             my @junctions = ();
144             my @transcript_beds = ();
145             my %asj = (); # annotated splice junctions hash
146             my $bedtools = can_run('bedtools') or croak "bedtools not found";
147             my $sortBed = can_run('sortBed') or croak "sortBed not found";
148             my $this_function = (caller(0))[3];
149              
150             croak "ERROR [$this_function] $p_annot does not exist\n"
151             unless (-d $p_annot);
152             croak "ERROR [$this_function] $p_mapped does not exist\n"
153             unless (-d $p_mapped);
154             croak "ERROR [$this_function] $dest does not exist\n"
155             unless (-d $dest);
156              
157             # get a list of all files in $p_annot
158             opendir($dha, $p_annot) or croak "Cannot opendir $p_annot: $!";
159             while(readdir $dha) { push @transcript_beds, $_; }
160             closedir($dha);
161              
162             # get a list of all splice junctions seen in RNA-seq data
163             opendir($dhm, $p_mapped) or croak "Cannot opendir $p_mapped: $!";
164             @junctions = grep { /^(chr\d+)\_(\d+)-(\d+)\.mappedSS\.bed6/ } readdir($dhm);
165              
166             # process splice junctions seen in RNA-seq data
167             foreach my $file (@junctions){
168             $processed++;
169             croak "Unexpected file name pattern\n" unless ($file =~ /(chr\d+)\_(\d+)-(\d+)/);
170             my $sc = $1;
171             my $s5 = $2;
172             my $s3 = $3;
173             my $pattern = $sc."_".$s5."-".$s3;
174              
175             my @annotated_beds = grep { /^(chr\d+)\_(\d+)-(\d+)/ && $2<=$s5 && $3>=$s3 && $sc eq $1} @transcript_beds;
176             #print"\t intersecting against ".(eval $#annotated_beds+1)." transcripts: @annotated_beds \n";
177              
178             # intersect currently opened SJ against all transcripts in @annotated_beds
179             foreach my $i (@annotated_beds){
180             my $a = file($p_mapped,$file);
181             my $b = file($p_annot,$i);
182             my $intersect_cmd = "$bedtools intersect -a $a -b $b -c -nobuf";
183             open(INTERSECT, $intersect_cmd."|");
184             while(){
185             chomp;
186             if ($_ =~ /1$/) { $asj{$pattern} = 1;}
187             }
188             close(INTERSECT);
189             }
190             if ( $processed % 1000 == 0){
191             print STDERR "processed $processed splice junctions\n";
192             }
193             }
194             closedir($dhm);
195              
196             # go through the mapped splice junctions files once again and
197             # separate novel from existing splice junctions
198             if (length($prefix)>0){$prefix .= ".";}
199             my $outname_exist = file($dest, $prefix."exist.SS.bed");
200             my $outname_exist_u = file($dest, $prefix."exist.SS.u.bed");
201             my $outname_novel = file($dest, $prefix."novel.SS.bed");
202             my $outname_novel_u = file($dest, $prefix."novel.SS.u.bed");
203             open (EXISTOUT, "> $outname_exist_u");
204             open (NOVELOUT, "> $outname_novel_u");
205              
206             # write new ones to NOVELOUT; existing ones to EXISTOUT
207             foreach my $file (@junctions){
208             if ($file =~ m/^(chr\d+\_\d+-\d+)/){
209             my $pattern = $1;
210             my $fn = file($p_mapped,$file);
211             open (SJ, "< $fn") or croak "Cannot open $fn $!";
212             while(){
213             chomp;
214             $_ = m/^(chr\w+)\s(\d+)\s(\d+)\s(splits:\d+:\d+:\d+:\w:\w)\s(\d+)\s([+-01])/;
215             my $chr = $1;
216             my $start = $2;
217             my $end = $3;
218             my $name = $4;
219             my $score = $5;
220             my $strand = $6;
221             my @bedline = join("\t",$chr,eval($start+$window),
222             eval($start+$window+1),$name,$score,$strand);
223             if (exists $asj{$pattern}){ # annotated splice junction
224             print EXISTOUT "@bedline\n";
225             }
226             else { # novel splice junction
227             print NOVELOUT "@bedline\n";
228             }
229             } # end while
230             close(SJ);
231             } # end if
232             else{ carp "Error with parsing BED6 junction file names __FILE__ __LINE__\n";}
233             }
234             close(EXISTOUT);
235             close(NOVELOUT);
236              
237             # sort the resulting bed files
238             my $cmd = "$bedtools sort -i $outname_exist_u > $outname_exist";
239             my ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
240             run( command => $cmd, verbose => 0 );
241             if( !$success ) {
242             print STDERR "ERROR [$this_function] Call to $bedtools unsuccessful\n";
243             print STDERR "ERROR: this is what the command printed:\n";
244             print join "", @$full_buf;
245             croak $!;
246             }
247             unlink($outname_exist_u);
248              
249             $cmd = "$bedtools sort -i $outname_novel_u > $outname_novel";
250             ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
251             run( command => $cmd, verbose => 0 );
252             if( !$success ) {
253             print STDERR "ERROR [$this_function] Call to $bedtools unsuccessful\n";
254             print STDERR "ERROR: this is what the command printed:\n";
255             print join "", @$full_buf;
256             croak $!;
257             }
258             unlink($outname_novel_u);
259              
260             printf STDERR "processed $processed splice junctions\n";
261             my @result = ();
262             push(@result, $outname_exist);
263             push(@result, $outname_novel);
264             return @result;
265             }
266              
267             # ss_isCanonical ( $chr,$p5,$p3,$fastaobjR )
268              
269             # Checks whether a given splice junction is canonical, ie. whether the
270             # first and last two nucleotides of the enclosed intron correspond to
271             # a certain nucleotide motif. $chr is the chromosome name, $p5 and $p3
272             # the 5' and 3' ends of the splice junction and $fastaobjR is a
273             # Bio::PrimarySeq::Fasta object holding the underlying reference
274             # genome.
275             #
276             # Th most common canonical splice junction motif is GT-AG (shown
277             # below). Other canonical motifs are GC->AG and AT->AC.
278             #
279             # ------------------>
280             # 5'===]GT..........AG[====3'
281             #
282             # <-----------------
283             # 3'===]GA..........TG[====5'
284             #
285             sub ss_isCanonical{
286             my ($chr,$p5,$p3,$fastaobjR) = @_;
287             my ($seqL,$seqR,$pattern);
288             my %fastaO = %$fastaobjR;
289             my $ss_motif_length = 2;
290             my $this_function = (caller(0))[3];
291             my $c = -1;
292              
293             $seqL = get_stranded_subsequence($fastaO{$chr},$p5+1,$p5+$ss_motif_length,"+");
294             $seqR = get_stranded_subsequence($fastaO{$chr},$p3-$ss_motif_length+1,$p3,"+");
295              
296             $pattern = sprintf("%s|%s",$seqL,$seqR);
297             #print STDERR "[$this_function] p5->p3 ($p5 -- $p3) $seqL -- $seqR\n";
298              
299             if ($pattern =~ /^GT|AG$/) { $c = 1000;}
300             elsif ($pattern =~ /^CT|AC$/) { $c = 1000; }
301             elsif ($pattern =~ /^GC|AG$/) { $c = 1000; }
302             elsif ($pattern =~ /^CT|GC$/) { $c = 1000; }
303             elsif ($pattern =~ /^AT|AC$/) { $c = 1000; }
304             elsif ($pattern =~ /^GT|AT$/) { $c = 1000; }
305             else { $c = 0;}
306              
307             return $c;
308             }
309              
310             1;
311             __END__