File Coverage

blib/lib/Bio/ViennaNGS/SpliceJunc.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2014-11-27 15:38:43 mtw>
3              
4             package Bio::ViennaNGS::SpliceJunc;
5              
6 1     1   71305 use Exporter;
  1         2  
  1         46  
7 1     1   5 use version; our $VERSION = qv('0.05');
  1         1  
  1         8  
8 1     1   71 use strict;
  1         6  
  1         31  
9 1     1   4 use warnings;
  1         1  
  1         28  
10 1     1   642 use Data::Dumper;
  1         8111  
  1         70  
11 1     1   304 use Bio::ViennaNGS;
  0            
  0            
12             use Bio::ViennaNGS::Fasta;
13             use IPC::Cmd qw(can_run run);
14             use File::Basename;
15             use Path::Class;
16             use Carp;
17              
18             our @ISA = qw(Exporter);
19             our @EXPORT = ();
20             our @EXPORT_OK = qw(bed6_ss_from_bed12 bed6_ss_from_rnaseq
21             bed6_ss_to_bed12 intersect_sj ss_isCanonical);
22              
23             # bed6_ss_from_bed12( $bed12,$dest,$window,$can,$fastaO)
24             #
25             # Extracts splice junctions from BED12 annotation.
26             #
27             # Writes a BED6 file for each transcript found in the BED12, listing
28             # all splice sites of this transcript, optionally flanking it with a
29             # window of +/-$window nt.
30             sub bed6_ss_from_bed12{
31             my ($bed12,$dest,$window,$can,$fastaobjR) = @_;
32             my ($i,$tr_name,$pos5,$pos3);
33             my ($splicesites,$c,$totalsj,$cansj) = 0x4;
34             my @bedline = ();
35             my $this_function = (caller(0))[3];
36              
37             croak "ERROR [$this_function] $bed12 does not exists\n"
38             unless (-e $bed12);
39             croak "ERROR [$this_function] $dest does not exist"
40             unless (-d $dest);
41              
42             open(BED12IN, "< $bed12") or croak $!;
43             while(){
44             chomp;
45             my ($chr,$chromStart,$chromEnd,$name,$score,$strand,$thickStart,
46             $thickEnd,$itemRgb,$blockCount,$blockSizes,$blockStarts) = split("\t");
47             my @blockSize = split(/,/,$blockSizes);
48             my @blockStart = split(/,/,$blockStarts);
49             unless (scalar @blockSize == scalar @blockStart){
50             croak "ERROR: unequal element count in blockStarts and blockSizes\n";
51             }
52             my $fn = sprintf("%s_%d-%d_%s.annotatedSS.bed6",$chr,$chromStart,$chromEnd,$name);
53             my $bed6_fn = file($dest,$fn);
54             my $tr_count = 1;
55              
56             if ($blockCount >1){ # only transcripts with 2 or more exons (!!!)
57              
58             open(BED6OUT, "> $bed6_fn") or croak "cannot open BED6OUT $!";
59              
60             for ($i=0;$i<$blockCount-1;$i++){
61             $totalsj++;
62             $pos5 = $chromStart+$blockStart[$i]+$blockSize[$i];
63             $pos3 = $chromStart+$blockStart[$i+1];
64             if($can){
65             $c = ss_isCanonical($chr,$pos5,$pos3,$fastaobjR);
66             $cansj++;
67             }
68             $tr_name = sprintf("%s.%02d",$name,$tr_count++);
69             @bedline = join("\t",$chr,eval($pos5-$window),
70             eval($pos5+$window),$tr_name,$c,$strand);
71             print BED6OUT "@bedline\n";
72             @bedline = join("\t",$chr,eval($pos3-$window),
73             eval($pos3+$window),$tr_name,$c,$strand);
74             print BED6OUT "@bedline\n";
75             } # end for
76              
77             close(BED6OUT);
78             } # end if
79             } # end while
80             close(BED12IN);
81             }
82              
83             # bed6_ss_from_rnaseq ($bed_in,$dest,$window,$mincov,$can,$fastaO)
84             #
85             # Extracts splice junctions from mapped RNA-seq data
86             #
87             # Writes a BED6 file for each splice junction present in the input,
88             # optionally flanking it with a window of +/-$window nt. Only splice
89             # junctions supported by at least $mcov reads are considered.
90             sub bed6_ss_from_rnaseq{
91             my ($bed_in,$dest,$window,$mcov,$can,$fastaobjR) = @_;
92             my ($reads,$proper,$passed,$pos5,$pos3);
93             my $c = 0;
94             my @bedline = ();
95             my $this_function = (caller(0))[3];
96              
97             croak "ERROR [$this_function] $bed_in does not exist\n"
98             unless (-e $bed_in);
99             croak "ERROR [$this_function] $dest does not exist"
100             unless (-d $dest);
101              
102             open(INBED, "< $bed_in") or croak $!;
103             while(){
104             chomp;
105             my ($chr, $start, $end, $info, $score, $strand) = split("\t");
106             $end = $end-2; # required for segemehl's BED6 files
107              
108             if ($info =~ /^splits\:(\d+)\:(\d+)\:(\d+):(\w):(\w)/){
109             $reads = $1;
110             $proper = $4;
111             $passed = $5;
112             next unless ($proper eq 'N');
113             next unless ($passed =~ /[PFM]$/);
114             next if($reads < $mcov);
115             }
116             else {
117             croak "ERROR [$this_function] unsupported INFO field in input BED:\n$info\n";
118             }
119             $pos5 = $start;
120             $pos3 = $end;
121             if($can){$c = ss_isCanonical($chr,$pos5,$pos3,$fastaobjR);}
122             my $fn = sprintf("%s_%d-%d.mappedSS.bed6",$chr,$start,$end);
123             my $bed6_fn = file($dest,$fn);
124             open(BED6OUT, "> $bed6_fn");
125             @bedline = join("\t",$chr,eval($start-$window),
126             eval($start+$window),$info,$c,$strand);
127             print BED6OUT "@bedline\n";
128             @bedline = join("\t",$chr,eval($end-$window),
129             eval($end+$window),$info,$c,$strand);
130             print BED6OUT "@bedline\n";
131             close(BED6OUT);
132             }
133             close(INBED);
134             }
135              
136             # bed6_ss_to_bed12 ($bed_in,$dest,$window,$mcov)
137             #
138             # Produce BED12 from BED6 file holdig splice junctions from mapped
139             # RNA-seq data
140             sub bed6_ss_to_bed12{
141             my ($bed_in,$dest,$window,$mcov,$circ) = @_;
142             my ($reads,$proper,$passed,$pos5,$pos3,$basename,$fn,$bed12_fn);
143             my @result = ();
144             my @bedline = ();
145             my $this_function = (caller(0))[3];
146              
147             croak "ERROR [$this_function] $bed_in does not exist\n"
148             unless (-e $bed_in);
149             croak "ERROR [$this_function] $dest does not exist"
150             unless (-d $dest);
151              
152             $basename = fileparse($bed_in,qr/\.[^.]*/);
153             $fn = $basename.".bed12";
154             $bed12_fn = file($dest,$fn);
155             open(BED12OUT, "> $bed12_fn");
156              
157             open(INBED, "< $bed_in") or croak $!;
158             while(){
159             chomp;
160             my ($chr, $start, $end, $info, $score, $strand) = split("\t");
161              
162             if ($info =~ /^splits\:(\d+)\:(\d+)\:(\d+):(\w):(\w)/){
163             $reads = $1;
164             $proper = $4;
165             $passed = $5;
166             if ($circ == 1){ # skip 'N' (normal), 'L' (left) and 'R' (right)
167             next unless ($proper =~ /[C]$/);
168             }
169             else { # skip 'L', 'R' and 'C'
170             next unless ($proper =~ /[N]$/);
171             }
172             next unless ($passed =~ /[PM]$/); # ignore 'F'
173             next if($reads < $mcov);
174             }
175             else {
176             croak "ERROR [$this_function] unsupported INFO field in input BED:\n$info\n";
177             }
178             $pos5 = $start;
179             $pos3 = $end;
180              
181             @bedline = join("\t",$chr,eval($start-$window),
182             eval($end+$window),$info,$score,$strand,$start,$end,
183             "0","2","1,1","0,".eval($end-$start-1));
184             print BED12OUT "@bedline\n";
185             }
186             close(INBED);
187             close(BED12OUT);
188              
189             push (@result, $bed12_fn);
190             return @result;
191             }
192              
193             # intersect_sj($p_annot,$p_mapped,$dest,$prefix,$window,$mil)
194             #
195             # Intersect splice junctions determined by RNA-seq with annotated
196             # splice junctions. Determine novel and existing splice junctions.
197             #
198             # Writes BED6 files for existing and novel splice junctions to $dest
199             # and reurns an array with the absolute path to the two resulting BED
200             # files
201             sub intersect_sj{
202             my ($p_annot,$p_mapped,$dest,$prefix,$window,$mil) = @_;
203             my ($dha,$dhm);
204             my $processed = 0;
205             my @junctions = ();
206             my @transcript_beds = ();
207             my %asj = (); # annotated splice junctions hash
208             my $bedtools = can_run('bedtools') or croak "bedtools not found";
209             my $sortBed = can_run('sortBed') or croak "sortBed not found";
210             my $this_function = (caller(0))[3];
211              
212             croak "ERROR [$this_function] $p_annot does not exist\n"
213             unless (-d $p_annot);
214             croak "ERROR [$this_function] $p_mapped does not exist\n"
215             unless (-d $p_mapped);
216             croak "ERROR [$this_function] $dest does not exist\n"
217             unless (-d $dest);
218              
219             # get a list of all files in $p_annot
220             opendir($dha, $p_annot) or croak "Cannot opendir $p_annot: $!";
221             while(readdir $dha) { push @transcript_beds, $_; }
222             closedir($dha);
223              
224             # get a list of all splice junctions seen in RNA-seq data
225             opendir($dhm, $p_mapped) or croak "Cannot opendir $p_mapped: $!";
226             @junctions = grep { /^(chr\d+)\_(\d+)-(\d+)\.mappedSS\.bed6/ } readdir($dhm);
227              
228             # process splice junctions seen in RNA-seq data
229             foreach my $file (@junctions){
230             $processed++;
231             croak "Unexpected file name pattern\n" unless ($file =~ /(chr\d+)\_(\d+)-(\d+)/);
232             my $sc = $1;
233             my $s5 = $2;
234             my $s3 = $3;
235             my $pattern = $sc."_".$s5."-".$s3;
236              
237             my @annotated_beds = grep { /^(chr\d+)\_(\d+)-(\d+)/ && $2<=$s5 && $3>=$s3 && $sc eq $1} @transcript_beds;
238             #print"\t intersecting against ".(eval $#annotated_beds+1)." transcripts: @annotated_beds \n";
239              
240             # intersect currently opened SJ against all transcripts in @annotated_beds
241             foreach my $i (@annotated_beds){
242             my $a = file($p_mapped,$file);
243             my $b = file($p_annot,$i);
244             my $intersect_cmd = "$bedtools intersect -a $a -b $b -c -nobuf";
245             open(INTERSECT, $intersect_cmd."|");
246             while(){
247             chomp;
248             if ($_ =~ /1$/) { $asj{$pattern} = 1;}
249             }
250             close(INTERSECT);
251             }
252             if ( $processed % 1000 == 0){
253             print STDERR "processed $processed splice junctions\n";
254             }
255             }
256             closedir($dhm);
257              
258             # go through the mapped splice junctions files once again and
259             # separate novel from existing splice junctions
260             if (length($prefix)>0){$prefix .= ".";}
261             my $outname_exist = file($dest, $prefix."exist.SS.bed");
262             my $outname_exist_u = file($dest, $prefix."exist.SS.u.bed");
263             my $outname_novel = file($dest, $prefix."novel.SS.bed");
264             my $outname_novel_u = file($dest, $prefix."novel.SS.u.bed");
265             open (EXISTOUT, "> $outname_exist_u");
266             open (NOVELOUT, "> $outname_novel_u");
267              
268             # write new ones to NOVELOUT; existing ones to EXISTOUT
269             foreach my $file (@junctions){
270             if ($file =~ m/^(chr\d+\_\d+-\d+)/){
271             my $pattern = $1;
272             my $fn = file($p_mapped,$file);
273             open (SJ, "< $fn") or croak "Cannot open $fn $!";
274             while(){
275             chomp;
276             $_ = m/^(chr\w+)\s(\d+)\s(\d+)\s(splits:\d+:\d+:\d+:\w:\w)\s(\d+)\s([+-01])/;
277             my $chr = $1;
278             my $start = $2;
279             my $end = $3;
280             my $name = $4;
281             my $score = $5;
282             my $strand = $6;
283             my @bedline = join("\t",$chr,eval($start+$window),
284             eval($start+$window+1),$name,$score,$strand);
285             if (exists $asj{$pattern}){ # annotated splice junction
286             print EXISTOUT "@bedline\n";
287             }
288             else { # novel splice junction
289             print NOVELOUT "@bedline\n";
290             }
291             } # end while
292             close(SJ);
293             } # end if
294             else{ carp "Error with parsing BED6 junction file names __FILE__ __LINE__\n";}
295             }
296             close(EXISTOUT);
297             close(NOVELOUT);
298              
299             # sort the resulting bed files
300             my $cmd = "$bedtools sort -i $outname_exist_u > $outname_exist";
301             my ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
302             run( command => $cmd, verbose => 0 );
303             if( !$success ) {
304             print STDERR "ERROR [$this_function] Call to $bedtools unsuccessful\n";
305             print STDERR "ERROR: this is what the command printed:\n";
306             print join "", @$full_buf;
307             croak $!;
308             }
309             unlink($outname_exist_u);
310              
311             $cmd = "$bedtools sort -i $outname_novel_u > $outname_novel";
312             ( $success, $error_message, $full_buf, $stdout_buf, $stderr_buf ) =
313             run( command => $cmd, verbose => 0 );
314             if( !$success ) {
315             print STDERR "ERROR [$this_function] Call to $bedtools unsuccessful\n";
316             print STDERR "ERROR: this is what the command printed:\n";
317             print join "", @$full_buf;
318             croak $!;
319             }
320             unlink($outname_novel_u);
321              
322             printf STDERR "processed $processed splice junctions\n";
323             my @result = ();
324             push(@result, $outname_exist);
325             push(@result, $outname_novel);
326             return @result;
327             }
328              
329             # ss_isCanonical ( $chr,$p5,$p3,$fastaO )
330              
331             # Checks whether a given splice junction is canonical, ie. whether the
332             # first and last two nucleotides of the enclosed intron correspond to
333             # a certain nucleotide motif. $chr is the chromosome name, $p5 and $p3
334             # the 5' and 3' ends of the splice junction and $fastaobjR is a
335             # Bio::PrimarySeq::Fasta object holding the underlying reference
336             # genome.
337             #
338             # Th most common canonical splice junction motif is GT-AG (shown
339             # below). Other canonical motifs are GC->AG and AT->AC.
340             #
341             # ------------------>
342             # 5'===]GT..........AG[====3'
343             #
344             # <-----------------
345             # 3'===]GA..........TG[====5'
346             #
347             sub ss_isCanonical{
348             my ($chr,$p5,$p3,$fo) = @_;
349             my ($seqL,$seqR,$pattern);
350             my $ss_motif_length = 2;
351             my $this_function = (caller(0))[3];
352             my $c = -1;
353              
354             $seqL = $fo->stranded_subsequence($chr,$p5+1,$p5+$ss_motif_length,"+");
355             $seqR = $fo->stranded_subsequence($chr,$p3-$ss_motif_length+1,$p3,"+");
356              
357             $pattern = sprintf("%s|%s",$seqL,$seqR);
358             #print STDERR "[$this_function] p5->p3 ($p5 -- $p3) $seqL -- $seqR\n";
359              
360             if ($pattern =~ /^GT|AG$/) { $c = 1000;}
361             elsif ($pattern =~ /^CT|AC$/) { $c = 1000; }
362             elsif ($pattern =~ /^GC|AG$/) { $c = 1000; }
363             elsif ($pattern =~ /^CT|GC$/) { $c = 1000; }
364             elsif ($pattern =~ /^AT|AC$/) { $c = 1000; }
365             elsif ($pattern =~ /^GT|AT$/) { $c = 1000; }
366             else { $c = 0;}
367              
368             return $c;
369             }
370              
371             1;
372             __END__