File Coverage

blib/lib/Bio/RNA/SpliceSites/Scoring/MaxEntScan.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package Bio::RNA::SpliceSites::Scoring::MaxEntScan;
2              
3 1     1   12424 use 5.008;
  1         2  
  1         31  
4 1     1   4 use strict;
  1         1  
  1         24  
5 1     1   3 use warnings;
  1         4  
  1         23  
6 1     1   3 use Carp;
  1         1  
  1         59  
7              
8             #Data submodules; these names are from the original maxEntScan distribution.
9 1     1   284074 use Bio::RNA::SpliceSites::Scoring::SpliceModels::me2x3acc;
  0            
  0            
10             use Bio::RNA::SpliceSites::Scoring::SpliceModels::me2x5;
11             use Bio::RNA::SpliceSites::Scoring::SpliceModels::splice5sequences;
12              
13             require Exporter;
14              
15             our @ISA = qw/ Exporter /;
16              
17             my $functions = [ qw/ score5 score3 / ];
18             our %EXPORT_TAGS = ( 'all' => $functions , );
19             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
20             our $VERSION = '0.04';
21              
22             sub log2 {
23             my $number = shift;
24             log($number)/log(2);
25             }
26              
27             sub is_scalar_reference {
28             my $reference_to_validate = shift;
29             ref( $reference_to_validate ) eq "SCALAR" ? 1 : 0;
30             }
31              
32             sub is_kmer {
33             my ( $reference_to_validate , $valid_length ) = @_;
34             length $$reference_to_validate == $valid_length ? 1 : 0;
35             }
36              
37             sub is_genetic_alphabet {
38             my $reference_to_validate = shift;
39             $$reference_to_validate =~ /^[ACTGactg]+$/ ? 1 : 0;
40             }
41              
42             sub split_sequence {
43             #Split the provided splice site sequence into the splice donor/acceptor dinucleotide and the concatenated remainder.
44             my ( $reference_to_sequence , $splice_site_type ) = @_;
45             my @sequence_array = split // , uc( $$reference_to_sequence );
46             my ( $dinucleotide , $outer_portion );
47             if ( $splice_site_type == 5 ) {
48             $dinucleotide = join "" , @sequence_array[3..4]; #Positions 4 and 5 are the splice donor dinucleotide.
49             $outer_portion = ( join "" , @sequence_array[0..2] ) . ( join "" , @sequence_array[5..8] );
50             }
51             elsif ( $splice_site_type == 3 ) {
52             $dinucleotide = join "" , @sequence_array[18..19]; #Positions 19 and 20 are the splice acceptor dinucleotide.
53             $outer_portion = ( join "" , @sequence_array[0..17] ) . ( join "" , @sequence_array[20..22] ); #Join nucleotides 1-18 and 21-23.
54             }
55             else {
56             croak "Invalid type of splice site to split: must be either 5 or 3.\n";
57             }
58             return ( $dinucleotide , $outer_portion );
59             }
60              
61             sub get_splice_5_sequence_matrix_value {
62             my $outer_portion = shift;
63             exists $Bio::RNA::SpliceSites::Scoring::SpliceModels::splice5sequences::table->{ $outer_portion } ?
64             return $Bio::RNA::SpliceSites::Scoring::SpliceModels::splice5sequences::table->{ $outer_portion } : carp "Unable to find sequence matrix for key $outer_portion.\n";
65             }
66              
67             sub get_splice_5_score_matrix_value {
68             my $sequence_matrix_value = shift; #The term "matrix" was used in the original maxEntScan programming and is retained here for clarity.
69             exists $Bio::RNA::SpliceSites::Scoring::SpliceModels::me2x5::table->[ $sequence_matrix_value ] ?
70             return $Bio::RNA::SpliceSites::Scoring::SpliceModels::me2x5::table->[ $sequence_matrix_value ] : carp "Index out of score matrix range: $sequence_matrix_value\n";
71             }
72              
73             sub score_consensus {
74             my ( $dinucleotide , $type ) = @_;
75             my %bgd = ( 'A' => 0.27 , 'C' => 0.23 , 'G' => 0.23 , 'T' => 0.27 ); #Shared between each type of splice site.
76             my ( %cons1 , %cons2 ); #Populate conditional to splice site type: donor or acceptor.
77              
78             if ( $type == 5 ) {
79             return 15.7507349436393 if $dinucleotide eq 'GT'; #Short circuit for the perfect GT splice donor dinucleotide.
80             %cons1 = ( 'A' => 0.004 , 'C' => 0.0032 , 'G' => 0.9896 , 'T' => 0.0032 );
81             %cons2 = ( 'A' => 0.0034 , 'C' => 0.0039 , 'G' => 0.0042 , 'T' => 0.9884 );
82             }
83             elsif ( $type == 3 ) {
84             #return ????? if $dinucleotide eq 'AG'; #Short circuit for perfect AG splice acceptor dinucleotide.
85             %cons1 = ( 'A' => 0.9903 , 'C' => 0.0032 , 'G' => 0.0034 , 'T' => 0.003 );
86             %cons2 = ( 'A' => 0.0027 , 'C' => 0.0037 , 'G' => 0.9905 , 'T' => 0.003 );
87             }
88             else {
89             croak "Invalid type of splice site to score consensus for: must be either 5 or 3.\n";
90             }
91              
92             my ( $first_nucleotide , $second_nucleotide ) = split // , $dinucleotide;
93             return ( $cons1{$first_nucleotide} * $cons2{$second_nucleotide} ) / ( $bgd{$first_nucleotide} * $bgd{$second_nucleotide} );
94             }
95              
96             sub score5 {
97             my $sequence_reference = shift;
98             #Validate argument:
99             unless ( is_scalar_reference( $sequence_reference ) ) {
100             carp "Not a scalar reference.\n";
101             return 'invalid_invocation';
102             }
103             unless ( is_kmer( $sequence_reference , 9 ) ) {
104             carp "Invalid 5'ss length: must be 9 nucleotides long.\n";
105             return 'invalid_length';
106             }
107             unless ( is_genetic_alphabet( $sequence_reference ) ) {
108             carp "Invalid alphabet: must be only [ACTGactg] with no 'n' nucleotides.\n";
109             return 'invalid_alphabet';
110             }
111              
112             my ( $dinucleotide , $outer_portion ) = split_sequence( $sequence_reference , 5 );
113              
114             #Compute the log2 of the product of the score_5_consensus() subroutine return for the entire sequence (left_side_of_product)
115             # and the me2x5 (score matrix) value for the sequence matrix value of the "outer portion" of the splice site, which is the heptamer with the donor dinucleotide removed
116             # from the splice site nonamer, which is assigned to the scalar variable $outer_portion
117              
118             my $score = log2( score_consensus( $dinucleotide , 5 ) * get_splice_5_score_matrix_value( get_splice_5_sequence_matrix_value( $outer_portion ) ) );
119             return sprintf( "%.2f" , $score ); #2 decimals.
120             }
121              
122             sub hash_seq {
123             #Returns a hash key for a sequence as a 4-radix integer.
124             #E.g. given sequence 'CAGAAGT', returns 4619 as a scalar.
125             my $sequence = shift;
126             $sequence=~ y/ACGT/0123/;
127             my @sequence_array = split // , $sequence;
128             my $sum = 0;
129             my $end = length( $sequence ) - 1;
130              
131             my @four_radix = qw/ 1 4 16 64 256 1024 4096 16384 /;
132             $sum += $sequence_array[$_] * $four_radix[ $end - $_ ] for 0 .. $end;
133             return $sum;
134             }
135              
136             sub get_max_ent_score {
137             my ( $sequence , $table_ref ) = @_; #Table ref is a reference to an array of references to hashes prepared in the me2x3acc submodule.
138             my @partial_score = ( $table_ref->[0]{ hash_seq( substr $sequence , 0 , 7 ) } ,
139             $table_ref->[1]{ hash_seq( substr $sequence , 7 , 7 ) } ,
140             $table_ref->[2]{ hash_seq( substr $sequence , 14 , 7 ) } ,
141             $table_ref->[3]{ hash_seq( substr $sequence , 4 , 7 ) } ,
142             $table_ref->[4]{ hash_seq( substr $sequence , 11 , 7 ) } ,
143             $table_ref->[5]{ hash_seq( substr $sequence , 4 , 3 ) } ,
144             $table_ref->[6]{ hash_seq( substr $sequence , 7 , 4 ) } ,
145             $table_ref->[7]{ hash_seq( substr $sequence , 11 , 3 ) } ,
146             $table_ref->[8]{ hash_seq( substr $sequence , 14 , 4 ) } );
147             my $final_score = $partial_score[0] * $partial_score[1] * $partial_score[2] * $partial_score[3] * $partial_score[4] /
148             ( $partial_score[5] * $partial_score[6] * $partial_score[7] * $partial_score[8] );
149             return $final_score;
150             }
151              
152             sub score3 {
153             my $sequence_reference = shift; #Should be 23nt long.
154             #Validate argument:
155             unless ( is_scalar_reference( $sequence_reference ) ) {
156             carp "Not a scalar reference.\n";
157             return 'invalid_invocation';
158             }
159             unless ( is_kmer( $sequence_reference , 23 ) ) {
160             carp "Invalid 3'ss length: must be 23nt long.\n";
161             return 'invalid_length';
162             }
163             unless ( is_genetic_alphabet( $sequence_reference ) ) {
164             carp "Invalid alphabet: must be only [ACTGactg] with no 'n' nucleotides.\n";
165             return 'invalid_alphabet';
166             }
167              
168             my ( $dinucleotide , $outer_portion ) = split_sequence( $sequence_reference , 3 );
169             my $score = log2( score_consensus( $dinucleotide , 3 ) * get_max_ent_score( $outer_portion , $Bio::RNA::SpliceSites::Scoring::SpliceModels::me2x3acc::table ) );
170             return sprintf( "%.2f" , $score ); #2 decimals.
171             }
172              
173             1;
174              
175             __END__