File Coverage

Bio/Matrix/PSM/SiteMatrix.pm
Criterion Covered Total %
statement 375 556 67.4
branch 131 256 51.1
condition 67 115 58.2
subroutine 24 28 85.7
pod 21 21 100.0
total 618 976 63.3


line stmt bran cond sub pod time code
1             #---------------------------------------------------------
2              
3             =head1 NAME
4              
5             Bio::Matrix::PSM::SiteMatrix - SiteMatrixI implementation, holds a
6             position scoring matrix (or position weight matrix) and log-odds
7              
8             =head1 SYNOPSIS
9              
10             use Bio::Matrix::PSM::SiteMatrix;
11             # Create from memory by supplying probability matrix hash
12             # both as strings or arrays
13             # where the frequencies $a,$c,$g and $t are supplied either as
14             # arrayref or string. Accordingly, lA, lC, lG and lT are the log
15             # odds (only as arrays, no checks done right now)
16             my ($a,$c,$g,$t,$score,$ic, $mid)=@_;
17             #or
18             my ($a,$c,$g,$t,$score,$ic,$mid)=('05a011','110550','400001',
19             '100104',0.001,19.2,'CRE1');
20             #Where a stands for all (this frequency=1), see explanation bellow
21             my %param=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,
22             -lA=>$la, -lC=>$lc,-lG=>$lg,-lT=>$l,
23             -IC=>$ic,-e_val=>$score, -id=>$mid);
24             my $site=Bio::Matrix::PSM::SiteMatrix->new(%param);
25             #Or get it from a file:
26             use Bio::Matrix::PSM::IO;
27             my $psmIO= Bio::Matrix::PSM::IO->new(-file=>$file, -format=>'transfac');
28             while (my $psm=$psmIO->next_psm) {
29             #Now we have a Bio::Matrix::PSM::Psm object,
30             # see Bio::Matrix::PSM::PsmI for details
31             #This is a Bio::Matrix::PSM::SiteMatrix object now
32             my $matrix=$psm->matrix;
33             }
34              
35             # Get a simple consensus, where alphabet is {A,C,G,T,N},
36             # choosing the character that both satisfies a supplied or default threshold
37             # frequency and is the most frequenct character at each position, or N.
38             # So for the position with A, C, G, T frequencies of 0.5, 0.25, 0.10, 0.15,
39             # the simple consensus character will be 'A', whilst for 0.5, 0.5, 0, 0 it
40             # would be 'N'.
41             my $consensus=$site->consensus;
42              
43             # Get the IUPAC ambiguity code representation of the data in the matrix.
44             # Because the frequencies may have been pseudo-count corrected, insignificant
45             # frequences (below 0.05 by default) are ignored. So a position with
46             # A, C, G, T frequencies of 0.5, 0.5, 0.01, 0.01 will get the IUPAC code 'M',
47             # while 0.97, 0.01, 0.01, 0.01 will get the code 'A' and
48             # 0.25, 0.25, 0.25, 0.25 would get 'N'.
49             my $iupac=$site->IUPAC;
50              
51             # Getting/using regular expression (a representation of the IUPAC string)
52             my $regexp=$site->regexp;
53             my $count=grep($regexp,$seq);
54             my $count=($seq=~ s/$regexp/$1/eg);
55             print "Motif $mid is present $count times in this sequence\n";
56              
57             =head1 DESCRIPTION
58              
59             SiteMatrix is designed to provide some basic methods when working with position
60             scoring (weight) matrices, such as transcription factor binding sites for
61             example. A DNA PSM consists of four vectors with frequencies {A,C,G,T}. This is
62             the minimum information you should provide to construct a PSM object. The
63             vectors can be provided as strings with frequenciesx10 rounded to an int, going
64             from {0..a} and 'a' represents the maximum (10). This is like MEME's compressed
65             representation of a matrix and it is quite useful when working with relational
66             DB. If arrays are provided as an input (references to arrays actually) they can
67             be any number, real or integer (frequency or count).
68              
69             When creating the object you can ask the constructor to make a simple pseudo
70             count correction by adding a number (typically 1) to all positions (with the
71             -correction option). After adding the number the frequencies will be
72             calculated. Only use correction when you supply counts, not frequencies.
73              
74             Throws an exception if: You mix as an input array and string (for example A
75             matrix is given as array, C - as string). The position vector is (0,0,0,0). One
76             of the probability vectors is shorter than the rest.
77              
78             Summary of the methods I use most frequently (details bellow):
79              
80             iupac - return IUPAC compliant consensus as a string
81             score - Returns the score as a real number
82             IC - information content. Returns a real number
83             id - identifier. Returns a string
84             accession - accession number. Returns a string
85             next_pos - return the sequence probably for each letter, IUPAC
86             symbol, IUPAC probability and simple sequence
87             consenus letter for this position. Rewind at the end. Returns a hash.
88             pos - current position get/set. Returns an integer.
89             regexp - construct a regular expression based on IUPAC consensus.
90             For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
91             width - site width
92             get_string - gets the probability vector for a single base as a string.
93             get_array - gets the probability vector for a single base as an array.
94             get_logs_array - gets the log-odds vector for a single base as an array.
95              
96             New methods, which might be of interest to anyone who wants to store
97             PSM in a relational database without creating an entry for each
98             position is the ability to compress the PSM vector into a string with
99             losing usually less than 1% of the data. this can be done with:
100              
101             my $str=$matrix->get_compressed_freq('A');
102             or
103             my $str=$matrix->get_compressed_logs('A');
104              
105             Loading from a database should be done with new, but is not yest implemented.
106             However you can still uncompress such string with:
107              
108             my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
109             or
110             my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
111              
112             =head1 FEEDBACK
113              
114             =head2 Mailing Lists
115              
116             User feedback is an integral part of the evolution of this and other
117             Bioperl modules. Send your comments and suggestions preferably to one
118             of the Bioperl mailing lists. Your participation is much appreciated.
119              
120             bioperl-l@bioperl.org - General discussion
121             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
122              
123             =head2 Support
124              
125             Please direct usage questions or support issues to the mailing list:
126              
127             I
128              
129             rather than to the module maintainer directly. Many experienced and
130             reponsive experts will be able look at the problem and quickly
131             address it. Please include a thorough description of the problem
132             with code and data examples if at all possible.
133              
134             =head2 Reporting Bugs
135              
136             Report bugs to the Bioperl bug tracking system to help us keep track
137             the bugs and their resolution. Bug reports can be submitted via the
138             web:
139              
140             https://github.com/bioperl/bioperl-live/issues
141              
142             =head1 AUTHOR - Stefan Kirov
143              
144             Email skirov@utk.edu
145              
146             =head1 CONTRIBUTORS
147              
148             Sendu Bala, bix@sendu.me.uk
149              
150             =head1 APPENDIX
151              
152             The rest of the documentation details each of the object methods.
153             Internal methods are usually preceded with a _
154              
155             =cut
156              
157             # Let the code begin...
158             package Bio::Matrix::PSM::SiteMatrix;
159 5     5   528 use strict;
  5         8  
  5         157  
160              
161 5     5   18 use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
  5         5  
  5         1763  
162              
163             =head2 new
164              
165             Title : new
166             Usage : my $site=Bio::Matrix::PSM::SiteMatrix->new(-pA=>$a,-pC=>$c,
167             -pG=>$g,-pT=>$t,
168             -IC=>$ic,
169             -e_val=>$score,
170             -id=>$mid);
171             Function: Creates a new Bio::Matrix::PSM::SiteMatrix object from memory
172             Throws : If inconsistent data for all vectors (A,C,G and T) is
173             provided, if you mix input types (string vs array) or if a
174             position freq is 0.
175             Returns : Bio::Matrix::PSM::SiteMatrix object
176             Args : -pA => vector with the frequencies or counts of A
177             -pC => vector for C
178             -pG => vector for G
179             -pt => vector for T
180             -lA => vector for the log of A
181             -lC => vector for the log of C
182             -lG => vector for the log of G
183             -lT => vector for the log of T
184             -IC => real number, the information content of this matrix
185             -e_val => real number, the expect value
186             -id => string, an identifier
187             -width => int, width of the matrix in nucleotides
188             -sites => int, the number of sites that went into this matrix
189             -model => hash ref, background frequencies for A, C, G and T
190             -correction => number, the number to add to all positions to achieve
191             psuedo count correction (default 0: no correction)
192             NB: do not use correction when your input is
193             frequences!
194             -accession_number => string, an accession number
195              
196             Vectors can be strings of the frequencies where the frequencies are
197             multiplied by 10 and rounded to the nearest whole number, and where
198             'a' is used to denote the maximal frequency 10. There should be no
199             punctuation (spaces etc.) in the string. For example, 'a0501'.
200             Alternatively frequencies or counts can be represented by an array
201             ref containing the counts, frequencies or logs as any kind of
202             number.
203              
204             =cut
205              
206             sub new {
207 14     14 1 190 my ($class, @args) = @_;
208 14         62 my $self = $class->SUPER::new(@args);
209 14         13 my $consensus;
210             # Too many things to rearrange, and I am creating simultanuously >500
211             # such objects routinely, so this becomes performance issue
212             my %input;
213 14         37 while (@args) {
214 114         151 (my $key = shift @args) =~ s/-//g; #deletes all dashes (only dashes)!
215 114         189 $input{$key} = shift @args;
216             }
217 14         23 $self->{_position} = 0;
218 14         27 $self->{IC} = $input{IC};
219 14         24 $self->{e_val} = $input{e_val};
220 14         20 $self->{width} = $input{width};
221 14         18 $self->{logA} = $input{lA};
222 14         18 $self->{logC} = $input{lC};
223 14         27 $self->{logG} = $input{lG};
224 14         23 $self->{logT} = $input{lT};
225 14         18 $self->{sites} = $input{sites};
226 14   100     37 $self->{id} = $input{id} || 'null';
227 14   50     45 $self->{correction} = $input{correction} || 0;
228 14         15 $self->{accession_number} = $input{accession_number};
229 14 50 66     105 return $self unless (defined($input{pA}) && defined($input{pC}) && defined($input{pG}) && defined($input{pT}));
      66        
      33        
230            
231             # This should go to _initialize?
232             # Check for input type- no mixing alllowed, throw ex
233 12 100       49 if (ref($input{pA}) =~ /ARRAY/i ) {
234 11 50       21 $self->throw("Mixing matrix data types not allowed: C is not reference") unless(ref($input{pC}));
235 11 50       20 $self->throw("Mixing matrix data types not allowed: G is not reference") unless (ref($input{pG}));
236 11 50       20 $self->throw("Mixing matrix data types not allowed: T is not reference") unless (ref($input{pT}));
237 11         14 $self->{probA} = $input{pA};
238 11         13 $self->{probC} = $input{pC};
239 11         24 $self->{probG} = $input{pG};
240 11         14 $self->{probT} = $input{pT};
241             }
242             else {
243 1 50       3 $self->throw("Mixing matrix data types not allowed: C is reference") if (ref($input{pC}));
244 1 50       3 $self->throw("Mixing matrix data types not allowed: G is reference") if (ref($input{pG}));
245 1 50       3 $self->throw("Mixing matrix data types not allowed: T is reference") if (ref($input{pT}));
246 1         5 $self->{probA} = [split(//,$input{pA})];
247 1         4 $self->{probC} = [split(//,$input{pC})];
248 1         5 $self->{probG} = [split(//,$input{pG})];
249 1         4 $self->{probT} = [split(//,$input{pT})];
250 1         2 for (my $i=0; $i<= @{$self->{probA}}+1; $i++) {
  8         16  
251             # we implictely assume these are MEME-style frequencies x 10, so
252             # 'a' represents the 'maximum', 10. Other positions can actually
253             # add up to over 10 due to rounding, but I don't think that is a
254             # problem?
255 7 100 100     4 if (${$self->{probA}}[$i] and ${$self->{probA}}[$i] eq 'a') {
  7         14  
  3         8  
256 1         2 ${$self->{probA}}[$i]='10';
  1         3  
257             }
258 7 100 100     5 if (${$self->{probC}}[$i] and ${$self->{probC}}[$i] eq 'a') {
  7         17  
  3         8  
259 1         1 ${$self->{probC}}[$i]='10';
  1         1  
260             }
261 7 50 66     7 if (${$self->{probG}}[$i] and ${$self->{probG}}[$i] eq 'a') {
  7         13  
  3         7  
262 0         0 ${$self->{probG}}[$i]='10';
  0         0  
263             }
264 7 50 66     6 if (${$self->{probT}}[$i] and ${$self->{probT}}[$i] eq 'a') {
  7         12  
  2         6  
265 0         0 ${$self->{probT}}[$i]='10';
  0         0  
266             }
267             }
268             }
269            
270             # Check for position with 0 for all bases, throw exception if so
271 12         18 for (my $i=0;$i <= $#{$self->{probA}}; $i++) {
  279         349  
272 267 50       149 if ((${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i]) == 0) {
  267         200  
  267         270  
  267         217  
  267         370  
273 0         0 $self->throw("Position meaningless-all frequencies are 0");
274             }
275            
276             # apply psuedo-count correction to all values - this will result in
277             # very bad frequencies if the input is already frequences and a
278             # correction value as large as 1 is used!
279 267 50       324 if ($self->{correction}) {
280 0         0 ${$self->{probA}}[$i] += $self->{correction};
  0         0  
281 0         0 ${$self->{probC}}[$i] += $self->{correction};
  0         0  
282 0         0 ${$self->{probG}}[$i] += $self->{correction};
  0         0  
283 0         0 ${$self->{probT}}[$i] += $self->{correction};
  0         0  
284             }
285            
286             # (re)calculate frequencies
287 267         158 my $div= ${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i];
  267         210  
  267         196  
  267         195  
  267         227  
288 267         174 ${$self->{probA}}[$i]=${$self->{probA}}[$i]/$div;
  267         193  
  267         213  
289 267         162 ${$self->{probC}}[$i]=${$self->{probC}}[$i]/$div;
  267         203  
  267         200  
290 267         159 ${$self->{probG}}[$i]=${$self->{probG}}[$i]/$div;
  267         182  
  267         185  
291 267         173 ${$self->{probT}}[$i]=${$self->{probT}}[$i]/$div;
  267         247  
  267         193  
292             }
293            
294             # Calculate the logs
295 12 50 66     42 if ((!defined($self->{logA})) && ($input{model})) {
296 0         0 $self->calc_weight($input{model});
297             }
298            
299             # Make consensus, throw if any one of the vectors is shorter
300 12         35 $self->_calculate_consensus;
301 12         69 return $self;
302             }
303              
304             =head2 _calculate_consensus
305              
306             Title : _calculate_consensus
307             Function: Internal stuff
308              
309             =cut
310              
311             sub _calculate_consensus {
312 12     12   14 my $self=shift;
313 12         12 my ($lc,$lt,$lg)=($#{$self->{probC}},$#{$self->{probT}},$#{$self->{probG}});
  12         18  
  12         11  
  12         20  
314 12         11 my $len=$#{$self->{probA}};
  12         11  
315 12 50       29 $self->throw("Probability matrix is damaged for C: $len vs $lc") if ($len != $lc);
316 12 50       27 $self->throw("Probability matrix is damaged for T: $len vs $lt") if ($len != $lt);
317 12 50       18 $self->throw("Probability matrix is damaged for G: $len vs $lg") if ($len != $lg);
318 12         31 for (my $i=0; $i<$len+1; $i++) {
319             #*** IUPACp values not actually used (eg. by next_pos)
320 267         153 (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
  267         344  
  267         304  
  267         232  
  267         194  
  267         181  
  267         299  
321 267         223 (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
  267         309  
  267         656  
  267         225  
  267         186  
  267         199  
  267         317  
322             }
323 12         19 return $self;
324             }
325              
326             =head2 calc_weight
327              
328             Title : calc_weight
329             Usage : $obj->calc_weight({A=>0.2562, C=>0.2438, G=>0.2432, T=>0.2568});
330             Function: Recalculates the PSM (or weights) based on the PFM (the frequency
331             matrix) and user supplied background model.
332             Throws : if no model is supplied
333             Returns : n/a
334             Args : reference to a hash with background frequencies for A,C,G and T
335              
336             =cut
337              
338             sub calc_weight {
339 0     0 1 0 my ($self, $model) = @_;
340 0         0 my %model;
341 0         0 $model{probA}=$model->{A};
342 0         0 $model{probC}=$model->{C};
343 0         0 $model{probG}=$model->{G};
344 0         0 $model{probT}=$model->{T};
345 0         0 foreach my $let (qw(probA probC probG probT)) {
346 0         0 my @str;
347 0 0 0     0 $self->throw('You did not provide valid model\n') unless (($model{$let}>0) && ($model{$let}<1));
348 0         0 foreach my $f (@{$self->{$let}}) {
  0         0  
349 0         0 my $w=log($f)-log($model{$let});
350 0         0 push @str,$w;
351             }
352 0         0 my $llet=$let;
353 0         0 $llet=~s/prob/log/;
354 0         0 $self->{$llet}=\@str;
355             }
356 0         0 return $self;
357             }
358              
359             =head2 next_pos
360              
361             Title : next_pos
362             Usage :
363             Function: Retrives the next position features: frequencies for A,C,G,T, the
364             main letter (as in consensus) and the probabilty for this letter to
365             occur at this position and the current position
366             Returns : hash (pA,pC,pG,pT,logA,logC,logG,logT,base,prob,rel)
367             Args : none
368              
369             =cut
370              
371             sub next_pos {
372 104     104 1 94 my $self = shift;
373 104 50       136 $self->throw("instance method called on class") unless ref $self;
374 104         62 my $len=@{$self->{seq}};
  104         100  
375 104         75 my $pos=$self->{_position};
376             # End reached?
377 104 100       110 if ($pos<$len) {
378 101         58 my $pA=${$self->{probA}}[$pos];
  101         101  
379 101         63 my $pC=${$self->{probC}}[$pos];
  101         76  
380 101         56 my $pG=${$self->{probG}}[$pos];
  101         138  
381 101         59 my $pT=${$self->{probT}}[$pos];
  101         71  
382 101         67 my $lA=${$self->{logA}}[$pos];
  101         79  
383 101         59 my $lC=${$self->{logC}}[$pos];
  101         76  
384 101         54 my $lG=${$self->{logG}}[$pos];
  101         82  
385 101         83 my $lT=${$self->{logT}}[$pos];
  101         74  
386 101         63 my $base=${$self->{seq}}[$pos];
  101         79  
387 101         56 my $prob=${$self->{seqp}}[$pos];
  101         79  
388 101         77 $self->{_position}++;
389 101         269 my %seq=(pA=>$pA,pT=>$pT,pC=>$pC,pG=>$pG, lA=>$lA,lT=>$lT,lC=>$lC,lG=>$lG,base=>$base,rel=>$pos, prob=>$prob);
390 101         524 return %seq;
391             }
392 3         2 else {$self->{_position}=0; return;}
  3         9  
393             }
394              
395             =head2 curpos
396              
397             Title : curpos
398             Usage :
399             Function: Gets/sets the current position. Converts to 0 if argument is minus
400             and to width if greater than width
401             Returns : integer
402             Args : integer
403              
404             =cut
405              
406             sub curpos {
407 2     2 1 297 my $self = shift;
408 2         3 my $prev = $self->{_position};
409 2 50       5 if (@_) { $self->{_position} = shift; }
  0         0  
410 2         6 return $prev;
411             }
412              
413             =head2 e_val
414              
415             Title : e_val
416             Usage :
417             Function: Gets/sets the e-value
418             Returns : real number
419             Args : none to get, real number to set
420              
421             =cut
422              
423             sub e_val {
424 3     3 1 6 my $self = shift;
425 3         5 my $prev = $self->{e_val};
426 3 100       10 if (@_) { $self->{e_val} = shift; }
  1         2  
427 3         9 return $prev;
428             }
429              
430             =head2 IC
431              
432             Title : IC
433             Usage :
434             Function: Get/set the Information Content
435             Returns : real number
436             Args : none to get, real number to set
437              
438             =cut
439              
440             sub IC {
441 1     1 1 2 my $self = shift;
442 1         2 my $prev = $self->{IC};
443 1 50       3 if (@_) { $self->{IC} = shift; }
  0         0  
444 1         2 return $prev;
445             }
446              
447             =head2 accession_number
448              
449             Title : accession_number
450             Function: Get/set the accession number, this will be unique id for the
451             SiteMatrix object as well for any other object, inheriting from
452             SiteMatrix
453             Returns : string
454             Args : none to get, string to set
455              
456             =cut
457              
458             sub accession_number {
459 0     0 1 0 my $self = shift;
460 0         0 my $prev = $self->{accession_number};
461 0 0       0 if (@_) { $self->{accession_number} = shift; }
  0         0  
462 0         0 return $prev;
463             }
464              
465             =head2 consensus
466              
467             Title : consensus
468             Usage :
469             Function: Returns the consensus
470             Returns : string
471             Args : (optional) threshold value 1 to 10, default 5
472             '5' means the returned characters had a 50% or higher presence at
473             their position
474              
475             =cut
476              
477             sub consensus {
478 8     8 1 888 my ($self, $thresh) = @_;
479 8 50       32 if ($thresh) {
480 0         0 my $len=$#{$self->{probA}};
  0         0  
481 0         0 for (my $i=0; $i<$len+1; $i++) {
482 0         0 (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
  0         0  
  0         0  
  0         0  
  0         0  
  0         0  
  0         0  
483             }
484             }
485 8         8 my $consensus='';
486 8         6 foreach my $letter (@{$self->{seq}}) {
  8         18  
487 180         137 $consensus .= $letter;
488             }
489 8         34 return $consensus;
490             }
491              
492             =head2 width
493              
494             Title : width
495             Usage :
496             Function: Returns the length of the sites in used to make this matrix
497             Returns : int
498             Args : none
499              
500             =cut
501              
502             sub width {
503 2     2 1 4 my $self = shift;
504 2         3 my $width=@{$self->{probA}};
  2         6  
505 2         5 return $width;
506             }
507              
508             =head2 sites
509              
510             Title : sites
511             Usage :
512             Function: Get/set the number of sites that were used to make this matrix
513             Returns : int
514             Args : none to get, int to set
515              
516             =cut
517              
518             sub sites {
519 0     0 1 0 my $self = shift;
520 0 0       0 if (@_) { $self->{sites} = shift }
  0         0  
521 0   0     0 return $self->{sites} || return;
522             }
523              
524             =head2 IUPAC
525              
526             Title : IUPAC
527             Usage :
528             Function: Returns IUPAC compliant consensus
529             Returns : string
530             Args : optionally, also supply a whole number (int) of 1 or higher to set
531             the significance level when considering the frequencies. 1 (the
532             default) means a 0.05 significance level: frequencies lower than
533             0.05 will be ignored. 2 Means a 0.005 level, and so on.
534              
535             =cut
536              
537             sub IUPAC {
538 5     5 1 2648 my ($self, $thresh) = @_;
539 5 50       15 if ($thresh) {
540 0         0 my $len=$#{$self->{probA}};
  0         0  
541 0         0 for (my $i=0; $i<$len+1; $i++) {
542 0         0 (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
  0         0  
  0         0  
  0         0  
  0         0  
  0         0  
  0         0  
543             }
544             }
545 5         10 my $iu=$self->{IUPAC};
546 5         7 my $iupac='';
547 5         6 foreach my $let (@{$iu}) {
  5         10  
548 92         74 $iupac .= $let;
549             }
550 5         19 return $iupac;
551             }
552              
553             =head2 _to_IUPAC
554              
555             Title : _to_IUPAC
556             Usage :
557             Function: Converts a single position to IUPAC compliant symbol.
558             For rules see the implementation
559             Returns : char, real number
560             Args : real numbers for frequencies of A,C,G,T (positional)
561              
562             optionally, also supply a whole number (int) of 1 or higher to set
563             the significance level when considering the frequencies. 1 (the
564             default) means a 0.05 significance level: frequencies lower than
565             0.05 will be ignored. 2 Means a 0.005 level, and so on.
566              
567             =cut
568              
569             sub _to_IUPAC {
570 267     267   213 my ($a, $c, $g, $t, $thresh) = @_;
571 267   50     549 $thresh ||= 1;
572 267         154 $thresh = int($thresh);
573 267         850 $a = sprintf ("%.${thresh}f", $a);
574 267         494 $c = sprintf ("%.${thresh}f", $c);
575 267         440 $g = sprintf ("%.${thresh}f", $g);
576 267         458 $t = sprintf ("%.${thresh}f", $t);
577            
578 267         418 my $total = $a + $c + $g + $t;
579            
580 267 100       397 return 'A' if ($a == $total);
581 199 100       223 return 'G' if ($g == $total);
582 190 100       252 return 'C' if ($c == $total);
583 108 100       146 return 'T' if ($t == $total);
584 93         55 my $r=$g+$a;
585 93 50       111 return 'R' if ($r == $total);
586 93         65 my $y=$t+$c;
587 93 100       112 return 'Y' if ($y == $total);
588 86         60 my $m=$a+$c;
589 86 100       108 return 'M' if ($m == $total);
590 62         38 my $k=$g+$t;
591 62 100       87 return 'K' if ($k == $total);
592 49         37 my $s=$g+$c;
593 49 100       67 return 'S' if ($s == $total);
594 42         34 my $w=$a+$t;
595 42 100       69 return 'W' if ($w == $total);
596 20         15 my $d=$r+$t;
597 20 100       28 return 'D' if ($d == $total);
598 17         18 my $v=$r+$c;
599 17 100       30 return 'V' if ($v == $total);
600 9         11 my $b=$y+$g;
601 9 100       19 return 'B' if ($b == $total);
602 7         8 my $h=$y+$a;
603 7 100       16 return 'H' if ($h == $total);
604 1         2 return 'N';
605             }
606              
607             =head2 _to_cons
608              
609             Title : _to_cons
610             Usage :
611             Function: Converts a single position to simple consensus character and returns
612             its probability. For rules see the implementation
613             Returns : char, real number
614             Args : real numbers for A,C,G,T (positional), and optional 5th argument of
615             threshold (as a number between 1 and 10, where 5 is default and
616             means the returned character had a 50% or higher presence at this
617             position)
618              
619             =cut
620              
621             sub _to_cons {
622 267     267   223 my ($A, $C, $G, $T, $thresh) = @_;
623 267   50     555 $thresh ||= 5;
624            
625             # this multiplication by 10 is just to satisfy the thresh range of 1-10
626 267         193 my $a = $A * 10;
627 267         211 my $c = $C * 10;
628 267         201 my $g = $G * 10;
629 267         156 my $t = $T * 10;
630            
631 267 100 100     767 return 'N',10 if (($a<$thresh) && ($c<$thresh) && ($g<$thresh) && ($t<$thresh));
      100        
      100        
632 244 50 100     425 return 'N',10 if (($a==$t) && ($a==$c) && ($a==$g));
      66        
633            
634             # threshold could be lower than 50%, so must check is not only over
635             # threshold, but also the highest frequency
636 244 50 100     763 return 'A',$a if (($a>=$thresh) && ($a>$t) && ($a>$c) && ($a>$g));
      66        
      66        
637 146 50 100     606 return 'C',$c if (($c>=$thresh) && ($c>$t) && ($c>$a) && ($c>$g));
      66        
      66        
638 45 50 66     169 return 'G',$g if (($g>=$thresh) && ($g>$t) && ($g>$c) && ($g>$a));
      66        
      33        
639 26 100 33     199 return 'T',$t if (($t>=$thresh) && ($t>$g) && ($t>$c) && ($t>$a));
      66        
      100        
640            
641 2         4 return 'N',10;
642             }
643              
644             =head2 get_string
645              
646             Title : get_string
647             Usage :
648             Function: Returns given probability vector as a string. Useful if you want to
649             store things in a rel database, where arrays are not first choice
650             Throws : If the argument is outside {A,C,G,T}
651             Returns : string
652             Args : character {A,C,G,T}
653              
654             =cut
655              
656             sub get_string {
657 1     1 1 2 my $self=shift;
658 1         1 my $base=shift;
659 1         1 my $string='';
660 1         2 my @prob;
661            
662             BASE: {
663 1 50       1 if ($base eq 'A') {@prob= @{$self->{probA}}; last BASE; }
  1         4  
  1         1  
  1         4  
  1         3  
664 0 0       0 if ($base eq 'C') {@prob= @{$self->{probC}}; last BASE; }
  0         0  
  0         0  
  0         0  
665 0 0       0 if ($base eq 'G') {@prob= @{$self->{probG}}; last BASE; }
  0         0  
  0         0  
  0         0  
666 0 0       0 if ($base eq 'T') {@prob= @{$self->{probT}}; last BASE; }
  0         0  
  0         0  
  0         0  
667 0         0 $self->throw ("No such base: $base!\n");
668             }
669            
670 1         2 foreach my $prob (@prob) {
671 5         4 my $corrected = $prob*10;
672 5         7 my $next=sprintf("%.0f",$corrected);
673 5 100       9 $next='a' if ($next eq '10');
674 5         7 $string .= $next;
675             }
676 1         6 return $string;
677             }
678              
679             =head2 get_array
680              
681             Title : get_array
682             Usage :
683             Function: Returns an array with frequencies for a specified base
684             Returns : array
685             Args : char
686              
687             =cut
688              
689             sub get_array {
690 3     3 1 14 my $self=shift;
691 3         7 my $base=uc(shift);
692 3 50       9 return @{$self->{probA}} if ($base eq 'A');
  3         22  
693 0 0       0 return @{$self->{probC}} if ($base eq 'C');
  0         0  
694 0 0       0 return @{$self->{probG}} if ($base eq 'G');
  0         0  
695 0 0       0 return @{$self->{probT}} if ($base eq 'T');
  0         0  
696 0         0 $self->throw("No such base: $base!\n");
697             }
698              
699             =head2 get_logs_array
700              
701             Title : get_logs_array
702             Usage :
703             Function: Returns an array with log_odds for a specified base
704             Returns : array
705             Args : char
706              
707             =cut
708              
709             sub get_logs_array {
710 1     1 1 9 my $self=shift;
711 1         2 my $base=uc(shift);
712 1 50 33     6 return @{$self->{logA}} if (($base eq 'A') && ($self->{logA}));
  1         16  
713 0 0 0     0 return @{$self->{logC}} if (($base eq 'C') && ($self->{logC}));
  0         0  
714 0 0 0     0 return @{$self->{logG}} if (($base eq 'G') && ($self->{logG}));
  0         0  
715 0 0 0     0 return @{$self->{logT}} if (($base eq 'T') && ($self->{logT}));
  0         0  
716 0 0       0 $self->throw ("No such base: $base!\n") if (!grep(/$base/,qw(A C G T)));
717 0         0 return;
718             }
719              
720             =head2 id
721              
722             Title : id
723             Usage :
724             Function: Gets/sets the site id
725             Returns : string
726             Args : string
727              
728             =cut
729              
730             sub id {
731 12     12 1 22 my $self = shift;
732 12         19 my $prev = $self->{id};
733 12 100       22 if (@_) { $self->{id} = shift; }
  2         2  
734 12         30 return $prev;
735             }
736              
737             =head2 regexp
738              
739             Title : regexp
740             Usage :
741             Function: Returns a regular expression which matches the IUPAC convention.
742             N will match X, N, - and .
743             Returns : string
744             Args : none (works at the threshold last used for making the IUPAC string)
745              
746             =cut
747              
748             sub regexp {
749 1     1 1 1 my $self=shift;
750 1         2 my $regexp;
751 1         2 foreach my $letter (@{$self->{IUPAC}}) {
  1         3  
752 5         3 my $reg;
753             LETTER: {
754 5 100       4 if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
  5         9  
  1         2  
  1         2  
755 4 100       6 if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
  1         1  
  1         1  
756 3 50       5 if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
  0         0  
  0         0  
757 3 50       6 if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
  0         0  
  0         0  
758 3 50       4 if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
  0         0  
  0         0  
759 3 50       5 if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
  0         0  
  0         0  
760 3 50       14 if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
  0         0  
  0         0  
761 3 50       6 if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
  0         0  
  0         0  
762 3 50       5 if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
  0         0  
  0         0  
763 3 50       5 if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
  0         0  
  0         0  
764 3 100       5 if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
  1         2  
  1         1  
765 2 50       3 if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
  0         0  
  0         0  
766 2 100       5 if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
  1         1  
  1         1  
767 1 50       3 if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
  1         1  
  1         2  
768 0         0 $reg='\S';
769             }
770 5         6 $regexp .= $reg;
771             }
772 1         5 return $regexp;
773             }
774              
775             =head2 regexp_array
776              
777             Title : regexp_array
778             Usage :
779             Function: Returns a regular expression which matches the IUPAC convention.
780             N will match X, N, - and .
781             Returns : array
782             Args : none (works at the threshold last used for making the IUPAC string)
783             To do : I have separated regexp and regexp_array, but
784             maybe they can be rewritten as one - just check what should be returned
785              
786             =cut
787              
788             sub regexp_array {
789 1     1 1 2 my $self=shift;
790 1         1 my @regexp;
791 1         2 foreach my $letter (@{$self->{IUPAC}}) {
  1         3  
792 5         4 my $reg;
793             LETTER: {
794 5 100       5 if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
  5         7  
  1         2  
  1         2  
795 4 100       7 if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
  1         1  
  1         2  
796 3 50       5 if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
  0         0  
  0         0  
797 3 50       5 if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
  0         0  
  0         0  
798 3 50       5 if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
  0         0  
  0         0  
799 3 50       5 if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
  0         0  
  0         0  
800 3 50       5 if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
  0         0  
  0         0  
801 3 50       5 if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
  0         0  
  0         0  
802 3 50       5 if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
  0         0  
  0         0  
803 3 50       4 if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
  0         0  
  0         0  
804 3 100       5 if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
  1         1  
  1         2  
805 2 50       3 if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
  0         0  
  0         0  
806 2 100       4 if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
  1         1  
  1         2  
807 1 50       2 if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
  1         2  
  1         1  
808 0         0 $reg='\S';
809             }
810 5         8 push @regexp,$reg;
811             }
812 1         6 return @regexp;
813             }
814              
815              
816             =head2 _compress_array
817              
818             Title : _compress_array
819             Usage :
820             Function: Will compress an array of real signed numbers to a string (ie vector
821             of bytes) -127 to +127 for bi-directional(signed) and 0..255 for
822             unsigned
823             Returns : String
824             Args : array reference, followed by an max value and direction (optional,
825             default 1-unsigned),1 unsigned, any other is signed.
826              
827             =cut
828              
829             sub _compress_array {
830 3     3   4 my ($array,$lm,$direct)=@_;
831 3         4 my $str;
832 3 50 33     15 return unless(($array) && ($lm));
833 3 50       6 $direct=1 unless ($direct);
834 3 100       10 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
835 3         3 foreach my $c (@{$array}) {
  3         11  
836 62 50       71 $c=$lm if ($c>$lm);
837 62 50 33     86 $c=-$lm if (($c<-$lm) && ($direct !=1));
838 62 50 66     99 $c=0 if (($c<0) && ($direct ==1));
839 62         53 my $byte=int($k1*$c);
840 62 100       69 $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
841 62         49 my $char=chr($byte);
842 62         55 $str.=$char;
843             }
844 3         9 return $str;
845             }
846              
847             =head2 _uncompress_string
848              
849             Title : _uncompress_string
850             Usage :
851             Function: Will uncompress a string (vector of bytes) to create an array of
852             real signed numbers (opposite to_compress_array)
853             Returns : string, followed by an max value and
854             direction (optional, default 1-unsigned), 1 unsigned, any other is signed.
855             Args : array
856              
857             =cut
858              
859             sub _uncompress_string {
860 3     3   14 my ($str,$lm,$direct)=@_;
861 3         2 my @array;
862 3 50 33     12 return unless(($str) && ($lm));
863 3 50       4 $direct=1 unless ($direct);
864 3 100       14 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
865 3         16 foreach my $c (split(//,$str)) {
866 62         38 my $byte=ord($c);
867 62 100       64 $byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
868 62         43 my $num=$byte/$k1;
869 62         50 push @array,$num;
870             }
871 3         17 return @array;
872             }
873              
874             =head2 get_compressed_freq
875              
876             Title : get_compressed_freq
877             Usage :
878             Function: A method to provide a compressed frequency vector. It uses one byte
879             to code the frequence for one of the probability vectors for one
880             position. Useful for relational database. Improvment of the previous
881             0..a coding.
882             Example : my $strA=$self->get_compressed_freq('A');
883             Returns : String
884             Args : char
885              
886             =cut
887              
888             sub get_compressed_freq {
889 2     2 1 893 my $self=shift;
890 2         4 my $base=shift;
891 2         3 my $string='';
892 2         3 my @prob;
893             BASE: {
894 2 50       1 if ($base eq 'A') {
  2         27  
895 2 50       6 @prob= @{$self->{probA}} unless (!defined($self->{probA}));
  2         8  
896 2         5 last BASE;
897             }
898 0 0       0 if ($base eq 'G') {
899 0 0       0 @prob= @{$self->{probG}} unless (!defined($self->{probG}));
  0         0  
900 0         0 last BASE;
901             }
902 0 0       0 if ($base eq 'C') {
903 0 0       0 @prob= @{$self->{probC}} unless (!defined($self->{probC}));
  0         0  
904 0         0 last BASE;
905             }
906 0 0       0 if ($base eq 'T') {
907 0 0       0 @prob= @{$self->{probT}} unless (!defined($self->{probT}));
  0         0  
908 0         0 last BASE;
909             }
910 0         0 $self->throw ("No such base: $base!\n");
911             }
912 2         7 my $str= _compress_array(\@prob,1,1);
913 2         6 return $str;
914             }
915              
916             =head2 get_compressed_logs
917              
918             Title : get_compressed_logs
919             Usage :
920             Function: A method to provide a compressed log-odd vector. It uses one byte to
921             code the log value for one of the log-odds vectors for one position.
922             Example : my $strA=$self->get_compressed_logs('A');
923             Returns : String
924             Args : char
925              
926             =cut
927              
928             sub get_compressed_logs {
929 1     1 1 2 my $self=shift;
930 1         1 my $base=shift;
931 1         1 my $string='';
932 1         2 my @prob;
933             BASE: {
934 1 50       1 if ($base eq 'A') {@prob= @{$self->{logA}} unless (!defined($self->{logA})); last BASE; }
  1 50       4  
  1         3  
  1         6  
  1         14  
935 0 0       0 if ($base eq 'C') {@prob= @{$self->{logC}} unless (!defined($self->{logC})); last BASE; }
  0 0       0  
  0         0  
  0         0  
936 0 0       0 if ($base eq 'G') {@prob= @{$self->{logG}} unless (!defined($self->{logG})); last BASE; }
  0 0       0  
  0         0  
  0         0  
937 0 0       0 if ($base eq 'T') {@prob= @{$self->{logT}} unless (!defined($self->{logT})); last BASE; }
  0 0       0  
  0         0  
  0         0  
938 0         0 $self->throw ("No such base: $base!\n");
939             }
940 1         5 return _compress_array(\@prob,1000,2);
941             }
942              
943             =head2 sequence_match_weight
944              
945             Title : sequence_match_weight
946             Usage :
947             Function: This method will calculate the score of a match, based on the PWM
948             if such is associated with the matrix object. Returns undef if no
949             PWM data is available.
950             Throws : if the length of the sequence is different from the matrix width
951             Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
952             Returns : Floating point
953             Args : string
954              
955             =cut
956              
957             sub sequence_match_weight {
958 1     1 1 544 my ($self,$seq)=@_;
959 1 50       5 return unless ($self->{logA});
960 1         19 my $width=$self->width;
961 1 50       1 $self->throw ("I can calculate the score only for sequence which are exactly my size for $seq, my width is $width\n") unless (length($seq)==@{$self->{logA}});
  1         4  
962 1         1 $seq = uc($seq);
963 1         7 my @seq=split(//,$seq);
964 1         1 my $score = 0;
965 1         1 my $i=0;
966 1         2 foreach my $pos (@seq) {
967 25         17 my $tv = 'log'.$pos;
968 25 50       28 $self->warn("Position ".($i+1)." of input sequence has unknown (ambiguity?) character '$pos': scores will be wrong") unless defined $self->{$tv};
969 25 50       32 $score += defined $self->{$tv} ? $self->{$tv}->[$i] : 0;
970 25         21 $i++;
971             }
972 1         5 return $score;
973             }
974              
975             =head2 get_all_vectors
976              
977             Title : get_all_vectors
978             Usage :
979             Function: returns all possible sequence vectors to satisfy the PFM under
980             a given threshold
981             Throws : If threshold outside of 0..1 (no sense to do that)
982             Example : my @vectors=$self->get_all_vectors(4);
983             Returns : Array of strings
984             Args : (optional) floating
985              
986             =cut
987              
988             sub get_all_vectors {
989 0     0 1   my $self=shift;
990 0           my $thresh=shift;
991 0 0 0       $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1));
992 0           my @seq=split(//,$self->consensus($thresh*10));
993 0           my @perm;
994 0           for my $i (0..@{$self->{probA}}) {
  0            
995 0 0         push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh);
  0            
996 0 0         push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh);
  0            
997 0 0         push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh);
  0            
998 0 0         push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh);
  0            
999 0 0         push @{$perm[$i]},'N' if ($seq[$i] eq 'N');
  0            
1000             }
1001 0           my $fpos=shift @perm;
1002 0           my @strings=@$fpos;
1003 0           foreach my $pos (@perm) {
1004 0           my @newstr;
1005 0           foreach my $let (@$pos) {
1006 0           foreach my $string (@strings) {
1007 0           my $newstring = $string . $let;
1008 0           push @newstr,$newstring;
1009             }
1010             }
1011 0           @strings=@newstr;
1012             }
1013 0           return @strings;
1014             }
1015              
1016             1;