File Coverage

Bio/Matrix/PSM/SiteMatrix.pm
Criterion Covered Total %
statement 375 556 67.4
branch 131 256 51.1
condition 67 115 58.2
subroutine 24 28 85.7
pod 21 21 100.0
total 618 976 63.3


line stmt bran cond sub pod time code
1             #---------------------------------------------------------
2              
3             =head1 NAME
4              
5             Bio::Matrix::PSM::SiteMatrix - SiteMatrixI implementation, holds a
6             position scoring matrix (or position weight matrix) and log-odds
7              
8             =head1 SYNOPSIS
9              
10             use Bio::Matrix::PSM::SiteMatrix;
11             # Create from memory by supplying probability matrix hash
12             # both as strings or arrays
13             # where the frequencies $a,$c,$g and $t are supplied either as
14             # arrayref or string. Accordingly, lA, lC, lG and lT are the log
15             # odds (only as arrays, no checks done right now)
16             my ($a,$c,$g,$t,$score,$ic, $mid)=@_;
17             #or
18             my ($a,$c,$g,$t,$score,$ic,$mid)=('05a011','110550','400001',
19             '100104',0.001,19.2,'CRE1');
20             #Where a stands for all (this frequency=1), see explanation below
21             my %param=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,
22             -lA=>$la, -lC=>$lc,-lG=>$lg,-lT=>$l,
23             -IC=>$ic,-e_val=>$score, -id=>$mid);
24             my $site=Bio::Matrix::PSM::SiteMatrix->new(%param);
25             #Or get it from a file:
26             use Bio::Matrix::PSM::IO;
27             my $psmIO= Bio::Matrix::PSM::IO->new(-file=>$file, -format=>'transfac');
28             while (my $psm=$psmIO->next_psm) {
29             #Now we have a Bio::Matrix::PSM::Psm object,
30             # see Bio::Matrix::PSM::PsmI for details
31             #This is a Bio::Matrix::PSM::SiteMatrix object now
32             my $matrix=$psm->matrix;
33             }
34              
35             # Get a simple consensus, where alphabet is {A,C,G,T,N},
36             # choosing the character that both satisfies a supplied or default threshold
37             # frequency and is the most frequenct character at each position, or N.
38             # So for the position with A, C, G, T frequencies of 0.5, 0.25, 0.10, 0.15,
39             # the simple consensus character will be 'A', whilst for 0.5, 0.5, 0, 0 it
40             # would be 'N'.
41             my $consensus=$site->consensus;
42              
43             # Get the IUPAC ambiguity code representation of the data in the matrix.
44             # Because the frequencies may have been pseudo-count corrected, insignificant
45             # frequences (below 0.05 by default) are ignored. So a position with
46             # A, C, G, T frequencies of 0.5, 0.5, 0.01, 0.01 will get the IUPAC code 'M',
47             # while 0.97, 0.01, 0.01, 0.01 will get the code 'A' and
48             # 0.25, 0.25, 0.25, 0.25 would get 'N'.
49             my $iupac=$site->IUPAC;
50              
51             # Getting/using regular expression (a representation of the IUPAC string)
52             my $regexp=$site->regexp;
53             my $count=grep($regexp,$seq);
54             my $count=($seq=~ s/$regexp/$1/eg);
55             print "Motif $mid is present $count times in this sequence\n";
56              
57             =head1 DESCRIPTION
58              
59             SiteMatrix is designed to provide some basic methods when working with position
60             scoring (weight) matrices, such as transcription factor binding sites for
61             example. A DNA PSM consists of four vectors with frequencies {A,C,G,T}. This is
62             the minimum information you should provide to construct a PSM object. The
63             vectors can be provided as strings with frequenciesx10 rounded to an int, going
64             from {0..a} and 'a' represents the maximum (10). This is like MEME's compressed
65             representation of a matrix and it is quite useful when working with relational
66             DB. If arrays are provided as an input (references to arrays actually) they can
67             be any number, real or integer (frequency or count).
68              
69             When creating the object you can ask the constructor to make a simple pseudo
70             count correction by adding a number (typically 1) to all positions (with the
71             -correction option). After adding the number the frequencies will be
72             calculated. Only use correction when you supply counts, not frequencies.
73              
74             Throws an exception if: You mix as an input array and string (for example A
75             matrix is given as array, C - as string). The position vector is (0,0,0,0). One
76             of the probability vectors is shorter than the rest.
77              
78             Summary of the methods I use most frequently (details below):
79              
80             iupac - return IUPAC compliant consensus as a string
81             score - Returns the score as a real number
82             IC - information content. Returns a real number
83             id - identifier. Returns a string
84             accession - accession number. Returns a string
85             next_pos - return the sequence probably for each letter, IUPAC
86             symbol, IUPAC probability and simple sequence
87             consenus letter for this position. Rewind at the end. Returns a hash.
88             pos - current position get/set. Returns an integer.
89             regexp - construct a regular expression based on IUPAC consensus.
90             For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
91             width - site width
92             get_string - gets the probability vector for a single base as a string.
93             get_array - gets the probability vector for a single base as an array.
94             get_logs_array - gets the log-odds vector for a single base as an array.
95              
96             New methods, which might be of interest to anyone who wants to store
97             PSM in a relational database without creating an entry for each
98             position is the ability to compress the PSM vector into a string with
99             losing usually less than 1% of the data. this can be done with:
100              
101             my $str=$matrix->get_compressed_freq('A');
102             or
103             my $str=$matrix->get_compressed_logs('A');
104              
105             Loading from a database should be done with new, but is not yest implemented.
106             However you can still uncompress such string with:
107              
108             my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
109             or
110             my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
111              
112             =head1 FEEDBACK
113              
114             =head2 Mailing Lists
115              
116             User feedback is an integral part of the evolution of this and other
117             Bioperl modules. Send your comments and suggestions preferably to one
118             of the Bioperl mailing lists. Your participation is much appreciated.
119              
120             bioperl-l@bioperl.org - General discussion
121             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
122              
123             =head2 Support
124              
125             Please direct usage questions or support issues to the mailing list:
126              
127             I
128              
129             rather than to the module maintainer directly. Many experienced and
130             reponsive experts will be able look at the problem and quickly
131             address it. Please include a thorough description of the problem
132             with code and data examples if at all possible.
133              
134             =head2 Reporting Bugs
135              
136             Report bugs to the Bioperl bug tracking system to help us keep track
137             the bugs and their resolution. Bug reports can be submitted via the
138             web:
139              
140             https://github.com/bioperl/bioperl-live/issues
141              
142             =head1 AUTHOR - Stefan Kirov
143              
144             Email skirov@utk.edu
145              
146             =head1 CONTRIBUTORS
147              
148             Sendu Bala, bix@sendu.me.uk
149              
150             =head1 APPENDIX
151              
152             The rest of the documentation details each of the object methods.
153             Internal methods are usually preceded with a _
154              
155             =cut
156              
157             # Let the code begin...
158             package Bio::Matrix::PSM::SiteMatrix;
159 5     5   471 use strict;
  5         8  
  5         139  
160              
161 5     5   46 use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
  5         8  
  5         1509  
162              
163             =head2 new
164              
165             Title : new
166             Usage : my $site=Bio::Matrix::PSM::SiteMatrix->new(-pA=>$a,-pC=>$c,
167             -pG=>$g,-pT=>$t,
168             -IC=>$ic,
169             -e_val=>$score,
170             -id=>$mid);
171             Function: Creates a new Bio::Matrix::PSM::SiteMatrix object from memory
172             Throws : If inconsistent data for all vectors (A,C,G and T) is
173             provided, if you mix input types (string vs array) or if a
174             position freq is 0.
175             Returns : Bio::Matrix::PSM::SiteMatrix object
176             Args : -pA => vector with the frequencies or counts of A
177             -pC => vector for C
178             -pG => vector for G
179             -pt => vector for T
180             -lA => vector for the log of A
181             -lC => vector for the log of C
182             -lG => vector for the log of G
183             -lT => vector for the log of T
184             -IC => real number, the information content of this matrix
185             -e_val => real number, the expect value
186             -id => string, an identifier
187             -width => int, width of the matrix in nucleotides
188             -sites => int, the number of sites that went into this matrix
189             -model => hash ref, background frequencies for A, C, G and T
190             -correction => number, the number to add to all positions to achieve
191             pseudo count correction (default 0: no correction)
192             NB: do not use correction when your input is
193             frequences!
194             -accession_number => string, an accession number
195              
196             Vectors can be strings of the frequencies where the frequencies are
197             multiplied by 10 and rounded to the nearest whole number, and where
198             'a' is used to denote the maximal frequency 10. There should be no
199             punctuation (spaces etc.) in the string. For example, 'a0501'.
200             Alternatively frequencies or counts can be represented by an array
201             ref containing the counts, frequencies or logs as any kind of
202             number.
203              
204             =cut
205              
206             sub new {
207 14     14 1 140 my ($class, @args) = @_;
208 14         57 my $self = $class->SUPER::new(@args);
209 14         26 my $consensus;
210             # Too many things to rearrange, and I am creating simultanuously >500
211             # such objects routinely, so this becomes performance issue
212             my %input;
213 14         28 while (@args) {
214 114         204 (my $key = shift @args) =~ s/-//g; #deletes all dashes (only dashes)!
215 114         221 $input{$key} = shift @args;
216             }
217 14         22 $self->{_position} = 0;
218 14         27 $self->{IC} = $input{IC};
219 14         30 $self->{e_val} = $input{e_val};
220 14         17 $self->{width} = $input{width};
221 14         19 $self->{logA} = $input{lA};
222 14         21 $self->{logC} = $input{lC};
223 14         28 $self->{logG} = $input{lG};
224 14         17 $self->{logT} = $input{lT};
225 14         19 $self->{sites} = $input{sites};
226 14   100     33 $self->{id} = $input{id} || 'null';
227 14   50     41 $self->{correction} = $input{correction} || 0;
228 14         21 $self->{accession_number} = $input{accession_number};
229 14 50 66     89 return $self unless (defined($input{pA}) && defined($input{pC}) && defined($input{pG}) && defined($input{pT}));
      66        
      33        
230            
231             # This should go to _initialize?
232             # Check for input type- no mixing alllowed, throw ex
233 12 100       50 if (ref($input{pA}) =~ /ARRAY/i ) {
234 11 50       23 $self->throw("Mixing matrix data types not allowed: C is not reference") unless(ref($input{pC}));
235 11 50       21 $self->throw("Mixing matrix data types not allowed: G is not reference") unless (ref($input{pG}));
236 11 50       16 $self->throw("Mixing matrix data types not allowed: T is not reference") unless (ref($input{pT}));
237 11         15 $self->{probA} = $input{pA};
238 11         19 $self->{probC} = $input{pC};
239 11         21 $self->{probG} = $input{pG};
240 11         16 $self->{probT} = $input{pT};
241             }
242             else {
243 1 50       3 $self->throw("Mixing matrix data types not allowed: C is reference") if (ref($input{pC}));
244 1 50       2 $self->throw("Mixing matrix data types not allowed: G is reference") if (ref($input{pG}));
245 1 50       2 $self->throw("Mixing matrix data types not allowed: T is reference") if (ref($input{pT}));
246 1         5 $self->{probA} = [split(//,$input{pA})];
247 1         4 $self->{probC} = [split(//,$input{pC})];
248 1         5 $self->{probG} = [split(//,$input{pG})];
249 1         3 $self->{probT} = [split(//,$input{pT})];
250 1         3 for (my $i=0; $i<= @{$self->{probA}}+1; $i++) {
  8         16  
251             # we implictely assume these are MEME-style frequencies x 10, so
252             # 'a' represents the 'maximum', 10. Other positions can actually
253             # add up to over 10 due to rounding, but I don't think that is a
254             # problem?
255 7 100 100     5 if (${$self->{probA}}[$i] and ${$self->{probA}}[$i] eq 'a') {
  7         12  
  3         7  
256 1         2 ${$self->{probA}}[$i]='10';
  1         2  
257             }
258 7 100 100     7 if (${$self->{probC}}[$i] and ${$self->{probC}}[$i] eq 'a') {
  7         11  
  3         7  
259 1         1 ${$self->{probC}}[$i]='10';
  1         2  
260             }
261 7 50 66     8 if (${$self->{probG}}[$i] and ${$self->{probG}}[$i] eq 'a') {
  7         13  
  3         7  
262 0         0 ${$self->{probG}}[$i]='10';
  0         0  
263             }
264 7 50 66     7 if (${$self->{probT}}[$i] and ${$self->{probT}}[$i] eq 'a') {
  7         12  
  2         4  
265 0         0 ${$self->{probT}}[$i]='10';
  0         0  
266             }
267             }
268             }
269            
270             # Check for position with 0 for all bases, throw exception if so
271 12         19 for (my $i=0;$i <= $#{$self->{probA}}; $i++) {
  279         376  
272 267 50       238 if ((${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i]) == 0) {
  267         269  
  267         345  
  267         301  
  267         392  
273 0         0 $self->throw("Position meaningless-all frequencies are 0");
274             }
275            
276             # apply pseudo-count correction to all values - this will result in
277             # very bad frequencies if the input is already frequences and a
278             # correction value as large as 1 is used!
279 267 50       362 if ($self->{correction}) {
280 0         0 ${$self->{probA}}[$i] += $self->{correction};
  0         0  
281 0         0 ${$self->{probC}}[$i] += $self->{correction};
  0         0  
282 0         0 ${$self->{probG}}[$i] += $self->{correction};
  0         0  
283 0         0 ${$self->{probT}}[$i] += $self->{correction};
  0         0  
284             }
285            
286             # (re)calculate frequencies
287 267         233 my $div= ${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i];
  267         259  
  267         264  
  267         271  
  267         285  
288 267         242 ${$self->{probA}}[$i]=${$self->{probA}}[$i]/$div;
  267         260  
  267         303  
289 267         232 ${$self->{probC}}[$i]=${$self->{probC}}[$i]/$div;
  267         254  
  267         287  
290 267         245 ${$self->{probG}}[$i]=${$self->{probG}}[$i]/$div;
  267         262  
  267         269  
291 267         227 ${$self->{probT}}[$i]=${$self->{probT}}[$i]/$div;
  267         309  
  267         262  
292             }
293            
294             # Calculate the logs
295 12 50 66     45 if ((!defined($self->{logA})) && ($input{model})) {
296 0         0 $self->calc_weight($input{model});
297             }
298            
299             # Make consensus, throw if any one of the vectors is shorter
300 12         43 $self->_calculate_consensus;
301 12         104 return $self;
302             }
303              
304             =head2 _calculate_consensus
305              
306             Title : _calculate_consensus
307             Function: Internal stuff
308              
309             =cut
310              
311             sub _calculate_consensus {
312 12     12   15 my $self=shift;
313 12         16 my ($lc,$lt,$lg)=($#{$self->{probC}},$#{$self->{probT}},$#{$self->{probG}});
  12         25  
  12         20  
  12         23  
314 12         16 my $len=$#{$self->{probA}};
  12         17  
315 12 50       29 $self->throw("Probability matrix is damaged for C: $len vs $lc") if ($len != $lc);
316 12 50       24 $self->throw("Probability matrix is damaged for T: $len vs $lt") if ($len != $lt);
317 12 50       19 $self->throw("Probability matrix is damaged for G: $len vs $lg") if ($len != $lg);
318 12         30 for (my $i=0; $i<$len+1; $i++) {
319             #*** IUPACp values not actually used (eg. by next_pos)
320 267         269 (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
  267         415  
  267         420  
  267         295  
  267         278  
  267         270  
  267         333  
321 267         320 (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
  267         384  
  267         604  
  267         291  
  267         256  
  267         265  
  267         343  
322             }
323 12         19 return $self;
324             }
325              
326             =head2 calc_weight
327              
328             Title : calc_weight
329             Usage : $obj->calc_weight({A=>0.2562, C=>0.2438, G=>0.2432, T=>0.2568});
330             Function: Recalculates the PSM (or weights) based on the PFM (the frequency
331             matrix) and user supplied background model.
332             Throws : if no model is supplied
333             Returns : n/a
334             Args : reference to a hash with background frequencies for A,C,G and T
335              
336             =cut
337              
338             sub calc_weight {
339 0     0 1 0 my ($self, $model) = @_;
340 0         0 my %model;
341 0         0 $model{probA}=$model->{A};
342 0         0 $model{probC}=$model->{C};
343 0         0 $model{probG}=$model->{G};
344 0         0 $model{probT}=$model->{T};
345 0         0 foreach my $let (qw(probA probC probG probT)) {
346 0         0 my @str;
347 0 0 0     0 $self->throw('You did not provide valid model\n') unless (($model{$let}>0) && ($model{$let}<1));
348 0         0 foreach my $f (@{$self->{$let}}) {
  0         0  
349 0         0 my $w=log($f)-log($model{$let});
350 0         0 push @str,$w;
351             }
352 0         0 my $llet=$let;
353 0         0 $llet=~s/prob/log/;
354 0         0 $self->{$llet}=\@str;
355             }
356 0         0 return $self;
357             }
358              
359             =head2 next_pos
360              
361             Title : next_pos
362             Usage :
363             Function: Retrieves the next position features: frequencies for A,C,G,T, the
364             main letter (as in consensus) and the probabilty for this letter to
365             occur at this position and the current position
366             Returns : hash (pA,pC,pG,pT,logA,logC,logG,logT,base,prob,rel)
367             Args : none
368              
369             =cut
370              
371             sub next_pos {
372 104     104 1 101 my $self = shift;
373 104 50       157 $self->throw("instance method called on class") unless ref $self;
374 104         85 my $len=@{$self->{seq}};
  104         123  
375 104         98 my $pos=$self->{_position};
376             # End reached?
377 104 100       126 if ($pos<$len) {
378 101         79 my $pA=${$self->{probA}}[$pos];
  101         125  
379 101         88 my $pC=${$self->{probC}}[$pos];
  101         105  
380 101         93 my $pG=${$self->{probG}}[$pos];
  101         127  
381 101         84 my $pT=${$self->{probT}}[$pos];
  101         100  
382 101         94 my $lA=${$self->{logA}}[$pos];
  101         105  
383 101         88 my $lC=${$self->{logC}}[$pos];
  101         101  
384 101         88 my $lG=${$self->{logG}}[$pos];
  101         102  
385 101         108 my $lT=${$self->{logT}}[$pos];
  101         109  
386 101         85 my $base=${$self->{seq}}[$pos];
  101         102  
387 101         84 my $prob=${$self->{seqp}}[$pos];
  101         98  
388 101         92 $self->{_position}++;
389 101         330 my %seq=(pA=>$pA,pT=>$pT,pC=>$pC,pG=>$pG, lA=>$lA,lT=>$lT,lC=>$lC,lG=>$lG,base=>$base,rel=>$pos, prob=>$prob);
390 101         573 return %seq;
391             }
392 3         4 else {$self->{_position}=0; return;}
  3         8  
393             }
394              
395             =head2 curpos
396              
397             Title : curpos
398             Usage :
399             Function: Gets/sets the current position. Converts to 0 if argument is minus
400             and to width if greater than width
401             Returns : integer
402             Args : integer
403              
404             =cut
405              
406             sub curpos {
407 2     2 1 264 my $self = shift;
408 2         3 my $prev = $self->{_position};
409 2 50       6 if (@_) { $self->{_position} = shift; }
  0         0  
410 2         6 return $prev;
411             }
412              
413             =head2 e_val
414              
415             Title : e_val
416             Usage :
417             Function: Gets/sets the e-value
418             Returns : real number
419             Args : none to get, real number to set
420              
421             =cut
422              
423             sub e_val {
424 3     3 1 6 my $self = shift;
425 3         5 my $prev = $self->{e_val};
426 3 100       7 if (@_) { $self->{e_val} = shift; }
  1         2  
427 3         8 return $prev;
428             }
429              
430             =head2 IC
431              
432             Title : IC
433             Usage :
434             Function: Get/set the Information Content
435             Returns : real number
436             Args : none to get, real number to set
437              
438             =cut
439              
440             sub IC {
441 1     1 1 1 my $self = shift;
442 1         2 my $prev = $self->{IC};
443 1 50       3 if (@_) { $self->{IC} = shift; }
  0         0  
444 1         2 return $prev;
445             }
446              
447             =head2 accession_number
448              
449             Title : accession_number
450             Function: Get/set the accession number, this will be unique id for the
451             SiteMatrix object as well for any other object, inheriting from
452             SiteMatrix
453             Returns : string
454             Args : none to get, string to set
455              
456             =cut
457              
458             sub accession_number {
459 0     0 1 0 my $self = shift;
460 0         0 my $prev = $self->{accession_number};
461 0 0       0 if (@_) { $self->{accession_number} = shift; }
  0         0  
462 0         0 return $prev;
463             }
464              
465             =head2 consensus
466              
467             Title : consensus
468             Usage :
469             Function: Returns the consensus
470             Returns : string
471             Args : (optional) threshold value 1 to 10, default 5
472             '5' means the returned characters had a 50% or higher presence at
473             their position
474              
475             =cut
476              
477             sub consensus {
478 8     8 1 1042 my ($self, $thresh) = @_;
479 8 50       21 if ($thresh) {
480 0         0 my $len=$#{$self->{probA}};
  0         0  
481 0         0 for (my $i=0; $i<$len+1; $i++) {
482 0         0 (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
  0         0  
  0         0  
  0         0  
  0         0  
  0         0  
  0         0  
483             }
484             }
485 8         12 my $consensus='';
486 8         10 foreach my $letter (@{$self->{seq}}) {
  8         17  
487 180         177 $consensus .= $letter;
488             }
489 8         24 return $consensus;
490             }
491              
492             =head2 width
493              
494             Title : width
495             Usage :
496             Function: Returns the length of the sites in used to make this matrix
497             Returns : int
498             Args : none
499              
500             =cut
501              
502             sub width {
503 2     2 1 4 my $self = shift;
504 2         3 my $width=@{$self->{probA}};
  2         4  
505 2         6 return $width;
506             }
507              
508             =head2 sites
509              
510             Title : sites
511             Usage :
512             Function: Get/set the number of sites that were used to make this matrix
513             Returns : int
514             Args : none to get, int to set
515              
516             =cut
517              
518             sub sites {
519 0     0 1 0 my $self = shift;
520 0 0       0 if (@_) { $self->{sites} = shift }
  0         0  
521 0   0     0 return $self->{sites} || return;
522             }
523              
524             =head2 IUPAC
525              
526             Title : IUPAC
527             Usage :
528             Function: Returns IUPAC compliant consensus
529             Returns : string
530             Args : optionally, also supply a whole number (int) of 1 or higher to set
531             the significance level when considering the frequencies. 1 (the
532             default) means a 0.05 significance level: frequencies lower than
533             0.05 will be ignored. 2 Means a 0.005 level, and so on.
534              
535             =cut
536              
537             sub IUPAC {
538 5     5 1 2624 my ($self, $thresh) = @_;
539 5 50       12 if ($thresh) {
540 0         0 my $len=$#{$self->{probA}};
  0         0  
541 0         0 for (my $i=0; $i<$len+1; $i++) {
542 0         0 (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
  0         0  
  0         0  
  0         0  
  0         0  
  0         0  
  0         0  
543             }
544             }
545 5         11 my $iu=$self->{IUPAC};
546 5         7 my $iupac='';
547 5         5 foreach my $let (@{$iu}) {
  5         8  
548 92         91 $iupac .= $let;
549             }
550 5         18 return $iupac;
551             }
552              
553             =head2 _to_IUPAC
554              
555             Title : _to_IUPAC
556             Usage :
557             Function: Converts a single position to IUPAC compliant symbol.
558             For rules see the implementation
559             Returns : char, real number
560             Args : real numbers for frequencies of A,C,G,T (positional)
561              
562             optionally, also supply a whole number (int) of 1 or higher to set
563             the significance level when considering the frequencies. 1 (the
564             default) means a 0.05 significance level: frequencies lower than
565             0.05 will be ignored. 2 Means a 0.005 level, and so on.
566              
567             =cut
568              
569             sub _to_IUPAC {
570 267     267   343 my ($a, $c, $g, $t, $thresh) = @_;
571 267   50     623 $thresh ||= 1;
572 267         239 $thresh = int($thresh);
573 267         860 $a = sprintf ("%.${thresh}f", $a);
574 267         551 $c = sprintf ("%.${thresh}f", $c);
575 267         499 $g = sprintf ("%.${thresh}f", $g);
576 267         494 $t = sprintf ("%.${thresh}f", $t);
577            
578 267         568 my $total = $a + $c + $g + $t;
579            
580 267 100       430 return 'A' if ($a == $total);
581 199 100       258 return 'G' if ($g == $total);
582 190 100       278 return 'C' if ($c == $total);
583 108 100       178 return 'T' if ($t == $total);
584 93         98 my $r=$g+$a;
585 93 50       120 return 'R' if ($r == $total);
586 93         91 my $y=$t+$c;
587 93 100       138 return 'Y' if ($y == $total);
588 86         89 my $m=$a+$c;
589 86 100       135 return 'M' if ($m == $total);
590 62         63 my $k=$g+$t;
591 62 100       97 return 'K' if ($k == $total);
592 49         62 my $s=$g+$c;
593 49 100       72 return 'S' if ($s == $total);
594 42         46 my $w=$a+$t;
595 42 100       70 return 'W' if ($w == $total);
596 20         22 my $d=$r+$t;
597 20 100       36 return 'D' if ($d == $total);
598 17         21 my $v=$r+$c;
599 17 100       34 return 'V' if ($v == $total);
600 9         13 my $b=$y+$g;
601 9 100       19 return 'B' if ($b == $total);
602 7         7 my $h=$y+$a;
603 7 100       16 return 'H' if ($h == $total);
604 1         1 return 'N';
605             }
606              
607             =head2 _to_cons
608              
609             Title : _to_cons
610             Usage :
611             Function: Converts a single position to simple consensus character and returns
612             its probability. For rules see the implementation
613             Returns : char, real number
614             Args : real numbers for A,C,G,T (positional), and optional 5th argument of
615             threshold (as a number between 1 and 10, where 5 is default and
616             means the returned character had a 50% or higher presence at this
617             position)
618              
619             =cut
620              
621             sub _to_cons {
622 267     267   375 my ($A, $C, $G, $T, $thresh) = @_;
623 267   50     610 $thresh ||= 5;
624            
625             # this multiplication by 10 is just to satisfy the thresh range of 1-10
626 267         282 my $a = $A * 10;
627 267         242 my $c = $C * 10;
628 267         240 my $g = $G * 10;
629 267         229 my $t = $T * 10;
630            
631 267 100 100     669 return 'N',10 if (($a<$thresh) && ($c<$thresh) && ($g<$thresh) && ($t<$thresh));
      100        
      100        
632 244 50 100     409 return 'N',10 if (($a==$t) && ($a==$c) && ($a==$g));
      66        
633            
634             # threshold could be lower than 50%, so must check is not only over
635             # threshold, but also the highest frequency
636 244 50 100     642 return 'A',$a if (($a>=$thresh) && ($a>$t) && ($a>$c) && ($a>$g));
      66        
      66        
637 146 50 100     518 return 'C',$c if (($c>=$thresh) && ($c>$t) && ($c>$a) && ($c>$g));
      66        
      66        
638 45 50 66     149 return 'G',$g if (($g>=$thresh) && ($g>$t) && ($g>$c) && ($g>$a));
      66        
      33        
639 26 100 33     151 return 'T',$t if (($t>=$thresh) && ($t>$g) && ($t>$c) && ($t>$a));
      66        
      100        
640            
641 2         4 return 'N',10;
642             }
643              
644             =head2 get_string
645              
646             Title : get_string
647             Usage :
648             Function: Returns given probability vector as a string. Useful if you want to
649             store things in a rel database, where arrays are not first choice
650             Throws : If the argument is outside {A,C,G,T}
651             Returns : string
652             Args : character {A,C,G,T}
653              
654             =cut
655              
656             sub get_string {
657 1     1 1 2 my $self=shift;
658 1         2 my $base=shift;
659 1         1 my $string='';
660 1         2 my @prob;
661            
662             BASE: {
663 1 50       1 if ($base eq 'A') {@prob= @{$self->{probA}}; last BASE; }
  1         4  
  1         2  
  1         3  
  1         2  
664 0 0       0 if ($base eq 'C') {@prob= @{$self->{probC}}; last BASE; }
  0         0  
  0         0  
  0         0  
665 0 0       0 if ($base eq 'G') {@prob= @{$self->{probG}}; last BASE; }
  0         0  
  0         0  
  0         0  
666 0 0       0 if ($base eq 'T') {@prob= @{$self->{probT}}; last BASE; }
  0         0  
  0         0  
  0         0  
667 0         0 $self->throw ("No such base: $base!\n");
668             }
669            
670 1         3 foreach my $prob (@prob) {
671 5         7 my $corrected = $prob*10;
672 5         8 my $next=sprintf("%.0f",$corrected);
673 5 100       8 $next='a' if ($next eq '10');
674 5         6 $string .= $next;
675             }
676 1         4 return $string;
677             }
678              
679             =head2 get_array
680              
681             Title : get_array
682             Usage :
683             Function: Returns an array with frequencies for a specified base
684             Returns : array
685             Args : char
686              
687             =cut
688              
689             sub get_array {
690 3     3 1 10 my $self=shift;
691 3         7 my $base=uc(shift);
692 3 50       7 return @{$self->{probA}} if ($base eq 'A');
  3         16  
693 0 0       0 return @{$self->{probC}} if ($base eq 'C');
  0         0  
694 0 0       0 return @{$self->{probG}} if ($base eq 'G');
  0         0  
695 0 0       0 return @{$self->{probT}} if ($base eq 'T');
  0         0  
696 0         0 $self->throw("No such base: $base!\n");
697             }
698              
699             =head2 get_logs_array
700              
701             Title : get_logs_array
702             Usage :
703             Function: Returns an array with log_odds for a specified base
704             Returns : array
705             Args : char
706              
707             =cut
708              
709             sub get_logs_array {
710 1     1 1 5 my $self=shift;
711 1         2 my $base=uc(shift);
712 1 50 33     5 return @{$self->{logA}} if (($base eq 'A') && ($self->{logA}));
  1         6  
713 0 0 0     0 return @{$self->{logC}} if (($base eq 'C') && ($self->{logC}));
  0         0  
714 0 0 0     0 return @{$self->{logG}} if (($base eq 'G') && ($self->{logG}));
  0         0  
715 0 0 0     0 return @{$self->{logT}} if (($base eq 'T') && ($self->{logT}));
  0         0  
716 0 0       0 $self->throw ("No such base: $base!\n") if (!grep(/$base/,qw(A C G T)));
717 0         0 return;
718             }
719              
720             =head2 id
721              
722             Title : id
723             Usage :
724             Function: Gets/sets the site id
725             Returns : string
726             Args : string
727              
728             =cut
729              
730             sub id {
731 12     12 1 30 my $self = shift;
732 12         16 my $prev = $self->{id};
733 12 100       19 if (@_) { $self->{id} = shift; }
  2         2  
734 12         29 return $prev;
735             }
736              
737             =head2 regexp
738              
739             Title : regexp
740             Usage :
741             Function: Returns a regular expression which matches the IUPAC convention.
742             N will match X, N, - and .
743             Returns : string
744             Args : none (works at the threshold last used for making the IUPAC string)
745              
746             =cut
747              
748             sub regexp {
749 1     1 1 2 my $self=shift;
750 1         1 my $regexp;
751 1         2 foreach my $letter (@{$self->{IUPAC}}) {
  1         3  
752 5         4 my $reg;
753             LETTER: {
754 5 100       4 if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
  5         10  
  1         1  
  1         2  
755 4 100       6 if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
  1         1  
  1         2  
756 3 50       4 if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
  0         0  
  0         0  
757 3 50       6 if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
  0         0  
  0         0  
758 3 50       4 if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
  0         0  
  0         0  
759 3 50       4 if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
  0         0  
  0         0  
760 3 50       5 if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
  0         0  
  0         0  
761 3 50       5 if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
  0         0  
  0         0  
762 3 50       11 if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
  0         0  
  0         0  
763 3 50       4 if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
  0         0  
  0         0  
764 3 100       5 if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
  1         1  
  1         1  
765 2 50       5 if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
  0         0  
  0         0  
766 2 100       3 if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
  1         1  
  1         2  
767 1 50       2 if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
  1         1  
  1         2  
768 0         0 $reg='\S';
769             }
770 5         8 $regexp .= $reg;
771             }
772 1         3 return $regexp;
773             }
774              
775             =head2 regexp_array
776              
777             Title : regexp_array
778             Usage :
779             Function: Returns a regular expression which matches the IUPAC convention.
780             N will match X, N, - and .
781             Returns : array
782             Args : none (works at the threshold last used for making the IUPAC string)
783             To do : I have separated regexp and regexp_array, but
784             maybe they can be rewritten as one - just check what should be returned
785              
786             =cut
787              
788             sub regexp_array {
789 1     1 1 2 my $self=shift;
790 1         2 my @regexp;
791 1         2 foreach my $letter (@{$self->{IUPAC}}) {
  1         2  
792 5         6 my $reg;
793             LETTER: {
794 5 100       5 if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
  5         8  
  1         1  
  1         2  
795 4 100       5 if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
  1         2  
  1         1  
796 3 50       4 if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
  0         0  
  0         0  
797 3 50       5 if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
  0         0  
  0         0  
798 3 50       4 if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
  0         0  
  0         0  
799 3 50       4 if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
  0         0  
  0         0  
800 3 50       4 if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
  0         0  
  0         0  
801 3 50       4 if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
  0         0  
  0         0  
802 3 50       5 if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
  0         0  
  0         0  
803 3 50       5 if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
  0         0  
  0         0  
804 3 100       4 if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
  1         2  
  1         1  
805 2 50       3 if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
  0         0  
  0         0  
806 2 100       4 if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
  1         1  
  1         1  
807 1 50       2 if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
  1         1  
  1         1  
808 0         0 $reg='\S';
809             }
810 5         8 push @regexp,$reg;
811             }
812 1         4 return @regexp;
813             }
814              
815              
816             =head2 _compress_array
817              
818             Title : _compress_array
819             Usage :
820             Function: Will compress an array of real signed numbers to a string (ie vector
821             of bytes) -127 to +127 for bi-directional(signed) and 0..255 for
822             unsigned
823             Returns : String
824             Args : array reference, followed by an max value and direction (optional,
825             default 1-unsigned),1 unsigned, any other is signed.
826              
827             =cut
828              
829             sub _compress_array {
830 3     3   5 my ($array,$lm,$direct)=@_;
831 3         2 my $str;
832 3 50 33     12 return unless(($array) && ($lm));
833 3 50       3 $direct=1 unless ($direct);
834 3 100       9 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
835 3         3 foreach my $c (@{$array}) {
  3         6  
836 62 50       77 $c=$lm if ($c>$lm);
837 62 50 33     81 $c=-$lm if (($c<-$lm) && ($direct !=1));
838 62 50 66     96 $c=0 if (($c<0) && ($direct ==1));
839 62         62 my $byte=int($k1*$c);
840 62 100       75 $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
841 62         55 my $char=chr($byte);
842 62         67 $str.=$char;
843             }
844 3         10 return $str;
845             }
846              
847             =head2 _uncompress_string
848              
849             Title : _uncompress_string
850             Usage :
851             Function: Will uncompress a string (vector of bytes) to create an array of
852             real signed numbers (opposite to_compress_array)
853             Returns : string, followed by an max value and
854             direction (optional, default 1-unsigned), 1 unsigned, any other is signed.
855             Args : array
856              
857             =cut
858              
859             sub _uncompress_string {
860 3     3   14 my ($str,$lm,$direct)=@_;
861 3         3 my @array;
862 3 50 33     9 return unless(($str) && ($lm));
863 3 50       4 $direct=1 unless ($direct);
864 3 100       6 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
865 3         12 foreach my $c (split(//,$str)) {
866 62         52 my $byte=ord($c);
867 62 100       76 $byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
868 62         52 my $num=$byte/$k1;
869 62         65 push @array,$num;
870             }
871 3         13 return @array;
872             }
873              
874             =head2 get_compressed_freq
875              
876             Title : get_compressed_freq
877             Usage :
878             Function: A method to provide a compressed frequency vector. It uses one byte
879             to code the frequence for one of the probability vectors for one
880             position. Useful for relational database. Improvement of the previous
881             0..a coding.
882             Example : my $strA=$self->get_compressed_freq('A');
883             Returns : String
884             Args : char
885              
886             =cut
887              
888             sub get_compressed_freq {
889 2     2 1 516 my $self=shift;
890 2         4 my $base=shift;
891 2         3 my $string='';
892 2         2 my @prob;
893             BASE: {
894 2 50       3 if ($base eq 'A') {
  2         5  
895 2 50       5 @prob= @{$self->{probA}} unless (!defined($self->{probA}));
  2         6  
896 2         4 last BASE;
897             }
898 0 0       0 if ($base eq 'G') {
899 0 0       0 @prob= @{$self->{probG}} unless (!defined($self->{probG}));
  0         0  
900 0         0 last BASE;
901             }
902 0 0       0 if ($base eq 'C') {
903 0 0       0 @prob= @{$self->{probC}} unless (!defined($self->{probC}));
  0         0  
904 0         0 last BASE;
905             }
906 0 0       0 if ($base eq 'T') {
907 0 0       0 @prob= @{$self->{probT}} unless (!defined($self->{probT}));
  0         0  
908 0         0 last BASE;
909             }
910 0         0 $self->throw ("No such base: $base!\n");
911             }
912 2         5 my $str= _compress_array(\@prob,1,1);
913 2         5 return $str;
914             }
915              
916             =head2 get_compressed_logs
917              
918             Title : get_compressed_logs
919             Usage :
920             Function: A method to provide a compressed log-odd vector. It uses one byte to
921             code the log value for one of the log-odds vectors for one position.
922             Example : my $strA=$self->get_compressed_logs('A');
923             Returns : String
924             Args : char
925              
926             =cut
927              
928             sub get_compressed_logs {
929 1     1 1 2 my $self=shift;
930 1         1 my $base=shift;
931 1         2 my $string='';
932 1         1 my @prob;
933             BASE: {
934 1 50       1 if ($base eq 'A') {@prob= @{$self->{logA}} unless (!defined($self->{logA})); last BASE; }
  1 50       3  
  1         3  
  1         5  
  1         2  
935 0 0       0 if ($base eq 'C') {@prob= @{$self->{logC}} unless (!defined($self->{logC})); last BASE; }
  0 0       0  
  0         0  
  0         0  
936 0 0       0 if ($base eq 'G') {@prob= @{$self->{logG}} unless (!defined($self->{logG})); last BASE; }
  0 0       0  
  0         0  
  0         0  
937 0 0       0 if ($base eq 'T') {@prob= @{$self->{logT}} unless (!defined($self->{logT})); last BASE; }
  0 0       0  
  0         0  
  0         0  
938 0         0 $self->throw ("No such base: $base!\n");
939             }
940 1         2 return _compress_array(\@prob,1000,2);
941             }
942              
943             =head2 sequence_match_weight
944              
945             Title : sequence_match_weight
946             Usage :
947             Function: This method will calculate the score of a match, based on the PWM
948             if such is associated with the matrix object. Returns undef if no
949             PWM data is available.
950             Throws : if the length of the sequence is different from the matrix width
951             Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
952             Returns : Floating point
953             Args : string
954              
955             =cut
956              
957             sub sequence_match_weight {
958 1     1 1 504 my ($self,$seq)=@_;
959 1 50       3 return unless ($self->{logA});
960 1         15 my $width=$self->width;
961 1 50       1 $self->throw ("I can calculate the score only for sequence which are exactly my size for $seq, my width is $width\n") unless (length($seq)==@{$self->{logA}});
  1         3  
962 1         2 $seq = uc($seq);
963 1         6 my @seq=split(//,$seq);
964 1         2 my $score = 0;
965 1         1 my $i=0;
966 1         2 foreach my $pos (@seq) {
967 25         25 my $tv = 'log'.$pos;
968 25 50       31 $self->warn("Position ".($i+1)." of input sequence has unknown (ambiguity?) character '$pos': scores will be wrong") unless defined $self->{$tv};
969 25 50       38 $score += defined $self->{$tv} ? $self->{$tv}->[$i] : 0;
970 25         25 $i++;
971             }
972 1         5 return $score;
973             }
974              
975             =head2 get_all_vectors
976              
977             Title : get_all_vectors
978             Usage :
979             Function: returns all possible sequence vectors to satisfy the PFM under
980             a given threshold
981             Throws : If threshold outside of 0..1 (no sense to do that)
982             Example : my @vectors=$self->get_all_vectors(4);
983             Returns : Array of strings
984             Args : (optional) floating
985              
986             =cut
987              
988             sub get_all_vectors {
989 0     0 1   my $self=shift;
990 0           my $thresh=shift;
991 0 0 0       $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1));
992 0           my @seq=split(//,$self->consensus($thresh*10));
993 0           my @perm;
994 0           for my $i (0..@{$self->{probA}}) {
  0            
995 0 0         push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh);
  0            
996 0 0         push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh);
  0            
997 0 0         push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh);
  0            
998 0 0         push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh);
  0            
999 0 0         push @{$perm[$i]},'N' if ($seq[$i] eq 'N');
  0            
1000             }
1001 0           my $fpos=shift @perm;
1002 0           my @strings=@$fpos;
1003 0           foreach my $pos (@perm) {
1004 0           my @newstr;
1005 0           foreach my $let (@$pos) {
1006 0           foreach my $string (@strings) {
1007 0           my $newstring = $string . $let;
1008 0           push @newstr,$newstring;
1009             }
1010             }
1011 0           @strings=@newstr;
1012             }
1013 0           return @strings;
1014             }
1015              
1016             1;