File Coverage

blib/lib/Bio/CUA/CUB/Calculator.pm
Criterion Covered Total %
statement 101 241 41.9
branch 37 114 32.4
condition 7 39 17.9
subroutine 13 23 56.5
pod 11 14 78.5
total 169 431 39.2


line stmt bran cond sub pod time code
1             package Bio::CUA::CUB::Calculator;
2              
3             =pod
4              
5             =head1 NAME
6              
7             Bio::CUA::CUB::Calculator -- A module to calculate codon usage bias
8             (CUB) indice for protein-coding sequences
9              
10             =head1 SYNOPSIS
11              
12             use Bio::CUA::CUB::Calculator;
13              
14             my $calc = Bio::CUA::CUB::Calculator->new(
15             -codon_table => 1,
16             -tAI_values => 'tai.out' # from Bio::CUA::CUB::Builder
17             );
18              
19             # calculate tAI for each sequence
20             my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa");
21             or
22             my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa", -format => 'fasta');
23              
24             while(my $seq = $io->next_seq)
25             {
26             my $tai = $calc->tai($seq);
27             printf("%10s: %.7f\n", $seq->id, $tai);
28             }
29              
30             =head1 DESCRIPTION
31              
32             Codon usage bias (CUB) can be represented at two levels, codon and
33             sequence. The latter is often computed as the geometric means of the
34             sequence's codons. This module caculates CUB metrics at sequence
35             level.
36              
37             Supported CUB metrics include CAI (codon adaptation index), tAI (tRNA
38             adaptation index), Fop (Frequency of optimal codons), ENC (Effective
39             Number of Codons) and their variants. See the methods below for
40             details.
41              
42             =cut
43              
44 2     2   3645 use 5.006;
  2         6  
45 2     2   9 use strict;
  2         3  
  2         47  
46 2     2   7 use warnings;
  2         3  
  2         58  
47 2     2   9 use parent qw/Bio::CUA::CUB/;
  2         2  
  2         10  
48 2     2   87 use Bio::CUA::CodonTable;
  2         2  
  2         39  
49 2     2   6 use Scalar::Util qw/blessed/;
  2         2  
  2         5274  
50              
51             =head1 METHODS
52              
53             =head2 new
54              
55             Title : new
56             Usage : my $calc=Bio::CUA::CUB::Calculator->new(@args);
57             Function: initialize the calculator
58             Returns : an object of this class
59             Args : a hash with following acceptable keys:
60            
61             B:
62              
63             =over
64              
65             =item C<-codon_table>
66              
67             the genetic code table applied for following sequence analyses. It
68             can be specified by an integer (genetic code table id), an object of
69             L, or a map-file. See the method
70             L for details.
71              
72             =back
73              
74             B
75              
76             =over
77              
78             =item C<-optimal_codons>
79              
80             a file contains all the optimal codons, one codon per line. Or a
81             hashref with keys being the optimal codons
82              
83             =back
84              
85             B
86              
87             =over
88              
89             =item C<-CAI_values>
90              
91             a file containing CAI values for each codon, excluding 3
92             stop codons, so 61 lines with each line containing a codon and its
93             value separated by space or tab.
94             or
95             a hashref with each key being a codon and each value being CAI index
96             for the codon.
97              
98             =back
99              
100             B
101              
102             =over
103              
104             =item C<-tAI_values>
105              
106             similar to C<-CAI_values>, a file or a hash containing tAI value
107             for each codon.
108              
109             =back
110              
111             B
112              
113             =over
114              
115             =item C<-base_background>
116              
117             optional.
118             an arrayref containing base frequency of 4 bases (in the order
119             A,T,C, and G) derived from background data such as introns.
120             Or one of the following values: 'seq', 'seq3', which will lead to
121             estimating base frequencies from each analyzed sequence in whol or
122             its 3rd codon position, respectively.
123              
124             It can also be specified for each analyzed sequence with the methods
125             L and L
126              
127             =back
128              
129             =cut
130              
131             sub new
132             {
133 1     1 1 2609 my ($caller, @args) = @_;
134              
135             # option -codon_table is processed in this parent class
136 1         11 my $self = $caller->SUPER::new(@args);
137              
138             # process all the parameters
139 1         4 my $hashRef = $self->_array_to_hash(\@args);
140 1         5 while(my ($tag, $val) = each %$hashRef)
141             {
142             # tag 'codon_table' is now processed by parent package
143 3 50       21 if($tag =~ /^optimal/o) # optimal codons
    100          
    100          
    50          
144             {
145             # a hash using codons as keys
146             my $optimalCodons = ref($val) eq 'HASH'?
147 0 0       0 {map { $_ => 1 } keys(%$val)} : $self->_parse_file($val,1);
  0         0  
148 0         0 $self->{'_optimal_codons'} = $optimalCodons;
149             }elsif($tag =~ /^cai/o) # CAI values
150             {
151             # a hash like codon => CAI_value
152 1 50       9 my $caiValues = ref($val) eq 'HASH'?
153             $val : $self->_parse_file($val,2);
154 1         6 $self->{'_cai_values'} = $caiValues;
155             }elsif($tag =~ /^tai/o) # tAI values
156             {
157             # a hash like codon => tAI_value
158 1 50       6 my $taiValues = ref($val) eq 'HASH'?
159             $val : $self->_parse_file($val,2);
160 1         5 $self->{'_tai_values'} = $taiValues;
161             }elsif($tag =~ /^base/o) # background base composition
162             {
163 0 0 0     0 if(ref($val) eq 'ARRAY' or $val =~ /^seq/)
164             {
165 0         0 $self->{'_base_comp'} = $val;
166             }else
167             {
168 0         0 $self->throw("Unknown value '$val' for parameter",
169             "-base_background");
170             }
171             }else
172             {
173             # Unknown parameter '$tag', ignored
174             }
175             }
176              
177 1         10 $self->no_atg(1); # exclude ATG in tAI calculation
178             # check the input values
179             # 1. make sure all the sense codons have CAI or tAI values if
180             # provided
181              
182 1         4 return $self;
183             }
184              
185             =head1 sequence input
186              
187             all the following methods accept one of the following formats as
188             sequence input
189              
190             =over
191              
192             =item 1
193            
194             string of nucleotide sequence with length of 3N,
195              
196             =item 2
197              
198             sequence object which has a method I to get the sequence string,
199              
200             =item 3
201              
202             a sequence file in fasta format
203              
204             =item 4
205              
206             reference to a codon count hash, like
207             $codons = {
208             AGC => 50,
209             GTC => 124,
210             ... ...
211             }.
212              
213             =back
214              
215             =head2 cai
216              
217             Title : cai
218             Usage : $caiValue = $self->cai($seq);
219             Function: calculate the CAI value for the sequence
220             Returns : a number, or undef if failed
221             Args : see L
222             Note: codons without synonymous competitors are excluded in
223             calculation.
224              
225             =cut
226              
227             sub cai
228             {
229 13     13 1 57 my ($self, $seq) = @_;
230 13         34 $self->_xai($seq, 'CAI');
231             }
232              
233             # the real calculator of tAI or CAI as both have the same formula
234             sub _xai
235             {
236 26     26   33 my ($self, $seq, $type) = @_;
237              
238 26         23 my $name;
239             my $xaiHash;
240 26 100       105 if($type =~ /cai/i)
    50          
241             {
242 13         21 $name = 'CAI';
243 13         23 $xaiHash = $self->{"_cai_values"};
244             }elsif($type =~ /tai/i)
245             {
246 13         18 $name = 'tAI';
247 13         23 $xaiHash = $self->{"_tai_values"};
248             }else
249             {
250 0         0 $self->throw("Unknown adaptation index type '$type'");
251             }
252 26 50       48 unless($xaiHash)
253             {
254 0         0 $self->warn("$name values for codons were not provided for",
255             "this analyzer, so can not calculate $name for sequences");
256 0         0 return undef;
257             }
258              
259 26 50       68 my $codonList = $self->get_codon_list($seq) or return;
260              
261 26         38 my $xai = 0;
262 26         27 my $seqLen = 0; # this excludes some unsuitable codons
263             # get non-degenerative codons which are excluded in CAI
264 26         100 my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
  52         121  
265 26         61 my @senseCodons = $self->codon_table->all_sense_codons;
266 26         102 foreach my $codon (@senseCodons)
267             {
268 1586 100       2184 next unless($codonList->{$codon}); # no observation of this codon
269             # excluding non-degenerate codons for CAI calculation
270 1560 100 100     2698 next if($nonDegCodons{$codon} and $type =~ /cai/i);
271 1534 100       2105 unless(exists $xaiHash->{$codon})
272             {
273 13 0 33     53 $self->warn("Codon '$codon' is ignored")
    0          
274             if($self->debug and ($self->no_atg? ($codon ne 'ATG') : 1));
275 13         25 next;
276             }
277 1521         1204 my $cnt = $codonList->{$codon};
278             # to overcome real number overflow, use log
279 1521         2801 $xai += $cnt*log($xaiHash->{$codon});
280 1521         1851 $seqLen += $cnt;
281             }
282              
283 26 50       63 return undef unless($xai); # no codons with CAI/tAI
284              
285 26         73 $xai = exp($xai/$seqLen);
286 26         307 return $xai;
287             }
288              
289             =head2 fop
290              
291             Title : fop
292             Usage : $fopValue = $self->fop($seq[,$withNonDegenerate]);
293             Function: calculate the fraction of optimal codons in the sequence
294             Returns : a number, or undef if failed
295             Args : for sequence see L.
296             if optional argument '$withNonDegenerate' is true, then
297             non-degenerate codons (those do not have synonymous partners) are
298             included in calculation. Default is excluding these codons.
299              
300             =cut
301              
302             sub fop
303             {
304 0     0 1 0 my ($self, $seq, $withNonDeg) = @_;
305              
306 0 0       0 my $optimalCodons = $self->{'_optimal_codons'} or
307             $self->throw("No optimal codons associated with $self");
308              
309 0 0       0 my $codonList = $self->get_codon_list($seq) or return;
310             # get non-degenerate codons
311 0         0 my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
  0         0  
312              
313              
314 0         0 my $optCnt = 0; # optimal codons
315 0         0 my $total = 0;
316 0         0 while(my ($codon, $cnt) = each %$codonList)
317             {
318             # excluding non-degenerate codons if necessary
319 0 0 0     0 next if(!$withNonDeg and $nonDegCodons{$codon});
320 0 0       0 $optCnt += $cnt if($optimalCodons->{$codon});
321 0         0 $total += $cnt;
322             }
323              
324 0   0     0 return $optCnt/($total || 1);
325             }
326              
327             =head2 tai
328              
329             Title : tai
330             Usage : $taiValue = $self->tai($seq);
331             Function: calculate the tAI value for the sequence
332             Returns : a number, or undef if failed
333             Args : for sequence see L.
334              
335             Note: codons which do not have tAI values are ignored from input
336             sequence
337              
338             =cut
339              
340             sub tai
341             {
342 13     13 1 56 my ($self, $seq) = @_;
343 13         26 $self->_xai($seq, 'tAI');
344             }
345              
346             # an alias
347             sub tAI
348             {
349 0     0 0 0 my ($self, $seq) = @_;
350 0         0 $self->_xai($seq, 'tAI');
351             }
352              
353             =head2 enc
354              
355             Title : enc
356             Usage : $encValue = $self->enc($seq,[$minTotal]);
357             Function: calculate ENC for the sequence using the original method
358             I
359             Returns : a number, or undef if failed
360             Args : for sequence see L.
361             Optional argument I specifies minimal count
362             for an amino acid; if observed count is smaller than this count, this
363             amino acid's F will not be calculated but inferred. Deafult is 5.
364              
365             Note: when the F of a redundancy group is unavailable due to lack of
366             sufficient data, it will be estimated from other groups following Wright's
367             method, that is, F3=(F2+F4)/2, and for others, F=1/r where r is the
368             degeneracy degree of that group.
369              
370             =cut
371             sub enc
372             {
373 13     13 1 56 my ($self, $seq, $minTotal) = @_;
374 13         31 $self->_enc_factory($seq, $minTotal, 'mean');
375             }
376              
377             =head2 enc_r
378              
379             Title : enc_r
380             Usage : $encValue = $self->enc_r($seq,[$minTotal]);
381             Function: similar to the method L, except that missing F values
382             are estimated in a different way.
383             Returns : a number, or undef if failed
384             Args : for sequence see L.
385             Optional argument I specifies minimal count
386             for an amino acid; if observed count is smaller than this count, this
387             amino acid's F will not be calculated but inferred. Deafult is 5.
388              
389             Note: for missing Fx of degeneracy class 'x', we first estimated the
390             ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
391             classes with known F values. Then Fx is obtained by solving the simple
392             equation.
393              
394             =cut
395              
396             sub enc_r
397             {
398 13     13 1 53 my ($self, $seq, $minTotal) = @_;
399 13         37 $self->_enc_factory($seq, $minTotal, 'equal_ratio');
400             }
401              
402             =head2 encp
403              
404             Title : encp
405             Usage : $encpValue = $self->encp($seq,[$minTotal,[$A,$T,$C,$G]]);
406             Function: calculate ENC for the sequence using the updated method
407             by Novembre I<2002, MBE>, which corrects the background nucleotide
408             composition.
409             Returns : a number, or undef if failed
410             Args : for sequence see L.
411            
412             Optional argument I specifies minimal count
413             for an amino acid; if observed count is smaller than this count, this
414             amino acid's F will not be calculated but inferred. Deafult is 5.
415              
416             another optional argument gives the background nucleotide composition
417             in the order of A,T,C,G in an array, if not provided, it will use the
418             default one provided when calling the method L. If stil
419             unavailable, error occurs.
420              
421             =cut
422              
423             sub encp
424             {
425 0     0 1 0 my ($self, $seq, $minTotal, $baseComp) = @_;
426 0         0 $self->_enc_factory($seq, $minTotal, 'mean', 1, $baseComp);
427             }
428              
429             =head2 encp_r
430              
431             Title : encp_r
432             Usage : $encpValue =
433             $self->encp_r($seq,[$minTotal,[$A,$T,$C,$G]]);
434             Function: similar to the method L, except that missing F values
435             are estimated using a different way.
436             Returns : a number, or undef if failed
437             Args : for sequence see L.
438            
439             Optional argument I specifies minimal count
440             for an amino acid; if observed count is smaller than this count, this
441             amino acid's F will not be calculated but inferred. Deafult is 5.
442              
443             another optional argument gives the background nucleotide composition
444             in the order of A,T,C,G in an array, if not provided, it will use the
445             default one provided when calling the method L. If stil
446             unavailable, error occurs.
447              
448             Note: for missing Fx of degeneracy class 'x', we first estimated the
449             ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
450             classes with known F values. Then Fx is obtained by solving the simple
451             equation.
452              
453             =cut
454              
455             sub encp_r
456             {
457 0     0 1 0 my ($self, $seq, $minTotal, $baseComp) = @_;
458 0         0 $self->_enc_factory($seq, $minTotal, 'equal_ratio', 1, $baseComp);
459             }
460              
461             # real function calculate different versions of ENC
462             # parameters explanation
463             # seq: sequence string, sequence object, sequence file, or hash
464             # reference to codon list
465             # correctBaseComp: if true, correct background base composition using
466             # Novembre's method
467             # F_EstimateMethod: how to estimate average F for a certain redundancy
468             # class if that class does not have observed data so can't be
469             # calculated; 'mean' is for Wright's method, and 'equal_ratio' for
470             # Zhenguo's method. The latter assumes a similar (1/F[r])/r for each
471             # redundancy class with redundancy degree 'r'
472             # baseComposition: optional, a reference to an array containing
473             # background nucleotide composition. If provided, it overides the
474             # values set when method L was called.
475             sub _enc_factory
476             {
477 26     26   31 my ($self, $seq, $minTotal, $F_EstimateMethod, $correctBaseComp, $baseComposition) = @_;
478              
479 26 50       58 $minTotal = 5 unless(defined $minTotal); # the minumum count of residule for a given amino
480             # acid for it to be included in F calculation
481            
482             # a hash ref, codon => counts
483 26 50       57 my $codonList = $self->get_codon_list($seq) or return;
484 26 50 33     270 my $seqId = (blessed($seq) and $seq->can('id'))? $seq->id : '';
485              
486             # determine expected codon frequency if necessary
487 26         39 my $expectedCodonFreq;
488             # determine base compositions now
489 26 50       46 if($correctBaseComp)
490             {
491 0 0       0 if(!defined($baseComposition)) # not provided for this sequence
492             {
493 0         0 my $defaultBaseComp = $self->base_composition;
494 0 0       0 unless($defaultBaseComp)
495             {
496 0         0 $self->warn("No default base composition for seq",
497             " '$seqId', so no GC-corrected ENC");
498 0         0 return undef;
499             }
500 0 0       0 if($defaultBaseComp eq 'seq')
    0          
501             {
502 0         0 $baseComposition =
503             $self->estimate_base_composition($codonList);
504             }elsif($defaultBaseComp eq 'seq3')
505             {
506 0         0 $baseComposition =
507             $self->estimate_base_composition($codonList,3);
508             }else
509             {
510 0         0 $baseComposition = $defaultBaseComp;
511             }
512             } # otherwise sequence-specific base-composition is provided
513             # here
514              
515             # codon frequency may not be estimated due to invalid
516             # compositions
517             $expectedCodonFreq =
518 0 0       0 $self->expect_codon_freq($baseComposition)
519             if($baseComposition);
520 0 0       0 return undef unless($expectedCodonFreq);
521             }
522              
523              
524             # now let us calculate F for each redundancy class
525             # determined by codon table, containing all amino acid classes
526 26         68 my $AARedundancyClasses = $self->aa_degeneracy_classes; #
527 26         25 my %FavgByClass; # record the average F from each class
528 26         84 while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
529             {
530             # number of observed AA types in this class
531 130         98 my $numAAInClass = 0; # number of amino acid species in this class
532 130         95 my $Fsum = 0;
533 130         261 while(my ($AA, $codonArray) = each %$AAHash)
534             {
535 494 100       700 if($redundancy == 1) # this class has only one codon
536             {
537 26         29 $numAAInClass = scalar(keys %$AAHash);
538 26         26 $Fsum = $numAAInClass; # each AA contribute 1
539 26         35 last;
540             }
541             # total count of observed residules for this AA
542 468         353 my $AAcnt = 0;
543 468         526 foreach (@$codonArray)
544             {
545             # check the codon exists in this seq
546 1534 100       2128 next unless(exists $codonList->{$_});
547 1508         1528 $AAcnt += $codonList->{$_};
548             }
549            
550             # skip if occurence of this amino acid is less than the
551             # minimal threshold
552 468 100 66     1396 next if($AAcnt < $minTotal or $AAcnt < 2);
553              
554             # now calculate F for this AA species
555 464 50       514 if($correctBaseComp) # correct base composition
556             {
557 0         0 my $chisq = 0;
558             # get the freq of codons of this amino acids
559 0         0 my $totalFreq = 0;
560 0         0 foreach (@$codonArray)
561             {
562 0         0 $totalFreq += $expectedCodonFreq->{$_};
563             }
564 0         0 foreach (@$codonArray)
565             {
566             # set unobserved codons to 0
567 0   0     0 my $codonCnt = $codonList->{$_} || 0;
568             my $expectedFreq =
569 0         0 $expectedCodonFreq->{$_}/$totalFreq;
570 0         0 $chisq += ($codonCnt/$AAcnt -
571             $expectedFreq)**2/$expectedFreq;
572             }
573 0         0 $chisq *= $AAcnt; # don't forget multiply this
574 0         0 $Fsum += ($chisq + $AAcnt -
575             $redundancy)/($redundancy*($AAcnt-1));
576             }else # no correction, use old Wright method
577             {
578 464         323 my $pSquareSum = 0;
579 464         483 foreach (@$codonArray)
580             {
581 1526         1124 my $codonCnt = $codonList->{$_};
582 1526 100       1939 next unless($codonCnt);
583 1504         1894 $pSquareSum += ($codonCnt/$AAcnt)**2;
584             }
585 464         546 $Fsum += ($AAcnt*$pSquareSum -1)/($AAcnt-1);
586             }
587             # increase the number of AA species in this class
588 464         1064 $numAAInClass++;
589             }
590             # check whether all AA species are ignored or not observed
591 130 50       197 if($numAAInClass > 0)
592             {
593             # note, in some special cases, Fsum == 0 even though
594             # $numAAInClass >0, for example for a 6-fold amino acid,
595             # if each of its codon is observed only once, it would
596             # result in Faa = 0. so we need add restriction on this
597 130 50       481 $FavgByClass{$redundancy} = $Fsum/$numAAInClass if($Fsum >
598             0);
599             } # otherwise no data
600             }
601              
602             # estimate missing redundancy classes due to no observation of
603             # that class's AAs, and get the final Nc values
604 26         25 my $enc = 0;
605 26         61 while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
606             {
607             # the number of AA species in this class, determined by the
608             # codon table, not the input seq
609 130         109 my $AAcntInClass = scalar(keys %$AAHash);
610 130 50       224 if(exists $FavgByClass{$redundancy})
611             {
612             die "$redundancy, $AAcntInClass in seq '$seqId':$!"
613 130 50       177 unless($FavgByClass{$redundancy});
614 130         143 $enc += $AAcntInClass/$FavgByClass{$redundancy};
615 130         271 next;
616             }
617              
618             # otherwise this class was not observed
619 0 0       0 my $equalRatio = $F_EstimateMethod eq 'mean'? 0 : 1;
620 0         0 my $estimatedFavg = _estimate_F(\%FavgByClass, $redundancy,
621             $equalRatio);
622 0 0       0 unless($estimatedFavg)
623             {
624 0         0 $self->warn("Cannot estimate average F for class with",
625             "redundancy=$redundancy in sequence $seqId, ",
626             "probably no known F values for any class");
627 0         0 return undef;
628             }
629 0         0 $enc += $AAcntInClass/$estimatedFavg;
630             }
631              
632 26         236 return $enc;
633             }
634              
635             # estimate F average
636             sub _estimate_F
637             {
638 0     0     my ($knownF,$redundancy,$equalRatio) = @_;
639              
640 0 0         return 1 if($redundancy == 1);
641              
642 0 0         if($equalRatio) # get the mean (1/Fr-1)/(r-1)
643             {
644 0           my $ratioSum;
645 0           my $cnt = 0; # number of known Fs
646 0           while(my ($r, $F) = each %$knownF)
647             {
648 0 0         next if $r < 2; # excluding class of redundancy==1
649 0           $ratioSum += (1/$F-1)/($r-1);
650 0           $cnt++;
651             }
652              
653 0 0         if( $cnt > 0)
654             {
655 0           my $Fx = 1/($ratioSum/$cnt*($redundancy-1)+1);
656 0           return $Fx;
657             }else # no known F for any class with redundancy > 1
658             {
659 0           return undef;
660             }
661              
662             }else # otherwise use Wright's method
663             {
664 0 0         if($redundancy == 3)
665             {
666 0   0       my $F2 = $knownF->{2} || 1/2; # class 2
667 0   0       my $F4 = $knownF->{4} || 1/4; # class 4
668 0           return ($F2 + $F4)/2;
669             }else
670             {
671 0           return 1/$redundancy; # assuming no bias
672             }
673             }
674              
675             }
676              
677             # get the default base compostion of this object
678             sub base_composition
679             {
680 0     0 0   my $self = shift;
681              
682 0           return $self->{'_base_comp'};
683             }
684              
685             =head2 estimate_base_composition
686              
687             Title : estimate_base_composition
688             Usage : @baseComp = $self->estimate_base_composition($seq,[$pos])
689             Function: estimate base compositions in the sequence
690             Returns : an array of numbers in the order of A,T,C,G, or its
691             reference if in the scalar context
692             Args : a sequence string or a reference of hash containing codons
693             and their counts (eg., AGG => 30), and optionally an integer; the integer
694             specifies which codon position's nucleotide will be used instead of
695             all three codon positions.
696              
697             =cut
698              
699             sub estimate_base_composition
700             {
701 0     0 1   my ($self, $seq, $pos) = @_;
702              
703 0           my %bases;
704             # check if input is a codon list
705             my $codonList;
706 0 0         if(ref($seq) eq 'HASH') # a codon list
707             {
708 0           $codonList = $seq;
709             }else # a sequence string or object
710             {
711 0           $seq = $self->_get_seq_str($seq);
712             }
713              
714 0 0         if($pos)
715             {
716 0 0 0       $self->throw("Only 1, 2, or 3 are acceptable for pos,",
717             "'$pos' is not valid here") unless($pos > 0 and $pos < 4);
718 0 0         if($codonList) # input is a codon list
719             {
720 0           my $base;
721 0           while(my ($codon, $cnt) = each %$codonList)
722             {
723 0           $base = substr($codon, $pos-1,1);
724 0           $bases{$base} += $cnt;
725             }
726             }else # a sequence
727             {
728 0           my $seqLen = length($seq);
729 0           my $accuLen = $pos - 1;
730 0           my $period = 3; # a codon length
731 0           my $base;
732 0           while($accuLen < $seqLen)
733             {
734 0           $base = substr($seq,$accuLen,1);
735 0           $bases{$base}++;
736 0           $accuLen += $period;
737             }
738             }
739             }else # all nucleotides
740             {
741 0 0         if($codonList) # input is a codon list
742             {
743 0           while(my ($codon, $cnt) = each %$codonList)
744             {
745 0           map { $bases{$_} += $cnt } split('', $codon);
  0            
746             }
747             }else
748             {
749 0           map { $bases{$_}++ } split('',$seq);
  0            
750             }
751             }
752              
753 0           my $total = 0;
754 0           my @comp;
755 0           foreach ($self->bases) # only consider A,T,C,G
756             {
757 0   0       $total += $bases{$_} || 0;
758 0   0       push @comp, $bases{$_} || 0;
759             }
760 0           @comp = map { $_/$total } @comp;
  0            
761              
762 0 0         return wantarray? @comp : \@comp;
763             }
764              
765             =head2 gc_fraction
766              
767             Title : gc_fraction
768             Usage : $frac = $self->gc_fraction($seq,[$pos])
769             Function: get fraction of GC content in the sequence
770             Returns : a floating number between 0 and 1.
771             Args : a sequence string or a reference of hash containing codons
772             and their counts (eg., AGG => 30), and optionally an integer; the integer
773             specifies which codon position's nucleotide will be used for
774             calculation (i.e., 1, 2, or 3), instead of all three positions.
775              
776             =cut
777              
778             sub gc_frac
779             {
780 0     0 0   my ($self, @args) = @_;
781 0           $self->gc_fraction(@args);
782             }
783              
784             sub gc_fraction
785             {
786 0     0 1   my ($self, @args) = @_;
787              
788 0           my @composition = $self->estimate_base_composition(@args);
789 0           my @bases = $self->bases;
790 0           my @indice = grep { $bases[$_] =~ /[GC]/ } 0..$#bases;
  0            
791              
792 0           my $frac = 0;
793 0           foreach (@indice)
794             {
795 0           $frac += $composition[$_];
796             }
797              
798 0           return $frac;
799             }
800              
801             =head2 expect_codon_freq
802              
803             Title : expect_codon_freq
804             Usage : $codonFreq = $self->expect_codon_freq($base_composition)
805             Function: return the expected frequency of codons
806             Returns : reference to a hash in which codon is hash key, and
807             fraction is hash value
808             Args : reference to an array of base compositions in the order of
809             [A, T, C, G], represented as either counts or fractions
810              
811             =cut
812              
813             sub expect_codon_freq
814             {
815 0     0 1   my ($self, $baseComp) = @_;
816              
817 0 0 0       unless($baseComp and ref($baseComp) eq 'ARRAY')
818             {
819 0           $self->warn("Invalid base composition '$baseComp'",
820             " for expect_codon_freq, which should be an array reference");
821 0           return undef;
822             }
823              
824 0           my @bases = $self->bases;
825 0           my $compSum = 0; # used to normalize in case they are not summed to 1
826 0           my $zeroCnt = 0; # count of zero values
827 0           foreach (0..3)
828             {
829 0 0         $zeroCnt++ if($baseComp->[$_] == 0);
830 0           $compSum += $baseComp->[$_];
831             }
832              
833             # set zero value a pseudo count, depending on the provided values
834             # are fractions or counts
835 0 0         my $pseudoCnt = $compSum > 2? 1 : 1/100;
836 0           $compSum += $pseudoCnt * $zeroCnt;
837 0   0       my %freq = map { $bases[$_] => ($baseComp->[$_] || $pseudoCnt)/$compSum } 0..3;
  0            
838 0           my %result;
839 0           foreach my $b1 (@bases)
840             {
841 0           foreach my $b2 (@bases)
842             {
843 0           foreach my $b3 (@bases)
844             {
845 0           my $codon = $b1.$b2.$b3;
846 0           $result{$codon} = $freq{$b1}*$freq{$b2}*$freq{$b3};
847             }
848             }
849             }
850              
851 0           return \%result;
852             }
853              
854             =head1 AUTHOR
855              
856             Zhenguo Zhang, C<< >>
857              
858             =head1 BUGS
859              
860             Please report any bugs or feature requests to C, or through
861             the web interface at L. I will be notified, and then you'll
862             automatically be notified of progress on your bug as I make changes.
863              
864              
865             =head1 SUPPORT
866              
867             You can find documentation for this module with the perldoc command.
868              
869             perldoc Bio::CUA::CUB::Calculator
870              
871              
872             You can also look for information at:
873              
874             =over 4
875              
876             =item * RT: CPAN's request tracker (report bugs here)
877              
878             L
879              
880             =item * AnnoCPAN: Annotated CPAN documentation
881              
882             L
883              
884             =item * CPAN Ratings
885              
886             L
887              
888             =item * Search CPAN
889              
890             L
891              
892             =back
893              
894              
895             =head1 ACKNOWLEDGEMENTS
896              
897              
898             =head1 LICENSE AND COPYRIGHT
899              
900             Copyright 2015 Zhenguo Zhang.
901              
902             This program is free software: you can redistribute it and/or modify
903             it under the terms of the GNU General Public License as published by
904             the Free Software Foundation, either version 3 of the License, or
905             (at your option) any later version.
906              
907             This program is distributed in the hope that it will be useful,
908             but WITHOUT ANY WARRANTY; without even the implied warranty of
909             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
910             GNU General Public License for more details.
911              
912             You should have received a copy of the GNU General Public License
913             along with this program. If not, see L.
914              
915              
916             =cut
917              
918             1; # End of Bio::CUA::CUB::Calculator