File Coverage

blib/lib/Bio/CUA/CUB/Calculator.pm
Criterion Covered Total %
statement 98 235 41.7
branch 36 112 32.1
condition 6 36 16.6
subroutine 12 22 54.5
pod 11 14 78.5
total 163 419 38.9


line stmt bran cond sub pod time code
1             package Bio::CUA::CUB::Calculator;
2              
3             =pod
4              
5             =head1 NAME
6              
7             Bio::CUA::CUB::Calculator -- A module to calculate codon usage bias
8             (CUB) indice for protein-coding sequences
9              
10             =head1 SYNOPSIS
11              
12             use Bio::CUA::CUB::Calculator;
13              
14             my $calc = Bio::CUA::CUB::Calculator->new(
15             -codon_table => 1,
16             -tAI_values => 'tai.out' # from Bio::CUA::CUB::Builder
17             );
18              
19             # calculate tAI for each sequence
20             my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa");
21             or
22             my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa", -format => 'fasta');
23              
24             while(my $seq = $io->next_seq)
25             {
26             my $tai = $calc->tai($seq);
27             printf("%10s: %.7f\n", $seq->id, $tai);
28             }
29              
30             =head1 DESCRIPTION
31              
32             Codon usage bias (CUB) can be represented at two levels, codon and
33             sequence. The latter is often computed as the geometric means of the
34             sequence's codons. This module caculates CUB metrics at sequence
35             level.
36              
37             Supported CUB metrics include CAI (codon adaptation index), tAI (tRNA
38             adaptation index), Fop (Frequency of optimal codons), ENC (Effective
39             Number of Codons) and their variants. See the methods below for
40             details.
41              
42             =cut
43              
44 2     2   4194 use 5.006;
  2         8  
  2         97  
45 2     2   15 use strict;
  2         4  
  2         86  
46 2     2   14 use warnings;
  2         3  
  2         88  
47 2     2   14 use parent qw/Bio::CUA::CUB/;
  2         3  
  2         22  
48 2     2   111 use Bio::CUA::CodonTable;
  2         4  
  2         5698  
49              
50             =head1 METHODS
51              
52             =head2 new
53              
54             Title : new
55             Usage : my $calc=Bio::CUA::CUB::Calculator->new(@args);
56             Function: initialize the calculator
57             Returns : an object of this class
58             Args : a hash with following acceptable keys:
59            
60             B:
61              
62             =over
63              
64             =item C<-codon_table>
65              
66             the genetic code table applied for following sequence analyses. It
67             can be specified by an integer (genetic code table id), an object of
68             L, or a map-file. See the method
69             L for details.
70              
71             =back
72              
73             B
74              
75             =over
76              
77             =item C<-optimal_codons>
78              
79             a file contains all the optimal codons, one codon per line. Or a
80             hashref with keys being the optimal codons
81              
82             =back
83              
84             B
85              
86             =over
87              
88             =item C<-CAI_values>
89              
90             a file containing CAI values for each codon, excluding 3
91             stop codons, so 61 lines with each line containing a codon and its
92             value separated by space or tab.
93             or
94             a hashref with each key being a codon and each value being CAI index
95             for the codon.
96              
97             =back
98              
99             B
100              
101             =over
102              
103             =item C<-tAI_values>
104              
105             similar to C<-CAI_values>, a file or a hash containing tAI value
106             for each codon.
107              
108             =back
109              
110             B
111              
112             =over
113              
114             =item C<-base_background>
115              
116             optional.
117             an arrayref containing base frequency of 4 bases (in the order
118             A,T,C, and G) derived from background data such as introns.
119             Or one of the following values: 'seq', 'seq3', which will lead to
120             estimating base frequencies from each analyzed sequence in whol or
121             its 3rd codon position, respectively.
122              
123             It can also be specified for each analyzed sequence with the methods
124             L and L
125              
126             =back
127              
128             =cut
129              
130             sub new
131             {
132 1     1 1 2873 my ($caller, @args) = @_;
133              
134             # option -codon_table is processed in this parent class
135 1         12 my $self = $caller->SUPER::new(@args);
136              
137             # process all the parameters
138 1         4 my $hashRef = $self->_array_to_hash(\@args);
139 1         6 while(my ($tag, $val) = each %$hashRef)
140             {
141             # tag 'codon_table' is now processed by parent package
142 3 50       29 if($tag =~ /^optimal/o) # optimal codons
    100          
    100          
    50          
143             {
144             # a hash using codons as keys
145 0         0 my $optimalCodons = ref($val) eq 'HASH'?
146 0 0       0 {map { $_ => 1 } keys(%$val)} : $self->_parse_file($val,1);
147 0         0 $self->{'_optimal_codons'} = $optimalCodons;
148             }elsif($tag =~ /^cai/o) # CAI values
149             {
150             # a hash like codon => CAI_value
151 1 50       8 my $caiValues = ref($val) eq 'HASH'?
152             $val : $self->_parse_file($val,2);
153 1         10 $self->{'_cai_values'} = $caiValues;
154             }elsif($tag =~ /^tai/o) # tAI values
155             {
156             # a hash like codon => tAI_value
157 1 50       11 my $taiValues = ref($val) eq 'HASH'?
158             $val : $self->_parse_file($val,2);
159 1         8 $self->{'_tai_values'} = $taiValues;
160             }elsif($tag =~ /^base/o) # background base composition
161             {
162 0 0 0     0 if(ref($val) eq 'ARRAY' or $val =~ /^seq/)
163             {
164 0         0 $self->{'_base_comp'} = $val;
165             }else
166             {
167 0         0 $self->throw("Unknown value '$val' for parameter",
168             "-base_composition");
169             }
170             }else
171             {
172             # Unknown parameter '$tag', ignored
173             }
174             }
175              
176 1         18 $self->no_atg(1); # exclude ATG in tAI calculation
177             # check the input values
178             # 1. make sure all the sense codons have CAI or tAI values if
179             # provided
180              
181 1         11 return $self;
182             }
183              
184             =head1 sequence input
185              
186             all the following methods accept one of the following formats as
187             sequence input
188              
189             =over
190              
191             =item 1
192            
193             string of nucleotide sequence with length of 3N,
194              
195             =item 2
196              
197             sequence object which has a method I to get the sequence string,
198              
199             =item 3
200              
201             a sequence file in fasta format
202              
203             =item 4
204              
205             reference to a codon count hash, like
206             $codons = {
207             AGC => 50,
208             GTC => 124,
209             ... ...
210             }.
211              
212             =back
213              
214             =head2 cai
215              
216             Title : cai
217             Usage : $caiValue = $self->cai($seq);
218             Function: calculate the CAI value for the sequence
219             Returns : a number, or undef if failed
220             Args : see L
221             Note: codons without synonymous competitors are excluded in
222             calculation.
223              
224             =cut
225              
226             sub cai
227             {
228 13     13 1 98 my ($self, $seq) = @_;
229 13         56 $self->_xai($seq, 'CAI');
230             }
231              
232             # the real calculator of tAI or CAI as both have the same formula
233             sub _xai
234             {
235 26     26   60 my ($self, $seq, $type) = @_;
236              
237 26         52 my $name;
238             my $xaiHash;
239 26 100       198 if($type =~ /cai/i)
    50          
240             {
241 13         29 $name = 'CAI';
242 13         35 $xaiHash = $self->{"_cai_values"};
243             }elsif($type =~ /tai/i)
244             {
245 13         24 $name = 'tAI';
246 13         39 $xaiHash = $self->{"_tai_values"};
247             }else
248             {
249 0         0 $self->throw("Unknown adaptation index type '$type'");
250             }
251 26 50       79 unless($xaiHash)
252             {
253 0         0 $self->warn("$name values for codons were not provided for",
254             "this analyzer, so can not calculate $name for sequences");
255 0         0 return undef;
256             }
257              
258 26 50       132 my $codonList = $self->get_codon_list($seq) or return;
259              
260 26         61 my $xai = 0;
261 26         47 my $seqLen = 0; # this excludes some unsuitable codons
262             # get non-degenerative codons which are excluded in CAI
263 26         193 my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
  52         184  
264 26         112 my @senseCodons = $self->codon_table->all_sense_codons;
265 26         144 foreach my $codon (@senseCodons)
266             {
267 1586 100       3138 next unless($codonList->{$codon}); # no observation of this codon
268             # excluding non-degenerate codons for CAI calculation
269 1560 100 100     3521 next if($nonDegCodons{$codon} and $type =~ /cai/i);
270 1534 100       2853 unless(exists $xaiHash->{$codon})
271             {
272 13 0 33     91 $self->warn("Codon '$codon' is ignored")
    0          
273             if($self->debug and ($self->no_atg? ($codon ne 'ATG') : 1));
274 13         39 next;
275             }
276 1521         1711 my $cnt = $codonList->{$codon};
277             # to overcome real number overflow, use log
278 1521         4536 $xai += $cnt*log($xaiHash->{$codon});
279 1521         2615 $seqLen += $cnt;
280             }
281              
282 26 50       104 return undef unless($xai); # no codons with CAI/tAI
283              
284 26         114 $xai = exp($xai/$seqLen);
285 26         578 return $xai;
286             }
287              
288             =head2 fop
289              
290             Title : fop
291             Usage : $fopValue = $self->fop($seq[,$withNonDegenerate]);
292             Function: calculate the fraction of optimal codons in the sequence
293             Returns : a number, or undef if failed
294             Args : for sequence see L.
295             if optional argument '$withNonDegenerate' is true, then
296             non-degenerate codons (those do not have synonymous partners) are
297             included in calculation. Default is excluding these codons.
298              
299             =cut
300              
301             sub fop
302             {
303 0     0 1 0 my ($self, $seq, $withNonDeg) = @_;
304              
305 0 0       0 my $optimalCodons = $self->{'_optimal_codons'} or
306             $self->throw("No optimal codons associated with $self");
307              
308 0 0       0 my $codonList = $self->get_codon_list($seq) or return;
309             # get non-degenerate codons
310 0         0 my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
  0         0  
311              
312              
313 0         0 my $optCnt = 0; # optimal codons
314 0         0 my $total = 0;
315 0         0 while(my ($codon, $cnt) = each %$codonList)
316             {
317             # excluding non-degenerate codons if necessary
318 0 0 0     0 next if(!$withNonDeg and $nonDegCodons{$codon});
319 0 0       0 $optCnt += $cnt if($optimalCodons->{$codon});
320 0         0 $total += $cnt;
321             }
322              
323 0   0     0 return $optCnt/($total || 1);
324             }
325              
326             =head2 tai
327              
328             Title : tai
329             Usage : $taiValue = $self->tai($seq);
330             Function: calculate the tAI value for the sequence
331             Returns : a number, or undef if failed
332             Args : for sequence see L.
333              
334             Note: codons which do not have tAI values are ignored from input
335             sequence
336              
337             =cut
338              
339             sub tai
340             {
341 13     13 1 91 my ($self, $seq) = @_;
342 13         55 $self->_xai($seq, 'tAI');
343             }
344              
345             # an alias
346             sub tAI
347             {
348 0     0 0 0 my ($self, $seq) = @_;
349 0         0 $self->_xai($seq, 'tAI');
350             }
351              
352             =head2 enc
353              
354             Title : enc
355             Usage : $encValue = $self->enc($seq,[$minTotal]);
356             Function: calculate ENC for the sequence using the original method
357             I
358             Returns : a number, or undef if failed
359             Args : for sequence see L.
360             Optional argument I specifies minimal count
361             for an amino acid; if observed count is smaller than this count, this
362             amino acid's F will not be calculated but inferred. Deafult is 5.
363              
364             Note: when the F of a redundancy group is unavailable due to lack of
365             sufficient data, it will be estimated from other groups following Wright's
366             method, that is, F3=(F2+F4)/2, and for others, F=1/r where r is the
367             degeneracy degree of that group.
368              
369             =cut
370             sub enc
371             {
372 13     13 1 84 my ($self, $seq, $minTotal) = @_;
373 13         57 $self->_enc_factory($seq, $minTotal, 'mean');
374             }
375              
376             =head2 enc_r
377              
378             Title : enc_r
379             Usage : $encValue = $self->enc_r($seq,[$minTotal]);
380             Function: similar to the method L, except that missing F values
381             are estimated in a different way.
382             Returns : a number, or undef if failed
383             Args : for sequence see L.
384             Optional argument I specifies minimal count
385             for an amino acid; if observed count is smaller than this count, this
386             amino acid's F will not be calculated but inferred. Deafult is 5.
387              
388             Note: for missing Fx of degeneracy class 'x', we first estimated the
389             ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
390             classes with known F values. Then Fx is obtained by solving the simple
391             equation.
392              
393             =cut
394              
395             sub enc_r
396             {
397 13     13 1 77 my ($self, $seq, $minTotal) = @_;
398 13         46 $self->_enc_factory($seq, $minTotal, 'equal_ratio');
399             }
400              
401             =head2 encp
402              
403             Title : encp
404             Usage : $encpValue = $self->encp($seq,[$minTotal,[$A,$T,$C,$G]]);
405             Function: calculate ENC for the sequence using the updated method
406             by Novembre I<2002, MBE>, which corrects the background nucleotide
407             composition.
408             Returns : a number, or undef if failed
409             Args : for sequence see L.
410            
411             Optional argument I specifies minimal count
412             for an amino acid; if observed count is smaller than this count, this
413             amino acid's F will not be calculated but inferred. Deafult is 5.
414              
415             another optional argument gives the background nucleotide composition
416             in the order of A,T,C,G in an array, if not provided, it will use the
417             default one provided when calling the method L. If stil
418             unavailable, error occurs.
419              
420             =cut
421              
422             sub encp
423             {
424 0     0 1 0 my ($self, $seq, $minTotal, $baseComp) = @_;
425 0         0 $self->_enc_factory($seq, $minTotal, 'mean', 1, $baseComp);
426             }
427              
428             =head2 encp_r
429              
430             Title : encp_r
431             Usage : $encpValue =
432             $self->encp_r($seq,[$minTotal,[$A,$T,$C,$G]]);
433             Function: similar to the method L, except that missing F values
434             are estimated using a different way.
435             Returns : a number, or undef if failed
436             Args : for sequence see L.
437            
438             Optional argument I specifies minimal count
439             for an amino acid; if observed count is smaller than this count, this
440             amino acid's F will not be calculated but inferred. Deafult is 5.
441              
442             another optional argument gives the background nucleotide composition
443             in the order of A,T,C,G in an array, if not provided, it will use the
444             default one provided when calling the method L. If stil
445             unavailable, error occurs.
446              
447             Note: for missing Fx of degeneracy class 'x', we first estimated the
448             ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
449             classes with known F values. Then Fx is obtained by solving the simple
450             equation.
451              
452             =cut
453              
454             sub encp_r
455             {
456 0     0 1 0 my ($self, $seq, $minTotal, $baseComp) = @_;
457 0         0 $self->_enc_factory($seq, $minTotal, 'equal_ratio', 1, $baseComp);
458             }
459              
460             # real function calculate different versions of ENC
461             # parameters explanation
462             # seq: sequence string, sequence object, sequence file, or hash
463             # reference to codon list
464             # correctBaseComp: if true, correct background base composition using
465             # Novembre's method
466             # F_EstimateMethod: how to estimate average F for a certain redundancy
467             # class if that class does not have observed data so can't be
468             # calculated; 'mean' is for Wright's method, and 'equal_ratio' for
469             # Zhenbguo's method. The latter assumes a similar (1/F[r])/r for each
470             # redundancy class with redundancy degree 'r'
471             # baseComposition: optional, a reference to an array containing
472             # background nucleotide composition. If provided, it overides the
473             # values set when method L was called.
474             sub _enc_factory
475             {
476 26     26   61 my ($self, $seq, $minTotal, $F_EstimateMethod, $correctBaseComp, $baseComposition) = @_;
477              
478 26 50       83 $minTotal = 5 unless(defined $minTotal); # the minumum count of residule for a given amino
479             # acid for it to be included in F calculation
480            
481             # a hash ref, codon => counts
482 26 50       122 my $codonList = $self->get_codon_list($seq) or return;
483              
484              
485             # determine expected codon frequency if necessary
486 26         58 my $expectedCodonFreq;
487             # determine base compositions now
488 26 50       82 if($correctBaseComp)
489             {
490 0 0       0 if(!defined($baseComposition)) # not provided for this sequence
491             {
492 0 0       0 my $defaultBaseComp = $self->base_composition or
493             $self->throw("No base composition information, so can not"
494             ,"compute background corrected version of ENC");
495 0 0       0 if($defaultBaseComp eq 'seq')
    0          
496             {
497 0         0 $baseComposition =
498             $self->estimate_base_composition($codonList);
499             }elsif($defaultBaseComp eq 'seq3')
500             {
501 0         0 $baseComposition =
502             $self->estimate_base_composition($codonList,3);
503             }else
504             {
505 0         0 $baseComposition = $defaultBaseComp;
506             }
507             } # otherwise sequence-specific base-composition is provided
508             # here
509              
510             # codon frequency may not be estimated due to invalid
511             # compositions
512             $expectedCodonFreq =
513 0         0 $self->expect_codon_freq($baseComposition);
514 0 0       0 return undef unless($expectedCodonFreq);
515             }
516              
517              
518             # now let us calculate F for each redundancy class
519             # determined by codon table, containing all amino acid classes
520 26         180 my $AARedundancyClasses = $self->aa_degeneracy_classes; #
521 26         52 my %FavgByClass; # record the average F from each class
522 26         217 while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
523             {
524             # number of observed AA types in this class
525 130         157 my $numAAInClass = 0; # number of amino acid species in this class
526 130         131 my $Fsum = 0;
527 130         430 while(my ($AA, $codonArray) = each %$AAHash)
528             {
529 494 100       1070 if($redundancy == 1) # this class has only one codon
530             {
531 26         49 $numAAInClass = scalar(keys %$AAHash);
532 26         38 $Fsum = $numAAInClass; # each AA contribute 1
533 26         59 last;
534             }
535             # total count of observed residules for this AA
536 468         498 my $AAcnt = 0;
537 468         792 foreach (@$codonArray)
538             {
539             # check the codon exists in this seq
540 1534 100       2948 next unless(exists $codonList->{$_});
541 1508         2597 $AAcnt += $codonList->{$_};
542             }
543            
544             # skip if occurence of this amino acid is less than the
545             # minimal threshold
546 468 100 66     2131 next if($AAcnt < $minTotal or $AAcnt < 2);
547              
548             # now calculate F for this AA species
549 464 50       769 if($correctBaseComp) # correct base composition
550             {
551 0         0 my $chisq = 0;
552             # get the freq of codons of this amino acids
553 0         0 my $totalFreq = 0;
554 0         0 foreach (@$codonArray)
555             {
556 0         0 $totalFreq += $expectedCodonFreq->{$_};
557             }
558 0         0 foreach (@$codonArray)
559             {
560             # set unobserved codons to 0
561 0   0     0 my $codonCnt = $codonList->{$_} || 0;
562 0         0 my $expectedFreq =
563             $expectedCodonFreq->{$_}/$totalFreq;
564 0         0 $chisq += ($codonCnt/$AAcnt -
565             $expectedFreq)**2/$expectedFreq;
566             }
567 0         0 $chisq *= $AAcnt; # don't forget multiply this
568 0         0 $Fsum += ($chisq + $AAcnt -
569             $redundancy)/($redundancy*($AAcnt-1));
570             }else # no correction, use old Wright method
571             {
572 464         454 my $pSquareSum = 0;
573 464         722 foreach (@$codonArray)
574             {
575 1526         1858 my $codonCnt = $codonList->{$_};
576 1526 100       2678 next unless($codonCnt);
577 1504         3262 $pSquareSum += ($codonCnt/$AAcnt)**2;
578             }
579 464         1052 $Fsum += ($AAcnt*$pSquareSum -1)/($AAcnt-1);
580             }
581             # increase the number of AA species in this class
582 464         1742 $numAAInClass++;
583             }
584             # check whether all AA species are ignored or not observed
585 130 50       309 if($numAAInClass > 0)
586             {
587             # note, in some special cases, Fsum == 0 even though
588             # $numAAInClass >0, for example for a 6-fold amino acid,
589             # if each of its codon is observed only once, it would
590             # result in Faa = 0. so we need add restriction on this
591 130 50       778 $FavgByClass{$redundancy} = $Fsum/$numAAInClass if($Fsum >
592             0);
593             } # otherwise no data
594             }
595              
596             # estimate missing redundancy classes due to no observation of
597             # that class's AAs, and get the final Nc values
598 26         41 my $enc = 0;
599 26         109 while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
600             {
601             # the number of AA species in this class, determined by the
602             # codon table, not the input seq
603 130         172 my $AAcntInClass = scalar(keys %$AAHash);
604 130 50       303 if(exists $FavgByClass{$redundancy})
605             {
606 130 50       277 die "$redundancy, $AAcntInClass:$!"
607             unless($FavgByClass{$redundancy});
608 130         215 $enc += $AAcntInClass/$FavgByClass{$redundancy};
609 130         391 next;
610             }
611              
612             # otherwise this class was not observed
613 0 0       0 my $equalRatio = $F_EstimateMethod eq 'mean'? 0 : 1;
614 0         0 my $estimatedFavg = _estimate_F(\%FavgByClass, $redundancy,
615             $equalRatio);
616 0 0       0 unless($estimatedFavg)
617             {
618 0         0 $self->warn("Can not estimate average F for class with",
619             "redundancy=$redundancy, probably no known values for this class");
620 0         0 return undef;
621             }
622 0         0 $enc += $AAcntInClass/$estimatedFavg;
623             }
624              
625 26         419 return $enc;
626             }
627              
628             # estimate F average
629             sub _estimate_F
630             {
631 0     0     my ($knownF,$redundancy,$equalRatio) = @_;
632              
633 0 0         return 1 if($redundancy == 1);
634              
635 0 0         if($equalRatio) # get the mean (1/Fr-1)/(r-1)
636             {
637 0           my $ratioSum;
638 0           my $cnt = 0; # number of known Fs
639 0           while(my ($r, $F) = each %$knownF)
640             {
641 0 0         next if $r < 2; # excluding class of redundancy==1
642 0           $ratioSum += (1/$F-1)/($r-1);
643 0           $cnt++;
644             }
645              
646 0 0         if( $cnt > 0)
647             {
648 0           my $Fx = 1/($ratioSum/$cnt*($redundancy-1)+1);
649 0           return $Fx;
650             }else # no known F for any class with redundancy > 1
651             {
652 0           return undef;
653             }
654              
655             }else # otherwise use Wright's method
656             {
657 0 0         if($redundancy == 3)
658             {
659 0   0       my $F2 = $knownF->{2} || 1/2; # class 2
660 0   0       my $F4 = $knownF->{4} || 1/4; # class 4
661 0           return ($F2 + $F4)/2;
662             }else
663             {
664 0           return 1/$redundancy; # assuming no bias
665             }
666             }
667              
668             }
669              
670             # get the default base compostion of this object
671             sub base_composition
672             {
673 0     0 0   my $self = shift;
674              
675 0           return $self->{'_base_comp'};
676             }
677              
678             =head2 estimate_base_composition
679              
680             Title : estimate_base_composition
681             Usage : @baseComp = $self->estimate_base_composition($seq,[$pos])
682             Function: estimate base compositions in the sequence
683             Returns : an array of numbers in the order of A,T,C,G, or its
684             reference if in the scalar context
685             Args : a sequence string or a reference of hash containing codons
686             and their counts (eg., AGG => 30), and optionally an integer; the integer
687             specifies which codon position's nucleotide will be used instead of
688             all three codon positions.
689              
690             =cut
691              
692             sub estimate_base_composition
693             {
694 0     0 1   my ($self, $seq, $pos) = @_;
695              
696 0           my %bases;
697             # check if input is a codon list
698             my $codonList;
699 0 0         if(ref($seq) eq 'HASH') # a codon list
700             {
701 0           $codonList = $seq;
702             }else # a sequence string or object
703             {
704 0           $seq = $self->_get_seq_str($seq);
705             }
706              
707 0 0         if($pos)
708             {
709 0 0 0       $self->throw("Only 1, 2, or 3 are acceptable for pos,",
710             "'$pos' is not valid here") unless($pos > 0 and $pos < 4);
711 0 0         if($codonList) # input is a codon list
712             {
713 0           my $base;
714 0           while(my ($codon, $cnt) = each %$codonList)
715             {
716 0           $base = substr($codon, $pos-1,1);
717 0           $bases{$base} += $cnt;
718             }
719             }else # a sequence
720             {
721 0           my $seqLen = length($seq);
722 0           my $accuLen = $pos - 1;
723 0           my $period = 3; # a codon length
724 0           my $base;
725 0           while($accuLen < $seqLen)
726             {
727 0           $base = substr($seq,$accuLen,1);
728 0           $bases{$base}++;
729 0           $accuLen += $period;
730             }
731             }
732             }else # all nucleotides
733             {
734 0 0         if($codonList) # input is a codon list
735             {
736 0           while(my ($codon, $cnt) = each %$codonList)
737             {
738 0           map { $bases{$_} += $cnt } split('', $codon);
  0            
739             }
740             }else
741             {
742 0           map { $bases{$_}++ } split('',$seq);
  0            
743             }
744             }
745              
746 0           my $total = 0;
747 0           my @comp;
748 0           foreach ($self->bases) # only consider A,T,C,G
749             {
750 0   0       $total += $bases{$_} || 0;
751 0   0       push @comp, $bases{$_} || 0;
752             }
753 0           @comp = map { $_/$total } @comp;
  0            
754              
755 0 0         return wantarray? @comp : \@comp;
756             }
757              
758             =head2 gc_fraction
759              
760             Title : gc_fraction
761             Usage : $frac = $self->gc_fraction($seq,[$pos])
762             Function: get fraction of GC content in the sequence
763             Returns : a floating number between 0 and 1.
764             Args : a sequence string or a reference of hash containing codons
765             and their counts (eg., AGG => 30), and optionally an integer; the integer
766             specifies which codon position's nucleotide will be used for
767             calculation (i.e., 1, 2, or 3), instead of all three positions.
768              
769             =cut
770              
771             sub gc_frac
772             {
773 0     0 0   my ($self, @args) = @_;
774 0           $self->gc_fraction(@args);
775             }
776              
777             sub gc_fraction
778             {
779 0     0 1   my ($self, @args) = @_;
780              
781 0           my @composition = $self->estimate_base_composition(@args);
782 0           my @bases = $self->bases;
783 0           my @indice = grep { $bases[$_] =~ /[GC]/ } 0..$#bases;
  0            
784              
785 0           my $frac = 0;
786 0           foreach (@indice)
787             {
788 0           $frac += $composition[$_];
789             }
790              
791 0           return $frac;
792             }
793              
794             =head2 expect_codon_freq
795              
796             Title : expect_codon_freq
797             Usage : $codonFreq = $self->expect_codon_freq($base_composition)
798             Function: return the expected frequency of codons
799             Returns : reference to a hash in which codon is hash key, and
800             fraction is hash value
801             Args : reference to an array of base compositions in the order of
802             [A, T, C, G], represented as either counts or fractions
803              
804             =cut
805              
806             sub expect_codon_freq
807             {
808 0     0 1   my ($self, $baseComp) = @_;
809              
810 0 0         if(uc($baseComp) eq 'NA') # for cases the composition is
811             # unavailable
812             {
813 0           return undef;
814             }
815              
816 0 0 0       $self->throw("Input base composition for expect_codon_freq is not an array reference")
817             unless($baseComp and ref($baseComp) eq 'ARRAY');
818              
819 0           my @bases = $self->bases;
820 0           my $compSum = 0; # used to normalize in case they are not summed to 1
821 0           my $zeroCnt = 0; # count of zero values
822 0           foreach (0..3)
823             {
824 0 0         $zeroCnt++ if($baseComp->[$_] == 0);
825 0           $compSum += $baseComp->[$_];
826             }
827              
828             # set zero value a pseudo count, depending on the provided values
829             # are fractions or counts
830 0 0         my $pseudoCnt = $compSum > 2? 1 : 1/100;
831 0           $compSum += $pseudoCnt * $zeroCnt;
832 0   0       my %freq = map { $bases[$_] => ($baseComp->[$_] || $pseudoCnt)/$compSum } 0..3;
  0            
833 0           my %result;
834 0           foreach my $b1 (@bases)
835             {
836 0           foreach my $b2 (@bases)
837             {
838 0           foreach my $b3 (@bases)
839             {
840 0           my $codon = $b1.$b2.$b3;
841 0           $result{$codon} = $freq{$b1}*$freq{$b2}*$freq{$b3};
842             }
843             }
844             }
845              
846 0           return \%result;
847             }
848              
849             =head1 AUTHOR
850              
851             Zhenguo Zhang, C<< >>
852              
853             =head1 BUGS
854              
855             Please report any bugs or feature requests to C, or through
856             the web interface at L. I will be notified, and then you'll
857             automatically be notified of progress on your bug as I make changes.
858              
859              
860             =head1 SUPPORT
861              
862             You can find documentation for this module with the perldoc command.
863              
864             perldoc Bio::CUA::CUB::Calculator
865              
866              
867             You can also look for information at:
868              
869             =over 4
870              
871             =item * RT: CPAN's request tracker (report bugs here)
872              
873             L
874              
875             =item * AnnoCPAN: Annotated CPAN documentation
876              
877             L
878              
879             =item * CPAN Ratings
880              
881             L
882              
883             =item * Search CPAN
884              
885             L
886              
887             =back
888              
889              
890             =head1 ACKNOWLEDGEMENTS
891              
892              
893             =head1 LICENSE AND COPYRIGHT
894              
895             Copyright 2015 Zhenguo Zhang.
896              
897             This program is free software: you can redistribute it and/or modify
898             it under the terms of the GNU General Public License as published by
899             the Free Software Foundation, either version 3 of the License, or
900             (at your option) any later version.
901              
902             This program is distributed in the hope that it will be useful,
903             but WITHOUT ANY WARRANTY; without even the implied warranty of
904             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
905             GNU General Public License for more details.
906              
907             You should have received a copy of the GNU General Public License
908             along with this program. If not, see L.
909              
910              
911             =cut
912              
913             1; # End of Bio::CUA::CUB::Calculator