File Coverage

blib/lib/Bio/CUA/CUB/Calculator.pm
Criterion Covered Total %
statement 99 239 41.4
branch 37 114 32.4
condition 7 39 17.9
subroutine 12 22 54.5
pod 11 14 78.5
total 166 428 38.7


line stmt bran cond sub pod time code
1             package Bio::CUA::CUB::Calculator;
2              
3             =pod
4              
5             =head1 NAME
6              
7             Bio::CUA::CUB::Calculator -- A module to calculate codon usage bias
8             (CUB) indice for protein-coding sequences
9              
10             =head1 SYNOPSIS
11              
12             use Bio::CUA::CUB::Calculator;
13              
14             my $calc = Bio::CUA::CUB::Calculator->new(
15             -codon_table => 1,
16             -tAI_values => 'tai.out' # from Bio::CUA::CUB::Builder
17             );
18              
19             # calculate tAI for each sequence
20             my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa");
21             or
22             my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa", -format => 'fasta');
23              
24             while(my $seq = $io->next_seq)
25             {
26             my $tai = $calc->tai($seq);
27             printf("%10s: %.7f\n", $seq->id, $tai);
28             }
29              
30             =head1 DESCRIPTION
31              
32             Codon usage bias (CUB) can be represented at two levels, codon and
33             sequence. The latter is often computed as the geometric means of the
34             sequence's codons. This module caculates CUB metrics at sequence
35             level.
36              
37             Supported CUB metrics include CAI (codon adaptation index), tAI (tRNA
38             adaptation index), Fop (Frequency of optimal codons), ENC (Effective
39             Number of Codons) and their variants. See the methods below for
40             details.
41              
42             =cut
43              
44 2     2   5124 use 5.006;
  2         8  
  2         135  
45 2     2   16 use strict;
  2         4  
  2         88  
46 2     2   12 use warnings;
  2         3  
  2         88  
47 2     2   13 use parent qw/Bio::CUA::CUB/;
  2         4  
  2         18  
48 2     2   129 use Bio::CUA::CodonTable;
  2         3  
  2         6091  
49              
50             =head1 METHODS
51              
52             =head2 new
53              
54             Title : new
55             Usage : my $calc=Bio::CUA::CUB::Calculator->new(@args);
56             Function: initialize the calculator
57             Returns : an object of this class
58             Args : a hash with following acceptable keys:
59            
60             B:
61              
62             =over
63              
64             =item C<-codon_table>
65              
66             the genetic code table applied for following sequence analyses. It
67             can be specified by an integer (genetic code table id), an object of
68             L, or a map-file. See the method
69             L for details.
70              
71             =back
72              
73             B
74              
75             =over
76              
77             =item C<-optimal_codons>
78              
79             a file contains all the optimal codons, one codon per line. Or a
80             hashref with keys being the optimal codons
81              
82             =back
83              
84             B
85              
86             =over
87              
88             =item C<-CAI_values>
89              
90             a file containing CAI values for each codon, excluding 3
91             stop codons, so 61 lines with each line containing a codon and its
92             value separated by space or tab.
93             or
94             a hashref with each key being a codon and each value being CAI index
95             for the codon.
96              
97             =back
98              
99             B
100              
101             =over
102              
103             =item C<-tAI_values>
104              
105             similar to C<-CAI_values>, a file or a hash containing tAI value
106             for each codon.
107              
108             =back
109              
110             B
111              
112             =over
113              
114             =item C<-base_background>
115              
116             optional.
117             an arrayref containing base frequency of 4 bases (in the order
118             A,T,C, and G) derived from background data such as introns.
119             Or one of the following values: 'seq', 'seq3', which will lead to
120             estimating base frequencies from each analyzed sequence in whol or
121             its 3rd codon position, respectively.
122              
123             It can also be specified for each analyzed sequence with the methods
124             L and L
125              
126             =back
127              
128             =cut
129              
130             sub new
131             {
132 1     1 1 5312 my ($caller, @args) = @_;
133              
134             # option -codon_table is processed in this parent class
135 1         22 my $self = $caller->SUPER::new(@args);
136              
137             # process all the parameters
138 1         10 my $hashRef = $self->_array_to_hash(\@args);
139 1         9 while(my ($tag, $val) = each %$hashRef)
140             {
141             # tag 'codon_table' is now processed by parent package
142 3 50       45 if($tag =~ /^optimal/o) # optimal codons
    100          
    100          
    50          
143             {
144             # a hash using codons as keys
145 0         0 my $optimalCodons = ref($val) eq 'HASH'?
146 0 0       0 {map { $_ => 1 } keys(%$val)} : $self->_parse_file($val,1);
147 0         0 $self->{'_optimal_codons'} = $optimalCodons;
148             }elsif($tag =~ /^cai/o) # CAI values
149             {
150             # a hash like codon => CAI_value
151 1 50       30 my $caiValues = ref($val) eq 'HASH'?
152             $val : $self->_parse_file($val,2);
153 1         11 $self->{'_cai_values'} = $caiValues;
154             }elsif($tag =~ /^tai/o) # tAI values
155             {
156             # a hash like codon => tAI_value
157 1 50       11 my $taiValues = ref($val) eq 'HASH'?
158             $val : $self->_parse_file($val,2);
159 1         14 $self->{'_tai_values'} = $taiValues;
160             }elsif($tag =~ /^base/o) # background base composition
161             {
162 0 0 0     0 if(ref($val) eq 'ARRAY' or $val =~ /^seq/)
163             {
164 0         0 $self->{'_base_comp'} = $val;
165             }else
166             {
167 0         0 $self->throw("Unknown value '$val' for parameter",
168             "-base_background");
169             }
170             }else
171             {
172             # Unknown parameter '$tag', ignored
173             }
174             }
175              
176 1         25 $self->no_atg(1); # exclude ATG in tAI calculation
177             # check the input values
178             # 1. make sure all the sense codons have CAI or tAI values if
179             # provided
180              
181 1         12 return $self;
182             }
183              
184             =head1 sequence input
185              
186             all the following methods accept one of the following formats as
187             sequence input
188              
189             =over
190              
191             =item 1
192            
193             string of nucleotide sequence with length of 3N,
194              
195             =item 2
196              
197             sequence object which has a method I to get the sequence string,
198              
199             =item 3
200              
201             a sequence file in fasta format
202              
203             =item 4
204              
205             reference to a codon count hash, like
206             $codons = {
207             AGC => 50,
208             GTC => 124,
209             ... ...
210             }.
211              
212             =back
213              
214             =head2 cai
215              
216             Title : cai
217             Usage : $caiValue = $self->cai($seq);
218             Function: calculate the CAI value for the sequence
219             Returns : a number, or undef if failed
220             Args : see L
221             Note: codons without synonymous competitors are excluded in
222             calculation.
223              
224             =cut
225              
226             sub cai
227             {
228 13     13 1 104 my ($self, $seq) = @_;
229 13         75 $self->_xai($seq, 'CAI');
230             }
231              
232             # the real calculator of tAI or CAI as both have the same formula
233             sub _xai
234             {
235 26     26   63 my ($self, $seq, $type) = @_;
236              
237 26         108 my $name;
238             my $xaiHash;
239 26 100       251 if($type =~ /cai/i)
    50          
240             {
241 13         31 $name = 'CAI';
242 13         42 $xaiHash = $self->{"_cai_values"};
243             }elsif($type =~ /tai/i)
244             {
245 13         29 $name = 'tAI';
246 13         44 $xaiHash = $self->{"_tai_values"};
247             }else
248             {
249 0         0 $self->throw("Unknown adaptation index type '$type'");
250             }
251 26 50       95 unless($xaiHash)
252             {
253 0         0 $self->warn("$name values for codons were not provided for",
254             "this analyzer, so can not calculate $name for sequences");
255 0         0 return undef;
256             }
257              
258 26 50       163 my $codonList = $self->get_codon_list($seq) or return;
259              
260 26         84 my $xai = 0;
261 26         49 my $seqLen = 0; # this excludes some unsuitable codons
262             # get non-degenerative codons which are excluded in CAI
263 26         315 my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
  52         317  
264 26         140 my @senseCodons = $self->codon_table->all_sense_codons;
265 26         179 foreach my $codon (@senseCodons)
266             {
267 1586 100       4002 next unless($codonList->{$codon}); # no observation of this codon
268             # excluding non-degenerate codons for CAI calculation
269 1560 100 100     4702 next if($nonDegCodons{$codon} and $type =~ /cai/i);
270 1534 100       3308 unless(exists $xaiHash->{$codon})
271             {
272 13 0 33     258 $self->warn("Codon '$codon' is ignored")
    0          
273             if($self->debug and ($self->no_atg? ($codon ne 'ATG') : 1));
274 13         58 next;
275             }
276 1521         1840 my $cnt = $codonList->{$codon};
277             # to overcome real number overflow, use log
278 1521         5596 $xai += $cnt*log($xaiHash->{$codon});
279 1521         3017 $seqLen += $cnt;
280             }
281              
282 26 50       153 return undef unless($xai); # no codons with CAI/tAI
283              
284 26         163 $xai = exp($xai/$seqLen);
285 26         770 return $xai;
286             }
287              
288             =head2 fop
289              
290             Title : fop
291             Usage : $fopValue = $self->fop($seq[,$withNonDegenerate]);
292             Function: calculate the fraction of optimal codons in the sequence
293             Returns : a number, or undef if failed
294             Args : for sequence see L.
295             if optional argument '$withNonDegenerate' is true, then
296             non-degenerate codons (those do not have synonymous partners) are
297             included in calculation. Default is excluding these codons.
298              
299             =cut
300              
301             sub fop
302             {
303 0     0 1 0 my ($self, $seq, $withNonDeg) = @_;
304              
305 0 0       0 my $optimalCodons = $self->{'_optimal_codons'} or
306             $self->throw("No optimal codons associated with $self");
307              
308 0 0       0 my $codonList = $self->get_codon_list($seq) or return;
309             # get non-degenerate codons
310 0         0 my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
  0         0  
311              
312              
313 0         0 my $optCnt = 0; # optimal codons
314 0         0 my $total = 0;
315 0         0 while(my ($codon, $cnt) = each %$codonList)
316             {
317             # excluding non-degenerate codons if necessary
318 0 0 0     0 next if(!$withNonDeg and $nonDegCodons{$codon});
319 0 0       0 $optCnt += $cnt if($optimalCodons->{$codon});
320 0         0 $total += $cnt;
321             }
322              
323 0   0     0 return $optCnt/($total || 1);
324             }
325              
326             =head2 tai
327              
328             Title : tai
329             Usage : $taiValue = $self->tai($seq);
330             Function: calculate the tAI value for the sequence
331             Returns : a number, or undef if failed
332             Args : for sequence see L.
333              
334             Note: codons which do not have tAI values are ignored from input
335             sequence
336              
337             =cut
338              
339             sub tai
340             {
341 13     13 1 118 my ($self, $seq) = @_;
342 13         71 $self->_xai($seq, 'tAI');
343             }
344              
345             # an alias
346             sub tAI
347             {
348 0     0 0 0 my ($self, $seq) = @_;
349 0         0 $self->_xai($seq, 'tAI');
350             }
351              
352             =head2 enc
353              
354             Title : enc
355             Usage : $encValue = $self->enc($seq,[$minTotal]);
356             Function: calculate ENC for the sequence using the original method
357             I
358             Returns : a number, or undef if failed
359             Args : for sequence see L.
360             Optional argument I specifies minimal count
361             for an amino acid; if observed count is smaller than this count, this
362             amino acid's F will not be calculated but inferred. Deafult is 5.
363              
364             Note: when the F of a redundancy group is unavailable due to lack of
365             sufficient data, it will be estimated from other groups following Wright's
366             method, that is, F3=(F2+F4)/2, and for others, F=1/r where r is the
367             degeneracy degree of that group.
368              
369             =cut
370             sub enc
371             {
372 13     13 1 107 my ($self, $seq, $minTotal) = @_;
373 13         67 $self->_enc_factory($seq, $minTotal, 'mean');
374             }
375              
376             =head2 enc_r
377              
378             Title : enc_r
379             Usage : $encValue = $self->enc_r($seq,[$minTotal]);
380             Function: similar to the method L, except that missing F values
381             are estimated in a different way.
382             Returns : a number, or undef if failed
383             Args : for sequence see L.
384             Optional argument I specifies minimal count
385             for an amino acid; if observed count is smaller than this count, this
386             amino acid's F will not be calculated but inferred. Deafult is 5.
387              
388             Note: for missing Fx of degeneracy class 'x', we first estimated the
389             ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
390             classes with known F values. Then Fx is obtained by solving the simple
391             equation.
392              
393             =cut
394              
395             sub enc_r
396             {
397 13     13 1 169 my ($self, $seq, $minTotal) = @_;
398 13         52 $self->_enc_factory($seq, $minTotal, 'equal_ratio');
399             }
400              
401             =head2 encp
402              
403             Title : encp
404             Usage : $encpValue = $self->encp($seq,[$minTotal,[$A,$T,$C,$G]]);
405             Function: calculate ENC for the sequence using the updated method
406             by Novembre I<2002, MBE>, which corrects the background nucleotide
407             composition.
408             Returns : a number, or undef if failed
409             Args : for sequence see L.
410            
411             Optional argument I specifies minimal count
412             for an amino acid; if observed count is smaller than this count, this
413             amino acid's F will not be calculated but inferred. Deafult is 5.
414              
415             another optional argument gives the background nucleotide composition
416             in the order of A,T,C,G in an array, if not provided, it will use the
417             default one provided when calling the method L. If stil
418             unavailable, error occurs.
419              
420             =cut
421              
422             sub encp
423             {
424 0     0 1 0 my ($self, $seq, $minTotal, $baseComp) = @_;
425 0         0 $self->_enc_factory($seq, $minTotal, 'mean', 1, $baseComp);
426             }
427              
428             =head2 encp_r
429              
430             Title : encp_r
431             Usage : $encpValue =
432             $self->encp_r($seq,[$minTotal,[$A,$T,$C,$G]]);
433             Function: similar to the method L, except that missing F values
434             are estimated using a different way.
435             Returns : a number, or undef if failed
436             Args : for sequence see L.
437            
438             Optional argument I specifies minimal count
439             for an amino acid; if observed count is smaller than this count, this
440             amino acid's F will not be calculated but inferred. Deafult is 5.
441              
442             another optional argument gives the background nucleotide composition
443             in the order of A,T,C,G in an array, if not provided, it will use the
444             default one provided when calling the method L. If stil
445             unavailable, error occurs.
446              
447             Note: for missing Fx of degeneracy class 'x', we first estimated the
448             ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
449             classes with known F values. Then Fx is obtained by solving the simple
450             equation.
451              
452             =cut
453              
454             sub encp_r
455             {
456 0     0 1 0 my ($self, $seq, $minTotal, $baseComp) = @_;
457 0         0 $self->_enc_factory($seq, $minTotal, 'equal_ratio', 1, $baseComp);
458             }
459              
460             # real function calculate different versions of ENC
461             # parameters explanation
462             # seq: sequence string, sequence object, sequence file, or hash
463             # reference to codon list
464             # correctBaseComp: if true, correct background base composition using
465             # Novembre's method
466             # F_EstimateMethod: how to estimate average F for a certain redundancy
467             # class if that class does not have observed data so can't be
468             # calculated; 'mean' is for Wright's method, and 'equal_ratio' for
469             # Zhenguo's method. The latter assumes a similar (1/F[r])/r for each
470             # redundancy class with redundancy degree 'r'
471             # baseComposition: optional, a reference to an array containing
472             # background nucleotide composition. If provided, it overides the
473             # values set when method L was called.
474             sub _enc_factory
475             {
476 26     26   62 my ($self, $seq, $minTotal, $F_EstimateMethod, $correctBaseComp, $baseComposition) = @_;
477              
478 26 50       112 $minTotal = 5 unless(defined $minTotal); # the minumum count of residule for a given amino
479             # acid for it to be included in F calculation
480            
481             # a hash ref, codon => counts
482 26 50       142 my $codonList = $self->get_codon_list($seq) or return;
483 26 50 33     647 my $seqId = (ref($seq) and $seq->can('id'))? $seq->id : '';
484              
485             # determine expected codon frequency if necessary
486 26         77 my $expectedCodonFreq;
487             # determine base compositions now
488 26 50       94 if($correctBaseComp)
489             {
490 0 0       0 if(!defined($baseComposition)) # not provided for this sequence
491             {
492 0         0 my $defaultBaseComp = $self->base_composition;
493 0 0       0 unless($defaultBaseComp)
494             {
495 0         0 $self->warn("No default base composition for seq",
496             " '$seqId', so no GC-corrected ENC");
497 0         0 return undef;
498             }
499 0 0       0 if($defaultBaseComp eq 'seq')
    0          
500             {
501 0         0 $baseComposition =
502             $self->estimate_base_composition($codonList);
503             }elsif($defaultBaseComp eq 'seq3')
504             {
505 0         0 $baseComposition =
506             $self->estimate_base_composition($codonList,3);
507             }else
508             {
509 0         0 $baseComposition = $defaultBaseComp;
510             }
511             } # otherwise sequence-specific base-composition is provided
512             # here
513              
514             # codon frequency may not be estimated due to invalid
515             # compositions
516             $expectedCodonFreq =
517 0 0       0 $self->expect_codon_freq($baseComposition)
518             if($baseComposition);
519 0 0       0 return undef unless($expectedCodonFreq);
520             }
521              
522              
523             # now let us calculate F for each redundancy class
524             # determined by codon table, containing all amino acid classes
525 26         200 my $AARedundancyClasses = $self->aa_degeneracy_classes; #
526 26         52 my %FavgByClass; # record the average F from each class
527 26         247 while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
528             {
529             # number of observed AA types in this class
530 130         168 my $numAAInClass = 0; # number of amino acid species in this class
531 130         157 my $Fsum = 0;
532 130         421 while(my ($AA, $codonArray) = each %$AAHash)
533             {
534 494 100       1095 if($redundancy == 1) # this class has only one codon
535             {
536 26         3570 $numAAInClass = scalar(keys %$AAHash);
537 26         51 $Fsum = $numAAInClass; # each AA contribute 1
538 26         78 last;
539             }
540             # total count of observed residules for this AA
541 468         541 my $AAcnt = 0;
542 468         847 foreach (@$codonArray)
543             {
544             # check the codon exists in this seq
545 1534 100       3166 next unless(exists $codonList->{$_});
546 1508         2844 $AAcnt += $codonList->{$_};
547             }
548            
549             # skip if occurence of this amino acid is less than the
550             # minimal threshold
551 468 100 66     3486 next if($AAcnt < $minTotal or $AAcnt < 2);
552              
553             # now calculate F for this AA species
554 464 50       741 if($correctBaseComp) # correct base composition
555             {
556 0         0 my $chisq = 0;
557             # get the freq of codons of this amino acids
558 0         0 my $totalFreq = 0;
559 0         0 foreach (@$codonArray)
560             {
561 0         0 $totalFreq += $expectedCodonFreq->{$_};
562             }
563 0         0 foreach (@$codonArray)
564             {
565             # set unobserved codons to 0
566 0   0     0 my $codonCnt = $codonList->{$_} || 0;
567 0         0 my $expectedFreq =
568             $expectedCodonFreq->{$_}/$totalFreq;
569 0         0 $chisq += ($codonCnt/$AAcnt -
570             $expectedFreq)**2/$expectedFreq;
571             }
572 0         0 $chisq *= $AAcnt; # don't forget multiply this
573 0         0 $Fsum += ($chisq + $AAcnt -
574             $redundancy)/($redundancy*($AAcnt-1));
575             }else # no correction, use old Wright method
576             {
577 464         721 my $pSquareSum = 0;
578 464         951 foreach (@$codonArray)
579             {
580 1526         2238 my $codonCnt = $codonList->{$_};
581 1526 100       3230 next unless($codonCnt);
582 1504         3302 $pSquareSum += ($codonCnt/$AAcnt)**2;
583             }
584 464         1234 $Fsum += ($AAcnt*$pSquareSum -1)/($AAcnt-1);
585             }
586             # increase the number of AA species in this class
587 464         1958 $numAAInClass++;
588             }
589             # check whether all AA species are ignored or not observed
590 130 50       387 if($numAAInClass > 0)
591             {
592             # note, in some special cases, Fsum == 0 even though
593             # $numAAInClass >0, for example for a 6-fold amino acid,
594             # if each of its codon is observed only once, it would
595             # result in Faa = 0. so we need add restriction on this
596 130 50       877 $FavgByClass{$redundancy} = $Fsum/$numAAInClass if($Fsum >
597             0);
598             } # otherwise no data
599             }
600              
601             # estimate missing redundancy classes due to no observation of
602             # that class's AAs, and get the final Nc values
603 26         72 my $enc = 0;
604 26         141 while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
605             {
606             # the number of AA species in this class, determined by the
607             # codon table, not the input seq
608 130         169 my $AAcntInClass = scalar(keys %$AAHash);
609 130 50       278 if(exists $FavgByClass{$redundancy})
610             {
611 130 50       279 die "$redundancy, $AAcntInClass in seq '$seqId':$!"
612             unless($FavgByClass{$redundancy});
613 130         300 $enc += $AAcntInClass/$FavgByClass{$redundancy};
614 130         420 next;
615             }
616              
617             # otherwise this class was not observed
618 0 0       0 my $equalRatio = $F_EstimateMethod eq 'mean'? 0 : 1;
619 0         0 my $estimatedFavg = _estimate_F(\%FavgByClass, $redundancy,
620             $equalRatio);
621 0 0       0 unless($estimatedFavg)
622             {
623 0         0 $self->warn("Cannot estimate average F for class with",
624             "redundancy=$redundancy in sequence $seqId, ",
625             "probably no known F values for any class");
626 0         0 return undef;
627             }
628 0         0 $enc += $AAcntInClass/$estimatedFavg;
629             }
630              
631 26         680 return $enc;
632             }
633              
634             # estimate F average
635             sub _estimate_F
636             {
637 0     0     my ($knownF,$redundancy,$equalRatio) = @_;
638              
639 0 0         return 1 if($redundancy == 1);
640              
641 0 0         if($equalRatio) # get the mean (1/Fr-1)/(r-1)
642             {
643 0           my $ratioSum;
644 0           my $cnt = 0; # number of known Fs
645 0           while(my ($r, $F) = each %$knownF)
646             {
647 0 0         next if $r < 2; # excluding class of redundancy==1
648 0           $ratioSum += (1/$F-1)/($r-1);
649 0           $cnt++;
650             }
651              
652 0 0         if( $cnt > 0)
653             {
654 0           my $Fx = 1/($ratioSum/$cnt*($redundancy-1)+1);
655 0           return $Fx;
656             }else # no known F for any class with redundancy > 1
657             {
658 0           return undef;
659             }
660              
661             }else # otherwise use Wright's method
662             {
663 0 0         if($redundancy == 3)
664             {
665 0   0       my $F2 = $knownF->{2} || 1/2; # class 2
666 0   0       my $F4 = $knownF->{4} || 1/4; # class 4
667 0           return ($F2 + $F4)/2;
668             }else
669             {
670 0           return 1/$redundancy; # assuming no bias
671             }
672             }
673              
674             }
675              
676             # get the default base compostion of this object
677             sub base_composition
678             {
679 0     0 0   my $self = shift;
680              
681 0           return $self->{'_base_comp'};
682             }
683              
684             =head2 estimate_base_composition
685              
686             Title : estimate_base_composition
687             Usage : @baseComp = $self->estimate_base_composition($seq,[$pos])
688             Function: estimate base compositions in the sequence
689             Returns : an array of numbers in the order of A,T,C,G, or its
690             reference if in the scalar context
691             Args : a sequence string or a reference of hash containing codons
692             and their counts (eg., AGG => 30), and optionally an integer; the integer
693             specifies which codon position's nucleotide will be used instead of
694             all three codon positions.
695              
696             =cut
697              
698             sub estimate_base_composition
699             {
700 0     0 1   my ($self, $seq, $pos) = @_;
701              
702 0           my %bases;
703             # check if input is a codon list
704             my $codonList;
705 0 0         if(ref($seq) eq 'HASH') # a codon list
706             {
707 0           $codonList = $seq;
708             }else # a sequence string or object
709             {
710 0           $seq = $self->_get_seq_str($seq);
711             }
712              
713 0 0         if($pos)
714             {
715 0 0 0       $self->throw("Only 1, 2, or 3 are acceptable for pos,",
716             "'$pos' is not valid here") unless($pos > 0 and $pos < 4);
717 0 0         if($codonList) # input is a codon list
718             {
719 0           my $base;
720 0           while(my ($codon, $cnt) = each %$codonList)
721             {
722 0           $base = substr($codon, $pos-1,1);
723 0           $bases{$base} += $cnt;
724             }
725             }else # a sequence
726             {
727 0           my $seqLen = length($seq);
728 0           my $accuLen = $pos - 1;
729 0           my $period = 3; # a codon length
730 0           my $base;
731 0           while($accuLen < $seqLen)
732             {
733 0           $base = substr($seq,$accuLen,1);
734 0           $bases{$base}++;
735 0           $accuLen += $period;
736             }
737             }
738             }else # all nucleotides
739             {
740 0 0         if($codonList) # input is a codon list
741             {
742 0           while(my ($codon, $cnt) = each %$codonList)
743             {
744 0           map { $bases{$_} += $cnt } split('', $codon);
  0            
745             }
746             }else
747             {
748 0           map { $bases{$_}++ } split('',$seq);
  0            
749             }
750             }
751              
752 0           my $total = 0;
753 0           my @comp;
754 0           foreach ($self->bases) # only consider A,T,C,G
755             {
756 0   0       $total += $bases{$_} || 0;
757 0   0       push @comp, $bases{$_} || 0;
758             }
759 0           @comp = map { $_/$total } @comp;
  0            
760              
761 0 0         return wantarray? @comp : \@comp;
762             }
763              
764             =head2 gc_fraction
765              
766             Title : gc_fraction
767             Usage : $frac = $self->gc_fraction($seq,[$pos])
768             Function: get fraction of GC content in the sequence
769             Returns : a floating number between 0 and 1.
770             Args : a sequence string or a reference of hash containing codons
771             and their counts (eg., AGG => 30), and optionally an integer; the integer
772             specifies which codon position's nucleotide will be used for
773             calculation (i.e., 1, 2, or 3), instead of all three positions.
774              
775             =cut
776              
777             sub gc_frac
778             {
779 0     0 0   my ($self, @args) = @_;
780 0           $self->gc_fraction(@args);
781             }
782              
783             sub gc_fraction
784             {
785 0     0 1   my ($self, @args) = @_;
786              
787 0           my @composition = $self->estimate_base_composition(@args);
788 0           my @bases = $self->bases;
789 0           my @indice = grep { $bases[$_] =~ /[GC]/ } 0..$#bases;
  0            
790              
791 0           my $frac = 0;
792 0           foreach (@indice)
793             {
794 0           $frac += $composition[$_];
795             }
796              
797 0           return $frac;
798             }
799              
800             =head2 expect_codon_freq
801              
802             Title : expect_codon_freq
803             Usage : $codonFreq = $self->expect_codon_freq($base_composition)
804             Function: return the expected frequency of codons
805             Returns : reference to a hash in which codon is hash key, and
806             fraction is hash value
807             Args : reference to an array of base compositions in the order of
808             [A, T, C, G], represented as either counts or fractions
809              
810             =cut
811              
812             sub expect_codon_freq
813             {
814 0     0 1   my ($self, $baseComp) = @_;
815              
816 0 0 0       unless($baseComp and ref($baseComp) eq 'ARRAY')
817             {
818 0           $self->warn("Invalid base composition '$baseComp'",
819             " for expect_codon_freq, which should be an array reference");
820 0           return undef;
821             }
822              
823 0           my @bases = $self->bases;
824 0           my $compSum = 0; # used to normalize in case they are not summed to 1
825 0           my $zeroCnt = 0; # count of zero values
826 0           foreach (0..3)
827             {
828 0 0         $zeroCnt++ if($baseComp->[$_] == 0);
829 0           $compSum += $baseComp->[$_];
830             }
831              
832             # set zero value a pseudo count, depending on the provided values
833             # are fractions or counts
834 0 0         my $pseudoCnt = $compSum > 2? 1 : 1/100;
835 0           $compSum += $pseudoCnt * $zeroCnt;
836 0   0       my %freq = map { $bases[$_] => ($baseComp->[$_] || $pseudoCnt)/$compSum } 0..3;
  0            
837 0           my %result;
838 0           foreach my $b1 (@bases)
839             {
840 0           foreach my $b2 (@bases)
841             {
842 0           foreach my $b3 (@bases)
843             {
844 0           my $codon = $b1.$b2.$b3;
845 0           $result{$codon} = $freq{$b1}*$freq{$b2}*$freq{$b3};
846             }
847             }
848             }
849              
850 0           return \%result;
851             }
852              
853             =head1 AUTHOR
854              
855             Zhenguo Zhang, C<< >>
856              
857             =head1 BUGS
858              
859             Please report any bugs or feature requests to C, or through
860             the web interface at L. I will be notified, and then you'll
861             automatically be notified of progress on your bug as I make changes.
862              
863              
864             =head1 SUPPORT
865              
866             You can find documentation for this module with the perldoc command.
867              
868             perldoc Bio::CUA::CUB::Calculator
869              
870              
871             You can also look for information at:
872              
873             =over 4
874              
875             =item * RT: CPAN's request tracker (report bugs here)
876              
877             L
878              
879             =item * AnnoCPAN: Annotated CPAN documentation
880              
881             L
882              
883             =item * CPAN Ratings
884              
885             L
886              
887             =item * Search CPAN
888              
889             L
890              
891             =back
892              
893              
894             =head1 ACKNOWLEDGEMENTS
895              
896              
897             =head1 LICENSE AND COPYRIGHT
898              
899             Copyright 2015 Zhenguo Zhang.
900              
901             This program is free software: you can redistribute it and/or modify
902             it under the terms of the GNU General Public License as published by
903             the Free Software Foundation, either version 3 of the License, or
904             (at your option) any later version.
905              
906             This program is distributed in the hope that it will be useful,
907             but WITHOUT ANY WARRANTY; without even the implied warranty of
908             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
909             GNU General Public License for more details.
910              
911             You should have received a copy of the GNU General Public License
912             along with this program. If not, see L.
913              
914              
915             =cut
916              
917             1; # End of Bio::CUA::CUB::Calculator