File Coverage

blib/lib/Bio/CUA/CUB/Calculator.pm

Criterion	Covered	Total	%
statement	101	241	41.9
branch	37	114	32.4
condition	7	39	17.9
subroutine	13	23	56.5
pod	11	14	78.5
total	169	431	39.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::CUA::CUB::Calculator;
2
3							=pod
4
5							=head1 NAME
6
7							Bio::CUA::CUB::Calculator -- A module to calculate codon usage bias
8							(CUB) indice for protein-coding sequences
9
10							=head1 SYNOPSIS
11
12							use Bio::CUA::CUB::Calculator;
13
14							my $calc = Bio::CUA::CUB::Calculator->new(
15							-codon_table => 1,
16							-tAI_values => 'tai.out' # from Bio::CUA::CUB::Builder
17							);
18
19							# calculate tAI for each sequence
20							my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa");
21							or
22							my $io = Bio::CUA::SeqIO->new(-file => "seqs.fa", -format => 'fasta');
23
24							while(my $seq = $io->next_seq)
25							{
26							my $tai = $calc->tai($seq);
27							printf("%10s: %.7f\n", $seq->id, $tai);
28							}
29
30							=head1 DESCRIPTION
31
32							Codon usage bias (CUB) can be represented at two levels, codon and
33							sequence. The latter is often computed as the geometric means of the
34							sequence's codons. This module caculates CUB metrics at sequence
35							level.
36
37							Supported CUB metrics include CAI (codon adaptation index), tAI (tRNA
38							adaptation index), Fop (Frequency of optimal codons), ENC (Effective
39							Number of Codons) and their variants. See the methods below for
40							details.
41
42							=cut
43
44	2			2		3645	use 5.006;
	2					6
45	2			2		9	use strict;
	2					3
	2					47
46	2			2		7	use warnings;
	2					3
	2					58
47	2			2		9	use parent qw/Bio::CUA::CUB/;
	2					2
	2					10
48	2			2		87	use Bio::CUA::CodonTable;
	2					2
	2					39
49	2			2		6	use Scalar::Util qw/blessed/;
	2					2
	2					5274
50
51							=head1 METHODS
52
53							=head2 new
54
55							Title : new
56							Usage : my $calc=Bio::CUA::CUB::Calculator->new(@args);
57							Function: initialize the calculator
58							Returns : an object of this class
59							Args : a hash with following acceptable keys:
60
61							B:
62
63							=over
64
65							=item C<-codon_table>
66
67							the genetic code table applied for following sequence analyses. It
68							can be specified by an integer (genetic code table id), an object of
69							L, or a map-file. See the method
70							L for details.
71
72							=back
73
74							B
75
76							=over
77
78							=item C<-optimal_codons>
79
80							a file contains all the optimal codons, one codon per line. Or a
81							hashref with keys being the optimal codons
82
83							=back
84
85							B
86
87							=over
88
89							=item C<-CAI_values>
90
91							a file containing CAI values for each codon, excluding 3
92							stop codons, so 61 lines with each line containing a codon and its
93							value separated by space or tab.
94							or
95							a hashref with each key being a codon and each value being CAI index
96							for the codon.
97
98							=back
99
100							B
101
102							=over
103
104							=item C<-tAI_values>
105
106							similar to C<-CAI_values>, a file or a hash containing tAI value
107							for each codon.
108
109							=back
110
111							B
112
113							=over
114
115							=item C<-base_background>
116
117							optional.
118							an arrayref containing base frequency of 4 bases (in the order
119							A,T,C, and G) derived from background data such as introns.
120							Or one of the following values: 'seq', 'seq3', which will lead to
121							estimating base frequencies from each analyzed sequence in whol or
122							its 3rd codon position, respectively.
123
124							It can also be specified for each analyzed sequence with the methods
125							L and L
126
127							=back
128
129							=cut
130
131							sub new
132							{
133	1			1	1	2609	my ($caller, @args) = @_;
134
135							# option -codon_table is processed in this parent class
136	1					11	my $self = $caller->SUPER::new(@args);
137
138							# process all the parameters
139	1					4	my $hashRef = $self->_array_to_hash(\@args);
140	1					5	while(my ($tag, $val) = each %$hashRef)
141							{
142							# tag 'codon_table' is now processed by parent package
143	3	50				21	if($tag =~ /^optimal/o) # optimal codons
		100
		100
		50
144							{
145							# a hash using codons as keys
146							my $optimalCodons = ref($val) eq 'HASH'?
147	0	0				0	{map { $_ => 1 } keys(%$val)} : $self->_parse_file($val,1);
	0					0
148	0					0	$self->{'_optimal_codons'} = $optimalCodons;
149							}elsif($tag =~ /^cai/o) # CAI values
150							{
151							# a hash like codon => CAI_value
152	1	50				9	my $caiValues = ref($val) eq 'HASH'?
153							$val : $self->_parse_file($val,2);
154	1					6	$self->{'_cai_values'} = $caiValues;
155							}elsif($tag =~ /^tai/o) # tAI values
156							{
157							# a hash like codon => tAI_value
158	1	50				6	my $taiValues = ref($val) eq 'HASH'?
159							$val : $self->_parse_file($val,2);
160	1					5	$self->{'_tai_values'} = $taiValues;
161							}elsif($tag =~ /^base/o) # background base composition
162							{
163	0	0	0			0	if(ref($val) eq 'ARRAY' or $val =~ /^seq/)
164							{
165	0					0	$self->{'_base_comp'} = $val;
166							}else
167							{
168	0					0	$self->throw("Unknown value '$val' for parameter",
169							"-base_background");
170							}
171							}else
172							{
173							# Unknown parameter '$tag', ignored
174							}
175							}
176
177	1					10	$self->no_atg(1); # exclude ATG in tAI calculation
178							# check the input values
179							# 1. make sure all the sense codons have CAI or tAI values if
180							# provided
181
182	1					4	return $self;
183							}
184
185							=head1 sequence input
186
187							all the following methods accept one of the following formats as
188							sequence input
189
190							=over
191
192							=item 1
193
194							string of nucleotide sequence with length of 3N,
195
196							=item 2
197
198							sequence object which has a method I to get the sequence string,
199
200							=item 3
201
202							a sequence file in fasta format
203
204							=item 4
205
206							reference to a codon count hash, like
207							$codons = {
208							AGC => 50,
209							GTC => 124,
210							... ...
211							}.
212
213							=back
214
215							=head2 cai
216
217							Title : cai
218							Usage : $caiValue = $self->cai($seq);
219							Function: calculate the CAI value for the sequence
220							Returns : a number, or undef if failed
221							Args : see L
222							Note: codons without synonymous competitors are excluded in
223							calculation.
224
225							=cut
226
227							sub cai
228							{
229	13			13	1	57	my ($self, $seq) = @_;
230	13					34	$self->_xai($seq, 'CAI');
231							}
232
233							# the real calculator of tAI or CAI as both have the same formula
234							sub _xai
235							{
236	26			26		33	my ($self, $seq, $type) = @_;
237
238	26					23	my $name;
239							my $xaiHash;
240	26	100				105	if($type =~ /cai/i)
		50
241							{
242	13					21	$name = 'CAI';
243	13					23	$xaiHash = $self->{"_cai_values"};
244							}elsif($type =~ /tai/i)
245							{
246	13					18	$name = 'tAI';
247	13					23	$xaiHash = $self->{"_tai_values"};
248							}else
249							{
250	0					0	$self->throw("Unknown adaptation index type '$type'");
251							}
252	26	50				48	unless($xaiHash)
253							{
254	0					0	$self->warn("$name values for codons were not provided for",
255							"this analyzer, so can not calculate $name for sequences");
256	0					0	return undef;
257							}
258
259	26	50				68	my $codonList = $self->get_codon_list($seq) or return;
260
261	26					38	my $xai = 0;
262	26					27	my $seqLen = 0; # this excludes some unsuitable codons
263							# get non-degenerative codons which are excluded in CAI
264	26					100	my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
	52					121
265	26					61	my @senseCodons = $self->codon_table->all_sense_codons;
266	26					102	foreach my $codon (@senseCodons)
267							{
268	1586	100				2184	next unless($codonList->{$codon}); # no observation of this codon
269							# excluding non-degenerate codons for CAI calculation
270	1560	100	100			2698	next if($nonDegCodons{$codon} and $type =~ /cai/i);
271	1534	100				2105	unless(exists $xaiHash->{$codon})
272							{
273	13	0	33			53	$self->warn("Codon '$codon' is ignored")
		0
274							if($self->debug and ($self->no_atg? ($codon ne 'ATG') : 1));
275	13					25	next;
276							}
277	1521					1204	my $cnt = $codonList->{$codon};
278							# to overcome real number overflow, use log
279	1521					2801	$xai += $cnt*log($xaiHash->{$codon});
280	1521					1851	$seqLen += $cnt;
281							}
282
283	26	50				63	return undef unless($xai); # no codons with CAI/tAI
284
285	26					73	$xai = exp($xai/$seqLen);
286	26					307	return $xai;
287							}
288
289							=head2 fop
290
291							Title : fop
292							Usage : $fopValue = $self->fop($seq[,$withNonDegenerate]);
293							Function: calculate the fraction of optimal codons in the sequence
294							Returns : a number, or undef if failed
295							Args : for sequence see L.
296							if optional argument '$withNonDegenerate' is true, then
297							non-degenerate codons (those do not have synonymous partners) are
298							included in calculation. Default is excluding these codons.
299
300							=cut
301
302							sub fop
303							{
304	0			0	1	0	my ($self, $seq, $withNonDeg) = @_;
305
306	0	0				0	my $optimalCodons = $self->{'_optimal_codons'} or
307							$self->throw("No optimal codons associated with $self");
308
309	0	0				0	my $codonList = $self->get_codon_list($seq) or return;
310							# get non-degenerate codons
311	0					0	my %nonDegCodons = map { $_ => 1 } $self->codons_by_degeneracy(1);
	0					0
312
313
314	0					0	my $optCnt = 0; # optimal codons
315	0					0	my $total = 0;
316	0					0	while(my ($codon, $cnt) = each %$codonList)
317							{
318							# excluding non-degenerate codons if necessary
319	0	0	0			0	next if(!$withNonDeg and $nonDegCodons{$codon});
320	0	0				0	$optCnt += $cnt if($optimalCodons->{$codon});
321	0					0	$total += $cnt;
322							}
323
324	0		0			0	return $optCnt/($total \|\| 1);
325							}
326
327							=head2 tai
328
329							Title : tai
330							Usage : $taiValue = $self->tai($seq);
331							Function: calculate the tAI value for the sequence
332							Returns : a number, or undef if failed
333							Args : for sequence see L.
334
335							Note: codons which do not have tAI values are ignored from input
336							sequence
337
338							=cut
339
340							sub tai
341							{
342	13			13	1	56	my ($self, $seq) = @_;
343	13					26	$self->_xai($seq, 'tAI');
344							}
345
346							# an alias
347							sub tAI
348							{
349	0			0	0	0	my ($self, $seq) = @_;
350	0					0	$self->_xai($seq, 'tAI');
351							}
352
353							=head2 enc
354
355							Title : enc
356							Usage : $encValue = $self->enc($seq,[$minTotal]);
357							Function: calculate ENC for the sequence using the original method
358							I
359							Returns : a number, or undef if failed
360							Args : for sequence see L.
361							Optional argument I specifies minimal count
362							for an amino acid; if observed count is smaller than this count, this
363							amino acid's F will not be calculated but inferred. Deafult is 5.
364
365							Note: when the F of a redundancy group is unavailable due to lack of
366							sufficient data, it will be estimated from other groups following Wright's
367							method, that is, F3=(F2+F4)/2, and for others, F=1/r where r is the
368							degeneracy degree of that group.
369
370							=cut
371							sub enc
372							{
373	13			13	1	56	my ($self, $seq, $minTotal) = @_;
374	13					31	$self->_enc_factory($seq, $minTotal, 'mean');
375							}
376
377							=head2 enc_r
378
379							Title : enc_r
380							Usage : $encValue = $self->enc_r($seq,[$minTotal]);
381							Function: similar to the method L, except that missing F values
382							are estimated in a different way.
383							Returns : a number, or undef if failed
384							Args : for sequence see L.
385							Optional argument I specifies minimal count
386							for an amino acid; if observed count is smaller than this count, this
387							amino acid's F will not be calculated but inferred. Deafult is 5.
388
389							Note: for missing Fx of degeneracy class 'x', we first estimated the
390							ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
391							classes with known F values. Then Fx is obtained by solving the simple
392							equation.
393
394							=cut
395
396							sub enc_r
397							{
398	13			13	1	53	my ($self, $seq, $minTotal) = @_;
399	13					37	$self->_enc_factory($seq, $minTotal, 'equal_ratio');
400							}
401
402							=head2 encp
403
404							Title : encp
405							Usage : $encpValue = $self->encp($seq,[$minTotal,[$A,$T,$C,$G]]);
406							Function: calculate ENC for the sequence using the updated method
407							by Novembre I<2002, MBE>, which corrects the background nucleotide
408							composition.
409							Returns : a number, or undef if failed
410							Args : for sequence see L.
411
412							Optional argument I specifies minimal count
413							for an amino acid; if observed count is smaller than this count, this
414							amino acid's F will not be calculated but inferred. Deafult is 5.
415
416							another optional argument gives the background nucleotide composition
417							in the order of A,T,C,G in an array, if not provided, it will use the
418							default one provided when calling the method L. If stil
419							unavailable, error occurs.
420
421							=cut
422
423							sub encp
424							{
425	0			0	1	0	my ($self, $seq, $minTotal, $baseComp) = @_;
426	0					0	$self->_enc_factory($seq, $minTotal, 'mean', 1, $baseComp);
427							}
428
429							=head2 encp_r
430
431							Title : encp_r
432							Usage : $encpValue =
433							$self->encp_r($seq,[$minTotal,[$A,$T,$C,$G]]);
434							Function: similar to the method L, except that missing F values
435							are estimated using a different way.
436							Returns : a number, or undef if failed
437							Args : for sequence see L.
438
439							Optional argument I specifies minimal count
440							for an amino acid; if observed count is smaller than this count, this
441							amino acid's F will not be calculated but inferred. Deafult is 5.
442
443							another optional argument gives the background nucleotide composition
444							in the order of A,T,C,G in an array, if not provided, it will use the
445							default one provided when calling the method L. If stil
446							unavailable, error occurs.
447
448							Note: for missing Fx of degeneracy class 'x', we first estimated the
449							ratio (1/Fx-1)/(x-1) by averaging the ratios of other degeneracy
450							classes with known F values. Then Fx is obtained by solving the simple
451							equation.
452
453							=cut
454
455							sub encp_r
456							{
457	0			0	1	0	my ($self, $seq, $minTotal, $baseComp) = @_;
458	0					0	$self->_enc_factory($seq, $minTotal, 'equal_ratio', 1, $baseComp);
459							}
460
461							# real function calculate different versions of ENC
462							# parameters explanation
463							# seq: sequence string, sequence object, sequence file, or hash
464							# reference to codon list
465							# correctBaseComp: if true, correct background base composition using
466							# Novembre's method
467							# F_EstimateMethod: how to estimate average F for a certain redundancy
468							# class if that class does not have observed data so can't be
469							# calculated; 'mean' is for Wright's method, and 'equal_ratio' for
470							# Zhenguo's method. The latter assumes a similar (1/F[r])/r for each
471							# redundancy class with redundancy degree 'r'
472							# baseComposition: optional, a reference to an array containing
473							# background nucleotide composition. If provided, it overides the
474							# values set when method L was called.
475							sub _enc_factory
476							{
477	26			26		31	my ($self, $seq, $minTotal, $F_EstimateMethod, $correctBaseComp, $baseComposition) = @_;
478
479	26	50				58	$minTotal = 5 unless(defined $minTotal); # the minumum count of residule for a given amino
480							# acid for it to be included in F calculation
481
482							# a hash ref, codon => counts
483	26	50				57	my $codonList = $self->get_codon_list($seq) or return;
484	26	50	33			270	my $seqId = (blessed($seq) and $seq->can('id'))? $seq->id : '';
485
486							# determine expected codon frequency if necessary
487	26					39	my $expectedCodonFreq;
488							# determine base compositions now
489	26	50				46	if($correctBaseComp)
490							{
491	0	0				0	if(!defined($baseComposition)) # not provided for this sequence
492							{
493	0					0	my $defaultBaseComp = $self->base_composition;
494	0	0				0	unless($defaultBaseComp)
495							{
496	0					0	$self->warn("No default base composition for seq",
497							" '$seqId', so no GC-corrected ENC");
498	0					0	return undef;
499							}
500	0	0				0	if($defaultBaseComp eq 'seq')
		0
501							{
502	0					0	$baseComposition =
503							$self->estimate_base_composition($codonList);
504							}elsif($defaultBaseComp eq 'seq3')
505							{
506	0					0	$baseComposition =
507							$self->estimate_base_composition($codonList,3);
508							}else
509							{
510	0					0	$baseComposition = $defaultBaseComp;
511							}
512							} # otherwise sequence-specific base-composition is provided
513							# here
514
515							# codon frequency may not be estimated due to invalid
516							# compositions
517							$expectedCodonFreq =
518	0	0				0	$self->expect_codon_freq($baseComposition)
519							if($baseComposition);
520	0	0				0	return undef unless($expectedCodonFreq);
521							}
522
523
524							# now let us calculate F for each redundancy class
525							# determined by codon table, containing all amino acid classes
526	26					68	my $AARedundancyClasses = $self->aa_degeneracy_classes; #
527	26					25	my %FavgByClass; # record the average F from each class
528	26					84	while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
529							{
530							# number of observed AA types in this class
531	130					98	my $numAAInClass = 0; # number of amino acid species in this class
532	130					95	my $Fsum = 0;
533	130					261	while(my ($AA, $codonArray) = each %$AAHash)
534							{
535	494	100				700	if($redundancy == 1) # this class has only one codon
536							{
537	26					29	$numAAInClass = scalar(keys %$AAHash);
538	26					26	$Fsum = $numAAInClass; # each AA contribute 1
539	26					35	last;
540							}
541							# total count of observed residules for this AA
542	468					353	my $AAcnt = 0;
543	468					526	foreach (@$codonArray)
544							{
545							# check the codon exists in this seq
546	1534	100				2128	next unless(exists $codonList->{$_});
547	1508					1528	$AAcnt += $codonList->{$_};
548							}
549
550							# skip if occurence of this amino acid is less than the
551							# minimal threshold
552	468	100	66			1396	next if($AAcnt < $minTotal or $AAcnt < 2);
553
554							# now calculate F for this AA species
555	464	50				514	if($correctBaseComp) # correct base composition
556							{
557	0					0	my $chisq = 0;
558							# get the freq of codons of this amino acids
559	0					0	my $totalFreq = 0;
560	0					0	foreach (@$codonArray)
561							{
562	0					0	$totalFreq += $expectedCodonFreq->{$_};
563							}
564	0					0	foreach (@$codonArray)
565							{
566							# set unobserved codons to 0
567	0		0			0	my $codonCnt = $codonList->{$_} \|\| 0;
568							my $expectedFreq =
569	0					0	$expectedCodonFreq->{$_}/$totalFreq;
570	0					0	$chisq += ($codonCnt/$AAcnt -
571							$expectedFreq)**2/$expectedFreq;
572							}
573	0					0	$chisq *= $AAcnt; # don't forget multiply this
574	0					0	$Fsum += ($chisq + $AAcnt -
575							$redundancy)/($redundancy*($AAcnt-1));
576							}else # no correction, use old Wright method
577							{
578	464					323	my $pSquareSum = 0;
579	464					483	foreach (@$codonArray)
580							{
581	1526					1124	my $codonCnt = $codonList->{$_};
582	1526	100				1939	next unless($codonCnt);
583	1504					1894	$pSquareSum += ($codonCnt/$AAcnt)**2;
584							}
585	464					546	$Fsum += ($AAcnt*$pSquareSum -1)/($AAcnt-1);
586							}
587							# increase the number of AA species in this class
588	464					1064	$numAAInClass++;
589							}
590							# check whether all AA species are ignored or not observed
591	130	50				197	if($numAAInClass > 0)
592							{
593							# note, in some special cases, Fsum == 0 even though
594							# $numAAInClass >0, for example for a 6-fold amino acid,
595							# if each of its codon is observed only once, it would
596							# result in Faa = 0. so we need add restriction on this
597	130	50				481	$FavgByClass{$redundancy} = $Fsum/$numAAInClass if($Fsum >
598							0);
599							} # otherwise no data
600							}
601
602							# estimate missing redundancy classes due to no observation of
603							# that class's AAs, and get the final Nc values
604	26					25	my $enc = 0;
605	26					61	while(my ($redundancy, $AAHash) = each %$AARedundancyClasses)
606							{
607							# the number of AA species in this class, determined by the
608							# codon table, not the input seq
609	130					109	my $AAcntInClass = scalar(keys %$AAHash);
610	130	50				224	if(exists $FavgByClass{$redundancy})
611							{
612							die "$redundancy, $AAcntInClass in seq '$seqId':$!"
613	130	50				177	unless($FavgByClass{$redundancy});
614	130					143	$enc += $AAcntInClass/$FavgByClass{$redundancy};
615	130					271	next;
616							}
617
618							# otherwise this class was not observed
619	0	0				0	my $equalRatio = $F_EstimateMethod eq 'mean'? 0 : 1;
620	0					0	my $estimatedFavg = _estimate_F(\%FavgByClass, $redundancy,
621							$equalRatio);
622	0	0				0	unless($estimatedFavg)
623							{
624	0					0	$self->warn("Cannot estimate average F for class with",
625							"redundancy=$redundancy in sequence $seqId, ",
626							"probably no known F values for any class");
627	0					0	return undef;
628							}
629	0					0	$enc += $AAcntInClass/$estimatedFavg;
630							}
631
632	26					236	return $enc;
633							}
634
635							# estimate F average
636							sub _estimate_F
637							{
638	0			0			my ($knownF,$redundancy,$equalRatio) = @_;
639
640	0	0					return 1 if($redundancy == 1);
641
642	0	0					if($equalRatio) # get the mean (1/Fr-1)/(r-1)
643							{
644	0						my $ratioSum;
645	0						my $cnt = 0; # number of known Fs
646	0						while(my ($r, $F) = each %$knownF)
647							{
648	0	0					next if $r < 2; # excluding class of redundancy==1
649	0						$ratioSum += (1/$F-1)/($r-1);
650	0						$cnt++;
651							}
652
653	0	0					if( $cnt > 0)
654							{
655	0						my $Fx = 1/($ratioSum/$cnt*($redundancy-1)+1);
656	0						return $Fx;
657							}else # no known F for any class with redundancy > 1
658							{
659	0						return undef;
660							}
661
662							}else # otherwise use Wright's method
663							{
664	0	0					if($redundancy == 3)
665							{
666	0		0				my $F2 = $knownF->{2} \|\| 1/2; # class 2
667	0		0				my $F4 = $knownF->{4} \|\| 1/4; # class 4
668	0						return ($F2 + $F4)/2;
669							}else
670							{
671	0						return 1/$redundancy; # assuming no bias
672							}
673							}
674
675							}
676
677							# get the default base compostion of this object
678							sub base_composition
679							{
680	0			0	0		my $self = shift;
681
682	0						return $self->{'_base_comp'};
683							}
684
685							=head2 estimate_base_composition
686
687							Title : estimate_base_composition
688							Usage : @baseComp = $self->estimate_base_composition($seq,[$pos])
689							Function: estimate base compositions in the sequence
690							Returns : an array of numbers in the order of A,T,C,G, or its
691							reference if in the scalar context
692							Args : a sequence string or a reference of hash containing codons
693							and their counts (eg., AGG => 30), and optionally an integer; the integer
694							specifies which codon position's nucleotide will be used instead of
695							all three codon positions.
696
697							=cut
698
699							sub estimate_base_composition
700							{
701	0			0	1		my ($self, $seq, $pos) = @_;
702
703	0						my %bases;
704							# check if input is a codon list
705							my $codonList;
706	0	0					if(ref($seq) eq 'HASH') # a codon list
707							{
708	0						$codonList = $seq;
709							}else # a sequence string or object
710							{
711	0						$seq = $self->_get_seq_str($seq);
712							}
713
714	0	0					if($pos)
715							{
716	0	0	0				$self->throw("Only 1, 2, or 3 are acceptable for pos,",
717							"'$pos' is not valid here") unless($pos > 0 and $pos < 4);
718	0	0					if($codonList) # input is a codon list
719							{
720	0						my $base;
721	0						while(my ($codon, $cnt) = each %$codonList)
722							{
723	0						$base = substr($codon, $pos-1,1);
724	0						$bases{$base} += $cnt;
725							}
726							}else # a sequence
727							{
728	0						my $seqLen = length($seq);
729	0						my $accuLen = $pos - 1;
730	0						my $period = 3; # a codon length
731	0						my $base;
732	0						while($accuLen < $seqLen)
733							{
734	0						$base = substr($seq,$accuLen,1);
735	0						$bases{$base}++;
736	0						$accuLen += $period;
737							}
738							}
739							}else # all nucleotides
740							{
741	0	0					if($codonList) # input is a codon list
742							{
743	0						while(my ($codon, $cnt) = each %$codonList)
744							{
745	0						map { $bases{$_} += $cnt } split('', $codon);
	0
746							}
747							}else
748							{
749	0						map { $bases{$_}++ } split('',$seq);
	0
750							}
751							}
752
753	0						my $total = 0;
754	0						my @comp;
755	0						foreach ($self->bases) # only consider A,T,C,G
756							{
757	0		0				$total += $bases{$_} \|\| 0;
758	0		0				push @comp, $bases{$_} \|\| 0;
759							}
760	0						@comp = map { $_/$total } @comp;
	0
761
762	0	0					return wantarray? @comp : \@comp;
763							}
764
765							=head2 gc_fraction
766
767							Title : gc_fraction
768							Usage : $frac = $self->gc_fraction($seq,[$pos])
769							Function: get fraction of GC content in the sequence
770							Returns : a floating number between 0 and 1.
771							Args : a sequence string or a reference of hash containing codons
772							and their counts (eg., AGG => 30), and optionally an integer; the integer
773							specifies which codon position's nucleotide will be used for
774							calculation (i.e., 1, 2, or 3), instead of all three positions.
775
776							=cut
777
778							sub gc_frac
779							{
780	0			0	0		my ($self, @args) = @_;
781	0						$self->gc_fraction(@args);
782							}
783
784							sub gc_fraction
785							{
786	0			0	1		my ($self, @args) = @_;
787
788	0						my @composition = $self->estimate_base_composition(@args);
789	0						my @bases = $self->bases;
790	0						my @indice = grep { $bases[$_] =~ /[GC]/ } 0..$#bases;
	0
791
792	0						my $frac = 0;
793	0						foreach (@indice)
794							{
795	0						$frac += $composition[$_];
796							}
797
798	0						return $frac;
799							}
800
801							=head2 expect_codon_freq
802
803							Title : expect_codon_freq
804							Usage : $codonFreq = $self->expect_codon_freq($base_composition)
805							Function: return the expected frequency of codons
806							Returns : reference to a hash in which codon is hash key, and
807							fraction is hash value
808							Args : reference to an array of base compositions in the order of
809							[A, T, C, G], represented as either counts or fractions
810
811							=cut
812
813							sub expect_codon_freq
814							{
815	0			0	1		my ($self, $baseComp) = @_;
816
817	0	0	0				unless($baseComp and ref($baseComp) eq 'ARRAY')
818							{
819	0						$self->warn("Invalid base composition '$baseComp'",
820							" for expect_codon_freq, which should be an array reference");
821	0						return undef;
822							}
823
824	0						my @bases = $self->bases;
825	0						my $compSum = 0; # used to normalize in case they are not summed to 1
826	0						my $zeroCnt = 0; # count of zero values
827	0						foreach (0..3)
828							{
829	0	0					$zeroCnt++ if($baseComp->[$_] == 0);
830	0						$compSum += $baseComp->[$_];
831							}
832
833							# set zero value a pseudo count, depending on the provided values
834							# are fractions or counts
835	0	0					my $pseudoCnt = $compSum > 2? 1 : 1/100;
836	0						$compSum += $pseudoCnt * $zeroCnt;
837	0		0				my %freq = map { $bases[$_] => ($baseComp->[$_] \|\| $pseudoCnt)/$compSum } 0..3;
	0
838	0						my %result;
839	0						foreach my $b1 (@bases)
840							{
841	0						foreach my $b2 (@bases)
842							{
843	0						foreach my $b3 (@bases)
844							{
845	0						my $codon = $b1.$b2.$b3;
846	0						$result{$codon} = $freq{$b1}$freq{$b2}$freq{$b3};
847							}
848							}
849							}
850
851	0						return \%result;
852							}
853
854							=head1 AUTHOR
855
856							Zhenguo Zhang, C<< >>
857
858							=head1 BUGS
859
860							Please report any bugs or feature requests to C, or through
861							the web interface at L. I will be notified, and then you'll
862							automatically be notified of progress on your bug as I make changes.
863
864
865							=head1 SUPPORT
866
867							You can find documentation for this module with the perldoc command.
868
869							perldoc Bio::CUA::CUB::Calculator
870
871
872							You can also look for information at:
873
874							=over 4
875
876							=item * RT: CPAN's request tracker (report bugs here)
877
878							L
879
880							=item * AnnoCPAN: Annotated CPAN documentation
881
882							L
883
884							=item * CPAN Ratings
885
886							L
887
888							=item * Search CPAN
889
890							L
891
892							=back
893
894
895							=head1 ACKNOWLEDGEMENTS
896
897
898							=head1 LICENSE AND COPYRIGHT
899
900							Copyright 2015 Zhenguo Zhang.
901
902							This program is free software: you can redistribute it and/or modify
903							it under the terms of the GNU General Public License as published by
904							the Free Software Foundation, either version 3 of the License, or
905							(at your option) any later version.
906
907							This program is distributed in the hope that it will be useful,
908							but WITHOUT ANY WARRANTY; without even the implied warranty of
909							MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
910							GNU General Public License for more details.
911
912							You should have received a copy of the GNU General Public License
913							along with this program. If not, see L.
914
915
916							=cut
917
918							1; # End of Bio::CUA::CUB::Calculator