File Coverage

Bio/Align/DNAStatistics.pm
Criterion Covered Total %
statement 515 635 81.1
branch 94 152 61.8
condition 25 52 48.0
subroutine 39 45 86.6
pod 20 30 66.6
total 693 914 75.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Align::DNAStatistics
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
17              
18             =head1 SYNOPSIS
19              
20             use Bio::AlignIO;
21             use Bio::Align::DNAStatistics;
22              
23             my $stats = Bio::Align::DNAStatistics->new();
24             my $alignin = Bio::AlignIO->new(-format => 'emboss',
25             -file => 't/data/insulin.water');
26             my $aln = $alignin->next_aln;
27             my $jcmatrix = $stats->distance(-align => $aln,
28             -method => 'Jukes-Cantor');
29              
30             print $jcmatrix->print_matrix;
31             ## and for measurements of synonymous /nonsynonymous substitutions ##
32              
33             my $in = Bio::AlignIO->new(-format => 'fasta',
34             -file => 't/data/nei_gojobori_test.aln');
35             my $alnobj = $in->next_aln;
36             my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
37             my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
38             print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
39             for (sort keys %{$results->[0]} ){
40             next if /Seq/;
41             printf("%-9s %.4f \n",$_ , $results->[0]{$_});
42             }
43              
44             my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
45             for my $an (@$results2){
46             print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
47             for (sort keys %$an ){
48             next if /Seq/;
49             printf("%-9s %.4f \n",$_ , $an->{$_});
50             }
51             print "\n\n";
52             }
53              
54             my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
55             for (sort keys %$result3 ){
56             next if /Seq/;
57             printf("%-9s %.4f \n",$_ , $result3->{$_});
58             }
59              
60             =head1 DESCRIPTION
61              
62             This object contains routines for calculating various statistics and
63             distances for DNA alignments. The routines are not well tested and do
64             contain errors at this point. Work is underway to correct them, but
65             do not expect this code to give you the right answer currently! Use
66             dnadist/distmat in the PHYLIP or EMBOSS packages to calculate the
67             distances.
68              
69              
70             Several different distance method calculations are supported. Listed
71             in brackets are the pattern which will match
72              
73             =over 3
74              
75             =item *
76              
77             JukesCantor [jc|jukes|jukescantor|jukes-cantor]
78              
79             =item *
80              
81             Uncorrected [jcuncor|uncorrected]
82              
83             =item *
84              
85             F81 [f81|felsenstein81]
86              
87             =item *
88              
89             Kimura [k2|k2p|k80|kimura]
90              
91             =item *
92              
93             Tamura [t92|tamura|tamura92]
94              
95             =item *
96              
97             F84 [f84|felsenstein84]
98              
99             =item *
100              
101             TajimaNei [tajimanei|tajima\-nei]
102              
103             =item *
104              
105             JinNei [jinnei|jin\-nei] (not implemented)
106              
107             =back
108              
109             There are also three methods to calculate the ratio of synonymous to
110             non-synonymous mutations. All are implementations of the Nei-Gojobori
111             evolutionary pathway method and use the Jukes-Cantor method of
112             nucleotide substitution. This method works well so long as the
113             nucleotide frequencies are roughly equal and there is no significant
114             transition/transversion bias. In order to use these methods there are
115             several pre-requisites for the alignment.
116              
117             =over 3
118              
119             =item 1
120              
121             DNA alignment must be based on protein alignment. Use the subroutine
122             aa_to_dna_aln in Bio::Align::Utilities to achieve this.
123              
124             =item 2
125              
126             Therefore alignment gaps must be in multiples of 3 (representing an aa
127             deletion/insertion) and at present must be indicated by a '-' symbol.
128              
129             =item 3
130              
131             Alignment must be solely of coding region and be in reading frame 0 to
132             achieve meaningful results
133              
134             =item 4
135              
136             Alignment must therefore be a multiple of 3 nucleotides long.
137              
138             =item 5
139              
140             All sequences must be the same length (including gaps). This should be
141             the case anyway if the sequences have been automatically aligned using
142             a program like Clustal.
143              
144             =item 6
145              
146             Only the standard codon alphabet is supported at present.
147              
148             =back
149              
150             calc_KaKs_pair() calculates a number of statistics for a named pair of
151             sequences in the alignment.
152              
153             calc_all_KaKs_pairs() calculates these statistics for all pairwise
154             comparisons in an MSA. The statistics returned are:
155              
156             =over 3
157              
158             =item *
159              
160             S_d - Number of synonymous mutations between the 2 sequences.
161              
162             =item *
163              
164             N_d - Number of non-synonymous mutations between the 2 sequences.
165              
166             =item *
167              
168             S - Mean number of synonymous sites in both sequences.
169              
170             =item *
171              
172             N - Mean number of non-synonymous sites in both sequences.
173              
174             =item *
175              
176             P_s - proportion of synonymous differences in both sequences given by
177             P_s = S_d/S.
178              
179             =item *
180              
181             P_n - proportion of non-synonymous differences in both sequences given
182             by P_n = N_d/N.
183              
184             =item *
185              
186             D_s - estimation of synonymous mutations per synonymous site (by
187             Jukes-Cantor).
188              
189             =item *
190              
191             D_n - estimation of non-synonymous mutations per non-synonymous site (by
192             Jukes-Cantor).
193              
194             =item *
195              
196             D_n_var - estimation of variance of D_n .
197              
198             =item *
199              
200             D_s_var - estimation of variance of D_s.
201              
202             =item *
203              
204             z_value - calculation of z value. Positive value indicates D_n > D_s,
205             negative value indicates D_s > D_n.
206              
207             =back
208              
209             The statistics returned by calc_average_KaKs are:
210              
211             =over 3
212              
213             =item *
214              
215             D_s - Average number of synonymous mutations/synonymous site.
216              
217             =item *
218              
219             D_n - Average number of non-synonymous mutations/non-synonymous site.
220              
221             =item *
222              
223             D_s_var - Estimated variance of Ds from bootstrapped alignments.
224              
225             =item *
226              
227             D_n_var - Estimated variance of Dn from bootstrapped alignments.
228              
229             =item *
230              
231             z_score - calculation of z value. Positive value indicates D_n > D_s,
232             negative values vice versa.
233              
234             =back
235              
236             The design of the code is based around the explanation of the
237             Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
238             Phylogenetics" by Nei and Kumar, published by Oxford University
239             Press. The methods have been tested using the worked example 4.1 in
240             the book, and reproduce those results. If people like having this sort
241             of analysis in BioPerl other methods for estimating Ds and Dn can be
242             provided later.
243              
244             Much of the DNA distance code is based on implementations in EMBOSS
245             (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
246             al) [dnadist.c]. Insight also gained from Eddy, Durbin, Krogh, &
247             Mitchison.
248              
249             =head1 REFERENCES
250              
251             =over 3
252              
253             =item *
254              
255             D_JukesCantor
256              
257             "Phylogenetic Inference", Swofford, Olsen, Waddell and Hillis, in
258             Mol. Systematics, 2nd ed, 1996, Ch 11. Derived from "Evolution of
259             Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
260             1969, pp. 21-132.
261              
262             =item *
263              
264             D_Tamura
265              
266             K Tamura, Mol. Biol. Evol. 1992, 9, 678.
267              
268             =item *
269              
270             D_Kimura
271              
272             M Kimura, J. Mol. Evol., 1980, 16, 111.
273              
274             =item *
275              
276             JinNei
277              
278             Jin and Nei, Mol. Biol. Evol. 1990, 7, 82.
279              
280             =item *
281              
282             D_TajimaNei
283              
284             Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
285              
286             =back
287              
288             =head1 FEEDBACK
289              
290             =head2 Mailing Lists
291              
292             User feedback is an integral part of the evolution of this and other
293             Bioperl modules. Send your comments and suggestions preferably to
294             the Bioperl mailing list. Your participation is much appreciated.
295              
296             bioperl-l@bioperl.org - General discussion
297             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
298              
299             =head2 Support
300              
301             Please direct usage questions or support issues to the mailing list:
302              
303             bioperl-l@bioperl.org
304              
305             rather than to the module maintainer directly. Many experienced and
306             responsive experts will be able to look at the problem and quickly
307             address it. Please include a thorough description of the problem
308             with code and data examples if at all possible.
309              
310             =head2 Reporting Bugs
311              
312             Report bugs to the Bioperl bug tracking system to help us keep track
313             of the bugs and their resolution. Bug reports can be submitted via the
314             web:
315              
316             https://github.com/bioperl/bioperl-live/issues
317              
318             =head1 AUTHOR - Jason Stajich
319              
320             Email jason-AT-bioperl.org
321              
322             =head1 CONTRIBUTORS
323              
324             Richard Adams, richard.adams@ed.ac.uk
325              
326             =head1 APPENDIX
327              
328             The rest of the documentation details each of the object methods.
329             Internal methods are usually preceded with a _
330              
331             =cut
332              
333              
334             # Let the code begin...
335              
336              
337             package Bio::Align::DNAStatistics;
338 4         361 use vars qw(%DNAChanges @Nucleotides %NucleotideIndexes
339             $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
340 4     4   1166 $CODONS %synchanges $synsites $Precision $GCChhars);
  4         7  
341 4     4   22 use strict;
  4         4  
  4         74  
342 4     4   907 use Bio::Align::PairwiseStatistics;
  4         7  
  4         98  
343 4     4   1013 use Bio::Matrix::PhylipDist;
  4         9  
  4         107  
344 4     4   580 use Bio::Tools::IUPAC;
  4         6  
  4         453  
345              
346             BEGIN {
347 4     4   15 $GapChars = '[\.\-]';
348 4         4 $GCChhars = '[GCS]';
349 4         18 @Nucleotides = qw(A G T C);
350 4         6 $SeqCount = 2;
351 4         5 $Precision = 5;
352            
353             # these values come from EMBOSS distmat implementation
354 4         19 %NucleotideIndexes = ( 'A' => 0,
355             'T' => 1,
356             'C' => 2,
357             'G' => 3,
358              
359             'AT' => 0,
360             'AC' => 1,
361             'AG' => 2,
362             'CT' => 3,
363             'GT' => 4,
364             'CG' => 5,
365              
366             # these are wrong now
367             # 'S' => [ 1, 3],
368             # 'W' => [ 0, 4],
369             # 'Y' => [ 2, 3],
370             # 'R' => [ 0, 1],
371             # 'M' => [ 0, 3],
372             # 'K' => [ 1, 2],
373             # 'B' => [ 1, 2, 3],
374             # 'H' => [ 0, 2, 3],
375             # 'V' => [ 0, 1, 3],
376             # 'D' => [ 0, 1, 2],
377             );
378              
379 4         6 $DefaultGapPenalty = 0;
380             # could put ambiguities here?
381 4         31 %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
382             'T' => [ 'A', 'G'],
383             'C' => [ 'A', 'G'],
384             'G' => [ 'C', 'T'],
385             },
386             'Transitions' => { 'A' => [ 'G' ],
387             'G' => [ 'A' ],
388             'C' => [ 'T' ],
389             'T' => [ 'C' ],
390             },
391             );
392 4         91 %DistanceMethods = ( 'jc|jukes|jukescantor|jukes\-cantor' => 'JukesCantor',
393             'jcuncor|uncorrected' => 'Uncorrected',
394             'f81|felsenstein81' => 'F81',
395             'k2|k2p|k80|kimura' => 'Kimura',
396             't92|tamura|tamura92' => 'Tamura',
397             'f84|felsenstein84' => 'F84',
398             'tajimanei|tajima\-nei' => 'TajimaNei',
399             'jinnei|jin\-nei' => 'JinNei');
400              
401             }
402 4     4   22 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
  4         11  
  4         21812  
403              
404             ## generate look up hashes for Nei_Gojobori methods##
405             $CODONS = get_codons();
406             my @t = split '', "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
407             #create look up hash of number of possible synonymous mutations per codon
408             $synsites = get_syn_sites();
409             #create reference look up hash of single basechanges in codons
410             %synchanges = get_syn_changes();
411              
412              
413              
414             =head2 new
415              
416             Title : new
417             Usage : my $obj = Bio::Align::DNAStatistics->new();
418             Function: Builds a new Bio::Align::DNAStatistics object
419             Returns : Bio::Align::DNAStatistics
420             Args : none
421              
422              
423             =cut
424              
425             sub new {
426 2     2 1 617 my ($class,@args) = @_;
427 2         11 my $self = $class->SUPER::new(@args);
428            
429 2         11 $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
430              
431 2         8 return $self;
432             }
433              
434              
435             =head2 distance
436              
437             Title : distance
438             Usage : my $distance_mat = $stats->distance(-align => $aln,
439             -method => $method);
440             Function: Calculates a distance matrix for all pairwise distances of
441             sequences in an alignment.
442             Returns : Bio::Matrix::PhylipDist object
443             Args : -align => Bio::Align::AlignI object
444             -method => String specifying specific distance method
445             (implementing class may assume a default)
446             See also: Bio::Matrix::PhylipDist
447              
448             =cut
449              
450             sub distance{
451 13     13 1 100 my ($self,@args) = @_;
452 13         61 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
453 13 50 33     112 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
      33        
454 0         0 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
455             }
456 13   50     33 $method ||= 'JukesCantor';
457 13         54 foreach my $m ( keys %DistanceMethods ) {
458 58 100 66     1455 if(defined $m && $method =~ /$m/i ) {
459 13         43 my $mtd = "D_$DistanceMethods{$m}";
460 13         71 return $self->$mtd($aln);
461             }
462             }
463 0         0 $self->warn("Unrecognized distance method $method must be one of [".
464             join(',',$self->available_distance_methods())."]");
465 0         0 return;
466             }
467              
468             =head2 available_distance_methods
469              
470             Title : available_distance_methods
471             Usage : my @methods = $stats->available_distance_methods();
472             Function: Enumerates the possible distance methods
473             Returns : Array of strings
474             Args : none
475              
476              
477             =cut
478              
479             sub available_distance_methods{
480 0     0 1 0 my ($self,@args) = @_;
481 0         0 return values %DistanceMethods;
482             }
483              
484             =head2 D - distance methods
485              
486              
487             =cut
488              
489              
490             =head2 D_JukesCantor
491              
492             Title : D_JukesCantor
493             Usage : my $d = $stat->D_JukesCantor($aln)
494             Function: Calculates D (pairwise distance) between 2 sequences in an
495             alignment using the Jukes-Cantor 1 parameter model.
496             Returns : Bio::Matrix::PhylipDist
497             Args : Bio::Align::AlignI of DNA sequences
498             double - gap penalty
499              
500              
501             =cut
502              
503             sub D_JukesCantor{
504 2     2 1 7 my ($self,$aln,$gappenalty) = @_;
505 2 50       8 return 0 unless $self->_check_arg($aln);
506 2 50       9 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
507             # ambiguities ignored at this point
508 2         4 my (@seqs,@names,@values,%dist);
509 2         4 my $seqct = 0;
510 2         18 foreach my $seq ( $aln->each_seq) {
511 4         10 push @names, $seq->display_id;
512 4         11 push @seqs, uc $seq->seq();
513 4         21 $seqct++;
514             }
515 2         8 my $precisionstr = "%.$Precision"."f";
516 2         9 for(my $i = 0; $i < $seqct-1; $i++ ) {
517             # (diagonals) distance is 0 for same sequence
518 2         11 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
519 2         25 $values[$i][$i] = sprintf($precisionstr,0);
520              
521 2         11 for( my $j = $i+1; $j < $seqct; $j++ ) {
522 2         7 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
523             $seqs[$j]);
524             # just want diagonals
525 2         12 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
526             $matrix->[2]->[2] + $matrix->[3]->[3] );
527 2         10 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
528 2         11 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
529             # fwd and rev lookup
530 2         9 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
531 2         9 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
532 2         20 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
533             # (diagonals) distance is 0 for same sequence
534 2         7 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
535 2         18 $values[$j][$j] = sprintf($precisionstr,0);
536             }
537             }
538 2         20 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
539             -matrix => \%dist,
540             -names => \@names,
541             -values => \@values);
542             }
543              
544             =head2 D_F81
545              
546             Title : D_F81
547             Usage : my $d = $stat->D_F81($aln)
548             Function: Calculates D (pairwise distance) between 2 sequences in an
549             alignment using the Felsenstein 1981 distance model.
550             Relaxes the assumption of equal base frequencies that is
551             in JC.
552             Returns : Bio::Matrix::PhylipDist
553             Args : Bio::Align::AlignI of DNA sequences
554              
555              
556             =cut
557              
558             sub D_F81{
559 2     2 1 7 my ($self,$aln,$gappenalty) = @_;
560 2 50       8 return 0 unless $self->_check_arg($aln);
561 2 50       8 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
562             # ambiguities ignored at this point
563 2         6 my (@seqs,@names,@values,%dist);
564 2         5 my $seqct = 0;
565 2         6 foreach my $seq ( $aln->each_seq) {
566 4         10 push @names, $seq->display_id;;
567 4         13 push @seqs, uc $seq->seq();
568 4         7 $seqct++;
569             }
570 2         7 my $precisionstr = "%.$Precision"."f";
571 2         8 for(my $i = 0; $i < $seqct-1; $i++ ) {
572             # (diagonals) distance is 0 for same sequence
573 2         10 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
574 2         23 $values[$i][$i] = sprintf($precisionstr,0);
575              
576 2         9 for( my $j = $i+1; $j < $seqct; $j++ ) {
577            
578 2         18 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
579             $seqs[$j]);
580             # just want diagonals
581 2         11 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
582             $matrix->[2]->[2] + $matrix->[3]->[3] );
583 2         10 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
584 2         12 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
585             # fwd and rev lookup
586 2         9 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
587 2         7 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
588 2         18 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
589             # (diagonals) distance is 0 for same sequence
590 2         8 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
591 2         17 $values[$j][$j] = sprintf($precisionstr,0);
592             }
593             }
594 2         19 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
595             -matrix => \%dist,
596             -names => \@names,
597             -values => \@values);
598             }
599              
600             =head2 D_Uncorrected
601              
602             Title : D_Uncorrected
603             Usage : my $d = $stats->D_Uncorrected($aln)
604             Function: Calculate a distance D, no correction for multiple substitutions
605             is used. In rare cases where sequences may not overlap, -1 is
606             substituted for the distance.
607             Returns : Bio::Matrix::PhylipDist
608             Args : Bio::Align::AlignI (DNA Alignment)
609             [optional] gap penalty
610              
611             =cut
612              
613             sub D_Uncorrected {
614 3     3 1 10 my ($self,$aln,$gappenalty) = @_;
615 3 50       8 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
616 3 50       8 return 0 unless $self->_check_arg($aln);
617             # ambiguities ignored at this point
618 3         5 my (@seqs,@names,@values,%dist);
619 3         6 my $seqct = 0;
620 3         9 foreach my $seq ( $aln->each_seq) {
621 10         18 push @names, $seq->display_id;
622 10         16 push @seqs, uc $seq->seq();
623 10         14 $seqct++;
624             }
625              
626 3         8 my $precisionstr = "%.$Precision"."f";
627 3         8 my $len = $aln->length;
628 3         13 for( my $i = 0; $i < $seqct-1; $i++ ) {
629             # (diagonals) distance is 0 for same sequence
630 7         18 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
631 7         34 $values[$i][$i] = sprintf($precisionstr,0);
632            
633 7         19 for( my $j = $i+1; $j < $seqct; $j++ ) {
634 13         30 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
635             $seqs[$j]);
636 13         34 my $m = ( $matrix->[0]->[0] +
637             $matrix->[1]->[1] +
638             $matrix->[2]->[2] +
639             $matrix->[3]->[3] );
640 13         22 my $denom = ( $len - $gaps + ( $gaps * $gappenalty));
641            
642 13 100       36 $self->warn("No distance calculated between $names[$i] and $names[$j], inserting -1")
643             unless $denom;
644            
645 12 100       27 my $D = $denom ? 1 - ( $m / $denom) : -1;
646             # fwd and rev lookup
647 12         28 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
648 12         21 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
649 12 100       64 $values[$j][$i] = $values[$i][$j] = $denom ? sprintf($precisionstr,$D)
650             : sprintf("%-*s", $Precision + 2, $D);
651             # (diagonals) distance is 0 for same sequence
652 12         26 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
653 12         62 $values[$j][$j] = sprintf($precisionstr,0);
654             }
655             }
656 2         17 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
657             -matrix => \%dist,
658             -names => \@names,
659             -values => \@values);
660             }
661              
662              
663             # M Kimura, J. Mol. Evol., 1980, 16, 111.
664              
665             =head2 D_Kimura
666              
667             Title : D_Kimura
668             Usage : my $d = $stat->D_Kimura($aln)
669             Function: Calculates D (pairwise distance) between all pairs of sequences
670             in an alignment using the Kimura 2 parameter model.
671             Returns : Bio::Matrix::PhylipDist
672             Args : Bio::Align::AlignI of DNA sequences
673              
674              
675             =cut
676              
677             sub D_Kimura {
678 2     2 1 6 my ($self,$aln) = @_;
679 2 50       5 return 0 unless $self->_check_arg($aln);
680             # ambiguities ignored at this point
681 2         5 my (@names,@values,%dist);
682 2         4 my $seqct = 0;
683 2         7 foreach my $seq ( $aln->each_seq) {
684 4         9 push @names, $seq->display_id;
685 4         6 $seqct++;
686             }
687              
688 2         7 my $precisionstr = "%.$Precision"."f";
689              
690 2         9 for( my $i = 0; $i < $seqct-1; $i++ ) {
691             # (diagonals) distance is 0 for same sequence
692 2         8 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
693 2         15 $values[$i][$i] = sprintf($precisionstr,0);
694              
695 2         11 for( my $j = $i+1; $j < $seqct; $j++ ) {
696 2         11 my $pairwise = $aln->select_noncont($i+1,$j+1);
697 2         10 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
698 2 50       8 unless( $L ) {
699 0         0 $L = 1;
700             }
701 2         14 my $P = $self->transitions($pairwise) / $L;
702 2         10 my $Q = $self->transversions($pairwise) / $L;
703 2         7 my $K = 0;
704 2         8 my $denom = ( 1 - (2 * $P) - $Q);
705 2 50       8 if( $denom == 0 ) {
706 0         0 $self->throw("cannot find distance for ",$i+1,
707             ",",$j+1," $P, $Q\n");
708             }
709 2         7 my $a = 1 / ( 1 - (2 * $P) - $Q);
710 2         6 my $b = 1 / ( 1 - 2 * $Q );
711 2 50 33     10 if( $a < 0 || $b < 0 ) {
712 0         0 $K = -1;
713             } else{
714 2         11 $K = (1/2) * log ( $a ) + (1/4) * log($b);
715             }
716             # fwd and rev lookup
717 2         11 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
718 2         8 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
719 2         27 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
720             # (diagonals) distance is 0 for same sequence
721 2         7 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
722 2         23 $values[$j][$j] = sprintf($precisionstr,0);
723             }
724             }
725 2         17 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
726             -matrix => \%dist,
727             -names => \@names,
728             -values => \@values);
729             }
730              
731              
732             =head2 D_Kimura_variance
733              
734             Title : D_Kimura_variance
735             Usage : my $d = $stat->D_Kimura_variance($aln)
736             Function: Calculates D (pairwise distance) between all pairs of sequences
737             in an alignment using the Kimura 2 parameter model.
738             Returns : array of 2 Bio::Matrix::PhylipDist objects,
739             the first is the Kimura distance and the second is
740             a matrix of variance V(K)
741             Args : Bio::Align::AlignI of DNA sequences
742              
743              
744             =cut
745              
746             sub D_Kimura_variance {
747 0     0 1 0 my ($self,$aln) = @_;
748 0 0       0 return 0 unless $self->_check_arg($aln);
749             # ambiguities ignored at this point
750 0         0 my (@names,@values,%dist,@var);
751 0         0 my $seqct = 0;
752 0         0 foreach my $seq ( $aln->each_seq) {
753 0         0 push @names, $seq->display_id;
754 0         0 $seqct++;
755             }
756              
757 0         0 my $precisionstr = "%.$Precision"."f";
758              
759 0         0 for( my $i = 0; $i < $seqct-1; $i++ ) {
760             # (diagonals) distance is 0 for same sequence
761 0         0 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
762 0         0 $values[$i][$i] = sprintf($precisionstr,0);
763              
764 0         0 for( my $j = $i+1; $j < $seqct; $j++ ) {
765 0         0 my $pairwise = $aln->select_noncont($i+1,$j+1);
766 0         0 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
767 0 0       0 unless( $L ) {
768 0         0 $L = 1;
769             }
770 0         0 my $P = $self->transitions($pairwise) / $L;
771 0         0 my $Q = $self->transversions($pairwise) / $L;
772 0         0 my ($a,$b,$K,$var_k);
773 0         0 my $a_denom = ( 1 - (2 * $P) - $Q);
774 0         0 my $b_denom = 1 - 2 * $Q;
775 0 0 0     0 unless( $a_denom > 0 && $b_denom > 0 ) {
776 0         0 $a = 1;
777 0         0 $b = 1;
778 0         0 $K = -1;
779 0         0 $var_k = -1;
780             } else {
781 0         0 $a = 1 / $a_denom;
782 0         0 $b = 1 / $b_denom;
783 0         0 $K = (1/2) * log ( $a ) + (1/4) * log($b);
784             # from Wu and Li 1985 which in turn is from Kimura 1980
785 0         0 my $c = ( $a - $b ) / 2;
786 0         0 my $d = ( $a + $b ) / 2;
787 0         0 $var_k = ( $a**2 * $P + $d**2 * $Q - ( $a * $P + $d * $Q)**2 ) / $L;
788             }
789              
790             # fwd and rev lookup
791 0         0 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
792 0         0 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
793 0         0 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
794             # (diagonals) distance is 0 for same sequence
795 0         0 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
796 0         0 $values[$j]->[$j] = sprintf($precisionstr,0);
797            
798 0         0 $var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
799 0         0 $var[$j]->[$j] = $values[$j]->[$j];
800             }
801             }
802 0         0 return ( Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
803             -matrix => \%dist,
804             -names => \@names,
805             -values => \@values),
806             Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
807             -matrix => \%dist,
808             -names => \@names,
809             -values => \@var)
810             );
811             }
812              
813              
814             # K Tamura, Mol. Biol. Evol. 1992, 9, 678.
815              
816             =head2 D_Tamura
817              
818             Title : D_Tamura
819             Usage : my $d = $stat->D_Tamura($aln)
820             Function: Calculates D (pairwise distance) between 2 sequences in an alignment using Tamura 1992 distance model.
821             Returns : L<Bio::Matrix::PhylipDist>
822             Args : L<Bio::Align::AlignI> of DNA sequences
823              
824              
825             =cut
826              
827             sub D_Tamura {
828 2     2 1 8 my ($self,$aln) = @_;
829 2 50       6 return 0 unless $self->_check_arg($aln);
830             # ambiguities ignored at this point
831 2         6 my (@seqs,@names,@values,%dist,$i,$j);
832 2         3 my $seqct = 0;
833 2         6 my $length = $aln->length;
834 2         7 foreach my $seq ( $aln->each_seq) {
835 4         10 push @names, $seq->display_id;;
836 4         10 push @seqs, uc $seq->seq();
837 4         7 $seqct++;
838             }
839              
840 2         6 my $precisionstr = "%.$Precision"."f";
841 2         4 my (@gap,@gc,@trans,@tranv,@score);
842 2         3 $i = 0;
843 2         4 for my $t1 ( @seqs ) {
844 4         7 $j = 0;
845 4         6 for my $t2 ( @seqs ) {
846 8         12 $gap[$i][$j] = 0;
847 8         17 for( my $k = 0; $k < $length; $k++ ) {
848 1532         2373 my ($c1,$c2) = ( substr($seqs[$i],$k,1),
849             substr($seqs[$j],$k,1) );
850 1532 100 100     6414 if( $c1 =~ /^$GapChars$/ ||
    100          
851             $c2 =~ /^$GapChars$/ ) {
852 120         231 $gap[$i][$j]++;
853             } elsif( $c2 =~ /^$GCChhars$/i ) {
854 960         1728 $gc[$i][$j]++;
855             }
856             }
857 8         17 $gc[$i][$j] = ( $gc[$i][$j] /
858             ($length - $gap[$i][$j]) );
859 8         14 $j++;
860             }
861 4         14 $i++;
862             }
863            
864 2         14 for( $i = 0; $i < $seqct-1; $i++ ) {
865             # (diagonals) distance is 0 for same sequence
866 2         11 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
867 2         27 $values[$i][$i] = sprintf($precisionstr,0);
868            
869 2         8 for( $j = $i+1; $j < $seqct; $j++ ) {
870            
871 2         18 my $pairwise = $aln->select_noncont($i+1,$j+1);
872 2         8 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
873 2         11 my $P = $self->transitions($pairwise) / $L;
874 2         12 my $Q = $self->transversions($pairwise) / $L;
875 2         13 my $C = $gc[$i][$j] + $gc[$j][$i]-
876             ( 2 * $gc[$i][$j] * $gc[$j][$i] );
877 2 50       7 if( $P ) {
878 2         7 $P = $P / $C;
879             }
880 2         18 my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
881             # fwd and rev lookup
882 2         12 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
883 2         11 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
884 2         21 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
885             # (diagonals) distance is 0 for same sequence
886 2         7 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
887 2         19 $values[$j][$j] = sprintf($precisionstr,0);
888             }
889             }
890 2         17 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
891             -matrix => \%dist,
892             -names => \@names,
893             -values => \@values);
894              
895             }
896              
897             =head2 D_F84
898              
899             Title : D_F84
900             Usage : my $d = $stat->D_F84($aln)
901             Function: Calculates D (pairwise distance) between 2 sequences in an
902             alignment using the Felsenstein 1984 distance model.
903             Returns : L<Bio::Matrix::PhylipDist>
904             Args : L<Bio::Align::AlignI> of DNA sequences
905             [optional] double - gap penalty
906              
907             =cut
908              
909             sub D_F84 {
910 0     0 1 0 my ($self,$aln,$gappenalty) = @_;
911 0 0       0 return 0 unless $self->_check_arg($aln);
912 0         0 $self->throw_not_implemented();
913             # ambiguities ignored at this point
914 0         0 my (@seqs,@names,@values,%dist);
915 0         0 my $seqct = 0;
916 0         0 foreach my $seq ( $aln->each_seq) {
917             # if there is no name,
918 0         0 my $id = $seq->display_id;
919 0 0 0     0 if( ! length($id) || # deal with empty names
920             $id =~ /^\s+$/ ) {
921 0         0 $id = $seqct+1;
922             }
923 0         0 push @names, $id;
924 0         0 push @seqs, uc $seq->seq();
925 0         0 $seqct++;
926             }
927              
928 0         0 my $precisionstr = "%.$Precision"."f";
929              
930 0         0 for( my $i = 0; $i < $seqct-1; $i++ ) {
931             # (diagonals) distance is 0 for same sequence
932 0         0 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
933 0         0 $values[$i][$i] = sprintf($precisionstr,0);
934              
935 0         0 for( my $j = $i+1; $j < $seqct; $j++ ) {
936             }
937             }
938             }
939              
940             # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
941             # Tajima-Nei correction used for multiple substitutions in the calc
942             # of the distance matrix. Nucleic acids only.
943             #
944             # D = p-distance = 1 - (matches/(posns_scored + gaps))
945             #
946             # distance = -b * ln(1-D/b)
947             #
948              
949             =head2 D_TajimaNei
950              
951             Title : D_TajimaNei
952             Usage : my $d = $stat->D_TajimaNei($aln)
953             Function: Calculates D (pairwise distance) between 2 sequences in an
954             alignment using the TajimaNei 1984 distance model.
955             Returns : L<Bio::Matrix::PhylipDist>
956             Args : Bio::Align::AlignI of DNA sequences
957              
958              
959             =cut
960              
961             sub D_TajimaNei{
962 2     2 1 8 my ($self,$aln) = @_;
963 2 50       8 return 0 unless $self->_check_arg($aln);
964             # ambiguities ignored at this point
965 2         5 my (@seqs,@names,@values,%dist);
966 2         5 my $seqct = 0;
967 2         6 foreach my $seq ( $aln->each_seq) {
968             # if there is no name,
969 4         9 push @names, $seq->display_id;
970 4         10 push @seqs, uc $seq->seq();
971 4         8 $seqct++;
972             }
973 2         6 my $precisionstr = "%.$Precision"."f";
974 2         3 my ($i,$j,$bs);
975             # pairwise
976 2         7 for( $i =0; $i < $seqct -1; $i++ ) {
977 2         11 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
978 2         21 $values[$i][$i] = sprintf($precisionstr,0);
979              
980 2         9 for ( $j = $i+1; $j <$seqct;$j++ ) {
981 2         9 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
982             $seqs[$j]);
983 2         14 my $pairwise = $aln->select_noncont($i+1,$j+1);
984 2         7 my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
985 2         6 my $fij2 = 0;
986 2         7 for( $bs = 0; $bs < 4; $bs++ ) {
987 8         9 my $fi = 0;
988 8         11 map {$fi += $matrix->[$bs]->[$_] } 0..3;
  32         52  
989 8         9 my $fj = 0;
990             # summation
991 8         11 map { $fj += $matrix->[$_]->[$bs] } 0..3;
  32         36  
992 8 50 33     27 my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
993 8         17 $fij2 += $fij**2;
994             }
995            
996 2         5 my ($pair,$h) = (0,0);
997 2         8 for( $bs = 0; $bs < 3; $bs++ ) {
998 6         14 for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
999 12         18 my $fij = $pfreq->[$pair++] / $slen;
1000 12 100       19 if( $fij ) {
1001            
1002 10         16 my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
1003              
1004 10         16 map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
  40         53  
1005 10         15 map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
  40         48  
1006 10         15 map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
  40         48  
1007 10         10 map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
  40         46  
1008            
1009 10 50       14 if( $fij ) {
1010 10         24 $h += ( ($fij**2) / 2 ) /
1011             ( ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
1012             ( ( $ci2 + $cj2 ) / (2 * $slen) )
1013             );
1014             }
1015 10         72 $self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
1016             }
1017             }
1018             }
1019             # just want diagonals which are matches (A matched A, C -> C)
1020              
1021 2         8 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
1022             $matrix->[2]->[2] + $matrix->[3]->[3] );
1023 2         5 my $D = 1 - ( $m / $slen);
1024 2         7 my $d;
1025 2 50       9 if( $h == 0 ) {
1026 0         0 $d = -1;
1027             } else {
1028 2         7 my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
1029 2         6 my $c = 1- $D/ $b;
1030              
1031 2 50       5 if( $c < 0 ) {
1032 0         0 $d = -1;
1033             } else {
1034 2         7 $d = (-1 * $b) * log ( $c);
1035             }
1036             }
1037             # fwd and rev lookup
1038 2         9 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
1039 2         8 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
1040 2         17 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
1041              
1042             # (diagonals) distance is 0 for same sequence
1043 2         8 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
1044 2         19 $values[$j][$j] = sprintf($precisionstr,0);
1045             }
1046             }
1047 2         16 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
1048             -matrix => \%dist,
1049             -names => \@names,
1050             -values => \@values);
1051              
1052             }
1053              
1054             # Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1055              
1056             =head2 D_JinNei
1057              
1058             Title : D_JinNei
1059             Usage : my $d = $stat->D_JinNei($aln)
1060             Function: Calculates D (pairwise distance) between 2 sequences in an
1061             alignment using the Jin-Nei 1990 distance model.
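1062             Returns : L<Bio::Matrix::PhylipDist> (not yet implemented; currently warns and returns nothing)
1063             Args : L<Bio::Align::AlignI> of DNA sequences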
1064              
1065              
1066             =cut
1067              
1068             sub D_JinNei{
1069 0     0 1 0 my ($self,@args) = @_;
1070 0         0 $self->warn("JinNei implementation not completed");
1071 0         0 return;
1072             }
1073              
1074             =head2 transversions
1075              
1076             Title : transversions
1077             Usage : my $transversions = $stats->transversions($aln);
1078             Function: Calculates the number of transversions between two sequences in
1079             an alignment
1080             Returns : integer
1081             Args : Bio::Align::AlignI
1082              
1083              
1084             =cut
1085              
1086             sub transversions{
1087 6     6 1 480 my ($self,$aln) = @_;
1088 6         24 return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1089             }
1090              
1091             =head2 transitions
1092              
1093             Title : transitions
1094             Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1095             Function: Calculates the number of transitions in a given DNA alignment
1096             Returns : integer representing the number of transitions
1097             Args : Bio::Align::AlignI object
1098              
1099              
1100             =cut
1101              
1102             sub transitions{
1103 6     6 1 19 my ($self,$aln) = @_;
1104 6         40 return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1105             }
1106              
1107              
1108             sub _trans_count_helper {
1109 12     12   28 my ($self,$aln,$type) = @_;
1110 12 50       30 return 0 unless( $self->_check_arg($aln) );
1111 12 50       41 if( ! $aln->is_flush ) { $self->throw("must be flush") }
  0         0  
1112 12         19 my (@tcount);
1113 12         25 my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1114             uc $aln->get_seq_by_pos(2)->seq() );
1115 12         32 my $alen = $aln->length;
1116 12         26 for (my $i = 0;$i<$alen; $i++ ) {
1117 2298         2697 my ($c1,$c2) = ( substr($first,$i,1),
1118             substr($second,$i,1) );
1119 2298 100       3263 if( $c1 ne $c2 ) {
1120 480         707 foreach my $nt ( @{$type->{$c1}} ) {
  480         677  
1121 477 100       695 if( $nt eq $c2) {
1122 120         209 $tcount[$i]++;
1123             }
1124             }
1125             }
1126             }
1127 12         18 my $sum = 0;
1128 12 100       27 map { if( $_) { $sum += $_} } @tcount;
  2025         2286  
  120         183  
1129 12         60 return $sum;
1130             }
1131              
1132             # this will generate a matrix which records across the row, the number
1133             # of DNA subst
1134             #
1135             sub _build_nt_matrix {
1136 19     19   35 my ($self,$seqa,$seqb) = @_;
1137            
1138              
1139 19         85 my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
1140             [ qw(0 0 0 0) ],
1141             [ qw(0 0 0 0) ],
1142             [ qw(0 0 0 0) ] ];
1143 19         24 my $gaps = 0; # number of gaps
1144 19         38 my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
1145 19         28 my $len_a = length($seqa);
1146 19         40 for( my $i = 0; $i < $len_a; $i++) {
1147 1974         2649 my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1148 1974         1863 $ti =~ tr/U/T/;
1149 1974         1704 $tj =~ tr/U/T/;
1150              
1151 1974 100       3912 if( $ti =~ /^$GapChars$/) { $gaps++; next; }
  151         138  
  151         207  
1152 1823 100       3330 if( $tj =~ /^$GapChars$/) { $gaps++; next }
  318         260  
  318         485  
1153              
1154 1505         1642 my $ti_index = $NucleotideIndexes{$ti};
1155 1505         1343 my $tj_index = $NucleotideIndexes{$tj};
1156              
1157 1505 50       1811 if( ! defined $ti_index ) {
1158 0         0 $self->warn("ti_index not defined for $ti\n");
1159 0         0 next;
1160             }
1161            
1162 1505         1433 $basect_matrix->[$ti_index]->[$tj_index]++;
1163            
1164 1505 100       2705 if( $ti ne $tj ) {
1165 159         435 $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1166             }
1167             }
1168 19         47 return ($basect_matrix,$pfreq,$gaps);
1169             }
1170              
1171             sub _check_ambiguity_nucleotide {
1172 0     0   0 my ($base1,$base2) = @_;
1173 0         0 my %iub = Bio::Tools::IUPAC->iupac_iub();
1174 0         0 my @amb1 = @{ $iub{uc($base1)} };
  0         0  
1175 0         0 my @amb2 = @{ $iub{uc($base2)} };
  0         0  
1176 0         0 my ($pmatch) = (0);
1177 0         0 for my $amb ( @amb1 ) {
1178 0 0       0 if( grep { $amb eq $_ } @amb2 ) {
  0         0  
1179 0         0 $pmatch = 1;
1180 0         0 last;
1181             }
1182             }
1183 0 0       0 if( $pmatch ) {
1184 0         0 return (1 / scalar @amb1) * (1 / scalar @amb2);
1185             } else {
1186 0         0 return 0;
1187             }
1188             }
1189              
1190              
1191             sub _check_arg {
1192 25     25   43 my($self,$aln ) = @_;
1193 25 50 33     177 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
    50          
1194 0         0 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1195 0         0 return 0;
1196             } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1197 0         0 $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1198 0         0 return 0;
1199             }
1200 25         68 return 1;
1201             }
1202              
1203             =head2 Data Methods
1204              
1205             =cut
1206              
1207             =head2 pairwise_stats
1208              
1209             Title : pairwise_stats
1210             Usage : $obj->pairwise_stats($newval)
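1211             Function: Get/Set the pairwise statistics object (the distance methods call number_of_comparable_bases on it)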
1212             Returns : value of pairwise_stats
1213             Args : newvalue (optional)
1214              
1215              
1216             =cut
1217              
1218             sub pairwise_stats{
1219 18     18 1 48 my ($self,$value) = @_;
1220 18 100       48 if( defined $value) {
1221 2         5 $self->{'_pairwise_stats'} = $value;
1222             }
1223 18         109 return $self->{'_pairwise_stats'};
1224              
1225             }
1226              
1227             =head2 calc_KaKs_pair
1228              
1229             Title : calc_KaKs_pair
1230             Usage : my $results = $stats->calc_KaKs_pair($alnobj,
1231             $name1, $name2).
1232             Function : calculates Nei-Gojobori statistics for pairwise
1233             comparison.
1234             Args : A Bio::Align::AlignI compliant object such as a
1235             Bio::SimpleAlign object, and 2 sequence name strings.
1236             Returns : a reference to an array holding a hash of statistics
1237             for the pair, with keys as listed in Description.
1238              
1239             =cut
1240              
1241             sub calc_KaKs_pair {
1242 1     1 1 466 my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1243 1 50       5 $self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1244             if @_!= 4;
1245 1 50       5 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1246 1         6 my @seqs = (
1247             #{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1248             #{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1249             {id => $seq1_id, seq => uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1250             {id => $seq2_id, seq => uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1251             ) ;
1252 1 50       5 if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1253 0         0 $self->throw(" aligned sequences must be of equal length!");
1254             }
1255 1         2 my $results = [];
1256 1         5 $self->_get_av_ds_dn(\@seqs, $results);
1257 1         5 return $results;
1258              
1259             }
1260              
1261             =head2 calc_all_KaKs_pairs
1262              
1263             Title : calc_all_KaKs_pairs
1264             Usage : my $results2 = $stats->calc_all_KaKs_pairs($alnobj).
1265             Function : Calculates Nei_gojobori statistics for all pairwise
1266             combinations in sequence.
1267             Arguments: A Bio::Align::AlignI compliant object such as
1268             a Bio::SimpleAlign object.
1269             Returns : A reference to an array of hashes of statistics of
1270             all pairwise comparisons in the alignment.
1271              
1272             =cut
1273              
1274              
1275              
1276             sub calc_all_KaKs_pairs {
1277             #returns a multi_element_array with all pairwise comparisons
1278 1     1 1 1162 my ($self,$aln) = @_;
1279 1 50       10 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1280 1         3 my @seqs;
1281 1         6 for my $seq ($aln->each_seq) {
1282 3         14 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1283             }
1284 1         5 my $results ;
1285 1         4 $results = $self->_get_av_ds_dn(\@seqs, $results);
1286 1         7 return $results;
1287             }
1288              
1289             =head2 calc_average_KaKs
1290              
1291             Title : calc_average_KaKs.
1292             Usage : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1293             Function : calculates Nei_Gojobori stats for average of all
1294             sequences in the alignment.
1295             Args : A Bio::Align::AlignI compliant object such as a
1296             Bio::SimpleAlign object, number of bootstrap iterations
1297             (default 1000).
1298             Returns : A reference to a hash of statistics as listed in Description.
1299              
1300             =cut
1301              
1302             sub calc_average_KaKs {
1303             #calculates global value for sequences in alignment using bootstrapping
1304             #this is quite slow (~10 seconds per 3 X 200nt seqs);
1305 1     1 1 1025 my ($self, $aln, $bootstrap_rpt) = @_;
1306 1   50     4 $bootstrap_rpt ||= 1000;
1307 1 50       7 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1308 1         1 my @seqs;
1309 1         5 for my $seq ($aln->each_seq) {
1310 3         6 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1311             }
1312 1         2 my $results ;
1313 1         3 my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\@seqs);
1314             #print "ds = $ds_orig, dn = $dn_orig\n";
1315 1         4 $results = {D_s => $ds_orig, D_n => $dn_orig};
1316 1         5 $self->_run_bootstrap(\@seqs, $results, $bootstrap_rpt);
1317 1         14 return $results;
1318             }
1319              
1320             ############## primary internal subs for alignment comparisons ########################
1321              
1322             sub _run_bootstrap {
1323             ### generates sampled sequences, calculates Ds and Dn values,
1324             ### then calculates variance of sampled sequences and add results to results hash
1325             ###
1326 1     1   2 my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1327 1         2 my @seqs = @$seq_ref;
1328 1         2 my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1329 1         4 my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
1330              
1331             #1st make alternative array of codons;
1332 1         1 my $c = 0;
1333 1         4 while ($c < length $seqs[0]{'seq'}) {
1334 57         68 for (0..$#seqs) {
1335 171         149 push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
  171         575  
1336             }
1337 57         81 $c+=3;
1338             }
1339              
1340 1         3 for (1..$bootstrap_rpt) {
1341 100         207 my $sampled = _resample (\@btstrp_aoa);
1342 100         219 my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1343 100         129 push @{$bootstrap_values{'ds'}}, $ds;
  100         183  
1344 100         109 push @{$bootstrap_values{'dn'}}, $dn;
  100         319  
1345             }
1346              
1347 1         6 $results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
1348 1         3 $results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
1349             $results->{'z_score'} = ($results->{'D_n'} - $results->{'D_s'}) /
1350 1         18 sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1351             #print "bootstrapped var_syn = $results->{'D_s_var'} \n" ;
1352             #print "bootstrapped var_nc = $results->{'D_n_var'} \n";
1353             #print "z is $results->{'z_score'}\n"; ### end of global set up of/perm look up data
1354             }
1355              
1356             sub _resample {
1357 100     100   107 my $ref = shift;
1358 100         94 my $codon_num = scalar (@{$ref->[0]});
  100         145  
1359 100         102 my @altered;
1360 100         171 for (0..$codon_num -1) { #for each codon
1361 5700         6348 my $rand = int (rand ($codon_num));
1362 5700         6685 for (0..$#$ref) {
1363 17100         14813 push @{$altered[$_]}, $ref->[$_][$rand];
  17100         26744  
1364             }
1365             }
1366 100         134 my @stringed = map {join '', @$_}@altered;
  300         1224  
1367 100         139 my @return;
1368             #now out in random name to keep other subs happy
1369 100         155 for (@stringed) {
1370 300         664 push @return, {id=>'1', seq=> $_};
1371             }
1372 100         820 return \@return;
1373             }
1374              
1375             sub _get_av_ds_dn {
1376             # takes array of hashes of sequence strings and ids #
1377 103     103   125 my $self = shift;
1378 103         129 my $seq_ref = shift;
1379 103 100       190 my $result = shift if @_;
1380 103         341 my @caller = caller(1);
1381 103         1812 my @seqarray = @$seq_ref;
1382 103         111 my $bootstrap_score_list;
1383             #for a multiple alignment considers all pairwise combinations#
1384 103         225 my %dsfor_average = (ds => [], dn => []);
1385 103         203 for (my $i = 0; $i < scalar @seqarray; $i++) {
1386 308         589 for (my $j = $i +1; $j
1387             # print "comparing $i and $j\n";
1388 307 50       571 if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1389 0         0 $self->warn(" aligned sequences must be of equal length!");
1390 0         0 next;
1391             }
1392              
1393 307         458 my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
1394 307         504 my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
1395             # print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1396 307         526 my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1397             #get averages
1398 307         484 my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1399 307         561 my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1400              
1401             #calculate ps and pn (p54)
1402 307         325 my $syn_prop = $syn_count / $av_s_site;
1403 307         317 my $nc_prop = $non_syn_count / $av_ns_syn_site ;
1404              
1405             #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1406 307         564 my $d_syn = $self->jk($syn_prop);
1407 307         451 my $d_nc = $self->jk($nc_prop);
1408              
1409             #JK calculation must succeed for continuation of calculation
1410             #ret_value = -1 if error
1411 307 50 33     830 next unless $d_nc >=0 && $d_syn >=0;
1412              
1413              
1414 307         318 push @{$dsfor_average{'ds'}}, $d_syn;
  307         538  
1415 307         298 push @{$dsfor_average{'dn'}}, $d_nc;
  307         444  
1416              
1417             #if not doing bootstrap, calculate the pairwise comparison stats
1418 307 100 100     1420 if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
1419             #now calculate variances assuming large sample
1420 4         10 my $d_syn_var = jk_var($syn_prop, length($seqarray[$i]{'seq'}) - $gap_cnt );
1421 4         7 my $d_nc_var = jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1422             #now calculate z_value
1423             #print "d_syn_var is $d_syn_var,and d_nc_var is $d_nc_var\n";
1424             #my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1425 4 50       11 my $z = ($d_syn_var + $d_nc_var) ?
1426             ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1427             # print "z is $z\n";
1428             push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
1429             S_d => $syn_count, N_d =>$non_syn_count,
1430             P_s => $syn_prop, P_n=>$nc_prop,
1431 4         5 D_s => @{$dsfor_average{'ds'}}[-1],
1432 4         60 D_n => @{$dsfor_average{'dn'}}[-1],
1433             D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
1434             Seq1 => $seqarray[$i]{'id'},
1435 4         7 Seq2 => $seqarray[$j]{'id'},
1436             z_score => $z,
1437             };
1438 4 50 33     24 $self->warn (" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
      33        
1439             if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
1440             }#endif
1441             }
1442             }
1443              
1444             #warn of failure if no results hashes are present
1445             #will fail if Jukes Cantor has failed for all pairwise combinations
1446             #$self->warn("calculation failed!") if scalar @$result ==0;
1447              
1448             #return results unless bootstrapping
1449 103 100 100     345 return $result if $caller[3]=~ /calc_all_KaKs/ || $caller[3] =~ /calc_KaKs_pair/;
1450             #else if getting average for bootstrap
1451 101         226 return( mean ($dsfor_average{'ds'}),mean ($dsfor_average{'dn'})) ;
1452             }
1453              
1454              
1455             sub jk {
1456 614     614 0 669 my ($self, $p) = @_;
1457 614 50       826 if ($p > 0.75) {
1458 0         0 $self->warn( " Jukes Cantor won't work -too divergent!");
1459 0         0 return -1;
1460             }
1461 614         1081 return -1 * (3/4) * (log(1 - (4/3) * $p));
1462             }
1463              
1464             #works for large value of n (50?100?)
1465             sub jk_var {
1466 8     8 0 11 my ($p, $n) = @_;
1467 8         20 return (9 * $p * (1 -$p))/(((3 - 4 *$p) **2) * $n);
1468             }
1469              
1470              
1471             # compares 2 sequences to find the number of synonymous/non
1472             # synonymous mutations between them
1473              
1474             sub analyse_mutations {
1475 307     307 0 406 my ($seq1, $seq2) = @_;
1476 307         1978 my %mutator = ( 2=> {0=>[[1,2], # codon positions to be altered
1477             [2,1]], # depend on which is the same
1478             1=>[[0,2],
1479             [2,0]],
1480             2=>[[0,1],
1481             [1,0]],
1482             },
1483             3=> [ [0,1,2], # all need to be altered
1484             [1,0,2],
1485             [0,2,1],
1486             [1,2,0],
1487             [2,0,1],
1488             [2,1,0] ],
1489             );
1490 307         513 my $TOTAL = 0; # total synonymous changes
1491 307         320 my $TOTAL_n = 0; # total non-synonymous changes
1492 307         289 my $gap_cnt = 0;
1493              
1494 307         329 my %input;
1495 307         304 my $seqlen = length($seq1);
1496 307         461 for (my $j=0; $j< $seqlen; $j+=3) {
1497 17499         20785 $input{'cod1'} = substr($seq1, $j,3);
1498 17499         18369 $input{'cod2'} = substr($seq2, $j,3);
1499              
1500             #ignore codon if being compared with gaps!
1501 17499 50 33     46608 if ($input{'cod1'} =~ /\-/ || $input{'cod2'} =~ /\-/){
1502 0         0 $gap_cnt += 3; #just increments once if there is a pair of gaps
1503 0         0 next;
1504             }
1505              
1506 17499         21958 my ($diff_cnt, $same) = count_diffs(\%input);
1507              
1508             #ignore if codons are identical
1509 17499 100       30834 next if $diff_cnt == 0 ;
1510 4912 100       5538 if ($diff_cnt == 1) {
    50          
    0          
1511 3921         5725 $TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1512 3921         7172 $TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1513             #print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1514             }
1515             elsif ($diff_cnt ==2) {
1516 991         920 my $s_cnt = 0;
1517 991         908 my $n_cnt = 0;
1518 991         874 my $tot_muts = 4;
1519             #will stay 4 unless there are stop codons at intervening point
1520 991         863 OUTER:for my $perm (@{$mutator{'2'}{$same}}) {
  991         1536  
1521 1982         2137 my $altered = $input{'cod1'};
1522 1982         2056 my $prev= $altered;
1523             # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1524 1982         2042 for my $mut_i (@$perm) { #index of codon mutated
1525 3964         4405 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1526 3964 50       5180 if ($t[$CODONS->{$altered}] eq '*') {
1527 0         0 $tot_muts -=2;
1528             #print "changes to stop codon!!\n";
1529 0         0 next OUTER;
1530             }
1531             else {
1532 3964         4717 $s_cnt += $synchanges{$prev}{$altered};
1533             # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1534             }
1535 3964         4522 $prev = $altered;
1536             }
1537             # print "\n";
1538             }
1539 991 50       1525 if ($tot_muts != 0) {
1540 991         1339 $TOTAL += ($s_cnt/($tot_muts/2));
1541 991         1827 $TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1542             }
1543              
1544             }
1545             elsif ($diff_cnt ==3 ) {
1546 0         0 my $s_cnt = 0;
1547 0         0 my $n_cnt = 0;
1548 0         0 my $tot_muts = 18; #potential number of mutations
1549 0         0 OUTER: for my $perm (@{$mutator{'3'}}) {
  0         0  
1550 0         0 my $altered = $input{'cod1'};
1551 0         0 my $prev= $altered;
1552             # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1553 0         0 for my $mut_i (@$perm) { #index of codon mutated
1554 0         0 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1555 0 0       0 if ($t[$CODONS->{$altered}] eq '*') {
1556 0         0 $tot_muts -=3;
1557             # print "changes to stop codon!!\n";
1558 0         0 next OUTER;
1559              
1560             }
1561             else {
1562 0         0 $s_cnt += $synchanges{$prev}{$altered};
1563             # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1564             }
1565 0         0 $prev = $altered;
1566             }
1567             # print "\n";
1568              
1569             }#end OUTER loop
1570             #calculate number of synonymous/non synonymous mutations for that codon
1571             # and add to total
1572 0 0       0 if ($tot_muts != 0) {
1573 0         0 $TOTAL += ($s_cnt / ($tot_muts /3));
1574 0         0 $TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1575             }
1576             } #endif $diffcnt = 3
1577             } #end of sequencetraversal
1578 307         1680 return ($TOTAL, $TOTAL_n, $gap_cnt);
1579             }
1580              
1581              
1582             sub count_diffs {
1583             #counts the number of nucleotide differences between 2 codons
1584             # returns this value plus the codon index of which nucleotide is the same when 2
1585             #nucleotides are different. This is so analyse_mutations() knows which nucleotides
1586             # to change.
1587 17499     17499 0 15403 my $ref = shift;
1588 17499         16060 my $cnt = 0;
1589 17499         15474 my $same= undef;
1590             #just for 2 differences
1591 17499         19637 for (0..2) {
1592 52497 100       70389 if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1593 5903         6321 $cnt++;
1594             } else {
1595 46594         48570 $same = $_;
1596             }
1597             }
1598 17499         24912 return ($cnt, $same);
1599             }
1600              
1601             =head2 get_syn_changes
1602              
1603             Title : get_syn_changes
1604             Usage : Bio::Align::DNAStatistics->get_syn_changes
1605             Function: Generate a hashref of all pairwise combinations of codons
1606             differing by 1
1607             Returns : Symmetric matrix using hashes
1608             First key is codon
1609             and each codon points to a hashref of codons
1610             the values of which describe type of change.
1611             my $type = $hash{$codon1}->{$codon2};
1612             values are :
1613             1 synonymous
1614             0 non-syn
1615             -1 either codon is a stop codon
1616             Args : none
1617              
1618             =cut
1619              
1620             sub get_syn_changes {
1621             #hash of all pairwise combinations of codons differing by 1
1622             # 1 = syn, 0 = non-syn, -1 = stop
1623 4     4 1 6 my %results;
1624 4         9 my @codons = _make_codons ();
1625 4         125 my $arr_len = scalar @codons;
1626 4         19 for (my $i = 0; $i < $arr_len -1; $i++) {
1627 252         302 my $cod1 = $codons[$i];
1628 252         330 for (my $j = $i +1; $j < $arr_len; $j++) {
1629 8064         6724 my $diff_cnt = 0;
1630 8064         7668 for my $pos(0..2) {
1631 24192 100       32696 $diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1632             }
1633 8064 100       11895 next if $diff_cnt !=1;
1634              
1635             #synon change
1636 1152 100 100     2909 if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
    100          
1637 276         347 $results{$cod1}{$codons[$j]} =1;
1638 276         455 $results{$codons[$j]}{$cod1} = 1;
1639             }
1640             #stop codon
1641             elsif ($t[$CODONS->{$cod1}] eq '*' or $t[$CODONS->{$codons[$j]}] eq '*') {
1642 92         117 $results{$cod1}{$codons[$j]} = -1;
1643 92         151 $results{$codons[$j]}{$cod1} = -1;
1644             }
1645             # nc change
1646             else {
1647 784         1029 $results{$cod1}{$codons[$j]} = 0;
1648 784         1377 $results{$codons[$j]}{$cod1} = 0;
1649             }
1650             }
1651             }
1652 4         142 return %results;
1653             }
1654              
1655             =head2 dnds_pattern_number
1656              
1657             Title : dnds_pattern_number
1658             Usage : my $patterns = $stats->dnds_pattern_number($alnobj);
1659             Function: Counts the number of codons with no gaps in the MSA
1660             Returns : Number of codons with no gaps ('patterns' in PAML notation)
1661             Args : A Bio::Align::AlignI compliant object such as a
1662             Bio::SimpleAlign object.
1663              
1664             =cut
1665              
1666             sub dnds_pattern_number{
1667 0     0 1 0 my ($self, $aln) = @_;
1668 0         0 return ($aln->remove_gaps->length)/3;
1669             }
1670              
1671             sub count_syn_sites {
1672             #counts the number of possible synonymous changes for sequence
1673 615     615 0 1123 my ($seq, $synsite) = @_;
1674 615 100       910 __PACKAGE__->throw("not integral number of codons") if length($seq) % 3 != 0;
1675 614         588 my $S = 0;
1676 614         810 for (my $i = 0; $i< length($seq); $i+=3) {
1677 34998         34394 my $cod = substr($seq, $i, 3);
1678 34998 50       42249 next if $cod =~ /\-/; #deal with alignment gaps
1679 34998         48649 $S += $synsite->{$cod}{'s'};
1680             }
1681             #print "S is $S\n";
1682 614         835 return $S;
1683             }
1684              
1685            
1686              
1687             sub get_syn_sites {
1688             #sub to generate lookup hash for the number of synonymous changes per codon
1689 4     4 0 10 my @nucs = qw(T C A G);
1690 4         5 my %raw_results;
1691 4         8 for my $i (@nucs) {
1692 16         16 for my $j (@nucs) {
1693 64         64 for my $k (@nucs) {
1694             # for each possible codon
1695 256         322 my $cod = "$i$j$k";
1696 256         256 my $aa = $t[$CODONS->{$cod}];
1697             #calculate number of synonymous mutations vs non syn mutations
1698 256         259 for my $i (qw(0 1 2)){
1699 768         675 my $s = 0;
1700 768         620 my $n = 3;
1701 768         730 for my $nuc (qw(A T C G)) {
1702 3072 100       3812 next if substr ($cod, $i,1) eq $nuc;
1703 2304         1895 my $test = $cod;
1704 2304         2007 substr($test, $i, 1) = $nuc ;
1705 2304 100       2769 if ($t[$CODONS->{$test}] eq $aa) {
1706 552         457 $s++;
1707             }
1708 2304 100       2976 if ($t[$CODONS->{$test}] eq '*') {
1709 108         105 $n--;
1710             }
1711             }
1712 768         1500 $raw_results{$cod}[$i] = {'s' => $s ,
1713             'n' => $n };
1714             }
1715            
1716             } #end analysis of single codon
1717             }
1718             } #end analysis of all codons
1719 4         9 my %final_results;
1720            
1721 4         95 for my $cod (sort keys %raw_results) {
1722 256         233 my $t = 0;
1723 256         190 map{$t += ($_->{'s'} /$_->{'n'})} @{$raw_results{$cod}};
  768         1100  
  256         288  
1724 256         522 $final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1725             }
1726 4         138 return \%final_results;
1727             }
1728              
1729             sub _make_codons {
1730             #makes all codon combinations, returns array of them
1731 8     8   22 my @nucs = qw(T C A G);
1732 8         9 my @codons;
1733 8         16 for my $i (@nucs) {
1734 32         36 for my $j (@nucs) {
1735 128         117 for my $k (@nucs) {
1736 512         665 push @codons, "$i$j$k";
1737             }
1738             }
1739             }
1740 8         103 return @codons;
1741             }
1742              
1743             sub get_codons {
1744             #generates codon translation look up table#
1745 4     4 0 9 my $x = 0;
1746 4         9 my $CODONS = {};
1747 4         7 for my $codon (_make_codons) {
1748 256         324 $CODONS->{$codon} = $x;
1749 256         212 $x++;
1750             }
1751 4         18 return $CODONS;
1752             }
1753              
1754             #########stats subs, can go in another module? Here for speed. ###
1755             sub mean {
1756 204     204 0 232 my $ref = shift;
1757 204         209 my $el_num = scalar @$ref;
1758 204         179 my $tot = 0;
1759 204         295 map{$tot += $_}@$ref;
  806         1030  
1760 204         577 return ($tot/$el_num);
1761             }
1762              
1763             sub variance {
1764 2     2 0 2 my $ref = shift;
1765 2         3 my $mean = mean($ref);
1766 2         3 my $sum_of_squares = 0;
1767 2         3 map{$sum_of_squares += ($_ - $mean) **2}@$ref;
  200         207  
1768 2         8 return $sum_of_squares;
1769             }
1770              
1771             sub sampling_variance {
1772 2     2 0 2 my $ref = shift;
1773 2         5 return variance($ref) / (scalar @$ref -1);
1774             }
1775              
1776             1;