File Coverage

Bio/PopGen/Statistics.pm
Criterion Covered Total %
statement 482 589 81.8
branch 123 192 64.0
condition 51 100 51.0
subroutine 23 25 92.0
pod 19 19 100.0
total 698 925 75.4


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::PopGen::Statistics
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::PopGen::Statistics - Population Genetics statistical tests
17              
18             =head1 SYNOPSIS
19              
20             use Bio::PopGen::Statistics;
21             use Bio::AlignIO;
22             use Bio::PopGen::IO;
23             use Bio::PopGen::Simulation::Coalescent;
24              
25             my $sim = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 12);
26              
27             my $tree = $sim->next_tree;
28              
29             $sim->add_Mutations($tree,20);
30              
31             my $stats = Bio::PopGen::Statistics->new();
32             my $individuals = [ $tree->get_leaf_nodes];
33             my $pi = $stats->pi($individuals);
34             my $D = $stats->tajima_D($individuals);
35              
36             # Alternatively to do this on input data from
37             # See the tests in t/PopGen.t for more examples
38             my $parser = Bio::PopGen::IO->new(-format => 'prettybase',
39             -file => 't/data/popstats.prettybase');
40             my $pop = $parser->next_population;
41             # Note that you can also call the stats as a class method if you like
42             # the only reason to instantiate it (as above) is if you want
43             # to set the verbosity for debugging
44             $pi = Bio::PopGen::Statistics->pi($pop);
45             $theta = Bio::PopGen::Statistics->theta($pop);
46              
47             # Pi and Theta also take additional arguments,
48             # see the documentation for more information
49              
50             use Bio::PopGen::Utilities;
51             use Bio::AlignIO;
52              
53             my $in = Bio::AlignIO->new(-file => 't/data/t7.aln',
54             -format => 'clustalw');
55             my $aln = $in->next_aln;
56             # get a population, each sequence is an individual and
57             # for the default case, every site which is not monomorphic
58             # is a 'marker'. Each individual will have a 'genotype' for the
59             # site which will be the specific base in the alignment at that
60             # site
61              
62             my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
63              
64              
65             =head1 DESCRIPTION
66              
67             This object is intended to provide implementations some standard
68             population genetics statistics about alleles in populations.
69              
70             This module was previously named Bio::Tree::Statistics.
71              
72             This object is a place to accumulate routines for calculating various
73             statistics from the coalescent simulation, marker/allele, or from
74             aligned sequence data given that you can calculate alleles, number of
75             segregating sites.
76              
77             Currently implemented:
78             Fu and Li's D (fu_and_li_D)
79             Fu and Li's D* (fu_and_li_D_star)
80             Fu and Li's F (fu_and_li_F)
81             Fu and Li's F* (fu_and_li_F_star)
82             Tajima's D (tajima_D)
83             Watterson's theta (theta)
84             pi (pi) - number of pairwise differences
85             composite_LD (composite_LD)
86             McDonald-Kreitman (mcdonald_kreitman or MK)
87              
88             Count based methods also exist in case you have already calculated the
89             key statistics (seg sites, num individuals, etc) and just want to
90             compute the statistic.
91              
92             In all cases where a the method expects an arrayref of
93             L objects and L
94             object will also work.
95              
96             =head2 REFERENCES
97              
98             Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of
99             Mutations." Genetics 133:693-709.
100              
101             Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples
102             from a Population." Genetics 143:557-570.
103              
104             McDonald J, Kreitman M.
105              
106             Tajima F. (1989) "Statistical method for testing the neutral mutation
107             hypothesis by DNA polymorphism." Genetics 123:585-595.
108              
109              
110             =head2 CITING THIS WORK
111              
112             Please see this reference for use of this implementation.
113              
114             Stajich JE and Hahn MW "Disentangling the Effects of Demography and Selection in Human History." (2005) Mol Biol Evol 22(1):63-73.
115              
116             If you use these Bio::PopGen modules please cite the Bioperl
117             publication (see FAQ) and the above reference.
118              
119              
120             =head1 FEEDBACK
121              
122             =head2 Mailing Lists
123              
124             User feedback is an integral part of the evolution of this and other
125             Bioperl modules. Send your comments and suggestions preferably to
126             the Bioperl mailing list. Your participation is much appreciated.
127              
128             bioperl-l@bioperl.org - General discussion
129             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
130              
131             =head2 Support
132              
133             Please direct usage questions or support issues to the mailing list:
134              
135             I
136              
137             rather than to the module maintainer directly. Many experienced and
138             reponsive experts will be able look at the problem and quickly
139             address it. Please include a thorough description of the problem
140             with code and data examples if at all possible.
141              
142             =head2 Reporting Bugs
143              
144             Report bugs to the Bioperl bug tracking system to help us keep track
145             of the bugs and their resolution. Bug reports can be submitted via
146             the web:
147              
148             https://github.com/bioperl/bioperl-live/issues
149              
150             =head1 AUTHOR - Jason Stajich, Matthew Hahn
151              
152             Email jason-at-bioperl-dot-org
153             Email matthew-dot-hahn-at-duke-dot-edu
154              
155             McDonald-Kreitman implementation based on work by Alisha Holloway at
156             UC Davis.
157              
158              
159             =head1 APPENDIX
160              
161             The rest of the documentation details each of the object methods.
162             Internal methods are usually preceded with a _
163              
164             =cut
165              
166              
167             # Let the code begin...
168              
169              
170             package Bio::PopGen::Statistics;
171 3     3   3034 use strict;
  3         3  
  3         100  
172             use constant {
173 3         245 in_label => 'ingroup',
174             out_label => 'outgroup',
175             non_syn => 'non_synonymous',
176             syn => 'synonymous',
177             default_codon_table => 1, # Standard Codon table
178 3     3   10 };
  3         3  
179              
180 3     3   20862 use Bio::MolEvol::CodonModel;
  3         4  
  3         92  
181 3     3   12 use List::Util qw(sum);
  3         4  
  3         216  
182              
183 3     3   11 use base qw(Bio::Root::Root);
  3         4  
  3         333  
184             our $codon_table => default_codon_table;
185             our $has_twotailed => 0;
186             BEGIN {
187 3     3   3 eval { require Text::NSP::Measures::2D::Fisher2::twotailed };
  3         448  
188 3 50       14 if( $@ ) { $has_twotailed = 0; }
  3         13439  
189 0         0 else { $has_twotailed = 1; }
190             }
191              
192              
193              
194              
195              
196              
197             =head2 new
198              
199             Title : new
200             Usage : my $obj = Bio::PopGen::Statistics->new();
201             Function: Builds a new Bio::PopGen::Statistics object
202             Returns : an instance of Bio::PopGen::Statistics
203             Args : none
204              
205              
206             =cut
207              
208              
209             =head2 fu_and_li_D
210              
211             Title : fu_and_li_D
212             Usage : my $D = $statistics->fu_and_li_D(\@ingroup,\@outgroup);
213             OR
214             my $D = $statistics->fu_and_li_D(\@ingroup,$extmutations);
215             Function: Fu and Li D statistic for a list of individuals
216             given an outgroup and the number of external mutations
217             (either provided or calculated from list of outgroup individuals)
218             Returns : decimal
219             Args : $individuals - array reference which contains ingroup individuals
220             (L or derived classes)
221             $extmutations - number of external mutations OR
222             arrayref of outgroup individuals
223              
224             =cut
225              
226             sub fu_and_li_D {
227 7     7 1 21 my ($self,$ingroup,$outgroup) = @_;
228              
229 7         10 my ($seg_sites,$n,$ancestral,$derived) = (0,0,0,0);
230 7 100 33     43 if( ref($ingroup) =~ /ARRAY/i ) {
    50          
231 4         5 $n = scalar @$ingroup;
232             # pi - all pairwise differences
233 4         9 $seg_sites = $self->segregating_sites_count($ingroup);
234             } elsif( ref($ingroup) &&
235             $ingroup->isa('Bio::PopGen::PopulationI')) {
236 3         7 $n = $ingroup->get_number_individuals;
237 3         6 $seg_sites = $self->segregating_sites_count($ingroup);
238             } else {
239 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D");
240 0         0 return 0;
241             }
242            
243 7 50       16 if( $seg_sites <= 0 ) {
244 0         0 $self->warn("mutation total was not > 0, cannot calculate a Fu and Li D");
245 0         0 return 0;
246             }
247              
248 7 50       17 if( ! defined $outgroup ) {
    100          
249 0         0 $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
250 0         0 return 0;
251             } elsif( ref($outgroup) ) {
252 4         8 ($ancestral,$derived) = $self->derived_mutations($ingroup,$outgroup);
253 4 50       9 $ancestral = 0 unless defined $ancestral;
254             } else {
255 3         4 $ancestral = $outgroup;
256             }
257            
258 7         16 return $self->fu_and_li_D_counts($n,$seg_sites,
259             $ancestral,$derived);
260             }
261              
262             =head2 fu_and_li_D_counts
263              
264             Title : fu_li_D_counts
265             Usage : my $D = $statistics->fu_and_li_D_counts($samps,$sites,
266             $external);
267             Function: Fu and Li D statistic for the raw counts of the number
268             of samples, sites, external and internal mutations
269             Returns : decimal number
270             Args : number of samples (N)
271             number of segregating sites (n)
272             number of external mutations (n_e)
273              
274             =cut
275              
276              
277             sub fu_and_li_D_counts {
278 8     8 1 131 my ($self,$n,$seg_sites, $external_mut) = @_;
279 8         7 my $a_n = 0;
280 8 50       16 if( $n <= 3 ) {
281 0         0 $self->warn("n is $n, too small, must be > 3\n");
282 0         0 return;
283             }
284 8         24 for(my $k= 1; $k < $n; $k++ ) {
285 51         79 $a_n += ( 1 / $k );
286             }
287 8         9 my $b = 0;
288 8         17 for(my $k= 1; $k < $n; $k++ ) {
289 51         84 $b += ( 1 / $k**2 );
290             }
291              
292 8         18 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
293             ( ( $n - 1) * ( $n - 2 ) ) );
294              
295 8         23 my $v = 1 + ( ( $a_n**2 / ( $b + $a_n**2 ) ) *
296             ( $c - ( ( $n + 1) /
297             ( $n - 1) ) ));
298            
299 8         10 my $u = $a_n - 1 - $v;
300              
301 8         80 ($seg_sites - $a_n * $external_mut) /
302             sqrt( ($u * $seg_sites) + ($v * $seg_sites*$seg_sites));
303            
304             }
305              
306              
307             =head2 fu_and_li_D_star
308              
309             Title : fu_and_li_D_star
310             Usage : my $D = $statistics->fu_an_li_D_star(\@individuals);
311             Function: Fu and Li's D* statistic for a set of samples
312             Without an outgroup
313             Returns : decimal number
314             Args : array ref of L objects
315             OR
316             L object
317              
318             =cut
319              
320             #'
321             # fu_and_li_D*
322              
323             sub fu_and_li_D_star {
324 3     3 1 443 my ($self,$individuals) = @_;
325              
326 3         4 my ($seg_sites,$n,$singletons);
327 3 100 33     22 if( ref($individuals) =~ /ARRAY/i ) {
    50          
328 2         3 $n = scalar @$individuals;
329 2         5 $seg_sites = $self->segregating_sites_count($individuals);
330 2         6 $singletons = $self->singleton_count($individuals);
331             } elsif( ref($individuals) &&
332             $individuals->isa('Bio::PopGen::PopulationI')) {
333 1         1 my $pop = $individuals;
334 1         3 $n = $pop->get_number_individuals;
335 1         2 $seg_sites = $self->segregating_sites_count($pop);
336 1         4 $singletons = $self->singleton_count($pop);
337             } else {
338 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D_star");
339 0         0 return 0;
340             }
341              
342 3         10 return $self->fu_and_li_D_star_counts($n,$seg_sites, $singletons);
343             }
344              
345             =head2 fu_and_li_D_star_counts
346              
347             Title : fu_li_D_star_counts
348             Usage : my $D = $statistics->fu_and_li_D_star_counts($samps,$sites,
349             $singletons);
350              
351             Function: Fu and Li D statistic for the raw counts of the number
352             of samples, sites, external and internal mutations
353             Returns : decimal number
354             Args : number of samples (N)
355             number of segregating sites (n)
356             singletons (n_s)
357              
358             =cut
359              
360              
361             sub fu_and_li_D_star_counts {
362 4     4 1 5 my ($self,$n,$seg_sites, $singletons) = @_;
363 4         4 my $a_n;
364 4         14 for(my $k = 1; $k < $n; $k++ ) {
365 35         50 $a_n += ( 1 / $k );
366             }
367              
368 4         6 my $a1 = $a_n + 1 / $n;
369              
370 4         4 my $b = 0;
371 4         10 for(my $k= 1; $k < $n; $k++ ) {
372 35         47 $b += ( 1 / $k**2 );
373             }
374              
375 4         12 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
376             ( ( $n - 1) * ( $n - 2 ) ) );
377              
378 4         13 my $d = $c + ($n -2) / ($n - 1)**2 +
379             2 / ($n -1) *
380             ( 1.5 - ( (2*$a1 - 3) / ($n -2) ) -
381             1 / $n );
382            
383 4         17 my $v_star = ( ( ($n/($n-1) )**2)*$b + (($a_n**2)*$d) -
384             (2*( ($n*$a_n*($a_n+1)) )/(($n-1)**2)) ) /
385             (($a_n**2) + $b);
386              
387 4         9 my $u_star = ( ($n/($n-1))*
388             ($a_n - ($n/
389             ($n-1)))) - $v_star;
390              
391              
392 4         37 return (($n / ($n - 1)) * $seg_sites -
393             $a_n * $singletons) /
394             sqrt( ($u_star * $seg_sites) + ($v_star * $seg_sites*$seg_sites));
395             }
396              
397              
398             =head2 fu_and_li_F
399              
400             Title : fu_and_li_F
401             Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,$ext_muts);
402             Function: Calculate Fu and Li's F on an ingroup with either the set of
403             outgroup individuals, or the number of external mutations
404             Returns : decimal number
405             Args : array ref of L objects for the ingroup
406             OR a L object
407             number of external mutations OR list of individuals for the outgroup
408              
409             =cut
410              
411             #'
412              
413             sub fu_and_li_F {
414 5     5 1 472 my ($self,$ingroup,$outgroup) = @_;
415 5         3 my ($seg_sites,$pi,$n,$external,$internal);
416 5 100 33     38 if( ref($ingroup) =~ /ARRAY/i ) {
    50          
417 2         4 $n = scalar @$ingroup;
418             # pi - all pairwise differences
419 2         5 $pi = $self->pi($ingroup);
420 2         6 $seg_sites = $self->segregating_sites_count($ingroup);
421             } elsif( ref($ingroup) &&
422             $ingroup->isa('Bio::PopGen::PopulationI')) {
423 3         6 $n = $ingroup->get_number_individuals;
424 3         6 $pi = $self->pi($ingroup);
425 3         6 $seg_sites = $self->segregating_sites_count($ingroup);
426             } else {
427 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to Fu and Li's F");
428 0         0 return 0;
429             }
430            
431 5 50       16 if( ! defined $outgroup ) {
    100          
432 0         0 $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
433 0         0 return 0;
434             } elsif( ref($outgroup) ) {
435 2         7 ($external,$internal) = $self->derived_mutations($ingroup,$outgroup);
436             } else {
437 3         3 $external = $outgroup;
438             }
439 5         13 $self->fu_and_li_F_counts($n,$pi,$seg_sites,$external);
440             }
441              
442             =head2 fu_and_li_F_counts
443              
444             Title : fu_li_F_counts
445             Usage : my $F = $statistics->fu_and_li_F_counts($samps,$pi,
446             $sites,
447             $external);
448             Function: Fu and Li F statistic for the raw counts of the number
449             of samples, sites, external and internal mutations
450             Returns : decimal number
451             Args : number of samples (N)
452             average pairwise differences (pi)
453             number of segregating sites (n)
454             external mutations (n_e)
455              
456             =cut
457              
458              
459             sub fu_and_li_F_counts {
460 6     6 1 12 my ($self,$n,$pi,$seg_sites, $external) = @_;
461 6         5 my $a_n = 0;
462 6         14 for(my $k= 1; $k < $n; $k++ ) {
463 43         60 $a_n += ( 1 / $k );
464             }
465              
466 6         13 my $a1 = $a_n + (1 / $n );
467              
468 6         7 my $b = 0;
469 6         13 for(my $k= 1; $k < $n; $k++ ) {
470 43         56 $b += ( 1 / $k**2 );
471             }
472              
473 6         17 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
474             ( ( $n - 1) * ( $n - 2 ) ) );
475              
476 6         23 my $v_F = ( $c + ( (2*(($n**2)+$n+3)) /
477             ( (9*$n)*($n-1) ) ) -
478             (2/($n-1)) ) / ( ($a_n**2)+$b );
479              
480 6         18 my $u_F = ( 1 + ( ($n+1)/(3*($n-1)) )-
481             ( 4*( ($n+1)/(($n-1)**2) ))*
482             ($a1 - ((2*$n)/($n+1))) ) /
483             $a_n - $v_F;
484              
485             # warn("$v_F vf $u_F uf n = $n\n");
486 6         13 my $F = ($pi - $external) / ( sqrt( ($u_F*$seg_sites) +
487             ($v_F*($seg_sites**2)) ) );
488              
489 6         47 return $F;
490             }
491              
492             =head2 fu_and_li_F_star
493              
494             Title : fu_and_li_F_star
495             Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup);
496             Function: Calculate Fu and Li's F* on an ingroup without an outgroup
497             It uses count of singleton alleles instead
498             Returns : decimal number
499             Args : array ref of L objects for the ingroup
500             OR
501             L object
502              
503             =cut
504              
505             #' keep my emacs happy
506              
507             sub fu_and_li_F_star {
508 3     3 1 447 my ($self,$individuals) = @_;
509              
510 3         4 my ($seg_sites,$pi,$n,$singletons);
511 3 100 33     22 if( ref($individuals) =~ /ARRAY/i ) {
    50          
512 2         3 $n = scalar @$individuals;
513             # pi - all pairwise differences
514 2         6 $pi = $self->pi($individuals);
515 2         7 $seg_sites = $self->segregating_sites_count($individuals);
516 2         7 $singletons = $self->singleton_count($individuals);
517             } elsif( ref($individuals) &&
518             $individuals->isa('Bio::PopGen::PopulationI')) {
519 1         2 my $pop = $individuals;
520 1         2 $n = $pop->get_number_individuals;
521 1         3 $pi = $self->pi($pop);
522 1         4 $seg_sites = $self->segregating_sites_count($pop);
523 1         3 $singletons = $self->singleton_count($pop);
524             } else {
525 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_F_star");
526 0         0 return 0;
527             }
528 3         8 return $self->fu_and_li_F_star_counts($n,
529             $pi,
530             $seg_sites,
531             $singletons);
532             }
533              
534             =head2 fu_and_li_F_star_counts
535              
536             Title : fu_li_F_star_counts
537             Usage : my $F = $statistics->fu_and_li_F_star_counts($samps,
538             $pi,$sites,
539             $singletons);
540             Function: Fu and Li F statistic for the raw counts of the number
541             of samples, sites, external and internal mutations
542             Returns : decimal number
543             Args : number of samples (N)
544             average pairwise differences (pi)
545             number of segregating sites (n)
546             singleton mutations (n_s)
547              
548             =cut
549              
550              
551             sub fu_and_li_F_star_counts {
552 4     4 1 7 my ($self,$n,$pi,$seg_sites, $singletons) = @_;
553 4 50       30 if( $n <= 1 ) {
554 0         0 $self->warn("N must be > 1\n");
555 0         0 return;
556             }
557 4 50       7 if( $n == 2) {
558 0         0 return 0;
559             }
560              
561 4         5 my $a_n = 0;
562            
563              
564 4         4 my $b = 0;
565 4         12 for(my $k= 1; $k < $n; $k++ ) {
566 35         34 $b += (1 / ($k**2));
567 35         43 $a_n += ( 1 / $k ); # Eq (2)
568             }
569 4         6 my $a1 = $a_n + (1 / $n );
570              
571             # warn("a_n is $a_n a1 is $a1 n is $n b is $b\n");
572              
573             # From Simonsen et al (1995) instead of Fu and Li 1993
574 4         20 my $v_F_star = ( (( 2 * $n ** 3 + 110 * $n**2 - (255 * $n) + 153)/
575             (9 * ($n ** 2) * ( $n - 1))) +
576             ((2 * ($n - 1) * $a_n ) / $n ** 2) -
577             (8 * $b / $n) ) /
578             ( ($a_n ** 2) + $b );
579            
580 4         11 my $u_F_star = ((( (4* ($n**2)) + (19 * $n) + 3 - (12 * ($n + 1)* $a1)) /
581             (3 * $n * ( $n - 1))) / $a_n) - $v_F_star;
582              
583             # warn("vf* = $v_F_star uf* = $u_F_star n = $n\n");
584 4         9 my $F_star = ( $pi - ($singletons*( ( $n-1) / $n)) ) /
585             sqrt ( $u_F_star*$seg_sites + $v_F_star*$seg_sites**2);
586 4         31 return $F_star;
587             }
588              
589             =head2 tajima_D
590              
591             Title : tajima_D
592             Usage : my $D = Bio::PopGen::Statistics->tajima_D(\@samples);
593             Function: Calculate Tajima's D on a set of samples
594             Returns : decimal number
595             Args : array ref of L objects
596             OR
597             L object
598              
599              
600             =cut
601              
602             #'
603              
604             sub tajima_D {
605 6     6 1 491 my ($self,$individuals) = @_;
606 6         10 my ($seg_sites,$pi,$n);
607              
608 6 100 33     57 if( ref($individuals) =~ /ARRAY/i ) {
    50          
609 2         3 $n = scalar @$individuals;
610             # pi - all pairwise differences
611 2         5 $pi = $self->pi($individuals);
612 2         7 $seg_sites = $self->segregating_sites_count($individuals);
613              
614             } elsif( ref($individuals) &&
615             $individuals->isa('Bio::PopGen::PopulationI')) {
616 4         5 my $pop = $individuals;
617 4         14 $n = $pop->get_number_individuals;
618 4         16 $pi = $self->pi($pop);
619 4         16 $seg_sites = $self->segregating_sites_count($pop);
620             } else {
621 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
622 0         0 return 0;
623             }
624 6         31 $self->tajima_D_counts($n,$seg_sites,$pi);
625             }
626              
627             =head2 tajima_D_counts
628              
629             Title : tajima_D_counts
630             Usage : my $D = $statistics->tajima_D_counts($samps,$sites,$pi);
631             Function: Tajima's D statistic for the raw counts of the number
632             of samples, sites, and avg pairwise distances (pi)
633             Returns : decimal number
634             Args : number of samples (N)
635             number of segregating sites (n)
636             average pairwise differences (pi)
637              
638             =cut
639              
640             #'
641              
642             sub tajima_D_counts {
643 6     6 1 14 my ($self,$n,$seg_sites,$pi) = @_;
644 6         5 my $a1 = 0;
645 6         21 for(my $k= 1; $k < $n; $k++ ) {
646 284         316 $a1 += ( 1 / $k );
647             }
648              
649 6         7 my $a2 = 0;
650 6         30 for(my $k= 1; $k < $n; $k++ ) {
651 284         335 $a2 += ( 1 / $k**2 );
652             }
653            
654 6         13 my $b1 = ( $n + 1 ) / ( 3* ( $n - 1) );
655 6         14 my $b2 = ( 2 * ( $n ** 2 + $n + 3) ) /
656             ( ( 9 * $n) * ( $n - 1) );
657 6         10 my $c1 = $b1 - ( 1 / $a1 );
658 6         43 my $c2 = $b2 - ( ( $n + 2 ) /
659             ( $a1 * $n))+( $a2 / $a1 ** 2);
660 6         10 my $e1 = $c1 / $a1;
661 6         8 my $e2 = $c2 / ( $a1**2 + $a2 );
662            
663 6         13 my $denom = sqrt ( ($e1 * $seg_sites) + (( $e2 * $seg_sites) * ( $seg_sites - 1)));
664 6 50       16 return if $denom == 0;
665 6         7 my $D = ( $pi - ( $seg_sites / $a1 ) ) / $denom;
666 6         92 return $D;
667             }
668              
669              
670             =head2 pi
671              
672             Title : pi
673             Usage : my $pi = Bio::PopGen::Statistics->pi(\@inds)
674             Function: Calculate pi (average number of pairwise differences) given
675             a list of individuals which have the same number of markers
676             (also called sites) as available from the get_Genotypes()
677             call in L
678             Returns : decimal number
679             Args : Arg1= array ref of L objects
680             which have markers/mutations. We expect all individuals to
681             have a marker - we will deal with missing data as a special case.
682             OR
683             Arg1= L object. In the event that
684             only allele frequency data is available, storing it in
685             Population object will make this available.
686             num sites [optional], an optional second argument (integer)
687             which is the number of sites, then pi returned is pi/site.
688              
689             =cut
690              
691             sub pi {
692 21     21 1 50 my ($self,$individuals,$numsites) = @_;
693 21         21 my (%data,%marker_total,@marker_names,$n);
694              
695 21 100 33     123 if( ref($individuals) =~ /ARRAY/i ) {
    50          
696             # one possible argument is an arrayref of Bio::PopGen::IndividualI objs
697 9         40 @marker_names = $individuals->[0]->get_marker_names;
698 9         21 $n = scalar @$individuals;
699              
700             # Here we are calculating the allele frequencies
701 9         12 foreach my $ind ( @$individuals ) {
702 45 50       120 if( ! $ind->isa('Bio::PopGen::IndividualI') ) {
703 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($ind)."\n");
704 0         0 return 0;
705             }
706 45         45 foreach my $m ( @marker_names ) {
707 2125         2593 foreach my $allele (map { $_->get_Alleles}
  2125         2462  
708             $ind->get_Genotypes($m) ) {
709 2125         1932 $data{$m}->{$allele}++;
710 2125         2383 $marker_total{$m}++;
711             }
712             }
713             }
714             # while( my ($marker,$count) = each %marker_total ) {
715             # foreach my $c ( values %{$data{$marker}} ) {
716             # $c /= $count;
717             # }
718             # }
719             # %data will contain allele frequencies for each marker, allele
720             } elsif( ref($individuals) &&
721             $individuals->isa('Bio::PopGen::PopulationI') ) {
722 12         33 my $pop = $individuals;
723 12         28 $n = $pop->get_number_individuals;
724 12         80 foreach my $marker( $pop->get_Markers ) {
725 143         374 push @marker_names, $marker->name;
726             #$data{$marker->name} = {$marker->get_Allele_Frequencies};
727 143         466 my @genotypes = $pop->get_Genotypes(-marker => $marker->name);
728 143         1137 for my $al ( map { $_->get_Alleles} @genotypes ) {
  12444         14821  
729 24628         25399 $data{$marker->name}->{$al}++;
730 24628         26737 $marker_total{$marker->name}++;
731             }
732             }
733             } else {
734 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI to pi");
735             }
736             # based on Kevin Thornton's code:
737             # http://molpopgen.org/software/libsequence/doc/html/PolySNP_8cc-source.html#l00152
738             # For now we assume that all individuals have the same markers
739 21         84 my ($diffcount,$totalcompare) = (0,0);
740 21         21 my $pi = 0;
741 21         62 while ( my ($marker,$markerdat) = each %data ) {
742 568         399 my $sampsize = $marker_total{$marker};
743 568         344 my $ssh = 0;
744 568         795 my @alleles = keys %$markerdat;
745 568 50       670 if ( $sampsize > 1 ) {
746 568         388 my $denom = $sampsize * ($sampsize - 1.0);
747 568         433 foreach my $al ( @alleles ) {
748 1118         1145 $ssh += ($markerdat->{$al} * ($markerdat->{$al} - 1)) / $denom;
749             }
750 568         1077 $pi += 1.0 - $ssh;
751             }
752             }
753 21         239 $self->debug( "pi=$pi\n");
754 21 100       46 if( $numsites ) {
755 2         14 return $pi / $numsites;
756             } else {
757 19         256 return $pi;
758             }
759             }
760              
761              
762             =head2 theta
763              
764             Title : theta
765             Usage : my $theta = Bio::PopGen::Statistics->theta($sampsize,$segsites);
766             Function: Calculates Watterson's theta from the sample size
767             and the number of segregating sites.
768             Providing the third parameter, total number of sites will
769             return theta per site.
770             This is also known as K-hat = K / a_n
771             Returns : decimal number
772             Args : sample size (integer),
773             num segregating sites (integer)
774             total sites (integer) [optional] (to calculate theta per site)
775             OR
776             provide an arrayref of the L objects
777             total sites (integer) [optional] (to calculate theta per site)
778             OR
779             provide an L object
780             total sites (integer)[optional]
781              
782             =cut
783              
784             #'
785              
786             sub theta {
787 7     7 1 687 my $self = shift;
788 7         10 my ( $n, $seg_sites,$totalsites) = @_;
789 7 100 66     68 if( ref($n) =~ /ARRAY/i ) {
    100          
790 2         2 my $samps = $n;
791 2         1 $totalsites = $seg_sites; # only 2 arguments if one is an array
792 2         2 my %data;
793 2         5 my @marker_names = $samps->[0]->get_marker_names;
794             # we need to calculate number of polymorphic sites
795 2         6 $seg_sites = $self->segregating_sites_count($samps);
796 2         3 $n = scalar @$samps;
797              
798             } elsif(ref($n) &&
799             $n->isa('Bio::PopGen::PopulationI') ) {
800             # This will handle the case when we pass in a PopulationI object
801 4         4 my $pop = $n;
802 4         6 $totalsites = $seg_sites; # shift the arguments over by one
803 4         14 $n = $pop->haploid_population->get_number_individuals;
804 4         10 $seg_sites = $self->segregating_sites_count($pop);
805             }
806 7         8 my $a1 = 0;
807 7         18 for(my $k= 1; $k < $n; $k++ ) {
808 202         242 $a1 += ( 1 / $k );
809             }
810 7 100       12 if( $totalsites ) { # 0 and undef are the same can't divide by them
811 2         3 $seg_sites /= $totalsites;
812             }
813 7 50       14 if( $a1 == 0 ) {
814 0         0 return 0;
815             }
816 7         60 return $seg_sites / $a1;
817             }
818              
819             =head2 singleton_count
820              
821             Title : singleton_count
822             Usage : my ($singletons) = Bio::PopGen::Statistics->singleton_count(\@inds)
823             Function: Calculate the number of mutations/alleles which only occur once in
824             a list of individuals for all sites/markers
825             Returns : (integer) number of alleles which only occur once (integer)
826             Args : arrayref of L objects
827             OR
828             L object
829              
830             =cut
831              
832             sub singleton_count {
833 7     7 1 454 my ($self,$individuals) = @_;
834              
835 7         5 my @inds;
836 7 100 33     36 if( ref($individuals) =~ /ARRAY/ ) {
    50          
837 5         9 @inds = @$individuals;
838             } elsif( ref($individuals) &&
839             $individuals->isa('Bio::PopGen::PopulationI') ) {
840 2         2 my $pop = $individuals;
841 2         4 @inds = $pop->get_Individuals();
842 2 50       3 unless( @inds ) {
843 0         0 $self->warn("Need to provide a population which has individuals loaded, not just a population with allele frequencies");
844 0         0 return 0;
845             }
846             } else {
847 0         0 $self->warn("Expected either a PopulationI object or an arrayref of IndividualI objects");
848 0         0 return 0;
849             }
850             # find number of sites where a particular allele is only seen once
851              
852 7         10 my ($singleton_allele_ct,%sites) = (0);
853             # first collect all the alleles into a hash structure
854            
855 7         10 foreach my $n ( @inds ) {
856 35 50       84 if( ! $n->isa('Bio::PopGen::IndividualI') ) {
857 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
858 0         0 return 0;
859             }
860 35         44 foreach my $g ( $n->get_Genotypes ) {
861 1600         1702 my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
862 1600         1508 foreach my $allele (@alleles ) {
863 1600         2106 $sites{$nm}->{$allele}++;
864             }
865             }
866             }
867 7         22 foreach my $site ( values %sites ) { # don't really care what the name is
868 320         278 foreach my $allelect ( values %$site ) { #
869             # find the sites which have an allele with only 1 copy
870 636 100       724 $singleton_allele_ct++ if( $allelect == 1 );
871             }
872             }
873 7         67 return $singleton_allele_ct;
874             }
875              
876             # Yes I know that singleton_count and segregating_sites_count are
877             # basically processing the same data so calling them both is
878             # redundant, something I want to fix later but want to make things
879             # correct and simple first
880              
881             =head2 segregating_sites_count
882              
883             Title : segregating_sites_count
884             Usage : my $segsites = Bio::PopGen::Statistics->segregating_sites_count
885             Function: Gets the number of segregating sites (number of polymorphic sites)
886             Returns : (integer) number of segregating sites
887             Args : arrayref of L objects
888             OR
889             L object
890              
891             =cut
892              
893             # perhaps we'll change this in the future
894             # to return the actual segregating sites
895             # so one can use this to pull in the names of those sites.
896             # Would be trivial if it is useful.
897              
898             sub segregating_sites_count {
899 31     31 1 490 my ($self,$individuals) = @_;
900 31         40 my $type = ref($individuals);
901 31         29 my $seg_sites = 0;
902 31 100 33     175 if( $type =~ /ARRAY/i ) {
    50          
903 15         14 my %sites;
904 15         20 foreach my $n ( @$individuals ) {
905 75 50       173 if( ! $n->isa('Bio::PopGen::IndividualI') ) {
906 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
907 0         0 return 0;
908             }
909 75         105 foreach my $g ( $n->get_Genotypes ) {
910 3225         3509 my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
911 3225         3159 foreach my $allele (@alleles ) {
912 3225         4114 $sites{$nm}->{$allele}++;
913             }
914             }
915             }
916 15         47 foreach my $site ( values %sites ) { # use values b/c we don't
917             # really care what the name is
918             # find the sites which >1 allele
919 645 100       902 $seg_sites++ if( keys %$site > 1 );
920             }
921             } elsif( $type && $individuals->isa('Bio::PopGen::PopulationI') ) {
922 16         43 foreach my $marker ( $individuals->haploid_population->get_Markers ) {
923 163         217 my @alleles = $marker->get_Alleles;
924 163 100       281 $seg_sites++ if ( scalar @alleles > 1 );
925             }
926             } else {
927 0         0 $self->warn("segregating_sites_count expects either a PopulationI object or a list of IndividualI objects");
928 0         0 return 0;
929             }
930 31         108 return $seg_sites;
931             }
932              
933              
934             =head2 heterozygosity
935              
936             Title : heterozygosity
937             Usage : my $het = Bio::PopGen::Statistics->heterozygosity($sampsize,$freq1);
938             Function: Calculate the heterozgosity for a sample set for a set of alleles
939             Returns : decimal number
940             Args : sample size (integer)
941             frequency of one allele (fraction - must be less than 1)
942             [optional] frequency of another allele - this is only needed
943             in a non-binary allele system
944              
945             Note : p^2 + 2pq + q^2
946              
947             =cut
948              
949              
950             sub heterozygosity {
951 0     0 1 0 my ($self,$samp_size, $freq1,$freq2) = @_;
952 0 0       0 if( ! $freq2 ) { $freq2 = 1 - $freq1 }
  0         0  
953 0 0 0     0 if( $freq1 > 1 || $freq2 > 1 ) {
954 0         0 $self->warn("heterozygosity expects frequencies to be less than 1");
955             }
956 0         0 my $sum = ($freq1**2) + (($freq2)**2);
957 0         0 my $h = ( $samp_size*(1- $sum) ) / ($samp_size - 1) ;
958 0         0 return $h;
959             }
960              
961              
962             =head2 derived_mutations
963              
964             Title : derived_mutations
965             Usage : my $ext = Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup);
966             Function: Calculate the number of alleles or (mutations) which are ancestral
967             and the number which are derived (occurred only on the tips)
968             Returns : array of 2 items - number of external and internal derived
969             mutation
970             Args : ingroup - Ls arrayref OR
971             L
972             outgroup- Ls arrayref OR
973             L OR
974             a single L
975              
976             =cut
977              
978             sub derived_mutations {
979 10     10 1 13 my ($self,$ingroup,$outgroup) = @_;
980 10         10 my (%indata,%outdata,@marker_names);
981              
982             # basically we have to do some type checking
983             # if that perl were typed...
984 10         16 my ($itype,$otype) = (ref($ingroup),ref($outgroup));
985              
986 10 50       15 return $outgroup unless( $otype ); # we expect arrayrefs or objects, nums
987             # are already the value we
988             # are searching for
989             # pick apart the ingroup
990             # get the data
991 10 100 33     63 if( ref($ingroup) =~ /ARRAY/i ) {
    50          
992 4 50 33     19 if( ! ref($ingroup->[0]) ||
993             ! $ingroup->[0]->isa('Bio::PopGen::IndividualI') ) {
994 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for ingroup in external_mutations");
995 0         0 return 0;
996             }
997             # we assume that all individuals have the same markers
998             # i.e. that they are aligned
999 4         8 @marker_names = $ingroup->[0]->get_marker_names;
1000 4         6 for my $ind ( @$ingroup ) {
1001 20         18 for my $m ( @marker_names ) {
1002 100         112 for my $allele ( map { $_->get_Alleles }
  100         122  
1003             $ind->get_Genotypes($m) ) {
1004 100         138 $indata{$m}->{$allele}++;
1005             }
1006             }
1007             }
1008             } elsif( ref($ingroup) && $ingroup->isa('Bio::PopGen::PopulationI') ) {
1009 6         14 @marker_names = $ingroup->get_marker_names;
1010 6         15 for my $ind ( $ingroup->haploid_population->get_Individuals() ) {
1011 30         24 for my $m ( @marker_names ) {
1012 150         176 for my $allele ( map { $_->get_Alleles}
  150         183  
1013             $ind->get_Genotypes($m) ) {
1014 150         221 $indata{$m}->{$allele}++;
1015             }
1016             }
1017             }
1018             } else {
1019 0         0 $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for ingroup in external_mutations");
1020 0         0 return 0;
1021             }
1022            
1023 10 100       29 if( $otype =~ /ARRAY/i ) {
    50          
1024 5 50 33     26 if( ! ref($outgroup->[0]) ||
1025             ! $outgroup->[0]->isa('Bio::PopGen::IndividualI') ) {
1026 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for outgroup in external_mutations");
1027 0         0 return 0;
1028             }
1029 5         6 for my $ind ( @$outgroup ) {
1030 5         24 for my $m ( @marker_names ) {
1031 25         32 for my $allele ( map { $_->get_Alleles }
  25         35  
1032             $ind->get_Genotypes($m) ) {
1033 25         47 $outdata{$m}->{$allele}++;
1034             }
1035             }
1036             }
1037            
1038             } elsif( $otype->isa('Bio::PopGen::PopulationI') ) {
1039 5         10 for my $ind ( $outgroup->haploid_population->get_Individuals() ) {
1040 5         7 for my $m ( @marker_names ) {
1041 25         36 for my $allele ( map { $_->get_Alleles}
  25         36  
1042             $ind->get_Genotypes($m) ) {
1043 25         43 $outdata{$m}->{$allele}++;
1044             }
1045             }
1046             }
1047             } else {
1048 0         0 $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for outgroup in external_mutations");
1049 0         0 return 0;
1050             }
1051            
1052             # derived mutations are defined as
1053             #
1054             # ingroup (G A T)
1055             # outgroup (A)
1056             # derived mutations are G and T, A is the external mutation
1057            
1058             # ingroup (A T)
1059             # outgroup (C)
1060             # derived mutations A,T no external/ancestral mutations
1061            
1062             # ingroup (G A T)
1063             # outgroup (A T)
1064             # cannot determine
1065            
1066 10         14 my ($internal,$external);
1067 10         13 foreach my $marker ( @marker_names ) {
1068 50         33 my @outalleles = keys %{$outdata{$marker}};
  50         85  
1069 50         42 my @in_alleles = keys %{$indata{$marker}};
  50         68  
1070 50 100 66     145 next if( @outalleles > 1 || @in_alleles == 1);
1071 40         33 for my $allele ( @in_alleles ) {
1072 80 100       114 if( ! exists $outdata{$marker}->{$allele} ) {
1073 40 100       45 if( $indata{$marker}->{$allele} == 1 ) {
1074 10         10 $external++;
1075             } else {
1076 30         28 $internal++;
1077             }
1078             }
1079             }
1080             }
1081 10         52 return ($external, $internal);
1082             }
1083              
1084              
1085             =head2 composite_LD
1086              
1087             Title : composite_LD
1088             Usage : %matrix = Bio::PopGen::Statistics->composite_LD($population);
1089             Function: Calculate the Linkage Disequilibrium
1090             This is for calculating LD for unphased data.
1091             Other methods will be appropriate for phased haplotype data.
1092              
1093             Returns : Hash of Hashes - first key is site 1,second key is site 2
1094             and value is LD for those two sites.
1095             my $LDarrayref = $matrix{$site1}->{$site2};
1096             my ($ldval, $chisquared) = @$LDarrayref;
1097             Args : L or arrayref of
1098             Ls
1099             Reference: Weir B.S. (1996) "Genetic Data Analysis II",
1100             Sinauer, Sunderlanm MA.
1101              
1102             =cut
1103              
1104             sub composite_LD {
1105 2     2 1 7 my ($self,$pop) = @_;
1106 2 50 33     17 if( ref($pop) =~ /ARRAY/i ) {
    50          
1107 0 0 0     0 if( ref($pop->[0]) && $pop->[0]->isa('Bio::PopGen::IndividualI') ) {
1108 0         0 $pop = Bio::PopGen::Population->new(-individuals => @$pop);
1109             } else {
1110 0         0 $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1111 0         0 return ();
1112             }
1113             } elsif( ! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
1114 0         0 $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1115 0         0 return ();
1116             }
1117              
1118 2         247 my @marker_names = $pop->get_marker_names;
1119 2         5 my @inds = $pop->get_Individuals;
1120 2         3 my $num_inds = scalar @inds;
1121 2         1 my (%lookup);
1122             # calculate allele frequencies for each marker from the population
1123             # use the built-in get_Marker to get the allele freqs
1124             # we still need to calculate the genotype frequencies
1125 2         3 foreach my $marker_name ( @marker_names ) {
1126 5         5 my(%allelef);
1127              
1128 5         5 foreach my $ind ( @inds ) {
1129 76         124 my ($genotype) = $ind->get_Genotypes(-marker => $marker_name);
1130 76 50       120 if( ! defined $genotype ) {
1131 0         0 $self->warn("no genotype for marker $marker_name for individual ". $ind->unique_id. "\n");
1132 0         0 next;
1133             }
1134 76         96 my @alleles = sort $genotype->get_Alleles;
1135 76 100       117 next if( scalar @alleles != 2);
1136 73         76 my $genostr = join(',', @alleles);
1137 73         55 $allelef{$alleles[0]}++;
1138 73         86 $allelef{$alleles[1]}++;
1139             }
1140              
1141             # we should check for cases where there > 2 alleles or
1142             # only 1 allele and throw out those markers.
1143 5         11 my @alleles = sort keys %allelef;
1144 5         4 my $allele_count = scalar @alleles;
1145             # test if site is polymorphic
1146 5 50       9 if( $allele_count != 2) {
1147             # only really warn if we're seeing multi-allele
1148 0 0       0 $self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2;
1149 0         0 next; # skip this marker
1150             }
1151              
1152             # Need to do something here to detect alleles which aren't
1153             # a single character
1154 5 50 33     19 if( length($alleles[0]) != 1 ||
1155             length($alleles[1]) != 1 ) {
1156 0         0 $self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character");
1157 0         0 next;
1158             }
1159              
1160             # fix the call for allele 1 (A or B) and
1161             # allele 2 (a or b) in terms of how we'll do the
1162             # N square from Weir p.126
1163 5         16 $self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n");
1164 5         10 $lookup{$marker_name}->{'1'} = $alleles[0];
1165 5         11 $lookup{$marker_name}->{'2'} = $alleles[1];
1166             }
1167              
1168 2         5 @marker_names = sort keys %lookup;
1169 2         2 my $site_count = scalar @marker_names;
1170             # where the final data will be stored
1171 2         3 my %stats_for_sites;
1172              
1173             # standard way of generating pairwise combos
1174             # LD is done by comparing all the pairwise site (marker)
1175             # combinations and keeping track of the genotype and
1176             # pairwise genotype (ie genotypes of the 2 sites) frequencies
1177 2         4 for( my $i = 0; $i < $site_count - 1; $i++ ) {
1178 3         4 my $site1 = $marker_names[$i];
1179              
1180 3         6 for( my $j = $i+1; $j < $site_count ; $j++) {
1181 4         3 my (%genotypes, %total_genotype_count,$total_pairwisegeno_count,
1182             %pairwise_genotypes);
1183            
1184 4         4 my $site2 = $marker_names[$j];
1185 4         8 my (%allele_count,%allele_freqs) = (0,0);
1186 4         5 foreach my $ind ( @inds ) {
1187             # build string of genotype at site 1
1188 53         82 my ($genotype1) = $ind->get_Genotypes(-marker => $site1);
1189 53         75 my @alleles1 = sort $genotype1->get_Alleles;
1190              
1191             # if an individual has only one available allele
1192             # (has a blank or N for one of the chromosomes)
1193             # we don't want to use it in our calculation
1194              
1195 53 100       78 next unless( scalar @alleles1 == 2);
1196 52         58 my $genostr1 = join(',', @alleles1);
1197              
1198             # build string of genotype at site 2
1199 52         70 my ($genotype2) = $ind->get_Genotypes(-marker => $site2);
1200 52         72 my @alleles2 = sort $genotype2->get_Alleles;
1201 52         62 my $genostr2 = join(',', @alleles2);
1202            
1203 52 100       61 next unless( scalar @alleles2 == 2);
1204 50         48 for (@alleles1) {
1205 100         74 $allele_count{$site1}++;
1206 100         84 $allele_freqs{$site1}->{$_}++;
1207             }
1208 50         44 $genotypes{$site1}->{$genostr1}++;
1209 50         25 $total_genotype_count{$site1}++;
1210              
1211 50         42 for (@alleles2) {
1212 100         67 $allele_count{$site2}++;
1213 100         76 $allele_freqs{$site2}->{$_}++;
1214             }
1215 50         37 $genotypes{$site2}->{$genostr2}++;
1216 50         33 $total_genotype_count{$site2}++;
1217              
1218             # We are using the $site1,$site2 to signify
1219             # a unique key
1220 50         63 $pairwise_genotypes{"$genostr1,$genostr2"}++;
1221             # some individuals
1222 50         57 $total_pairwisegeno_count++;
1223             }
1224 4         8 for my $site ( %allele_freqs ) {
1225 16         9 for my $al ( keys %{ $allele_freqs{$site} } ) {
  16         30  
1226 16         20 $allele_freqs{$site}->{$al} /= $allele_count{$site};
1227             }
1228             }
1229 4         3 my $n = $total_pairwisegeno_count; # number of pairs of comparisons
1230             # 'A' and 'B' are two loci or in our case site1 and site2
1231 4         6 my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele
1232 4         8 my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele
1233 4         3 my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele
1234 4         4 my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele
1235             # AABB
1236 4         5 my $N1genostr = join(",",( $allele1_site1, $allele1_site1,
1237             $allele1_site2, $allele1_site2));
1238 4         12 $self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n");
1239             # AABb
1240 4         6 my $N2genostr = join(",",( $allele1_site1, $allele1_site1,
1241             $allele1_site2, $allele2_site2));
1242 4         9 $self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n");
1243             # AaBB
1244 4         6 my $N4genostr = join(",",( $allele1_site1, $allele2_site1,
1245             $allele1_site2, $allele1_site2));
1246 4         9 $self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n");
1247             # AaBb
1248 4         5 my $N5genostr = join(",",( $allele1_site1, $allele2_site1,
1249             $allele1_site2, $allele2_site2));
1250 4         10 $self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n");
1251             # count of AABB in
1252 4   50     7 my $n1 = $pairwise_genotypes{$N1genostr} || 0;
1253             # count of AABb in
1254 4   100     10 my $n2 = $pairwise_genotypes{$N2genostr} || 0;
1255             # count of AaBB in
1256 4   100     9 my $n4 = $pairwise_genotypes{$N4genostr} || 0;
1257             # count of AaBb in
1258 4   100     16 my $n5 = $pairwise_genotypes{$N5genostr} || 0;
1259              
1260 4         5 my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1));
1261 4         3 my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2));
1262 4   50     9 my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n;
1263 4   50     10 my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n;
1264 4   50     6 my $p_A = $allele_freqs{$site1}->{$allele1_site1} || 0; # an individual allele freq
1265 4         4 my $p_a = 1 - $p_A;
1266              
1267 4   50     8 my $p_B = $allele_freqs{$site2}->{$allele1_site2} || 0; # an individual allele freq
1268 4         2 my $p_b = 1 - $p_B;
1269              
1270             # variance of allele frequencies
1271 4         6 my $pi_A = $p_A * $p_a;
1272 4         4 my $pi_B = $p_B * $p_b;
1273              
1274             # hardy weinberg
1275 4         5 my $D_A = $p_AA - $p_A**2;
1276 4         5 my $D_B = $p_BB - $p_B**2;
1277 4         6 my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5;
1278 4         18 $self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n");
1279              
1280 4         7 my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B );
1281 4         36 $self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n");
1282 4         36 $self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n",
1283             $n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B));
1284            
1285 4         5 my $chisquared;
1286 4         4 eval { $chisquared = ( $n * ($delta_AB**2) ) /
  4         7  
1287             ( ( $pi_A + $D_A) * ( $pi_B + $D_B) );
1288             };
1289 4 50       6 if( $@ ) {
1290 0         0 $self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n");
1291 0         0 next;
1292             }
1293             # this will be an upper triangular matrix
1294 4         37 $stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared];
1295             }
1296             }
1297 2         12 return %stats_for_sites;
1298             }
1299              
1300             =head2 mcdonald_kreitman
1301              
1302             Title : mcdonald_kreitman
1303             Usage : $Fstat = mcdonald_kreitman($ingroup, $outgroup);
1304             Function: Calculates McDonald-Kreitman statistic based on a set of ingroup
1305             individuals and an outgroup by computing the number of
1306             differences at synonymous and non-synonymous sites
1307             for intraspecific comparisons and with the outgroup
1308             Returns : 2x2 table, followed by a hash reference indicating any
1309             warning messages about the status of the alleles or codons
1310             Args : -ingroup => L object or
1311             arrayref of Ls
1312             -outgroup => L object or
1313             arrayef of Ls
1314             -polarized => Boolean, to indicate if this should be
1315             a polarized test. Must provide two individuals
1316             as outgroups.
1317              
1318             =cut
1319              
1320             sub mcdonald_kreitman {
1321 6     6 1 4983 my ($self,@args) = @_;
1322 6         35 my ($ingroup, $outgroup,$polarized) =
1323             $self->_rearrange([qw(INGROUP OUTGROUP POLARIZED)],@args);
1324 6         30 my $verbose = $self->verbose;
1325 6         9 my $outgroup_count;
1326 6         8 my $gapchar = '\-';
1327 6 50       30 if( ref($outgroup) =~ /ARRAY/i ) {
    0          
1328 6         10 $outgroup_count = scalar @$outgroup;
1329             } elsif( UNIVERSAL::isa($outgroup,'Bio::PopGen::PopulationI') ) {
1330 0         0 $outgroup_count = $outgroup->get_number_individuals;
1331             } else {
1332 0         0 $self->throw("Expected an ArrayRef of Individuals OR a Bio::PopGen::PopulationI");
1333             }
1334            
1335 6 100       24 if( $polarized ) {
    50          
    50          
1336 2 50       8 if( $outgroup_count < 2 ) {
1337 0         0 $self->throw("Need 2 outgroups with polarized option\n");
1338             }
1339             } elsif( $outgroup_count > 1 ) {
1340 0         0 $self->warn(sprintf("%s outgroup sequences provided, but only first will be used",$outgroup_count ));
1341             } elsif( $outgroup_count == 0 ) {
1342 0         0 $self->throw("No outgroup sequence provided");
1343             }
1344            
1345 6         38 my $codon_path = Bio::MolEvol::CodonModel->codon_path;
1346            
1347 6         524 my (%marker_names,%unique,@inds);
1348 6         12 for my $p ( $ingroup, $outgroup) {
1349 12 50       42 if( ref($p) =~ /ARRAY/i ) {
1350 12         27 push @inds, @$p;
1351             } else {
1352 0         0 push @inds, $p->get_Individuals;
1353             }
1354             }
1355 6         12 for my $i ( @inds ) {
1356 35 50       89 if( $unique{$i->unique_id}++ ) {
1357 0         0 $self->warn("Individual ". $i->unique_id. " is seen more than once in the ingroup or outgroup set\n");
1358             }
1359 35         61 for my $n ( $i->get_marker_names ) {
1360 14126         9605 $marker_names{$n}++;
1361             }
1362             }
1363              
1364 6         335 my @marker_names = keys %marker_names;
1365 6 50       81 if( $marker_names[0] =~ /^(Site|Codon)/ ) {
1366             # sort by site or codon number and do it in
1367             # a schwartzian transformation baby!
1368 0         0 @marker_names = map { $_->[1] }
1369 0         0 sort { $a->[0] <=> $b->[0] }
1370 0         0 map { [$_ =~ /^(?:Codon|Site)-(\d+)/, $_] } @marker_names;
  0         0  
1371             }
1372              
1373              
1374 6         12 my $num_inds = scalar @inds;
1375 6         21 my %vals = ( 'ingroup' => $ingroup,
1376             'outgroup' => $outgroup,
1377             );
1378              
1379             # Make the Codon Table type a parameter!
1380 6         45 my $table = Bio::Tools::CodonTable->new(-id => $codon_table);
1381 6         15 my @vt = qw(outgroup ingroup);
1382 6         5 my %changes;
1383             my %status;
1384 6         19 my %two_by_two = ( 'fixed_N' => 0,
1385             'fixed_S' => 0,
1386             'poly_N' => 0,
1387             'poly_S' => 0);
1388              
1389 6         12 for my $codon ( @marker_names ) {
1390 2436         1700 my (%codonvals);
1391             my %all_alleles;
1392 2436         2119 for my $t ( @vt ) {
1393 4872         3461 my $outcount = 1;
1394 4872         3395 for my $ind ( @{$vals{$t}} ) {
  4872         5995  
1395 14126         21196 my @alleles = $ind->get_Genotypes($codon)->get_Alleles;
1396 14126 50       18456 if( @alleles > 2 ) {
1397 0         0 warn("Codon $codon saw ", scalar @alleles, " alleles for ind ", $ind->unique_id, "\n");
1398 0         0 die;
1399             } else {
1400 14126         10579 my ($allele) = shift @alleles;
1401 14126         20793 $all_alleles{$ind->unique_id} = $allele;
1402 14126         20428 my $AA = $table->translate($allele);
1403 14126 50 66     53198 next if( $AA eq 'X' || $AA eq '*' || $allele =~ /N/i);
      66        
1404              
1405 8971         6552 my $label = $t;
1406 8971 100       10452 if( $t eq 'outgroup' ) {
1407 3244         3478 $label = $t.$outcount++;
1408             }
1409 8971         10793 $codonvals{$label}->{$allele}++;
1410 8971         14880 $codonvals{all}->{$allele}++;
1411             }
1412             }
1413             }
1414 2436         1874 my $total = sum ( values %{$codonvals{'ingroup'}} );
  2436         5497  
1415 2436 100 100     8190 next if( $total && $total < 2 ); # skip sites with < alleles
1416             # process all the seen alleles (codons)
1417             # this is a vertical slide through the alignment
1418 1626 100       1094 if( keys %{$codonvals{all}} <= 1 ) {
  1626         4748  
1419             # no changes or no VALID codons - monomorphic
1420             } else {
1421             # grab only the first outgroup codon (what to do with rest?)
1422 278         173 my ($outcodon) = keys %{$codonvals{'outgroup1'}};
  278         376  
1423 278 50       397 if( ! $outcodon ) {
1424 0         0 $status{"no outgroup codon $codon"}++;
1425 0         0 next;
1426             }
1427 278         385 my $out_AA = $table->translate($outcodon);
1428 278         260 my ($outcodon2) = keys %{$codonvals{'outgroup2'}};
  278         441  
1429 278 50 100     1263 if( ($polarized && ($outcodon ne $outcodon2)) ||
      66        
      66        
1430             $out_AA eq 'X' || $out_AA eq '*' ) {
1431             # skip if outgroup codons are different
1432             # (when polarized option is on)
1433             # or skip if the outcodon is STOP or 'NNN'
1434 90 50       125 if( $verbose > 0 ) {
1435 0         0 $self->debug("skipping $out_AA and $outcodon $outcodon2\n");
1436             }
1437 90         73 $status{'outgroup codons different'}++;
1438 90         279 next;
1439             }
1440              
1441             # check if ingroup is actually different from outgroup -
1442             # if there are the same number of alleles when considering
1443             # ALL or just the ingroup, then there is nothing new seen
1444             # in the outgroup so it must be a shared allele (codon)
1445              
1446             # so we just count how many total alleles were seen
1447             # if this is the same as the number of alleles seen for just
1448             # the ingroup then the outgroup presents no new information
1449              
1450 188         127 my @ingroup_codons = keys %{$codonvals{'ingroup'}};
  188         303  
1451 188         234 my $diff_from_out = ! exists $codonvals{'ingroup'}->{$outcodon};
1452              
1453 188 50       232 if( $verbose > 0 ) {
1454             $self->debug("alleles are in: ", join(",", @ingroup_codons),
1455 0         0 " out: ", join(",", keys %{$codonvals{outgroup1}}),
  0         0  
1456             " diff_from_out=$diff_from_out\n");
1457              
1458 0         0 for my $ind ( sort keys %all_alleles ) {
1459 0         0 $self->debug( "$ind\t$all_alleles{$ind}\n");
1460             }
1461             }
1462             # are all the ingroup alleles the same and diferent from outgroup?
1463             # fixed differences between species
1464 188 100       216 if( $diff_from_out ) {
1465 99 100       129 if( scalar @ingroup_codons == 1 ) {
1466             # fixed differences
1467 92 50       385 if( $outcodon =~ /^$gapchar/ ) {
    50          
1468 0         0 $status{'outgroup codons with gaps'}++;
1469 0         0 next;
1470             } elsif( $ingroup_codons[0] =~ /$gapchar/) {
1471 0         0 $status{'ingroup codons with gaps'}++;
1472 0         0 next;
1473             }
1474 92         166 my $path = $codon_path->{uc $ingroup_codons[0].$outcodon};
1475 92         102 $two_by_two{fixed_N} += $path->[0];
1476 92         86 $two_by_two{fixed_S} += $path->[1];
1477 92 50       376 if( $verbose > 0 ) {
1478 0         0 $self->debug("ingroup is @ingroup_codons outcodon is $outcodon\n");
1479 0         0 $self->debug("path is ",join(",",@$path),"\n");
1480             $self->debug
1481             (sprintf("%-15s fixeddiff - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,$ingroup_codons[0], $outcodon,$out_AA,
1482 0         0 @$path, map { $two_by_two{$_} }
  0         0  
1483             qw(fixed_N fixed_S poly_N poly_S)));
1484             }
1485             } else {
1486             # polymorphic and all are different from outgroup
1487             # Here we find the minimum number of NS subst
1488 7         10 my ($Ndiff,$Sdiff) = (3,0); # most different path
1489 7         8 for my $c ( @ingroup_codons ) {
1490 14 50 33     79 next if( $c =~ /$gapchar/ || $outcodon =~ /$gapchar/);
1491 14         24 my $path = $codon_path->{uc $c.$outcodon};
1492 14         16 my ($tNdiff,$tSdiff) = @$path;
1493 14 100 100     46 if( $path->[0] < $Ndiff ||
      66        
1494             ($tNdiff == $Ndiff &&
1495             $tSdiff <= $Sdiff)) {
1496 11         15 ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1497             }
1498             }
1499 7         10 $two_by_two{fixed_N} += $Ndiff;
1500 7         7 $two_by_two{fixed_S} += $Sdiff;
1501 7 50       11 if( @ingroup_codons > 2 ) {
1502 0         0 $status{"more than 2 ingroup codons $codon"}++;
1503 0         0 warn("more than 2 ingroup codons (@ingroup_codons)\n");
1504             } else {
1505 7         16 my $path = $codon_path->{uc join('',@ingroup_codons)};
1506              
1507 7         10 $two_by_two{poly_N} += $path->[0];
1508 7         6 $two_by_two{poly_S} += $path->[1];
1509 7 50       33 if( $verbose > 0 ) {
1510 0         0 $self->debug(sprintf("%-15s polysite_all - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,@$path, map { $two_by_two{$_} } qw(fixed_N fixed_S poly_N poly_S)));
  0         0  
1511             }
1512             }
1513             }
1514             } else {
1515 89         98 my %unq = map { $_ => 1 } @ingroup_codons;
  181         273  
1516 89         97 delete $unq{$outcodon};
1517 89         113 my @unique_codons = keys %unq;
1518              
1519             # calc path for diff add to poly
1520             # Here we find the minimum number of subst bw
1521             # codons
1522 89         84 my ($Ndiff,$Sdiff) = (3,0); # most different path
1523 89         84 for my $c ( @unique_codons ) {
1524 92         159 my $path = $codon_path->{uc $c.$outcodon };
1525 92 50       138 if( ! defined $path ) {
1526 0         0 die " cannot get path for ", $c.$outcodon, "\n";
1527             }
1528 92         87 my ($tNdiff,$tSdiff) = @$path;
1529 92 50 33     163 if( $path->[0] < $Ndiff ||
      66        
1530             ($tNdiff == $Ndiff &&
1531             $tSdiff <= $Sdiff)) {
1532 92         127 ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1533             }
1534             }
1535              
1536 89 100       139 if( @unique_codons == 2 ) {
1537 3         8 my $path = $codon_path->{uc join('',@unique_codons)};
1538 3 50       8 if( ! defined $path ) {
1539 0         0 $self->throw("no path for @unique_codons\n");
1540             }
1541 3         5 $Ndiff += $path->[0];
1542 3         3 $Sdiff += $path->[1];
1543             }
1544 89         91 $two_by_two{poly_N} += $Ndiff;
1545 89         77 $two_by_two{poly_S} += $Sdiff;
1546 89 50       374 if( $verbose > 0 ) {
1547             $self->debug(sprintf("%-15s polysite - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,
1548 0         0 $Ndiff, $Sdiff, map { $two_by_two{$_} }
  0         0  
1549             qw(fixed_N fixed_S poly_N poly_S)));
1550             }
1551             }
1552             }
1553             }
1554             return ( $two_by_two{'poly_N'},
1555             $two_by_two{'fixed_N'},
1556             $two_by_two{'poly_S'},
1557 6         83 $two_by_two{'fixed_S'},
1558             {%status});
1559            
1560             }
1561              
1562             *MK = \&mcdonald_kreitman;
1563              
1564              
1565             =head2 mcdonald_kreitman_counts
1566              
1567             Title : mcdonald_kreitman_counts
1568             Usage : my $MK = $statistics->mcdonald_kreitman_counts(
1569              
1570             N_poly -> integer of count of non-syn polymorphism
1571             N_fix -> integer of count of non-syn fixed substitutions
1572             S_poly -> integer of count of syn polymorphism
1573             S_fix -> integer of count of syn fixed substitutions
1574             );
1575             Function:
1576             Returns : decimal number
1577             Args :
1578              
1579             =cut
1580              
1581              
1582             sub mcdonald_kreitman_counts {
1583 0     0 1   my ($self,$Npoly,$Nfix,$Spoly,$Sfix) = @_;
1584 0 0         if( $has_twotailed ) {
1585 0           return &Text::NSP::Measures::2D::Fisher2::twotailed::calculateStatistic
1586             (n11=>$Npoly,
1587             n1p=>$Npoly+$Spoly,
1588             np1=>$Npoly+$Nfix,
1589             npp=>$Npoly+$Nfix+$Spoly+$Sfix);
1590             } else {
1591 0           $self->warn("cannot call mcdonald_kreitman_counts because no Fisher's exact is available - install Text::NSP::Measures::2D::Fisher2::twotailed");
1592 0           return 0;
1593             }
1594             }
1595              
1596              
1597             1;