File Coverage

Bio/PopGen/Statistics.pm
Criterion Covered Total %
statement 482 589 81.8
branch 123 192 64.0
condition 52 100 52.0
subroutine 23 25 92.0
pod 19 19 100.0
total 699 925 75.5


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::PopGen::Statistics
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::PopGen::Statistics - Population Genetics statistical tests
17              
18             =head1 SYNOPSIS
19              
20             use Bio::PopGen::Statistics;
21             use Bio::AlignIO;
22             use Bio::PopGen::IO;
23             use Bio::PopGen::Simulation::Coalescent;
24              
25             my $sim = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 12);
26              
27             my $tree = $sim->next_tree;
28              
29             $sim->add_Mutations($tree,20);
30              
31             my $stats = Bio::PopGen::Statistics->new();
32             my $individuals = [ $tree->get_leaf_nodes];
33             my $pi = $stats->pi($individuals);
34             my $D = $stats->tajima_D($individuals);
35              
36             # Alternatively to do this on input data from
37             # See the tests in t/PopGen.t for more examples
38             my $parser = Bio::PopGen::IO->new(-format => 'prettybase',
39             -file => 't/data/popstats.prettybase');
40             my $pop = $parser->next_population;
41             # Note that you can also call the stats as a class method if you like
42             # the only reason to instantiate it (as above) is if you want
43             # to set the verbosity for debugging
44             $pi = Bio::PopGen::Statistics->pi($pop);
45             $theta = Bio::PopGen::Statistics->theta($pop);
46              
47             # Pi and Theta also take additional arguments,
48             # see the documentation for more information
49              
50             use Bio::PopGen::Utilities;
51             use Bio::AlignIO;
52              
53             my $in = Bio::AlignIO->new(-file => 't/data/t7.aln',
54             -format => 'clustalw');
55             my $aln = $in->next_aln;
56             # get a population, each sequence is an individual and
57             # for the default case, every site which is not monomorphic
58             # is a 'marker'. Each individual will have a 'genotype' for the
59             # site which will be the specific base in the alignment at that
60             # site
61              
62             my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
63              
64              
65             =head1 DESCRIPTION
66              
67             This object is intended to provide implementations some standard
68             population genetics statistics about alleles in populations.
69              
70             This module was previously named Bio::Tree::Statistics.
71              
72             This object is a place to accumulate routines for calculating various
73             statistics from the coalescent simulation, marker/allele, or from
74             aligned sequence data given that you can calculate alleles, number of
75             segregating sites.
76              
77             Currently implemented:
78             Fu and Li's D (fu_and_li_D)
79             Fu and Li's D* (fu_and_li_D_star)
80             Fu and Li's F (fu_and_li_F)
81             Fu and Li's F* (fu_and_li_F_star)
82             Tajima's D (tajima_D)
83             Watterson's theta (theta)
84             pi (pi) - number of pairwise differences
85             composite_LD (composite_LD)
86             McDonald-Kreitman (mcdonald_kreitman or MK)
87              
88             Count based methods also exist in case you have already calculated the
89             key statistics (seg sites, num individuals, etc) and just want to
90             compute the statistic.
91              
92             In all cases where a the method expects an arrayref of
93             L objects and L
94             object will also work.
95              
96             =head2 REFERENCES
97              
98             Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of
99             Mutations." Genetics 133:693-709.
100              
101             Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples
102             from a Population." Genetics 143:557-570.
103              
104             McDonald J, Kreitman M.
105              
106             Tajima F. (1989) "Statistical method for testing the neutral mutation
107             hypothesis by DNA polymorphism." Genetics 123:585-595.
108              
109              
110             =head2 CITING THIS WORK
111              
112             Please see this reference for use of this implementation.
113              
114             Stajich JE and Hahn MW "Disentangling the Effects of Demography and Selection in Human History." (2005) Mol Biol Evol 22(1):63-73.
115              
116             If you use these Bio::PopGen modules please cite the Bioperl
117             publication (see FAQ) and the above reference.
118              
119              
120             =head1 FEEDBACK
121              
122             =head2 Mailing Lists
123              
124             User feedback is an integral part of the evolution of this and other
125             Bioperl modules. Send your comments and suggestions preferably to
126             the Bioperl mailing list. Your participation is much appreciated.
127              
128             bioperl-l@bioperl.org - General discussion
129             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
130              
131             =head2 Support
132              
133             Please direct usage questions or support issues to the mailing list:
134              
135             I
136              
137             rather than to the module maintainer directly. Many experienced and
138             reponsive experts will be able look at the problem and quickly
139             address it. Please include a thorough description of the problem
140             with code and data examples if at all possible.
141              
142             =head2 Reporting Bugs
143              
144             Report bugs to the Bioperl bug tracking system to help us keep track
145             of the bugs and their resolution. Bug reports can be submitted via
146             the web:
147              
148             https://github.com/bioperl/bioperl-live/issues
149              
150             =head1 AUTHOR - Jason Stajich, Matthew Hahn
151              
152             Email jason-at-bioperl-dot-org
153             Email matthew-dot-hahn-at-duke-dot-edu
154              
155             McDonald-Kreitman implementation based on work by Alisha Holloway at
156             UC Davis.
157              
158              
159             =head1 APPENDIX
160              
161             The rest of the documentation details each of the object methods.
162             Internal methods are usually preceded with a _
163              
164             =cut
165              
166              
167             # Let the code begin...
168              
169              
170             package Bio::PopGen::Statistics;
171 3     3   2696 use strict;
  3         5  
  3         106  
172             use constant {
173 3         258 in_label => 'ingroup',
174             out_label => 'outgroup',
175             non_syn => 'non_synonymous',
176             syn => 'synonymous',
177             default_codon_table => 1, # Standard Codon table
178 3     3   13 };
  3         4  
179              
180 3     3   21868 use Bio::MolEvol::CodonModel;
  3         7  
  3         102  
181 3     3   15 use List::Util qw(sum);
  3         5  
  3         268  
182              
183 3     3   16 use base qw(Bio::Root::Root);
  3         3  
  3         345  
184             our $codon_table => default_codon_table;
185             our $has_twotailed => 0;
186             BEGIN {
187 3     3   9 eval { require Text::NSP::Measures::2D::Fisher2::twotailed };
  3         216  
188 3 50       16 if( $@ ) { $has_twotailed = 0; }
  3         14920  
189 0         0 else { $has_twotailed = 1; }
190             }
191              
192              
193              
194              
195              
196              
197             =head2 new
198              
199             Title : new
200             Usage : my $obj = Bio::PopGen::Statistics->new();
201             Function: Builds a new Bio::PopGen::Statistics object
202             Returns : an instance of Bio::PopGen::Statistics
203             Args : none
204              
205              
206             =cut
207              
208              
209             =head2 fu_and_li_D
210              
211             Title : fu_and_li_D
212             Usage : my $D = $statistics->fu_and_li_D(\@ingroup,\@outgroup);
213             OR
214             my $D = $statistics->fu_and_li_D(\@ingroup,$extmutations);
215             Function: Fu and Li D statistic for a list of individuals
216             given an outgroup and the number of external mutations
217             (either provided or calculated from list of outgroup individuals)
218             Returns : decimal
219             Args : $individuals - array reference which contains ingroup individuals
220             (L or derived classes)
221             $extmutations - number of external mutations OR
222             arrayref of outgroup individuals
223              
224             =cut
225              
226             sub fu_and_li_D {
227 7     7 1 24 my ($self,$ingroup,$outgroup) = @_;
228              
229 7         13 my ($seg_sites,$n,$ancestral,$derived) = (0,0,0,0);
230 7 100 33     49 if( ref($ingroup) =~ /ARRAY/i ) {
    50          
231 4         9 $n = scalar @$ingroup;
232             # pi - all pairwise differences
233 4         9 $seg_sites = $self->segregating_sites_count($ingroup);
234             } elsif( ref($ingroup) &&
235             $ingroup->isa('Bio::PopGen::PopulationI')) {
236 3         8 $n = $ingroup->get_number_individuals;
237 3         5 $seg_sites = $self->segregating_sites_count($ingroup);
238             } else {
239 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D");
240 0         0 return 0;
241             }
242            
243 7 50       16 if( $seg_sites <= 0 ) {
244 0         0 $self->warn("mutation total was not > 0, cannot calculate a Fu and Li D");
245 0         0 return 0;
246             }
247              
248 7 50       19 if( ! defined $outgroup ) {
    100          
249 0         0 $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
250 0         0 return 0;
251             } elsif( ref($outgroup) ) {
252 4         9 ($ancestral,$derived) = $self->derived_mutations($ingroup,$outgroup);
253 4 50       8 $ancestral = 0 unless defined $ancestral;
254             } else {
255 3         5 $ancestral = $outgroup;
256             }
257            
258 7         24 return $self->fu_and_li_D_counts($n,$seg_sites,
259             $ancestral,$derived);
260             }
261              
262             =head2 fu_and_li_D_counts
263              
264             Title : fu_li_D_counts
265             Usage : my $D = $statistics->fu_and_li_D_counts($samps,$sites,
266             $external);
267             Function: Fu and Li D statistic for the raw counts of the number
268             of samples, sites, external and internal mutations
269             Returns : decimal number
270             Args : number of samples (N)
271             number of segregating sites (n)
272             number of external mutations (n_e)
273              
274             =cut
275              
276              
277             sub fu_and_li_D_counts {
278 8     8 1 167 my ($self,$n,$seg_sites, $external_mut) = @_;
279 8         10 my $a_n = 0;
280 8 50       21 if( $n <= 3 ) {
281 0         0 $self->warn("n is $n, too small, must be > 3\n");
282 0         0 return;
283             }
284 8         19 for(my $k= 1; $k < $n; $k++ ) {
285 51         86 $a_n += ( 1 / $k );
286             }
287 8         11 my $b = 0;
288 8         17 for(my $k= 1; $k < $n; $k++ ) {
289 51         74 $b += ( 1 / $k**2 );
290             }
291              
292 8         29 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
293             ( ( $n - 1) * ( $n - 2 ) ) );
294              
295 8         26 my $v = 1 + ( ( $a_n**2 / ( $b + $a_n**2 ) ) *
296             ( $c - ( ( $n + 1) /
297             ( $n - 1) ) ));
298            
299 8         11 my $u = $a_n - 1 - $v;
300              
301 8         81 ($seg_sites - $a_n * $external_mut) /
302             sqrt( ($u * $seg_sites) + ($v * $seg_sites*$seg_sites));
303            
304             }
305              
306              
307             =head2 fu_and_li_D_star
308              
309             Title : fu_and_li_D_star
310             Usage : my $D = $statistics->fu_an_li_D_star(\@individuals);
311             Function: Fu and Li's D* statistic for a set of samples
312             Without an outgroup
313             Returns : decimal number
314             Args : array ref of L objects
315             OR
316             L object
317              
318             =cut
319              
320             #'
321             # fu_and_li_D*
322              
323             sub fu_and_li_D_star {
324 3     3 1 511 my ($self,$individuals) = @_;
325              
326 3         7 my ($seg_sites,$n,$singletons);
327 3 100 33     22 if( ref($individuals) =~ /ARRAY/i ) {
    50          
328 2         4 $n = scalar @$individuals;
329 2         6 $seg_sites = $self->segregating_sites_count($individuals);
330 2         10 $singletons = $self->singleton_count($individuals);
331             } elsif( ref($individuals) &&
332             $individuals->isa('Bio::PopGen::PopulationI')) {
333 1         2 my $pop = $individuals;
334 1         3 $n = $pop->get_number_individuals;
335 1         3 $seg_sites = $self->segregating_sites_count($pop);
336 1         4 $singletons = $self->singleton_count($pop);
337             } else {
338 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_D_star");
339 0         0 return 0;
340             }
341              
342 3         14 return $self->fu_and_li_D_star_counts($n,$seg_sites, $singletons);
343             }
344              
345             =head2 fu_and_li_D_star_counts
346              
347             Title : fu_li_D_star_counts
348             Usage : my $D = $statistics->fu_and_li_D_star_counts($samps,$sites,
349             $singletons);
350              
351             Function: Fu and Li D statistic for the raw counts of the number
352             of samples, sites, external and internal mutations
353             Returns : decimal number
354             Args : number of samples (N)
355             number of segregating sites (n)
356             singletons (n_s)
357              
358             =cut
359              
360              
361             sub fu_and_li_D_star_counts {
362 4     4 1 11 my ($self,$n,$seg_sites, $singletons) = @_;
363 4         7 my $a_n;
364 4         14 for(my $k = 1; $k < $n; $k++ ) {
365 35         52 $a_n += ( 1 / $k );
366             }
367              
368 4         20 my $a1 = $a_n + 1 / $n;
369              
370 4         5 my $b = 0;
371 4         10 for(my $k= 1; $k < $n; $k++ ) {
372 35         47 $b += ( 1 / $k**2 );
373             }
374              
375 4         14 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
376             ( ( $n - 1) * ( $n - 2 ) ) );
377              
378 4         14 my $d = $c + ($n -2) / ($n - 1)**2 +
379             2 / ($n -1) *
380             ( 1.5 - ( (2*$a1 - 3) / ($n -2) ) -
381             1 / $n );
382            
383 4         17 my $v_star = ( ( ($n/($n-1) )**2)*$b + (($a_n**2)*$d) -
384             (2*( ($n*$a_n*($a_n+1)) )/(($n-1)**2)) ) /
385             (($a_n**2) + $b);
386              
387 4         11 my $u_star = ( ($n/($n-1))*
388             ($a_n - ($n/
389             ($n-1)))) - $v_star;
390              
391              
392 4         36 return (($n / ($n - 1)) * $seg_sites -
393             $a_n * $singletons) /
394             sqrt( ($u_star * $seg_sites) + ($v_star * $seg_sites*$seg_sites));
395             }
396              
397              
398             =head2 fu_and_li_F
399              
400             Title : fu_and_li_F
401             Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F(\@ingroup,$ext_muts);
402             Function: Calculate Fu and Li's F on an ingroup with either the set of
403             outgroup individuals, or the number of external mutations
404             Returns : decimal number
405             Args : array ref of L objects for the ingroup
406             OR a L object
407             number of external mutations OR list of individuals for the outgroup
408              
409             =cut
410              
411             #'
412              
413             sub fu_and_li_F {
414 5     5 1 507 my ($self,$ingroup,$outgroup) = @_;
415 5         8 my ($seg_sites,$pi,$n,$external,$internal);
416 5 100 33     37 if( ref($ingroup) =~ /ARRAY/i ) {
    50          
417 2         6 $n = scalar @$ingroup;
418             # pi - all pairwise differences
419 2         6 $pi = $self->pi($ingroup);
420 2         8 $seg_sites = $self->segregating_sites_count($ingroup);
421             } elsif( ref($ingroup) &&
422             $ingroup->isa('Bio::PopGen::PopulationI')) {
423 3         8 $n = $ingroup->get_number_individuals;
424 3         8 $pi = $self->pi($ingroup);
425 3         9 $seg_sites = $self->segregating_sites_count($ingroup);
426             } else {
427 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to Fu and Li's F");
428 0         0 return 0;
429             }
430            
431 5 50       19 if( ! defined $outgroup ) {
    100          
432 0         0 $self->warn("Need to provide either an array ref to the outgroup individuals or the number of external mutations");
433 0         0 return 0;
434             } elsif( ref($outgroup) ) {
435 2         6 ($external,$internal) = $self->derived_mutations($ingroup,$outgroup);
436             } else {
437 3         4 $external = $outgroup;
438             }
439 5         12 $self->fu_and_li_F_counts($n,$pi,$seg_sites,$external);
440             }
441              
442             =head2 fu_and_li_F_counts
443              
444             Title : fu_li_F_counts
445             Usage : my $F = $statistics->fu_and_li_F_counts($samps,$pi,
446             $sites,
447             $external);
448             Function: Fu and Li F statistic for the raw counts of the number
449             of samples, sites, external and internal mutations
450             Returns : decimal number
451             Args : number of samples (N)
452             average pairwise differences (pi)
453             number of segregating sites (n)
454             external mutations (n_e)
455              
456             =cut
457              
458              
459             sub fu_and_li_F_counts {
460 6     6 1 15 my ($self,$n,$pi,$seg_sites, $external) = @_;
461 6         8 my $a_n = 0;
462 6         17 for(my $k= 1; $k < $n; $k++ ) {
463 43         62 $a_n += ( 1 / $k );
464             }
465              
466 6         9 my $a1 = $a_n + (1 / $n );
467              
468 6         8 my $b = 0;
469 6         13 for(my $k= 1; $k < $n; $k++ ) {
470 43         60 $b += ( 1 / $k**2 );
471             }
472              
473 6         16 my $c = 2 * ( ( ( $n * $a_n ) - (2 * ( $n -1 ))) /
474             ( ( $n - 1) * ( $n - 2 ) ) );
475              
476 6         23 my $v_F = ( $c + ( (2*(($n**2)+$n+3)) /
477             ( (9*$n)*($n-1) ) ) -
478             (2/($n-1)) ) / ( ($a_n**2)+$b );
479              
480 6         19 my $u_F = ( 1 + ( ($n+1)/(3*($n-1)) )-
481             ( 4*( ($n+1)/(($n-1)**2) ))*
482             ($a1 - ((2*$n)/($n+1))) ) /
483             $a_n - $v_F;
484              
485             # warn("$v_F vf $u_F uf n = $n\n");
486 6         21 my $F = ($pi - $external) / ( sqrt( ($u_F*$seg_sites) +
487             ($v_F*($seg_sites**2)) ) );
488              
489 6         48 return $F;
490             }
491              
492             =head2 fu_and_li_F_star
493              
494             Title : fu_and_li_F_star
495             Usage : my $F = Bio::PopGen::Statistics->fu_and_li_F_star(\@ingroup);
496             Function: Calculate Fu and Li's F* on an ingroup without an outgroup
497             It uses count of singleton alleles instead
498             Returns : decimal number
499             Args : array ref of L objects for the ingroup
500             OR
501             L object
502              
503             =cut
504              
505             #' keep my emacs happy
506              
507             sub fu_and_li_F_star {
508 3     3 1 498 my ($self,$individuals) = @_;
509              
510 3         6 my ($seg_sites,$pi,$n,$singletons);
511 3 100 33     24 if( ref($individuals) =~ /ARRAY/i ) {
    50          
512 2         5 $n = scalar @$individuals;
513             # pi - all pairwise differences
514 2         7 $pi = $self->pi($individuals);
515 2         8 $seg_sites = $self->segregating_sites_count($individuals);
516 2         7 $singletons = $self->singleton_count($individuals);
517             } elsif( ref($individuals) &&
518             $individuals->isa('Bio::PopGen::PopulationI')) {
519 1         2 my $pop = $individuals;
520 1         3 $n = $pop->get_number_individuals;
521 1         3 $pi = $self->pi($pop);
522 1         4 $seg_sites = $self->segregating_sites_count($pop);
523 1         4 $singletons = $self->singleton_count($pop);
524             } else {
525 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to fu_and_li_F_star");
526 0         0 return 0;
527             }
528 3         11 return $self->fu_and_li_F_star_counts($n,
529             $pi,
530             $seg_sites,
531             $singletons);
532             }
533              
534             =head2 fu_and_li_F_star_counts
535              
536             Title : fu_li_F_star_counts
537             Usage : my $F = $statistics->fu_and_li_F_star_counts($samps,
538             $pi,$sites,
539             $singletons);
540             Function: Fu and Li F statistic for the raw counts of the number
541             of samples, sites, external and internal mutations
542             Returns : decimal number
543             Args : number of samples (N)
544             average pairwise differences (pi)
545             number of segregating sites (n)
546             singleton mutations (n_s)
547              
548             =cut
549              
550              
551             sub fu_and_li_F_star_counts {
552 4     4 1 17 my ($self,$n,$pi,$seg_sites, $singletons) = @_;
553 4 50       12 if( $n <= 1 ) {
554 0         0 $self->warn("N must be > 1\n");
555 0         0 return;
556             }
557 4 50       9 if( $n == 2) {
558 0         0 return 0;
559             }
560              
561 4         6 my $a_n = 0;
562            
563              
564 4         6 my $b = 0;
565 4         31 for(my $k= 1; $k < $n; $k++ ) {
566 35         26 $b += (1 / ($k**2));
567 35         63 $a_n += ( 1 / $k ); # Eq (2)
568             }
569 4         8 my $a1 = $a_n + (1 / $n );
570              
571             # warn("a_n is $a_n a1 is $a1 n is $n b is $b\n");
572              
573             # From Simonsen et al (1995) instead of Fu and Li 1993
574 4         24 my $v_F_star = ( (( 2 * $n ** 3 + 110 * $n**2 - (255 * $n) + 153)/
575             (9 * ($n ** 2) * ( $n - 1))) +
576             ((2 * ($n - 1) * $a_n ) / $n ** 2) -
577             (8 * $b / $n) ) /
578             ( ($a_n ** 2) + $b );
579            
580 4         12 my $u_F_star = ((( (4* ($n**2)) + (19 * $n) + 3 - (12 * ($n + 1)* $a1)) /
581             (3 * $n * ( $n - 1))) / $a_n) - $v_F_star;
582              
583             # warn("vf* = $v_F_star uf* = $u_F_star n = $n\n");
584 4         11 my $F_star = ( $pi - ($singletons*( ( $n-1) / $n)) ) /
585             sqrt ( $u_F_star*$seg_sites + $v_F_star*$seg_sites**2);
586 4         30 return $F_star;
587             }
588              
589             =head2 tajima_D
590              
591             Title : tajima_D
592             Usage : my $D = Bio::PopGen::Statistics->tajima_D(\@samples);
593             Function: Calculate Tajima's D on a set of samples
594             Returns : decimal number
595             Args : array ref of L objects
596             OR
597             L object
598              
599              
600             =cut
601              
602             #'
603              
604             sub tajima_D {
605 6     6 1 482 my ($self,$individuals) = @_;
606 6         14 my ($seg_sites,$pi,$n);
607              
608 6 100 33     51 if( ref($individuals) =~ /ARRAY/i ) {
    50          
609 2         4 $n = scalar @$individuals;
610             # pi - all pairwise differences
611 2         6 $pi = $self->pi($individuals);
612 2         10 $seg_sites = $self->segregating_sites_count($individuals);
613              
614             } elsif( ref($individuals) &&
615             $individuals->isa('Bio::PopGen::PopulationI')) {
616 4         8 my $pop = $individuals;
617 4         13 $n = $pop->get_number_individuals;
618 4         14 $pi = $self->pi($pop);
619 4         15 $seg_sites = $self->segregating_sites_count($pop);
620             } else {
621 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI OR a Bio::PopGen::PopulationI object to tajima_D");
622 0         0 return 0;
623             }
624 6         26 $self->tajima_D_counts($n,$seg_sites,$pi);
625             }
626              
627             =head2 tajima_D_counts
628              
629             Title : tajima_D_counts
630             Usage : my $D = $statistics->tajima_D_counts($samps,$sites,$pi);
631             Function: Tajima's D statistic for the raw counts of the number
632             of samples, sites, and avg pairwise distances (pi)
633             Returns : decimal number
634             Args : number of samples (N)
635             number of segregating sites (n)
636             average pairwise differences (pi)
637              
638             =cut
639              
640             #'
641              
642             sub tajima_D_counts {
643 6     6 1 15 my ($self,$n,$seg_sites,$pi) = @_;
644 6         10 my $a1 = 0;
645 6         17 for(my $k= 1; $k < $n; $k++ ) {
646 284         339 $a1 += ( 1 / $k );
647             }
648              
649 6         7 my $a2 = 0;
650 6         18 for(my $k= 1; $k < $n; $k++ ) {
651 284         336 $a2 += ( 1 / $k**2 );
652             }
653            
654 6         14 my $b1 = ( $n + 1 ) / ( 3* ( $n - 1) );
655 6         13 my $b2 = ( 2 * ( $n ** 2 + $n + 3) ) /
656             ( ( 9 * $n) * ( $n - 1) );
657 6         9 my $c1 = $b1 - ( 1 / $a1 );
658 6         26 my $c2 = $b2 - ( ( $n + 2 ) /
659             ( $a1 * $n))+( $a2 / $a1 ** 2);
660 6         12 my $e1 = $c1 / $a1;
661 6         12 my $e2 = $c2 / ( $a1**2 + $a2 );
662            
663 6         14 my $denom = sqrt ( ($e1 * $seg_sites) + (( $e2 * $seg_sites) * ( $seg_sites - 1)));
664 6 50       14 return if $denom == 0;
665 6         9 my $D = ( $pi - ( $seg_sites / $a1 ) ) / $denom;
666 6         83 return $D;
667             }
668              
669              
670             =head2 pi
671              
672             Title : pi
673             Usage : my $pi = Bio::PopGen::Statistics->pi(\@inds)
674             Function: Calculate pi (average number of pairwise differences) given
675             a list of individuals which have the same number of markers
676             (also called sites) as available from the get_Genotypes()
677             call in L
678             Returns : decimal number
679             Args : Arg1= array ref of L objects
680             which have markers/mutations. We expect all individuals to
681             have a marker - we will deal with missing data as a special case.
682             OR
683             Arg1= L object. In the event that
684             only allele frequency data is available, storing it in
685             Population object will make this available.
686             num sites [optional], an optional second argument (integer)
687             which is the number of sites, then pi returned is pi/site.
688              
689             =cut
690              
691             sub pi {
692 21     21 1 79 my ($self,$individuals,$numsites) = @_;
693 21         32 my (%data,%marker_total,@marker_names,$n);
694              
695 21 100 33     132 if( ref($individuals) =~ /ARRAY/i ) {
    50          
696             # one possible argument is an arrayref of Bio::PopGen::IndividualI objs
697 9         52 @marker_names = $individuals->[0]->get_marker_names;
698 9         15 $n = scalar @$individuals;
699              
700             # Here we are calculating the allele frequencies
701 9         16 foreach my $ind ( @$individuals ) {
702 45 50       120 if( ! $ind->isa('Bio::PopGen::IndividualI') ) {
703 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($ind)."\n");
704 0         0 return 0;
705             }
706 45         51 foreach my $m ( @marker_names ) {
707 2125         3168 foreach my $allele (map { $_->get_Alleles}
  2125         3137  
708             $ind->get_Genotypes($m) ) {
709 2125         2962 $data{$m}->{$allele}++;
710 2125         2979 $marker_total{$m}++;
711             }
712             }
713             }
714             # while( my ($marker,$count) = each %marker_total ) {
715             # foreach my $c ( values %{$data{$marker}} ) {
716             # $c /= $count;
717             # }
718             # }
719             # %data will contain allele frequencies for each marker, allele
720             } elsif( ref($individuals) &&
721             $individuals->isa('Bio::PopGen::PopulationI') ) {
722 12         16 my $pop = $individuals;
723 12         27 $n = $pop->get_number_individuals;
724 12         51 foreach my $marker( $pop->get_Markers ) {
725 143         294 push @marker_names, $marker->name;
726             #$data{$marker->name} = {$marker->get_Allele_Frequencies};
727 143         246 my @genotypes = $pop->get_Genotypes(-marker => $marker->name);
728 143         300 for my $al ( map { $_->get_Alleles} @genotypes ) {
  12444         15950  
729 24628         27986 $data{$marker->name}->{$al}++;
730 24628         29096 $marker_total{$marker->name}++;
731             }
732             }
733             } else {
734 0         0 $self->throw("expected an array reference of a list of Bio::PopGen::IndividualI to pi");
735             }
736             # based on Kevin Thornton's code:
737             # http://molpopgen.org/software/libsequence/doc/html/PolySNP_8cc-source.html#l00152
738             # For now we assume that all individuals have the same markers
739 21         78 my ($diffcount,$totalcompare) = (0,0);
740 21         30 my $pi = 0;
741 21         66 while ( my ($marker,$markerdat) = each %data ) {
742 568         591 my $sampsize = $marker_total{$marker};
743 568         463 my $ssh = 0;
744 568         853 my @alleles = keys %$markerdat;
745 568 50       690 if ( $sampsize > 1 ) {
746 568         518 my $denom = $sampsize * ($sampsize - 1.0);
747 568         567 foreach my $al ( @alleles ) {
748 1118         1436 $ssh += ($markerdat->{$al} * ($markerdat->{$al} - 1)) / $denom;
749             }
750 568         1139 $pi += 1.0 - $ssh;
751             }
752             }
753 21         210 $self->debug( "pi=$pi\n");
754 21 100       42 if( $numsites ) {
755 2         14 return $pi / $numsites;
756             } else {
757 19         242 return $pi;
758             }
759             }
760              
761              
762             =head2 theta
763              
764             Title : theta
765             Usage : my $theta = Bio::PopGen::Statistics->theta($sampsize,$segsites);
766             Function: Calculates Watterson's theta from the sample size
767             and the number of segregating sites.
768             Providing the third parameter, total number of sites will
769             return theta per site.
770             This is also known as K-hat = K / a_n
771             Returns : decimal number
772             Args : sample size (integer),
773             num segregating sites (integer)
774             total sites (integer) [optional] (to calculate theta per site)
775             OR
776             provide an arrayref of the L objects
777             total sites (integer) [optional] (to calculate theta per site)
778             OR
779             provide an L object
780             total sites (integer)[optional]
781              
782             =cut
783              
784             #'
785              
786             sub theta {
787 7     7 1 680 my $self = shift;
788 7         16 my ( $n, $seg_sites,$totalsites) = @_;
789 7 100 66     59 if( ref($n) =~ /ARRAY/i ) {
    100          
790 2         2 my $samps = $n;
791 2         3 $totalsites = $seg_sites; # only 2 arguments if one is an array
792 2         2 my %data;
793 2         8 my @marker_names = $samps->[0]->get_marker_names;
794             # we need to calculate number of polymorphic sites
795 2         7 $seg_sites = $self->segregating_sites_count($samps);
796 2         4 $n = scalar @$samps;
797              
798             } elsif(ref($n) &&
799             $n->isa('Bio::PopGen::PopulationI') ) {
800             # This will handle the case when we pass in a PopulationI object
801 4         7 my $pop = $n;
802 4         7 $totalsites = $seg_sites; # shift the arguments over by one
803 4         13 $n = $pop->haploid_population->get_number_individuals;
804 4         23 $seg_sites = $self->segregating_sites_count($pop);
805             }
806 7         14 my $a1 = 0;
807 7         23 for(my $k= 1; $k < $n; $k++ ) {
808 202         246 $a1 += ( 1 / $k );
809             }
810 7 100       13 if( $totalsites ) { # 0 and undef are the same can't divide by them
811 2         4 $seg_sites /= $totalsites;
812             }
813 7 50       27 if( $a1 == 0 ) {
814 0         0 return 0;
815             }
816 7         50 return $seg_sites / $a1;
817             }
818              
819             =head2 singleton_count
820              
821             Title : singleton_count
822             Usage : my ($singletons) = Bio::PopGen::Statistics->singleton_count(\@inds)
823             Function: Calculate the number of mutations/alleles which only occur once in
824             a list of individuals for all sites/markers
825             Returns : (integer) number of alleles which only occur once (integer)
826             Args : arrayref of L objects
827             OR
828             L object
829              
830             =cut
831              
832             sub singleton_count {
833 7     7 1 488 my ($self,$individuals) = @_;
834              
835 7         10 my @inds;
836 7 100 33     37 if( ref($individuals) =~ /ARRAY/ ) {
    50          
837 5         8 @inds = @$individuals;
838             } elsif( ref($individuals) &&
839             $individuals->isa('Bio::PopGen::PopulationI') ) {
840 2         4 my $pop = $individuals;
841 2         5 @inds = $pop->get_Individuals();
842 2 50       5 unless( @inds ) {
843 0         0 $self->warn("Need to provide a population which has individuals loaded, not just a population with allele frequencies");
844 0         0 return 0;
845             }
846             } else {
847 0         0 $self->warn("Expected either a PopulationI object or an arrayref of IndividualI objects");
848 0         0 return 0;
849             }
850             # find number of sites where a particular allele is only seen once
851              
852 7         11 my ($singleton_allele_ct,%sites) = (0);
853             # first collect all the alleles into a hash structure
854            
855 7         11 foreach my $n ( @inds ) {
856 35 50       126 if( ! $n->isa('Bio::PopGen::IndividualI') ) {
857 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
858 0         0 return 0;
859             }
860 35         58 foreach my $g ( $n->get_Genotypes ) {
861 1600         2217 my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
862 1600         1849 foreach my $allele (@alleles ) {
863 1600         2623 $sites{$nm}->{$allele}++;
864             }
865             }
866             }
867 7         25 foreach my $site ( values %sites ) { # don't really care what the name is
868 320         346 foreach my $allelect ( values %$site ) { #
869             # find the sites which have an allele with only 1 copy
870 636 100       780 $singleton_allele_ct++ if( $allelect == 1 );
871             }
872             }
873 7         71 return $singleton_allele_ct;
874             }
875              
876             # Yes I know that singleton_count and segregating_sites_count are
877             # basically processing the same data so calling them both is
878             # redundant, something I want to fix later but want to make things
879             # correct and simple first
880              
881             =head2 segregating_sites_count
882              
883             Title : segregating_sites_count
884             Usage : my $segsites = Bio::PopGen::Statistics->segregating_sites_count
885             Function: Gets the number of segregating sites (number of polymorphic sites)
886             Returns : (integer) number of segregating sites
887             Args : arrayref of L objects
888             OR
889             L object
890              
891             =cut
892              
893             # perhaps we'll change this in the future
894             # to return the actual segregating sites
895             # so one can use this to pull in the names of those sites.
896             # Would be trivial if it is useful.
897              
898             sub segregating_sites_count {
899 31     31 1 622 my ($self,$individuals) = @_;
900 31         45 my $type = ref($individuals);
901 31         32 my $seg_sites = 0;
902 31 100 33     151 if( $type =~ /ARRAY/i ) {
    50          
903 15         16 my %sites;
904 15         21 foreach my $n ( @$individuals ) {
905 75 50       245 if( ! $n->isa('Bio::PopGen::IndividualI') ) {
906 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects, this is a ".ref($n)."\n");
907 0         0 return 0;
908             }
909 75         119 foreach my $g ( $n->get_Genotypes ) {
910 3225         4279 my ($nm,@alleles) = ($g->marker_name, $g->get_Alleles);
911 3225         3591 foreach my $allele (@alleles ) {
912 3225         4980 $sites{$nm}->{$allele}++;
913             }
914             }
915             }
916 15         45 foreach my $site ( values %sites ) { # use values b/c we don't
917             # really care what the name is
918             # find the sites which >1 allele
919 645 100       961 $seg_sites++ if( keys %$site > 1 );
920             }
921             } elsif( $type && $individuals->isa('Bio::PopGen::PopulationI') ) {
922 16         40 foreach my $marker ( $individuals->haploid_population->get_Markers ) {
923 163         244 my @alleles = $marker->get_Alleles;
924 163 100       271 $seg_sites++ if ( scalar @alleles > 1 );
925             }
926             } else {
927 0         0 $self->warn("segregating_sites_count expects either a PopulationI object or a list of IndividualI objects");
928 0         0 return 0;
929             }
930 31         253 return $seg_sites;
931             }
932              
933              
934             =head2 heterozygosity
935              
936             Title : heterozygosity
937             Usage : my $het = Bio::PopGen::Statistics->heterozygosity($sampsize,$freq1);
938             Function: Calculate the heterozgosity for a sample set for a set of alleles
939             Returns : decimal number
940             Args : sample size (integer)
941             frequency of one allele (fraction - must be less than 1)
942             [optional] frequency of another allele - this is only needed
943             in a non-binary allele system
944              
945             Note : p^2 + 2pq + q^2
946              
947             =cut
948              
949              
950             sub heterozygosity {
951 0     0 1 0 my ($self,$samp_size, $freq1,$freq2) = @_;
952 0 0       0 if( ! $freq2 ) { $freq2 = 1 - $freq1 }
  0         0  
953 0 0 0     0 if( $freq1 > 1 || $freq2 > 1 ) {
954 0         0 $self->warn("heterozygosity expects frequencies to be less than 1");
955             }
956 0         0 my $sum = ($freq1**2) + (($freq2)**2);
957 0         0 my $h = ( $samp_size*(1- $sum) ) / ($samp_size - 1) ;
958 0         0 return $h;
959             }
960              
961              
962             =head2 derived_mutations
963              
964             Title : derived_mutations
965             Usage : my $ext = Bio::PopGen::Statistics->derived_mutations($ingroup,$outgroup);
966             Function: Calculate the number of alleles or (mutations) which are ancestral
967             and the number which are derived (occurred only on the tips)
968             Returns : array of 2 items - number of external and internal derived
969             mutation
970             Args : ingroup - Ls arrayref OR
971             L
972             outgroup- Ls arrayref OR
973             L OR
974             a single L
975              
976             =cut
977              
978             sub derived_mutations {
979 10     10 1 19 my ($self,$ingroup,$outgroup) = @_;
980 10         13 my (%indata,%outdata,@marker_names);
981              
982             # basically we have to do some type checking
983             # if that perl were typed...
984 10         20 my ($itype,$otype) = (ref($ingroup),ref($outgroup));
985              
986 10 50       19 return $outgroup unless( $otype ); # we expect arrayrefs or objects, nums
987             # are already the value we
988             # are searching for
989             # pick apart the ingroup
990             # get the data
991 10 100 33     52 if( ref($ingroup) =~ /ARRAY/i ) {
    50          
992 4 50 33     25 if( ! ref($ingroup->[0]) ||
993             ! $ingroup->[0]->isa('Bio::PopGen::IndividualI') ) {
994 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for ingroup in external_mutations");
995 0         0 return 0;
996             }
997             # we assume that all individuals have the same markers
998             # i.e. that they are aligned
999 4         9 @marker_names = $ingroup->[0]->get_marker_names;
1000 4         7 for my $ind ( @$ingroup ) {
1001 20         23 for my $m ( @marker_names ) {
1002 100         136 for my $allele ( map { $_->get_Alleles }
  100         153  
1003             $ind->get_Genotypes($m) ) {
1004 100         181 $indata{$m}->{$allele}++;
1005             }
1006             }
1007             }
1008             } elsif( ref($ingroup) && $ingroup->isa('Bio::PopGen::PopulationI') ) {
1009 6         13 @marker_names = $ingroup->get_marker_names;
1010 6         15 for my $ind ( $ingroup->haploid_population->get_Individuals() ) {
1011 30         33 for my $m ( @marker_names ) {
1012 150         192 for my $allele ( map { $_->get_Alleles}
  150         232  
1013             $ind->get_Genotypes($m) ) {
1014 150         286 $indata{$m}->{$allele}++;
1015             }
1016             }
1017             }
1018             } else {
1019 0         0 $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for ingroup in external_mutations");
1020 0         0 return 0;
1021             }
1022            
1023 10 100       46 if( $otype =~ /ARRAY/i ) {
    50          
1024 5 50 33     26 if( ! ref($outgroup->[0]) ||
1025             ! $outgroup->[0]->isa('Bio::PopGen::IndividualI') ) {
1026 0         0 $self->warn("Expected an arrayref of Bio::PopGen::IndividualI objects or a Population for outgroup in external_mutations");
1027 0         0 return 0;
1028             }
1029 5         10 for my $ind ( @$outgroup ) {
1030 5         6 for my $m ( @marker_names ) {
1031 25         30 for my $allele ( map { $_->get_Alleles }
  25         35  
1032             $ind->get_Genotypes($m) ) {
1033 25         58 $outdata{$m}->{$allele}++;
1034             }
1035             }
1036             }
1037            
1038             } elsif( $otype->isa('Bio::PopGen::PopulationI') ) {
1039 5         14 for my $ind ( $outgroup->haploid_population->get_Individuals() ) {
1040 5         9 for my $m ( @marker_names ) {
1041 25         36 for my $allele ( map { $_->get_Alleles}
  25         39  
1042             $ind->get_Genotypes($m) ) {
1043 25         57 $outdata{$m}->{$allele}++;
1044             }
1045             }
1046             }
1047             } else {
1048 0         0 $self->warn("Need an arrayref of Bio::PopGen::IndividualI objs or a Bio::PopGen::Population for outgroup in external_mutations");
1049 0         0 return 0;
1050             }
1051            
1052             # derived mutations are defined as
1053             #
1054             # ingroup (G A T)
1055             # outgroup (A)
1056             # derived mutations are G and T, A is the external mutation
1057            
1058             # ingroup (A T)
1059             # outgroup (C)
1060             # derived mutations A,T no external/ancestral mutations
1061            
1062             # ingroup (G A T)
1063             # outgroup (A T)
1064             # cannot determine
1065            
1066 10         21 my ($internal,$external);
1067 10         18 foreach my $marker ( @marker_names ) {
1068 50         43 my @outalleles = keys %{$outdata{$marker}};
  50         90  
1069 50         51 my @in_alleles = keys %{$indata{$marker}};
  50         69  
1070 50 100 66     122 next if( @outalleles > 1 || @in_alleles == 1);
1071 40         41 for my $allele ( @in_alleles ) {
1072 80 100       115 if( ! exists $outdata{$marker}->{$allele} ) {
1073 40 100       46 if( $indata{$marker}->{$allele} == 1 ) {
1074 10         12 $external++;
1075             } else {
1076 30         41 $internal++;
1077             }
1078             }
1079             }
1080             }
1081 10         58 return ($external, $internal);
1082             }
1083              
1084              
1085             =head2 composite_LD
1086              
1087             Title : composite_LD
1088             Usage : %matrix = Bio::PopGen::Statistics->composite_LD($population);
1089             Function: Calculate the Linkage Disequilibrium
1090             This is for calculating LD for unphased data.
1091             Other methods will be appropriate for phased haplotype data.
1092              
1093             Returns : Hash of Hashes - first key is site 1,second key is site 2
1094             and value is LD for those two sites.
1095             my $LDarrayref = $matrix{$site1}->{$site2};
1096             my ($ldval, $chisquared) = @$LDarrayref;
1097             Args : L or arrayref of
1098             Ls
1099             Reference: Weir B.S. (1996) "Genetic Data Analysis II",
1100             Sinauer, Sunderlanm MA.
1101              
1102             =cut
1103              
1104             sub composite_LD {
1105 2     2 1 13 my ($self,$pop) = @_;
1106 2 50 33     18 if( ref($pop) =~ /ARRAY/i ) {
    50          
1107 0 0 0     0 if( ref($pop->[0]) && $pop->[0]->isa('Bio::PopGen::IndividualI') ) {
1108 0         0 $pop = Bio::PopGen::Population->new(-individuals => @$pop);
1109             } else {
1110 0         0 $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1111 0         0 return ();
1112             }
1113             } elsif( ! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
1114 0         0 $self->warn("composite_LD expects a Bio::PopGen::PopulationI or an arrayref of Bio::PopGen::IndividualI objects");
1115 0         0 return ();
1116             }
1117              
1118 2         5 my @marker_names = $pop->get_marker_names;
1119 2         3 my @inds = $pop->get_Individuals;
1120 2         4 my $num_inds = scalar @inds;
1121 2         3 my (%lookup);
1122             # calculate allele frequencies for each marker from the population
1123             # use the built-in get_Marker to get the allele freqs
1124             # we still need to calculate the genotype frequencies
1125 2         3 foreach my $marker_name ( @marker_names ) {
1126 5         6 my(%allelef);
1127              
1128 5         8 foreach my $ind ( @inds ) {
1129 76         137 my ($genotype) = $ind->get_Genotypes(-marker => $marker_name);
1130 76 50       102 if( ! defined $genotype ) {
1131 0         0 $self->warn("no genotype for marker $marker_name for individual ". $ind->unique_id. "\n");
1132 0         0 next;
1133             }
1134 76         108 my @alleles = sort $genotype->get_Alleles;
1135 76 100       288 next if( scalar @alleles != 2);
1136 73         108 my $genostr = join(',', @alleles);
1137 73         81 $allelef{$alleles[0]}++;
1138 73         102 $allelef{$alleles[1]}++;
1139             }
1140              
1141             # we should check for cases where there > 2 alleles or
1142             # only 1 allele and throw out those markers.
1143 5         17 my @alleles = sort keys %allelef;
1144 5         102 my $allele_count = scalar @alleles;
1145             # test if site is polymorphic
1146 5 50       11 if( $allele_count != 2) {
1147             # only really warn if we're seeing multi-allele
1148 0 0       0 $self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if $allele_count > 2;
1149 0         0 next; # skip this marker
1150             }
1151              
1152             # Need to do something here to detect alleles which aren't
1153             # a single character
1154 5 50 33     19 if( length($alleles[0]) != 1 ||
1155             length($alleles[1]) != 1 ) {
1156 0         0 $self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character");
1157 0         0 next;
1158             }
1159              
1160             # fix the call for allele 1 (A or B) and
1161             # allele 2 (a or b) in terms of how we'll do the
1162             # N square from Weir p.126
1163 5         24 $self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n");
1164 5         15 $lookup{$marker_name}->{'1'} = $alleles[0];
1165 5         10 $lookup{$marker_name}->{'2'} = $alleles[1];
1166             }
1167              
1168 2         7 @marker_names = sort keys %lookup;
1169 2         3 my $site_count = scalar @marker_names;
1170             # where the final data will be stored
1171 2         2 my %stats_for_sites;
1172              
1173             # standard way of generating pairwise combos
1174             # LD is done by comparing all the pairwise site (marker)
1175             # combinations and keeping track of the genotype and
1176             # pairwise genotype (ie genotypes of the 2 sites) frequencies
1177 2         6 for( my $i = 0; $i < $site_count - 1; $i++ ) {
1178 3         6 my $site1 = $marker_names[$i];
1179              
1180 3         7 for( my $j = $i+1; $j < $site_count ; $j++) {
1181 4         6 my (%genotypes, %total_genotype_count,$total_pairwisegeno_count,
1182             %pairwise_genotypes);
1183            
1184 4         4 my $site2 = $marker_names[$j];
1185 4         9 my (%allele_count,%allele_freqs) = (0,0);
1186 4         5 foreach my $ind ( @inds ) {
1187             # build string of genotype at site 1
1188 53         83 my ($genotype1) = $ind->get_Genotypes(-marker => $site1);
1189 53         99 my @alleles1 = sort $genotype1->get_Alleles;
1190              
1191             # if an individual has only one available allele
1192             # (has a blank or N for one of the chromosomes)
1193             # we don't want to use it in our calculation
1194              
1195 53 100       87 next unless( scalar @alleles1 == 2);
1196 52         80 my $genostr1 = join(',', @alleles1);
1197              
1198             # build string of genotype at site 2
1199 52         82 my ($genotype2) = $ind->get_Genotypes(-marker => $site2);
1200 52         86 my @alleles2 = sort $genotype2->get_Alleles;
1201 52         79 my $genostr2 = join(',', @alleles2);
1202            
1203 52 100       79 next unless( scalar @alleles2 == 2);
1204 50         56 for (@alleles1) {
1205 100         92 $allele_count{$site1}++;
1206 100         116 $allele_freqs{$site1}->{$_}++;
1207             }
1208 50         50 $genotypes{$site1}->{$genostr1}++;
1209 50         38 $total_genotype_count{$site1}++;
1210              
1211 50         51 for (@alleles2) {
1212 100         86 $allele_count{$site2}++;
1213 100         103 $allele_freqs{$site2}->{$_}++;
1214             }
1215 50         55 $genotypes{$site2}->{$genostr2}++;
1216 50         43 $total_genotype_count{$site2}++;
1217              
1218             # We are using the $site1,$site2 to signify
1219             # a unique key
1220 50         75 $pairwise_genotypes{"$genostr1,$genostr2"}++;
1221             # some individuals
1222 50         85 $total_pairwisegeno_count++;
1223             }
1224 4         8 for my $site ( %allele_freqs ) {
1225 16         14 for my $al ( keys %{ $allele_freqs{$site} } ) {
  16         35  
1226 16         24 $allele_freqs{$site}->{$al} /= $allele_count{$site};
1227             }
1228             }
1229 4         5 my $n = $total_pairwisegeno_count; # number of pairs of comparisons
1230             # 'A' and 'B' are two loci or in our case site1 and site2
1231 4         5 my $allele1_site1 = $lookup{$site1}->{'1'}; # this is the BigA allele
1232 4         5 my $allele1_site2 = $lookup{$site2}->{'1'}; # this is the BigB allele
1233 4         7 my $allele2_site1 = $lookup{$site1}->{'2'}; # this is the LittleA allele
1234 4         5 my $allele2_site2 = $lookup{$site2}->{'2'}; # this is the LittleB allele
1235             # AABB
1236 4         7 my $N1genostr = join(",",( $allele1_site1, $allele1_site1,
1237             $allele1_site2, $allele1_site2));
1238 4         19 $self->debug(" [$site1,$site2](AABB) N1genostr=$N1genostr\n");
1239             # AABb
1240 4         10 my $N2genostr = join(",",( $allele1_site1, $allele1_site1,
1241             $allele1_site2, $allele2_site2));
1242 4         14 $self->debug(" [$site1,$site2](AABb) N2genostr=$N2genostr\n");
1243             # AaBB
1244 4         8 my $N4genostr = join(",",( $allele1_site1, $allele2_site1,
1245             $allele1_site2, $allele1_site2));
1246 4         10 $self->debug(" [$site1,$site2](AaBB) N4genostr=$N4genostr\n");
1247             # AaBb
1248 4         6 my $N5genostr = join(",",( $allele1_site1, $allele2_site1,
1249             $allele1_site2, $allele2_site2));
1250 4         9 $self->debug(" [$site1,$site2](AaBb) N5genostr=$N5genostr\n");
1251             # count of AABB in
1252 4   50     8 my $n1 = $pairwise_genotypes{$N1genostr} || 0;
1253             # count of AABb in
1254 4   100     8 my $n2 = $pairwise_genotypes{$N2genostr} || 0;
1255             # count of AaBB in
1256 4   100     14 my $n4 = $pairwise_genotypes{$N4genostr} || 0;
1257             # count of AaBb in
1258 4   100     9 my $n5 = $pairwise_genotypes{$N5genostr} || 0;
1259              
1260 4         6 my $homozA_site1 = join(",", ($allele1_site1,$allele1_site1));
1261 4         5 my $homozB_site2 = join(",", ($allele1_site2,$allele1_site2));
1262 4   50     10 my $p_AA = ($genotypes{$site1}->{$homozA_site1} || 0) / $n;
1263 4   50     18 my $p_BB = ($genotypes{$site2}->{$homozB_site2} || 0) / $n;
1264 4   50     8 my $p_A = $allele_freqs{$site1}->{$allele1_site1} || 0; # an individual allele freq
1265 4         16 my $p_a = 1 - $p_A;
1266              
1267 4   50     16 my $p_B = $allele_freqs{$site2}->{$allele1_site2} || 0; # an individual allele freq
1268 4         6 my $p_b = 1 - $p_B;
1269              
1270             # variance of allele frequencies
1271 4         5 my $pi_A = $p_A * $p_a;
1272 4         3 my $pi_B = $p_B * $p_b;
1273              
1274             # hardy weinberg
1275 4         7 my $D_A = $p_AA - $p_A**2;
1276 4         3 my $D_B = $p_BB - $p_B**2;
1277 4         9 my $n_AB = 2*$n1 + $n2 + $n4 + 0.5 * $n5;
1278 4         20 $self->debug("n_AB=$n_AB -- n1=$n1, n2=$n2 n4=$n4 n5=$n5\n");
1279              
1280 4         7 my $delta_AB = (1 / $n ) * ( $n_AB ) - ( 2 * $p_A * $p_B );
1281 4         28 $self->debug("delta_AB=$delta_AB -- n=$n, n_AB=$n_AB p_A=$p_A, p_B=$p_B\n");
1282 4         45 $self->debug(sprintf(" (%d * %.4f) / ( %.2f + %.2f) * ( %.2f + %.2f) \n",
1283             $n,$delta_AB**2, $pi_A, $D_A, $pi_B, $D_B));
1284            
1285 4         6 my $chisquared;
1286 4         6 eval { $chisquared = ( $n * ($delta_AB**2) ) /
  4         6  
1287             ( ( $pi_A + $D_A) * ( $pi_B + $D_B) );
1288             };
1289 4 50       12 if( $@ ) {
1290 0         0 $self->debug("Skipping the site because the denom is 0.\nsite1=$site1, site2=$site2 : pi_A=$pi_A, pi_B=$pi_B D_A=$D_A, D_B=$D_B\n");
1291 0         0 next;
1292             }
1293             # this will be an upper triangular matrix
1294 4         37 $stats_for_sites{$site1}->{$site2} = [$delta_AB,$chisquared];
1295             }
1296             }
1297 2         13 return %stats_for_sites;
1298             }
1299              
1300             =head2 mcdonald_kreitman
1301              
1302             Title : mcdonald_kreitman
1303             Usage : $Fstat = mcdonald_kreitman($ingroup, $outgroup);
1304             Function: Calculates McDonald-Kreitman statistic based on a set of ingroup
1305             individuals and an outgroup by computing the number of
1306             differences at synonymous and non-synonymous sites
1307             for intraspecific comparisons and with the outgroup
1308             Returns : 2x2 table, followed by a hash reference indicating any
1309             warning messages about the status of the alleles or codons
1310             Args : -ingroup => L object or
1311             arrayref of Ls
1312             -outgroup => L object or
1313             arrayef of Ls
1314             -polarized => Boolean, to indicate if this should be
1315             a polarized test. Must provide two individuals
1316             as outgroups.
1317              
1318             =cut
1319              
1320             sub mcdonald_kreitman {
1321 6     6 1 5647 my ($self,@args) = @_;
1322 6         40 my ($ingroup, $outgroup,$polarized) =
1323             $self->_rearrange([qw(INGROUP OUTGROUP POLARIZED)],@args);
1324 6         36 my $verbose = $self->verbose;
1325 6         8 my $outgroup_count;
1326 6         10 my $gapchar = '\-';
1327 6 50       31 if( ref($outgroup) =~ /ARRAY/i ) {
    0          
1328 6         12 $outgroup_count = scalar @$outgroup;
1329             } elsif( UNIVERSAL::isa($outgroup,'Bio::PopGen::PopulationI') ) {
1330 0         0 $outgroup_count = $outgroup->get_number_individuals;
1331             } else {
1332 0         0 $self->throw("Expected an ArrayRef of Individuals OR a Bio::PopGen::PopulationI");
1333             }
1334            
1335 6 100       22 if( $polarized ) {
    50          
    50          
1336 2 50       9 if( $outgroup_count < 2 ) {
1337 0         0 $self->throw("Need 2 outgroups with polarized option\n");
1338             }
1339             } elsif( $outgroup_count > 1 ) {
1340 0         0 $self->warn(sprintf("%s outgroup sequences provided, but only first will be used",$outgroup_count ));
1341             } elsif( $outgroup_count == 0 ) {
1342 0         0 $self->throw("No outgroup sequence provided");
1343             }
1344            
1345 6         38 my $codon_path = Bio::MolEvol::CodonModel->codon_path;
1346            
1347 6         26 my (%marker_names,%unique,@inds);
1348 6         21 for my $p ( $ingroup, $outgroup) {
1349 12 50       64 if( ref($p) =~ /ARRAY/i ) {
1350 12         38 push @inds, @$p;
1351             } else {
1352 0         0 push @inds, $p->get_Individuals;
1353             }
1354             }
1355 6         15 for my $i ( @inds ) {
1356 35 50       124 if( $unique{$i->unique_id}++ ) {
1357 0         0 $self->warn("Individual ". $i->unique_id. " is seen more than once in the ingroup or outgroup set\n");
1358             }
1359 35         84 for my $n ( $i->get_marker_names ) {
1360 14126         13556 $marker_names{$n}++;
1361             }
1362             }
1363              
1364 6         196 my @marker_names = keys %marker_names;
1365 6 50       31 if( $marker_names[0] =~ /^(Site|Codon)/ ) {
1366             # sort by site or codon number and do it in
1367             # a schwartzian transformation baby!
1368 0         0 @marker_names = map { $_->[1] }
1369 0         0 sort { $a->[0] <=> $b->[0] }
1370 0         0 map { [$_ =~ /^(?:Codon|Site)-(\d+)/, $_] } @marker_names;
  0         0  
1371             }
1372              
1373              
1374 6         12 my $num_inds = scalar @inds;
1375 6         26 my %vals = ( 'ingroup' => $ingroup,
1376             'outgroup' => $outgroup,
1377             );
1378              
1379             # Make the Codon Table type a parameter!
1380 6         49 my $table = Bio::Tools::CodonTable->new(-id => $codon_table);
1381 6         21 my @vt = qw(outgroup ingroup);
1382 6         12 my %changes;
1383             my %status;
1384 6         23 my %two_by_two = ( 'fixed_N' => 0,
1385             'fixed_S' => 0,
1386             'poly_N' => 0,
1387             'poly_S' => 0);
1388              
1389 6         14 for my $codon ( @marker_names ) {
1390 2436         3100 my (%codonvals);
1391             my %all_alleles;
1392 2436         2634 for my $t ( @vt ) {
1393 4872         4499 my $outcount = 1;
1394 4872         4307 for my $ind ( @{$vals{$t}} ) {
  4872         6632  
1395 14126         24918 my @alleles = $ind->get_Genotypes($codon)->get_Alleles;
1396 14126 50       21118 if( @alleles > 2 ) {
1397 0         0 warn("Codon $codon saw ", scalar @alleles, " alleles for ind ", $ind->unique_id, "\n");
1398 0         0 die;
1399             } else {
1400 14126         17764 my ($allele) = shift @alleles;
1401 14126         23997 $all_alleles{$ind->unique_id} = $allele;
1402 14126         23395 my $AA = $table->translate($allele);
1403 14126 50 66     47202 next if( $AA eq 'X' || $AA eq '*' || $allele =~ /N/i);
      66        
1404              
1405 8971         9827 my $label = $t;
1406 8971 100       13060 if( $t eq 'outgroup' ) {
1407 3244         4012 $label = $t.$outcount++;
1408             }
1409 8971         15124 $codonvals{$label}->{$allele}++;
1410 8971         17455 $codonvals{all}->{$allele}++;
1411             }
1412             }
1413             }
1414 2436         2410 my $total = sum ( values %{$codonvals{'ingroup'}} );
  2436         6686  
1415 2436 100 100     7864 next if( $total && $total < 2 ); # skip sites with < alleles
1416             # process all the seen alleles (codons)
1417             # this is a vertical slide through the alignment
1418 1626 100       1534 if( keys %{$codonvals{all}} <= 1 ) {
  1626         5824  
1419             # no changes or no VALID codons - monomorphic
1420             } else {
1421             # grab only the first outgroup codon (what to do with rest?)
1422 278         281 my ($outcodon) = keys %{$codonvals{'outgroup1'}};
  278         560  
1423 278 50       511 if( ! $outcodon ) {
1424 0         0 $status{"no outgroup codon $codon"}++;
1425 0         0 next;
1426             }
1427 278         511 my $out_AA = $table->translate($outcodon);
1428 278         329 my ($outcodon2) = keys %{$codonvals{'outgroup2'}};
  278         678  
1429 278 50 100     1402 if( ($polarized && ($outcodon ne $outcodon2)) ||
      66        
      66        
1430             $out_AA eq 'X' || $out_AA eq '*' ) {
1431             # skip if outgroup codons are different
1432             # (when polarized option is on)
1433             # or skip if the outcodon is STOP or 'NNN'
1434 90 50       180 if( $verbose > 0 ) {
1435 0         0 $self->debug("skipping $out_AA and $outcodon $outcodon2\n");
1436             }
1437 90         130 $status{'outgroup codons different'}++;
1438 90         365 next;
1439             }
1440              
1441             # check if ingroup is actually different from outgroup -
1442             # if there are the same number of alleles when considering
1443             # ALL or just the ingroup, then there is nothing new seen
1444             # in the outgroup so it must be a shared allele (codon)
1445              
1446             # so we just count how many total alleles were seen
1447             # if this is the same as the number of alleles seen for just
1448             # the ingroup then the outgroup presents no new information
1449              
1450 188         233 my @ingroup_codons = keys %{$codonvals{'ingroup'}};
  188         405  
1451 188         362 my $diff_from_out = ! exists $codonvals{'ingroup'}->{$outcodon};
1452              
1453 188 50       377 if( $verbose > 0 ) {
1454             $self->debug("alleles are in: ", join(",", @ingroup_codons),
1455 0         0 " out: ", join(",", keys %{$codonvals{outgroup1}}),
  0         0  
1456             " diff_from_out=$diff_from_out\n");
1457              
1458 0         0 for my $ind ( sort keys %all_alleles ) {
1459 0         0 $self->debug( "$ind\t$all_alleles{$ind}\n");
1460             }
1461             }
1462             # are all the ingroup alleles the same and diferent from outgroup?
1463             # fixed differences between species
1464 188 100       297 if( $diff_from_out ) {
1465 99 100       167 if( scalar @ingroup_codons == 1 ) {
1466             # fixed differences
1467 92 50       551 if( $outcodon =~ /^$gapchar/ ) {
    50          
1468 0         0 $status{'outgroup codons with gaps'}++;
1469 0         0 next;
1470             } elsif( $ingroup_codons[0] =~ /$gapchar/) {
1471 0         0 $status{'ingroup codons with gaps'}++;
1472 0         0 next;
1473             }
1474 92         284 my $path = $codon_path->{uc $ingroup_codons[0].$outcodon};
1475 92         207 $two_by_two{fixed_N} += $path->[0];
1476 92         113 $two_by_two{fixed_S} += $path->[1];
1477 92 50       490 if( $verbose > 0 ) {
1478 0         0 $self->debug("ingroup is @ingroup_codons outcodon is $outcodon\n");
1479 0         0 $self->debug("path is ",join(",",@$path),"\n");
1480             $self->debug
1481             (sprintf("%-15s fixeddiff - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,$ingroup_codons[0], $outcodon,$out_AA,
1482 0         0 @$path, map { $two_by_two{$_} }
  0         0  
1483             qw(fixed_N fixed_S poly_N poly_S)));
1484             }
1485             } else {
1486             # polymorphic and all are different from outgroup
1487             # Here we find the minimum number of NS subst
1488 7         16 my ($Ndiff,$Sdiff) = (3,0); # most different path
1489 7         13 for my $c ( @ingroup_codons ) {
1490 14 50 33     89 next if( $c =~ /$gapchar/ || $outcodon =~ /$gapchar/);
1491 14         35 my $path = $codon_path->{uc $c.$outcodon};
1492 14         30 my ($tNdiff,$tSdiff) = @$path;
1493 14 100 100     51 if( $path->[0] < $Ndiff ||
      100        
1494             ($tNdiff == $Ndiff &&
1495             $tSdiff <= $Sdiff)) {
1496 10         17 ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1497             }
1498             }
1499 7         18 $two_by_two{fixed_N} += $Ndiff;
1500 7         11 $two_by_two{fixed_S} += $Sdiff;
1501 7 50       16 if( @ingroup_codons > 2 ) {
1502 0         0 $status{"more than 2 ingroup codons $codon"}++;
1503 0         0 warn("more than 2 ingroup codons (@ingroup_codons)\n");
1504             } else {
1505 7         24 my $path = $codon_path->{uc join('',@ingroup_codons)};
1506              
1507 7         13 $two_by_two{poly_N} += $path->[0];
1508 7         12 $two_by_two{poly_S} += $path->[1];
1509 7 50       37 if( $verbose > 0 ) {
1510 0         0 $self->debug(sprintf("%-15s polysite_all - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,@$path, map { $two_by_two{$_} } qw(fixed_N fixed_S poly_N poly_S)));
  0         0  
1511             }
1512             }
1513             }
1514             } else {
1515 89         160 my %unq = map { $_ => 1 } @ingroup_codons;
  181         382  
1516 89         166 delete $unq{$outcodon};
1517 89         190 my @unique_codons = keys %unq;
1518              
1519             # calc path for diff add to poly
1520             # Here we find the minimum number of subst bw
1521             # codons
1522 89         166 my ($Ndiff,$Sdiff) = (3,0); # most different path
1523 89         151 for my $c ( @unique_codons ) {
1524 92         320 my $path = $codon_path->{uc $c.$outcodon };
1525 92 50       166 if( ! defined $path ) {
1526 0         0 die " cannot get path for ", $c.$outcodon, "\n";
1527             }
1528 92         176 my ($tNdiff,$tSdiff) = @$path;
1529 92 50 33     250 if( $path->[0] < $Ndiff ||
      66        
1530             ($tNdiff == $Ndiff &&
1531             $tSdiff <= $Sdiff)) {
1532 92         194 ($Ndiff,$Sdiff) = ($tNdiff,$tSdiff);
1533             }
1534             }
1535              
1536 89 100       162 if( @unique_codons == 2 ) {
1537 3         15 my $path = $codon_path->{uc join('',@unique_codons)};
1538 3 50       12 if( ! defined $path ) {
1539 0         0 $self->throw("no path for @unique_codons\n");
1540             }
1541 3         7 $Ndiff += $path->[0];
1542 3         6 $Sdiff += $path->[1];
1543             }
1544 89         167 $two_by_two{poly_N} += $Ndiff;
1545 89         120 $two_by_two{poly_S} += $Sdiff;
1546 89 50       480 if( $verbose > 0 ) {
1547             $self->debug(sprintf("%-15s polysite - %s;%s(%s) %d,%d\tNfix=%d Sfix=%d Npoly=%d Spoly=%s\n",$codon,join(',',@ingroup_codons), $outcodon,$out_AA,
1548 0         0 $Ndiff, $Sdiff, map { $two_by_two{$_} }
  0         0  
1549             qw(fixed_N fixed_S poly_N poly_S)));
1550             }
1551             }
1552             }
1553             }
1554             return ( $two_by_two{'poly_N'},
1555             $two_by_two{'fixed_N'},
1556             $two_by_two{'poly_S'},
1557 6         93 $two_by_two{'fixed_S'},
1558             {%status});
1559            
1560             }
1561              
1562             *MK = \&mcdonald_kreitman;
1563              
1564              
1565             =head2 mcdonald_kreitman_counts
1566              
1567             Title : mcdonald_kreitman_counts
1568             Usage : my $MK = $statistics->mcdonald_kreitman_counts(
1569              
1570             N_poly -> integer of count of non-syn polymorphism
1571             N_fix -> integer of count of non-syn fixed substitutions
1572             S_poly -> integer of count of syn polymorphism
1573             S_fix -> integer of count of syn fixed substitutions
1574             );
1575             Function:
1576             Returns : decimal number
1577             Args :
1578              
1579             =cut
1580              
1581              
1582             sub mcdonald_kreitman_counts {
1583 0     0 1   my ($self,$Npoly,$Nfix,$Spoly,$Sfix) = @_;
1584 0 0         if( $has_twotailed ) {
1585 0           return &Text::NSP::Measures::2D::Fisher2::twotailed::calculateStatistic
1586             (n11=>$Npoly,
1587             n1p=>$Npoly+$Spoly,
1588             np1=>$Npoly+$Nfix,
1589             npp=>$Npoly+$Nfix+$Spoly+$Sfix);
1590             } else {
1591 0           $self->warn("cannot call mcdonald_kreitman_counts because no Fisher's exact is available - install Text::NSP::Measures::2D::Fisher2::twotailed");
1592 0           return 0;
1593             }
1594             }
1595              
1596              
1597             1;