File Coverage

Bio/PopGen/PopStats.pm

Criterion	Covered	Total	%
statement	67	80	83.7
branch	8	14	57.1
condition	6	10	60.0
subroutine	5	5	100.0
pod	3	3	100.0
total	89	112	79.4

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::PopGen::PopStats
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Jason Stajich
7							#
8							# Copyright Jason Stajich
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::PopGen::PopStats - A collection of methods for calculating
17							statistics about a population or sets of populations
18
19							=head1 SYNOPSIS
20
21							use Bio::PopGen::PopStats;
22							my $stats = Bio::PopGen::PopStats->new(); # add -haploid => 1
23							# to process haploid data
24
25							=head1 DESCRIPTION
26
27							Calculate various population structure statistics, most notably Wright's Fst.
28
29							=head1 FEEDBACK
30
31							=head2 Mailing Lists
32
33							User feedback is an integral part of the evolution of this and other
34							Bioperl modules. Send your comments and suggestions preferably to
35							the Bioperl mailing list. Your participation is much appreciated.
36
37							bioperl-l@bioperl.org - General discussion
38							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
39
40							=head2 Support
41
42							Please direct usage questions or support issues to the mailing list:
43
44							I
45
46							rather than to the module maintainer directly. Many experienced and
47							reponsive experts will be able look at the problem and quickly
48							address it. Please include a thorough description of the problem
49							with code and data examples if at all possible.
50
51							=head2 Reporting Bugs
52
53							Report bugs to the Bioperl bug tracking system to help us keep track
54							of the bugs and their resolution. Bug reports can be submitted via
55							the web:
56
57							https://github.com/bioperl/bioperl-live/issues
58
59							=head1 AUTHOR - Jason Stajich
60
61							Email jason-at-bioperl.org
62
63							=head1 CONTRIBUTORS
64
65							Matthew Hahn, matthew.hahn-at-duke.edu
66
67							=head1 APPENDIX
68
69							The rest of the documentation details each of the object methods.
70							Internal methods are usually preceded with a _
71
72							=cut
73
74
75							# Let the code begin...
76
77
78							package Bio::PopGen::PopStats;
79	1			1		650	use strict;
	1					1
	1					26
80
81							# Object preamble - inherits from Bio::Root::Root
82
83
84
85	1			1		3	use base qw(Bio::Root::Root);
	1					0
	1					602
86
87							=head2 new
88
89							Title : new
90							Usage : my $obj = Bio::PopGen::PopStats->new();
91							Function: Builds a new Bio::PopGen::PopStats object
92							Returns : an instance of Bio::PopGen::PopStats
93							Args : -haploid => 1 (if want to use haploid calculations)
94
95
96							=cut
97
98							sub new {
99	2			2	1	10	my($class,@args) = @_;
100
101	2					10	my $self = $class->SUPER::new(@args);
102	2					8	my ($haploid) = $self->_rearrange([qw(HAPLOID)],@args);
103	2	50				5	if( $haploid ) { $self->haploid_status(1) }
	2					5
104	2					4	return $self;
105							}
106
107
108							=head2 haploid_status
109
110							Title : haploid_status
111							Usage : $obj->haploid_status($newval)
112							Function: Boolean value for whether or not to do haploid
113							or diploid calculations, where appropriate
114							Returns : Boolean
115							Args : on set, new boolean value optional)
116
117
118							=cut
119
120							sub haploid_status{
121	174			174	1	122	my $self = shift;
122	174	100				240	return $self->{'haploid_status'} = shift if @_;
123	172					245	return $self->{'haploid_status'};
124							}
125
126
127							# Implementation provided my Matthew Hahn, massaged by Jason Stajich
128
129							=head2 Fst
130
131							Title : Fst
132							Usage : my $fst = $stats->Fst(\@populations,\@markernames)
133							Function: Calculate Wright's Fst based on a set of sub-populations
134							and specific markers
135							Returns : Fst value (a value between 0 and 1)
136							Args : Arrayref of populations to process
137							Arrayref of marker names to process
138							Note : Based on diploid method in Weir BS, Genetics Data Analysis II, 1996
139							page 178.
140
141							=cut
142
143							#' make emacs happy here
144							sub Fst {
145	8			8	1	3167	my ($self,$populations,$markernames) = @_;
146
147	8	50	33			79	if( ! defined $populations \|\|
		50	33
148							ref($populations) !~ /ARRAY/i ) {
149	0					0	$self->warn("Must provide a valid arrayref for populations");
150	0					0	return;
151							} elsif( ! defined $markernames \|\|
152							ref($markernames) !~ /ARRAY/i ) {
153	0					0	$self->warn("Must provide a valid arrayref for marker names");
154	0					0	return;
155							}
156	8					11	my $num_sub_pops = scalar @$populations;
157
158	8	50				14	if( $num_sub_pops < 2 ) {
159	0					0	$self->warn("Must provide at least 2 populations for this test, you provided $num_sub_pops");
160	0					0	return;
161							}
162
163							# This code assumes that pop 1 contains at least one of all the
164							# alleles - need to do some more work to insure that the complete
165							# set of alleles is seen.
166	8					7	my $Fst;
167	8					6	my ($TS_sub1,$TS_sub2);
168
169	8					10	foreach my $marker ( @$markernames ) {
170							# Get all the alleles from all the genotypes in all subpopulations
171	84					60	my %allAlleles;
172	84					75	foreach my $allele ( map { $_->get_Alleles() }
	1440					1543
173	196					276	map { $_->get_Genotypes($marker) } @$populations ){
174	1440					1011	$allAlleles{$allele}++;
175							}
176	84					227	my @alleles = keys %allAlleles;
177
178	84					92	foreach my $allele_name ( @alleles ) {
179	172					134	my $avg_samp_size = 0; # n-bar
180	172					132	my $avg_allele_freq = 0; # p-tilda-A-dot
181
182	172					115	my $total_samples_squared = 0; #
183	172					97	my $sum_heterozygote = 0;
184
185	172					141	my @marker_freqs;
186
187							# Walk through each population, get the calculated allele frequencies
188							# for the marker, do some bookkeeping
189
190
191	172					136	foreach my $pop ( @$populations ) {
192	405					560	my $s = $pop->get_number_individuals($marker);
193
194	405					304	$avg_samp_size += $s;
195	405					344	$total_samples_squared += $s**2;
196
197	405					521	my $markerobj = $pop->get_Marker($marker);
198	405	50				537	if( ! defined $markerobj ) {
199	0					0	$self->warn("Could not derive Marker for $marker ".
200							"from population ". $pop->name);
201	0					0	return;
202							}
203
204	405					530	my $freq_homozygotes =
205							$pop->get_Frequency_Homozygotes($marker,$allele_name);
206	405					665	my %af = $markerobj->get_Allele_Frequencies();
207	405		100			812	my $all_freq = ( ($af{$allele_name} \|\| 0));
208
209	405					354	$avg_allele_freq += $s * $all_freq;
210	405					383	$sum_heterozygote += (2 * $s)*( $all_freq - $freq_homozygotes);
211
212	405					591	push @marker_freqs, \%af;
213							}
214	172					137	my $total_samples = $avg_samp_size; # sum of n over i sub-populations
215	172					113	$avg_samp_size /= $num_sub_pops;
216	172					121	$avg_allele_freq /= $total_samples;
217
218							# n-sub-c
219	172					178	my $adj_samp_size = ( 1/ ($num_sub_pops - 1)) *
220							( $total_samples - ( $total_samples_squared/$total_samples));
221
222	172					130	my $variance = 0; # s-squared-sub-A
223	172					93	my $sum_variance = 0;
224	172					105	my $i = 0; # we have cached the marker info
225	172					193	foreach my $pop ( @$populations ) {
226	405					498	my $s = $pop->get_number_individuals($marker);
227	405					285	my %af = %{$marker_freqs[$i++]};
	405					824
228	405		100			1057	$sum_variance += $s * (( ($af{$allele_name} \|\| 0) -
229							$avg_allele_freq)**2);
230							}
231	172					190	$variance = ( 1 / (( $num_sub_pops-1)$avg_samp_size))$sum_variance;
232
233							# H-tilda-A-dot
234	172					148	my $freq_heterozygote = ($sum_heterozygote / $total_samples);
235
236	172	50				240	if( $self->haploid_status ) {
237							# Haploid calculations
238
239	172					250	my $T_sub1 = $variance -
240							( ( 1/($avg_samp_size-1))*
241							( ($avg_allele_freq*(1-$avg_allele_freq))-
242							( (($num_sub_pops-1)/$num_sub_pops)*$variance)));
243	172					247	my $T_sub2 = ( (($adj_samp_size-1)/($avg_samp_size-1))*
244							$avg_allele_freq*(1-$avg_allele_freq) ) +
245							( 1 + ( (($num_sub_pops-1)*
246							($avg_samp_size-$adj_samp_size))/
247							($avg_samp_size - 1))) *
248							($variance/$num_sub_pops);
249
250
251							#to get total Fst from all alleles (if more than two) or all
252							#loci (if more than one), we need to calculate $T_sub1 and
253							#$T_sub2 for all alleles for all loci, sum, and then divide
254							#again to get Fst.
255	172					126	$TS_sub1 += $T_sub1;
256	172					395	$TS_sub2 += $T_sub2;
257
258							} else {
259	0					0	my $S_sub1 = $variance - ( (1/($avg_samp_size-1))*
260							( ($avg_allele_freq*
261							(1-$avg_allele_freq)) -
262							((($num_sub_pops-1)/$num_sub_pops)*
263							$variance)-0.25*$freq_heterozygote ) );
264	0					0	my $S_sub2 = ($avg_allele_freq*(1-$avg_allele_freq)) -
265							( ($avg_samp_size/($num_sub_pops($avg_samp_size-1)))
266							( ((($num_sub_pops*($avg_samp_size- $adj_samp_size))/
267							$avg_samp_size)$avg_allele_freq
268							(1-$avg_allele_freq)) -
269							( (1/$avg_samp_size)* (($avg_samp_size-1)+
270							($num_sub_pops-1)*
271							($avg_samp_size-
272							$adj_samp_size) )*$variance ) -
273							( (($num_sub_pops*($avg_samp_size-$adj_samp_size))/
274							(4$avg_samp_size$adj_samp_size))*
275							$freq_heterozygote ) ) );
276
277	0					0	my $S_sub3 = ($adj_samp_size/(2$avg_samp_size))
278							$freq_heterozygote;
279
280							#Again, to get the average over many alleles or many loci,
281							#we will have to run the above for each and then sum the $S
282							#variables and recalculate the F statistics
283	0					0	$TS_sub1 += $S_sub1;
284	0					0	$TS_sub2 += $S_sub2;
285							}
286							}
287							}
288							# $Fst_diploid = $S_sub1/$S_sub2;
289							#my $Fit_diploid = 1 - ($S_sub3/$S_sub2);
290							#my $Fis_diploid = ($Fit_diploid-$Fst_diploid)/(1-$Fst_diploid);
291	8					9	$Fst = $TS_sub1 / $TS_sub2;
292
293	8					13	return $Fst;
294							}
295
296							1;