File Coverage

blib/lib/Statistics/MVA/BayesianDiscrimination.pm

Criterion	Covered	Total	%
statement	24	191	12.5
branch	0	40	0.0
condition	0	9	0.0
subroutine	8	16	50.0
pod	3	3	100.0
total	35	259	13.5

line	stmt	bran	cond	sub	pod	time	code
1							package Statistics::MVA::BayesianDiscrimination;
2
3	1			1		49066	use warnings;
	1					3
	1					28
4	1			1		4	use strict;
	1					2
	1					29
5	1			1		4	use Carp;
	1					6
	1					81
6	1			1		1195	use Statistics::MVA;
	1					44870
	1					37
7	1			1		953	use Math::Cephes qw/:explog/;
	1					8168
	1					369
8	1			1		10	use List::Util qw/sum/;
	1					3
	1					74
9	1			1		1009	use Text::SimpleTable;
	1					2197
	1					72
10
11							#=fs pod stuff
12							=head1 NAME
13
14							Statistics::MVA::BayesianDiscrimination - Two-Sample Linear Discrimination Analysis with Posterior Probability Calculation.
15
16							=cut
17
18							=head1 VERSION
19
20							This document describes Statistics::MVA::BayesianDiscrimination version 0.0.2
21
22							=cut
23
24							=head1 DESCRIPTION
25
26							Discriminant analysis is a procedure for classifying a set of observations each with k variables into predefined classes such as to allow the
27							determination of the class of new observations based upon the values of the k variables for these new observations. Group membership based on linear combinations of the variables. From the set of observations where group membership is know the procedure constructs a set of linear functions, termed
28							discriminant functions, such that:
29
30							L = B[0] + B[1] * x1 + B[2] * x2 +... ... + B[n] * x_n
31
32							Where B[0] is a constant, B[n's] are discriminant coefficients and x's are the input variables. These discriminant functions
33							(there is one for each group - consequently as this module only analyses data for two groups atm it generates two such
34							discriminant functions.
35
36							Before proceeding with the analysis you should: (1) Perform Bartlett´s test to see if the covariance matrices of the data
37							are homogenous for the populations used (see L. If they are not homogenous you should use Quadratic Discrimination analysis.
38							(2) test for equality of the group means using Hotelling's T^2 (see L or MANOVA. If
39							the groups do not differ significantly it is extremely unlikely that discrimination analysis with generate any useful
40							discrimination rules. (3) Specify the prior probabilities. This module allows you to do this in several ways - see
41							L.
42
43							This class automatically generates the discrimination coefficients at part of object construction. You can then either
44							use the C method to access these values or use the C method to apply the equations to a new
45							observation. Both of these methods are context dependent - see L. See
46							http://en.wikipedia.org/wiki/Linear_discriminant_analysis for further details.
47
48							=cut
49
50							=head1 SYNOPSIS
51
52							# we have two groups of data each with 3 variables and 10 observations - example data from http://www.stat.psu.edu/online/courses/stat505/data/insect.txt
53							my $data_X = [
54							[qw/ 191 131 53/],
55							[qw/ 185 134 50/],
56							[qw/ 200 137 52/],
57							[qw/ 173 127 50/],
58							[qw/ 171 128 49/],
59							[qw/ 160 118 47/],
60							[qw/ 188 134 54/],
61							[qw/ 186 129 51/],
62							[qw/ 174 131 52/],
63							[qw/ 163 115 47/],
64							];
65
66							my $data_Y = [
67							[qw/ 186 107 49/],
68							[qw/ 211 122 49/],
69							[qw/ 201 144 47/],
70							[qw/ 242 131 54/],
71							[qw/ 184 108 43/],
72							[qw/ 211 118 51/],
73							[qw/ 217 122 49/],
74							[qw/ 223 127 51/],
75							[qw/ 208 125 50/],
76							[qw/ 199 124 46/],
77							];
78
79							use Statistics::MVA::BayesianDiscrimination;
80
81							# Pass the data as a list of the two LISTS-of-LISTS above (termed X and Y). The module by default assumes equal prior probabilities.
82							#my $bld = Statistics::MVA::BayesianDiscrimination->new($data_X,$data_Y);
83
84							# Pass the data but telling the module to calculate the prior probabilities as the ratio of observations for the two groups (e.g. P(X) X_obs_num / Total_obs.
85							#my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => 1 },$data_X,$data_Y);
86
87							# Pass the data but directly specifying the values of prior probability for X and Y to use as an anonymous array.
88							#my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => [ 0.25, 0.75 ] },$ins_a,$ins_b);
89
90							# Print values for coefficients to STDOUT.
91							$bld->output;
92
93							# Pass the values as an ARRAY reference by calling in LIST context - see L.
94							my ($prior_x, $constant_x, $matrix_x, $prior_y, $constant_y, $matrix_y) = $bld->output;
95
96							# Perform discriminantion analyis for a specific observation and print result to STDOUT.
97							$bld->discriminate([qw/184 114 59/]);
98
99							# Call in LIST context to obtain results directly - see L.
100							my ($val_x, $p_x, $post_p_x, $val_y, $p_y, $post_p_y, $type) = $bld->discriminate([qw/194 124 49/]);
101
102							=cut
103
104							=head1 METHODS
105							=cut
106
107							=head2 new
108
109							Creates a new Statistics::MVA::BayesianDiscrimination. This accepts two references for List-of-Lists of values
110							corresponding to the two groups of data - termed X and Y. Within each List-of-Lists each nested array corresponds to a single set of observations.
111							It also accepts an optional HASH reference of options preceding these values. The constructor automatically generates
112							the discrimination coefficients that are accessed using the C method.
113
114							# Pass data as ARRAY references.
115							my $bld = Statistics::MVA::BayesianDiscrimination->new($data_X,$data_Y);
116
117							# Passing optional HASH reference of options.
118							my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => 1 },$data_X,$data_Y);
119
120							=cut
121
122							=head2 output
123
124							Context-dependent method for accessing results of discrimination analysis. In void context it prints the coefficients to STDOUT.
125
126							$bld->output;
127
128							In LIST-context it returns a list of the relevant data accessed as follows:
129
130							my ($prior_x, $constant_x, $matrix_x, $prior_y, $constant_y, $matrix_y) = $bld->output;
131
132							print qq{\nPrior probability of X = $prior_x and Y = $prior_y.};
133							print qq{\nConstants for discrimination function for X = $constant_x and Y = $constant_y.};
134							print qq{\nCoefficients for discrimination function X = @{$matrix_x}.};
135							print qq{\nCoefficients for discrimination function Y = @{$matrix_y}.};
136
137							=cut
138
139							=head2 discriminate
140
141							Method for classification of a new observation. Pass it an ARRAY reference of SCALAR values appropriate for the original
142							data-sets passed to the constructor. In void context it prints a report to STDOUT:
143
144							$bld->discriminate([qw/123 34 325/];
145
146							In LIST-context it returns a list of the relevant data as follows:
147
148							my ($val_x, $p_x, $post_p_x, $val_y, $p_y, $post_p_y, $type) = $bld->discriminate([qw/123 34 325/];
149
150							print qq{\nLinear score function for X = $val_x and Y = $val_y - the new observation is of type \x27$type\x27.};
151							print qq{\nThe prior probability that the new observation is of type X = $p_x and the posterior probability = $post_p_x};
152							print qq{\nThe prior probability that the new observation is of type X = $p_y and the posterior probability = $post_p_y};
153
154							=cut
155
156							=head1 OPTIONS
157							=cut
158
159							=head2 priors
160
161							Pass within an anonymous HASH preceding the two data references during object construction:
162
163							my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => option_value },$data_X,$data_Y);
164
165							Passing '0' causes the module to assume equal prior probabilities for the two groups (prior_x = prior_y = 0.5). Passing
166							'1' causes the module to generate priors depending on the ratios of the two data-sets e.g. X has 15 observations and Y
167							has 27 observations gives prior_x = 15 / (15 + 27). Alternatively you may specify the values to use by passing an anonymous ARRAY reference of length 2
168							where the first value is prior_x and the second is prior_y. There are currently no checks on priors directly passed so
169							ensure that prior_x + prior_y = 1 if you supply you own.
170
171							# Use prior_x = prior_y = 0.5.
172							my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => 0 },$data_X,$data_Y);
173
174							# Generate priors depending on rations of observation numbers.
175							my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => 1 },$data_X,$data_Y);
176
177							# Specify your own priors.
178							my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => [$prior_x, $prior_y] },$data_X,$data_Y);
179
180							=cut
181
182							#=fe
183
184	1			1		6	use version; our $VERSION = qv('0.0.2');
	1					3
	1					9
185
186							#/ with these new modules object creation directly checks data and performs analysis. storing just the essential results. can access directly or print them
187
188							sub new {
189	0			0	1		my $class = shift;
190							#y grab an options hash if there is one
191	0	0					my $options_ref = shift if ( ref $_[0] eq q{HASH} );
192							#y the rest is data
193	0						my @data = @_;
194	0						my $p_x;
195							my $p_y;
196
197	0						my $self = [];
198
199	0						my $k = scalar @data;
200	0	0					croak qq{\nThis only accepts two groups of data} if $k != 2;
201							#y if there´s an options hash with priors check it - otherwise default...
202	0	0					if ( defined $options_ref ) { ($p_x, $p_y) = &_priors($options_ref, @data) }
	0
203							else {
204	0						print qq{\nDefaulting to priors of 0.5 for both x and y.};
205	0						$p_x = 0.5;
206	0						$p_y = 0.5;
207							}
208							#y check they have good matrix format - need to check compatibility too
209	0	0					croak qq{\nbad data} if &Statistics::MVA::CVMat::_check($data[0]);
210	0	0					croak qq{\nbad data} if &Statistics::MVA::CVMat::_check($data[1]);
211							#y data is passed in table format - i.e. nested arrays (rows) correspond to observations and not variables
212	0						my $lol1 = &Statistics::MVA::CVMat::transpose($data[0]);
213	0						my $lol2 = &Statistics::MVA::CVMat::transpose($data[1]);
214							#y get variable number
215	0						my $p = scalar @{$lol1};
	0
216	0	0					croak qq{\nThese data sets are not compatible} if ($p != scalar @{$lol2});
	0
217							#y combine the data for overall means calculations - if using single data with names could use adjust in MVA
218	0						my $full = [];
219	0						$full = &_combine($lol1,$lol2, $p);
220							#y copy vars and we will do in place mean calculation - no need to deep copy references here
221							#my $first = &_deep_copy($lol1);
222	0						my $first = [@{$lol1}];
	0
223	0						my $second = [@{$lol2}];
	0
224							#y in place calculation of the means
225	0						&_means($full,$p);
226	0						&_means($first,$p);
227	0						&_means($second,$p);
228							#y in place subtraction of means from raw data
229	0						&_subtract($lol1, $full);
230	0						&_subtract($lol2, $full);
231							#y for MVA cv_calculations the data needs to be in table format - this is wastefull as it just transpose - thus is unecessary transposes - IN PLACE - should modify this?!?
232	0						$lol1 = &Statistics::MVA::CVMat::transpose($lol1);
233	0						$lol2 = &Statistics::MVA::CVMat::transpose($lol2);
234	0						my $mva;
235							#y if we have a twat who likes this annoying method
236	0	0	0				if (defined $options_ref && defined $options_ref->{cv} && $options_ref->{cv} == 0) { $mva = &_twats($lol1, $lol2); }
	0		0
237							else {
238							#y create MVA object - these are defaults parameters so don´t need to pass them...
239	0						$mva = Statistics::MVA->new([ $lol1, $lol2 ], {standardise => 0, divisor => 1});
240							}
241							#r CV matrix data - loose last [0] for realmatrix object
242							#print Dumper $mva->[0][0][0][0];
243							#print Dumper $mva->[0][1][0][0];
244							#y create empty matrix of same dimensions
245	0						my $c = $mva->[0][0][0]->shadow;
246
247							#/ its 2x not divided by priors?!? - only constant takes into account priors
248							#$mva->[0][0][0]->multiply_scalar($mva->[0][0][0],$p_x);
249							#$mva->[0][1][0]->multiply_scalar($mva->[0][1][0],$p_y);
250	0						$mva->[0][0][0]->multiply_scalar($mva->[0][0][0],0.5);
251	0						$mva->[0][1][0]->multiply_scalar($mva->[0][1][0],0.5);
252
253	0						$c->add($mva->[0][0][0],$mva->[0][1][0]);
254
255							#y no need for transpose just build from rows...
256	0						my $averages_a = Math::MatrixReal->new_from_rows([$first]);
257	0						my $averages_b = Math::MatrixReal->new_from_rows([$second]);
258							#y calculate b_0 - constant discrimination coefficients
259	0						my $constant_x = 0.5 * $averages_a * $c->inverse * ~$averages_a;
260	0						my $constant_y = 0.5 * $averages_b * $c->inverse * ~$averages_b;
261
262							#/ you must add the priors to the constant - only the constant takes into account the priors.
263							#$constant_x->[0][0][0] = $constant_x->[0][0][0] - log($p_x);
264							#$constant_y->[0][0][0] = $constant_y->[0][0][0] - log($p_y);
265							#y now should be -0.5...
266	0						$constant_x->[0][0][0] = -$constant_x->[0][0][0] + log($p_x);
267	0						$constant_y->[0][0][0] = -$constant_y->[0][0][0] + log($p_y);
268
269							#y calculate the rest of the discrimination coeficients
270							#~$averages_a * $c->inverse * $averages_a;
271	0						my $matrix_x = $averages_a * $c->inverse;
272	0						my $matrix_y = $averages_b * $c->inverse;
273
274							#y feed the objectvv- this is only done for persistance - otherwise we loose the data...
275	0						$self->[0][0][0] = $constant_x;
276	0						$self->[0][0][1] = $matrix_x;
277	0						$self->[0][1][0] = $constant_y;
278	0						$self->[0][1][1] = $matrix_y;
279	0						$self->[1][0] = $p_x;
280	0						$self->[1][1] = $p_y;
281	0						$self->[2] = $p;
282
283	0						bless $self, $class;
284	0						return $self;
285							}
286
287							sub _priors {
288	0			0			my ($options_ref, @data) = @_;
289	0						my $p_x;
290							my $p_y;
291	0	0					if ($options_ref->{priors} == 1 ) {
		0
		0
292	0						my $n_x = scalar @{$data[0]};
	0
293	0						my $n_y = scalar @{$data[1]};
	0
294	0						my $n_total = $n_x + $n_y;
295	0						$p_x = sprintf (q{%.5f}, $n_x / $n_total);
296	0						$p_y = sprintf (q{%.5f}, $n_y / $n_total);
297							}
298							elsif (ref $options_ref->{priors} eq q{ARRAY} ) {
299	0						my @priors = @{$options_ref->{priors}};
	0
300	0	0					croak qq{\nIf passing an ARRAY ref of priors it must have length two.} if ( scalar @priors != 2 );
301	0	0	0				croak qq{\nThe priors must sum to 1.} if ( $priors[0] + $priors[1] < 0.95 \|\| $priors[0] + $priors[1] > 1.05 );
302	0						$p_x = $priors[0];
303	0						$p_y = $priors[1];
304							}
305							elsif ($options_ref->{priors} == 0) {
306	0						$p_x = 0.5;
307	0						$p_y = 0.5;
308							}
309	0						else { croak qq{\nI do not recognise that value for the priors option} }
310	0						return ($p_x, $p_y);
311							}
312
313							sub _subtract {
314	0			0			my ($lol, $full) = @_;
315	0						for my $var (0..$#{$lol}) {
	0
316	0						for my $cell (0..$#{$lol->[$var]}) {
	0
317	0						$lol->[$var][$cell] = $lol->[$var][$cell] - $full->[$var];
318							}
319							}
320							}
321
322							sub _combine{
323	0			0			my ($lol1, $lol2, $p) = @_;
324	0						my $full = [];
325	0						for my $i (0..$p-1) {
326	0						push @{$full->[$i]}, @{$lol1->[$i]}, @{$lol2->[$i]};
	0
	0
	0
327							}
328	0						return $full;
329							}
330
331							sub _means {
332	0			0			my ($full, $p) = @_;
333	0						for my $i (0..$p-1) {
334	0						my $sum = sum @{$full->[$i]};
	0
335	0						my $n = scalar @{$full->[$i]};
	0
336	0						$full->[$i] = $sum/$n;
337							}
338							}
339
340							sub _twats {
341	0			0			my ($lol1, $lol2) = @_;
342	0						my $lol1_mat = Math::MatrixReal->new_from_rows($lol1);
343	0						my $n_1 = scalar @{$lol1};
	0
344	0						$lol1_mat = (~$lol1_mat * $lol1_mat) / $n_1;
345	0						my $lol2_mat = Math::MatrixReal->new_from_rows($lol2);
346	0						my $n_2 = scalar @{$lol2};
	0
347	0						$lol2_mat = (~$lol2_mat * $lol2_mat) / $n_2;
348	0						my $mva = [];
349	0						$mva->[0][0][0] = $lol1_mat;
350	0						$mva->[0][1][0] = $lol2_mat;
351	0						print $mva->[0][0][0];
352	0						print $mva->[0][1][0];
353	0						return $mva;
354							}
355
356							sub output {
357	0			0	1		my $self = shift;
358	0						my $constant_x = $self->[0][0][0];
359	0						my $constant_y = $self->[0][1][0];
360	0						my $matrix_x = $self->[0][0][1];
361	0						my $matrix_y = $self->[0][1][1];
362	0						my $p_x = $self->[1][0];
363	0						my $p_y = $self->[1][1];
364	0						my $p_var_num = $self->[2];
365	0	0					if (!wantarray) {
366	0						print qq{\nTwo-groups with $p_var_num variables each and prior probabilities of p(x) = $p_x and p(y) = $p_y.\n\n};
367	0						my @config = ( [17, q{}], [15, q{X}], [15, q{Y}] );
368	0						my $tbl = Text::SimpleTable->new(@config);
369	0						$tbl->row( qq{Constant B[0]}, sprintf(q{%.5f}, $constant_x->[0][0][0]), sprintf(q{%.5f}, $constant_y->[0][0][0]) );
370	0						$tbl->hr;
371	0						for my $row (0..$#{$matrix_x->[0][0]}) {
	0
372	0						my $coeff = $row+1;
373	0						$tbl->row( qq{Coefficient B[$coeff]}, sprintf(q{%.5f}, $matrix_x->[0][0][$row]), sprintf(q{%.5f}, $matrix_y->[0][0][$row]) );
374	0	0					$tbl->hr if $row != $#{$matrix_x->[0][0]};
	0
375							}
376	0						print $tbl->draw;
377	0						return;
378							}
379	0						else { return ( $p_x, $constant_x->[0][0][0], $matrix_x->[0][0], $p_y, $constant_y->[0][0][0], $matrix_y->[0][0] ); }
380							}
381
382							sub discriminate {
383	0			0	1		my ($self, $ex) = @_;
384	0						my $context = wantarray;
385	0						my $constant_x = $self->[0][0][0];
386	0						my $constant_y = $self->[0][1][0];
387	0						my $matrix_x = $self->[0][0][1];
388	0						my $matrix_y = $self->[0][1][1];
389	0						my $p_x = $self->[1][0];
390	0						my $p_y = $self->[1][1];
391	0						my $p_var_num = $self->[2];
392
393	0	0					croak qq{\nData must be passed as an ARRAY ref} if (ref $ex ne q{ARRAY});
394	0	0					croak qq{\nThis example has the wrong variable number} if scalar @{$ex} != $p_var_num;
	0
395
396	0						my $example = Math::MatrixReal->new_from_cols([$ex]);
397	0						my $val_x = ( $matrix_x * $example ) + $constant_x;#; + log($p_x);
398
399							#r when adding log(p) its the linear score function and not simply the linear discrimination result
400							#$val_x = $val_x->[0][0][0];#+log($p_x);
401	0						$val_x = $val_x->[0][0][0];
402	0						my $val_y = ( $matrix_y * $example ) + $constant_y;#; + log($p_x);
403							#$val_y = $val_y->[0][0][0];#+log($p_y);
404	0						$val_y = $val_y->[0][0][0];
405
406	0	0					my $type = $val_x > $val_y ? q{X}
		0
407							: $val_y > $val_x ? q{Y}
408							: undef;
409
410	0						my $post_p_x = exp($val_x) / (exp($val_x) + exp($val_y));
411	0						my $post_p_y = exp($val_y) / (exp($val_x) + exp($val_y));
412
413	0	0					if (!$context) {
414	0						print qq{\nLinear score function for x = $val_x\nLinear score function for y = $val_y\n};
415	0	0					if ( defined $type ) { print qq{\nExample (@{$ex}) is a \x27$type\x27.} }
	0
	0
416	0						else { print qq{\nCannot distinguish which group the example belongs to.} }
417	0						print qq{\n\nPrior probability of being an x = $p_x\nPosterior probability of being an x = $post_p_x\n};
418	0						print qq{\nPrior probability of being a y = $p_y\nPosterior probability of being a y = $post_p_y\n};
419	0						return;
420							}
421							else {
422	0						return ($val_x, $p_x, $post_p_x, $val_y, $p_y, $post_p_y, $type);
423							}
424							}
425
426							1; # Magic true value required at end of module
427
428							__END__