File Coverage

blib/lib/Statistics/MVA/BayesianDiscrimination.pm
Criterion Covered Total %
statement 24 191 12.5
branch 0 40 0.0
condition 0 9 0.0
subroutine 8 16 50.0
pod 3 3 100.0
total 35 259 13.5


line stmt bran cond sub pod time code
1             package Statistics::MVA::BayesianDiscrimination;
2              
3 1     1   49066 use warnings;
  1         3  
  1         28  
4 1     1   4 use strict;
  1         2  
  1         29  
5 1     1   4 use Carp;
  1         6  
  1         81  
6 1     1   1195 use Statistics::MVA;
  1         44870  
  1         37  
7 1     1   953 use Math::Cephes qw/:explog/;
  1         8168  
  1         369  
8 1     1   10 use List::Util qw/sum/;
  1         3  
  1         74  
9 1     1   1009 use Text::SimpleTable;
  1         2197  
  1         72  
10              
11             #=fs pod stuff
12             =head1 NAME
13              
14             Statistics::MVA::BayesianDiscrimination - Two-Sample Linear Discrimination Analysis with Posterior Probability Calculation.
15              
16             =cut
17              
18             =head1 VERSION
19              
20             This document describes Statistics::MVA::BayesianDiscrimination version 0.0.2
21              
22             =cut
23              
24             =head1 DESCRIPTION
25              
26             Discriminant analysis is a procedure for classifying a set of observations each with k variables into predefined classes such as to allow the
27             determination of the class of new observations based upon the values of the k variables for these new observations. Group membership based on linear combinations of the variables. From the set of observations where group membership is know the procedure constructs a set of linear functions, termed
28             discriminant functions, such that:
29              
30             L = B[0] + B[1] * x1 + B[2] * x2 +... ... + B[n] * x_n
31              
32             Where B[0] is a constant, B[n's] are discriminant coefficients and x's are the input variables. These discriminant functions
33             (there is one for each group - consequently as this module only analyses data for two groups atm it generates two such
34             discriminant functions.
35              
36             Before proceeding with the analysis you should: (1) Perform Bartlett´s test to see if the covariance matrices of the data
37             are homogenous for the populations used (see L. If they are not homogenous you should use Quadratic Discrimination analysis.
38             (2) test for equality of the group means using Hotelling's T^2 (see L or MANOVA. If
39             the groups do not differ significantly it is extremely unlikely that discrimination analysis with generate any useful
40             discrimination rules. (3) Specify the prior probabilities. This module allows you to do this in several ways - see
41             L.
42              
43             This class automatically generates the discrimination coefficients at part of object construction. You can then either
44             use the C method to access these values or use the C method to apply the equations to a new
45             observation. Both of these methods are context dependent - see L. See
46             http://en.wikipedia.org/wiki/Linear_discriminant_analysis for further details.
47              
48             =cut
49              
50             =head1 SYNOPSIS
51              
52             # we have two groups of data each with 3 variables and 10 observations - example data from http://www.stat.psu.edu/online/courses/stat505/data/insect.txt
53             my $data_X = [
54             [qw/ 191 131 53/],
55             [qw/ 185 134 50/],
56             [qw/ 200 137 52/],
57             [qw/ 173 127 50/],
58             [qw/ 171 128 49/],
59             [qw/ 160 118 47/],
60             [qw/ 188 134 54/],
61             [qw/ 186 129 51/],
62             [qw/ 174 131 52/],
63             [qw/ 163 115 47/],
64             ];
65              
66             my $data_Y = [
67             [qw/ 186 107 49/],
68             [qw/ 211 122 49/],
69             [qw/ 201 144 47/],
70             [qw/ 242 131 54/],
71             [qw/ 184 108 43/],
72             [qw/ 211 118 51/],
73             [qw/ 217 122 49/],
74             [qw/ 223 127 51/],
75             [qw/ 208 125 50/],
76             [qw/ 199 124 46/],
77             ];
78            
79             use Statistics::MVA::BayesianDiscrimination;
80              
81             # Pass the data as a list of the two LISTS-of-LISTS above (termed X and Y). The module by default assumes equal prior probabilities.
82             #my $bld = Statistics::MVA::BayesianDiscrimination->new($data_X,$data_Y);
83              
84             # Pass the data but telling the module to calculate the prior probabilities as the ratio of observations for the two groups (e.g. P(X) X_obs_num / Total_obs.
85             #my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => 1 },$data_X,$data_Y);
86              
87             # Pass the data but directly specifying the values of prior probability for X and Y to use as an anonymous array.
88             #my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => [ 0.25, 0.75 ] },$ins_a,$ins_b);
89              
90             # Print values for coefficients to STDOUT.
91             $bld->output;
92              
93             # Pass the values as an ARRAY reference by calling in LIST context - see L.
94             my ($prior_x, $constant_x, $matrix_x, $prior_y, $constant_y, $matrix_y) = $bld->output;
95            
96             # Perform discriminantion analyis for a specific observation and print result to STDOUT.
97             $bld->discriminate([qw/184 114 59/]);
98              
99             # Call in LIST context to obtain results directly - see L.
100             my ($val_x, $p_x, $post_p_x, $val_y, $p_y, $post_p_y, $type) = $bld->discriminate([qw/194 124 49/]);
101              
102             =cut
103              
104             =head1 METHODS
105             =cut
106              
107             =head2 new
108              
109             Creates a new Statistics::MVA::BayesianDiscrimination. This accepts two references for List-of-Lists of values
110             corresponding to the two groups of data - termed X and Y. Within each List-of-Lists each nested array corresponds to a single set of observations.
111             It also accepts an optional HASH reference of options preceding these values. The constructor automatically generates
112             the discrimination coefficients that are accessed using the C method.
113              
114             # Pass data as ARRAY references.
115             my $bld = Statistics::MVA::BayesianDiscrimination->new($data_X,$data_Y);
116              
117             # Passing optional HASH reference of options.
118             my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => 1 },$data_X,$data_Y);
119            
120             =cut
121              
122             =head2 output
123              
124             Context-dependent method for accessing results of discrimination analysis. In void context it prints the coefficients to STDOUT.
125              
126             $bld->output;
127              
128             In LIST-context it returns a list of the relevant data accessed as follows:
129              
130             my ($prior_x, $constant_x, $matrix_x, $prior_y, $constant_y, $matrix_y) = $bld->output;
131              
132             print qq{\nPrior probability of X = $prior_x and Y = $prior_y.};
133             print qq{\nConstants for discrimination function for X = $constant_x and Y = $constant_y.};
134             print qq{\nCoefficients for discrimination function X = @{$matrix_x}.};
135             print qq{\nCoefficients for discrimination function Y = @{$matrix_y}.};
136              
137             =cut
138              
139             =head2 discriminate
140              
141             Method for classification of a new observation. Pass it an ARRAY reference of SCALAR values appropriate for the original
142             data-sets passed to the constructor. In void context it prints a report to STDOUT:
143              
144             $bld->discriminate([qw/123 34 325/];
145              
146             In LIST-context it returns a list of the relevant data as follows:
147              
148             my ($val_x, $p_x, $post_p_x, $val_y, $p_y, $post_p_y, $type) = $bld->discriminate([qw/123 34 325/];
149              
150             print qq{\nLinear score function for X = $val_x and Y = $val_y - the new observation is of type \x27$type\x27.};
151             print qq{\nThe prior probability that the new observation is of type X = $p_x and the posterior probability = $post_p_x};
152             print qq{\nThe prior probability that the new observation is of type X = $p_y and the posterior probability = $post_p_y};
153              
154             =cut
155              
156             =head1 OPTIONS
157             =cut
158              
159             =head2 priors
160              
161             Pass within an anonymous HASH preceding the two data references during object construction:
162              
163             my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => option_value },$data_X,$data_Y);
164              
165             Passing '0' causes the module to assume equal prior probabilities for the two groups (prior_x = prior_y = 0.5). Passing
166             '1' causes the module to generate priors depending on the ratios of the two data-sets e.g. X has 15 observations and Y
167             has 27 observations gives prior_x = 15 / (15 + 27). Alternatively you may specify the values to use by passing an anonymous ARRAY reference of length 2
168             where the first value is prior_x and the second is prior_y. There are currently no checks on priors directly passed so
169             ensure that prior_x + prior_y = 1 if you supply you own.
170              
171             # Use prior_x = prior_y = 0.5.
172             my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => 0 },$data_X,$data_Y);
173              
174             # Generate priors depending on rations of observation numbers.
175             my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => 1 },$data_X,$data_Y);
176              
177             # Specify your own priors.
178             my $bld = Statistics::MVA::BayesianDiscrimination->new({priors => [$prior_x, $prior_y] },$data_X,$data_Y);
179              
180             =cut
181              
182             #=fe
183              
184 1     1   6 use version; our $VERSION = qv('0.0.2');
  1         3  
  1         9  
185              
186             #/ with these new modules object creation directly checks data and performs analysis. storing just the essential results. can access directly or print them
187              
188             sub new {
189 0     0 1   my $class = shift;
190             #y grab an options hash if there is one
191 0 0         my $options_ref = shift if ( ref $_[0] eq q{HASH} );
192             #y the rest is data
193 0           my @data = @_;
194 0           my $p_x;
195             my $p_y;
196              
197 0           my $self = [];
198              
199 0           my $k = scalar @data;
200 0 0         croak qq{\nThis only accepts two groups of data} if $k != 2;
201             #y if there´s an options hash with priors check it - otherwise default...
202 0 0         if ( defined $options_ref ) { ($p_x, $p_y) = &_priors($options_ref, @data) }
  0            
203             else {
204 0           print qq{\nDefaulting to priors of 0.5 for both x and y.};
205 0           $p_x = 0.5;
206 0           $p_y = 0.5;
207             }
208             #y check they have good matrix format - need to check compatibility too
209 0 0         croak qq{\nbad data} if &Statistics::MVA::CVMat::_check($data[0]);
210 0 0         croak qq{\nbad data} if &Statistics::MVA::CVMat::_check($data[1]);
211             #y data is passed in table format - i.e. nested arrays (rows) correspond to observations and not variables
212 0           my $lol1 = &Statistics::MVA::CVMat::transpose($data[0]);
213 0           my $lol2 = &Statistics::MVA::CVMat::transpose($data[1]);
214             #y get variable number
215 0           my $p = scalar @{$lol1};
  0            
216 0 0         croak qq{\nThese data sets are not compatible} if ($p != scalar @{$lol2});
  0            
217             #y combine the data for overall means calculations - if using single data with names could use adjust in MVA
218 0           my $full = [];
219 0           $full = &_combine($lol1,$lol2, $p);
220             #y copy vars and we will do in place mean calculation - no need to deep copy references here
221             #my $first = &_deep_copy($lol1);
222 0           my $first = [@{$lol1}];
  0            
223 0           my $second = [@{$lol2}];
  0            
224             #y in place calculation of the means
225 0           &_means($full,$p);
226 0           &_means($first,$p);
227 0           &_means($second,$p);
228             #y in place subtraction of means from raw data
229 0           &_subtract($lol1, $full);
230 0           &_subtract($lol2, $full);
231             #y for MVA cv_calculations the data needs to be in table format - this is wastefull as it just transpose - thus is unecessary transposes - IN PLACE - should modify this?!?
232 0           $lol1 = &Statistics::MVA::CVMat::transpose($lol1);
233 0           $lol2 = &Statistics::MVA::CVMat::transpose($lol2);
234 0           my $mva;
235             #y if we have a twat who likes this annoying method
236 0 0 0       if (defined $options_ref && defined $options_ref->{cv} && $options_ref->{cv} == 0) { $mva = &_twats($lol1, $lol2); }
  0   0        
237             else {
238             #y create MVA object - these are defaults parameters so don´t need to pass them...
239 0           $mva = Statistics::MVA->new([ $lol1, $lol2 ], {standardise => 0, divisor => 1});
240             }
241             #r CV matrix data - loose last [0] for realmatrix object
242             #print Dumper $mva->[0][0][0][0];
243             #print Dumper $mva->[0][1][0][0];
244             #y create empty matrix of same dimensions
245 0           my $c = $mva->[0][0][0]->shadow;
246              
247             #/ its 2x not divided by priors?!? - only constant takes into account priors
248             #$mva->[0][0][0]->multiply_scalar($mva->[0][0][0],$p_x);
249             #$mva->[0][1][0]->multiply_scalar($mva->[0][1][0],$p_y);
250 0           $mva->[0][0][0]->multiply_scalar($mva->[0][0][0],0.5);
251 0           $mva->[0][1][0]->multiply_scalar($mva->[0][1][0],0.5);
252              
253 0           $c->add($mva->[0][0][0],$mva->[0][1][0]);
254              
255             #y no need for transpose just build from rows...
256 0           my $averages_a = Math::MatrixReal->new_from_rows([$first]);
257 0           my $averages_b = Math::MatrixReal->new_from_rows([$second]);
258             #y calculate b_0 - constant discrimination coefficients
259 0           my $constant_x = 0.5 * $averages_a * $c->inverse * ~$averages_a;
260 0           my $constant_y = 0.5 * $averages_b * $c->inverse * ~$averages_b;
261              
262             #/ you must add the priors to the constant - only the constant takes into account the priors.
263             #$constant_x->[0][0][0] = $constant_x->[0][0][0] - log($p_x);
264             #$constant_y->[0][0][0] = $constant_y->[0][0][0] - log($p_y);
265             #y now should be -0.5...
266 0           $constant_x->[0][0][0] = -$constant_x->[0][0][0] + log($p_x);
267 0           $constant_y->[0][0][0] = -$constant_y->[0][0][0] + log($p_y);
268              
269             #y calculate the rest of the discrimination coeficients
270             #~$averages_a * $c->inverse * $averages_a;
271 0           my $matrix_x = $averages_a * $c->inverse;
272 0           my $matrix_y = $averages_b * $c->inverse;
273              
274             #y feed the objectvv- this is only done for persistance - otherwise we loose the data...
275 0           $self->[0][0][0] = $constant_x;
276 0           $self->[0][0][1] = $matrix_x;
277 0           $self->[0][1][0] = $constant_y;
278 0           $self->[0][1][1] = $matrix_y;
279 0           $self->[1][0] = $p_x;
280 0           $self->[1][1] = $p_y;
281 0           $self->[2] = $p;
282              
283 0           bless $self, $class;
284 0           return $self;
285             }
286              
287             sub _priors {
288 0     0     my ($options_ref, @data) = @_;
289 0           my $p_x;
290             my $p_y;
291 0 0         if ($options_ref->{priors} == 1 ) {
    0          
    0          
292 0           my $n_x = scalar @{$data[0]};
  0            
293 0           my $n_y = scalar @{$data[1]};
  0            
294 0           my $n_total = $n_x + $n_y;
295 0           $p_x = sprintf (q{%.5f}, $n_x / $n_total);
296 0           $p_y = sprintf (q{%.5f}, $n_y / $n_total);
297             }
298             elsif (ref $options_ref->{priors} eq q{ARRAY} ) {
299 0           my @priors = @{$options_ref->{priors}};
  0            
300 0 0         croak qq{\nIf passing an ARRAY ref of priors it must have length two.} if ( scalar @priors != 2 );
301 0 0 0       croak qq{\nThe priors must sum to 1.} if ( $priors[0] + $priors[1] < 0.95 || $priors[0] + $priors[1] > 1.05 );
302 0           $p_x = $priors[0];
303 0           $p_y = $priors[1];
304             }
305             elsif ($options_ref->{priors} == 0) {
306 0           $p_x = 0.5;
307 0           $p_y = 0.5;
308             }
309 0           else { croak qq{\nI do not recognise that value for the priors option} }
310 0           return ($p_x, $p_y);
311             }
312              
313             sub _subtract {
314 0     0     my ($lol, $full) = @_;
315 0           for my $var (0..$#{$lol}) {
  0            
316 0           for my $cell (0..$#{$lol->[$var]}) {
  0            
317 0           $lol->[$var][$cell] = $lol->[$var][$cell] - $full->[$var];
318             }
319             }
320             }
321              
322             sub _combine{
323 0     0     my ($lol1, $lol2, $p) = @_;
324 0           my $full = [];
325 0           for my $i (0..$p-1) {
326 0           push @{$full->[$i]}, @{$lol1->[$i]}, @{$lol2->[$i]};
  0            
  0            
  0            
327             }
328 0           return $full;
329             }
330              
331             sub _means {
332 0     0     my ($full, $p) = @_;
333 0           for my $i (0..$p-1) {
334 0           my $sum = sum @{$full->[$i]};
  0            
335 0           my $n = scalar @{$full->[$i]};
  0            
336 0           $full->[$i] = $sum/$n;
337             }
338             }
339              
340             sub _twats {
341 0     0     my ($lol1, $lol2) = @_;
342 0           my $lol1_mat = Math::MatrixReal->new_from_rows($lol1);
343 0           my $n_1 = scalar @{$lol1};
  0            
344 0           $lol1_mat = (~$lol1_mat * $lol1_mat) / $n_1;
345 0           my $lol2_mat = Math::MatrixReal->new_from_rows($lol2);
346 0           my $n_2 = scalar @{$lol2};
  0            
347 0           $lol2_mat = (~$lol2_mat * $lol2_mat) / $n_2;
348 0           my $mva = [];
349 0           $mva->[0][0][0] = $lol1_mat;
350 0           $mva->[0][1][0] = $lol2_mat;
351 0           print $mva->[0][0][0];
352 0           print $mva->[0][1][0];
353 0           return $mva;
354             }
355              
356             sub output {
357 0     0 1   my $self = shift;
358 0           my $constant_x = $self->[0][0][0];
359 0           my $constant_y = $self->[0][1][0];
360 0           my $matrix_x = $self->[0][0][1];
361 0           my $matrix_y = $self->[0][1][1];
362 0           my $p_x = $self->[1][0];
363 0           my $p_y = $self->[1][1];
364 0           my $p_var_num = $self->[2];
365 0 0         if (!wantarray) {
366 0           print qq{\nTwo-groups with $p_var_num variables each and prior probabilities of p(x) = $p_x and p(y) = $p_y.\n\n};
367 0           my @config = ( [17, q{}], [15, q{X}], [15, q{Y}] );
368 0           my $tbl = Text::SimpleTable->new(@config);
369 0           $tbl->row( qq{Constant B[0]}, sprintf(q{%.5f}, $constant_x->[0][0][0]), sprintf(q{%.5f}, $constant_y->[0][0][0]) );
370 0           $tbl->hr;
371 0           for my $row (0..$#{$matrix_x->[0][0]}) {
  0            
372 0           my $coeff = $row+1;
373 0           $tbl->row( qq{Coefficient B[$coeff]}, sprintf(q{%.5f}, $matrix_x->[0][0][$row]), sprintf(q{%.5f}, $matrix_y->[0][0][$row]) );
374 0 0         $tbl->hr if $row != $#{$matrix_x->[0][0]};
  0            
375             }
376 0           print $tbl->draw;
377 0           return;
378             }
379 0           else { return ( $p_x, $constant_x->[0][0][0], $matrix_x->[0][0], $p_y, $constant_y->[0][0][0], $matrix_y->[0][0] ); }
380             }
381              
382             sub discriminate {
383 0     0 1   my ($self, $ex) = @_;
384 0           my $context = wantarray;
385 0           my $constant_x = $self->[0][0][0];
386 0           my $constant_y = $self->[0][1][0];
387 0           my $matrix_x = $self->[0][0][1];
388 0           my $matrix_y = $self->[0][1][1];
389 0           my $p_x = $self->[1][0];
390 0           my $p_y = $self->[1][1];
391 0           my $p_var_num = $self->[2];
392            
393 0 0         croak qq{\nData must be passed as an ARRAY ref} if (ref $ex ne q{ARRAY});
394 0 0         croak qq{\nThis example has the wrong variable number} if scalar @{$ex} != $p_var_num;
  0            
395              
396 0           my $example = Math::MatrixReal->new_from_cols([$ex]);
397 0           my $val_x = ( $matrix_x * $example ) + $constant_x;#; + log($p_x);
398              
399             #r when adding log(p) its the linear score function and not simply the linear discrimination result
400             #$val_x = $val_x->[0][0][0];#+log($p_x);
401 0           $val_x = $val_x->[0][0][0];
402 0           my $val_y = ( $matrix_y * $example ) + $constant_y;#; + log($p_x);
403             #$val_y = $val_y->[0][0][0];#+log($p_y);
404 0           $val_y = $val_y->[0][0][0];
405              
406 0 0         my $type = $val_x > $val_y ? q{X}
    0          
407             : $val_y > $val_x ? q{Y}
408             : undef;
409              
410 0           my $post_p_x = exp($val_x) / (exp($val_x) + exp($val_y));
411 0           my $post_p_y = exp($val_y) / (exp($val_x) + exp($val_y));
412              
413 0 0         if (!$context) {
414 0           print qq{\nLinear score function for x = $val_x\nLinear score function for y = $val_y\n};
415 0 0         if ( defined $type ) { print qq{\nExample (@{$ex}) is a \x27$type\x27.} }
  0            
  0            
416 0           else { print qq{\nCannot distinguish which group the example belongs to.} }
417 0           print qq{\n\nPrior probability of being an x = $p_x\nPosterior probability of being an x = $post_p_x\n};
418 0           print qq{\nPrior probability of being a y = $p_y\nPosterior probability of being a y = $post_p_y\n};
419 0           return;
420             }
421             else {
422 0           return ($val_x, $p_x, $post_p_x, $val_y, $p_y, $post_p_y, $type);
423             }
424             }
425              
426             1; # Magic true value required at end of module
427              
428             __END__