File Coverage

blib/lib/Statistics/Distributions/GTest.pm
Criterion Covered Total %
statement 27 274 9.8
branch 0 58 0.0
condition 0 30 0.0
subroutine 9 40 22.5
pod 0 8 0.0
total 36 410 8.7


line stmt bran cond sub pod time code
1             package Statistics::Distributions::GTest;
2              
3 1     1   29201 use warnings;
  1         2  
  1         33  
4 1     1   5 use strict;
  1         2  
  1         32  
5 1     1   4 use Carp;
  1         6  
  1         95  
6 1     1   4 use List::Util qw( sum max );
  1         1  
  1         125  
7 1     1   742 use Math::Cephes qw(:explog);
  1         7004  
  1         299  
8 1     1   837 use Statistics::Distributions qw( chisqrdistr chisqrprob );
  1         3303  
  1         99  
9 1     1   851 use Text::SimpleTable;
  1         3342  
  1         28  
10 1     1   1382 use Contextual::Return;
  1         34564  
  1         8  
11             =head1 NAME
12              
13             Statistics::Distributions::GTest - Perl implementation of the Log-Likelihood Ratio Test (G-test) of Independence.
14              
15             =cut
16             =head1 VERSION
17              
18             This document describes Statistics::Distributions::GTest version 0.1.5.
19              
20             =cut
21 1     1   2865 use version; our $VERSION = qv('0.1.5'); # next release 0.1.1...
  1         1904  
  1         4  
22             =head1 SYNOPSIS
23              
24             use Statistics::Distributions::GTest;
25              
26             # Create an GTest object.
27             my $gtest = Statistics::Distributions::GTest->new();
28              
29             # A 3x3 example. Data is sent to object a reference to a LoL.
30             my $a_ref = [
31             [ 458, 537 ,345],
32             [ 385, 457 ,456],
33             [ 332, 376 ,364 ],
34             ];
35              
36             # Feed the object the data by passing reference with named argument 'table'.
37             $gtest->read_data ( { table => $a_ref } );
38              
39             # Perform the analysis using one of the two methods - see DESCRIPTION.
40             $gtest->G();
41             #$gtest->G_alt();
42              
43             # Print a table of the calculated expected values.
44             $gtest->print_expected();
45              
46             # To access results use results method. The return of this method is context dependent (see METHODS).
47             # To print a report to STDOUT call results in VOID context - may also call in BOOLEAN, NUMERIC and LIST (see METHODS).
48             $gtest->results();
49              
50             =cut
51             =head1 DESCRIPTION
52              
53             The G-test of independence is an alternative to the chi-square test of independence for testing for independence in
54             contingency tables. G-tests are coming into increasing use and as with the chi-square test for independence the G-test
55             for independence is used when you have two nominal variables each with two or more possible values. The null hypothesis
56             is that the relative proportions of one variable are independent of the second variable. This module implements two two
57             equivalent, but marginally different approaches to calculate G scores (that described in
58             http://en.wikipedia.org/wiki/G-test and that used by http://udel.edu/~mcdonald/statgtestind.html). Benchmarking
59             indicates that first approach works about a third faster than the alternative. However, this difference
60             diminishes as the categories increase. See http://en.wikipedia.org/wiki/G-test and
61             http://udel.edu/~mcdonald/statgtestind.html.
62              
63             =cut
64             =head1 METHODS
65              
66             =cut
67              
68             #######################################################################################################################
69              
70             sub new {
71              
72 0     0 0   my ($class, $all_h_ref ) = @_;
73 0 0 0       croak qq{\nArguments must be passed as HASH reference.}
74             if ( ( $all_h_ref ) && ( ref $all_h_ref ne q{HASH} ) );
75 0           my $self = {};
76 0           bless $self, $class;
77 0           return $self;
78             }
79             =head2 new
80              
81             Create a new Statistics::Distributions::GTest object.
82              
83             my $gtest = Statistics::Distributions::GTest->new();
84              
85             =cut
86              
87             sub read_data {
88 0     0 0   my ( $self, $all_h_ref ) = @_;
89              
90 0 0         $all_h_ref or croak qq{\nYou must pass me some data};
91 0 0 0       croak qq{\nThe data must be passed within HASH reference pointed to by key \'table\'.}
92             if ( ( ref $all_h_ref ne q{HASH} ) || ( !exists $all_h_ref->{table} ) );
93            
94             #y unpack data
95 0           my $data_a_ref = $all_h_ref->{table};
96            
97 0 0         $data_a_ref or croak qq{\nkey \'table\' points to nothing};
98 0 0         croak qq{\nThe data pointed to by key \'table\' must be passed as ARRAY reference.} if ( ref $data_a_ref ne q{ARRAY} );
99              
100 0           $self->_data_checks($data_a_ref);
101              
102             #r/ WE MUST DEEP COPY THIS STRUCTURE
103 0           my $deep_copied_ref = _deep_copy_arrays ($data_a_ref);
104              
105             #y convert the numeric values to hashes that store various values
106 0           $self->_inplace_conversion($deep_copied_ref);
107            
108             #y simple flag for data reading
109 0           $self->{properties}{analysis}{obs_table} = 1;
110 0           return;
111             }
112             =head2 read_data
113              
114             Used for loading data into object. Data is fed as a reference to a list of lists within an anonymous hash using the
115             named argument 'table'.
116            
117             $gtest->read_data ( { table => $LoL_ref } );
118              
119             =cut
120              
121             sub _data_checks {
122 0     0     my ($self, $data_a_ref) = @_;
123             #/ get rows
124 0           my $rows = scalar(@{$data_a_ref});
  0            
125 0 0 0       croak qq{\nI need some data - there are too few rows in your data.\n} if ( !$rows || $rows == 1 );
126              
127             #/ get cols - check first and then compare the rest
128 0           my $cols = scalar(@{$data_a_ref->[0]});
  0            
129 0 0 0       croak qq{\nI need some data - there are too few columns in your data.\n} if ( !$cols || $cols == 1 );
130              
131 0           for my $row (@{$data_a_ref}) {
  0            
132              
133 0 0         croak qq{\n\nData set must be passed as ARRAY references.\n} if ( ref $row ne q{ARRAY} );
134 0 0         croak qq{\n\nAll rows must have the same number of columns.\n} if ( scalar( @{$row} ) != $cols );
  0            
135              
136             }
137              
138             # feed the object
139 0           $self->{properties}{rows} = $rows;
140 0           $self->{properties}{cols} = $cols;
141              
142 0           return;
143             }
144              
145             #/ beware: this is a prototype and only used for deep-copying - don´t return at end as its called recursively and you´ll kill the deep copy
146             sub _deep_copy_arrays (\@) {
147 0     0     my $data_structure = shift;
148              
149 0 0         if (!ref $data_structure) { $data_structure }
  0 0          
150             elsif (ref $data_structure eq q{ARRAY} ) {
151 0           [ map { _deep_copy_arrays ($_) } @{$data_structure} ];
  0            
  0            
152             }
153 0           else { croak qq{\nYou must hand in an array ref. } }
154              
155             #/ THIS METHOD IS CALLED RECURSIVELY! DON´T PUT RETURN IN HERE
156             # return;
157             }
158              
159             sub _inplace_conversion {
160             #/ in place converstion to 2-d matrix to cells
161 0     0     my ($self, $a_ref) = @_;
162            
163 0           for my $row (0..$#{$a_ref}) {
  0            
164 0           for my $col (0..$#{$a_ref->[0]}) {
  0            
165              
166             # unpack value
167 0           my $observation = $a_ref->[$row][$col];
168            
169 0           my $cell_h_ref = { observation => $observation,
170             };
171              
172 0           $a_ref->[$row][$col] = $cell_h_ref;
173             }
174             }
175              
176             #y feed the object the deep-copied inplace modified array
177 0           print qq{\n\nData passed all checks. Feeding $self->{properties}{rows} x $self->{properties}{cols} matrix to object.\n};
178 0           $self->{table} = $a_ref;
179              
180 0           return;
181             }
182              
183             sub diag {
184 0     0 0   my $self = shift;
185 0           print qq{\n-------------------------------------------------------\nobject dumper:\n}, Dumper $self;
186 0           print qq{-------------------------------------------------------\n\n};
187              
188 0           return;
189             }
190              
191             sub _sum_a_row {
192 0     0     my ( $self, $row ) = @_; # para de esquecer a colocar @_ quando unpacking!!!
193 0           my $table_a_ref = $self->{table};
194              
195             #y basically the NW way
196 0           my $row_sum = sum map { $table_a_ref->[$row][$_]{observation} } ( 0..($self->{properties}{cols}-1) );
  0            
197             #y my way
198 0           my $row_sum2 = sum map { $_->{observation} } @{$self->{table}[$row]};
  0            
  0            
199            
200             #y basically the NW way
201             # my $col_sum1 = sum map { $table_a_ref->[$_][$col]{observation} } ( 0..($self->{properties}{rows}-1) );
202             #y my way
203             # my $col_sum2 = sum map { $_->[$col]{observation} } @{$self->{table}};
204              
205 0           return $row_sum;
206             }
207              
208             sub _sum_rows {
209              
210 0     0     my $self = shift;
211 0           my $table_a_ref = $self->{table};
212 0           my @row_sums = ();
213              
214 0           for my $row (0..$#{$table_a_ref}) {
  0            
215              
216 0           push @row_sums, $self->_sum_a_row($row);
217              
218             }
219              
220 0           $self->{properties}{row_sums} = [@row_sums];
221              
222 0           return;
223             }
224              
225             sub _sum_a_col {
226 0     0     my ( $self, $col ) = @_; # para de esquecer a colocar @_ quando unpacking!!!
227 0           my $table_a_ref = $self->{table};
228              
229             #y basically the NW way
230 0           my $col_sum = sum map { $table_a_ref->[$_][$col]{observation} } ( 0..($self->{properties}{rows}-1) );
  0            
231             #y my way
232 0           my $col_sum2 = sum map { $_->[$col]{observation} } @{$self->{table}};
  0            
  0            
233              
234 0           return $col_sum;
235              
236             }
237              
238             sub _sum_cols {
239 0     0     my $self = shift;
240 0           my $table_a_ref = $self->{table};
241 0           my @col_sums = ();
242            
243             #/ still need to fix this!!! use $self-{properties}{col}-1?!? - that way its all the same figure
244 0           for my $col (0..$#{$table_a_ref->[0]}) {
  0            
245 0           push @col_sums, $self->_sum_a_col($col);
246             }
247 0           $self->{properties}{col_sums} = [@col_sums];
248              
249 0           return;
250             }
251              
252             sub _total {
253 0     0     my $self = shift;
254 0           my $total_from_rows = sum @{$self->{properties}{row_sums}};
  0            
255 0           my $total_from_cols = sum @{$self->{properties}{col_sums}};
  0            
256 0           $self->{properties}{total} = $total_from_rows;
257              
258 0           return;
259             }
260              
261             sub _calculate_expected {
262 0     0     my $self = shift;
263 0           my $a_ref = $self->{table};
264              
265 0           my @row_sums = @{$self->{properties}{row_sums}};
  0            
266 0           my @col_sums = @{$self->{properties}{col_sums}};
  0            
267 0           my $total = $self->{properties}{total};
268              
269 0           for my $row ( 0..($self->{properties}{rows}-1) ) {
270              
271 0           for my $col (0..($self->{properties}{cols}-1) ) {
272            
273 0           my $expected = ( $row_sums[$row] * $col_sums[$col] ) / $total;
274              
275 0           $a_ref->[$row][$col]{expected} = $expected;
276             }
277             }
278              
279 0           $self->{properties}{analysis}{expect_table} = 1;
280 0           return;
281             }
282              
283             sub _calculate_f_ln_f {
284 0     0     my $self = shift;
285 0           my $a_ref = $self->{table};
286 0           for my $row ( 0..($self->{properties}{rows}-1) ) {
287 0           for my $col (0..($self->{properties}{cols}-1) ) {
288            
289 0           my $observed = $a_ref->[$row][$col]{observation};
290 0           my $f_ln_f = _f_ln_f ($observed);
291 0           $a_ref->[$row][$col]{f_ln_f} = $f_ln_f;
292             }
293             }
294              
295 0           return;
296             }
297              
298             #/ beware: this is a prototype and not for use as an object method
299             sub _f_ln_f($) {
300 0     0     my $value = shift;
301 0 0         $value = $value > 0.5 ? $value * ( log ( $value ) ) : 0 ;
302 0           return $value;
303             }
304              
305             sub _calculate_G_traditional {
306 0     0     my $self = shift;
307 0           my $table_a_ref = $self->{table};
308              
309 0           my $sum = 0;
310              
311 0           for my $row ( 0..($self->{properties}{rows}-1) ) {
312 0           for my $col (0..($self->{properties}{cols}-1) ) {
313              
314 0           my $cell = $table_a_ref->[$row][$col]{observation} *
315             ( log ( $table_a_ref->[$row][$col]{observation} / $table_a_ref->[$row][$col]{expected} ) );
316              
317 0           $sum += $cell;
318             }
319             }
320 0           my $G = 2 * $sum;
321              
322 0           $self->{properties}{G} = $G;
323              
324 0           return;
325             }
326              
327             sub _calculate_G_alternative {
328 0     0     my $self = shift;
329 0           my $table_a_ref = $self->{table};
330 0           my $sum = 0;
331              
332             #/ don´t need this part###################################################
333 0           my @f_ln_f_for_row_sums = map { _f_ln_f($_) } @{$self->{properties}{row_sums}};
  0            
  0            
334 0           my @f_ln_f_for_col_sums = map { _f_ln_f($_) } @{$self->{properties}{col_sums}};
  0            
  0            
335 0           $self->{properties}{f_ln_f_for_row_sums} = [@f_ln_f_for_row_sums];
336 0           $self->{properties}{f_ln_f_for_col_sums} = [@f_ln_f_for_col_sums];
337             ##########################################################################
338              
339 0           my $sum_of_f_ln_f_for_row_sums = sum map { _f_ln_f($_) } @{$self->{properties}{row_sums}};
  0            
  0            
340 0           my $sum_of_f_ln_f_for_col_sums = sum map { _f_ln_f($_) } @{$self->{properties}{col_sums}};
  0            
  0            
341              
342 0           my $f_ln_f_for_total = _f_ln_f ($self->{properties}{total});
343              
344 0           $self->{properties}{f_ln_f_for_total} = [$f_ln_f_for_total];
345            
346 0           my $sum_f_ln_f = 0;
347              
348 0           for my $row ( 0..($self->{properties}{rows}-1) ) {
349            
350             #/ either
351             #for my $col (0..($self->{properties}{cols}-1) ) {
352             #$row = $table_a_ref->[$row][$col]{observation};
353             #/ or
354 0           $row = sum map { $_->{f_ln_f} } @{$self->{table}[$row]};
  0            
  0            
355              
356 0           $sum_f_ln_f += $row;
357             }
358              
359 0           my $G = 2 * ( ( $sum_f_ln_f + $f_ln_f_for_total ) -
360             ( $sum_of_f_ln_f_for_row_sums + $sum_of_f_ln_f_for_col_sums ) );
361              
362 0           $self->{properties}{G} = $G;
363            
364 0           return;
365             }
366              
367             sub G {
368 0     0 0   my $self = shift;
369             #y check data loaded flag
370 0 0         croak qq{\nYou have to load some data before calling this method} if ( !exists $self->{properties}{analysis}{obs_table} );
371 0           $self->_sum_rows();
372 0           $self->_sum_cols();
373 0           $self->_total();
374 0           $self->_calculate_expected();
375 0           $self->_calculate_G_traditional();
376            
377 0           $self->_df();
378 0           $self->_calculate_p_value();
379            
380             #y flag to check this has run
381 0           $self->{properties}{analysis}{G} = 1;
382 0           return;
383              
384             }
385             =head2 G
386              
387             To calculate G value. This method implements the calculation described in http://en.wikipedia.org/wiki/G-test.
388            
389             $gtest->G();
390              
391             =cut
392              
393             #/ make this redundant and using self->{prop}{G} for both G calculation methods - create a method to compare the two computed values.
394             sub G_alt {
395 0     0 0   my $self = shift;
396             #y check data loaded flag
397 0 0         croak qq{\nYou have to load some data before calling this method} if ( !exists $self->{properties}{analysis}{obs_table} );
398 0           $self->_sum_rows();
399 0           $self->_sum_cols();
400 0           $self->_total();
401 0           $self->_calculate_f_ln_f();
402 0           $self->_calculate_G_alternative();
403              
404 0           $self->_df();
405 0           $self->_calculate_p_value();
406            
407             #y flag to check this has run
408 0           $self->{properties}{analysis}{G_alt} = 1;
409 0           return;
410              
411             }
412             =head2 G_alt
413              
414             To calculate G you may also use this method. This method implements procedure described in
415             http://udel.edu/~mcdonald/statgtestind.html. This approach does not directly generate a table of expected values.
416            
417             $gtest->G_alt();
418              
419             =cut
420              
421             sub _df {
422 0     0     my $self = shift;
423 0           my $df = ( $self->{properties}{rows}-1 ) * ( $self->{properties}{cols}-1 );
424 0           $self->{properties}{df} = $df;
425              
426 0           return;
427             }
428              
429             sub _calculate_p_value {
430 0     0     my $self = shift;
431 0           my $chisprob = chisqrprob ($self->{properties}{df}, $self->{properties}{G});
432 0           $self->{properties}{p_value} = $chisprob;
433              
434 0           return;
435             }
436              
437             sub _configure_table {
438 0     0     my ( $self, $which ) = @_;
439 0           my $max_len = 0;
440 0           for my $row (@{$self->{table}}) {
  0            
441 0           my $temp = max map { length( sprintf ( q{%.0f}, $_->{$which}) ) } @{$row} ;
  0            
  0            
442 0 0         $max_len = $temp > $max_len ? $temp : $max_len;
443             }
444 0           return ($max_len) x $self->{properties}{cols};
445             }
446              
447             sub _print_table {
448 0     0     my ( $self, $which ) = @_;
449              
450 0 0 0       ( $which eq q{observation} || $which eq q{expected} ) ||
451             croak qq{\nUsage is \$object->print_observed or \$object->print_expected};
452              
453             #croak qq{\nUsage is \$object->print_observed or \$object->print_expected}
454             #unless ( ( $which eq q{observed} ) || ( $which eq q{expected} ) );
455             #if !( ( $which eq q{observed} ) || ( $which eq q{expected} ) );
456              
457 0           my $table = Text::SimpleTable->new($self->_configure_table($which));
458 0           my $count = 0;
459 0           for my $row (@{$self->{table}}) {
  0            
460 0 0         $table->hr if ( $count != 0 );
461 0           $table->row( ( map { sprintf ( q{%.0f}, $_->{$which} ) } @{$row} ) );
  0            
  0            
462 0           $count++;
463             }
464 0           print qq{\nTable of $which values:\n}, $table->draw;
465              
466 0           return;
467             }
468              
469             sub print_expected {
470             # convinience method - stops calling of private _print_table
471 0     0 0   my $self = shift;
472              
473             #y check data loaded flag
474 0 0         croak qq{\nYou have to load some data before calling this method} if ( !exists $self->{properties}{analysis}{obs_table} );
475            
476             #y check whether _calculated_expected was called
477 0 0 0       if ( ( exists $self->{properties}{analysis}{G_alt} ) && ( !exists $self->{properties}{analysis}{expect_table} ) ) {
478 0           print qq{\nIt appears you ran the alternative procedure that does not directly generate an expected values table. }
479             .qq{\nGenerating expected values table now.\n};
480 0           $self->_calculate_expected();
481             }
482              
483             #y check the analysis flags
484             #y !$x and !$y and croak
485 0 0 0       croak qq{\nYou must run the analysis with either method G or G_alt first}
486             if ( ( !exists $self->{properties}{analysis}{G} ) && ( !exists $self->{properties}{analysis}{G_alt} ) );
487              
488 0           $self->_print_table(q{expected});
489 0           return;
490             }
491             =head2 print_expected
492              
493             Prints a table of the calculated expected values to STDOUT. If you used G_alt to calculate G it will first generated the
494             table of excpeted values.
495            
496             $gtest->print_expected();
497              
498             =cut
499              
500             sub print_observed {
501             # convinience method - stops calling of private _print_table
502 0     0 0   my $self = shift;
503             #y check data loaded flag
504 0 0         croak qq{\nYou have to load some data before calling this method} if ( !exists $self->{properties}{analysis}{obs_table} );
505 0           $self->_print_table(q{observation});
506 0           return;
507             }
508             =head2 print_observed
509              
510             Prints a table of the observation values to STDOUT.
511            
512             $gtest->print_observed();
513              
514             =cut
515              
516             sub results {
517 0     0 0   my ($self, $thing ) = @_;
518             #y check the analysis flags
519 0 0 0       croak qq{\nYou must run the analysis with either method G or G_alt first}
520             if ( ( !exists $self->{properties}{analysis}{G} ) && ( !exists $self->{properties}{analysis}{G_alt} ) );
521              
522             #y preferentially grab the traditional method result
523            
524             #my $G = ( !exists $self->{properties}{G} ) ? $self->{properties}{G} : $self->{properties}{G_alt};
525             #/ twat!
526             #my $G = exists $self->{properties}{G};
527 0           my $G = $self->{properties}{G};
528            
529 0           my $df = $self->{properties}{df};
530 0           my $p_val = $self->{properties}{p_value};
531              
532 0 0         if ( $p_val < 1e-05 ) { $p_val = sprintf (q{%e}, $p_val) }
  0            
533              
534 0           $G = sprintf ( q{%.5f}, $G );
535              
536             return (
537 0     0     VOID { $self->_void ($G, $df, $p_val) }
538 0     0     LIST { ($G, $df, $p_val ) }
539 0     0     BOOL { $self->_boolean($G, $df, $thing) }
540 0     0     NUM { $G ; }
541 0           );
542              
543             }
544             =head2 results
545              
546             Used to access the results of the G-test calculation. This method is context-dependent and will return a variety of
547             different values depending on its calling context. In VOID context it simply prints the calculated value of G, df and
548             the p_value in a table to STDOUT.
549            
550             $gtest->results();
551              
552             In BOOLEAN context it requires you to pass it a value for the significance level of the test you wish to apply e.g.
553             0.05. It returns True or False depending on whether the null hypothesis is rejected at that significance level.
554              
555             # test if the result is significant at the p = 0.05 level.
556             if ($gtest->results( 0.05 )) { print qq{\nthis is significant } } else { print qq{\nthis is not significant} }
557              
558             In LIST context it simply returns a LIST of the calculated values of G, df and p for the observation data.
559              
560             my ($G, $df, $p) = $gtest->results();
561              
562             In NUMERIC context it returns the calculated value of G.
563              
564             print qq{\n\nG in numeric is: }, 0+$gtest->results();
565              
566             =cut
567              
568             sub _void {
569 0     0     my ( $self, $G, $df, $p_val ) = @_;
570              
571 0 0         my $g_len = length ($G) > 7 ? length ($G) : 7;
572 0 0         my $df_len = length ($df) > 3 ? length ($df) : 3;
573 0 0         my $p_len = length ($p_val) > 7 ? length ($p_val) : 7;
574              
575 0           my $table = Text::SimpleTable->new($g_len, $df_len, $p_len);
576 0           $table->row( qw/ G_value df p_value / );
577 0           $table->hr;
578 0           $table->row( $G, $df, $p_val );
579 0           print qq{\nTable of results:\n}, $table->draw;
580              
581 0           return;
582             }
583              
584             sub _boolean {
585 0     0     my ($self, $G, $df, $sig) = @_;
586            
587 0 0         $sig or croak qq{\nYou must pass a p_value in BOOLEAN context};
588            
589 0           print qq{\nsig $sig};
590              
591 0 0 0       croak qq{\nThe p value must be numeric and in the range > 0 and < 1.} if ( $sig !~ /\A \d* \.? \d+ ([eE][+-]?\d+)? \z/xms || $sig <= 0 || $sig >= 1) ;
      0        
592             #/ forgot to check for exponential numbers
593             #if ( $sig !~ /\A[01]?\.\d{1,7}\z/xms || $sig <= 0 || $sig >= 1) ;
594            
595 0 0         $G = $G > chisqrdistr ( $df, $sig ) ? 1 : undef;
596 0           return $G;
597             }
598              
599             1; # Magic true value required at end of module
600              
601             __END__