File Coverage

blib/lib/Statistics/Distributions/GTest.pm

Criterion	Covered	Total	%
statement	27	274	9.8
branch	0	58	0.0
condition	0	30	0.0
subroutine	9	40	22.5
pod	0	8	0.0
total	36	410	8.7

line	stmt	bran	cond	sub	pod	time	code
1							package Statistics::Distributions::GTest;
2
3	1			1		29201	use warnings;
	1					2
	1					33
4	1			1		5	use strict;
	1					2
	1					32
5	1			1		4	use Carp;
	1					6
	1					95
6	1			1		4	use List::Util qw( sum max );
	1					1
	1					125
7	1			1		742	use Math::Cephes qw(:explog);
	1					7004
	1					299
8	1			1		837	use Statistics::Distributions qw( chisqrdistr chisqrprob );
	1					3303
	1					99
9	1			1		851	use Text::SimpleTable;
	1					3342
	1					28
10	1			1		1382	use Contextual::Return;
	1					34564
	1					8
11							=head1 NAME
12
13							Statistics::Distributions::GTest - Perl implementation of the Log-Likelihood Ratio Test (G-test) of Independence.
14
15							=cut
16							=head1 VERSION
17
18							This document describes Statistics::Distributions::GTest version 0.1.5.
19
20							=cut
21	1			1		2865	use version; our $VERSION = qv('0.1.5'); # next release 0.1.1...
	1					1904
	1					4
22							=head1 SYNOPSIS
23
24							use Statistics::Distributions::GTest;
25
26							# Create an GTest object.
27							my $gtest = Statistics::Distributions::GTest->new();
28
29							# A 3x3 example. Data is sent to object a reference to a LoL.
30							my $a_ref = [
31							[ 458, 537 ,345],
32							[ 385, 457 ,456],
33							[ 332, 376 ,364 ],
34							];
35
36							# Feed the object the data by passing reference with named argument 'table'.
37							$gtest->read_data ( { table => $a_ref } );
38
39							# Perform the analysis using one of the two methods - see DESCRIPTION.
40							$gtest->G();
41							#$gtest->G_alt();
42
43							# Print a table of the calculated expected values.
44							$gtest->print_expected();
45
46							# To access results use results method. The return of this method is context dependent (see METHODS).
47							# To print a report to STDOUT call results in VOID context - may also call in BOOLEAN, NUMERIC and LIST (see METHODS).
48							$gtest->results();
49
50							=cut
51							=head1 DESCRIPTION
52
53							The G-test of independence is an alternative to the chi-square test of independence for testing for independence in
54							contingency tables. G-tests are coming into increasing use and as with the chi-square test for independence the G-test
55							for independence is used when you have two nominal variables each with two or more possible values. The null hypothesis
56							is that the relative proportions of one variable are independent of the second variable. This module implements two two
57							equivalent, but marginally different approaches to calculate G scores (that described in
58							http://en.wikipedia.org/wiki/G-test and that used by http://udel.edu/~mcdonald/statgtestind.html). Benchmarking
59							indicates that first approach works about a third faster than the alternative. However, this difference
60							diminishes as the categories increase. See http://en.wikipedia.org/wiki/G-test and
61							http://udel.edu/~mcdonald/statgtestind.html.
62
63							=cut
64							=head1 METHODS
65
66							=cut
67
68							#######################################################################################################################
69
70							sub new {
71
72	0			0	0		my ($class, $all_h_ref ) = @_;
73	0	0	0				croak qq{\nArguments must be passed as HASH reference.}
74							if ( ( $all_h_ref ) && ( ref $all_h_ref ne q{HASH} ) );
75	0						my $self = {};
76	0						bless $self, $class;
77	0						return $self;
78							}
79							=head2 new
80
81							Create a new Statistics::Distributions::GTest object.
82
83							my $gtest = Statistics::Distributions::GTest->new();
84
85							=cut
86
87							sub read_data {
88	0			0	0		my ( $self, $all_h_ref ) = @_;
89
90	0	0					$all_h_ref or croak qq{\nYou must pass me some data};
91	0	0	0				croak qq{\nThe data must be passed within HASH reference pointed to by key \'table\'.}
92							if ( ( ref $all_h_ref ne q{HASH} ) \|\| ( !exists $all_h_ref->{table} ) );
93
94							#y unpack data
95	0						my $data_a_ref = $all_h_ref->{table};
96
97	0	0					$data_a_ref or croak qq{\nkey \'table\' points to nothing};
98	0	0					croak qq{\nThe data pointed to by key \'table\' must be passed as ARRAY reference.} if ( ref $data_a_ref ne q{ARRAY} );
99
100	0						$self->_data_checks($data_a_ref);
101
102							#r/ WE MUST DEEP COPY THIS STRUCTURE
103	0						my $deep_copied_ref = _deep_copy_arrays ($data_a_ref);
104
105							#y convert the numeric values to hashes that store various values
106	0						$self->_inplace_conversion($deep_copied_ref);
107
108							#y simple flag for data reading
109	0						$self->{properties}{analysis}{obs_table} = 1;
110	0						return;
111							}
112							=head2 read_data
113
114							Used for loading data into object. Data is fed as a reference to a list of lists within an anonymous hash using the
115							named argument 'table'.
116
117							$gtest->read_data ( { table => $LoL_ref } );
118
119							=cut
120
121							sub _data_checks {
122	0			0			my ($self, $data_a_ref) = @_;
123							#/ get rows
124	0						my $rows = scalar(@{$data_a_ref});
	0
125	0	0	0				croak qq{\nI need some data - there are too few rows in your data.\n} if ( !$rows \|\| $rows == 1 );
126
127							#/ get cols - check first and then compare the rest
128	0						my $cols = scalar(@{$data_a_ref->[0]});
	0
129	0	0	0				croak qq{\nI need some data - there are too few columns in your data.\n} if ( !$cols \|\| $cols == 1 );
130
131	0						for my $row (@{$data_a_ref}) {
	0
132
133	0	0					croak qq{\n\nData set must be passed as ARRAY references.\n} if ( ref $row ne q{ARRAY} );
134	0	0					croak qq{\n\nAll rows must have the same number of columns.\n} if ( scalar( @{$row} ) != $cols );
	0
135
136							}
137
138							# feed the object
139	0						$self->{properties}{rows} = $rows;
140	0						$self->{properties}{cols} = $cols;
141
142	0						return;
143							}
144
145							#/ beware: this is a prototype and only used for deep-copying - don´t return at end as its called recursively and you´ll kill the deep copy
146							sub _deep_copy_arrays (\@) {
147	0			0			my $data_structure = shift;
148
149	0	0					if (!ref $data_structure) { $data_structure }
	0	0
150							elsif (ref $data_structure eq q{ARRAY} ) {
151	0						[ map { _deep_copy_arrays ($_) } @{$data_structure} ];
	0
	0
152							}
153	0						else { croak qq{\nYou must hand in an array ref. } }
154
155							#/ THIS METHOD IS CALLED RECURSIVELY! DON´T PUT RETURN IN HERE
156							# return;
157							}
158
159							sub _inplace_conversion {
160							#/ in place converstion to 2-d matrix to cells
161	0			0			my ($self, $a_ref) = @_;
162
163	0						for my $row (0..$#{$a_ref}) {
	0
164	0						for my $col (0..$#{$a_ref->[0]}) {
	0
165
166							# unpack value
167	0						my $observation = $a_ref->[$row][$col];
168
169	0						my $cell_h_ref = { observation => $observation,
170							};
171
172	0						$a_ref->[$row][$col] = $cell_h_ref;
173							}
174							}
175
176							#y feed the object the deep-copied inplace modified array
177	0						print qq{\n\nData passed all checks. Feeding $self->{properties}{rows} x $self->{properties}{cols} matrix to object.\n};
178	0						$self->{table} = $a_ref;
179
180	0						return;
181							}
182
183							sub diag {
184	0			0	0		my $self = shift;
185	0						print qq{\n-------------------------------------------------------\nobject dumper:\n}, Dumper $self;
186	0						print qq{-------------------------------------------------------\n\n};
187
188	0						return;
189							}
190
191							sub _sum_a_row {
192	0			0			my ( $self, $row ) = @_; # para de esquecer a colocar @_ quando unpacking!!!
193	0						my $table_a_ref = $self->{table};
194
195							#y basically the NW way
196	0						my $row_sum = sum map { $table_a_ref->[$row][$_]{observation} } ( 0..($self->{properties}{cols}-1) );
	0
197							#y my way
198	0						my $row_sum2 = sum map { $_->{observation} } @{$self->{table}[$row]};
	0
	0
199
200							#y basically the NW way
201							# my $col_sum1 = sum map { $table_a_ref->[$_][$col]{observation} } ( 0..($self->{properties}{rows}-1) );
202							#y my way
203							# my $col_sum2 = sum map { $_->[$col]{observation} } @{$self->{table}};
204
205	0						return $row_sum;
206							}
207
208							sub _sum_rows {
209
210	0			0			my $self = shift;
211	0						my $table_a_ref = $self->{table};
212	0						my @row_sums = ();
213
214	0						for my $row (0..$#{$table_a_ref}) {
	0
215
216	0						push @row_sums, $self->_sum_a_row($row);
217
218							}
219
220	0						$self->{properties}{row_sums} = [@row_sums];
221
222	0						return;
223							}
224
225							sub _sum_a_col {
226	0			0			my ( $self, $col ) = @_; # para de esquecer a colocar @_ quando unpacking!!!
227	0						my $table_a_ref = $self->{table};
228
229							#y basically the NW way
230	0						my $col_sum = sum map { $table_a_ref->[$_][$col]{observation} } ( 0..($self->{properties}{rows}-1) );
	0
231							#y my way
232	0						my $col_sum2 = sum map { $_->[$col]{observation} } @{$self->{table}};
	0
	0
233
234	0						return $col_sum;
235
236							}
237
238							sub _sum_cols {
239	0			0			my $self = shift;
240	0						my $table_a_ref = $self->{table};
241	0						my @col_sums = ();
242
243							#/ still need to fix this!!! use $self-{properties}{col}-1?!? - that way its all the same figure
244	0						for my $col (0..$#{$table_a_ref->[0]}) {
	0
245	0						push @col_sums, $self->_sum_a_col($col);
246							}
247	0						$self->{properties}{col_sums} = [@col_sums];
248
249	0						return;
250							}
251
252							sub _total {
253	0			0			my $self = shift;
254	0						my $total_from_rows = sum @{$self->{properties}{row_sums}};
	0
255	0						my $total_from_cols = sum @{$self->{properties}{col_sums}};
	0
256	0						$self->{properties}{total} = $total_from_rows;
257
258	0						return;
259							}
260
261							sub _calculate_expected {
262	0			0			my $self = shift;
263	0						my $a_ref = $self->{table};
264
265	0						my @row_sums = @{$self->{properties}{row_sums}};
	0
266	0						my @col_sums = @{$self->{properties}{col_sums}};
	0
267	0						my $total = $self->{properties}{total};
268
269	0						for my $row ( 0..($self->{properties}{rows}-1) ) {
270
271	0						for my $col (0..($self->{properties}{cols}-1) ) {
272
273	0						my $expected = ( $row_sums[$row] * $col_sums[$col] ) / $total;
274
275	0						$a_ref->[$row][$col]{expected} = $expected;
276							}
277							}
278
279	0						$self->{properties}{analysis}{expect_table} = 1;
280	0						return;
281							}
282
283							sub _calculate_f_ln_f {
284	0			0			my $self = shift;
285	0						my $a_ref = $self->{table};
286	0						for my $row ( 0..($self->{properties}{rows}-1) ) {
287	0						for my $col (0..($self->{properties}{cols}-1) ) {
288
289	0						my $observed = $a_ref->[$row][$col]{observation};
290	0						my $f_ln_f = _f_ln_f ($observed);
291	0						$a_ref->[$row][$col]{f_ln_f} = $f_ln_f;
292							}
293							}
294
295	0						return;
296							}
297
298							#/ beware: this is a prototype and not for use as an object method
299							sub _f_ln_f($) {
300	0			0			my $value = shift;
301	0	0					$value = $value > 0.5 ? $value * ( log ( $value ) ) : 0 ;
302	0						return $value;
303							}
304
305							sub _calculate_G_traditional {
306	0			0			my $self = shift;
307	0						my $table_a_ref = $self->{table};
308
309	0						my $sum = 0;
310
311	0						for my $row ( 0..($self->{properties}{rows}-1) ) {
312	0						for my $col (0..($self->{properties}{cols}-1) ) {
313
314	0						my $cell = $table_a_ref->[$row][$col]{observation} *
315							( log ( $table_a_ref->[$row][$col]{observation} / $table_a_ref->[$row][$col]{expected} ) );
316
317	0						$sum += $cell;
318							}
319							}
320	0						my $G = 2 * $sum;
321
322	0						$self->{properties}{G} = $G;
323
324	0						return;
325							}
326
327							sub _calculate_G_alternative {
328	0			0			my $self = shift;
329	0						my $table_a_ref = $self->{table};
330	0						my $sum = 0;
331
332							#/ don´t need this part###################################################
333	0						my @f_ln_f_for_row_sums = map { _f_ln_f($_) } @{$self->{properties}{row_sums}};
	0
	0
334	0						my @f_ln_f_for_col_sums = map { _f_ln_f($_) } @{$self->{properties}{col_sums}};
	0
	0
335	0						$self->{properties}{f_ln_f_for_row_sums} = [@f_ln_f_for_row_sums];
336	0						$self->{properties}{f_ln_f_for_col_sums} = [@f_ln_f_for_col_sums];
337							##########################################################################
338
339	0						my $sum_of_f_ln_f_for_row_sums = sum map { _f_ln_f($_) } @{$self->{properties}{row_sums}};
	0
	0
340	0						my $sum_of_f_ln_f_for_col_sums = sum map { _f_ln_f($_) } @{$self->{properties}{col_sums}};
	0
	0
341
342	0						my $f_ln_f_for_total = _f_ln_f ($self->{properties}{total});
343
344	0						$self->{properties}{f_ln_f_for_total} = [$f_ln_f_for_total];
345
346	0						my $sum_f_ln_f = 0;
347
348	0						for my $row ( 0..($self->{properties}{rows}-1) ) {
349
350							#/ either
351							#for my $col (0..($self->{properties}{cols}-1) ) {
352							#$row = $table_a_ref->[$row][$col]{observation};
353							#/ or
354	0						$row = sum map { $_->{f_ln_f} } @{$self->{table}[$row]};
	0
	0
355
356	0						$sum_f_ln_f += $row;
357							}
358
359	0						my $G = 2 * ( ( $sum_f_ln_f + $f_ln_f_for_total ) -
360							( $sum_of_f_ln_f_for_row_sums + $sum_of_f_ln_f_for_col_sums ) );
361
362	0						$self->{properties}{G} = $G;
363
364	0						return;
365							}
366
367							sub G {
368	0			0	0		my $self = shift;
369							#y check data loaded flag
370	0	0					croak qq{\nYou have to load some data before calling this method} if ( !exists $self->{properties}{analysis}{obs_table} );
371	0						$self->_sum_rows();
372	0						$self->_sum_cols();
373	0						$self->_total();
374	0						$self->_calculate_expected();
375	0						$self->_calculate_G_traditional();
376
377	0						$self->_df();
378	0						$self->_calculate_p_value();
379
380							#y flag to check this has run
381	0						$self->{properties}{analysis}{G} = 1;
382	0						return;
383
384							}
385							=head2 G
386
387							To calculate G value. This method implements the calculation described in http://en.wikipedia.org/wiki/G-test.
388
389							$gtest->G();
390
391							=cut
392
393							#/ make this redundant and using self->{prop}{G} for both G calculation methods - create a method to compare the two computed values.
394							sub G_alt {
395	0			0	0		my $self = shift;
396							#y check data loaded flag
397	0	0					croak qq{\nYou have to load some data before calling this method} if ( !exists $self->{properties}{analysis}{obs_table} );
398	0						$self->_sum_rows();
399	0						$self->_sum_cols();
400	0						$self->_total();
401	0						$self->_calculate_f_ln_f();
402	0						$self->_calculate_G_alternative();
403
404	0						$self->_df();
405	0						$self->_calculate_p_value();
406
407							#y flag to check this has run
408	0						$self->{properties}{analysis}{G_alt} = 1;
409	0						return;
410
411							}
412							=head2 G_alt
413
414							To calculate G you may also use this method. This method implements procedure described in
415							http://udel.edu/~mcdonald/statgtestind.html. This approach does not directly generate a table of expected values.
416
417							$gtest->G_alt();
418
419							=cut
420
421							sub _df {
422	0			0			my $self = shift;
423	0						my $df = ( $self->{properties}{rows}-1 ) * ( $self->{properties}{cols}-1 );
424	0						$self->{properties}{df} = $df;
425
426	0						return;
427							}
428
429							sub _calculate_p_value {
430	0			0			my $self = shift;
431	0						my $chisprob = chisqrprob ($self->{properties}{df}, $self->{properties}{G});
432	0						$self->{properties}{p_value} = $chisprob;
433
434	0						return;
435							}
436
437							sub _configure_table {
438	0			0			my ( $self, $which ) = @_;
439	0						my $max_len = 0;
440	0						for my $row (@{$self->{table}}) {
	0
441	0						my $temp = max map { length( sprintf ( q{%.0f}, $_->{$which}) ) } @{$row} ;
	0
	0
442	0	0					$max_len = $temp > $max_len ? $temp : $max_len;
443							}
444	0						return ($max_len) x $self->{properties}{cols};
445							}
446
447							sub _print_table {
448	0			0			my ( $self, $which ) = @_;
449
450	0	0	0				( $which eq q{observation} \|\| $which eq q{expected} ) \|\|
451							croak qq{\nUsage is \$object->print_observed or \$object->print_expected};
452
453							#croak qq{\nUsage is \$object->print_observed or \$object->print_expected}
454							#unless ( ( $which eq q{observed} ) \|\| ( $which eq q{expected} ) );
455							#if !( ( $which eq q{observed} ) \|\| ( $which eq q{expected} ) );
456
457	0						my $table = Text::SimpleTable->new($self->_configure_table($which));
458	0						my $count = 0;
459	0						for my $row (@{$self->{table}}) {
	0
460	0	0					$table->hr if ( $count != 0 );
461	0						$table->row( ( map { sprintf ( q{%.0f}, $_->{$which} ) } @{$row} ) );
	0
	0
462	0						$count++;
463							}
464	0						print qq{\nTable of $which values:\n}, $table->draw;
465
466	0						return;
467							}
468
469							sub print_expected {
470							# convinience method - stops calling of private _print_table
471	0			0	0		my $self = shift;
472
473							#y check data loaded flag
474	0	0					croak qq{\nYou have to load some data before calling this method} if ( !exists $self->{properties}{analysis}{obs_table} );
475
476							#y check whether _calculated_expected was called
477	0	0	0				if ( ( exists $self->{properties}{analysis}{G_alt} ) && ( !exists $self->{properties}{analysis}{expect_table} ) ) {
478	0						print qq{\nIt appears you ran the alternative procedure that does not directly generate an expected values table. }
479							.qq{\nGenerating expected values table now.\n};
480	0						$self->_calculate_expected();
481							}
482
483							#y check the analysis flags
484							#y !$x and !$y and croak
485	0	0	0				croak qq{\nYou must run the analysis with either method G or G_alt first}
486							if ( ( !exists $self->{properties}{analysis}{G} ) && ( !exists $self->{properties}{analysis}{G_alt} ) );
487
488	0						$self->_print_table(q{expected});
489	0						return;
490							}
491							=head2 print_expected
492
493							Prints a table of the calculated expected values to STDOUT. If you used G_alt to calculate G it will first generated the
494							table of excpeted values.
495
496							$gtest->print_expected();
497
498							=cut
499
500							sub print_observed {
501							# convinience method - stops calling of private _print_table
502	0			0	0		my $self = shift;
503							#y check data loaded flag
504	0	0					croak qq{\nYou have to load some data before calling this method} if ( !exists $self->{properties}{analysis}{obs_table} );
505	0						$self->_print_table(q{observation});
506	0						return;
507							}
508							=head2 print_observed
509
510							Prints a table of the observation values to STDOUT.
511
512							$gtest->print_observed();
513
514							=cut
515
516							sub results {
517	0			0	0		my ($self, $thing ) = @_;
518							#y check the analysis flags
519	0	0	0				croak qq{\nYou must run the analysis with either method G or G_alt first}
520							if ( ( !exists $self->{properties}{analysis}{G} ) && ( !exists $self->{properties}{analysis}{G_alt} ) );
521
522							#y preferentially grab the traditional method result
523
524							#my $G = ( !exists $self->{properties}{G} ) ? $self->{properties}{G} : $self->{properties}{G_alt};
525							#/ twat!
526							#my $G = exists $self->{properties}{G};
527	0						my $G = $self->{properties}{G};
528
529	0						my $df = $self->{properties}{df};
530	0						my $p_val = $self->{properties}{p_value};
531
532	0	0					if ( $p_val < 1e-05 ) { $p_val = sprintf (q{%e}, $p_val) }
	0
533
534	0						$G = sprintf ( q{%.5f}, $G );
535
536							return (
537	0			0			VOID { $self->_void ($G, $df, $p_val) }
538	0			0			LIST { ($G, $df, $p_val ) }
539	0			0			BOOL { $self->_boolean($G, $df, $thing) }
540	0			0			NUM { $G ; }
541	0						);
542
543							}
544							=head2 results
545
546							Used to access the results of the G-test calculation. This method is context-dependent and will return a variety of
547							different values depending on its calling context. In VOID context it simply prints the calculated value of G, df and
548							the p_value in a table to STDOUT.
549
550							$gtest->results();
551
552							In BOOLEAN context it requires you to pass it a value for the significance level of the test you wish to apply e.g.
553							0.05. It returns True or False depending on whether the null hypothesis is rejected at that significance level.
554
555							# test if the result is significant at the p = 0.05 level.
556							if ($gtest->results( 0.05 )) { print qq{\nthis is significant } } else { print qq{\nthis is not significant} }
557
558							In LIST context it simply returns a LIST of the calculated values of G, df and p for the observation data.
559
560							my ($G, $df, $p) = $gtest->results();
561
562							In NUMERIC context it returns the calculated value of G.
563
564							print qq{\n\nG in numeric is: }, 0+$gtest->results();
565
566							=cut
567
568							sub _void {
569	0			0			my ( $self, $G, $df, $p_val ) = @_;
570
571	0	0					my $g_len = length ($G) > 7 ? length ($G) : 7;
572	0	0					my $df_len = length ($df) > 3 ? length ($df) : 3;
573	0	0					my $p_len = length ($p_val) > 7 ? length ($p_val) : 7;
574
575	0						my $table = Text::SimpleTable->new($g_len, $df_len, $p_len);
576	0						$table->row( qw/ G_value df p_value / );
577	0						$table->hr;
578	0						$table->row( $G, $df, $p_val );
579	0						print qq{\nTable of results:\n}, $table->draw;
580
581	0						return;
582							}
583
584							sub _boolean {
585	0			0			my ($self, $G, $df, $sig) = @_;
586
587	0	0					$sig or croak qq{\nYou must pass a p_value in BOOLEAN context};
588
589	0						print qq{\nsig $sig};
590
591	0	0	0				croak qq{\nThe p value must be numeric and in the range > 0 and < 1.} if ( $sig !~ /\A \d* \.? \d+ ([eE][+-]?\d+)? \z/xms \|\| $sig <= 0 \|\| $sig >= 1) ;
			0
592							#/ forgot to check for exponential numbers
593							#if ( $sig !~ /\A[01]?\.\d{1,7}\z/xms \|\| $sig <= 0 \|\| $sig >= 1) ;
594
595	0	0					$G = $G > chisqrdistr ( $df, $sig ) ? 1 : undef;
596	0						return $G;
597							}
598
599							1; # Magic true value required at end of module
600
601							__END__