File Coverage

blib/lib/Text/NSP/Measures/2D/Fisher.pm
Criterion Covered Total %
statement 92 96 95.8
branch 13 14 92.8
condition n/a
subroutine 7 7 100.0
pod 2 3 66.6
total 114 120 95.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Text::NSP::Measures::2D::Fisher - Perl module that provides methods
4             to compute the Fishers exact tests.
5              
6             =head1 SYNOPSIS
7              
8             =head3 Basic Usage
9              
10             use Text::NSP::Measures::2D::Fisher::left;
11              
12             my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10;
13              
14             $left_value = calculateStatistic( n11=>$n11,
15             n1p=>$n1p,
16             np1=>$np1,
17             npp=>$npp);
18              
19             if( ($errorCode = getErrorCode()))
20             {
21             print STDERR $errorCode." - ".getErrorMessage();
22             }
23             else
24             {
25             print getStatisticName."value for bigram is ".$left_value;
26             }
27              
28              
29             =head1 DESCRIPTION
30              
31             Assume that the frequency count data associated with a bigram
32             is stored in a 2x2 contingency table:
33              
34             word2 ~word2
35             word1 n11 n12 | n1p
36             ~word1 n21 n22 | n2p
37             --------------
38             np1 np2 npp
39              
40             where n11 is the number of times occur together, and
41             n12 is the number of times occurs with some word other than
42             word2, and n1p is the number of times in total that word1 occurs as
43             the first word in a bigram.
44              
45             The fishers exact tests are calculated by fixing the marginal totals
46             and computing the hypergeometric probabilities for all the possible
47             contingency tables,
48              
49             A left sided test is calculated by adding the probabilities of all
50             the possible two by two contingency tables formed by fixing the
51             marginal totals and changing the value of n11 to less than the given
52             value. A left sided Fisher's Exact Test tells us how likely it is to
53             randomly sample a table where n11 is less than observed. In other words,
54             it tells us how likely it is to sample an observation where the two words
55             are less dependent than currently observed.
56              
57             A right sided test is calculated by adding the probabilities of all
58             the possible two by two contingency tables formed by fixing the
59             marginal totals and changing the value of n11 to greater than or
60             equal to the given value. A right sided Fisher's Exact Test tells us
61             how likely it is to randomly sample a table where n11 is greater
62             than observed. In other words, it tells us how likely it is to sample
63             an observation where the two words are more dependent than currently
64             observed.
65              
66             A two-tailed fishers test is calculated by adding the probabilities of
67             all the contingency tables with probabilities less than the probability
68             of the observed table. The two-tailed fishers test tells us how likely
69             it would be to observe an contingency table which is less probable than
70             the current table.
71              
72             =head2 Methods
73              
74             =over
75              
76             =cut
77              
78              
79             package Text::NSP::Measures::2D::Fisher;
80              
81              
82 4     4   11958 use Text::NSP::Measures::2D;
  4         9  
  4         1023  
83 4     4   18 use strict;
  4         8  
  4         91  
84 4     4   17 use Carp;
  4         6  
  4         187  
85 4     4   18 use warnings;
  4         7  
  4         4049  
86             # use subs(calculateStatistic);
87             require Exporter;
88              
89             our ($VERSION, @EXPORT, @ISA);
90              
91             @ISA = qw(Exporter);
92              
93             @EXPORT = qw(initializeStatistic calculateStatistic
94             getErrorCode getErrorMessage getStatisticName
95             $n11 $n12 $n21 $n22 $m11 $m12 $m21 $m22
96             $npp $np1 $np2 $n2p $n1p $errorCodeNumber
97             $errorMessage);
98              
99             $VERSION = '0.97';
100              
101              
102             =item getValues() -This method calls the
103             computeObservedValues() and the computeExpectedValues() methods to
104             compute the observed and marginal total values. It checks these values
105             for any errors that might cause the Fishers Exact test measures to
106             fail.
107              
108             INPUT PARAMS : $count_values .. Reference of an array containing
109             the count values computed by the
110             count.pl program.
111              
112             RETURN VALUES : 1/undef ..returns '1' to indicate success
113             and an undefined(NULL) value to indicate
114             failure.
115              
116             =cut
117              
118             sub getValues
119             {
120 36     36 1 54 my $values = shift;
121              
122             # computes and returns the marginal totals from the frequency
123             # combination values. returns undef if there is an error in
124             # the computation or the values are inconsistent.
125 36 100       100 if(!(Text::NSP::Measures::2D::computeMarginalTotals($values)) ){
126 15         50 return;
127             }
128              
129             # computes and returns the observed and marginal values from
130             # the frequency combination values. returns 0 if there is an
131             # error in the computation or the values are inconsistent.
132 21 100       58 if( !(Text::NSP::Measures::2D::computeObservedValues($values)) ) {
133 15         50 return;
134             }
135              
136 6         34 return 1;
137             }
138              
139              
140             =item computeDistribution() - This method calculates the probabilities
141             for all the possible tables
142              
143             INPUT PARAMS : $n11_start .. the value for the cell 1,1 in the first contingency
144             table
145             $final_limit .. the value of cell 1,1 in the last contingency table
146             for which we have to compute the probability.
147              
148             RETURN VALUES : $probability .. Reference to a hash containing hypergeometric
149             probabilities for all the possible contingency
150             tables
151              
152             =cut
153              
154             sub computeDistribution
155             {
156 6     6 1 11 my $n11_start = shift @_;
157 6         33 my $final_limit = shift @_;
158              
159             # first sort the numerator array in the descending order.
160 6         32 my @numerator = sort { $b <=> $a } ($n1p, $np1, $n2p, $np2);
  24         45  
161              
162             # initialize the hash to store the probability distribution values.
163 6         13 my %probability = ();
164              
165             # declare some temporary variables for use in loops and computing the values.
166 6         16 my $i;
167 6         10 my $j=0;
168              
169             # initialize the product variable to be used in the probability computation.
170 6         9 my $product = 0;
171              
172             # set the values for the first contingency table.
173 6         12 $n11 = $n11_start;
174 6         10 $n12 = $n1p-$n11;
175 6         292 $n21 = $np1-$n11;
176 6         18 $n22 = $n2p - $n21;
177              
178 6         21 while($n22 < 0)
179             {
180 0         0 $n11++;
181 0         0 $n12 = $n1p - $n11;
182 0         0 $n21 = $np1 - $n11;
183 0         0 $n22 = $n2p - $n21;
184             }
185              
186             # declare the denominator array.
187 6         12 my @denominator = ();
188              
189 6         10 $product = 0;
190              
191 6         9 my $prob = 0;
192              
193 6         24 $i = $n11;
194 6         12 $n12 = $n1p - $i;
195 6         11 $n21 = $np1 - $i;
196 6         11 $n22 = $n2p - $n21;
197              
198             # initialize the denominator array with values sorted in the descending order.
199 6         25 @denominator = sort { $b <=> $a } ($npp, $n22, $n12, $n21, $i);
  51         69  
200              
201             #decalare other variables for use in computation.
202 6         19 my @dLimits = ();
203 6         10 my @nLimits = ();
204 6         11 my $dIndex = 0;
205 6         12 my $nIndex = 0;
206              
207             # set the dLimits and nLimits arrays to be used in the cancellation of factorials
208             # and to be used in the computation of factorial.
209             # the dLimits and the nLimits allow us to cancel out factorials in the numerator
210             # and the denominator. for example:
211             # 6! 1*2*3*4*5*6
212             # --- = --------------- = 5*6
213             # 4! 1*2*3*4
214             #
215             # we achieve this by defining a range within which all the
216             # nos must be multiplied. So every pair of entries in the nLimits array defines a range
217             # so for the above case the entries would be:
218             # 5,6
219             #
220 6         21 for ( $j = 0; $j < 4; $j++ )
221             {
222 24 100       93 if ( $numerator[$j] > $denominator[$j] )
    100          
223             {
224 6         13 $nLimits[$nIndex] = $denominator[$j] + 1;
225 6         12 $nLimits[$nIndex+1] = $numerator[$j];
226 6         16 $nIndex += 2;
227             }
228             elsif ( $denominator[$j] > $numerator[$j] )
229             {
230 6         18 $dLimits[$dIndex] = $numerator[$j] + 1;
231 6         13 $dLimits[$dIndex+1] = $denominator[$j];
232 6         16 $dIndex += 2;
233             }
234             }
235 6         11 $dLimits[$dIndex] = 1;
236 6         12 $dLimits[$dIndex+1] = $denominator[4];
237              
238             # since, all the variables have been initialized, we start the computations.
239 6         19 $product = computeHyperGeometric(\@dLimits, \@nLimits);
240 6         18 $probability{$i} = $product;
241 6         682 $prob = $probability{$i};
242              
243             # to reduce the no. of computations and the make the measure more efficient
244             # we use the previous tables probabilities to compute the new tables probabilities
245             # we can do this because the counts in the table will change by only a factor of 1
246             # thus instead of repeating all those multiplications we have to perform only
247             # 4 multiplications.
248 6         12 my $subproduct = 0;
249              
250 6         23 for ($i = $n11+1; $i <= $final_limit; $i++ )
251             {
252 49         80 $subproduct += log $n12;
253 49         46 $n22++;
254 49         61 $subproduct -= log $n22;
255 49         63 $subproduct += log $n21;
256 49         55 $n12--;
257 49         48 $n21--;
258 49         63 $subproduct -= log $i;
259 49         102 $probability{$i} = $product+$subproduct;
260 49 50       124 if($probability{$i} != 0)
261             {
262 49         92 $product = $product+$subproduct;
263 49         109 $subproduct=0;
264             }
265             }
266              
267              
268 6         41 return (\%probability);
269             }
270              
271              
272              
273             sub computeHyperGeometric
274             {
275 6     6 0 19 my $dLimits = shift @_;
276 6         11 my $nLimits = shift @_;
277 6         11 my $product = 0;
278              
279             # compute the probability now, since all the variables have been initialized.
280 6         24 while ( defined ( $nLimits->[0] ) )
281             {
282 6         27 while ( defined ( $nLimits->[0] ) )
283             {
284 90         154 $product += log $nLimits->[0];
285 90         94 $nLimits->[0]++;
286 90 100       244 if ( $nLimits->[0] > $nLimits->[1] )
287             {
288 6         8 shift @{$nLimits};
  6         49  
289 6         605 shift @{$nLimits};
  6         29  
290             }
291             }
292 6         20 while ( defined ( $dLimits->[0] ) )
293             {
294 96         162 $product -= log $dLimits->[0];
295 96         105 $dLimits->[0]++;
296 96 100       909 if ( $dLimits->[0] > $dLimits->[1] )
297             {
298 12         42 shift @{$dLimits};
  12         64  
299 12         14 shift @{$dLimits};
  12         45  
300             }
301             }
302             }
303 6         15 return $product;
304             }
305              
306              
307             1;
308             __END__