File Coverage

blib/lib/Text/NSP/Measures/2D/Fisher/right.pm
Criterion Covered Total %
statement 44 51 86.2
branch 14 24 58.3
condition n/a
subroutine 6 7 85.7
pod n/a
total 64 82 78.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Text::NSP::Measures::2D::Fisher::right - Perl module implementation of the right sided
4             Fisher's exact test.
5              
6             =head1 SYNOPSIS
7              
8             =head3 Basic Usage
9              
10             use Text::NSP::Measures::2D::Fisher::right;
11              
12             my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10;
13              
14             $right_value = calculateStatistic( n11=>$n11,
15             n1p=>$n1p,
16             np1=>$np1,
17             npp=>$npp);
18              
19             if( ($errorCode = getErrorCode()))
20             {
21             print STDERR $errorCode." - ".getErrorMessage();
22             }
23             else
24             {
25             print getStatisticName."value for bigram is ".$right_value;
26             }
27              
28              
29             =head1 DESCRIPTION
30              
31             Assume that the frequency count data associated with a bigram
32             is stored in a 2x2 contingency table:
33              
34             word2 ~word2
35             word1 n11 n12 | n1p
36             ~word1 n21 n22 | n2p
37             --------------
38             np1 np2 npp
39              
40             where n11 is the number of times occur together, and
41             n12 is the number of times occurs with some word other than
42             word2, and n1p is the number of times in total that word1 occurs as
43             the first word in a bigram.
44              
45             The fishers exact tests are calculated by fixing the marginal totals
46             and computing the hypergeometric probabilities for all the possible
47             contingency tables,
48              
49             A right sided test is calculated by adding the probabilities of all
50             the possible two by two contingency tables formed by fixing the
51             marginal totals and changing the value of n11 to greater than or
52             equal to the given value. A right sided Fisher's Exact Test tells us
53             how likely it is to randomly sample a table where n11 is greater
54             than observed. In other words, it tells us how likely it is to sample
55             an observation where the two words are more dependent than currently
56             observed.
57              
58             =head2 Methods
59              
60             =over
61              
62             =cut
63              
64             package Text::NSP::Measures::2D::Fisher::right;
65              
66              
67 1     1   2291 use Text::NSP::Measures::2D::Fisher;
  1         3  
  1         234  
68 1     1   5 use strict;
  1         3  
  1         28  
69 1     1   5 use Carp;
  1         2  
  1         66  
70 1     1   5 use warnings;
  1         2  
  1         28  
71 1     1   5 no warnings 'redefine';
  1         2  
  1         514  
72             require Exporter;
73              
74             our ($VERSION, @EXPORT, @ISA);
75              
76             @ISA = qw(Exporter);
77              
78             @EXPORT = qw(initializeStatistic calculateStatistic
79             getErrorCode getErrorMessage getStatisticName);
80              
81             $VERSION = '0.97';
82              
83              
84             =item calculateStatistic() - This method calculates the right Fisher value
85              
86             INPUT PARAMS : $count_values .. Reference of an hash containing
87             the count values computed by the
88             count.pl program.
89              
90             RETURN VALUES : $right .. Right Fisher value.
91              
92             =cut
93              
94             sub calculateStatistic
95             {
96 12     12   8451 my %values = @_;
97              
98 12         16 my $probabilities;
99 12         13 my $left_flag = 0;
100              
101             # computes and returns the observed and marginal values from
102             # the frequency combination values. returns 0 if there is an
103             # error in the computation or the values are inconsistent.
104 12 100       41 if( !(Text::NSP::Measures::2D::Fisher::getValues(\%values)) )
105             {
106 10         23 return;
107             }
108              
109 2 50       7 my $final_limit = ($n1p < $np1) ? $n1p : $np1;
110 2         11 my $n11_org = $n11;
111              
112 2         5 my $n11_start = $n1p + $np1 - $npp;
113 2 100       6 if($n11_start < $n11)
114             {
115 1         3 $n11_start = $n11;
116             }
117              
118              
119             # to make the computations faster, we check which would require less computations
120             # computing the leftfisher value and subtracting it from 1 or directly computing
121             # the right fisher value. We do this since, generally for bigrams n11 is quite small
122             # so its much faster to compute the left Fisher value.
123 2         5 my $left_final_limit = $n11-1;
124 2         3 my $left_n11 = $n1p + $np1 - $npp;
125 2 100       7 if($left_n11<0)
126             {
127 1         2 $left_n11 = 0;
128             }
129              
130             # if computing the left fisher values first will take lesser amount of time them
131             # we set a flag for later reference and then compute the leftfisher score for
132             # n11-1 and then subtract the total score from one to get the right fisher value.
133 2 50       7 if(($left_final_limit - $left_n11) < ($final_limit - $n11_start))
134             {
135 2         3 $left_flag = 1;
136 2 50       8 if( !($probabilities = Text::NSP::Measures::2D::Fisher::computeDistribution($left_n11, $left_final_limit)))
137             {
138 0         0 return;
139             }
140             }
141              
142             #else we compute the value normally and simply sum to get the rightfisher value.
143             else
144             {
145 0 0       0 if( !($probabilities = Text::NSP::Measures::2D::Fisher::computeDistribution($n11_start, $final_limit)))
146             {
147 0         0 return;
148             }
149             }
150              
151 2         3 my $key_n11;
152              
153 2         3 my $rightfisher=0;
154              
155 2         12 foreach $key_n11 (sort { $b <=> $a } keys %$probabilities)
  18         26  
156             {
157 11 50       20 if($left_flag)
158             {
159 11 100       25 if($key_n11 >= $n11_org)
160             {
161 1         2 last;
162             }
163             }
164             else
165             {
166 0 0       0 if($key_n11 < $n11_org)
167             {
168 0         0 last;
169             }
170             }
171 10         33 $rightfisher += exp($probabilities->{$key_n11});
172             }
173              
174             # if we computed the leftfisher value to get the right fisher value, we subtract
175             # the sum of the probabilities for the tables from one to get the right fisher score.
176 2 50       7 if($left_flag)
177             {
178 2 50       7 if ($rightfisher > 1)
179             {
180 0         0 $rightfisher = 0;
181             }
182             else
183             {
184 2         3 $rightfisher = 1 - $rightfisher;
185             }
186             }
187              
188 2         12 return $rightfisher;
189             }
190              
191              
192             =item getStatisticName() - Returns the name of this statistic
193              
194             INPUT PARAMS : none
195              
196             RETURN VALUES : $name .. Name of the measure.
197              
198             =cut
199              
200             sub getStatisticName
201             {
202 0     0     return "Right Fisher";
203             }
204              
205              
206              
207             1;
208             __END__