File Coverage

blib/lib/Text/NSP/Measures/2D.pm
Criterion Covered Total %
statement 76 91 83.5
branch 25 30 83.3
condition 2 3 66.6
subroutine 7 7 100.0
pod 3 3 100.0
total 113 134 84.3


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Text::NSP::Measures::2D - Perl module that provides a basic framework
4             for building measures of association for
5             bigrams.
6              
7             =head1 SYNOPSIS
8              
9             =head3 Basic Usage
10              
11             use Text::NSP::Measures::2D::MI::ll;
12              
13             my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10;
14              
15             $ll_value = calculateStatistic( n11=>$n11,
16             n1p=>$n1p,
17             np1=>$np1,
18             npp=>$npp);
19              
20             if( ($errorCode = getErrorCode()))
21             {
22             print STDERR $errorCode." - ".getErrorMessage()."\n";
23             }
24             else
25             {
26             print getStatisticName."value for bigram is ".$ll_value."\n";
27             }
28              
29              
30             =head1 DESCRIPTION
31              
32             This module is to be used as a foundation for building 2-dimensional
33             measures of association. The methods in this module retrieve observed
34             bigram frequency counts, marginal totals, and also compute expected
35             values. They also provide error checks for these counts.
36              
37             With bigram or 2d measures we use a 2x2 contingency table, held in
38             variables with corresponding names, to store the frequency counts
39             associated with each word in the bigram, as well as the number of
40             times the bigram occurs. A contingency table looks like this:
41              
42                       |word2  | not-word2|
43                       --------------------
44               word1   | n11   |   n12    |  n1p
45             not-word1 | n21   |   n22    |  n2p
46                       --------------------
47                         np1       np2       npp
48              
49             Marginal Frequencies:
50              
51             n1p = the number of bigrams where the first word is word1.
52             np1 = the number of bigrams where the second word is word2.
53             n2p = the number of bigrams where the first word is not word1.
54             np2 = the number of bigrams where the second word is not word2.
55              
56             These marginal totals are stored in variables which have names
57             corresponding to the cell they represent. These values may then be
58             referred to as follows:
59              
60             $n1p,
61             $np1,
62             $n2p,
63             $np2,
64             $npp
65              
66             Observed Frequencies:
67              
68             n11 = number of times the bigram occurs, joint frequency
69             n12 = number of times word1 occurs in the first position of a bigram
70             when word2 does not occur in the second position.
71             n21 = number of times word2 occurs in the second position of a
72             bigram when word1 does not occur in the first position.
73             n22 = number of bigrams where word1 is not in the first position and
74             word2 is not in the second position.
75              
76             The observed frequencies are also stored in variables with corresponding names.
77             These values may then be referred to as follows:
78              
79              
80             $n11,
81             $n12,
82             $n21,
83             $n22
84              
85             Expected Frequencies:
86              
87             m11 = expected number of times both words in the bigram occur
88             together if they are independent. (n1p*np1/npp)
89             m12 = expected number of times word1 in the bigram will occur in
90             the first position when word2 does not occur in the second
91             position given that the words are independent. (n1p*np2/npp)
92             m21 = expected number of times word2 in the bigram will occur
93             in the second position when word1 does not occur in the first
94             position given that the words are independent. (np1*n2p/npp)
95             m22 = expected number of times word1 will not occur in the first
96             position and word2 will not occur in the second position
97             given that the words are independent. (n2p*np2/npp)
98              
99             Similarly the expected values are stored as
100              
101             $m11,
102             $m12,
103             $m21,
104             $m22
105              
106             =head2 Methods
107              
108             =over
109              
110             =cut
111              
112              
113             package Text::NSP::Measures::2D;
114              
115              
116 19     19   17621 use Text::NSP::Measures;
  19         48  
  19         4442  
117 19     19   99 use strict;
  19         37  
  19         503  
118 19     19   142 use Carp;
  19         38  
  19         7575  
119 19     19   112 use warnings;
  19         30  
  19         38541  
120             require Exporter;
121              
122             our ($VERSION, @ISA, @EXPORT);
123              
124             @ISA = qw(Exporter);
125              
126             our ($n11, $n12, $n21, $n22);
127             our ($m11, $m12, $m21, $m22);
128             our ($npp, $n1p, $np1, $n2p, $np2);
129             # $npp = -1; $n1p = -1; $np1 = -1;
130             # $n2p = -1; $np2 = -1;
131              
132              
133             @EXPORT = qw(initializeStatistic calculateStatistic
134             getErrorCode getErrorMessage getStatisticName
135             $errorCodeNumber $errorMessage
136             $n11 $n12 $n21 $n22 $m11 $m12 $m21 $m22
137             $npp $np1 $np2 $n2p $n1p);
138              
139             $VERSION = '0.97';
140              
141              
142             =item computeObservedValues() - A method to compute observed values,
143             and also to verify that the computed observed values are correct,
144             that is, they are non-negative and do not exceed the marginal totals
145             or the total bigram count.
146              
147              
148             INPUT PARAMS : $count_values .. Reference to a hash consisting
149             of the count values passed to
150             the calculateStatistic() method.
151              
152             RETURN VALUES : 1/undef ..returns '1' to indicate success
153             and an undefined(NULL) value to indicate
154             failure.
155             =cut
156              
157             sub computeObservedValues
158             {
159 197     197 1 307 my ($values) = @_;
160              
161 197 100       447 if(!defined $values->{n11})
162             {
163 14         54 $errorMessage = "Required frequency count (1,1) not passed";
164 14         33 $errorCodeNumber = 200;
165 14         55 return;
166             }
167             else
168             {
169 183         270 $n11 = $values->{n11};
170             }
171             # joint frequency should be greater than equal to zero
172 183 100       529 if ($n11 < 0)
173             {
174 14         35 $errorMessage = "Frequency value 'n11' must not be negative.";
175 14         24 $errorCodeNumber = 201;
176 14         109 return;
177             }
178              
179             # joint frequency (n11) should be less than or equal to the
180             # total number of bigrams (npp)
181 169 100       907 if($n11 > $npp)
182             {
183 14         36 $errorMessage = "Frequency value 'n11' must not exceed total number of bigrams.";
184 14         26 $errorCodeNumber = 202;
185 14         684 return;
186             }
187              
188             # joint frequency should be less than or equal to the marginal totals
189 155 100 66     774 if ($n11 > $np1 || $n11 > $n1p)
190             {
191 14         36 $errorMessage = "Frequency value of ngram 'n11' must not exceed the marginal totals.";
192 14         27 $errorCodeNumber = 202;
193 14         66 return;
194             }
195              
196             # The marginal totals are reasonable so we can
197             # calculate the observed frequencies
198 141         202 $n12 = $n1p - $n11;
199 141         200 $n21 = $np1 - $n11;
200 141         795 $n22 = $np2 - $n12;
201              
202 141 50       330 if ($n12 < 0)
203             {
204 0         0 $errorMessage = "Frequency value 'n12' must not be negative.";
205 0         0 $errorCodeNumber = 201;
206 0         0 return;
207             }
208              
209 141 50       293 if ($n21 < 0)
210             {
211 0         0 $errorMessage = "Frequency value 'n21' must not be negative.";
212 0         0 $errorCodeNumber = 201;
213 0         0 return;
214             }
215              
216 141 100       298 if ($n22 < 0)
217             {
218 14         34 $errorMessage = "Frequency value 'n22' must not be negative.";
219 14         80 $errorCodeNumber = 201;
220 14         61 return;
221             }
222              
223 127         481 return 1;
224             }
225              
226              
227              
228             =item computeExpectedValues() - A method to compute expected values.
229              
230              
231             INPUT PARAMS :none
232              
233             RETURN VALUES : 1/undef ..returns '1' to indicate success
234             and an undefined(NULL) value to indicate
235             failure.
236              
237             =cut
238              
239             sub computeExpectedValues
240             {
241             # calculate the expected values
242 83     83 1 579 $m11 = $n1p * $np1 / $npp;
243 83         119 $m12 = $n1p * $np2 / $npp;
244 83         114 $m21 = $n2p * $np1 / $npp;
245 83         179 $m22 = $n2p * $np2 / $npp;
246              
247 83         255 return 1;
248             }
249              
250              
251              
252             =item computeMarginalTotals() - This method computes the marginal totals from the count values as
253             passed to it.
254              
255              
256             INPUT PARAMS : $count_values .. Reference to a hash consisting
257             of the frequency combination
258             output.
259              
260             RETURN VALUES : 1/undef ..returns '1' to indicate success
261             and an undefined(NULL) value to indicate
262             failure.
263              
264             =cut
265              
266             sub computeMarginalTotals
267             {
268              
269 267     267 1 14939 my ($values)=@_;
270              
271 267 100       925 if(!defined $values->{npp})
    100          
272             {
273 14         46 $errorMessage = "Total bigram count not passed";
274 14         665 $errorCodeNumber = 200;
275 14         106 return;
276             }
277             elsif($values->{npp}<=0)
278             {
279 14         36 $errorMessage = "Total bigram count cannot be less than to zero";
280 14         547 $errorCodeNumber = 204;
281 14         62 return;
282             }
283             else
284             {
285 239         362 $npp = $values->{npp};
286             }
287              
288 239         402 $n1p=-1;
289 239 100       463 if(!defined $values->{n1p})
290             {
291 14         35 $errorMessage = "Required Marginal total (1,p) count not passed";
292 14         27 $errorCodeNumber = 200;
293 14         77 return;
294             }
295             else
296             {
297 225         360 $n1p=$values->{n1p};
298             }
299             # right frequency (n1p) should be greater than or equal to zero
300 225 100       568 if ($n1p < 0)
301             {
302 14         38 $errorMessage = "Marginal total value 'n1p' must not be negative.";
303 14         827 $errorCodeNumber = 204;
304 14         84 return;
305             }
306             # right frequency (n1p) should be less than or equal to the total
307             # number of bigrams (npp)
308 211 100       473 if ($n1p > $npp)
309             {
310 14         57 $errorMessage = "Marginal total value 'n1p' must not exceed total number of bigrams.";
311 14         259 $errorCodeNumber = 203;
312 14         72 return;
313             }
314              
315              
316 197         232 $np1 = -1;
317 197 50       10120 if(!defined $values->{np1})
318             {
319 0         0 $errorMessage = "Required Marginal total (p,1) count not passed";
320 0         0 $errorCodeNumber = 200;
321 0         0 return;
322             }
323             else
324             {
325 197         281 $np1=$values->{np1};
326             }
327             # left frequency (np1) should be greater than or equal to zero
328 197 50       511 if ($np1 < 0)
329             {
330 0         0 $errorMessage = "Marginal total value 'np1' must not be negative.";
331 0         0 $errorCodeNumber = 204;
332 0         0 return;
333             }
334             # left frequency (np1) should be less than or equal to the total
335             # number of bigrams (npp)
336 197 50       426 if ($np1 > $npp)
337             {
338 0         0 $errorMessage = "Marginal total value 'np1' must not exceed total number of bigrams.";
339 0         0 $errorCodeNumber = 203;
340 0         0 return;
341             }
342              
343 197         301 $np2 = $npp - $np1;
344 197         254 $n2p = $npp - $n1p;
345              
346 197         702 return 1;
347             }
348              
349              
350              
351             1;
352             __END__