File Coverage

blib/lib/Text/NSP/Measures/3D.pm
Criterion Covered Total %
statement 138 231 59.7
branch 57 90 63.3
condition 20 48 41.6
subroutine 7 7 100.0
pod 3 3 100.0
total 225 379 59.3


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Text::NSP::Measures::3D - Perl module that provides a basic framework for
4             building measures of association for trigrams.
5              
6             =head1 SYNOPSIS
7              
8             This module can be used as a foundation for building 3-dimensional
9             measures of association that can then be used by statistic.pl. In
10             particular this module provides methods that give convenient access to
11             3-d (i.e., trigram) frequency counts as created by count.pl, as well as
12             some degree of error handling that verifies the data.
13              
14              
15             =head3 Basic Usage
16              
17             use Text::NSP::Measures::3D::MI::ll;
18              
19             $ll_value = calculateStatistic( n111=>10,
20             n1pp=>40,
21             np1p=>45,
22             npp1=>42,
23             n11p=>20,
24             n1p1=>23,
25             np11=>21,
26             nppp=>100);
27              
28             if( ($errorCode = getErrorCode()))
29             {
30             print STDERR $errorCode." - ".getErrorMessage()."\n";
31             }
32             else
33             {
34             print getStatisticName()." value for trigram is ".$ll_value."\n";
35             }
36              
37             =head1 DESCRIPTION
38              
39             The methods in this module retrieve observed trigram frequency counts and
40             marginal totals, and also compute expected values. They also provide
41             support for error checking of the output produced by count.pl. These
42             methods are used in all the trigram (3d) measure modules provided in NSP.
43             If you are writing your own 3d measure, you can use these methods as well.
44              
45             With trigram or 3d measures we use a 2x2x2 contingency table to store the
46             frequency counts associated with each word in the trigram, as well as the
47             number of times the trigram occurs. The notation we employ is as follows:
48              
49             Marginal Frequencies:
50              
51             n1pp = the number of trigrams where the first word is word1.
52             np1p = the number of trigrams where the second word is word2.
53             npp1 = the number of trigrams where the third word is word3.
54             n2pp = the number of trigrams where the first word is not word1.
55             np2p = the number of trigrams where the second word is not word2.
56             npp2 = the number of trigrams where the third word is not word3.
57              
58             Observed Frequencies:
59              
60             n111 = number of times word1, word2 and word3 occur together in
61             their respective positions, joint frequency.
62             n112 = number of times word1 and word2 occur in their respective
63             positions but word3 does not.
64             n121 = number of times word1 and word3 occur in their respective
65             positions but word2 does not.
66             n211 = number of times word2 and word3 occur in their respective
67             positions but word1 does not.
68             n122 = number of times word1 occurs in its respective position
69             but word2 and word3 do not.
70             n212 = number of times word2 occurs in its respective position
71             but word1 and word3 do not.
72             n221 = number of times word3 occurs in its respective position
73             but word1 and word2 do not.
74             n222 = number of times none of word1, word2 or word3 occur in their
75             respective positions.
76              
77             Expected Frequencies:
78              
79             m111 = expected number of times word1, word2 and word3 occur together in
80             their respective positions.
81             m112 = expected number of times word1 and word2 occur in their respective
82             positions but word3 does not.
83             m121 = expected number of times word1 and word3 occur in their respective
84             positions but word2 does not.
85             m211 = expected number of times word2 and word3 occur in their respective
86             positions but word1 does not.
87             m122 = expected number of times word1 occurs in its respective position
88             but word2 and word3 do not.
89             m212 = expected number of times word2 occurs in its respective position
90             but word1 and word3 do not.
91             m221 = expected number of times word3 occurs in its respective position
92             but word1 and word2 do not.
93             m222 = expected number of times none of word1, word2 or word3 occur in their
94             respective positions.
95              
96             =head2 Methods
97              
98             =over
99              
100             =cut
101              
102              
103             package Text::NSP::Measures::3D;
104              
105              
106 6     6   3381 use Text::NSP::Measures;
  6         13  
  6         783  
107 6     6   26 use strict;
  6         10  
  6         98  
108 6     6   25 use Carp;
  6         12  
  6         274  
109 6     6   27 use warnings;
  6         8  
  6         16159  
110             require Exporter;
111              
112             our ($VERSION, @ISA, $marginals, @EXPORT);
113              
114             our ($n111, $n112, $n121, $n122, $n211, $n212, $n221, $n222);
115             our ($m111, $m112, $m121, $m122, $m211, $m212, $m221, $m222);
116             our ($nppp, $n1pp, $np1p, $npp1, $n11p, $n1p1, $np11);
117             our ($n2pp, $np2p, $npp2);
118              
119              
120             @ISA = qw(Exporter);
121              
122             @EXPORT = qw(initializeStatistic calculateStatistic
123             getErrorCode getErrorMessage getStatisticName
124             $n111 $n112 $n121 $n122 $n211 $n212 $n221 $n222
125             $m111 $m112 $m121 $m122 $m211 $m212 $m221 $m222
126             $nppp $n1pp $np1p $npp1 $n11p $n1p1 $np11 $n2pp
127             $np2p $npp2 $errorCodeNumber $errorMessage);
128              
129             $VERSION = '0.97';
130              
131              
132             =item computeObservedValues($count_values) - A method to
133             compute observed values, and also to verify that the
134             computed observed values are correct, that is, they are
135             non-negative and do not exceed the marginal totals or the
136             total trigram count.
137              
138             INPUT PARAMS : $count_values .. Reference to a hash consisting
139             of the count values passed to
140             the calculateStatistic() method.
141              
142             RETURN VALUES : 1/undef ..returns '1' to indicate success
143             and an undefined(NULL) value to indicate
144             failure.
145              
146             =cut
147              
148             sub computeObservedValues
149             {
150 27     27 1 39 my ($values) = @_;
151              
152 27         34 $n111 = -1;
153 27 100       62 if(!defined $values->{n111})
154             {
155 4         9 $errorMessage = "Required trigram (1,1,1) not passed";
156 4         8 $errorCodeNumber = 200;
157 4         15 return;
158             }
159             else
160             {
161 23         39 $n111=$values->{n111};
162             }
163             # joint frequency should be greater than or equal to zero
164 23 100       51 if ($n111< 0)
165             {
166 4         22 $errorMessage = "Frequency value (n111=$n111) must not be negative.";
167 4         7 $errorCodeNumber = 201;
168 4         16 return;
169             }
170              
171             # n111 frequency should be less than or equal to totalBigrams
172 19 50       38 if ($n111> $nppp)
173             {
174 0         0 $errorMessage = "Frequency value (n111=$n111) must not exceed total number of bigrams.";
175 0         0 $errorCodeNumber = 202;
176 0         0 return;
177             }
178             # joint frequency n111 should be less than or equal to the marginal totals
179 19 50 66     108 if ($n111 > $n1pp || $n111 > $np1p || $n111 > $npp1)
      66        
180             {
181 8         30 $errorMessage = "Frequency value of ngram (n111=$n111) must not exceed the marginal totals.";
182 8         11 $errorCodeNumber = 202;
183 8         25 return;
184             }
185              
186              
187 11         20 $n112=$n11p-$n111;
188 11 50       41 if ($n112< 0)
189             {
190 0         0 $errorMessage = "Frequency value (n112=$n112) must not be negative.";
191 0         0 $errorCodeNumber = 201;
192 0         0 return;
193             }
194             # n112 frequency should be less than or equal to the total trigram count
195 11 50       32 if ($n112> $nppp)
196             {
197 0         0 $errorMessage = "Frequency value (n112=$n112) must not exceed total number of bigrams.";
198 0         0 $errorCodeNumber = 202;
199 0         0 return;
200             }
201             # frequency n112 should be less than or equal to the marginal totals
202 11 50 33     86 if ($n112 > $n1pp || $n112 > $np1p || $n112 > $npp2)
      33        
203             {
204 0         0 $errorMessage = "Frequency value of ngram (n112=$n112) must not exceed the marginal totals.";
205 0         0 $errorCodeNumber = 202;
206 0         0 return;
207             }
208              
209              
210 11         18 $n121=$n1p1-$n111;
211 11 50       41 if ($n121< 0)
212             {
213 0         0 $errorMessage = "Frequency value (n121=$n121) must not be negative.";
214 0         0 $errorCodeNumber = 201;
215 0         0 return;
216             }
217             # n121 frequency should be less than or equal to the total trigram count
218 11 50       31 if ($n121> $nppp)
219             {
220 0         0 $errorMessage = "Frequency value (n121=$n121) must not exceed total number of bigrams.";
221 0         0 $errorCodeNumber = 202;
222 0         0 return;
223             }
224             # frequency n121 should be less than or equal to the marginal totals
225 11 50 33     89 if ($n121 > $n1pp || $n121 > $np2p || $n121 > $npp1)
      33        
226             {
227 0         0 $errorMessage = "Frequency value of ngram (n121=$n121) must not exceed the marginal totals.";
228 0         0 $errorCodeNumber = 202;
229 0         0 return;
230             }
231              
232              
233 11         17 $n211=$np11-$n111;
234 11 50       28 if ($n211< 0)
235             {
236 0         0 $errorMessage = "Frequency value (n211=$n211) must not be negative.";
237 0         0 $errorCodeNumber = 201;
238 0         0 return;
239             }
240             # n211 frequency should be less than or equal to the total trigram count
241 11 50       32 if ($n211> $nppp)
242             {
243 0         0 $errorMessage = "Frequency value (n211=$n211) must not exceed total number of bigrams.";
244 0         0 $errorCodeNumber = 202;
245 0         0 return;
246             }
247             # frequency n211 should be less than or equal to the marginal totals
248 11 50 66     64 if ($n211 > $n2pp || $n211 > $np1p || $n211 > $npp1)
      66        
249             {
250 4         17 $errorMessage = "Frequency value of ngram (n211=$n211) must not exceed the marginal totals.";
251 4         6 $errorCodeNumber = 202;
252 4         14 return;
253             }
254              
255 7         14 $n212=$np1p-$n111-$n112-$n211;
256 7 50       23 if ($n212< 0)
257             {
258 0         0 $errorMessage = "Frequency value (n212=$n212) must not be negative.";
259 0         0 $errorCodeNumber = 201;
260 0         0 return;
261             }
262             # n212 frequency should be less than or equal to the total trigram count
263 7 50       18 if ($n212> $nppp)
264             {
265 0         0 $errorMessage = "Frequency value (n212=$n212) must not exceed total number of bigrams.";
266 0         0 $errorCodeNumber = 202;
267 0         0 return;
268             }
269             # frequency n212 should be less than or equal to the marginal totals
270 7 50 33     61 if ($n212 > $n2pp || $n212 > $np1p || $n212 > $npp2)
      33        
271             {
272 0         0 $errorMessage = "Frequency value of ngram (n212=$n212) must not exceed the marginal totals.";
273 0         0 $errorCodeNumber = 202;
274 0         0 return;
275             }
276              
277              
278 7         16 $n122=$n1pp-$n111-$n112-$n121;
279 7 50       23 if ($n122< 0)
280             {
281 0         0 $errorMessage = "Frequency value (n122=$n122) must not be negative.";
282 0         0 $errorCodeNumber = 201;
283 0         0 return;
284             }
285             # n122 frequency should be less than or equal to the total trigram count
286 7 50       26 if ($n122> $nppp)
287             {
288 0         0 $errorMessage = "Frequency value (n122=$n122) must not exceed total number of bigrams.";
289 0         0 $errorCodeNumber = 202;
290 0         0 return;
291             }
292             # frequency n122 should be less than or equal to the marginal totals
293 7 50 33     83 if ($n122 > $n1pp || $n122 > $np2p || $n122 > $npp2)
      33        
294             {
295 0         0 $errorMessage = "Frequency value of ngram (n122=$n122) must not exceed the marginal totals.";
296 0         0 $errorCodeNumber = 202;
297 0         0 return;
298             }
299              
300              
301 7         17 $n221=$npp1-$n111-$n211-$n121;
302 7 50       36 if ($n221< 0)
303             {
304 0         0 $errorMessage = "Frequency value (n221=$n221) must not be negative.";
305 0         0 $errorCodeNumber = 201;
306 0         0 return;
307             }
308             # n221 frequency should be less than or equal to the total trigram count
309 7 50       26 if ($n221> $nppp)
310             {
311 0         0 $errorMessage = "Frequency value (n221=$n221) must not exceed total number of bigrams.";
312 0         0 $errorCodeNumber = 202;
313 0         0 return;
314             }
315             # frequency n221 should be less than or equal to the marginal totals
316 7 50 33     87 if ($n221 > $n2pp || $n221 > $np2p || $n221 > $npp1)
      33        
317             {
318 0         0 $errorMessage = "Frequency value of ngram (n221=$n221) must not exceed the marginal totals.";
319 0         0 $errorCodeNumber = 202;
320 0         0 return;
321             }
322              
323              
324 7         22 $n222=$nppp-($n111+$n112+$n121+$n122+$n211+$n212+$n221);
325 7 50       20 if ($n222< 0)
326             {
327 0         0 $errorMessage = "Frequency value (n222=$n222) must not be negative.";
328 0         0 $errorCodeNumber = 201;
329 0         0 return;
330             }
331             # n222 frequency should be less than or equal to the total trigram count
332 7 50       20 if ($n222> $nppp)
333             {
334 0         0 $errorMessage = "Frequency value (n222=$n222) must not exceed total number of bigrams.";
335 0         0 $errorCodeNumber = 202;
336 0         0 return;
337             }
338             # frequency n222 should be less than or equal to the marginal totals
339 7 50 33     79 if ($n222 > $n2pp || $n222 > $np2p || $n222 > $npp2)
      33        
340             {
341 0         0 $errorMessage = "Frequency value of ngram (n222=$n222) must not exceed the marginal totals.";
342 0         0 $errorCodeNumber = 202;
343 0         0 return;
344             }
345              
346 7         32 return 1;
347             }
348              
349              
350              
351              
352              
353             =item computeExpectedValues($count_values) - A method to compute
354             expected values.
355              
356             INPUT PARAMS : $count_values .. Reference to a hash consisting
357             of the count output.
358              
359             RETURN VALUES : 1/undef ..returns '1' to indicate success
360             and an undefined(NULL) value to indicate
361             failure.
362              
363             =cut
364              
365             sub computeExpectedValues
366             {
367 7     7 1 16 my ($values)=@_;
368              
369 7         29 $m111=$n1pp*$np1p*$npp1/($nppp**2);
370 7         19 $m112=$n1pp*$np1p*$npp2/($nppp**2);
371 7         17 $m121=$n1pp*$np2p*$npp1/($nppp**2);
372 7         24 $m122=$n1pp*$np2p*$npp2/($nppp**2);
373 7         18 $m211=$n2pp*$np1p*$npp1/($nppp**2);
374 7         18 $m212=$n2pp*$np1p*$npp2/($nppp**2);
375 7         22 $m221=$n2pp*$np2p*$npp1/($nppp**2);
376 7         17 $m222=$n2pp*$np2p*$npp2/($nppp**2);
377              
378 7         28 return 1;
379             }
380              
381              
382              
383              
384              
385              
386             =item computeMarginalTotals($marginal_values) - This method
387             computes the marginal totals from the values that are computed by the
388             count.pl program and passed to the calculateStatistic() method.
389              
390             INPUT PARAMS : $count_values .. Reference to a hash consisting
391             of the frequency combination
392             output.
393              
394             RETURN VALUES : 1/undef ..returns '1' to indicate success
395             and an undefined(NULL) value to indicate
396             failure.
397              
398             =cut
399              
400             sub computeMarginalTotals
401             {
402              
403 67     67 1 85 my ($values)=@_;
404              
405 67         84 $nppp = -1;
406 67 100       206 if(!defined $values->{nppp})
    100          
407             {
408 4         10 $errorMessage = "Total trigram count not passed";
409 4         6 $errorCodeNumber = 200;
410 4         14 return;
411             }
412             elsif($values->{nppp}<=0)
413             {
414 4         8 $errorMessage = "Total trigram count cannot be less than to zero";
415 4         7 $errorCodeNumber = 200;
416 4         13 return;
417             }
418             else
419             {
420 59         80 $nppp = $values->{nppp};
421             }
422              
423              
424 59         62 $n1pp = -1;
425 59 100       101 if(!defined $values->{n1pp})
426             {
427 4         8 $errorMessage = "Required marginal total (1,p,p) not passed";
428 4         13 $errorCodeNumber = 200;
429 4         21 return;
430             }
431             else
432             {
433 55         78 $n1pp=$values->{n1pp};
434             }
435             # n1pp should be greater than or equal to zero
436 55 100       119 if ($n1pp< 0)
437             {
438 4         17 $errorMessage = "Marginal total value ($n1pp) must not be negative.";
439 4         7 $errorCodeNumber = 204; return;
  4         14  
440             }
441             # n1pp should be less than or equal to totalBigrams
442 51 50       91 if ($n1pp > $nppp)
443             {
444 0         0 $errorMessage = "Marginal total value ($n1pp) must not exceed total number of bigrams.";
445 0         0 $errorCodeNumber = 203; return;
  0         0  
446             }
447              
448              
449              
450 51         57 $np1p = -1;
451 51 100       90 if(!defined $values->{np1p})
452             {
453 4         8 $errorMessage = "Required marginal total (p,1,p) not passed";
454 4         6 $errorCodeNumber = 200;
455 4         16 return;
456             }
457             else
458             {
459 47         61 $np1p=$values->{np1p};
460             }
461             # np1p should be greater than or equal to zero
462 47 50       106 if ($np1p< 0)
463             {
464 0         0 $errorMessage = "Marginal total value ($np1p) must not be negative.";
465 0         0 $errorCodeNumber = 204; return;
  0         0  
466             }
467             # np1p should be less than or equal to totalBigrams
468 47 100       87 if ($np1p > $nppp)
469             {
470 4         16 $errorMessage = "Marginal total value ($np1p) must not exceed total number of trigrams.";
471 4         5 $errorCodeNumber = 203; return;
  4         15  
472             }
473              
474              
475 43         51 $npp1 = -1;
476 43 100       84 if(!defined $values->{npp1})
477             {
478 4         11 $errorMessage = "Required marginal total (p,p,1) not passed";
479 4         6 $errorCodeNumber = 200;
480 4         20 return;
481             }
482             else
483             {
484 39         50 $npp1=$values->{npp1};
485             }
486             # npp1 should be greater than or equal to zero
487 39 50       79 if ($npp1< 0)
488             {
489 0         0 $errorMessage = "Marginal total value ($npp1) must not be negative.";
490 0         0 $errorCodeNumber = 204; return;
  0         0  
491             }
492             # npp1 should be less than or equal to totalBigrams
493 39 50       84 if ($npp1 > $nppp)
494             {
495 0         0 $errorMessage = "Marginal total value ($npp1) must not exceed total number of bigrams.";
496 0         0 $errorCodeNumber = 203; return;
  0         0  
497             }
498              
499 39         42 $n11p = -1;
500 39 100       73 if(!defined $values->{n11p})
501             {
502 4         10 $errorMessage = "Required marginal total (1,1,p) not passed";
503 4         7 $errorCodeNumber = 200;
504 4         14 return;
505             }
506             else
507             {
508 35         45 $n11p=$values->{n11p};
509             }
510             # n11p should be greater than or equal to zero
511 35 50       67 if ($n11p< 0)
512             {
513 0         0 $errorMessage = "Marginal total value ($n11p) must not be negative.";
514 0         0 $errorCodeNumber = 204; return;
  0         0  
515             }
516             # n11p should be less than or equal to totalBigrams
517 35 50       66 if ($n11p > $nppp)
518             {
519 0         0 $errorMessage = "Marginal total value ($n11p) must not exceed total number of bigrams.";
520 0         0 $errorCodeNumber = 203; return;
  0         0  
521             }
522              
523 35         39 $np11=-1;
524 35 100       82 if(!defined $values->{np11})
525             {
526 4         8 $errorMessage = "Required marginal total (p,1,1) not passed";
527 4         13 $errorCodeNumber = 200;
528 4         15 return;
529             }
530             else
531             {
532 31         46 $np11=$values->{np11};
533             }
534             # np11 should be greater than or equal to zero
535 31 50       70 if ($np11< 0)
536             {
537 0         0 $errorMessage = "Marginal total value ($np11) must not be negative.";
538 0         0 $errorCodeNumber = 204; return;
  0         0  
539             }
540             # np11 should be less than or equal to totalBigrams
541 31 50       58 if ($np11 > $nppp)
542             {
543 0         0 $errorMessage = "Marginal total value ($np11) must not exceed total number of trigrams.";
544 0         0 $errorCodeNumber = 203; return;
  0         0  
545             }
546              
547 31         33 $n1p1=-1;
548 31 100       67 if(!defined $values->{n1p1})
549             {
550 4         10 $errorMessage = "Required marginal total (1,p,1) not passed";
551 4         7 $errorCodeNumber = 200;
552 4         16 return;
553             }
554             else
555             {
556 27         40 $n1p1=$values->{n1p1};
557             }
558             # n1p1 should be greater than or equal to zero
559 27 50       48 if ($n1p1< 0)
560             {
561 0         0 $errorMessage = "Marginal total value ($n1p1) must not be negative.";
562 0         0 $errorCodeNumber = 204; return;
  0         0  
563             }
564             # n1p1 should be less than or equal to totalBigrams
565 27 50       55 if ($n1p1 > $nppp)
566             {
567 0         0 $errorMessage = "Marginal total value ($n1p1) must not exceed total number of bigrams.";
568 0         0 $errorCodeNumber = 203; return;
  0         0  
569             }
570              
571 27         45 $n2pp=$values->{nppp}-$n1pp;
572 27         37 $np2p=$values->{nppp}-$np1p;
573 27         38 $npp2=$values->{nppp}-$npp1;
574              
575 27         98 return 1;
576             }
577              
578             1;
579             __END__