File Coverage

blib/lib/Text/NSP/Measures/2D/MI/pmi.pm
Criterion Covered Total %
statement 20 22 90.9
branch 2 2 100.0
condition n/a
subroutine 6 8 75.0
pod n/a
total 28 32 87.5


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Text::NSP::Measures::2D::MI::pmi - Perl module that implements Pointwise
4             Mutual Information.
5              
6             =head1 SYNOPSIS
7              
8             =head3 Basic Usage
9              
10             use Text::NSP::Measures::2D::MI::pmi;
11              
12             my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10;
13              
14             $pmi_value = calculateStatistic( n11=>$n11,
15             n1p=>$n1p,
16             np1=>$np1,
17             npp=>$npp);
18              
19             if( ($errorCode = getErrorCode()))
20             {
21             print STDERR $errorCode." - ".getErrorMessage()."\n";
22             }
23             else
24             {
25             print getStatisticName()." value for bigram is ".$pmi_value."\n";
26             }
27              
28             =head1 DESCRIPTION
29              
30             Assume that the frequency count data associated with a bigram
31             is stored in a 2x2 contingency table:
32              
33                       word2   ~word2
34               word1    n11      n12 | n1p
35              ~word1    n21      n22 | n2p
36                        --------------
37                        np1      np2   npp
38              
39             where n11 is the number of times word1 and word2 occur together, and
40             n12 is the number of times word1 occurs with some word other than
41             word2, and n1p is the number of times in total that word1 occurs as
42             the first word in a bigram.
43              
44             The expected values for the internal cells are calculated by taking the
45             product of their associated marginals and dividing by the sample size,
46             for example:
47              
48                    np1 * n1p
49             m11 =  ---------
50                       npp
51              
52             Pointwise Mutual Information (pmi) is defined as the log of the ratio
53             of the observed frequency of a bigram (n11) to the frequency expected
54             for that bigram if its two words were independent (m11).
55              
56             PMI = log (n11/m11)
57              
58             The Pointwise Mutual Information tends to overestimate bigrams with low
59             observed frequency counts. To prevent this sometimes a variation of pmi
60             is used which increases the influence of the observed frequency.
61              
62             PMI = log((n11^$exp)/m11)
63              
64             The $exp is 1 by default, so by default the measure will compute the
65             Pointwise Mutual Information for the given bigram. To use a variation of
66             the measure, users can pass the $exp parameter using the --pmi_exp
67             command line option in statistic.pl or by passing the $exp to the
68             initializeStatistic() method from their program.
69              
70             The usage for statistic.pl is
71              
72             statistic.pl pmi out_pmi.stt out.cnt - for Point Wise Mutual Information
73             $exp is 1 in this case.
74              
75             statistic.pl --pmi_exp 2 pmi out_pmi2.stt out.cnt - for the variant with
76             $exp set to 2.
77              
78             =head2 Methods
79              
80             =over
81              
82             =cut
83              
84              
85             package Text::NSP::Measures::2D::MI::pmi;
86              
87              
88 1     1   2008 use Text::NSP::Measures::2D::MI;
  1         3  
  1         220  
89 1     1   6 use strict;
  1         1  
  1         26  
90 1     1   5 use Carp;
  1         2  
  1         52  
91 1     1   39 use warnings;
  1         2  
  1         36  
92 1     1   5 no warnings 'redefine';
  1         2  
  1         381  
require Exporter;

# Package globals, each declared and initialised in one place.
#   $VERSION - module version string.
#   @ISA     - inherit import() from Exporter.
#   @EXPORT  - names exported to callers by default.
#   $exp     - exponent applied to n11 by calculateStatistic(); 1 gives
#              plain Pointwise Mutual Information.
our $VERSION = '0.97';

our @ISA = qw(Exporter);

our @EXPORT = qw(
    initializeStatistic
    calculateStatistic
    getErrorCode
    getErrorMessage
    getStatisticName
);

our $exp = 1;
105              
106              
107             =item initializeStatistic() -Initialization of the pmi_exp parameter if required
108              
109             INPUT PARAMS : $exp .. exponent applied to n11 (see DESCRIPTION)
110              
111             RETURN VALUES : none
112              
113             =cut
114              
# initializeStatistic($exp) - set the exponent that calculateStatistic()
# applies to n11 before the PMI computation.
#
# INPUT PARAMS  : $exp .. the exponent to use; when it is omitted or
#                 undefined the exponent is reset to 1 (plain PMI).
# RETURN VALUES : none documented; the sub evaluates to the stored
#                 exponent, as the original bare assignment did.
sub initializeStatistic
{
    my ($requested) = @_;

    # An undefined exponent would make $n11**$exp evaluate as n11**0 (with an
    # "uninitialized" warning), silently discarding the observed count, so
    # fall back to the default of 1 instead of storing undef.
    $exp = defined $requested ? $requested : 1;
}
119              
120              
121              
122             =item calculateStatistic() - This method calculates the pmi value
123              
124             INPUT PARAMS : %values .. The count values for the bigram (as
125             computed by the count.pl program), passed as
126             key => value pairs: n11, n1p, np1, npp.
127              
128             RETURN VALUES : $pmi .. PMI value for this bigram.
129              
130             =cut
131              
# calculateStatistic(%values) - compute the PMI score for one bigram.
#
# INPUT PARAMS  : the bigram counts as key => value pairs (the SYNOPSIS
#                 passes n11, n1p, np1 and npp). A single hash reference
#                 holding the same pairs is accepted as well.
# RETURN VALUES : the result of computePMI() divided by log(2); nothing
#                 (bare return) when getValues() reports a failure.
sub calculateStatistic
{
    # Accept either a flat key => value list or one hash reference, so that
    # both documented calling conventions work.
    my %values = ( @_ == 1 && ref $_[0] eq 'HASH' ) ? %{ $_[0] } : @_;

    # getValues() computes the observed and expected values from the
    # frequency counts and returns false on an error or inconsistent counts.
    # NOTE(review): it presumably populates the imported $n11 / $m11 used
    # below -- confirm against Text::NSP::Measures::2D::MI.
    return unless Text::NSP::Measures::2D::MI::getValues(\%values);

    # Guard against an undefined exponent, which would otherwise be treated
    # as 0 and turn n11**$exp into 1.
    my $exponent = defined $exp ? $exp : 1;

    my $pmi = Text::NSP::Measures::2D::MI::computePMI( $n11**$exponent, $m11 );

    # Dividing by log(2) rescales the score to base 2, assuming computePMI()
    # returns a natural logarithm -- TODO confirm.
    return $pmi / log(2);
}
148              
149              
150              
151             =item getStatisticName() - Returns the name of this statistic
152              
153             INPUT PARAMS : none
154              
155             RETURN VALUES : $name .. Name of the measure.
156              
157             =cut
158              
# getStatisticName() - report the human-readable name of this measure.
# Takes no arguments and always returns the same string.
sub getStatisticName
{
    my $name = "Pointwise Mutual Information";
    return $name;
}
163              
164              
165              
166             1;
167             __END__