File Coverage

blib/lib/Statistics/Smoothing/SGT.pm
Criterion Covered Total %
statement 16 114 14.0
branch 1 10 10.0
condition 0 6 0.0
subroutine 4 12 33.3
pod 0 9 0.0
total 21 151 13.9


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Statistics::Smoothing::SGT - A Simple Good-Turing (SGT) smoothing implementation
4              
5             =head1 SYNOPSIS
6              
7             =head2 Basic Usage
8              
9             use Statistics::Smoothing::SGT
10             my $sgt = new Statistics::Smoothing::SGT($frequencyClasses, $total);
11             $sgt->calculateValues();
12             $probabilities = $sgt->getProbabilities();
13             $newFrequencies = $sgt->getNewFrequencies();
14             $nBar = $sgt->getNBar();
15              
16             =head1 AUTHORS
17              
18             Florian Doemges, florian@doemges.net
19              
20             Bjoern Wilmsmann, bjoern@wilmsmann.de
21              
22             =head1 COPYRIGHT
23              
24             Copyright (C) 2006, Florian Doemges and Bjoern Wilmsmann,
25             Department of Linguistics, Ruhr-University, Bochum
26              
27             Partially based on the SGT module (Copyright (C) 2004) by Andre Halama
28             (halama@linguistics.rub.de) and Tibor Kiss (tibor@linguistics.rub.de),
29             Department of Linguistics, Ruhr-University, Bochum.
30              
31             This module in turn was based on the work (including an implementation
32             of the algorithm in C) by
33             Geoffrey Sampson, Department of Informatics, University of Sussex.
34              
35             This program is free software; you can redistribute it and/or modify
36             it under the terms of the GNU General Public License as published by
37             the Free Software Foundation; either version 2 of the License, or (at
38             your option) any later version.
39              
40             This program is distributed in the hope that it will be useful, but
41             WITHOUT ANY WARRANTY; without even the implied warranty of
42             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
43             General Public License for more details.
44              
45             You should have received a copy of the GNU General Public License
46             along with this program; if not, write to
47              
48             The Free Software Foundation, Inc.,
49             59 Temple Place - Suite 330,
50             Boston, MA 02111-1307, USA.
51              
52             Note: a copy of the GNU General Public License is available on the web
53             at L and is included in this
54             distribution under the name GPL.
55              
56             =head1 BUGS
57              
58             =head1 SEE ALSO
59              
60             =head1 DESCRIPTION
61              
62             This Perl module implements the Simple Good Turing (SGT) algorithm
63             for smoothing of probabilistic values developed by William Gale and
64             Geoffrey Sampson.
65              
66             The algorithm is described in detail in Sampson's Empirical Linguistics
67             (Continuum International, London and New York, 2001), chapter 7.
68             An online version of this paper is available at Geoffrey Sampson's
69             homepage under L.
70              
71             =head2 Error Codes
72              
73             =head2 Methods
74              
75             =over
76              
77             =cut
78              
79              
80             package Statistics::Smoothing::SGT;
81              
82             # use strict, as we do not our variables to go haywire
83 1     1   302199 use strict;
  1         3  
  1         72  
84              
85             # use these for debugging purposes
86 1     1   6 use warnings;
  1         3  
  1         36  
87 1     1   1246 use Data::Dumper;
  1         13863  
  1         1514  
88              
89             our ($VERSION);
90              
91             $VERSION = '2.1.2';
92              
93             # constructor
94             sub new {
95 1     1 0 24 my ($class, $frequencyClasses, $ngramTotal) = @_;
96 1         3 my $rowsFound = scalar(keys(%{$frequencyClasses}));
  1         5  
97 1 50       4 unless ($rowsFound >=5) {
98 0         0 die("At least 5 m/V(m) pairs must be provided for SGT.\nThe hash contains only $rowsFound rows.\n");
99             }
100 1         5 my $self = {
101             frequencyClasses => $frequencyClasses,
102             ngramTotal => $ngramTotal
103             };
104 1         4 bless($self, $class);
105 1         3 return $self;
106             }
107              
108             # method for calculating all Z(m)
109             sub calculateZ() {
110 0     0 0   my ($self) = @_;
111 0           my @zValues;
112             my @equivalenceClasses;
113 0           my @cardinalities;
114            
115             # iterate over frequencies for conversion into ascendingly ordered list
116 0           foreach my $frequency (sort {$a <=> $b} keys(%{$self->{frequencyClasses}})) {
  0            
  0            
117             # push frequency to array of equivalence classes
118 0           push(@equivalenceClasses, $frequency);
119              
120             # push cardinality of this frequency class to array of cardinalities at corresponding
121             # position
122 0           push(@cardinalities, $self->{frequencyClasses}->{$frequency});
123             }
124            
125             # save arrays for processing in other functions
126 0           $self->{equivalenceClasses} = \@equivalenceClasses;
127 0           $self->{cardinalities} = \@cardinalities;
128            
129             # calculate z value for m = 1
130 0           push(@zValues, 2 * $self->{cardinalities}->[0] / ($self->{equivalenceClasses}->[1]));
131            
132             # iterate over equivalence classes and their respective cardinalities
133             # for calculating z values for all m > 1 and m < max
134 0           for (my $i = 1; $i < @{$self->{equivalenceClasses}} - 1; $i++) {
  0            
135 0           push(@zValues, 2 * $self->{cardinalities}->[$i] / ($self->{equivalenceClasses}->[$i + 1] - $self->{equivalenceClasses}->[$i - 1]));
136             }
137              
138             # calculate z value for m = max
139 0           push(@zValues, 2 * $self->{cardinalities}->[@{$self->{cardinalities}} - 1] / ($self->{equivalenceClasses}->[@{$self->{equivalenceClasses}} - 1] - $self->{equivalenceClasses}->[@{$self->{equivalenceClasses}} - 2]));
  0            
  0            
  0            
140              
141             # return
142 0           return \@zValues;
143             }
144              
145             # method for getting first gap between frequencies
146             sub getGap {
147 0     0 0   my ($self) = @_;
148 0           my $gapAt;
149 0           my $buffer = 0;
150              
151             # iterate over equivalence classes
152 0           for (my $i = 0; $i <= @{$self->{equivalenceClasses}}; $i++) {
  0            
153             # check if gap between current value and previous
154             # one is bigger than 1
155 0 0 0       if (!defined $self->{equivalenceClasses}->[$i] || $buffer + 1 < $self->{equivalenceClasses}->[$i]) {
156             # if so, we have found our gap and can skip the
157             # remaining iterations
158 0           $gapAt = $buffer;
159 0           last;
160             }
161              
162             # write current value to buffer for checking next value
163 0           $buffer = $self->{equivalenceClasses}->[$i];
164             }
165            
166             # return gap
167 0           return $gapAt;
168             }
169              
170             # method for deducing slope and intersection of a logarithmised
171             # exponential function with its best fitting linear function
172             # (i.e. linear regression)
173             sub logBestFit {
174 0     0 0   my ($self, $Xaxis, $Yaxis) = @_;
175 0           my $XYs = 0;
176 0           my $Xsquares = 0;
177 0           my $meanX = 0;
178 0           my $meanY = 0;
179 0           my $rows = scalar @{$Xaxis};
  0            
180              
181             # calculate log mean values for x and y
182 0           for (my $i = 0; $i < $rows; $i++) {
183 0           $meanX += log($Xaxis->[$i]);
184 0           $meanY += log($Yaxis->[$i]);
185             }
186 0           $meanX /= $rows;
187 0           $meanY /= $rows;
188              
189             # calculate slope and intersection
190 0           for (my $i = 0; $i < $rows; $i++) {
191 0           $XYs += ((log($Xaxis->[$i]) - $meanX) * (log($Yaxis->[$i]) - $meanY));
192 0           $Xsquares += (log($Xaxis->[$i]) - $meanX) ** 2;
193             }
194 0           my $slope = $XYs / $Xsquares;
195 0           my $intersection = $meanY - $slope * $meanX;
196            
197             # return slope and intersection
198 0           return ($slope, $intersection);
199             }
200              
201             # public method for calculating SGT-smoothed values
202             sub calculateValues() {
203 0     0 0   my ($self) = @_;
204 0           my $gapAt;
205             my $slope;
206 0           my $intersection;
207 0           my %xValues;
208 0           my %yValues;
209 0           my $zValues;
210 0           my %mFound;
211 0           my %mStar;
212 0           my $newFrequencies;
213 0           my $m;
214 0           my $xFlag;
215 0           my $diff;
216 0           my $i;
217              
218             # get z values
219 0           $zValues = $self->calculateZ();
220              
221             # get first frequency gap
222 0           $gapAt = $self->getGap();
223              
224             # perform linear regression
225 0           ($slope, $intersection) = $self->logBestFit($self->{equivalenceClasses}, $zValues);
226            
227             # calculcate y function values
228 0           for ($i = 0; $i <= $self->{equivalenceClasses}->[@{$self->{equivalenceClasses}} - 1]; $i++) {
  0            
229 0           $yValues{$i} = ($i + 1) * exp($slope * (log($i + 2) - log($i + 1)));
230             }
231              
232             # calculate x value only, if last element has not yet
233             # been reached, x values for large classes do not matter
234             # anyway
235 0           for($i = 0; $i < @{$self->{equivalenceClasses}} - 2; $i++) {
  0            
236 0           $xValues{$self->{equivalenceClasses}->[$i]} = $self->{equivalenceClasses}->[$i + 1] * $self->{cardinalities}->[$i + 1] / $self->{cardinalities}->[$i];
237             }
238              
239             # build hash for all existing m (=equivalence class) and V(m) (= cardinality)
240 0           for ($i = 0; $i < @{$self->{equivalenceClasses}}; $i++){
  0            
241 0           $mFound{$self->{equivalenceClasses}->[$i]} = $self->{cardinalities}->[$i];
242             }
243              
244             # iterate over equivalence classes in order to determine
245             # which function to use
246 0           for ($i = 1; $i <= $self->{equivalenceClasses}->[@{$self->{equivalenceClasses}} - 1]; $i++) {
  0            
247             # check, if m (=equivalence class) and V(m) (= cardinality) exist for this index
248 0 0         unless ($mFound{$i}) {
249 0           next;
250             }
251            
252             # if y function has not been used yet and first gap between frequencies
253             # has not yet been reached
254 0 0 0       if (!$xFlag && $i < $gapAt) {
255             # calculate distance value between x and y function for which y
256             # function is to be used
257 0           $diff = (sqrt (($self->{equivalenceClasses}->[$i + 1] ** 2) *
258             ($self->{cardinalities}->[$i + 1] / $self->{cardinalities}->[$i] ** 2) *
259             (1 + $self->{cardinalities}->[$i + 1] / $self->{cardinalities}->[$i]))) * 1.65;
260              
261             # if difference between x and y value is smaller than the difference
262             # value calculated above
263 0 0         if (abs($xValues{$i} - $yValues{$i}) < $diff) {
264             # use y function from now on
265 0           $mStar{$i} = $yValues{$i};
266 0           $xFlag = 1;
267             } else {
268             # use x function
269 0           $mStar{$i} = $xValues{$i};
270             }
271             } else {
272             # use y function from now on
273 0           $mStar{$i} = $yValues{$i};
274 0           $xFlag = 1;
275             }
276            
277             # calculate new total (i.e. nBar);
278 0           $self->{nBar} += $mStar{$i} * $mFound{$i};
279             }
280              
281             # save new frequencies
282 0           $self->{newFrequencies} = \%mStar;
283              
284             # calculate the probability for unseen events (i.e. pZero)
285 0           $self->{probabilities}->{0} = $self->{cardinalities}->[0] / $self->{ngramTotal};
286              
287             # calculate all other probabilities
288 0           foreach $m (keys(%mStar)) {
289 0           $self->{probabilities}->{$m} = (1 - $self->{probabilities}->{0}) * ($mStar{$m} / $self->{nBar});
290             }
291             }
292              
293             # public method for getting new frequencies
294             sub getNewFrequencies() {
295 0     0 0   my ($self) = @_;
296              
297             # return equivalence classes
298 0           return $self->{newFrequencies};
299             }
300              
301             # public method for getting equivalence classes
302             sub getEquivalenceClasses() {
303 0     0 0   my ($self) = @_;
304              
305             # return equivalence classes
306 0           return $self->{equivalenceClasses};
307             }
308              
309             # public method for getting probabilities
310             sub getProbabilities() {
311 0     0 0   my ($self) = @_;
312              
313             # return probabilities
314 0           return $self->{probabilities};
315             }
316              
317             # public method for getting nBar
318             sub getNBar() {
319 0     0 0   my ($self) = @_;
320              
321             # return nBar
322 0           return $self->{nBar};
323             }
324              
325             1;
326              
327             __END__