File Coverage

blib/lib/Statistics/MaxEntropy.pm
Criterion Covered Total %
statement 472 589 80.1
branch 113 184 61.4
condition 25 42 59.5
subroutine 40 48 83.3
pod 8 40 20.0
total 658 903 72.8


line stmt bran cond sub pod time code
1             package Statistics::MaxEntropy;
2              
3             ##---------------------------------------------------------------------------##
4             ## Author:
5             ## Hugo WL ter Doest terdoest@cs.utwente.nl
6             ## Description:
7             ## Object-oriented implementation of
8             ## Generalised Iterative Scaling algorithm,
9             ## Improved Iterative Scaling algorithm, and
10             ## Feature Induction algorithm
11             ## for inducing maximum entropy probability distributions
12             ## Keywords:
13             ## Maximum Entropy Modeling
14             ## Kullback-Leibler Divergence
15             ## Exponential models
16             ##
17             ##---------------------------------------------------------------------------##
18             ## Copyright (C) 1998, 1999 Hugo WL ter Doest terdoest@cs.utwente.nl
19             ##
20             ## This library is free software; you can redistribute it and/or modify
21             ## it under the terms of the GNU General Public License as published by
22             ## the Free Software Foundation; either version 2 of the License, or
23             ## (at your option) any later version.
24             ##
25             ## This library is distributed in the hope that it will be useful,
26             ## but WITHOUT ANY WARRANTY; without even the implied warranty of
27             ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28             ## GNU General Public License for more details.
29             ##
30             ## You should have received a copy of the GNU Library General Public
31             ## License along with this program; if not, write to the Free Software
32             ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33             ##---------------------------------------------------------------------------##
34              
35              
36             ##---------------------------------------------------------------------------##
37             ## Globals
38             ##---------------------------------------------------------------------------##
use vars qw($VERSION
            @ISA
            @EXPORT
            $VECTOR_PACKAGE

            $debug
            $SAMPLE_size
            $NEWTON_max_it
            $KL_max_it
            $KL_min
            $NEWTON_min
            $cntrl_c_pressed
            $cntrl_backslash_pressed
            );


##---------------------------------------------------------------------------##
## Require libraries
##---------------------------------------------------------------------------##
use strict;
use diagnostics -verbose;
use Statistics::SparseVector;
$VECTOR_PACKAGE = "Statistics::SparseVector";
use POSIX;
use Carp;
use Data::Dumper;
require Exporter;
require AutoLoader;

@ISA = qw(Exporter AutoLoader);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
@EXPORT = qw($KL_min
             $NEWTON_min
             $debug
             $nr_add
             $KL_max_it
             $NEWTON_max_it
             $SAMPLE_size
             );

$VERSION = '1.0';


# default values for some configurable parameters
$NEWTON_max_it = 20;      # max iterations of Newton's method
$NEWTON_min    = 0.001;   # convergence threshold for Newton's method
$KL_max_it     = 100;     # max iterations of iterative scaling
$KL_min        = 0.001;   # convergence threshold (Kullback-Leibler)
$debug         = 0;       # verbose debugging off by default
$SAMPLE_size   = 250;     # the size of MC samples
# binary or integer feature functions

# for catching interrupts; DESTROY writes an emergency dump when
# control-C was seen
$cntrl_c_pressed         = 0;
$cntrl_backslash_pressed = 0;
$SIG{INT}  = \&catch_cntrl_c;
$SIG{QUIT} = \&catch_cntrl_backslash;
98              
99              
# Returns true iff its argument looks like a decimal floating point
# number: optional sign, digits with optional fraction, optional exponent.
sub is_float {
    my ($candidate) = @_;

    return ($candidate =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/);
}
106              
107              
# Interrupt handler for control-C (SIGINT): records that the interrupt
# happened (so that DESTROY writes an emergency dump) and aborts.
# NOTE: the extracted source had the message garbled to " pressed\n";
# the "<ctrl-c>" token is restored here.
sub catch_cntrl_c {
    my($signame) = shift;

    $cntrl_c_pressed = 1;
    die "<ctrl-c> pressed\n";
}
115              
116              
# Interrupt handler for control-\ (SIGQUIT, originally a core-dump
# request): just records that it happened.
sub catch_cntrl_backslash {
    my $sig = shift;

    $cntrl_backslash_pressed = 1;
}
123              
124              
# Creates a new event space object; if $filename is given the events
# are read from that file, otherwise an empty space is created.
sub new {
    my ($this, $vectype, $filename) = @_;

    # allow both Class->new(...) and $object->new(...)
    my $class = ref($this) || $this;
    my $self  = bless {}, $class;

    # defaults; callers may override SCALER/SAMPLING afterwards
    $self->{SCALER}      = "gis";
    $self->{SAMPLING}    = "corpus";
    $self->{NR_CLASSES}  = 0;
    $self->{NR_EVENTS}   = 0;
    $self->{NR_FEATURES} = 0;
    $self->{VECTYPE}     = $vectype;

    $self->read($filename) if $filename;    # hey a filename

    $self->{FEATURE_IGNORE} = $VECTOR_PACKAGE->new($self->{NR_FEATURES});
    return $self;
}
146              
147              
# Decides how to sample the event space: "mc" draws a Monte Carlo
# sample, "enum" enumerates all events, "corpus" uses the corpus
# itself.  Returns the prepared sample object.
sub sample {
    my ($self) = @_;

    my $sample;
    if ($self->{SAMPLING} eq "mc") {
        # fresh object holding a Monte Carlo sample
        $sample = $self->new();
        $sample->{VECTYPE} = "binary";
        # refer to the parameters of $self
        $sample->{$_} = $self->{$_}
            for qw(SCALER NR_FEATURES PARAMETERS CORRECTION_PARAMETER E_REF);
        $sample->{NEED_CORRECTION_FEATURE} = 1;
        $sample->{THIS_IS_A_SAMPLE}        = 1;
        $sample->mc($self);
        $sample->{CLASSES_CHANGED} = 1;
        $sample->prepare_model();
    }
    elsif ($self->{SAMPLING} eq "enum") {
        $sample = $self->new();
        $sample->{SAMPLING} = "enum";
        $sample->{$_} = $self->{$_}
            for qw(SCALER NR_FEATURES PARAMETERS CORRECTION_PARAMETER E_REF);
        $sample->{NEED_CORRECTION_FEATURE} = 1;
        $sample->{THIS_IS_A_SAMPLE}        = 1;
        $sample->{M} = $self->{NR_FEATURES};
    }
    else {                  # "corpus": the corpus is its own sample
        $sample = $self;
    }
    $sample->prepare_sample();
    return $sample;
}
187              
188              
# Forces full recomputation on the next prepare_model() call and makes
# the parameters eligible for re-initialisation.
sub clear {
    my ($self) = @_;

    undef $self->{PARAMETERS_INITIALISED};
    $self->{PARAMETERS_CHANGED} = 1;
    $self->{CLASSES_CHANGED}    = 1;
}
197              
198              
199              
# Destructor: if the user interrupted with control-C, write an
# emergency dump of the event space before the object goes away.
sub DESTROY {
    my ($self) = @_;

    $self->dump() if $cntrl_c_pressed;
}
207              
208              
# Reads an events file, dies in case of inconsistent lines.
# Syntax of the first non-comment line: tab-separated feature names.
# Syntax of the other lines: a frequency, whitespace, then the feature
# vector (binary or integer, depending on $self->{VECTYPE}).
# NOTE: the extracted source had the <EVENTS> readline diamonds
# stripped; they are restored here with a lexical handle.
sub read {
    my ($self, $file) = @_;

    open(my $events, '<', $file) ||
        $self->die("Could not open $file\n");
    print "Opened $file\n";

    # read the names of the features, skip comment lines;
    # note that feature names are in reverse order now
    my $feature_names = "";
    do {
        $feature_names = <$events>;
    } until ($feature_names !~ /\#.*/);
    chomp $feature_names;
    $self->{FEATURE_NAMES} = [split(/\t/, $feature_names)];
    $self->{NR_FEATURES}   = scalar @{ $self->{FEATURE_NAMES} };

    # read the event vectors
    my $features;
    while (<$events>) {
        next if /\#.*/;         # skip comments
        chomp;
        if (/\s*(\d+)\s+(.+)/) {
            $self->{FREQ}[$self->{NR_CLASSES}] = $1;
            $features = $2;
        }
        # a class observed zero times cannot be part of the reference
        # distribution (the original compared the FREQ array reference
        # itself to 0, so this check never fired)
        if ($self->{FREQ}[$self->{NR_CLASSES}] == 0) {
            $self->die("Class $self->{NR_CLASSES} has zero probability\n");
        }
        $self->{NR_EVENTS} += $self->{FREQ}[$self->{NR_CLASSES}];
        $self->{CLASSES}[$self->{NR_CLASSES}] =
            $VECTOR_PACKAGE->new_vec($self->{NR_FEATURES},
                                     $features, $self->{VECTYPE});
        $self->{NR_CLASSES}++;
    }
    close($events);

    print "Read $self->{NR_EVENTS} events, $self->{NR_CLASSES} classes, " .
          "and $self->{NR_FEATURES} features\n";
    print "Closed $file\n";

    $self->{FILENAME}           = $file;
    $self->{CLASSES_CHANGED}    = 1;
    $self->{PARAMETERS_CHANGED} = 1;
}
264              
265              
# Reads an initial parameter distribution; syntax: one parameter per
# line (comment lines with '#' are skipped).  Dies if the number of
# parameters read differs from NR_FEATURES.
# NOTE: the extracted source had the <DISTR> readline diamond stripped;
# it is restored here with a lexical handle.
sub read_parameters {
    my ($self, $file) = @_;

    my $i = 0;
    open(my $distr, '<', $file) ||
        $self->die("Could not open $file\n");
    print "Opened $file\n";

    while (<$distr>) {
        next if /\#.*/;         # skip comments
        chomp;
        $self->{PARAMETERS}[$i++] = $_;
    }

    close($distr);
    if ($i != $self->{NR_FEATURES}) {
        $self->die("Initial distribution file corrupt\n");
    }
    print "Read $i parameters; closed $file\n";
    $self->{PARAMETERS_CHANGED} = 1;
}
292              
293              
# Writes the current parameters, one per line, in feature order;
# ignored features are written as the literal line "IGNORED".
# Uses a lexical handle and three-argument open (the original bareword
# two-argument open interpolated the filename into the mode).
sub write_parameters {
    my ($self, $file) = @_;

    open(my $distr, '>', $file) ||
        $self->die("Could not open $file\n");
    print "Opened $file\n";

    for (my $i = 0; $i < $self->{NR_FEATURES}; $i++) {
        if ($self->{FEATURE_IGNORE}->bit_test($i)) {
            print $distr "IGNORED\n";
        }
        else {
            print $distr "$self->{PARAMETERS}[$i]\n";
        }
    }

    close($distr);
    print "Closed $file\n";
}
317              
318              
# Writes the features with their parameters.
# Preamble: NR_FEATURES, the scaler, and for "gis" also M and the
# correction parameter.  Then one "<name>\t<parameter>" line per
# feature, and as the last line a bitmask marking the features that
# are NOT ignored (rightmost bit = feature 0).
# Uses a lexical handle and three-argument open (the original bareword
# two-argument open interpolated the filename into the mode).
sub write_parameters_with_names {
    my ($self, $file) = @_;

    open(my $distr, '>', $file) ||
        $self->die("Could not open $file\n");
    print "Opened $file\n";

    # preamble
    print $distr "$self->{NR_FEATURES}\n";
    print $distr "$self->{SCALER}\n";
    if ($self->{SCALER} eq "gis") {
        print $distr "$self->{M}\n";
        print $distr "$self->{CORRECTION_PARAMETER}\n";
    }

    # print feature names with parameters; in the meanwhile build the
    # bitmask of non-ignored features
    my $bitmask = "";
    for (my $x = 0; $x < $self->{NR_FEATURES}; $x++) {
        print $distr "$self->{FEATURE_NAMES}[$x]\t$self->{PARAMETERS}[$x]\n";
        $bitmask = ($self->{FEATURE_IGNORE}->bit_test($x) ? "0" : "1") . $bitmask;
    }
    print $distr "$bitmask\n";

    close($distr);
    print "Closed $file\n";
}
358              
359              
# Gives every feature a random parameter in [1,2); under "gis" the
# correction parameter gets a random value in [0,1).
sub random_parameters {
    my ($self) = @_;

    srand();
    foreach my $f (0 .. $self->{NR_FEATURES} - 1) {
        $self->{PARAMETERS}[$f] = rand() + 1;
    }
    $self->{CORRECTION_PARAMETER} = rand()
        if $self->{SCALER} eq "gis";
    $self->{PARAMETERS_CHANGED} = 1;
}
375              
376              
# Sets every feature parameter (and, under "gis", the correction
# parameter) to $val.
sub set_parameters_to {
    my ($self, $val) = @_;

    $self->{PARAMETERS}[$_] = $val for 0 .. $self->{NR_FEATURES} - 1;
    $self->{CORRECTION_PARAMETER} = $val
        if $self->{SCALER} eq "gis";
    $self->{PARAMETERS_CHANGED} = 1;
}
391              
392              
# Initialises parameters if !$self->{PARAMETERS_INITIALISED};
# subsequent calls of scale (by the feature induction loop) must not
# re-initialise them.
sub init_parameters {
    my ($self) = @_;

    return if $self->{PARAMETERS_INITIALISED};

    if ($self->{SAMPLING} eq "mc") {
        # random start: with all-zero parameters bits would be
        # flipped with probability 1 during sampling
        $self->random_parameters();
    }
    else {
        # NOTE: the original distinguished "gis" from other scalers
        # here, but both branches set the parameters to 0 — collapsed
        $self->set_parameters_to(0);
    }
    $self->{PARAMETERS_INITIALISED} = 1;
}
414              
415              
# Sanity check: \tilde{p} must be dominated by the model; a class with
# exponential weight 0 would make the divergence undefined, so every
# such class is reported.
sub check {
    my ($self) = @_;

    foreach my $x (0 .. $self->{NR_CLASSES} - 1) {
        next unless $self->{CLASS_EXP_WEIGHTS}[$x] == 0;
        print "Initial distribution not ok; class $x\n";
        print $self->{CLASS_EXP_WEIGHTS}[$x], "\t",
              $self->{CLASSES}[$x]->to_Bin(' '), "\n";
    }
}
432              
433              
# Writes the events to a file (useful in case new features have been
# added); same syntax as the input events file.
# Uses a lexical handle and three-argument open (the original bareword
# two-argument open interpolated the filename into the mode).
sub write {
    my ($self, $file) = @_;

    open(my $events, '>', $file) ||
        $self->die("Could not open $file\n");
    print "Opened $file\n";

    # a line with the feature names
    print $events join("\t", @{ $self->{FEATURE_NAMES} }), "\n";
    # the events themselves
    for (my $x = 0; $x < $self->{NR_CLASSES}; $x++) {
        print $events $self->{FREQ}[$x], "\t";
        print $events $self->{CLASSES}[$x]->to_Bin(' '), "\n";
    }

    close($events);
    print "Wrote $self->{NR_EVENTS} events, $self->{NR_CLASSES} classes, " .
          "and $self->{NR_FEATURES} features\n";
    print "Closed $file\n";
}
461              
462              
# Reads a Data::Dumper dump and evaluates it back into an object.
# Class method; returns the restored object ($VAR1 of the dump).
# NOTE: the extracted source had the <UNDUMP> readline diamond
# stripped; it is restored here.  The input record separator is now
# localised instead of being globally clobbered and reset to "\n".
sub undump {
    my ($class, $file) = @_;

    my $VAR1;

    # open, slurp, and close file
    open(my $undump, '<', $file) ||
        croak "Could not open $file\n";
    print "Opened $file\n";
    my $x = do { local $/; <$undump> };   # slurp; $/ restored automatically
    close($undump);

    # and undump: the dump assigns to $VAR1, which the string eval
    # shares with this lexical scope
    eval $x;
    print "Undumped $VAR1->{NR_EVENTS} events, $VAR1->{NR_CLASSES} classes, " .
          "and $VAR1->{NR_FEATURES} features\n";
    print "Closed $file\n";
    return($VAR1);
}
486              
487              
# Dumps the event space with Data::Dumper so it can be restored with
# undump(); if no file is given a fresh temporary file is used.
# For the corpus (not a sample) with parameters present, the features
# are also recorded sorted by decreasing parameter (FEATURE_SORTED).
sub dump {
    my ($self, $file) = @_;

    if (!$file) {
        # POSIX::tmpnam() was removed from POSIX in perl 5.26;
        # File::Temp (core) is the supported replacement
        require File::Temp;
        (undef, $file) = File::Temp::tempfile(UNLINK => 0);
    }
    open(my $dump_fh, '>', $file) ||
        croak "Could not open $file\n";
    print "Opened $file\n";

    # build something that we can sort — ONLY FOR CORPUS!
    if (!$self->{THIS_IS_A_SAMPLE} && $self->{PARAMETERS}) {
        my %features;
        for (my $f = 0; $f < $self->{NR_FEATURES}; $f++) {
            $features{ $self->{FEATURE_NAMES}[$f] } = $self->{PARAMETERS}[$f];
        }
        if ($self->{NEED_CORRECTION_FEATURE} && ($self->{SCALER} eq "gis")) {
            $features{"correction$self->{M}"} = $self->{CORRECTION_PARAMETER};
        }
        # record the ordering inside $self: descending parameter value,
        # ties broken by reverse name order
        $self->{FEATURE_SORTED} =
            join(' > ',
                 sort {
                     if ($features{$b} == $features{$a}) {
                         return($b cmp $a);
                     }
                     else {
                         return($features{$b} <=> $features{$a});
                     }
                 } keys(%features));
    }

    my $dumper = Data::Dumper->new([$self]);
    print $dump_fh $dumper->Dump();
    print "Dumped $self->{NR_EVENTS} events, $self->{NR_CLASSES} classes, " .
          "and $self->{NR_FEATURES} features\n";

    close($dump_fh);
    print "Closed $file\n";
}
535              
536              
# Logs $message and the current time, writes an emergency dump, and
# dies (via croak) with $message.
sub die {
    my ($self, $message) = @_;

    $self->log_msg($_) for $message, time();
    $self->dump();
    croak $message;
}
547              
548              
# Prints $text to STDOUT and appends it to $self->{LOG}, so an
# emergency dump will contain some history information.
sub log_msg {
    my ($self, $text) = @_;

    $self->{LOG} .= $text;
    print $text;
}
557              
558              
# Recomputes per-class feature statistics after the classes changed:
# flags constant features (always off or always on) for ignoring,
# determines M (the maximum number of active features over all
# classes) and NEED_CORRECTION_FEATURE, fills in the "gis" correction
# feature per class, and refreshes the observed expectations when this
# object is the corpus rather than a sample.
sub active_features {
    my ($self) = @_;

    return unless $self->{CLASSES_CHANGED};

    # constant feature functions are forbidden: check that for every
    # feature 0 < \sum_x f(x) < NR_CLASSES
    foreach my $f (0 .. $self->{NR_FEATURES} - 1) {
        my $on_count = 0;
        foreach my $c (0 .. $self->{NR_CLASSES} - 1) {
            $on_count += $self->{CLASSES}[$c]->bit_test($f);
        }
        if (!$on_count || ($on_count == $self->{NR_CLASSES})) {
            print "Feature ", $f + 1, " is constant ($on_count), and will be ignored\n";
            $self->{FEATURE_IGNORE}->Bit_On($f);
        }
    }

    # M is needed for both gis and iis;
    # NEED_CORRECTION_FEATURE is for gis only
    $self->{M} = 0;
    $self->{NEED_CORRECTION_FEATURE} = 0;
    foreach my $c (0 .. $self->{NR_CLASSES} - 1) {
        if ($self->{CLASSES}[$c]->Norm() > $self->{M}) {
            # higher nr_features_active found
            $self->{M} = $self->{CLASSES}[$c]->Norm();
            $self->{NEED_CORRECTION_FEATURE} = 1;
        }
    }
    print "M = $self->{M}\n" if $debug;

    # the "gis" correction feature per class: M minus the number of
    # features the class has active
    undef $self->{M_FEATURES_ACTIVE};
    foreach my $c (0 .. $self->{NR_CLASSES} - 1) {
        if ($self->{SCALER} eq "gis") {
            $self->{CORRECTION_FEATURE}[$c] =
                $self->{M} - $self->{CLASSES}[$c]->Norm();
        }
    }
    print "M = $self->{M}\n" if $debug;

    # observed feature expectations (corpus only, not for samples)
    $self->E_reference() unless $self->{THIS_IS_A_SAMPLE};

    undef $self->{CLASSES_CHANGED};
}
613              
614              
# Recomputes the class probabilities according to the current
# parameters: CLASS_LOG_WEIGHTS, CLASS_EXP_WEIGHTS and the
# normalisation sum Z; then refreshes the entropies (corpus only) and
# runs the sanity check.  Does nothing unless the parameters changed.
sub prepare_model {
    my ($self) = @_;

    $self->active_features();
    return unless $self->{PARAMETERS_CHANGED};

    $self->{Z} = 0;
    foreach my $c (0 .. $self->{NR_CLASSES} - 1) {
        my $log_weight = 0;
        foreach my $f ($self->{CLASSES}[$c]->indices()) {
            $log_weight += $self->{PARAMETERS}[$f] *
                           $self->{CLASSES}[$c]->weight($f);
            print "alarm: wrong index: $f\n" if $f >= $self->{NR_FEATURES};
        }
        if ($self->{NEED_CORRECTION_FEATURE} && ($self->{SCALER} eq "gis")) {
            $log_weight += $self->{CORRECTION_FEATURE}[$c] *
                           $self->{CORRECTION_PARAMETER};
        }
        $self->{CLASS_LOG_WEIGHTS}[$c] = $log_weight;
        $self->{CLASS_EXP_WEIGHTS}[$c] = exp($log_weight);
        $self->{Z} += $self->{CLASS_EXP_WEIGHTS}[$c];
    }
    print "prepare_model: \$Z is not a number: $self->{Z}\n"
        unless is_float($self->{Z});

    $self->entropies() unless $self->{THIS_IS_A_SAMPLE};
    $self->check();
    undef $self->{PARAMETERS_CHANGED};
}
651              
652              
# Computes the expectations the scaler needs on this sample:
# feature expectations for "gis", the A_{mj} coefficients otherwise.
sub prepare_sample {
    my ($self) = @_;

    ($self->{SCALER} eq "gis") ? $self->E_loglinear() : $self->A();
}
665              
666              
# Feature expectations E_LOGLIN under the current MaxEnt distribution.
# For "enum" sampling all 2^NR_FEATURES event vectors are enumerated;
# for a corpus or Monte Carlo sample the stored classes are used.
# Index NR_FEATURES holds the expectation of the correction feature.
sub E_loglinear {
    my ($self) = @_;

    undef $self->{E_LOGLIN};
    if ($self->{SAMPLING} eq "enum") {
        my $vec = $VECTOR_PACKAGE->new($self->{NR_FEATURES});
        $self->{Z} = 0;
        foreach my $event (1 .. 2 ** $self->{NR_FEATURES}) {
            my $w = $self->weight($vec);
            foreach my $f ($vec->indices()) {
                $self->{E_LOGLIN}[$f] += $w * $vec->weight($f);
            }
            $self->{E_LOGLIN}[$self->{NR_FEATURES}] +=
                $w * ($self->{M} - $vec->Norm());
            $self->{Z} += $w;
            $vec->increment();
        }
        $self->{E_LOGLIN}[$_] /= $self->{Z} for 0 .. $self->{NR_FEATURES};
    }
    else {   # either corpus or mc sample
        foreach my $c (0 .. $self->{NR_CLASSES} - 1) {
            my $class = $self->{CLASSES}[$c];
            foreach my $f ($class->indices()) {
                $self->{E_LOGLIN}[$f] +=
                    $self->{CLASS_EXP_WEIGHTS}[$c] * $class->weight($f);
            }
            if ($self->{NEED_CORRECTION_FEATURE}) {
                $self->{E_LOGLIN}[$self->{NR_FEATURES}] +=
                    $self->{CLASS_EXP_WEIGHTS}[$c] *
                    ($self->{M} - $class->Norm());
            }
        }
        $self->{E_LOGLIN}[$_] /= $self->{Z} for 0 .. $self->{NR_FEATURES};
    }
}
712              
713              
# Observed (reference) feature expectations from the corpus
# frequencies; index NR_FEATURES holds the "gis" correction feature.
# Expectations whose sum is zero are left undefined.
sub E_reference {
    my ($self) = @_;

    my @sum;
    foreach my $c (0 .. $self->{NR_CLASSES} - 1) {
        foreach my $f ($self->{CLASSES}[$c]->indices()) {
            $sum[$f] += $self->{FREQ}[$c] * $self->{CLASSES}[$c]->weight($f);
        }
        if ($self->{SCALER} eq "gis") {
            $sum[$self->{NR_FEATURES}] +=
                $self->{CORRECTION_FEATURE}[$c] * $self->{FREQ}[$c];
        }
    }
    foreach my $f (0 .. $self->{NR_FEATURES}) {
        $self->{E_REF}[$f] = $sum[$f] / $self->{NR_EVENTS} if $sum[$f];
    }
}
737              
738              
# Computes several entropies of the current model against the
# reference distribution: H_p, H_cross, H_p_ref, the Kullback-Leibler
# divergence KL, and the log-likelihood L = -H_cross.
sub entropies {
    my ($self) = @_;

    $self->{H_p}     = 0;
    $self->{H_cross} = 0;
    $self->{H_p_ref} = 0;
    $self->{KL}      = 0;
    foreach my $c (0 .. $self->{NR_CLASSES} - 1) {
        my $w       = $self->{CLASS_EXP_WEIGHTS}[$c];
        my $log_w   = $self->{CLASS_LOG_WEIGHTS}[$c];  # we don't know whether $w > 0
        my $w_ref   = $self->{FREQ}[$c];
        my $log_ref = log($w_ref);                     # we know that $w_ref > 0
        # update the sums
        $self->{H_p}     -= $w * $log_w;
        $self->{H_cross} -= $w_ref * $log_w;
        $self->{KL}      += $w_ref * ($log_ref - $log_w);
        $self->{H_p_ref} -= $w_ref * $log_ref;
        $self->log_msg("entropies: skipping event $c (p^n($c) = 0)\n")
            if $w == 0;
    }
    # normalise the raw sums
    $self->{H_p}     = $self->{H_p} / $self->{Z} + log($self->{Z});
    $self->{H_cross} = $self->{H_cross} / $self->{NR_EVENTS} + log($self->{Z});
    $self->{KL}      = $self->{KL} / $self->{NR_EVENTS}
                       - log($self->{NR_EVENTS}) + log($self->{Z});
    $self->{H_p_ref} = $self->{H_p_ref} / $self->{NR_EVENTS}
                       + log($self->{NR_EVENTS});
    $self->{L}       = -$self->{H_cross};
}
777              
778              
# Unnormalised model weight p(x,y) = exp(sum of active parameters).
# $x is required; $y is optional and only used when
# $x->Size() + $y->Size() == NR_FEATURES ($y's feature indices are
# shifted by $x->Size()).  Ignored features do not contribute.
sub weight {
    my ($self, $x, $y) = @_;

    my $log_sum = 0;
    foreach my $f ($x->indices()) {
        next if $self->{FEATURE_IGNORE}->bit_test($f);
        $log_sum += $self->{PARAMETERS}[$f] * $x->weight($f);
        print "Current weight: $log_sum, current feature: $f\n" if $debug;
    }
    my $norm = $x->Norm();

    # if $y is defined,
    # then $x->Size()+$y->Size() == $self->{NR_FEATURES} should hold:
    if (defined($y) && (($x->Size() + $y->Size()) == $self->{NR_FEATURES})) {
        foreach my $f ($y->indices()) {
            next if $self->{FEATURE_IGNORE}->bit_test($f + $x->Size());
            $log_sum += $self->{PARAMETERS}[$f + $x->Size()] * $y->weight($f);
            print "Current weight: $log_sum, current feature: $f\n" if $debug;
        }
        $norm += $y->Norm();
    }
    if ($self->{NEED_CORRECTION_FEATURE} && ($self->{SCALER} eq "gis")) {
        $log_sum += ($self->{M} - $norm) * $self->{CORRECTION_PARAMETER};
    }
    return exp($log_sum);
}
818              
819              
820             # computes the `a' coefficients of
821             # \sum_{m=0}^{M} a_{m,j}^{(n)} e^{\alpha^{(n)}_j m}
822             # according to the current distribution
# Computes the `a' coefficients of
#   \sum_{m=0}^{M} a_{m,j}^{(n)} e^{\alpha^{(n)}_j m}
# according to the current distribution, filling $self->{A} (weighted by
# the current model) and $self->{C} (raw feature-value sums), both keyed
# on vector norm m and feature index j.
#
# Two modes:
#  - "enum": enumerates all 2^NR_FEATURES binary vectors, also
#    recomputing the partition sum $self->{Z} from scratch;
#  - otherwise ("mc" or "corpus"): iterates over the stored CLASSES,
#    using CLASS_EXP_WEIGHTS as the model weights (Z is not touched).
sub A {
    my($self) = @_;

    my($f,
       $m,
       $x,
       $weight,
       $vec,
       $class);

    # Start both coefficient tables from scratch; they are rebuilt below.
    undef $self->{A};
    undef $self->{C};
    if ($self->{SAMPLING} eq "enum") {
        # Full enumeration of the event space; Z is re-accumulated here.
        undef $self->{Z};
        $vec = $VECTOR_PACKAGE->new($self->{NR_FEATURES});
        for ($x = 0; $x < 2 ** $self->{NR_FEATURES}; $x++) {
            $weight = $self->weight($vec);
            for $f ($vec->indices()) {
                # Accumulate per (norm, feature): model-weighted in A,
                # unweighted feature values in C.
                $self->{A}{$vec->Norm()}{$f} += $weight * $vec->weight($f);
                $self->{C}{$vec->Norm()}{$f} += $vec->weight($f);
            }
            $self->{Z} += $weight;
            # Diagnostic for numeric overflow/NaN in the partition sum.
            # NOTE(review): message lacks a trailing "\n" -- probably an
            # oversight, left as-is here.
            print "Z = $self->{Z}" unless is_float($self->{Z});
            $vec->increment();
        }
    }
    else { # mc or corpus: use the stored sample/corpus classes
        for ($class = 0; $class < $self->{NR_CLASSES}; $class++) {
            for $f ($self->{CLASSES}[$class]->indices()) {
                $self->{A}{$self->{CLASSES}[$class]->Norm()}{$f} +=
                    $self->{CLASS_EXP_WEIGHTS}[$class] *
                        $self->{CLASSES}[$class]->weight($f);
                $self->{C}{$self->{CLASSES}[$class]->Norm()}{$f} +=
                    $self->{CLASSES}[$class]->weight($f);
            }
        }
    }
}
861              
862              
863             #
864             # Monte Carlo sampling with the Metropolis update
865             #
866              
867             # returns heads up with probability $load
# Biased coin flip: returns 1 ("heads up") with probability $load,
# 0 otherwise.  Since rand() yields values in [0,1), a $load >= 1
# always returns 1 and a negative $load always returns 0.
sub loaded_die {
    my ($load) = @_;

    if (rand() <= $load) {
        return 1;
    }
    return 0;
}
873              
874              
875             # samples from the probability distribution of $other to create $self
876             # we use the so-called Metropolis update R = h(new)/h(old)
877             # Metropolis algorithm \cite{neal:probabilistic}
# Fills $self->{CLASSES} by Monte Carlo sampling from the loglinear
# distribution defined by $self->{PARAMETERS}, using the Metropolis
# update R = h(new)/h(old) (Metropolis algorithm,
# \cite{neal:probabilistic}).  Components of the state vector are
# flipped round-robin; a proposal is accepted with probability
# min(1, R).  Sampling stops once $SAMPLE_size distinct events have
# been seen, or the whole 2^NR_FEATURES space has been visited.
#
# NOTE(review): $other and $type are accepted but never used in this
# body -- confirm against callers whether they are vestigial.
sub mc {
    my($self, $other, $type) = @_;

    my($R,
       $weight,
       $state,
       $old_weight,
       $k,
       %events
      );

    srand();
    # take some class from the sample space as initial state
    $state = $VECTOR_PACKAGE->new($self->{NR_FEATURES});
    # make sure there are no constant features!
    # (the all-ones vector is recorded as an event, then cleared)
    $state->Fill();
    $events{$state->to_Bin(' ')}++;
    $state->Empty();
    # $weight tracks the log of the unnormalised probability h(state).
    $weight = 0;
    # iterate; $k is the feature component currently being flipped
    $k = 0;

    do {
        $old_weight = $weight;
        # Flip bit $k of the state; adjust the log-weight by the
        # corresponding parameter depending on the new bit value.
        if ($state->bit_flip($k)) {
            $weight += $self->{PARAMETERS}[$k];
        }
        else {
            $weight -= $self->{PARAMETERS}[$k];
        }
        # Metropolis acceptance ratio h(new)/h(old).
        $R = exp($weight - $old_weight);
        if (!loaded_die(1 < $R ? 1 : $R)) { # stay at the old state
            $state->bit_flip($k);           # undo the flip
            $weight = $old_weight;
        }
        else { # add state
            $events{$state->to_Bin(' ')}++;
        }
        if ($debug) {
            print $state->to_Bin(' '),"\t",scalar(keys(%events)),"\t$R\n";
        }
        # next component
        $k = ($k + 1) % $self->{NR_FEATURES};
    } until ((scalar(keys(%events)) == $SAMPLE_size) ||
             (scalar(keys(%events)) == 2 ** $self->{NR_FEATURES}));

    # Materialise every distinct sampled event as a class vector.
    for (keys(%events)) {
        push @{$self->{CLASSES}},
            $VECTOR_PACKAGE->new_vec($self->{NR_FEATURES}, $_, $self->{VECTYPE});
    }
    # NOTE(review): one event is excluded from the count (presumably the
    # initial Fill() state), but since all keys were pushed above and
    # hash-key order is arbitrary, which pushed class falls outside
    # NR_CLASSES is unpredictable -- verify against intended behaviour.
    $self->{NR_CLASSES} = scalar(keys(%events)) - 1;

    $self->{CLASSES_CHANGED} = 1;
    $self->{PARAMETERS_CHANGED} = 1;
}
933              
934              
935             #
936             # IIS
937             #
938              
939             # Newton estimation according to (Abney 1997), Appendix B
# One Newton step for feature $j at current estimate $x, following
# (Abney 1997), Appendix B: returns f(x)/f'(x) where
#   f(x) = -N * E_ref[j] + sum_m C[m][j] * exp(x*m)
# built from the raw C coefficients.  Returns 0 when the derivative
# vanishes, to avoid a division by zero.
sub C_func {
    my ($self, $j, $x) = @_;

    my $func  = -$self->{NR_EVENTS} * $self->{E_REF}[$j];
    my $deriv = 0;
    for my $m (1 .. $self->{M}) {
        if ($self->{"C"}{$m}{$j}) {
            my $term = $self->{"C"}{$m}{$j} * exp($x * $m);
            $func  += $term;
            $deriv += $m * $term;
        }
    }
    # Diagnostics for numeric overflow/NaN.
    print "sum_func not a number: $func\n"
        unless is_float($func);
    print "sum_deriv not a number: $deriv\n"
        unless is_float($deriv);

    return 0 if $deriv == 0;
    return $func / $deriv;
}
969              
970              
971             # Newton estimation according to (Della Pietra et al. 1997)
# One Newton step for feature $j at current estimate $x, following
# (Della Pietra et al. 1997): returns f(x)/f'(x) where
#   f(x) = -E_ref[j] * Z + sum_m A[m][j] * exp(x*m)
# built from the model-weighted A coefficients.  Returns 0 when the
# derivative vanishes, to avoid a division by zero.
sub A_func {
    my ($self, $j, $x) = @_;

    my $func  = -$self->{E_REF}[$j] * $self->{Z};
    my $deriv = 0;
    for my $m (1 .. $self->{M}) {
        if ($self->{"A"}{$m}{$j}) {
            my $term = $self->{"A"}{$m}{$j} * exp($x * $m);
            $func  += $term;
            $deriv += $m * $term;
        }
    }
    return 0 if $deriv == 0;
    return $func / $deriv;
}
996              
997              
998             # solves \alpha from
999             # \sum_{m=0}^{M} a_{m,j}^{(n)} e^{\alpha^{(n)}_j m}=0
# Solves \alpha (the IIS parameter update) for feature $i from
#   \sum_{m=0}^{M} a_{m,i}^{(n)} e^{\alpha^{(n)}_i m} = 0
# with Newton's method, iterating until the step size drops below
# $NEWTON_min or $NEWTON_max_it iterations have been performed.
# Returns the estimated update (0 is the starting point, i.e. log(1)).
#
# FIX: the original body branched on $self->{SAMPLING} eq "enum" but
# both branches called A_func identically, so the conditional was dead
# weight; it is collapsed here without any behaviour change.
# NOTE(review): the stale comments ("sample -> (Abney 1997)") suggest
# the sampling case was meant to use C_func (which is never called
# anywhere); switching to C_func would change numerics and needs to be
# confirmed against the Abney/Della Pietra references first.
sub iis_estimate_with_newton {
    my ($self, $i) = @_;

    my $x = 0;    # current Newton iterate
    my $k = 0;    # iteration counter
    my $old_x;

    # Newton's method: x <- x - f(x)/f'(x), as computed by A_func
    # (Della Pietra et al. 1997).
    do {
        $old_x = $x;
        $x -= $self->A_func($i, $x);
    } until ((abs($x - $old_x) <= $NEWTON_min) ||
             ($k++ > $NEWTON_max_it));

    if ($debug) {
        print "Estimated gamma_$i with Newton's method: $x\n";
    }
    return($x);
}
1033              
1034              
1035             # updates parameter $i
# Updates every non-ignored parameter of $self from the statistics of
# $sample: GIS uses the closed-form update
#   log(E_ref[f] / E_loglin[f]) / M,
# IIS solves the update with Newton's method.  Under GIS with a
# correction feature, the correction parameter gets the analogous
# update at index NR_FEATURES.
sub gamma {
    my ($self, $sample) = @_;

    my $is_gis = ($self->{SCALER} eq "gis");
    for my $feat (0 .. $self->{NR_FEATURES} - 1) {
        next if $self->{FEATURE_IGNORE}->bit_test($feat);
        if ($is_gis) {
            $self->{PARAMETERS}[$feat] +=
                log($self->{E_REF}[$feat] / $sample->{E_LOGLIN}[$feat]) /
                $sample->{M};
        }
        else {
            $self->{PARAMETERS}[$feat] +=
                $sample->iis_estimate_with_newton($feat);
        }
    }

    if ($is_gis && $self->{NEED_CORRECTION_FEATURE}) {
        my $corr = $self->{NR_FEATURES};
        $self->{CORRECTION_PARAMETER} +=
            log($self->{E_REF}[$corr] / $sample->{E_LOGLIN}[$corr]) /
            $self->{M};
    }
}
1060              
1061              
1062             # the iterative scaling algorithms
# The iterative scaling driver (GIS or IIS).  Optionally overrides the
# sampling method and scaler, validates the sampling/vector-type
# combination, (re)initialises the parameters, and then repeats:
# draw/build a sample, update the parameters (gamma), recompute the
# model, and check KL-divergence progress.  If the divergence increases
# the previous parameters are restored; the loop stops when the
# improvement falls below $KL_min, after $KL_max_it iterations, or on
# divergence increase.
sub scale {
    my($self, $sampling, $scaler) = @_;

    # NOTE(review): $old_kl is declared but never used below.
    my($k,
       $i,
       $kl,
       $old_kl,
       $diff,
       $sample,
       $old_correction_parameter,
       @old_parameters);

    # Optional overrides of the configured sampling/scaling method.
    if ($sampling) {
        $self->{SAMPLING} = $sampling;
    }
    if ($scaler) {
        $self->{SCALER} = $scaler;
    }
    # Integer-valued feature vectors cannot be enumerated or MC-sampled.
    if (($self->{SAMPLING} eq "enum") && ($self->{VECTYPE} eq "integer")) {
        $self->die("Cannot enumerate integer vectors\n");
    }
    if (($self->{SAMPLING} eq "mc") && ($self->{VECTYPE} eq "integer")) {
        $self->die("Cannot sample from integer vector space\n");
    }

    $self->init_parameters();
    $self->prepare_model();
    # Log a header plus the iteration-0 statistics.
    $self->log_msg("scale($self->{SCALER}, $self->{SAMPLING}, $self->{VECTYPE}): H(p_ref)=$self->{H_p_ref}\nit.\tD(p_ref||p)\t\tH(p)\t\t\tL(p_ref,p)\t\ttime\n0\t$self->{KL}\t$self->{H_p}\t$self->{L}\t" . time() . "\n");
    $k = 0;
    $kl = 1e99;     # sentinel so the first $diff is always positive
    do {
        # store parameters for reverting if converging stops
        @old_parameters = @{$self->{PARAMETERS}};
        $old_correction_parameter = $self->{CORRECTION_PARAMETER};
        # Release the previous sample before drawing a new one.
        if ($sample) {
            $sample->DESTROY();
        }
        $sample = $self->sample();
        $self->gamma($sample);
        $self->{PARAMETERS_CHANGED} = 1;
        $self->prepare_model();
        # $diff > 0 means the KL-divergence decreased (progress).
        $diff = $kl - $self->{KL};
        $kl = $self->{KL};

        $k++;
        $self->log_msg("$k\t$self->{KL}\t$self->{H_p}\t$self->{L}\t" . time() . "\n");
        if ($debug) {
            $self->check();
        }
        if ($diff < 0) {
            $self->log_msg("Scaling is not converging (anymore); will revert parameters!\n");
            # restore old parameters
            $self->{PARAMETERS} = \@old_parameters;
            $self->{CORRECTION_PARAMETER} = $old_correction_parameter;
            $self->{PARAMETERS_CHANGED} = 1;
            $self->prepare_model();
        }
        # SIGQUIT handler sets this flag; dump intermediate state.
        if ($cntrl_backslash_pressed) {
            $self->dump();
            $cntrl_backslash_pressed = 0;
        }
    } until ($diff <= $KL_min || ($k > $KL_max_it) || ($diff < 0));
}
1126              
1127              
1128             #
1129             # Field Induction Algorithm
1130             #
1131              
1132             # add feature $g to $self
# Adds candidate feature $g from $candidates to $self as a new (last)
# feature: inserts its column into every class vector, initialises the
# new parameter (1 for GIS, the precomputed ALPHA estimate otherwise),
# records the feature name, and rebuilds the model.
sub add_candidate {
    my ($self, $candidates, $g) = @_;

    my $new_idx = $self->{NR_FEATURES};
    $self->{NR_FEATURES} = $new_idx + 1;
    for my $class (0 .. $self->{NR_CLASSES} - 1) {
        $self->{CLASSES}[$class]->insert_column(
            $g, $candidates->{CANDIDATES}[$class]->weight($g));
    }
    # GIS starts from a neutral parameter; IIS uses the gain-based
    # initial estimate computed in gain().
    $self->{PARAMETERS}[$new_idx] =
        ($self->{SCALER} eq "gis") ? 1 : $candidates->{ALPHA}[$g];
    push @{$self->{FEATURE_NAMES}}, $candidates->{CANDIDATE_NAMES}[$g];

    $self->{PARAMETERS_CHANGED} = 1;
    $self->{CLASSES_CHANGED} = 1;
    $self->prepare_model();
}
1154              
1155              
1156             # remove the last column
# Removes the most recently added feature (the last column): deletes
# that column from every class vector, drops the last parameter and
# feature name, decrements the feature count, and rebuilds the model.
sub remove_candidate {
    my ($self) = @_;

    my $last = $self->{NR_FEATURES} - 1;
    for my $class (0 .. $self->{NR_CLASSES} - 1) {
        # substitute offset $last length 1 by nothing
        $self->{CLASSES}[$class]->delete_column($last);
    }
    pop @{$self->{PARAMETERS}};
    pop @{$self->{FEATURE_NAMES}};
    $self->{NR_FEATURES} = $last;

    $self->{PARAMETERS_CHANGED} = 1;
    $self->{CLASSES_CHANGED} = 1;
    $self->prepare_model();
}
1173              
1174              
1175             # checks for $event, if not there adds it, otherwise increases its {FREQ}
# Records one observed $event: if an equal class vector already exists
# (Compare == 0) its frequency is bumped; otherwise $event is stored as
# a new class with frequency 1.  The total event count always grows.
sub add_event {
    my ($self, $event) = @_;

    my $seen = 0;
    CLASS: for my $idx (0 .. $self->{NR_CLASSES} - 1) {
        if ($event->Compare($self->{CLASSES}[$idx]) == 0) {
            $self->{FREQ}[$idx]++;
            $seen = 1;
            last CLASS;
        }
    }
    unless ($seen) {
        my $slot = $self->{NR_CLASSES}++;
        $self->{CLASSES}[$slot] = $event;
        $self->{FREQ}[$slot] = 1;
    }
    $self->{NR_EVENTS}++;
}
1197              
1198              
1199             # computes the gain for all $candidates
# Computes the approximate gain of every not-yet-added candidate in
# $candidates: for each candidate, an initial parameter ALPHA is
# estimated in closed form from the model and reference probability
# mass of the events the candidate fires on, the candidate is
# temporarily added to the model, the drop in KL-divergence is recorded
# as its GAIN, and the candidate is removed again.  The best candidate
# (MAX_GAIN / BEST_CAND) is tracked along the way.
# NOTE(review): BEST_CAND starts at 0 and the comparison uses <=, so
# ties are won by the later candidate and candidate 0 is returned even
# if every gain is negative -- confirm this is intended.
sub gain {
    my($self, $candidates) = @_;

    my($c,
       $x,
       $kl,
       $below,
       $above,
       $sum_p_ref,
       $sum_p);

    $candidates->{MAX_GAIN} = 0;
    $candidates->{BEST_CAND} = 0;
    for ($c = 0; $c < $candidates->{NR_CANDIDATES}; $c++) {
        if (!$candidates->{ADDED}{$c}) {
            # Mass of the events on which candidate $c is active,
            # under the model ($sum_p) and the reference ($sum_p_ref).
            $sum_p_ref = 0;
            $sum_p = 0;
            for ($x = 0; $x < $self->{NR_CLASSES}; $x++) {
                if ($candidates->{CANDIDATES}[$x]->bit_test($c)) {
                    $sum_p += $self->{CLASS_EXP_WEIGHTS}[$x];
                    $sum_p_ref += $self->{FREQ}[$x];
                }
            }
            # Normalise to probabilities.
            $sum_p /= $self->{Z};
            $sum_p_ref /= $self->{NR_EVENTS};
            # Closed-form initial parameter estimate:
            # alpha = log( p_ref(active) (1 - p(active))
            #            / p(active) (1 - p_ref(active)) ).
            $above = $sum_p_ref * (1 - $sum_p);
            $below = $sum_p * (1 - $sum_p_ref);
            if ($above * $below > 0) {
                $candidates->{ALPHA}[$c] = log($above / $below);
            }
            else {
                $self->die("Cannot take log of negative/zero value: $above / $below\n");
            }
            # temporarily add feature to classes and compute $gain
            $kl = $self->{KL};
            $self->add_candidate($candidates, $c);
            # Gain = reduction in KL-divergence from adding candidate $c.
            $candidates->{GAIN}[$c] = $kl - $self->{KL};
            $self->log_msg("G($c, $candidates->{ALPHA}[$c]) = $candidates->{GAIN}[$c]\n");
            if (($candidates->{MAX_GAIN} <= $candidates->{GAIN}[$c])) {
                $candidates->{MAX_GAIN} = $candidates->{GAIN}[$c];
                $candidates->{BEST_CAND} = $c;
            }
            # remove the feature
            $self->remove_candidate();
        }
    }
}
1247              
1248              
1249             # adds the $n best candidates
# Feature induction: scales the current model, then $n times picks the
# candidate with the highest approximate gain, adds it permanently,
# rescales, and logs the actual (realised) KL-divergence gain.  $n is
# clipped to the number of available candidates.  Returns 1.
sub fi {
    my ($self, $scaler, $candidates, $n, $sample) = @_;

    $self->log_msg("fi($scaler, $sample, $n, $self->{VECTYPE})\n");
    $self->{SCALER} = $scaler if $scaler;
    $self->{SAMPLING} = $sample if $sample;

    if ($self->{NR_CLASSES} != $candidates->{NR_CLASSES}) {
        $self->die("Candidates have the wrong number of events\n");
    }

    # Baseline model before any candidate is added.
    $self->scale();
    my $prev_kl = $self->{KL};

    # Never try to add more candidates than exist.
    if ($n > $candidates->{NR_CANDIDATES}) {
        $n = $candidates->{NR_CANDIDATES};
    }
    for my $round (1 .. $n) {
        $self->gain($candidates);
        my $best = $candidates->{BEST_CAND};
        $self->add_candidate($candidates, $best);
        $candidates->{ADDED}{$best} = 1;
        $self->log_msg("Adding candidate $best\n");
        $self->scale();
        $self->log_msg("Actual gain: " . ($self->{KL} - $prev_kl) . "\n");
        $prev_kl = $self->{KL};
    }
    return(1);
}
1281              
1282              
1283             #
1284             # Routines for classification, only binary features!
1285             #
1286              
1287             # context features are 0 .. $n-1
1288             # $x is a vector, $sampling
# Classification (binary features only): given the context part $x,
# enumerates every possible completion $y of the feature vector and
# returns the completion with the largest unnormalised weight p(x,y)
# (proportional to p(y|x)), together with that weight.
#
# NOTE(review): $best_class is assigned the SAME object as $y, which is
# then mutated by the following increment() calls -- the returned
# vector is therefore presumably NOT a snapshot of the best completion
# but the final enumeration state; a clone of $y at assignment time
# looks intended.  Verify against $VECTOR_PACKAGE's copy API.
# Also, $best_class stays undef if no weight exceeds 0 (weights are
# exp(...), so normally positive).
sub classify {
    my($self, $x) = @_;

    # $sum is declared but never meaningfully used below.
    my($y,
       $sum,
       $i,
       $weight,
       $best_class,
       $best_weight);

    $self->log_msg("classify(" . $x->to_Bin('') . ")\n");
    $sum = 0;
    # use every possible completion of $x to compute $sum
    # allocate a class vector
    $y = $VECTOR_PACKAGE->new($self->{NR_FEATURES} - $x->Size());
    $best_weight = 0;
    # for every possible $y
    for ($i = 0; $i < 2 ** $y->Size(); $i++) {
        # compute p(x,y) which proportional to p(y|x) (I hope)
        $weight = $self->weight($x, $y);
        if ($weight > $best_weight) {
            $best_class = $y;
            $best_weight = $weight;
            if ($debug) {
                print "$i\t", $y->to_Bin(''), "\t$weight\n";
            }
        }
        $y->increment();
    }
    return($best_class, $best_weight);
}
1320              
1321              
1322             1;
1323              
1324             __END__