File Coverage

blib/lib/Treex/Tool/Parser/MSTperl/TrainerUnlabelled.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             package Treex::Tool::Parser::MSTperl::TrainerUnlabelled;
2             {
3             $Treex::Tool::Parser::MSTperl::TrainerUnlabelled::VERSION = '0.11949';
4             }
5              
6 1     1   2246 use Moose;
  0            
  0            
7              
8             extends 'Treex::Tool::Parser::MSTperl::TrainerBase';
9              
10             use Treex::Tool::Parser::MSTperl::Parser;
11              
12             has model => (    # model being trained; BUILD sets it to the parser's model
13             isa => 'Treex::Tool::Parser::MSTperl::ModelUnlabelled',
14             is => 'rw',
15             );
16             # parser used to reparse the training sentences; created in BUILD
17             has parser => (
18             isa => 'Treex::Tool::Parser::MSTperl::Parser',
19             is => 'rw',
20             );
21              
22             # v (the summed feature weights):
23             # all values of feature weights used during the training summed together,
24             # as using average weights instead of final weights
25             # is reported to help avoid overtraining.
26             # Here the form is ->{feature} = weight
27             # (for the labeller it is ->{feature}->{label} = weight instead)
28             has feature_weights_summed => (
29             isa => 'HashRef',
30             is => 'rw',
31             default => sub { {} },
32             );
33              
34             sub BUILD {    # Moose construction hook: sets up parser, model and settings
35             my ($self) = @_;
36             # a new parser is built from the trainer's config; the trainer then shares its model
37             $self->parser(
38             Treex::Tool::Parser::MSTperl::Parser->new( config => $self->config )
39             );
40             $self->model( $self->parser->model );
41             $self->featuresControl( $self->config->unlabelledFeaturesControl );
42             $self->number_of_iterations( $self->config->number_of_iterations );
43             # (featuresControl and number_of_iterations above are copied over from the config)
44             return;
45             }
46              
47             # UNLABELLED TRAINING
48              
49             # compute the features of the sentence (done by its fill_fields_after_parse method)
50             sub preprocess_sentence {
51             # returns nothing; presumably $sentence is modified in place - TODO confirm
52             # (Treex::Tool::Parser::MSTperl::Sentence $sentence)
53             my ( $self, $sentence ) = @_;
54              
55             $sentence->fill_fields_after_parse();
56              
57             return;
58             }
59              
60             sub update {
61             # one training step: reparse the sentence with the parser, then call mira_update
62             # (Treex::Tool::Parser::MSTperl::Sentence $sentence_correct_parse,
63             # Int $sumUpdateWeight, $forbid_new_features - flag passed on to mira_update)
64             my (
65             $self,
66             $sentence_correct_parse,
67             $sumUpdateWeight,
68             $forbid_new_features
69             ) = @_;
70              
71             # reparse the sentence
72             # y' = argmax_y' s(x_t, y')
73             my $sentence_best_parse = $self->parser->parse_sentence_internal(
74             $sentence_correct_parse
75             );
76             $sentence_best_parse->fill_fields_after_parse();
77              
78             # only progress and/or debug info (edges of both parses, printed at DEBUG >= 2)
79             if ( $self->config->DEBUG >= 2 ) {
80             print "CORRECT PARSE EDGES:\n";
81             foreach my $edge ( @{ $sentence_correct_parse->edges } ) {
82             print $edge->parent->ord . " -> "
83             . $edge->child->ord . "\n";
84             }
85             print "BEST PARSE EDGES:\n";
86             foreach my $edge ( @{ $sentence_best_parse->edges } ) {
87             print $edge->parent->ord . " -> "
88             . $edge->child->ord . "\n";
89             }
90             }
91              
92             # min ||w_i+1 - w_i|| s.t. ...
93             $self->mira_update(
94             $sentence_correct_parse,
95             $sentence_best_parse,
96             $sumUpdateWeight,
97             $forbid_new_features
98             );
99              
100             return;
101              
102             }
103              
104             sub mira_update {
105             # one MIRA step: if the correct parse does not beat the best parse by the margin, shift weights
106             # (Treex::Tool::Parser::MSTperl::Sentence $sentence_correct_parse,
107             # Treex::Tool::Parser::MSTperl::Sentence $sentence_best_parse,
108             # Int $sumUpdateWeight, $forbid_new_features - flag passed on to features_diff)
109             my (
110             $self,
111             $sentence_correct_parse,
112             $sentence_best_parse,
113             $sumUpdateWeight,
114             $forbid_new_features
115             ) = @_;
116              
117             # s(x_t, y_t)
118             my $score_correct = $self->model->score_sentence($sentence_correct_parse);
119              
120             # s(x_t, y')
121             my $score_best = $self->model->score_sentence($sentence_best_parse);
122              
123             # difference in scores should be greater than the margin:
124              
125             # L(y_t, y') number of incorrectly assigned heads
126             my $margin = $sentence_best_parse->count_errors_attachement(
127             $sentence_correct_parse
128             );
129              
130             # s(x_t, y_t) - s(x_t, y') this should be zero or less
131             my $score_gain = $score_correct - $score_best;
132              
133             # L(y_t, y') - [s(x_t, y_t) - s(x_t, y')]
134             my $error = $margin - $score_gain;
135              
136             if ( $error > 0 ) {
137             my ( $features_diff_correct, $features_diff_best, $features_diff_count )
138             = $self->features_diff(
139             $sentence_correct_parse->features,
140             $sentence_best_parse->features,
141             $forbid_new_features
142             );
143              
144             if ( $features_diff_count == 0 ) {
145             warn "Features of the best parse and the correct parse do not " .
146             "differ, unable to update the scores. " .
147             "Consider using more features.\n";
148             if ( $self->config->DEBUG >= 3 ) {
149             print "alpha: 0 on 0 features\n";
150             }
151             } else {
152             # the error is divided evenly by the count returned from features_diff
153             # min ||w_i+1 - w_i|| s.t. s(x_t, y_t) - s(x_t, y') >= L(y_t, y')
154             my $update = $error / $features_diff_count;
155              
156             # $update is added to features occurring more often in the correct parse
157             foreach my $feature ( @{$features_diff_correct} ) {
158             $self->update_feature_weight(
159             $feature,
160             $update,
161             $sumUpdateWeight
162             );
163             }
164              
165             # and subtracted from features occurring
166             # more often in the best (and incorrect) parse
167             foreach my $feature ( @{$features_diff_best} ) {
168             $self->update_feature_weight(
169             $feature,
170             -$update,
171             $sumUpdateWeight
172             );
173             }
174             if ( $self->config->DEBUG >= 3 ) {
175             print "alpha: $update on $features_diff_count features\n";
176             }
177             }
178             } else { #else no need to optimize
179             if ( $self->config->DEBUG >= 3 ) {
180             print "alpha: 0 on 0 features\n";
181             }
182             }
183              
184             return;
185             }
186              
187             sub features_diff {
188             # returns ( \@more_often_in_first, \@more_often_in_second, $diff_count )
189             # (ArrayRef[Str] $features_first, ArrayRef[Str] $features_second, $forbid_new_features)
190             my ( $self, $features_first, $features_second, $forbid_new_features ) = @_;
191              
192             # get signed feature counts: +1 per occurrence in first, -1 per occurrence in second
193             my %feature_counts;
194             foreach my $feature ( @{$features_first} ) {
195             $feature_counts{$feature}++;
196             }
197             foreach my $feature ( @{$features_second} ) {
198             $feature_counts{$feature}--;
199             }
200              
201             # TODO: try to disregard features which occur in both parses?
202             # if $forbid_new_features is true, features unknown to the model are skipped
203             # do the diff
204             my @features_first;
205             my @features_second;
206             my $diff_count = 0;
207             FF: foreach my $feature ( keys %feature_counts ) {
208             if ( $forbid_new_features && $self->model->feature_is_unknown($feature) ) {
209             next FF;
210             }
211             if ( $feature_counts{$feature} ) {
212             my $count = abs( $feature_counts{$feature} );
213              
214             # create arrays of differing features,
215             # each differing feature is included ONCE ONLY
216             # because an optimization of update is not present
217             # and the update makes uniform changes to all differing features,
218             # in which case even repeated features should be updated ONCE ONLY
219              
220             # more often in the first array
221             if ( $feature_counts{$feature} > 0 ) {
222              
223             # for ( my $i = 0; $i < $count; $i++ ) {
224             push @features_first, $feature;
225              
226             # }
227              
228             # more often in the second array
229             } else {
230              
231             # for ( my $i = 0; $i < $count; $i++ ) {
232             push @features_second, $feature;
233              
234             # }
235             }
236             $diff_count += $count;    # sums occurrence differences, not distinct features
237             } # else same count -> no difference
238             }
239              
240             return ( \@features_first, \@features_second, $diff_count );
241             }
242              
243             # update weight of the feature in the model
244             # (also update the sum of feature weights: feature_weights_summed)
245             sub update_feature_weight {
246             # returns whatever the model's update_feature_weight() returns
247             # (Str $feature, Num $update, Num $sumUpdateWeight)
248             my ( $self, $feature, $update, $sumUpdateWeight ) = @_;
249              
250             #adds $update to the current weight of the feature
251             my $result =
252             $self->model->update_feature_weight( $feature, $update );
253              
254             # v = v + w_{i+1}
255             # $sumUpdateWeight denotes number of summands
256             # in which the weight would appear
257             # if it were computed according to the definition
258             my $summed_update = $sumUpdateWeight * $update;
259             $self->feature_weights_summed->{$feature} += $summed_update;
260              
261             return $result;
262             }
263              
264             # recompute weight of each feature as an average
265             # (using feature_weights_summed) and store it in the model
266             sub scores_averaging {
267              
268             # (no arguments besides $self)
269             my ($self) = @_;
270              
271             foreach my $feature ( keys %{ $self->feature_weights_summed } ) {
272              
273             # w = v/(N * T)
274             # see also: my $self->number_of_inner_iterations =
275             # $self->number_of_iterations * $sentence_count;
276              
277             my $weight = $self->feature_weights_summed->{$feature}
278             / $self->number_of_inner_iterations;
279             $self->model->set_feature_weight( $feature, $weight );
280              
281             # only progress and/or debug info (the averaged weight, printed at DEBUG >= 2)
282             if ( $self->config->DEBUG >= 2 ) {
283             print "$feature\t" . $self->model->get_feature_weight($feature)
284             . "\n";
285              
286             }
287             }
288              
289             return;
290             }
291              
292             1;
293              
294             __END__
295              
296             =pod
297              
298             =for Pod::Coverage BUILD
299              
300             =encoding utf-8
301              
302             =head1 NAME
303              
304             Treex::Tool::Parser::MSTperl::TrainerUnlabelled
305              
306             =head1 VERSION
307              
308             version 0.11949
309              
310             =head1 DESCRIPTION
311              
312             Trains on correctly parsed sentences and so creates and tunes the model.
313             Uses single-best MIRA (McDonald et al., 2005, Proc. HLT/EMNLP)
314              
315             =head1 FIELDS
316              
317             =over 4
318              
319             =item parser
320              
321             Reference to an instance of L<Treex::Tool::Parser::MSTperl::Parser> which is
322             used for the training.
323              
324             =item model
325              
326             Reference to an instance of L<Treex::Tool::Parser::MSTperl::ModelUnlabelled>
327             which is being trained.
328              
329             =back
330              
331             =head1 METHODS
332              
333             The C<sumUpdateWeight> is a number by which the change of the feature weights
334             is multiplied in the sum of the weights, so that at the end of the algorithm
335             the sum corresponds to its formal definition, which is a sum of all weights
336             after each of the updates. C<sumUpdateWeight> is a member of a sequence going
337             from N*T to 1, where N is the number of iterations
338             (L<Treex::Tool::Parser::MSTperl::FeaturesControl/number_of_iterations>, C<10>
339             by default) and T being the number of sentences in training data, N*T thus
340             being the number of inner iterations, i.e. how many times C<mira_update()> is
341             called.
342              
343             =over 4
344              
345             =item $trainer->train($training_data);
346              
347             Trains the model, using the settings from C<config> and the training
348             data in the form of a reference to an array of parsed sentences
349             (L<Treex::Tool::Parser::MSTperl::Sentence>), which can be obtained by the
350             L<Treex::Tool::Parser::MSTperl::Reader>.
351              
352             =item $self->mira_update($sentence_correct_parse, $sentence_best_parse,
353             $sumUpdateWeight, $forbid_new_features)
354              
355             Performs one update of the MIRA (Margin-Infused Relaxed Algorithm) on one
356             sentence, given its correct parse (from the training data) and the best scoring
357             parse created by the parser; a true C<$forbid_new_features> skips unknown features.
358              
359             =item my ( $features_diff_1, $features_diff_2, $features_diff_count ) =
360             features_diff( $features_1, $features_2, $forbid_new_features );
361              
362             Compares features of two parses of a sentence, where the features
363             (C<$features_1>, C<$features_2>) are represented as a reference to an array
364             of strings (the same feature might be present repeatedly, all occurrences of
365             the same feature are summed together).
366              
367             Features that appear the same number of times in both parses are disregarded,
368             as are features unknown to the model if C<$forbid_new_features> is true.
369              
370             The first two returned values (C<$features_diff_1>, C<$features_diff_2>)
371             are array references,
372             C<$features_diff_1> containing features that appear in the first parse
373             (C<$features_1>) more often than in the second parse (C<$features_2>),
374             and vice versa for C<$features_diff_2>.
375             Each differing feature is contained ONCE ONLY, regardless of how large the
376             difference in number of occurrences is, e.g. if the feature C<TAG|tag:NN|NN>
377             appears 5 times in the first parse and 8 times in the second parse, then
378             C<$features_diff_2> will contain C<'TAG|tag:NN|NN'> exactly once.
379              
380             The third returned value (C<$features_diff_count>) is the sum, over all
381             differing features, of the absolute difference between the numbers of their
382             occurrences in the two parses (the feature in the example above adds 3).
383              
384             =item $self->update_feature_weight( $feature, $update, $sumUpdateWeight )
385              
386             Updates weight of C<$feature> by C<$update>
387             (which might be positive or negative)
388             and also updates the sum of updates of the feature
389             (which is later used for overtraining avoidance),
390             multiplied by C<$sumUpdateWeight>, which is simply a count of inner iterations
391             yet to be performed (thus eliminating the need to update the sum on each
392             inner iteration).
393              
394             =back
395              
396             =head1 AUTHORS
397              
398             Rudolf Rosa <rosa@ufal.mff.cuni.cz>
399              
400             =head1 COPYRIGHT AND LICENSE
401              
402             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles
403             University in Prague
404              
405             This module is free software; you can redistribute it and/or modify it under
406             the same terms as Perl itself.