File Coverage

blib/lib/Treex/Tool/Parser/MSTperl/TrainerUnlabelled.pm

Criterion	Covered	Total	%
statement	1	3	33.3
branch			n/a
condition			n/a
subroutine	1	1	100.0
pod			n/a
total	2	4	50.0

line	stmt	sub	time	code
1				package Treex::Tool::Parser::MSTperl::TrainerUnlabelled;
2				{
3				$Treex::Tool::Parser::MSTperl::TrainerUnlabelled::VERSION = '0.11949';
4				}
5
6	1	1	2246	use Moose;
	0
	0
7
8				extends 'Treex::Tool::Parser::MSTperl::TrainerBase';
9
10				use Treex::Tool::Parser::MSTperl::Parser;
11				# the model being trained; BUILD takes it over from the parser
12				has model => (
13				isa => 'Treex::Tool::Parser::MSTperl::ModelUnlabelled',
14				is => 'rw',
15				);
16				# the parser used to reparse the training sentences; created in BUILD
17				has parser => (
18				isa => 'Treex::Tool::Parser::MSTperl::Parser',
19				is => 'rw',
20				);
21
22				# v: sum of the feature weights over the course of the training
23				# (all values of features used during the training summed together),
24				# as using average weights instead of final weights
25				# is reported to help avoid overtraining.
26				# Here it has the form ->{feature} = weight
27				# (for the labeller it is ->{feature}->{label} = weight instead)
28				has feature_weights_summed => (
29				isa => 'HashRef',
30				is => 'rw',
31				default => sub { {} },
32				);
33
34				sub BUILD {
35				my ($self) = @_;
36				# Moose calls BUILD after construction: create the parser, copy the settings
37				$self->parser(
38				Treex::Tool::Parser::MSTperl::Parser->new( config => $self->config )
39				);
40				$self->model( $self->parser->model ); # train the parser's own model
41				$self->featuresControl( $self->config->unlabelledFeaturesControl );
42				$self->number_of_iterations( $self->config->number_of_iterations );
43
44				return;
45				}
46
47				# UNLABELLED TRAINING
48
49				# compute the features of the sentence
50				sub preprocess_sentence {
51				# prepares a training sentence by filling its post-parse fields; returns nothing
52				# (Treex::Tool::Parser::MSTperl::Sentence $sentence)
53				my ( $self, $sentence ) = @_;
54				# NOTE(review): presumably this is where the sentence features get computed - confirm
55				$sentence->fill_fields_after_parse();
56
57				return;
58				}
59
60				sub update {
61				# one training step: reparse the sentence and run a MIRA update on the result
62				# (Treex::Tool::Parser::MSTperl::Sentence $sentence_correct_parse,
63				# Int $sumUpdateWeight, Bool $forbid_new_features)
64				my (
65				$self,
66				$sentence_correct_parse,
67				$sumUpdateWeight,
68				$forbid_new_features
69				) = @_;
70				# $sumUpdateWeight and $forbid_new_features are only passed on to mira_update()
71				# reparse the sentence
72				# y' = argmax_y' s(x_t, y')
73				my $sentence_best_parse = $self->parser->parse_sentence_internal(
74				$sentence_correct_parse
75				);
76				$sentence_best_parse->fill_fields_after_parse();
77
78				# only progress and/or debug info
79				if ( $self->config->DEBUG >= 2 ) {
80				print "CORRECT PARSE EDGES:\n";
81				foreach my $edge ( @{ $sentence_correct_parse->edges } ) {
82				print $edge->parent->ord . " -> "
83				. $edge->child->ord . "\n";
84				}
85				print "BEST PARSE EDGES:\n";
86				foreach my $edge ( @{ $sentence_best_parse->edges } ) {
87				print $edge->parent->ord . " -> "
88				. $edge->child->ord . "\n";
89				}
90				}
91
92				# min ||w_i+1 - w_i|| s.t. ...
93				$self->mira_update(
94				$sentence_correct_parse,
95				$sentence_best_parse,
96				$sumUpdateWeight,
97				$forbid_new_features
98				);
99
100				return;
101
102				}
103
104				sub mira_update {
105				# one MIRA step: adjust weights if the correct parse does not win by the margin
106				# (Treex::Tool::Parser::MSTperl::Sentence $sentence_correct_parse,
107				# Treex::Tool::Parser::MSTperl::Sentence $sentence_best_parse,
108				# Int $sumUpdateWeight, Bool $forbid_new_features)
109				my (
110				$self,
111				$sentence_correct_parse,
112				$sentence_best_parse,
113				$sumUpdateWeight,
114				$forbid_new_features
115				) = @_;
116
117				# s(x_t, y_t)
118				my $score_correct = $self->model->score_sentence($sentence_correct_parse);
119
120				# s(x_t, y')
121				my $score_best = $self->model->score_sentence($sentence_best_parse);
122
123				# difference in scores should be greater than the margin:
124
125				# L(y_t, y') number of incorrectly assigned heads
126				my $margin = $sentence_best_parse->count_errors_attachement(
127				$sentence_correct_parse
128				);
129
130				# s(x_t, y_t) - s(x_t, y') this should be zero or less
131				my $score_gain = $score_correct - $score_best;
132
133				# L(y_t, y') - [s(x_t, y_t) - s(x_t, y')]
134				my $error = $margin - $score_gain;
135				# a positive error means the margin is violated and the weights need an update
136				if ( $error > 0 ) {
137				my ( $features_diff_correct, $features_diff_best, $features_diff_count )
138				= $self->features_diff(
139				$sentence_correct_parse->features,
140				$sentence_best_parse->features,
141				$forbid_new_features
142				);
143				# without any differing features there is nothing to update
144				if ( $features_diff_count == 0 ) {
145				warn "Features of the best parse and the correct parse do not " .
146				"differ, unable to update the scores. " .
147				"Consider using more features.\n";
148				if ( $self->config->DEBUG >= 3 ) {
149				print "alpha: 0 on 0 features\n";
150				}
151				} else {
152				# spread the error uniformly over the differing feature occurrences
153				# min ||w_i+1 - w_i|| s.t. s(x_t, y_t) - s(x_t, y') >= L(y_t, y')
154				my $update = $error / $features_diff_count;
155
156				# $update is added to features occurring in the correct parse only
157				foreach my $feature ( @{$features_diff_correct} ) {
158				$self->update_feature_weight(
159				$feature,
160				$update,
161				$sumUpdateWeight
162				);
163				}
164
165				# and subtracted from features occurring
166				# in the best (and incorrect) parse only
167				foreach my $feature ( @{$features_diff_best} ) {
168				$self->update_feature_weight(
169				$feature,
170				-$update,
171				$sumUpdateWeight
172				);
173				}
174				if ( $self->config->DEBUG >= 3 ) {
175				print "alpha: $update on $features_diff_count features\n";
176				}
177				}
178				} else { #else no need to optimize
179				if ( $self->config->DEBUG >= 3 ) {
180				print "alpha: 0 on 0 features\n";
181				}
182				}
183
184				return;
185				}
186
187				sub features_diff {
188				# splits the features in which two parses differ by the parse they prevail in
189				# (ArrayRef[Str] $features_first, ArrayRef[Str] $features_second, Bool $forbid_new_features)
190				my ( $self, $features_first, $features_second, $forbid_new_features ) = @_;
191
192				# get feature counts: occurrences in the first minus occurrences in the second
193				my %feature_counts;
194				foreach my $feature ( @{$features_first} ) {
195				$feature_counts{$feature}++;
196				}
197				foreach my $feature ( @{$features_second} ) {
198				$feature_counts{$feature}--;
199				}
200
201				# TODO: try to disregard features which occur in both parses?
202
203				# do the diff
204				my @features_first;
205				my @features_second;
206				my $diff_count = 0;
207				FF: foreach my $feature ( keys %feature_counts ) {
208				if ( $forbid_new_features && $self->model->feature_is_unknown($feature) ) {
209				next FF; # features unknown to the model are left out when new ones are forbidden
210				}
211				if ( $feature_counts{$feature} ) {
212				my $count = abs( $feature_counts{$feature} );
213
214				# create arrays of differing features,
215				# each differing feature is included ONCE ONLY
216				# because an optimization of update is not present
217				# and the update makes uniform changes to all differing features,
218				# in which case even repeated features should be updated ONCE ONLY
219
220				# more often in the first array
221				if ( $feature_counts{$feature} > 0 ) {
222
223				# for ( my $i = 0; $i < $count; $i++ ) {
224				push @features_first, $feature;
225
226				# }
227
228				# more often in the second array
229				} else {
230
231				# for ( my $i = 0; $i < $count; $i++ ) {
232				push @features_second, $feature;
233
234				# }
235				}
236				$diff_count += $count; # counts every differing occurrence, unlike the arrays
237				} # else same count -> no difference
238				}
239				# two array refs of differing features and the total difference in occurrences
240				return ( \@features_first, \@features_second, $diff_count );
241				}
242
243				# update weight of the feature
244				# (also update the sum of feature weights: feature_weights_summed)
245				sub update_feature_weight {
246				# returns whatever the model's update_feature_weight() returns
247				# (Str $feature, Num $update, Num $sumUpdateWeight)
248				my ( $self, $feature, $update, $sumUpdateWeight ) = @_;
249
250				# adds $update to the current weight of the feature
251				my $result =
252				$self->model->update_feature_weight( $feature, $update );
253
254				# v = v + w_{i+1}
255				# $sumUpdateWeight denotes number of summands
256				# in which the weight would appear
257				# if it were computed according to the definition
258				my $summed_update = $sumUpdateWeight * $update;
259				$self->feature_weights_summed->{$feature} += $summed_update;
260
261				return $result;
262				}
263
264				# recompute the weight of every summed feature as an average
265				# (using feature_weights_summed)
266				sub scores_averaging {
267				# sets the model weight of each summed feature to its average; returns nothing
268				# (no arguments)
269				my ($self) = @_;
270
271				foreach my $feature ( keys %{ $self->feature_weights_summed } ) {
272
273				# w = v/(N * T)
274				# see also: my $self->number_of_inner_iterations =
275				# $self->number_of_iterations * $sentence_count;
276
277				my $weight = $self->feature_weights_summed->{$feature}
278				/ $self->number_of_inner_iterations;
279				$self->model->set_feature_weight( $feature, $weight );
280
281				# only progress and/or debug info
282				if ( $self->config->DEBUG >= 2 ) {
283				print "$feature\t" . $self->model->get_feature_weight($feature)
284				. "\n";
285
286				}
287				}
288
289				return;
290				}
291
292				1;
293
294				__END__
295
296				=pod
297
298				=for Pod::Coverage BUILD
299
300				=encoding utf-8
301
302				=head1 NAME
303
304				Treex::Tool::Parser::MSTperl::TrainerUnlabelled
305
306				=head1 VERSION
307
308				version 0.11949
309
310				=head1 DESCRIPTION
311
312				Trains on correctly parsed sentences and so creates and tunes the model.
313				Uses single-best MIRA (McDonald et al., 2005, Proc. HLT/EMNLP)
314
315				=head1 FIELDS
316
317				=over 4
318
319				=item parser
320
321				Reference to an instance of L<Treex::Tool::Parser::MSTperl::Parser> which is
322				used for the training.
323
324				=item model
325
326				Reference to an instance of L<Treex::Tool::Parser::MSTperl::ModelUnlabelled>
327				which is being trained.
328
329				=back
330
331				=head1 METHODS
332
333				The C<sumUpdateWeight> is a number by which the change of the feature weights
334				is multiplied in the sum of the weights, so that at the end of the algorithm
335				the sum corresponds to its formal definition, which is a sum of all weights
336				after each of the updates. C<sumUpdateWeight> is a member of a sequence going
337				from N*T to 1, where N is the number of iterations
338				(L<Treex::Tool::Parser::MSTperl::FeaturesControl/number_of_iterations>, C<10>
339				by default) and T being the number of sentences in training data, N*T thus
340				being the number of inner iterations, i.e. how many times C<mira_update()> is
341				called.
342
343				=over 4
344
345				=item $trainer->train($training_data);
346
347				Trains the model, using the settings from C<config> and the training
348				data in the form of a reference to an array of parsed sentences
349				(L<Treex::Tool::Parser::MSTperl::Sentence>), which can be obtained by the
350				L<Treex::Tool::Parser::MSTperl::Reader>.
351
352				=item $self->mira_update($sentence_correct_parse, $sentence_best_parse,
353				$sumUpdateWeight, $forbid_new_features)
354
355				Performs one update of the MIRA (Margin-Infused Relaxed Algorithm) on one
356				sentence from the training data. Its input is the correct parse of the sentence
357				(from the training data) and the best scoring parse created by the parser. If C<$forbid_new_features> is true, features unknown to the model are left out of the update.
358
359				=item my ( $features_diff_1, $features_diff_2, $features_diff_count ) =
360				$self->features_diff( $features_1, $features_2, $forbid_new_features );
361
362				Compares features of two parses of a sentence, where the features
363				(C<$features_1>, C<$features_2>) are represented as a reference to
364				an array of strings representing the features
365				(the same feature might be present repeatedly, all occurrences of the same
366				feature are summed together).
367
368				Features that appear the same number of times in both parses are disregarded, and so are features unknown to the model if C<$forbid_new_features> is true.
369
370				The first two returned values (C<$features_diff_1>, C<$features_diff_2>)
371				are array references,
372				C<$features_diff_1> containing features that appear in the first parse
373				(C<$features_1>) more often than in the second parse (C<$features_2>),
374				and vice versa for C<$features_diff_2>.
375				Each differing feature is contained only once, regardless of how large the
376				difference in its number of occurrences is, e.g. if the feature
377				C<TAG|tag:NN|NN> appears 5 times in the first parse and 8 times in the second
378				parse, then C<$features_diff_2> will contain C<'TAG|tag:NN|NN'> exactly once.
379
380				The third returned value (C<$features_diff_count>) is the total difference in
381				feature occurrences between the parses, i.e. the sum of the absolute differences
382				in occurrence counts over all differing features (3 for the feature in the example above).
383
384				=item $self->update_feature_weight( $feature, $update, $sumUpdateWeight )
385
386				Updates weight of C<$feature> by C<$update>
387				(which might be positive or negative)
388				and also updates the sum of updates of the feature
389				(which is later used for overtraining avoidance),
390				multiplied by C<$sumUpdateWeight>, which is simply a count of inner iterations
391				yet to be performed (thus eliminating the need to update the sum on each
392				inner iteration).
393
394				=back
395
396				=head1 AUTHORS
397
398				Rudolf Rosa <rosa@ufal.mff.cuni.cz>
399
400				=head1 COPYRIGHT AND LICENSE
401
402				Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles
403				University in Prague
404
405				This module is free software; you can redistribute it and/or modify it under
406				the same terms as Perl itself.