File Coverage

blib/lib/Treex/Tool/Parser/MSTperl/TrainerBase.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             package Treex::Tool::Parser::MSTperl::TrainerBase;
2             {
3             $Treex::Tool::Parser::MSTperl::TrainerBase::VERSION = '0.11949';
4             }
5              
6 1     1   2324 use Moose;
  0            
  0            
7             use Carp;
8              
9             has config => (
10             isa => 'Treex::Tool::Parser::MSTperl::Config',
11             is => 'ro',
12             required => '1',
13             );
14              
15             # to be filled in extending packages!
16             has model => (
17             isa => 'Treex::Tool::Parser::MSTperl::ModelBase',
18             is => 'rw',
19             );
20              
21             # to be filled in extending packages!
22             has featuresControl => (
23             isa => 'Treex::Tool::Parser::MSTperl::FeaturesControl',
24             is => 'rw',
25             );
26              
27             # to be filled in extending packages!
28             has number_of_iterations => (
29             isa => 'Int',
30             is => 'rw',
31             );
32              
33             has number_of_inner_iterations => (
34             isa => 'Int',
35             is => 'rw',
36             );
37              
38             has skip_scores_averaging => (
39             is => 'rw',
40             isa => 'Bool',
41             default => 0
42             );
43              
44             # TRAINING COMMON SUBS
45              
46             sub train_dev {
47             my ( $self, $training_data, $dev_data ) = @_;
48              
49             $self->train( $training_data, 0 );
50             my $feature_count = $self->train( $dev_data, 1 );
51              
52             return $feature_count;
53             }
54              
55             sub train_2parts {
56             my ( $self, $training_data, $dev_data ) = @_;
57              
58             $self->train( $training_data, 0 );
59             my $feature_count = $self->train( $dev_data, 0 );
60              
61             return $feature_count;
62             }
63              
64             sub train {
65              
66             # (ArrayRef[Treex::Tool::Parser::MSTperl::Sentence] $training_data
67             # Bool $unlabelled)
68             # Training data: T = {(x_t, y_t)} t=1..T
69             my ( $self, $training_data, $forbid_new_features ) = @_;
70              
71             # number of sentences in training data
72             my $sentence_count = scalar( @{$training_data} );
73              
74             # how many times $self->mira_update() will be called
75             $self->number_of_inner_iterations(
76             $self->number_of_iterations * $sentence_count
77             );
78              
79             # only progress and/or debug info
80             if ( $self->config->DEBUG >= 1 ) {
81             print "Going to train on $sentence_count sentences with "
82             . $self->number_of_iterations . " iterations.\n";
83             }
84              
85             # precompute features of sentences in training data
86             # in labelled parsing also gets the list of labels
87             # and computes the transition probs
88             $self->preprocess_sentences($training_data);
89              
90             # do the training
91             if ( $self->config->DEBUG >= 1 ) {
92             print "Training the model...\n";
93             }
94             my $innerIteration = 0;
95              
96             # for n : 1..N
97             for (
98             my $iteration = 1;
99             $iteration <= $self->number_of_iterations;
100             $iteration++
101             )
102             {
103             if ( $self->config->DEBUG >= 1 ) {
104             print " Iteration number $iteration of "
105             . $self->number_of_iterations . "...\n";
106             }
107             my $sentNo = 0;
108              
109             # for t : 1..T # these are the inner iterations
110             foreach my $sentence_correct ( @{$training_data} ) {
111              
112             # weight of weights/scores sum update <N*T .. 1>;
113             # $sumUpdateWeight denotes number of summands
114             # in which the new value would appear
115             # if it were computed according to the definition
116             my $sumUpdateWeight =
117             $self->number_of_inner_iterations - $innerIteration;
118              
119             # update on this instance
120             $self->update( $sentence_correct, $sumUpdateWeight, $forbid_new_features );
121              
122             # $innerIteration = ( $iteration - 1 ) * $sentence_count + $sentNo;
123             $innerIteration++;
124              
125             # only progress and/or debug info
126             if ( $self->config->DEBUG >= 1 ) {
127             $sentNo++;
128             if ( $sentNo % 50 == 0 ) {
129             print " $sentNo/$sentence_count sentences processed " .
130             "(iteration $iteration/"
131             . $self->number_of_iterations
132             . ")\n";
133             }
134             }
135              
136             } # end for inner iterations
137             } # end for $iteration
138              
139             # only progress and/or debug info
140             if ( $self->config->DEBUG >= 1 ) {
141             print "Done.\n";
142             }
143             if ( $self->config->DEBUG >= 2 ) {
144             print "FINAL FEATURE WEIGHTS:\n";
145             }
146              
147             if ( !$self->skip_scores_averaging ) {
148              
149             # average the model (is said to help overfitting)
150             $self->scores_averaging();
151             }
152              
153             # only progress and/or debug info
154             my $feature_count = $self->model->get_feature_count();
155             if ( $self->config->DEBUG >= 1 ) {
156             print "Model trained with $feature_count features.\n";
157             }
158              
159             return $feature_count;
160              
161             } # end sub train
162              
163             # precompute features of sentences in training data
164             sub preprocess_sentences {
165              
166             # (ArrayRef[Treex::Tool::Parser::MSTperl::Sentence] $training_data
167             # Bool $unlabelled)
168             my ( $self, $training_data ) = @_;
169              
170             # only progress and/or debug info
171             if ( $self->config->DEBUG >= 1 ) {
172             print "Computing sentence features...\n";
173             }
174              
175             my $sentence_count = scalar( @{$training_data} );
176             my $sentNo = 0;
177              
178             foreach my $sentence_correct ( @{$training_data} ) {
179              
180             # compute sentence features
181             # in labelled parsing also gets the list of labels
182             # and computes the transition probs
183             $sentNo++;
184             $self->preprocess_sentence(
185             $sentence_correct, $sentNo / $sentence_count
186             );
187              
188             # only progress and/or debug info
189             if ( $self->config->DEBUG >= 1 ) {
190             if ( $sentNo % 50 == 0 ) {
191             print " $sentNo/$sentence_count sentences "
192             . "processed (computing features)\n";
193             }
194             }
195             if ( $self->config->DEBUG >= 3 ) {
196             print "SENTENCE FEATURES:\n";
197             foreach my $feature ( @{ $sentence_correct->features } ) {
198             print "$feature\n";
199             }
200             print "CORRECT EDGES:\n";
201             foreach my $edge ( @{ $sentence_correct->edges } ) {
202             print $edge->parent->ord . " -> " . $edge->child->ord . "\n";
203             }
204             print "CORRECT LABELS:\n";
205             foreach my $node ( @{ $sentence_correct->nodes_with_root } ) {
206             print $node->ord . "/" . $node->label . "\n";
207             }
208             }
209              
210             }
211              
212             $self->model->prepare_for_mira($self);
213              
214             if ( $self->config->DEBUG >= 1 ) {
215             print "Done.\n";
216             }
217              
218             return;
219             }
220              
221             # ABSTRACT TRAINING SUB STUBS (TO BE REDEFINED IN DESCENDED PACKAGES)
222              
223             # compute the features of the sentence
224             # in labelling also used to get the list of labels and of transition probs
225             sub preprocess_sentence {
226              
227             # (Treex::Tool::Parser::MSTperl::Sentence $sentence, Num $progress)
228             # my ( $self, $sentence, $progress ) = @_;
229              
230             croak 'TrainerBase::preprocess_sentence is an abstract method,'
231             . ' it must be called'
232             . ' either from TrainerUnlabelled or TrainerLabelling!';
233             }
234              
235             sub update {
236              
237             # (Treex::Tool::Parser::MSTperl::Sentence $sentence_correct,
238             # Int $sumUpdateWeight)
239             # my ( $self, $sentence_correct, $sumUpdateWeight ) = @_;
240              
241             croak 'TrainerBase::update is an abstract method, it must be called'
242             . ' either from TrainerUnlabelled or TrainerLabelling!';
243             }
244              
245             # sub mira_update {
246             #
247             # # (Treex::Tool::Parser::MSTperl::Sentence $sentence_correct,
248             # # Treex::Tool::Parser::MSTperl::Sentence $sentence_best,
249             # # Int $sumUpdateWeight)
250             # # my ( $self, $sentence_correct, $sentence_best, $sumUpdateWeight ) = @_;
251             #
252             # croak 'TrainerBase::mira_update is an abstract method, it must be called'
253             # . ' either from TrainerUnlabelled or TrainerLabelling!';
254             # }
255              
256             # recompute feature weights/scores as averages
257             sub scores_averaging {
258              
259             # my ($self) = @_;
260              
261             croak 'TrainerBase::scores_averaging is an abstract method, it '
262             . 'must be called either from TrainerUnlabelled or TrainerLabelling!';
263              
264             }
265              
266             # MODEL STORING
267              
268             sub store_model {
269              
270             my ( $self, $filename ) = @_;
271              
272             $self->model->store($filename);
273              
274             return;
275             }
276              
277             sub store_model_tsv {
278              
279             my ( $self, $filename ) = @_;
280              
281             $self->model->store_tsv($filename);
282              
283             return;
284             }
285              
286             1;
287              
288             __END__
289              
290             =pod
291              
292             =for Pod::Coverage BUILD
293              
294             =encoding utf-8
295              
296             =head1 NAME
297              
298             Treex::Tool::Parser::MSTperl::TrainerBase
299              
300             =head1 VERSION
301              
302             version 0.11949
303              
304             =head1 DESCRIPTION
305              
306             Trains on correctly parsed sentences and so creates and tunes the model.
307             Uses single-best MIRA (McDonald et al., 2005, Proc. HLT/EMNLP)
308              
309             =head1 FIELDS
310              
311             =over 4
312              
313             =item config
314              
315             Reference to the instance of L<Treex::Tool::Parser::MSTperl::Config>.
316              
317             =back
318              
319             =head1 METHODS
320              
321             =over 4
322              
323             =item TODO
324              
325             =back
326              
327             =head1 AUTHORS
328              
329             Rudolf Rosa <rosa@ufal.mff.cuni.cz>
330              
331             =head1 COPYRIGHT AND LICENSE
332              
333             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles
334             University in Prague
335              
336             This module is free software; you can redistribute it and/or modify it under
337             the same terms as Perl itself.