File Coverage

blib/lib/Treex/Tool/Parser/MSTperl/Config.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             package Treex::Tool::Parser::MSTperl::Config;
2             {
3             $Treex::Tool::Parser::MSTperl::Config::VERSION = '0.11949';
4             }
5              
6 1     1   50759 use Moose;
  0            
  0            
7             use autodie;
8             use Carp;
9             use File::Spec;
10              
11             use Treex::Tool::Parser::MSTperl::FeaturesControl;
12             use Treex::Tool::Parser::MSTperl::ModelAdditional;
13              
# Verbosity of debug output:
# 0 = no debug info, 1 = progress messages (this is the default setting),
# 2, 3, 4 and 5 = more and more debug info
has DEBUG => (
    isa     => 'Int',
    is      => 'rw',
    default => '1',
);

# ----- Viterbi settings -----

has SEQUENCE_BOUNDARY_LABEL => (
    isa     => 'Str',
    is      => 'rw',
    default => '###',
);

has VITERBI_STATES_NUM_THRESHOLD => (
    isa     => 'Int',
    is      => 'rw',
    default => 5,
);

# Stopping criterion of the EM algorithm: the algorithm stops as soon as
# the sum of changes of the smoothing parameters drops below this epsilon.
has EM_EPSILON => (
    isa     => 'Num',
    is      => 'rw',
    default => 0.00001,
);

# Steepness of the sigmoid.
# (default => 0.0015 was probably good for data as they used to be :-)
has SIGM_LAMBDA => (
    isa     => 'Num',
    is      => 'rw',
    default => 1,
);

# added to emission probs to make them non-negative
# has 'EMISSIONS_SHIFT' => (
#     is => 'rw',
#     isa => 'Int',
#     default => 500,
# );

# Position in the training data where the heldout data for the EM algorithm
# start: a number between 0 and 1, e.g. 0.75 means that the first 75% of
# sentences are training data and the last 25% are heldout data.
has EM_heldout_data_at => (
    isa     => 'Num',
    is      => 'rw',
    default => 0.9,
);

# Path of the configuration file; must be passed to the constructor.
has config_file => (
    isa      => 'Str',
    is       => 'ro',
    required => '1',
);

has unlabelledFeaturesControl => (
    is  => 'rw',
    isa => 'Maybe[Treex::Tool::Parser::MSTperl::FeaturesControl]',
);

has labelledFeaturesControl => (
    is  => 'rw',
    isa => 'Maybe[Treex::Tool::Parser::MSTperl::FeaturesControl]',
);

# has 'imlabelledFeaturesControl' => (
#     isa => 'Maybe[Treex::Tool::Parser::MSTperl::FeaturesControl]',
#     is => 'rw',
# );

# ----- CONFIGURATION -----

# only assigning is_member (as opposed to afun labelling)
# has 'is_member_labelling' => (
#     is => 'ro',
#     isa => 'Bool',
#     default => '0',
# );

# Training mode (true) or parsing mode (false); parsing mode is the default.
has training => (
    isa     => 'Bool',
    is      => 'ro',
    default => '0',
);

# has 'ord_field_index' => (
#     is => 'rw',
#     isa => 'Int',
# );

# Just temporary, until it is found out which algorithm is the best one.
has labeller_algorithm => (
    isa     => 'Int',
    is      => 'rw',
    default => '0',
);

# Name of the parent ord field; setting it fires _parent_ord_set.
has parent_ord => (
    isa     => 'Str',
    is      => 'rw',
    trigger => \&_parent_ord_set,
);
128              
# Trigger of the parent_ord attribute: converts the newly assigned field
# name to its index via field_name2index() and stores the result in
# parent_ord_field_index. Returns nothing.
sub _parent_ord_set {
    my ( $self, $parent_ord ) = @_;

    $self->parent_ord_field_index( $self->field_name2index($parent_ord) );

    return;
}
139              
# Index of the parent ord field; written by _parent_ord_set.
has parent_ord_field_index => (
    isa => 'Int',
    is  => 'rw',
);

# Name of the label field; setting it fires _label_set.
has label => (
    isa     => 'Str',
    is      => 'rw',
    trigger => \&_label_set,
);
150              
# Trigger of the label attribute: converts the newly assigned field name
# to its index via field_name2index() and stores the result in
# label_field_index. Returns nothing.
sub _label_set {
    my ( $self, $label ) = @_;

    $self->label_field_index( $self->field_name2index($label) );

    return;
}
161              
# Index of the label field; written by _label_set.
has label_field_index => (
    isa => 'Maybe[Int]',
    is  => 'rw',

    # default => 'undef',
);

# has 'ismember' => (
#     is => 'rw',
#     isa => 'Str',
#     trigger => \&_ismember_set,
# );

# sets ismember_field_index
# sub _ismember_set {
#     my ( $self, $ismember ) = @_;
#
#     # set index of ismember field
#     my $ismember_index = $self->field_name2index($ismember);
#     $self->ismember_field_index($ismember_index);
#
#     return;
# }

# has 'ismember_field_index' => (
#     is => 'rw',
#     isa => 'Maybe[Int]',
#
#     # default => 'undef',
# );

# Root field values; the count is validated by _root_field_values_set
# whenever the attribute is set.
has root_field_values => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    default => sub { [] },
    trigger => \&_root_field_values_set,
);
199              
# Trigger of the root_field_values attribute: croaks unless the number of
# root field values equals field_names_count. Returns nothing on success.
sub _root_field_values_set {
    my ($self) = @_;

    my $values_count = @{ $self->root_field_values };

    # the counts agree -> nothing to complain about
    return if $values_count == $self->field_names_count;

    croak "MSTperl config file error: "
        . "Incorrect number of root field values ($values_count), "
        . "must be same as number of field names ("
        . $self->field_names_count . ")!";
}
215              
has number_of_iterations => (
    is      => 'rw',
    isa     => 'Int',
    default => 3,
);

has labeller_number_of_iterations => (
    is      => 'rw',
    isa     => 'Int',
    default => 3,
);

# has 'imlabeller_number_of_iterations' => (
#     isa => 'Int',
#     is => 'rw',
#     default => 3,
# );

# The edge features caches are turned off by default to fit into RAM;
# turn them on if training with a lot of RAM or on small training data.
# They are turned off when parsing (does not make any sense for parsing).

has use_edge_features_cache => (
    isa     => 'Bool',
    is      => 'rw',
    default => '0',
);

has labeller_use_edge_features_cache => (
    isa     => 'Bool',
    is      => 'rw',
    default => '0',
);

# has 'imlabeller_use_edge_features_cache' => (
#     is => 'rw',
#     isa => 'Bool',
#     default => '0',
# );

# ----- Distance buckets -----

# Bucket definitions; setting them fires _distance_buckets_set.
has distance_buckets => (
    isa     => 'ArrayRef[Int]',
    is      => 'rw',
    default => sub { [] },
    trigger => \&_distance_buckets_set,
);
264              
# Trigger of the distance_buckets attribute: sets distance2bucket,
# maxBucket and minBucket.
#
# $distance_buckets is an array ref of positive integers in any order.
# Every bucket B is mapped to itself (and -B to -B); every distance strictly
# between two buckets is mapped to the highest lower bucket, with the sign
# preserved for negative distances. maxBucket is the largest bucket and
# minBucket its negation.
#
# Croaks on a non-positive bucket, warns on (and ignores) a repeated bucket,
# and warns and adds bucket 1 if it is missing. Returns nothing.
sub _distance_buckets_set {
    my ( $self, $distance_buckets ) = @_;

    my %distance2bucket;

    # find maximal bucket & partly fill %distance2bucket
    my $maxBucket = 0;
    foreach my $bucket ( @{$distance_buckets} ) {

        # validity is checked first: the mirrored negative keys stored below
        # would otherwise make e.g. '-2' after '2' look like a mere duplicate
        if ( $bucket <= 0 ) {
            croak "MSTperl config file error: " .
                "Error on bucket '$bucket' - " .
                "buckets must be positive integers.";
        }

        if ( $distance2bucket{$bucket} ) {
            warn "Bucket '$bucket' is defined more than once; " .
                "disregarding its later definitions.\n";
            next;
        }

        $distance2bucket{$bucket} = $bucket;
        $distance2bucket{ -$bucket } = -$bucket;
        if ( $bucket > $maxBucket ) {
            $maxBucket = $bucket;
        }
    }

    if ( !$distance2bucket{1} ) {
        warn "Bucket '1' is not defined, which does not make any sense; " .
            "adding definition of bucket '1'.\n";
        $distance2bucket{1}  = 1;
        $distance2bucket{-1} = -1;

        # only possible with an empty bucket list: the added bucket
        # is then also the maximal one
        if ( $maxBucket < 1 ) {
            $maxBucket = 1;
        }
    }

    # set maxBucket and minBucket
    $self->maxBucket($maxBucket);
    $self->minBucket( -$maxBucket );

    # fill %distance2bucket for the distances between bucket 1 and maxBucket
    my $lastBucket = 1;
    foreach my $distance ( 2 .. $maxBucket - 1 ) {
        if ( $distance2bucket{$distance} ) {

            # the distance defines a bucket
            $lastBucket = $distance2bucket{$distance};
        } else {

            # the distance falls into the highest lower bucket
            $distance2bucket{$distance} = $lastBucket;
            $distance2bucket{ -$distance } = -$lastBucket;
        }
    }
    $self->distance2bucket( \%distance2bucket );

    return;
}
319              
# Mapping of distances to buckets; written by _distance_buckets_set.
# If a mapping is not found in the hash, maxBucket or minBucket is used.
has distance2bucket => (
    isa     => 'HashRef[Int]',
    is      => 'rw',
    default => sub { {} },
);

# Any higher distance falls into this bucket.
has maxBucket => (
    is      => 'rw',
    isa     => 'Int',
    default => '9',
);

# Any lower distance falls into this bucket;
# distance is signed (ORD minus ord).
has minBucket => (
    is      => 'rw',
    isa     => 'Int',
    default => '-9',
);

# ----- FIELDS -----

# Field names (for conversion of field index to field name);
# setting them fires _field_names_set.
has field_names => (
    isa     => 'ArrayRef[Str]',
    is      => 'rw',
    default => sub { [] },
    trigger => \&_field_names_set,
);
353              
# Trigger of the field_names attribute: checks the field names and sets
# field_names_count, field_names_hash and field_indexes.
#
# $field_names is an array ref of field names. Croaks if a name is
# duplicated, is not lowercase, or does not contain any character from
# [a-z]. Returns nothing.
sub _field_names_set {
    my ( $self, $field_names ) = @_;

    my %field_names_hash;
    my %field_indexes;
    foreach my $index ( 0 .. $#{$field_names} ) {
        my $field_name = $field_names->[$index];
        if ( $field_names_hash{$field_name} ) {
            croak "MSTperl config file error: " .
                "Duplicate field name '$field_name'!";
        } elsif ( $field_name ne lc($field_name) ) {
            croak "MSTperl config file error: " .
                "Field name '$field_name' is not lowercase!";
        } elsif ( $field_name !~ /[a-z]/ ) {

            # Note: this must be '!~' with a character class; the form
            # '!$field_name =~ /a-z/' negates the name first and then looks
            # for the literal text 'a-z', so it can never report an error.
            croak "MSTperl config file error: " .
                "Field name '$field_name' does not contain " .
                "any character from [a-z]!";
        } else {
            $field_names_hash{$field_name} = 1;
            $field_indexes{$field_name}    = $index;
        }
    }

    $self->field_names_count( scalar( @{$field_names} ) );
    $self->field_names_hash( \%field_names_hash );
    $self->field_indexes( \%field_indexes );

    return;
}
384              
# Number of field names; written by _field_names_set.
has field_names_count => (
    isa     => 'Int',
    is      => 'rw',
    default => '0',
);

# 1 for each field name to easily check if a field name exists
has field_names_hash => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { {} },
);

# index of each field name in field_names
# (for conversion of field name to field index)
has field_indexes => (
    isa     => 'HashRef[Str]',
    is      => 'rw',
    default => sub { {} },
);

has 'lossFunction' => (
    isa     => 'Str',
    is      => 'rw',
    default => '',
);

# ----- additional PMI model -----

has 'use_pmi' => (
    isa     => 'Bool',
    is      => 'rw',
    default => 0,
);

has 'pmi_model_file' => (
    isa     => 'Str',
    is      => 'rw',
    default => '',
);

has 'pmi_model_format' => (
    isa     => 'Str',
    is      => 'rw',
    default => 'tsv',
);

has pmi_buckets => (
    isa     => 'Maybe[ArrayRef[Int]]',
    is      => 'rw',
    default => undef,
);

# ----- additional cprob model -----

has 'use_cprob' => (
    isa     => 'Bool',
    is      => 'rw',
    default => 0,
);

has 'cprob_model_file' => (
    isa     => 'Str',
    is      => 'rw',
    default => '',
);

has 'cprob_model_format' => (
    isa     => 'Str',
    is      => 'rw',
    default => 'tsv',
);

has cprob_buckets => (
    isa     => 'Maybe[ArrayRef[Int]]',
    is      => 'rw',
    default => undef,
);
455              
456             # METHODS
457              
# Object initialization (BUILD hook): reads the YAML file named by
# config_file and copies its settings into the attributes of this object.
#
# Steps:
#  1. croaks (listing the files of the config file's directory) if the
#     config file does not exist,
#  2. reads the file with YAML::Tiny and croaks if that fails,
#  3. sets the attributes in a fixed order; croaks if a required setting
#     (field_names, root_field_values, parent_ord, distance_buckets) is
#     missing; optional settings that are missing or empty keep their defaults,
#  4. turns both edge features caches off unless in training mode,
#  5. builds the FeaturesControl objects for 'features' and
#     'labeller_features' (and loads the PMI / cprob models if requested),
#  6. croaks if no features were set at all.
#
# Prints progress messages if DEBUG >= 1. Returns nothing.
sub BUILD {
    my ($self) = @_;

    if ( $self->DEBUG >= 1 ) {
        print "Processing config file " . $self->config_file . "...\n";
    }

    # check if file exists
    unless ( -e $self->config_file ) {
        my ( $volume, $directory, $cfile ) =
            File::Spec->splitpath( $self->config_file );
        my $dir = File::Spec->catpath( $volume, $directory, '' );

        # a bare file name has no directory part; opendir('') would fail
        # and hide the real problem, so list the current directory instead
        if ( $dir eq '' ) {
            $dir = File::Spec->curdir();
        }

        opendir( my $dirhandle, $dir ) or croak $!;
        my @files = readdir($dirhandle);
        closedir($dirhandle);
        croak "The config file $cfile does not exists!\n" .
            "The directory $dir contains the following files: " .
            join ', ', @files;
    }

    use YAML::Tiny;
    my $config = YAML::Tiny->read( $self->config_file );
    if ( !$config ) {
        croak "MSTperl config file error: " . YAML::Tiny->errstr;
    }

    # the settings are taken from the first YAML document only
    my $settings = $config->[0] || {};

    # fields to set, in the order in which they are to be set
    my @fields = (
        'field_names',
        'root_field_values',
        'parent_ord',
        'distance_buckets',
        'label',
        'lossFunction',
        'use_pmi',
        'pmi_model_file',
        'pmi_model_format',
        'pmi_buckets',
        'use_cprob',
        'cprob_model_file',
        'cprob_model_format',
        'cprob_buckets',
        'use_edge_features_cache',
        'labeller_use_edge_features_cache',
        'number_of_iterations',
        'labeller_number_of_iterations',
        'labeller_algorithm',
        'DEBUG',
        'SEQUENCE_BOUNDARY_LABEL',
        'VITERBI_STATES_NUM_THRESHOLD',
        'EM_EPSILON',
        'EM_heldout_data_at',
    );

    # name => required?
    my %required_fields = (
        'field_names'       => 1,
        'root_field_values' => 1,
        'parent_ord'        => 1,
        'distance_buckets'  => 1,
    );

    foreach my $field (@fields) {
        my $value = $settings->{$field};

        if ( $required_fields{$field} ) {

            # required settings are names or lists, so any false value
            # (including 0) means that the setting is missing
            if ( !$value ) {
                croak "MSTperl config file error:"
                    . "Field $field must be set!";
            }
        } elsif ( !defined $value || ( !ref $value && $value eq '' ) ) {

            # not set -> OK (default value will be used);
            # an explicit 0 (e.g. 'DEBUG: 0') counts as set and is not
            # skipped here, otherwise a non-zero default could never be
            # overridden by 0
            next;
        }

        $self->$field($value);
    }

    # ignore some settings if in parsing-only mode
    if ( !$self->training ) {
        $self->use_edge_features_cache(0);
        $self->labeller_use_edge_features_cache(0);
    }

    # unlabelled features
    if ( $settings->{features} && @{ $settings->{features} } ) {
        $self->unlabelledFeaturesControl(
            Treex::Tool::Parser::MSTperl::FeaturesControl->new(
                'config'                    => $self,
                'feature_codes_from_config' => $settings->{features},
                'use_edge_features_cache'
                    => $self->use_edge_features_cache,
            )
        );

        if ( $self->use_pmi ) {
            my $pmi_model = Treex::Tool::Parser::MSTperl::ModelAdditional->new(
                config       => $self,
                model_file   => $self->pmi_model_file,
                model_format => $self->pmi_model_format,
                buckets      => $self->pmi_buckets,
            );

            # the model is attached only if load() reports success
            my $result = $pmi_model->load();
            if ($result) {
                $self->unlabelledFeaturesControl->pmi_model($pmi_model);
            }
        }

        if ( $self->use_cprob ) {
            my $cprob_model = Treex::Tool::Parser::MSTperl::ModelAdditional->new(
                config       => $self,
                model_file   => $self->cprob_model_file,
                model_format => $self->cprob_model_format,
                buckets      => $self->cprob_buckets,
            );

            # the model is attached only if load() reports success
            my $result = $cprob_model->load();
            if ($result) {
                $self->unlabelledFeaturesControl->cprob_model($cprob_model);
            }
        }
    }

    # labeller features
    if ($settings->{labeller_features}
        && @{ $settings->{labeller_features} }
        )
    {
        $self->labelledFeaturesControl(
            Treex::Tool::Parser::MSTperl::FeaturesControl->new(
                'config' => $self,
                'feature_codes_from_config'
                    => $settings->{labeller_features},
                'use_edge_features_cache'
                    => $self->labeller_use_edge_features_cache,
            )
        );
    }

    # imlabeller features
    # if ($config->[0]->{imlabeller_features}
    #     && @{ $config->[0]->{imlabeller_features} }
    #     )
    # {
    #     $self->imlabelledFeaturesControl(
    #         Treex::Tool::Parser::MSTperl::FeaturesControl->new(
    #             'config' => $self,
    #             'feature_codes_from_config'
    #                 => $config->[0]->{imlabeller_features},
    #             'use_edge_features_cache'
    #                 => $self->imlabeller_use_edge_features_cache,
    #         )
    #     );
    # }

    if (!$self->unlabelledFeaturesControl
        && !$self->labelledFeaturesControl

        # && !$self->imlabelledFeaturesControl
        )
    {
        croak "MSTperl config file error: No features set!";
    }

    if ( $self->DEBUG >= 1 ) {
        print "Done." . "\n";
    }

    return;
}
633              
# Converts a field name to the index of that field.
#
# $field_name may be:
#  - an array ref (multiarg feature): each member is converted recursively
#    and a new array ref with the results is returned,
#  - a name present in field_names_hash: its entry in field_indexes is
#    returned,
#  - an integer (optionally negative): returned unchanged.
# Anything else makes the method croak.
sub field_name2index {
    my ( $self, $field_name ) = @_;

    # multiarg feature -> convert the members one by one
    if ( ref $field_name eq 'ARRAY' ) {
        return [ map { $self->field_name2index($_) } @{$field_name} ];
    }

    # a known field name -> return the field index
    return $self->field_indexes->{$field_name}
        if $self->field_names_hash->{$field_name};

    # not an actual field name but an integer argument -> keep it
    return $field_name
        if $field_name =~ /^-?[0-9]+$/;

    croak "Unknown field '$field_name', quiting.";
}
659              
660             1;
661              
662             __END__
663              
664              
665              
666              
667              
668              
669              
670              
671              
672             =pod
673              
674             =for Pod::Coverage BUILD
675              
676             =encoding utf-8
677              
678             =head1 NAME
679              
680             Treex::Tool::Parser::MSTperl::Config
681              
682             =head1 VERSION
683              
684             version 0.11949
685              
686             =head1 DESCRIPTION
687              
688             Handles the configuration of the parser.
689              
690             =head1 FIELDS
691              
692             =head2 Data fields
693              
694             Fields describing fields used with nodes, such as form, pos, lemma...
695              
696             =over 4
697              
698             =item field_names (ArrayRef[Str])
699              
700             Field names (for conversion of field index to field name)
701              
702             =item field_names_hash (HashRef[Str])
703              
704             1 for each field name to easily check if a field name exists
705              
706             =item field_indexes (HashRef[Str])
707              
708             Index of each field name in field_names (for conversion of field name to field
709             index)
710              
711             =back
712              
713             =head2 Settings
714              
715             Most of the settings are set by a config file in YAML format.
716             However, you do not have to understand YAML to be able to change the
717             settings provided that you keep things like formatting of the file unchanged
718             (some whitespace is significant etc.). Actually, only a subset of
719             all that YAML provides is used.
720              
721             Contents of a line from the # character till the end of the line are comments
722             and are ignored (if you need to actually use the # sign, you can quote it -
723             eg. C<'#empty#'> is interpreted as C<#empty#>). Lines that contain only
724             whitespace chars or are empty are ignored as well.
725              
726             Some of the settings are ignored when in parsing mode (i.e. not training).
727             These are use_edge_features_cache (turned off) and number_of_iterations
728             (irrelevant).
729              
730             These are settings which are acquired from the configuration file:
731              
732             =head3 Required Settings
733              
734             =over 4
735              
736             =item field_names
737              
738             Lowercase names of fields in the input file
739             (the data fields are to be separated by tabs in the input file).
740             Use [a-z0-9_] only, using always at least one letter.
741             Use unique names, i.e. devise some names even for unused fields.
742              
743             =item root_field_values
744              
745             Field values to set for the (technical) root node.
746              
747             =item parent_ord
748              
749             Name of field containing ord of the parent of the node
750             (also called "head" or "governing node").
751              
752             =item distance_buckets
753              
754             Buckets to use for C<distance()> function (positive integers in any order).
755             Each distance gets bucketed in the highest lower bucket (absolute-value-wise).
756              
757             Default:
758              
759             distance_buckets:
760             - 1
761             - 2
762             - 3
763             - 4
764             - 5
765             - 11
766              
767             =back
768              
769             =head3 Features Settings
770              
771             Features to be computed on data.
772              
773             Features for the unlabelled parser are set under C<features>,
774             the labeller features under C<labeller_features>.
775              
776             Use the (lowercase) input file field names (e.g. C<pos>)
777             to use the field of the (child) node,
778             uppercase them (e.g. C<POS>) to use the field of the parent,
779             joined together by the C<|> sign to form the features (e.g. C<POS|LEMMA>).
780              
781             Prefix the field names by C<1.> or C<2.>
782             to use the field on the first or second node in the sentence - based on
783             their order in the sentence, regardless of which is parent and which is child
784             (e.g. C<1.pos> for pos of first of the nodes).
785              
786             There are also several predefined functions that you can make use of.
787             Usually you can write the function name in lowercase to invoke them on the child
788             field, uppercase for parent, or prefixed by C<1.> or C<2.> for first or second
789             node (e.g. C<CHILDNO()> to get the number of parent node's children). The
790             parameter of a function must be a (child) field name, or an integer (as the
791             C<index> in C<equalspcat>).
792              
793             =over 4
794              
795             =item distance()
796              
797             bucketed ord-wise distance of child and parent: C<ORD> minus C<ord>
798              
799             =item attdir()
800              
801             parent - child attachment direction: C<signum(ORD minus ord)>
802              
803             =item preceding(field)
804              
805             value of the specified field on the ord-wise preceding node
806             (use C<PRECEDING(field)> to get field on node preceding the PARENT)
807              
808             =item following(field)
809              
810             value of the specified field on the ord-wise following node
811              
812             =item between(field)
813              
814             value of the specified field for each node which is ord-wise between the child
815             node and the parent node
816              
817             =item equals(field1,field2)
818              
819             Returns C<1> if the value of C<field1> is the same as
820             the value of C<field2>. For fields with multiple values,
821             it has the meaning of an "exists" operator: it returns
822             C<1> if there is at least one pair of values of each field that are
823             the same.
824              
825             Returns C<0> if the values don't match.
826              
827             Returns C<-1> if (at least) one of the values is
828             C<undef> (may be also represented by an empty string)
829              
830             =item equalspc(field1,field2)
831              
832             like C<equals> but C<field1> is taken from parent node
833             and C<field2> from child node
834              
835             =item equalspcat(field,position)
836              
837             like C<equalspc> but looks at the given position (1 character)
838             in the given field
839              
840             =item substr(field,start,length)
841              
842             substring of field value beginning at given
843             start position (0-based) of given length; standard substr behaviour,
844             i.e. both start and length can be negative and length can be omitted,
845             feature function to be then written as C<substr(field,start)>
846              
847             =item arrayat(array_field,index_field)
848              
849             array_field's value is an array of values
850             separated by single spaces (' '), index_field's value is a zero-based
851             index of a value in the array to be returned (used e.g. for tree distance)
852              
853             =item isfirst()
854              
855             returns C<1> if node is the first in the sentence, C<0> otherwise
856              
857             =item islast()
858              
859             returns C<1> if node is the last in the sentence, C<0> otherwise
860              
861             =item isfirstchild()
862              
863             returns C<1> if node is the first child of its parent, C<0> otherwise
864              
865             =item islastchild()
866              
867             returns C<1> if node is the last child of its parent, C<0> otherwise
868              
869             =item childno()
870              
871             returns number of node's children
872              
873             =item islastleftchild()
874              
875             is the rightmost of all left children of its parent
876              
877             =item isfirstrightchild()
878              
879             is the leftmost of all right children of its parent
880              
881             =item LABEL()
882              
883             label of parent (to be used only in labeller features);
884             label is somewhat special, it cannot be used as C<label>, C<LABEL> or C<label()>
885              
886             Features containing the C<LABEL()> function are dynamic, i.e. they cannot be
887             precomputed and are always computed just at the time they are needed.
888              
889             =item prevlabel()
890              
891             label of previous sibling (to be used only in labeller features);
892             prevlabel is somewhat special, it cannot be used as
893             C<prevlabel>, C<PREVLABEL> or C<PREVLABEL()>
894              
895             Features containing the C<prevlabel()> function are dynamic, i.e. they cannot be
896             precomputed and are always computed just at the time they are needed.
897              
898             =back
899              
900             See also L<Treex::Tool::Parser::MSTperl::FeaturesControl>.
901              
902             =head3 Internal technical settings
903              
904             These settings are probably better left as they are, but it might be
905             advantageous to have the ability of changing them sometimes, especially when
906             experimenting.
907              
908             You can set the values in various ways. The order of priorities is:
909              
910             =over 4
911              
912             =item 1 set in runtime
913              
914             i.e. set after having created a new Config object:
915              
916             my $config = Treex::Tool::Parser::MSTperl::Config->new(
917             config_file => 'my_config.config');
918             $config->DEBUG(4);
919              
920             The value is only valid from the time of setting.
921              
922             =item 2 set in config file
923              
924             in my_config.config:
925              
926             DEBUG: 4
927              
928             in the perl script:
929              
930             my $config = Treex::Tool::Parser::MSTperl::Config->new(
931             config_file => 'my_config.config');
932              
933             =item 3 set in the constructor
934              
935             i.e. set while creating a new Config object:
936              
937             in my_config.config:
938              
939             # DEBUG: 0
940              
941             in the perl script:
942              
943             my $config = Treex::Tool::Parser::MSTperl::Config->new(
944             config_file => 'my_config.config',
945             DEBUG => 4 );
946              
947             For the setting to take effect, you must not set another value in the config
948             file (you can comment out setting it with '#').
949              
950             =item 4 the default value
951              
952             Used if the value is not set in runtime, in constructor or in the config file.
953              
954             =back
955              
956             Please note that setting some of the values at runtime might not be a good idea.
957              
958             The options are listed here together with their defaults.
959              
960             =over 4
961              
962             =item DEBUG: 1
963              
964             An integer specifying how much debug information you will be getting while
965             running the program, ranging from 0 (no debug info)
966             through 1 (progress messages)
967             through 2, 3 and 4 to 5 (more and more debug info).
968              
969             If you set this value to something higher than 1, you should always redirect
970             the output to a file as printing it to the console is very very slow
971             (and there is so much info that you wouldn't be able to
972             read anything anyway).
973              
974             The possibility
975             to change the value
976             while running the program
977             might be beneficial
978             e.g. if you want to debug only a particular
979             part of the program.
980              
981             =item number_of_iterations: 3, labeller_number_of_iterations: 3
982              
983             How many times the trainer (Tagger::MSTperl::Trainer) should go through
984             all the training data.
985              
986             =item use_edge_features_cache: 0, labeller_use_edge_features_cache: 0
987              
988             Currently deprecated, unmaintained and probably to be removed.
989              
990             Turns on and off using the C<edge_features_cache>.
991              
992             Using cache should be turned on (C<1>) if training with a lot of RAM or on small
993             training data, as it uses a lot of memory but speeds up the training greatly
994             (approx. by 30% to 50%). If you need to save RAM, turn it off (C<0>).
995              
996             =item labeller_algorithm: 16
997              
998             Algorithm used for Viterbi labelling as well as for training. Several
999             possibilities were tried out,
1000             especially regarding the emission probabilities used in the Viterbi algorithm;
1001             this is for development purposes only, preferably do not use.
1002              
1003             =over
1004              
1005             =item (0) MIRA-trained weights
1006              
1007             recomputed by +abs(min) and converted to probs,
1008             transitions by MLE on labels
1009              
1010             =item (1) dtto, NOT converted to probs
1011              
1012             should be same as 0
1013              
1014             =item (2) dtto, sum in Viterbi instead of product
1015              
1016             new_prob = old_prob + emiss*trans
1017              
1018             =item (3) dtto, no recomputation
1019              
1020             just strip <= 0
1021              
1022             =item (4) basic MLE
1023              
1024             no MIRA, no smoothing, uniform feature weights
1025             blind (unigram) transition backoff,
1026             blind emission backoff (but should not be necessary)
1027              
1028             =item (5) full Viterbi
1029              
1030             dtto, transition probs lambda smoothing by EM
1031              
1032             =item (8) MIRA for all
1033              
1034             completely new, based on reading, no MLE, MIRA for all,
1035             same features for label unigrams and label bigrams
1036              
1037             =item (9) dtto, initialize emissions and transitions by MLE
1038              
1039             =item (10) 0 + fixed best state selection
1040              
1041             =item (11) 10 + tries to use all possible labels
1042              
1043             =item (12) 10 + EM for smoothing of transitions
1044              
1045             =item (13) 11 + EM for smoothing of transitions
1046              
1047             =item (14) 10 + update uses transition probs as well
1048              
1049             =item (15) 12 + update uses transition probs as well
1050              
1051             =item (16) 8 + transitions by MLE & EM on label pairs
1052              
1053             multiplied with emission score in Viterbi and added to last state score
1054              
1055             =item (17) dtto, different transition computation for negative scores
1056              
1057             =item (18) 16 + no Viterbi summing
1058              
1059             =item (19) 16, better formula for combining emissions and transitions
1060              
1061             =item (20) MIRA for all
1062              
1063             =item (21) MIRA for all, with Viterbi
1064              
1065             =item (22) MIRA for all, sentence = one sequence (disregarding tree structure)
1066              
1067             =back
1068              
1069             =item SEQUENCE_BOUNDARY_LABEL: '###'
1070              
1071             This is only a technical thing; a label must be assigned to the (basically
1072             virtual) boundary of a sequence, different from any label used in the data.
1073             The default value is '###', so if you use this exact label as a valid label in
1074             your data, change the setting to something else. If nothing goes wrong, you
1075             should never see this label in the output; however, it is contained in the
1076             model and used for "transition scores" to score the "transition" between the
1077             sequence boundary and the first/last node (i.e. it determines the scores of
1078             labels used as the first or last label in the sequence where no actual
1079             transition takes place and the transition scores would otherwise get ignored).
1080              
1081             =item VITERBI_STATES_NUM_THRESHOLD: 5
1082              
1083             Number of states to keep when pruning. The pruning takes place after each
1084             Viterbi step (i.e. after each computation of possible labels and their scores
1085             for one edge). For more details see the C<prune> subroutine.
1086              
1087             =item EM_EPSILON: 0.00001
1088              
1089             Stopping criterion of EM algorithm which is used to compute smoothing
1090             parameters for linear combination smoothing of transition probabilities
1091             in some variants of the Labeller.
1092             (when the sum of change of smoothing
1093             parameters is lower than the epsilon, the algorithm stops).
1094              
1095             =item EM_heldout_data_at: 0.9
1096              
1097             A number between 0 and 1 specifying
1098             where in the training data the heldout data for the EM algorithm start
1099             (e.g. 0.75 means that the first 75% of sentences
1100             are training data and the last 25% are heldout data).
1101              
1102             The training/heldout data division only affects computation of transition
1103             probabilities by MLE, it does not affect MIRA training or MLE for emission
1104             probabilities.
1105              
1106             If EM is not used for smoothing, all data are used as training data.
1107              
1108             =back
1109              
1110             =head2 Technical fields
1111              
1112             Provide access to things needed in more than one of the other packages.
1113              
1114             =over 4
1115              
1116             =item unlabelledFeaturesControl
1117              
1118             Provides access to unlabelled features, especially enabling their computation.
1119             Instance of L<Treex::Tool::Parser::MSTperl::FeaturesControl>.
1120              
1121             =item labelledFeaturesControl
1122              
1123             Provides access to labeller features, especially enabling their computation.
1124             Instance of L<Treex::Tool::Parser::MSTperl::FeaturesControl>.
1125              
1126             =back
1127              
1128             =head1 METHODS
1129              
1130             =head2 Settings
1131              
1132             The best source of information about all the possible settings is the
1133             configuration file itself (usually called C<config.txt>), as it is richly
1134             commented and accompanied by real examples at the same time.
1135              
1136             =over 4
1137              
1138             =item my $config =
1139             Treex::Tool::Parser::MSTperl::Config->new(config_file => 'file.config')
1140              
1141             Reads the configuration file (in YAML format) and applies the settings.
1142              
1143             See file C<samples/sample.config>.
1144              
1145             =item field_name2index ($field_name)
1146              
1147             Fields are referred to by names in the config files but by indexes in the
1148             code. Therefore this conversion function is necessary; the other direction of
1149             the conversion is ensured by the C<field_names> field.
1150              
1151             =back
1152              
1153              
1154             =head1 AUTHORS
1155              
1156             Rudolf Rosa <rosa@ufal.mff.cuni.cz>
1157              
1158             =head1 COPYRIGHT AND LICENSE
1159              
1160             Copyright © 2011 by Institute of Formal and Applied Linguistics,
1161             Charles University in Prague
1162              
1163             This module is free software;
1164             you can redistribute it and/or modify it under the same terms as Perl itself.