File Coverage

blib/lib/Treex/Tool/Parser/MSTperl/FeaturesControl.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             package Treex::Tool::Parser::MSTperl::FeaturesControl;
2             {
3             $Treex::Tool::Parser::MSTperl::FeaturesControl::VERSION = '0.11949';
4             }
5              
6 1     1   2433 use Moose;
  0            
  0            
7             use autodie;
8             use Carp;
9              
10             use Treex::Tool::Parser::MSTperl::ModelAdditional;
11              
12             # TODO dynamic features
13              
14             has 'config' => (
15             isa => 'Treex::Tool::Parser::MSTperl::Config',
16             is => 'ro',
17             required => '1',
18             weak_ref => '1',
19             );
20              
21             # FEATURES
22              
23             has 'feature_count' => (
24             is => 'rw',
25             isa => 'Int',
26             );
27              
28             has 'feature_codes_from_config' => (
29             is => 'rw',
30             isa => 'ArrayRef[Str]',
31             default => sub { [] },
32             );
33              
34             has 'feature_codes' => (
35             is => 'rw',
36             isa => 'ArrayRef[Str]',
37             default => sub { [] },
38             );
39              
40             has 'feature_codes_hash' => (
41             is => 'rw',
42             isa => 'HashRef[Str]',
43             default => sub { {} },
44             );
45              
46             has 'feature_indexes' => (
47             is => 'rw',
48             isa => 'HashRef[Str]',
49             default => sub { {} },
50             );
51              
52             # for each feature contains (a reference to) an array
53             # which cointains all its subfeature indexes
54             has 'feature_simple_features_indexes' => (
55             is => 'rw',
56             isa => 'ArrayRef[ArrayRef[Int]]',
57             default => sub { [] },
58             );
59              
60             # features containing array simple features
61             has 'array_features' => (
62             is => 'rw',
63             isa => 'HashRef[Int]',
64             default => sub { {} },
65             );
66              
67             # features containing dynamic simple features
68             has 'dynamic_features' => (
69             is => 'rw',
70             isa => 'HashRef[Int]',
71             default => sub { {} },
72             );
73              
74             # SIMPLE FEATURES
75              
76             has 'simple_feature_count' => (
77             is => 'rw',
78             isa => 'Int',
79             );
80              
81             has 'simple_feature_codes' => (
82             is => 'rw',
83             isa => 'ArrayRef[Str]',
84             default => sub { [] },
85             );
86              
87             has 'simple_feature_codes_hash' => (
88             is => 'rw',
89             isa => 'HashRef[Str]',
90             default => sub { {} },
91             );
92              
93             has 'simple_feature_indexes' => (
94             is => 'rw',
95             isa => 'HashRef[Str]',
96             default => sub { {} },
97             );
98              
99             has 'simple_feature_subs' => (
100             is => 'rw',
101             isa => 'ArrayRef',
102             default => sub { [] },
103             );
104              
105             has 'simple_feature_sub_arguments' => (
106             is => 'rw',
107             isa => 'ArrayRef',
108             default => sub { [] },
109             );
110              
111             # simple features that return an array of values
112             has 'array_simple_features' => (
113             is => 'rw',
114             isa => 'HashRef[Int]',
115             default => sub { {} },
116             );
117              
118             # simple features that must be always recomputed
119             # because their value cannot be always computed from input data
120             # (for labeller - parent's label, brother's label etc.)
121             has 'dynamic_simple_features' => (
122             is => 'rw',
123             isa => 'HashRef[Int]',
124             default => sub { {} },
125             );
126              
127             # # simple features that get more than 1 argument as input
128             # has 'multiarg_simple_features' => (
129             # is => 'rw',
130             # isa => 'HashRef[Int]',
131             # default => sub { {} },
132             # );
133              
134             # CACHING
135              
136             has 'use_edge_features_cache' => (
137             is => 'ro',
138             isa => 'Bool',
139             default => '0',
140             );
141              
142             # using cache turned off to fit into RAM by default
143             # turn on if training with a lot of RAM or on small training data
144             # turned off when parsing (does not make any sense for parsing)
145              
146             has 'edge_features_cache' => (
147             is => 'rw',
148             isa => 'HashRef[ArrayRef[Str]]',
149             default => sub { {} },
150             );
151              
152             has pmi_model => (
153             is => 'rw',
154             isa => 'Maybe[Treex::Tool::Parser::MSTperl::ModelAdditional]',
155             default => undef,
156             );
157              
158             has cprob_model => (
159             is => 'rw',
160             isa => 'Maybe[Treex::Tool::Parser::MSTperl::ModelAdditional]',
161             default => undef,
162             );
163              
164             sub BUILD {
165             my ($self) = @_;
166              
167             # ignore some settings if in parsing-only mode
168             # if ( !$self->training ) {
169             # $self->use_edge_features_cache(0);
170             # }
171              
172             # features
173             foreach my $feature ( @{ $self->feature_codes_from_config } ) {
174             $self->set_feature($feature);
175             }
176              
177             $self->feature_count( scalar( @{ $self->feature_codes } ) );
178             $self->simple_feature_count( scalar( @{ $self->simple_feature_codes } ) );
179              
180             return;
181             }
182              
183             sub set_feature {
184             my ( $self, $feature_code ) = @_;
185              
186             if ( $self->feature_codes_hash->{$feature_code} ) {
187             warn "Feature '$feature_code' is defined more than once; " .
188             "disregarding its later definitions.\n";
189             } else {
190              
191             # get simple features
192             my $isArrayFeature = 0;
193             my $isDynamicFeature = 0;
194             my @simple_features_indexes;
195             my %simple_features_hash;
196             foreach my $simple_feature_code ( split( /\|/, $feature_code ) ) {
197              
198             # checks
199             if ( $simple_features_hash{$simple_feature_code} ) {
200             warn "Simple feature '$simple_feature_code' " .
201             "is used more than once in '$feature_code'; " .
202             "disregarding its later uses.\n";
203             next;
204             }
205             if ( !$self->simple_feature_codes_hash->{$simple_feature_code} ) {
206              
207             # this simple feature has not been used at all yet
208             $self->set_simple_feature($simple_feature_code);
209             }
210              
211             # save
212             my $simple_feature_index =
213             $self->simple_feature_indexes->{$simple_feature_code};
214             $simple_features_hash{$simple_feature_code} = 1;
215             if ( $self->array_simple_features->{$simple_feature_index} ) {
216             $isArrayFeature = 1;
217             }
218             if ( $self->dynamic_simple_features->{$simple_feature_index} ) {
219             $isDynamicFeature = 1;
220             }
221             push @simple_features_indexes, $simple_feature_index;
222             }
223              
224             # save
225             my $feature_index = scalar( @{ $self->feature_codes } );
226             $self->feature_codes_hash->{$feature_code} = 1;
227             $self->feature_indexes->{$feature_code} = $feature_index;
228             push @{ $self->feature_codes }, $feature_code;
229             push @{ $self->feature_simple_features_indexes },
230             [@simple_features_indexes];
231             if ($isArrayFeature) {
232             $self->array_features->{$feature_index} = 1;
233             }
234             if ($isDynamicFeature) {
235             $self->dynamic_features->{$feature_index} = 1;
236             }
237             }
238              
239             return;
240             }
241              
242             sub set_simple_feature {
243             my ( $self, $simple_feature_code ) = @_;
244              
245             # get sub reference and field index
246             my $simple_feature_index = scalar @{ $self->simple_feature_codes };
247             my $simple_feature_sub;
248             my $simple_feature_field;
249              
250             # simple parent/child feature
251             if ( $simple_feature_code =~ /^([a-zA-Z0-9_]+)$/ ) {
252              
253             if ( $simple_feature_code =~ /^([a-z0-9_]+)$/ ) {
254              
255             # child feature
256             $simple_feature_sub = \&{feature_child};
257             $simple_feature_field = $1;
258             } elsif ( $simple_feature_code =~ /^([A-Z0-9_]+)$/ ) {
259              
260             # parent feature
261             $simple_feature_sub = \&{feature_parent};
262             $simple_feature_field = lc($1);
263             } else {
264             die "Incorrect simple feature format '$simple_feature_code'. " .
265             "Use lowercase (" . lc($simple_feature_code) .
266             ") for child node and UPPERCASE (" . uc($simple_feature_code) .
267             ") for parent node.\n";
268             }
269              
270             # first/second/(left sibling)/(right sibling)/Grandparent/grandchildren
271             # node feature
272             } elsif ( $simple_feature_code =~ /^([12gGlr])\.([a-z0-9_]+)$/ ) {
273              
274             $simple_feature_field = $2;
275              
276             if ( $1 eq '1' ) {
277              
278             # first node feature
279             $simple_feature_sub = \&{feature_first};
280             } elsif ( $1 eq '2' ) {
281              
282             # second node feature
283             $simple_feature_sub = \&{feature_second};
284             } elsif ( $1 eq 'g' ) {
285              
286             # grandchildren node feature
287             $simple_feature_sub = \&{feature_grandchildren};
288             } elsif ( $1 eq 'G' ) {
289              
290             # grandparent node feature
291             $simple_feature_sub = \&{feature_grandparent};
292             } elsif ( $1 eq 'l' ) {
293              
294             # left sibling edge child feature
295             $simple_feature_sub = \&{feature_left_sibling};
296             } elsif ( $1 eq 'r' ) {
297              
298             # right sibling edge child feature
299             $simple_feature_sub = \&{feature_right_sibling};
300             } else {
301             croak "Assertion failed!";
302             }
303              
304             # function feature
305             } elsif (
306             $simple_feature_code
307             =~ /^([12gGlr\.a-z]+|[A-Z]+)\([-a-z0-9_,]*\)$/
308             )
309             {
310             my $function_name = $1;
311             $simple_feature_sub =
312             $self->get_simple_feature_sub_reference($function_name);
313              
314             if ($function_name eq 'between'
315             || $function_name eq 'foreach'
316             || substr( $function_name, 0, 2 ) eq 'g.'
317             )
318             {
319              
320             # array function
321             $self->array_simple_features->{$simple_feature_index} = 1;
322             }
323              
324             if ($function_name eq 'LABEL'
325             || $function_name eq 'l.label' || $function_name eq 'prevlabel'
326             || $function_name eq 'G.label'
327             || $function_name eq 'g.label'
328             )
329             {
330              
331             # dynamic feature
332             $self->dynamic_simple_features->{$simple_feature_index} = 1;
333             }
334              
335             # set $simple_feature_field
336             if ( $simple_feature_code =~ /$function_name\(\)$/ ) {
337              
338             # no-arg function feature
339             $simple_feature_field = [];
340             } elsif ( $simple_feature_code =~ /$function_name\(([-a-z0-9_]+)\)$/ ) {
341              
342             # one-arg function feature
343             $simple_feature_field = $1;
344             } elsif (
345             $simple_feature_code
346             =~ /$function_name\(([-a-z0-9_,]+)\)$/
347             )
348             {
349              
350             # multiarg function feature
351             my @fields = split /,/, $1;
352             $simple_feature_field = \@fields;
353             } else {
354             die "Incorrect simple function feature format " .
355             "'$simple_feature_code'.\n";
356             }
357             } else {
358             die "Incorrect simple feature format '$simple_feature_code'.\n";
359             }
360              
361             # if $simple_feature_field is (a ref to) an array of field names,
362             # handles that correctly by iterating over the array and returning
363             # an array of field indexes;
364             # if there is an integer argument instead of a field name,
365             # detects that and keeps that integer unchanged
366             my $simple_feature_sub_arguments =
367             $self->config->field_name2index($simple_feature_field);
368              
369             # save
370             $self->simple_feature_codes_hash->{$simple_feature_code} = 1;
371             $self->simple_feature_indexes->{$simple_feature_code} =
372             $simple_feature_index;
373             push @{ $self->simple_feature_codes }, $simple_feature_code;
374             push @{ $self->simple_feature_subs }, $simple_feature_sub;
375             push @{ $self->simple_feature_sub_arguments },
376             $simple_feature_sub_arguments;
377              
378             return;
379             }
380              
381             # FEATURES COMPUTATION
382              
383             # array (ref) of all features of the edge,
384             # in the form of "feature_index:values_string" strings,
385             # where feature_index is the index of the feature
386             # (index in feature_codes, translatable via feature_indexes)
387             # and values_string are values of corresponding simple features,
388             # joined together by '|'
389             # (if any of the simple features does not return a value, the whole feature
390             # is not present)
391             # TODO maybe not returning a value is still a valuable information -> include?
392             sub get_all_features {
393              
394             # Edge; 0: all features, 1: only dynamic, -1: only non-dynamic
395             # either get only dynamic features or get all but dynamic features
396             my ( $self, $edge, $only_dynamic_features ) = @_;
397              
398             # try to get features from cache
399             # TODO: cache not used now and probably does not even work:
400             # check&fix or remove
401             my $edge_signature;
402             if ( $self->use_edge_features_cache ) {
403             $edge_signature = $edge->signature();
404              
405             my $cache_features = $self->edge_features_cache->{$edge_signature};
406             if ($cache_features) {
407             return $cache_features;
408             }
409             }
410              
411             # double else: if cache not used or if edge features not found in cache
412             my $simple_feature_values = $self->get_simple_feature_values_array($edge);
413             my @features;
414             my $features_count = $self->feature_count;
415             for (
416             my $feature_index = 0;
417             $feature_index < $features_count;
418             $feature_index++
419             )
420             {
421             if ($only_dynamic_features
422             && $only_dynamic_features == 1
423             && !$self->dynamic_features->{$feature_index}
424             )
425             {
426             next;
427             } elsif (
428             $only_dynamic_features
429             && $only_dynamic_features == -1
430             && $self->dynamic_features->{$feature_index}
431             )
432             {
433             next;
434             } else {
435             my $feature_value =
436             $self->get_feature_value( $feature_index, $simple_feature_values );
437             if ( $self->array_features->{$feature_index} ) {
438              
439             #it is an array feature, the returned value is an array reference
440             foreach my $value ( @{$feature_value} ) {
441             push @features, "$feature_index:$value";
442             }
443             } else {
444              
445             #it is not an array feature, the returned value is a string
446             if ( $feature_value ne '' ) {
447             push @features, "$feature_index:$feature_value";
448             }
449             }
450             }
451             }
452              
453             # save result in cache
454             if ( $self->use_edge_features_cache ) {
455             $self->edge_features_cache->{$edge_signature} = \@features;
456             }
457              
458             return \@features;
459             }
460              
461             # returns value of feature: simple feature values joined by '|'
462             # or '' if any of them is undefined or empty;
463             # for an array feature returns an array (ref) of these
464             # or an empty array (ref)
465             sub get_feature_value {
466             my ( $self, $feature_index, $simple_feature_values ) = @_;
467              
468             my $simple_features_indexes =
469             $self->feature_simple_features_indexes->[$feature_index];
470              
471             if ( $self->array_features->{$feature_index} ) {
472             my $feature_value =
473             $self->get_array_feature_value(
474             $simple_features_indexes,
475             $simple_feature_values, 0
476             );
477             if ($feature_value) {
478             return $feature_value;
479             } else {
480             return [];
481             }
482             } else {
483             my @values;
484             foreach my $simple_feature_index ( @{$simple_features_indexes} ) {
485             my $value = $simple_feature_values->[$simple_feature_index];
486             if ( defined $value && $value ne '' ) {
487             push @values, $value;
488             } else {
489             return '';
490             }
491             }
492              
493             my $feature_value = join '|', @values;
494             return $feature_value;
495             }
496             }
497              
498             # for features containing subfeatures that return an array of values
499             sub get_array_feature_value {
500             my (
501             $self,
502             $simple_features_indexes,
503             $simple_feature_values,
504             $start_from
505             ) = @_;
506              
507             # get value at this position (position = $start_from)
508             my $simple_feature_index = $simple_features_indexes->[$start_from];
509             my $value = $simple_feature_values->[$simple_feature_index];
510             if ( !$self->array_simple_features->{$simple_feature_index} ) {
511              
512             # if not an array reference
513             $value = [ ($value) ]; # make it an array reference
514             }
515              
516             my $simple_features_count = scalar @{$simple_features_indexes};
517             if ( $start_from < $simple_features_count - 1 ) {
518              
519             # not the last simple feature => have to recurse
520             my $append =
521             $self->get_array_feature_value(
522             $simple_features_indexes,
523             $simple_feature_values, $start_from + 1
524             );
525             my @values;
526             foreach my $my_value ( @{$value} ) {
527             foreach my $append_value ( @{$append} ) {
528             my $add_value = "$my_value|$append_value";
529             push @values, $add_value;
530             }
531             }
532             return [@values];
533             } else { # else bottom of recursion
534             return $value;
535             }
536             }
537              
538             # SIMPLE FEATURES
539              
540             sub get_simple_feature_values_array {
541             my ( $self, $edge ) = @_;
542              
543             my @simple_feature_values;
544             my $simple_feature_count = $self->simple_feature_count;
545             for (
546             my $simple_feature_index = 0;
547             $simple_feature_index < $simple_feature_count;
548             $simple_feature_index++
549             )
550             {
551             my $sub = $self->simple_feature_subs->[$simple_feature_index];
552              
553             # If the simple feature has one parameter,
554             # then $arguments is the one argument;
555             # if the simple feature has more than one parameter,
556             # then $arguments is a reference to an array of arguments.
557             my $arguments =
558             $self->simple_feature_sub_arguments->[$simple_feature_index];
559             my $value = &$sub( $self, $edge, $arguments );
560             push @simple_feature_values, $value;
561             }
562              
563             return [@simple_feature_values];
564             }
565              
566             my %simple_feature_sub_references = (
567             'LABEL' => \&{feature_parent_label},
568             'prevlabel' => \&{feature_previous_label},
569             'l.label' => \&{feature_previous_label},
570             'G.label' => \&{feature_grandparent_label},
571             'g.label' => \&{feature_grandchildren_label},
572             'distance' => \&{feature_distance},
573             'G.distance' => \&{feature_grandparent_distance},
574             'attdir' => \&{feature_attachement_direction},
575             'G.attdir' => \&{feature_grandparent_attachement_direction}, # grandparent to child
576             'preceding' => \&{feature_preceding_child},
577             'PRECEDING' => \&{feature_preceding_parent},
578             '1.preceding' => \&{feature_preceding_first},
579             '2.preceding' => \&{feature_preceding_second},
580             'following' => \&{feature_following_child},
581             'FOLLOWING' => \&{feature_following_parent},
582             '1.following' => \&{feature_following_first},
583             '2.following' => \&{feature_following_second},
584             'between' => \&{feature_between},
585             'foreach' => \&{feature_foreach},
586             'equals' => \&{feature_equals},
587             'equalspc' => \&{feature_equals_pc},
588             'equalspcat' => \&{feature_equals_pc_at},
589             'arrayat' => \&{feature_array_at_child},
590             'ARRAYAT' => \&{feature_array_at_parent},
591             'arrayatcp' => \&{feature_array_at_cp},
592             'isfirst' => \&{feature_child_is_first_in_sentence},
593             'ISFIRST' => \&{feature_parent_is_first_in_sentence},
594             'islast' => \&{feature_child_is_last_in_sentence},
595             'ISLAST' => \&{feature_parent_is_last_in_sentence},
596             'isfirstchild' => \&{feature_child_is_first_child},
597             'islastchild' => \&{feature_child_is_last_child},
598             'islastleftchild' => \&{feature_child_is_last_left_child},
599             'isfirstrightchild' => \&{feature_child_is_first_right_child},
600             'childno' => \&{feature_number_of_childs_children},
601             'CHILDNO' => \&{feature_number_of_parents_children},
602             'substr' => \&{feature_substr_child},
603             'SUBSTR' => \&{feature_substr_parent},
604             'pmi' => \&{feature_pmi},
605             'pmibucketed' => \&{feature_pmi_bucketed},
606             'pmirounded' => \&{feature_pmi_rounded},
607             'pmid' => \&{feature_pmi_d},
608             'cprob' => \&{feature_cprob},
609             'cprobbucketed' => \&{feature_cprob_bucketed},
610             'cprobrounded' => \&{feature_cprob_rounded},
611              
612             # obsolete
613             # 'pmitworounded' => \&{feature_pmi_2_rounded},
614             # 'pmithreerounded' => \&{feature_pmi_3_rounded},
615             # 'cprobtworounded' => \&{feature_cprob_2_rounded},
616             # 'cprobthreerounded' => \&{feature_cprob_3_rounded},
617             );
618              
619             sub get_simple_feature_sub_reference {
620             my ( $self, $simple_feature_function ) = @_;
621              
622             if ( $simple_feature_sub_references{$simple_feature_function} ) {
623             return $simple_feature_sub_references{$simple_feature_function};
624             } else {
625             croak "Unknown feature function '$simple_feature_function'!";
626             }
627             }
628              
629             # returns undef if there is no grandparent, i.e. the parent is the root
630             sub get_grandparent {
631             my ( $self, $edge ) = @_;
632              
633             return ( $edge->parent )->parent;
634             }
635              
636             sub feature_distance {
637             my ( $self, $edge ) = @_;
638              
639             return $self->feature_distance_generic( $edge->parent, $edge->child );
640             }
641              
642             sub feature_grandparent_distance {
643             my ( $self, $edge ) = @_;
644              
645             my $grandparent = $self->get_grandparent($edge);
646             if ( defined $grandparent ) {
647             return $self->feature_distance_generic( $edge->parent, $edge->child );
648             } else {
649             return '#novalue#';
650             }
651             }
652              
653             sub feature_distance_generic {
654             my ( $self, $node1, $node2 ) = @_;
655              
656             my $distance = $node1->ord - $node2->ord;
657              
658             my $bucket = $self->config->distance2bucket->{$distance};
659             if ( defined $bucket ) {
660             return $bucket;
661             } else {
662             if ( $distance <= $self->config->minBucket ) {
663             return $self->config->minBucket;
664             } else { # $distance >= $self->maxBucket
665             return $self->config->maxBucket;
666             }
667             }
668             }
669              
670             sub feature_attachement_direction {
671             my ( $self, $edge ) = @_;
672              
673             return $self->feature_attachement_direction_generic(
674             $edge->parent, $edge->child
675             );
676             }
677              
678             sub feature_grandparent_attachement_direction {
679             my ( $self, $edge ) = @_;
680              
681             my $grandparent = $self->get_grandparent($edge);
682             if ( defined $grandparent ) {
683             return $self->feature_attachement_direction_generic(
684             $edge->parent, $edge->child
685             );
686             } else {
687             return '#novalue#';
688             }
689             }
690              
691             sub feature_attachement_direction_generic {
692             my ( $self, $node1, $node2 ) = @_;
693              
694             if ( $node1->ord < $node2->ord ) {
695             return -1;
696             } else {
697             return 1;
698             }
699             }
700              
701             sub feature_child {
702             my ( $self, $edge, $field_index ) = @_;
703             return ( $edge->child->fields->[$field_index] );
704             }
705              
706             sub feature_parent {
707             my ( $self, $edge, $field_index ) = @_;
708             return ( $edge->parent->fields->[$field_index] );
709             }
710              
711             sub feature_grandparent {
712             my ( $self, $edge, $field_index ) = @_;
713              
714             my $grandparent = $self->get_grandparent($edge);
715             if ( defined $grandparent ) {
716             return ( $grandparent->fields->[$field_index] );
717             } else {
718             return '#novalue#';
719             }
720             }
721              
722             sub feature_parent_label {
723             my ( $self, $edge ) = @_;
724             return ( $edge->parent->label );
725             }
726              
727             sub feature_previous_label {
728             my ( $self, $edge ) = @_;
729              
730             my $left_sibling = $self->get_left_sibling($edge);
731             if ( defined $left_sibling ) {
732             return ( $left_sibling->child->label );
733             } else {
734             return $self->config->SEQUENCE_BOUNDARY_LABEL;
735             }
736             }
737              
738             sub feature_grandparent_label {
739             my ( $self, $edge ) = @_;
740              
741             my $grandparent = $self->get_grandparent($edge);
742             if ( defined $grandparent ) {
743             return ( $grandparent->label );
744             } else {
745             return '#novalue#';
746             }
747             }
748              
749             sub feature_first {
750             my ( $self, $edge, $field_index ) = @_;
751             return ( $edge->first->fields->[$field_index] );
752             }
753              
754             sub feature_second {
755             my ( $self, $edge, $field_index ) = @_;
756             return ( $edge->second->fields->[$field_index] );
757             }
758              
759             sub feature_left_sibling {
760             my ( $self, $edge, $field_index ) = @_;
761              
762             my $left_sibling = $self->get_left_sibling($edge);
763             if ( defined $left_sibling ) {
764             return ( $left_sibling->child->fields->[$field_index] );
765             } else {
766             return '#start#';
767             }
768             }
769              
770             sub feature_right_sibling {
771             my ( $self, $edge, $field_index ) = @_;
772              
773             my $right_sibling = $self->get_right_sibling($edge);
774             if ( defined $right_sibling ) {
775             return ( $right_sibling->child->fields->[$field_index] );
776             } else {
777             return '#end#';
778             }
779             }
780              
781             sub get_left_sibling {
782             my ( $self, $edge ) = @_;
783              
784             my $siblings = $edge->parent->children;
785             my $is_first = ( $siblings->[0]->child->ord == $edge->child->ord );
786             if ($is_first) {
787              
788             # there is no left sibling to the leftmost node
789             return;
790             } else {
791              
792             # find my position among parent's children (is at least 1)
793             my $my_index = 1;
794             while ( $siblings->[$my_index]->child->ord != $edge->child->ord ) {
795             $my_index++;
796             }
797              
798             # now ($my_index-1) is the index of my (closest) left sibling
799             return ( $siblings->[ $my_index - 1 ] );
800             }
801             }
802              
803             sub get_right_sibling {
804             my ( $self, $edge ) = @_;
805              
806             my $siblings = $edge->parent->children;
807             my $last_sibling_index = scalar(@$siblings) - 1;
808             my $is_last = (
809             $siblings->[$last_sibling_index]->child->ord
810             == $edge->child->ord
811             );
812             if ($is_last) {
813              
814             # there is no right sibling to the rightmost node
815             return;
816             } else {
817              
818             # find my position among parent's children
819             # (is at most $last_sibling_index - 1)
820             my $my_index = $last_sibling_index - 1;
821             while ( $siblings->[$my_index]->child->ord != $edge->child->ord ) {
822             $my_index--;
823             }
824              
825             # now ($my_index+1) is the index of my (closest) right sibling
826             return $siblings->[ $my_index + 1 ];
827             }
828             }
829              
830             sub feature_preceding_child {
831             my ( $self, $edge, $field_index ) = @_;
832              
833             my $node = $edge->sentence->getNodeByOrd( $edge->child->ord - 1 );
834              
835             # $node may be undef
836             if ($node) {
837             if ( $edge->parent->ord == $node->ord ) {
838              
839             # no gap between nodes
840             return '#mid#';
841             } else {
842             return $node->fields->[$field_index];
843             }
844             } else {
845             return '#start#';
846             }
847             }
848              
849             sub feature_preceding_parent {
850             my ( $self, $edge, $field_index ) = @_;
851              
852             my $node = $edge->sentence->getNodeByOrd( $edge->parent->ord - 1 );
853              
854             # $node may be undef
855             if ($node) {
856             if ( $edge->child->ord == $node->ord ) {
857              
858             # no gap between nodes
859             return '#mid#';
860             } else {
861             return $node->fields->[$field_index];
862             }
863             } else {
864             return '#start#';
865             }
866             }
867              
868             sub feature_following_child {
869             my ( $self, $edge, $field_index ) = @_;
870              
871             my $node = $edge->sentence->getNodeByOrd( $edge->child->ord + 1 );
872              
873             # $node may be undef
874             if ($node) {
875             if ( $edge->parent->ord == $node->ord ) {
876              
877             # no gap between nodes
878             return '#mid#';
879             } else {
880             return $node->fields->[$field_index];
881             }
882             } else {
883             return '#end#';
884             }
885             }
886              
887             sub feature_following_parent {
888             my ( $self, $edge, $field_index ) = @_;
889              
890             my $node = $edge->sentence->getNodeByOrd( $edge->parent->ord + 1 );
891              
892             # $node may be undef
893             if ($node) {
894             if ( $edge->child->ord == $node->ord ) {
895              
896             # no gap between nodes
897             return '#mid#';
898             } else {
899             return $node->fields->[$field_index];
900             }
901             } else {
902             return '#end#';
903             }
904             }
905              
906             sub feature_preceding_first {
907             my ( $self, $edge, $field_index ) = @_;
908              
909             my $node = $edge->sentence->getNodeByOrd( $edge->first->ord - 1 );
910              
911             # $node may be undef
912             if ($node) {
913             return $node->fields->[$field_index];
914             } else {
915             return '#start#';
916             }
917             }
918              
919             sub feature_preceding_second {
920             my ( $self, $edge, $field_index ) = @_;
921              
922             my $node = $edge->sentence->getNodeByOrd( $edge->second->ord - 1 );
923              
924             # $node may be undef
925             if ($node) {
926             if ( $edge->first->ord == $node->ord ) {
927              
928             # node preceding second node is first node
929             return '#mid#';
930             } else {
931             return $node->fields->[$field_index];
932             }
933             } else {
934             return '#start#';
935             }
936             }
937              
938             sub feature_following_first {
939             my ( $self, $edge, $field_index ) = @_;
940              
941             my $node = $edge->sentence->getNodeByOrd( $edge->first->ord + 1 );
942              
943             # $node may be undef
944             if ($node) {
945             if ( $edge->second->ord == $node->ord ) {
946              
947             # node following first node is second node
948             return '#mid#';
949             } else {
950             return $node->fields->[$field_index];
951             }
952             } else {
953             return '#end#';
954             }
955             }
956              
957             sub feature_following_second {
958             my ( $self, $edge, $field_index ) = @_;
959              
960             my $node = $edge->sentence->getNodeByOrd( $edge->second->ord + 1 );
961              
962             # $node may be undef
963             if ($node) {
964             return $node->fields->[$field_index];
965             } else {
966             return '#end#';
967             }
968             }
969              
970             sub feature_between {
971             my ( $self, $edge, $field_index ) = @_;
972              
973             my @values;
974             my $from;
975             my $to;
976             if ( $edge->parent->ord < $edge->child->ord ) {
977             $from = $edge->parent->ord + 1;
978             $to = $edge->child->ord - 1;
979             } else {
980             $from = $edge->child->ord + 1;
981             $to = $edge->parent->ord - 1;
982             }
983              
984             # TODO: use precomputed values instead
985              
986             for ( my $ord = $from; $ord <= $to; $ord++ ) {
987             push @values,
988             $edge->sentence->getNodeByOrd($ord)->fields->[$field_index];
989             }
990             return [@values];
991              
992             # my $len = $to - $from;
993             # if ($len >= 0) {
994             # return $edge->sentence->betweenFeatureValues->
995             # {$field_index}->[$from]->[$len];
996             # } else {
997             # return;
998             # }
999              
1000             }
1001              
1002             sub feature_foreach {
1003             my ( $self, $edge, $field_index ) = @_;
1004              
1005             my $values = $edge->child->fields->[$field_index];
1006             if ($values) {
1007             my @values = split / /, $edge->child->fields->[$field_index];
1008             return [@values];
1009             } else {
1010             return '';
1011             }
1012             }
1013              
1014             sub feature_equals {
1015             my ( $self, $edge, $field_indexes ) = @_;
1016              
1017             # equals takes two arguments
1018             if ( @{$field_indexes} == 2 ) {
1019             my ( $field_index_1, $field_index_2 ) = @{$field_indexes};
1020             my $values_1 = $edge->child->fields->[$field_index_1];
1021             my $values_2 = $edge->child->fields->[$field_index_2];
1022              
1023             # we handle undefines and empties specially
1024             if (
1025             defined $values_1
1026             && $values_1 ne ''
1027             && defined $values_2
1028             && $values_2 ne ''
1029             )
1030             {
1031             my $result = 0; # default not equal
1032             my @values_1 = split / /, $values_1;
1033             my @values_2 = split / /, $values_2;
1034              
1035             # try to find a match
1036             foreach my $value_1 (@values_1) {
1037             foreach my $value_2 (@values_2) {
1038             if ( $value_1 eq $value_2 ) {
1039             $result = 1; # one match is enough
1040             }
1041             }
1042             }
1043             return $result;
1044             } else {
1045             return -1; # undef
1046             }
1047             } else {
1048             croak "equals() takes TWO arguments!!!";
1049             }
1050             }
1051              
1052             # only difference to equals is the line:
1053             # my $values_1 = $edge->PARENT->fields->[$field_index_1];
1054             sub feature_equals_pc {
1055             my ( $self, $edge, $field_indexes ) = @_;
1056              
1057             # equals takes two arguments
1058             if ( @{$field_indexes} == 2 ) {
1059             my ( $field_index_1, $field_index_2 ) = @{$field_indexes};
1060             my $values_1 = $edge->parent->fields->[$field_index_1];
1061             my $values_2 = $edge->child->fields->[$field_index_2];
1062              
1063             # we handle undefines and empties specially
1064             if (
1065             defined $values_1
1066             && $values_1 ne ''
1067             && defined $values_2
1068             && $values_2 ne ''
1069             )
1070             {
1071             my $result = 0; # default not equal
1072             my @values_1 = split / /, $values_1;
1073             my @values_2 = split / /, $values_2;
1074              
1075             # try to find a match
1076             foreach my $value_1 (@values_1) {
1077             foreach my $value_2 (@values_2) {
1078             if ( $value_1 eq $value_2 ) {
1079             $result = 1; # one match is enough
1080             }
1081             }
1082             }
1083             return $result;
1084             } else {
1085             return -1; # undef
1086             }
1087             } else {
1088             croak "equals() takes TWO arguments!!!";
1089             }
1090             }
1091              
1092             # sub equalsat - does not make sense
1093              
1094             # whether the character at the given position of the given field
1095             # equals in parent and in child
1096             sub feature_equals_pc_at {
1097             my ( $self, $edge, $arguments ) = @_;
1098              
1099             # equals takes two arguments
1100             if ( @{$arguments} == 2 ) {
1101             my ( $field_index, $position ) = @{$arguments};
1102             my $field_parent = $edge->parent->fields->[$field_index];
1103             my $field_child = $edge->child->fields->[$field_index];
1104              
1105             # we handle undefines and too short fields specially
1106             if (
1107             defined $field_parent
1108             && length $field_parent > $position
1109             && defined $field_child
1110             && length $field_child > $position
1111             )
1112             {
1113             my $value_parent = substr $field_parent, $position, 1;
1114             my $value_child = substr $field_child, $position, 1;
1115             if ( $value_parent eq $value_child ) {
1116             return 1;
1117             } else {
1118             return 0;
1119             }
1120             } else {
1121             return -1; # undef
1122             }
1123             } else {
1124             croak "equals() takes TWO arguments!!!";
1125             }
1126             }
1127              
1128             # substring (field, start, length)
1129             sub feature_substr_child {
1130             my ( $self, $edge, $arguments ) = @_;
1131              
1132             # substr takes two or three arguments
1133             if ( @{$arguments} != 3 && @{$arguments} != 2 ) {
1134             croak "substr() takes THREE or TWO arguments!!!";
1135             } else {
1136             my ( $field_index, $start, $length ) = @{$arguments};
1137             my $field = $edge->child->fields->[$field_index];
1138              
1139             my $value = '';
1140             if ( defined $field ) {
1141             if ( defined $length ) {
1142             $value = substr( $field, $start, $length );
1143             } else {
1144             $value = substr( $field, $start );
1145             }
1146             }
1147              
1148             return $value;
1149             }
1150             }
1151              
1152             # substring (field, start, length)
1153             sub feature_substr_parent {
1154             my ( $self, $edge, $arguments ) = @_;
1155              
1156             # substr takes two or three arguments
1157             if ( @{$arguments} != 3 && @{$arguments} != 2 ) {
1158             croak "substr() takes THREE or TWO arguments!!!";
1159             } else {
1160             my ( $field_index, $start, $length ) = @{$arguments};
1161             my $field = $edge->parent->fields->[$field_index];
1162              
1163             my $value = '';
1164             if ( defined $field ) {
1165             if ( defined $length ) {
1166             $value = substr( $field, $start, $length );
1167             } else {
1168             $value = substr( $field, $start );
1169             }
1170             }
1171              
1172             return $value;
1173             }
1174             }
1175              
1176             # arrayat (array, index)
1177             sub feature_array_at_child {
1178             my ( $self, $edge, $arguments ) = @_;
1179              
1180             # arrayat takes two arguments
1181             if ( @{$arguments} != 2 ) {
1182             croak "arrayat() takes TWO arguments!!!";
1183             } else {
1184             my ( $array_field, $index_field ) = @{$arguments};
1185             my $array = $edge->child->fields->[$array_field];
1186             my $index = $edge->child->fields->[$index_field];
1187              
1188             my @array = split / /, $array;
1189             my $value = $array[$index];
1190             if ( !defined $value ) {
1191             $value = '';
1192             }
1193              
1194             return $value;
1195             }
1196             }
1197              
1198             sub feature_array_at_parent {
1199             my ( $self, $edge, $arguments ) = @_;
1200              
1201             # arrayat takes two arguments
1202             if ( @{$arguments} != 2 ) {
1203             croak "arrayat() takes TWO arguments!!!";
1204             } else {
1205             my ( $array_field, $index_field ) = @{$arguments};
1206             my $array = $edge->parent->fields->[$array_field];
1207             my $index = $edge->parent->fields->[$index_field];
1208              
1209             my @array = split / /, $array;
1210             my $value = $array[$index];
1211             if ( !defined $value ) {
1212             $value = '';
1213             }
1214              
1215             return $value;
1216             }
1217             }
1218              
1219             # arrayatcp (array, index)
1220             sub feature_array_at_cp {
1221             my ( $self, $edge, $arguments ) = @_;
1222              
1223             # arrayat takes two arguments
1224             if ( @{$arguments} != 2 ) {
1225             croak "arrayat() takes TWO arguments!!!";
1226             } else {
1227             my ( $array_field, $index_field ) = @{$arguments};
1228             my $array = $edge->child->fields->[$array_field];
1229             my $index = $edge->parent->fields->[$index_field];
1230              
1231             my @array = split / /, $array;
1232             my $value = $array[$index];
1233             if ( !defined $value ) {
1234             $value = '';
1235             }
1236              
1237             return $value;
1238             }
1239             }
1240              
1241             sub feature_child_is_first_in_sentence {
1242             my ( $self, $edge ) = @_;
1243              
1244             if ( $edge->child->ord == 1 ) {
1245             return 1;
1246             } else {
1247             return 0;
1248             }
1249             }
1250              
1251             sub feature_parent_is_first_in_sentence {
1252             my ( $self, $edge ) = @_;
1253              
1254             if ( $edge->parent->ord == 1 ) {
1255             return 1;
1256             } else {
1257             return 0;
1258             }
1259             }
1260              
1261             sub feature_child_is_last_in_sentence {
1262             my ( $self, $edge ) = @_;
1263              
1264             # last ord = number of nodes (because ords are 1-based, 0 is the root node)
1265             if ( $edge->child->ord == scalar( @{ $edge->sentence->nodes } ) ) {
1266             return 1;
1267             } else {
1268             return 0;
1269             }
1270             }
1271              
1272             sub feature_parent_is_last_in_sentence {
1273             my ( $self, $edge ) = @_;
1274              
1275             # last ord = number of nodes (because ords are 1-based, 0 is the root node)
1276             if ( $edge->parent->ord == scalar( @{ $edge->sentence->nodes } ) ) {
1277             return 1;
1278             } else {
1279             return 0;
1280             }
1281             }
1282              
1283             sub feature_child_is_first_child {
1284             my ( $self, $edge ) = @_;
1285              
1286             my $children = $edge->parent->children;
1287             if ( $children->[0]->child->ord == $edge->child->ord ) {
1288             return 1;
1289             } else {
1290             return 0;
1291             }
1292             }
1293              
1294             sub feature_child_is_last_child {
1295             my ( $self, $edge ) = @_;
1296              
1297             my $children = $edge->parent->children;
1298             my $childrenNum = scalar(@$children);
1299             if ( $children->[ $childrenNum - 1 ]->child->ord == $edge->child->ord ) {
1300             return 1;
1301             } else {
1302             return 0;
1303             }
1304             }
1305              
1306             sub feature_child_is_first_right_child {
1307             my ( $self, $edge ) = @_;
1308              
1309             my $is_right = ( $edge->parent->ord < $edge->child->ord );
1310             if ($is_right) {
1311             my $siblings = $edge->parent->children;
1312             my $is_first = ( $siblings->[0]->child->ord == $edge->child->ord );
1313             if ($is_first) {
1314              
1315             # is right & is first (= leftmost) of all siblings
1316             return 1;
1317             } else {
1318              
1319             # find my position among parent's children (is at least 1)
1320             my $my_index = 1;
1321             while ( $siblings->[$my_index]->child->ord != $edge->child->ord ) {
1322             $my_index++;
1323             }
1324              
1325             # now ($my_index-1) is the index of my (closest) left sibling
1326             my $sibling_is_left =
1327             (
1328             $siblings->[ $my_index - 1 ]->child->ord
1329             < $edge->parent->ord
1330             );
1331             if ($sibling_is_left) {
1332              
1333             # is right and closest left sibling is left
1334             return 1;
1335             } else {
1336              
1337             # is right but not the first one
1338             return 0;
1339             }
1340             }
1341             } else {
1342              
1343             # is left
1344             return 0;
1345             }
1346             }
1347              
1348             sub feature_child_is_last_left_child {
1349             my ( $self, $edge ) = @_;
1350              
1351             my $is_left = ( $edge->child->ord < $edge->parent->ord );
1352             if ($is_left) {
1353             my $siblings = $edge->parent->children;
1354             my $last_sibling_index = scalar(@$siblings) - 1;
1355             my $is_last = (
1356             $siblings->[$last_sibling_index]->child->ord
1357             == $edge->child->ord
1358             );
1359             if ($is_last) {
1360              
1361             # is left & is last of all siblings
1362             return 1;
1363             } else {
1364              
1365             # find my position among parent's children
1366             # (is at most $last_sibling_index - 1)
1367             my $my_index = $last_sibling_index - 1;
1368             while ( $siblings->[$my_index]->child->ord != $edge->child->ord ) {
1369             $my_index--;
1370             }
1371              
1372             # now ($my_index+1) is the index of my (closest) right sibling
1373             my $sibling_is_right =
1374             (
1375             $edge->parent->ord
1376             < $siblings->[ $my_index + 1 ]->child->ord
1377             );
1378             if ($sibling_is_right) {
1379              
1380             # is left and closest right sibling is right
1381             return 1;
1382             } else {
1383              
1384             # is left but not the last one
1385             return 0;
1386             }
1387             }
1388             } else {
1389              
1390             # is right
1391             return 0;
1392             }
1393             }
1394              
1395             sub feature_number_of_childs_children {
1396             my ( $self, $edge ) = @_;
1397              
1398             my $children = $edge->child->children;
1399             if ( $children && scalar(@$children) ) {
1400             return scalar(@$children);
1401             } else {
1402             return 0;
1403             }
1404             }
1405              
1406             sub feature_number_of_parents_children {
1407             my ( $self, $edge ) = @_;
1408              
1409             my $children = $edge->parent->children;
1410             if ( $children && scalar(@$children) ) {
1411             return scalar(@$children);
1412             } else {
1413             return 0;
1414             }
1415             }
1416              
1417             sub feature_additional_model {
1418             my ( $self, $edge, $field_index, $model ) = @_;
1419              
1420             my $child = $edge->child->fields->[$field_index];
1421             my $parent = $edge->parent->fields->[$field_index];
1422              
1423             if ( defined $child && defined $parent ) {
1424             return $model->get_value( $child, $parent );
1425             } else {
1426             croak "Either child or parent is undefined in additional model feature, " .
1427             "this should not happen!";
1428             }
1429             }
1430              
1431             sub feature_additional_model_bucketed {
1432             my ( $self, $edge, $field_index, $model ) = @_;
1433              
1434             my $child = $edge->child->fields->[$field_index];
1435             my $parent = $edge->parent->fields->[$field_index];
1436              
1437             if ( defined $child && defined $parent ) {
1438             return $model->get_bucketed_value( $child, $parent );
1439             } else {
1440             croak "Either child or parent is undefined in additional model feature, " .
1441             "this should not happen!";
1442             }
1443             }
1444              
1445             sub feature_additional_model_rounded {
1446             my ( $self, $edge, $parameters, $model ) = @_;
1447              
1448             my ( $field_index, $rounding ) = @$parameters;
1449             my $child = $edge->child->fields->[$field_index];
1450             my $parent = $edge->parent->fields->[$field_index];
1451              
1452             if ( defined $child && defined $parent ) {
1453             return $model->get_rounded_value( $child, $parent, $rounding );
1454             } else {
1455             croak "Either child or parent is undefined in additional model feature, " .
1456             "this should not happen!";
1457             }
1458             }
1459              
1460             sub feature_additional_model_d {
1461             my ( $self, $edge, $parameters, $model ) = @_;
1462              
1463             my ( $field_index_c, $field_index_p ) = @$parameters;
1464             my $child = $edge->child->fields->[$field_index_c];
1465             my $parent = $edge->parent->fields->[$field_index_p];
1466              
1467             if ( defined $child && defined $parent ) {
1468             return $model->get_rounded_value( $child, $parent );
1469             } else {
1470             croak "Either child or parent is undefined in additional model feature, " .
1471             "this should not happen!";
1472             }
1473             }
1474              
1475             sub feature_pmi {
1476             my ( $self, $edge, $field_index ) = @_;
1477              
1478             return $self->feature_additional_model( $edge, $field_index, $self->pmi_model );
1479             }
1480              
1481             sub feature_pmi_bucketed {
1482             my ( $self, $edge, $field_index ) = @_;
1483              
1484             return $self->feature_additional_model_bucketed( $edge, $field_index, $self->pmi_model );
1485             }
1486              
1487             sub feature_pmi_rounded {
1488             my ( $self, $edge, $parameters ) = @_;
1489              
1490             return $self->feature_additional_model_rounded( $edge, $parameters, $self->pmi_model );
1491             }
1492              
1493             sub feature_pmi_d {
1494             my ( $self, $edge, $parameters ) = @_;
1495              
1496             return $self->feature_additional_model_d( $edge, $parameters, $self->pmi_model );
1497             }
1498              
1499             sub feature_pmi_2_rounded {
1500             my ( $self, $edge, $field_index ) = @_;
1501              
1502             my @params = ( $field_index, 1 );
1503             return $self->feature_pmi_rounded( $edge, \@params );
1504             }
1505              
1506             sub feature_pmi_3_rounded {
1507             my ( $self, $edge, $field_index ) = @_;
1508              
1509             my @params = ( $field_index, 2 );
1510             return $self->feature_pmi_rounded( $edge, \@params );
1511             }
1512              
1513             sub feature_cprob {
1514             my ( $self, $edge, $field_index ) = @_;
1515              
1516             return $self->feature_additional_model( $edge, $field_index, $self->cprob_model );
1517             }
1518              
1519             sub feature_cprob_bucketed {
1520             my ( $self, $edge, $field_index ) = @_;
1521              
1522             return $self->feature_additional_model_bucketed( $edge, $field_index, $self->cprob_model );
1523             }
1524              
1525             sub feature_cprob_rounded {
1526             my ( $self, $edge, $parameters ) = @_;
1527              
1528             return $self->feature_additional_model_rounded( $edge, $parameters, $self->cprob_model );
1529             }
1530              
1531             sub feature_cprob_2_rounded {
1532             my ( $self, $edge, $field_index ) = @_;
1533              
1534             my @params = ( $field_index, 1 );
1535             return $self->feature_cprob_rounded( $edge, \@params );
1536             }
1537              
1538             sub feature_cprob_3_rounded {
1539             my ( $self, $edge, $field_index ) = @_;
1540              
1541             my @params = ( $field_index, 2 );
1542             return $self->feature_cprob_rounded( $edge, \@params );
1543             }
1544              
1545             1;
1546              
1547             __END__
1548              
1549             =pod
1550              
1551             =for Pod::Coverage BUILD
1552              
1553             =encoding utf-8
1554              
1555             =head1 NAME
1556              
1557             Treex::Tool::Parser::MSTperl::FeaturesControl
1558              
1559             =head1 VERSION
1560              
1561             version 0.11949
1562              
1563             =head1 DESCRIPTION
1564              
1565             Controls the features used in the model.
1566              
1567             =head2 Features
1568              
1569             TODO: outdated, superceded by use of config file -> rewrite
1570              
1571             Each feature has a form C<code:value>. The code desribes the information which
1572             is relevant for the feature, and the value is the information retained from
1573             the dependency edge (and possibly other parts of the sentence
1574             (L<Treex::Tool::Parser::MSTperl::Sentence>) stored in C<sentence> field).
1575              
1576             For example, the feature C<L|l:být|pes> means that the lemma of the parent node
1577             (the governing word) is "být" and the lemma of its child node (the dependent
1578             node) is "pes".
1579              
1580             Each (proper) feature is composed of several simple features. In the
1581             aforementioned example, the simple feature codes were C<L> and C<l> and their
1582             values "být" and "pes", respectively. Each simple feature code is a string
1583             (case sensitive) and its value is also a string. The simple feature codes are
1584             joined together by the C<|> sign to form the code of the proper feature, and
1585             similarly, the simple feature values joined by C<|> form the proper feature
1586             value. Then, the proper feature code and value are joined together by C<:>.
1587             (Therefore, the codes and values of the simple features must not contain the
1588             C<|> and the C<:> signs.)
1589              
1590             By a naming convention,
1591             if the same simple feature can be computed for both the parent node and its
1592             child node, their codes are the same but for the case, which is upper for the
1593             parent and lower for the child. If this is not applicable, an uppercase
1594             code is used.
1595              
1596             For higher effectiveness the simple feature codes are translated to integers
1597             (see C<simple_feature_codes>).
1598              
1599             In reality the feature codes are translated to integers as well (see
1600             C<feature_codes>), but this is only an internal issue. You can see these
1601             numbers in the model file if you use the default L<Data::Dumper> format (see
1602             C<load> and C<store>). However, if you use the tsv format (see C<load_tsv>,
1603             C<store_tsv>), you will see the real string feature codes.
1604              
1605             Currently the following simple features are available. Any subset of them can
1606             be used to form a proper feature, but their order should follow their order of
1607             appearance in this list (still, this is only a cleanliness and readability
1608             thing, it does not affect the function of the parser in any way).
1609              
1610             =over 4
1611              
1612             =item Distance (D)
1613              
1614             Distance of the two nodes in the sentence, computed as order of the parent
1615             minus the order of the child. Eg. for the sentence "To je prima pes ." and the
1616             feature D computed on nodes "je" and "pes" (parent and child respectively),
1617             the order of "je" is 2 and the order of "pes" is 4, yielding the feature value
1618             of 2 - 4 = -2. This leads to a feature C<D:-2>.
1619              
1620             =item Form (F, f)
1621              
1622             The form of the node, i.e. the word exactly as it appears in the sentence text.
1623              
1624             Currently not used as it has not lead to any improvement in the parsing.
1625              
1626             =item Lemma (L, l)
1627              
1628             The morphological lemma of the node.
1629              
1630             =item preceding tag (S, s)
1631              
1632             The morphological tag (or POS tag if you like) of the node preceding (ord-wise)
1633             the node.
1634              
1635             =item Tag (T, t)
1636              
1637             The morphological tag of the node.
1638              
1639             =item following tag (U, u)
1640              
1641             The morphological tag of the node following (ord-wise) the node.
1642              
1643             =item between tag (B)
1644              
1645             The morphological tag of each node between (ord-wise) the parent node and the
1646             child node. This simple feature returns (a reference to) an array of values.
1647              
1648             =back
1649              
1650             Some of the simple features can return an empty string in case they are not
1651             applicable (eg. C<U> for the last node in the sentence), then the whole
1652             feature is not present for the edge.
1653              
1654             Some of the simple features return an array of values (eg. the C<B> simple
1655             feature). This can result in several instances of the feature with the same
1656             code for one edge to appear in the result.
1657              
1658             =head1 FIELDS
1659              
1660             =head2 Features
1661              
1662             TODO: slightly outdated
1663              
1664             The examples used here are consistent throughout this part of documentation,
1665             i.e. if several simple features are listed in C<simple_feature_codes> and
1666             then simple feature with index 9 is referred to in C<array_simple_features>,
1667             it really means the C<B> simple feature which is on the 9th position in
1668             C<simple_feature_codes>.
1669              
1670             =over 4
1671              
1672             =item feature_count (Int)
1673              
1674             Alias of C<scalar @{feature_codes}> (but the integer is really
1675             stored in the field for faster access).
1676              
1677             =item feature_codes (ArrayRef[Str])
1678              
1679             Codes of all features to be computed. Their
1680             indexes in this array are used to refer to them in the code. Eg.:
1681              
1682             feature_codes ( [( 'L|T', 'l|t', 'L|T|l|t', 'T|B|t')] )
1683              
1684             =item feature_codes_hash (HashRef[Str])
1685              
1686             1 for each feature code to easily check if a feature exists
1687              
1688             =item feature_indexes (HashRef[Str])
1689              
1690             Index of each feature code in feature_codes (for conversion of feature code to
1691             feature index)
1692              
1693             =item feature_simple_features_indexes (ArrayRef[ArrayRef[Int]])
1694              
1695             For each feature contains (a reference to) an array which contains all its
1696             simple feature indexes (corresponding to positions in C<simple_feature_codes>
1697             ). Eg. for the 4 features (0 to 3) listed in C<feature_codes> and the 10
1698             simple features listed in C<simple_feature_codes> (0 to 9):
1699              
1700             feature_simple_features_indexes ( [(
1701             [ (1, 5) ],
1702             [ (2, 6) ],
1703             [ (1, 5, 2, 6) ],
1704             [ (5, 9, 6) ],
1705             )] )
1706              
1707              
1708             =item array_features (HashRef)
1709              
1710             Indexes of features containing array simple features (see
1711             C<array_simple_features>). Eg.:
1712              
1713             array_features( { 3 => 1} )
1714              
1715             as the feature with index 3 (C<'T|B|t'>) contains the C<B> simple feature
1716             which is an array simple feature.
1717              
1718             =back
1719              
1720             =head2 Simple features
1721              
1722             =over 4
1723              
1724             =item simple_feature_count (Int)
1725              
1726             Alias of C<scalar @{simple_feature_codes}> (but the integer is really
1727             stored in the field for faster access).
1728              
1729             =item simple_feature_codes (ArrayRef[Str])
1730              
1731             Codes of all simple features to be computed. Their order is important as their
1732             indexes in this array are used to refer to them in the code, especially in the
1733             C<get_simple_feature> method. Eg.:
1734              
1735             simple_feature_codes ( [('D', 'L', 'l', 'S', 's', 'T', 't', 'U', 'u', 'B')])
1736              
1737             =item simple_feature_codes_hash (HashRef[Str])
1738              
1739             1 for each simple feature code to easily check if a simple feature exists
1740              
1741             =item simple_feature_indexes (HashRef[Str])
1742              
1743             Index of each simple feature code in simple_feature_codes (for conversion of
1744             simple feature code to simple feature index)
1745              
1746             =item simple_feature_sub_arguments (ArrayRef)
1747              
1748             For each simple feature (on the corresponsing index) contains the index of the
1749             field (in C<field_names>), which is used to compute the simple feature value
1750             (together with a subroutine from C<simple_feature_subs>).
1751              
1752             If the simple feature takes more than one argument (called a multiarg feature
1753             here), then instead of a single field index there is a reference to an array
1754             of field indexes.
1755              
1756             If the simple feature takes other arguments than fields (especially integers),
1757             then these arguments are stored here insted of field indexes.
1758              
1759             =item simple_feature_subs (ArrayRef)
1760              
1761             For faster run, the simple features are internally not represented by their
1762             string codes, which would have to be parsed repeatedly. Instead their codes
1763             are parsed once only (in C<set_simple_feature>) and they are represented as
1764             an integer index of the field which is used to compute the feature (it is the
1765             actual index of the field in the input file line, accessible through
1766             L<Treex::Tool::Parser::MSTperl::Node/fields>) and a reference to a subroutine
1767             (one of the C<feature_*> subs, see below) which computes the feature value
1768             based on the field index and the edge (L<Treex::Tool::Parser::MSTperl::Edge>).
1769             The references subroutine is then invoked in C<get_simple_feature_values_array>.
1770              
1771             =item array_simple_features (HashRef[Int])
1772              
1773             Indexes of simple features that return an array of values instead of a single
1774             string value. Eg.:
1775              
1776             array_simple_features( { 9 => 1} )
1777              
1778             because in the aforementioned example the C<B> simple feature returns an array
1779             of values and has the index C<9>.
1780              
1781              
1782             =back
1783              
1784             =head2 Other
1785              
1786             =over 4
1787              
1788             =item edge_features_cache (HashRef[ArrayRef[Str])
1789              
1790             If caching is turned on (see below), all features of any edge computed by the
1791             C<get_feature_simple_features_indexes> method are computed once only, stored
1792             in this cache and then retrieved when needed.
1793              
1794             The key of the hash is the edge signature (see
1795             L<Treex::Tool::Parser::MSTperl::Edge/signature>), the value is
1796             (a reference to) an array of fetures and their values.
1797              
1798             =back
1799              
1800             =head1 METHODS
1801              
1802             =head2 Settings
1803              
1804             The best source of information about all the possible settings is the
1805             configuration file itself (usually called C<config.txt>), as it is richly
1806             commented and accompanied by real examples at the same time.
1807              
1808             =over 4
1809              
1810             =item my $featuresControl =
1811             Treex::Tool::Parser::MSTperl::FeaturesControl->new(
1812             'config' => $config,
1813             'feature_codes_from_config' => $feature_codes_array_reference,
1814             'use_edge_features_cache' => $use_edge_features_cache,
1815             )
1816              
1817             Parses feature codes and creates their in-memory representations.
1818              
1819             =item set_feature ($feature_code)
1820              
1821             Parses the feature code and (if no errors are encountered) creates its
1822             representation in the fields of this package (all C<feature_>* fields and
1823             possibly also the C<array_features> field).
1824              
1825             =item set_simple_feature ($simple_feature_code)
1826              
1827             Parses the simple feature code and creates its representation in the fields of
1828             this package (all C<simple_feature_>* fields and possibly also the
1829             C<array_simple_features> field).
1830              
1831             =back
1832              
1833             =head2 Computing (proper) features
1834              
1835             =over 4
1836              
1837             =item my $features_array_rf = $model->get_all_features($edge)
1838              
1839             Returns (a reference to) an array which contains all features of the edge
1840             (according to settings).
1841              
1842             If caching is turned on, tries to look the features up in the cache before
1843             computing them. If they are not cached yet, they are computed and stored into
1844             the cache.
1845              
1846             The value of a feature is computed by C<get_feature_value>. Values of simple
1847             features are precomputed (by calling C<get_simple_feature_values_array>) and
1848             passed to the C<get_feature_value> method.
1849              
1850             =item my $feature_value = get_feature_value(3, $simple_feature_values)
1851              
1852             Returns the value of the feature with the given index.
1853              
1854             If it is an array feature (see C<array_features>), its value is (a reference
1855             to) an array of all (string) values of the feature (a reference to an empty
1856             array if there are no values).
1857              
1858             If it is not an array feature, its value is composed from the simple feature
1859             values. If some of the simple features do not have a value defined, an empty
1860             string (C<''>) is returned.
1861              
1862             =item my $feature_value = get_array_feature_value ($simple_features_indexes,
1863             $simple_feature_values, $start_from)
1864              
1865             Recursively calls itself to compose an array of all values of the feature
1866             (composed of the simple features given in C<$simple_features_indexes> array
1867             reference), which is a cartesian product on all values of the simple features.
1868             The C<$start_from> variable should be C<0> when this method is called and is
1869             incremented in the recursive calls.
1870              
1871             =back
1872              
1873             =head2 Computing simple features
1874              
1875             =over 4
1876              
1877             =item my $simple_feature_values = get_simple_feature_values_array($edge)
1878              
1879             Returns (a reference to) an array of values of all simple features (see
1880             C<simple_feature_codes>). For each simple feature, its value can be found
1881             on the position in the returned array corresponding to its position in
1882             C<simple_feature_codes>.
1883              
1884             =item my $sub = get_simple_feature_sub_reference ('distance')
1885              
1886             Translates the feature funtion string name (eg. C<distance>) to its reference
1887             (eg. C<\&feature_distance>).
1888              
1889             =item my $value = get_simple_feature_value ($edge, 9)
1890              
1891             Returns the value of the simple feature with the given index by calling an
1892             appropriate C<feature_*> method on the edge
1893             (see L<Treex::Tool::Parser::MSTperl::Edge>). If
1894             the feature cannot be computed, an empty string (C<''>) is returned (or a
1895             reference to an empty array for array simple features - see
1896             C<array_simple_features>).
1897              
1898             =item feature_distance
1899              
1900             =item feature_child
1901              
1902             =item feature_parent
1903              
1904             =item feature_first
1905              
1906             =item feature_second
1907              
1908             =item feature_preceding_child
1909              
1910             =item feature_preceding_parent
1911              
1912             =item feature_following_child
1913              
1914             =item feature_following_parent
1915              
1916             =item feature_preceding_first
1917              
1918             =item feature_preceding_second
1919              
1920             =item feature_following_first
1921              
1922             =item feature_following_second
1923              
1924             =item feature_between
1925              
1926             =item feature_foreach
1927              
1928             =item feature_equals, feature_equals_pc, feature_equals_pc_at
1929              
1930             A simple feature function C<equals(field_1,field_2)>
1931             with "at least once" semantics for multiple values
1932             (there can be multiple alignments)
1933             with a special output value if one of the fields is unknown
1934             (maybe it suffices to emmit an undef, as this would occur iff at least
1935             one of the arguments is undef; but maybe not and eg. "-1" should be given)
1936              
1937             This makes it possible to have a simple feature which behaves like this:
1938              
1939             =over 4
1940              
1941             =item returns 1 if the edge between child and parent is also present in the
1942             English tree
1943              
1944             =item returns 0 if not
1945              
1946             =item returns -1 if cannot decide (alignment info is missing for some of the
1947             nodes)
1948              
1949             =back
1950              
1951             Because if the parser has (the ord of the en child node and)
1952             the ord of en child's parent and the ord of the en parent node
1953             (and the ord of the en parent's parent), the feature can check whether
1954             en_parent->ord = en_child->parentOrd
1955              
1956             C<equalspc(en->ord, en->parent->ord)>
1957              
1958             =back
1959              
1960             =head1 AUTHORS
1961              
1962             Rudolf Rosa <rosa@ufal.mff.cuni.cz>
1963              
1964             =head1 COPYRIGHT AND LICENSE
1965              
1966             Copyright © 2011 by Institute of Formal and Applied Linguistics,
1967             Charles University in Prague
1968              
1969             This module is free software;
1970             you can redistribute it and/or modify it under the same terms as Perl itself.