File Coverage

blib/lib/Treex/Tool/Parser/MSTperl/Sentence.pm
Criterion Covered Total %
statement 1 3 33.3
branch n/a
condition n/a
subroutine 1 1 100.0
pod n/a
total 2 4 50.0


line stmt bran cond sub pod time code
1             package Treex::Tool::Parser::MSTperl::Sentence;
2             {
3             $Treex::Tool::Parser::MSTperl::Sentence::VERSION = '0.11949';
4             }
5              
6 1     1   2216 use Moose;
  0            
  0            
7              
8             use Treex::Tool::Parser::MSTperl::Node;
9             use Treex::Tool::Parser::MSTperl::RootNode;
10              
11             has config => (
12             isa => 'Treex::Tool::Parser::MSTperl::Config',
13             is => 'ro',
14             required => '1',
15             );
16              
17             # unique for each sentence, where sentence means sequence of words
18             # (i.e. stays the same for copies of the same sentence)
19             # needed for caching of features when training the parser
20             # (can be undef if not needed)
21             has id => (
22             is => 'ro',
23             isa => 'Maybe[Int]',
24             );
25              
26             has nodes => (
27             is => 'rw',
28             isa => 'ArrayRef[Treex::Tool::Parser::MSTperl::Node]',
29             required => 1,
30             );
31              
32             # root node added
33             has nodes_with_root => (
34             is => 'rw',
35             isa => 'ArrayRef[Treex::Tool::Parser::MSTperl::Node]',
36             );
37              
38             # used only in unlabelled parsing
39             has features => (
40             is => 'rw',
41             isa => 'Maybe[ArrayRef[Str]]',
42             );
43              
44             # TODO
45             # has betweenFeatureValues => (
46             # isa => 'HashRef',
47             # is => 'rw',
48             # default => sub { {} },
49             # );
50              
51             has edges => (
52             is => 'rw',
53             isa => 'Maybe[ArrayRef[Treex::Tool::Parser::MSTperl::Edge]]',
54             );
55              
56             sub BUILD {
57             my ($self) = @_;
58              
59             #add root
60             my $root = Treex::Tool::Parser::MSTperl::RootNode->new(
61             fields => $self->config->root_field_values,
62             config => $self->config
63             );
64             my @nodes_with_root;
65             push @nodes_with_root, $root;
66             push @nodes_with_root, @{ $self->nodes };
67             $self->nodes_with_root( [@nodes_with_root] );
68              
69             # fill node ords
70             my $ord = 1;
71             foreach my $node ( @{ $self->nodes } ) {
72             $node->ord($ord);
73             $ord++;
74             }
75              
76             return;
77             }
78              
79             sub fill_fields_after_parse {
80              
81             my ($self) = @_;
82              
83             #compute edges
84             $self->compute_edges();
85              
86             #compute features
87             $self->compute_features( $self->config->unlabelledFeaturesControl );
88              
89             return;
90             }
91              
92             sub fill_fields_before_labelling {
93              
94             my ($self) = @_;
95              
96             if ( $self->config->DEBUG >= 3 ) {
97             print $self->id . " fill_fields_before_labelling()\n";
98             }
99              
100             #compute edges
101             $self->compute_edges();
102              
103             #compute features
104             $self->compute_features( $self->config->labelledFeaturesControl );
105              
106             return;
107             }
108              
109             # sub fill_fields_after_labelling {
110             #
111             # my ($self) = @_;
112             #
113             # seems there is nothing to compute here
114             # (provided that the labels are somewhat extra,
115             # i.e. not part of the feature values)
116             #
117             # return;
118             # }
119              
120             # compute node parents and the array of edges
121             # used both in fill_fields_after_parse and fill_fields_before_labelling methods
122             sub compute_edges {
123             my ($self) = @_;
124              
125             my @edges;
126             foreach my $node ( @{ $self->nodes } ) {
127              
128             # fill node parent
129             # (it can be set either in parent or in parentOrd field)
130             if ( $node->parent ) {
131             $node->parentOrd( $node->parent->ord );
132             } else { # $node->parentOrd
133             $node->parent( $self->getNodeByOrd( $node->parentOrd ) );
134             }
135              
136             if ( $self->config->DEBUG >= 3 ) {
137             print $node->parentOrd
138             . '(' . $node->parent->fields->[1] . ')'
139             . ' -> '
140             . $node->ord
141             . '(' . $node->fields->[1] . ')'
142             . "\n";
143             }
144              
145             # add a new edge
146             my $edge = Treex::Tool::Parser::MSTperl::Edge->new(
147             child => $node,
148             parent => $node->parent,
149             sentence => $self
150             );
151             push @edges, $edge;
152              
153             # add edge to the parent's list of children
154             push @{ $node->parent->children }, $edge;
155              
156             }
157             $self->edges( [@edges] );
158              
159             return;
160             }
161              
162             # compute edge features and join them into sentence features
163             sub compute_features {
164              
165             my ( $self, $featuresControl ) = @_;
166              
167             my @features;
168             foreach my $edge ( @{ $self->edges } ) {
169             my $edge_features;
170             my $ALGORITHM = $self->config->labeller_algorithm;
171             if ( $ALGORITHM < 20 ) {
172             $edge_features = $featuresControl->get_all_features($edge);
173             } else {
174             $edge_features = $featuresControl->get_all_features( $edge, -1 );
175             }
176             $edge->features($edge_features);
177             push @features, @{$edge_features};
178             }
179              
180             # (TODO) used only in unlabelled parsing
181             $self->features( [@features] );
182              
183             return;
184             }
185              
186             sub clear_parse {
187             my ($self) = @_;
188              
189             #clear node parents and labels
190             foreach my $node ( @{ $self->nodes } ) {
191             $node->parent(undef);
192             $node->parentOrd(0);
193             $node->label('_');
194             }
195              
196             #clear edges
197             $self->edges(undef);
198              
199             #clear features
200             $self->features(undef);
201              
202             return;
203             }
204              
205             sub copy_nonparsed {
206             my ($self) = @_;
207              
208             #copy nodes
209             my @nodes;
210             foreach my $node ( @{ $self->nodes } ) {
211             my $node_copy = $node->copy_nonparsed();
212             push @nodes, $node_copy;
213             }
214              
215             #create a new instance
216             my $copy = Treex::Tool::Parser::MSTperl::Sentence->new(
217              
218             # TODO: maybe should get a different ID for the sake of labelling
219             # (but this is curently not used anyway)
220             id => $self->id,
221             nodes => [@nodes],
222             config => $self->config,
223             );
224              
225             return $copy;
226             }
227              
228             sub copy_nonlabelled {
229             my ($self) = @_;
230              
231             #copy nodes
232             my @nodes;
233             foreach my $node ( @{ $self->nodes } ) {
234             my $node_copy = $node->copy_nonlabelled();
235             push @nodes, $node_copy;
236             }
237              
238             #create a new instance
239             my $copy = Treex::Tool::Parser::MSTperl::Sentence->new(
240              
241             # TODO: maybe should get a different ID for the sake of labelling
242             # (but this is curently not used anyway)
243             id => $self->id,
244             nodes => [@nodes],
245             config => $self->config,
246             );
247              
248             return $copy;
249             }
250              
251             sub setChildParent {
252              
253             # (Int $childOrd, Int $parentOrd)
254             my ( $self, $childOrd, $parentOrd ) = @_;
255              
256             my $child = $self->getNodeByOrd($childOrd);
257             my $parent = $self->getNodeByOrd($parentOrd);
258              
259             $child->parent($parent);
260             $child->parentOrd($parentOrd);
261              
262             return;
263             }
264              
265             sub len {
266             my ($self) = @_;
267             return scalar( @{ $self->nodes } )
268             }
269              
270             sub getNodeByOrd {
271              
272             # (Int $ord)
273             my ( $self, $ord ) = @_;
274              
275             if ( $ord >= 0 && $ord <= $self->len() ) {
276             return $self->nodes_with_root->[$ord];
277             } else {
278             return; # undef
279             }
280             }
281              
282             sub count_errors_attachement {
283              
284             # (Treex::Tool::Parser::MSTperl::Sentence $correct_sentence)
285             my ( $self, $correct_sentence ) = @_;
286              
287             my $errors = 0;
288              
289             #assert that nodes in the sentences with the same ords
290             # are corresponding nodes
291             foreach my $my_node ( @{ $self->nodes } ) {
292             my $my_parent = $my_node->parentOrd;
293             my $correct_node = $correct_sentence->getNodeByOrd( $my_node->ord );
294             my $correct_parent = $correct_node->parentOrd;
295             if ( $my_parent != $correct_parent ) {
296             if ( $self->config->lossFunction ) {
297             $errors +=
298             $self->attachement_error(
299             $my_node, $my_node->parent, $correct_node->parent
300             );
301             }
302             else {
303             $errors++;
304             }
305             }
306             }
307              
308             return $errors;
309             }
310              
311             sub attachement_error {
312             my ( $self, $node, $assignedParent, $correctParent ) = @_;
313              
314             # TODO how do the undefines happen?
315             # they only seem to occur during testing, not during training
316             return 1 if ( !defined $assignedParent || !defined $correctParent );
317              
318             my $error = 1;
319              
320             my $lossFunction = $self->config->lossFunction;
321              
322             if ( $lossFunction eq 'J' ) {
323             if ( defined $correctParent ) {
324             if ( $correctParent->fields->[4] =~ /^J/ ) {
325             $error = 10;
326             }
327             }
328             }
329             elsif ( $lossFunction eq 'A' ) {
330             if ( $node->fields->[4] =~ /^A/ ) {
331             $error = 10;
332             }
333             }
334             elsif ( $lossFunction eq 'NA' ) {
335             if ( defined $correctParent ) {
336             if ($node->fields->[4] =~ /^A/
337             && $correctParent->fields->[4] =~ /^N/
338             )
339             {
340             $error = 10;
341             }
342             }
343             }
344             elsif ( $lossFunction eq 'NA2' ) {
345              
346             # if the child is A
347             if ( $node->fields->[4] =~ /^A/ ) {
348              
349             # and the assigned or correct parent is N
350             if ( $correctParent->fields->[4] =~ /^N/ || $assignedParent->fields->[4] =~ /^N/ ) {
351             $error = 10;
352             }
353             }
354             }
355             elsif ( $lossFunction eq 'JNA' ) {
356             if ( defined $correctParent ) {
357             if ($node->fields->[4] =~ /^A/
358             &&
359             (
360             $correctParent->fields->[4] =~ /^N/
361             || $correctParent->fields->[4] =~ /^J/
362             )
363             )
364             {
365             $error = 10;
366             }
367             }
368             }
369             elsif ( $lossFunction eq 'NR2' ) {
370              
371             # if the child is N
372             if ( $node->fields->[4] =~ /^N/ ) {
373              
374             # and the assigned or correct parent is R
375             if ( $correctParent->fields->[4] =~ /^R/ || $assignedParent->fields->[4] =~ /^R/ ) {
376             $error = 10;
377             }
378             }
379             }
380             elsif ( $lossFunction eq 'JNR2' ) {
381              
382             # if the child is N or J
383             if ( $node->fields->[4] =~ /^[NJ]/ ) {
384              
385             # and the assigned or correct parent is R
386             if ( $correctParent->fields->[4] =~ /^R/ || $assignedParent->fields->[4] =~ /^R/ ) {
387             $error = 10;
388             }
389             }
390             }
391             elsif ( $lossFunction eq 'J2' ) {
392              
393             # if the assigned or correct parent is J
394             if ( $correctParent->fields->[4] =~ /^J/ || $assignedParent->fields->[4] =~ /^J/ ) {
395             $error = 10;
396             }
397             }
398              
399             return $error;
400             }
401              
402             sub count_errors_labelling {
403              
404             # (Treex::Tool::Parser::MSTperl::Sentence $correct_sentence)
405             my ( $self, $correct_sentence ) = @_;
406              
407             my $errors = 0;
408              
409             my @correct_labels =
410             map { $_->label } @{ $correct_sentence->nodes };
411             my @my_labels =
412             map { $_->label } @{ $self->nodes };
413             for ( my $i = 0; $i < @correct_labels; $i++ ) {
414             if ( $correct_labels[$i] ne $my_labels[$i] ) {
415             $errors++;
416             }
417             }
418              
419             return $errors;
420             }
421              
422             sub count_errors_attachement_and_labelling {
423              
424             # (Treex::Tool::Parser::MSTperl::Sentence $correct_sentence)
425             my ( $self, $correct_sentence ) = @_;
426              
427             my $errors = 0;
428              
429             #assert that nodes in the sentences with the same ords
430             # are corresponding nodes
431             foreach my $my_node ( @{ $self->nodes } ) {
432             my $my_parent = $my_node->parentOrd;
433             my $my_label = $my_node->label;
434             my $correct_node = $correct_sentence->getNodeByOrd( $my_node->ord );
435             my $correct_parent = $correct_node->parentOrd;
436             my $correct_label = $correct_node->label;
437             if ( $my_parent != $correct_parent || $my_label ne $correct_label ) {
438             $errors++;
439             }
440             }
441              
442             return $errors;
443             }
444              
445             sub toParentOrdsArray {
446             my ($self) = @_;
447              
448             my @parents;
449             foreach my $node ( @{ $self->nodes } ) {
450             push @parents, $node->parentOrd;
451             }
452              
453             return [@parents];
454             }
455              
456             sub toLabelsArray {
457             my ($self) = @_;
458              
459             my @labels;
460             foreach my $node ( @{ $self->nodes } ) {
461             push @labels, $node->label;
462             }
463              
464             return [@labels];
465             }
466              
467             1;
468              
469             __END__
470              
471             =pod
472              
473             =for Pod::Coverage BUILD
474              
475             =encoding utf-8
476              
477             =head1 NAME
478              
479             Treex::Tool::Parser::MSTperl::Sentence
480              
481             =head1 VERSION
482              
483             version 0.11949
484              
485             =head1 DESCRIPTION
486              
487             Represents a sentence, both parsed an unparsed.
488             Contains an array of nodes which represent the words in the sentence.
489              
490             The nodes are ordered, their C<ord> is their 1-based position in the sentence.
491             The C<0 ord> value is reserved for the (technical) sentence root.
492              
493             =head1 FIELDS
494              
495             =over 4
496              
497             =item id (Int)
498              
499             An integer id unique for each sentence (in its proper sense, where sentence
500             is a sequence of tokens - i.e. C<id> stays the same for copies of the same
501             sentence).
502              
503             =item nodes (ArrayRef[Treex::Tool::Parser::MSTperl::Node])
504              
505             (A reference to) an array of nodes (C<Treex::Tool::Parser::MSTperl::Node>) of
506             the sentence.
507              
508             A node represents both a token of the sentence (usually this is a word) and a
509             node in the parse tree of the sentence as well (if the sentence have been
510             parsed).
511              
512             =item nodes_with_root (ArrayRef[Treex::Tool::Parser::MSTperl::Node])
513              
514             Copy of C<nodes> field with a root node
515             (L<Treex::Tool::Parser::MSTperl::RootNode>) added at the beginning. As the
516             root node's C<ord> is C<0> by definition, the position of the nodes in this
517             array exactly corresponds to its C<ord>.
518              
519             =item edges (Maybe[ArrayRef[Treex::Tool::Parser::MSTperl::Edge]])
520              
521             If the sentence is parsed (i.e. the nodes know their parents), this field
522             contains (a reference to) an array of all edges
523             (L<Treex::Tool::Parser::MSTperl::Edge>) in the parse tree of the sentence.
524              
525             This field is set by the C<sub> C<fill_fields_after_parse>.
526              
527             If the sentence is not parsed, this field is C<undef>.
528              
529             =item features (Maybe[ArrayRef[Str]])
530              
531             If the sentence is parsed, this field
532             contains (a reference to) an array of all features of all edges in the parse
533             tree of the sentence. If some of the features are repeated in the sentence
534             (i.e. they are present in severeal edges or even repeated in one edge), they
535             are repeated here as well, i.e. this is not a set in mathematical sense but a
536             (generally unordered) list.
537              
538             This field is set by the C<sub> C<fill_fields_after_parse>.
539              
540             If the sentence is not parsed, this field is C<undef>.
541              
542             =back
543              
544             =head1 METHODS
545              
546             =head2 Constructor
547              
548             =over 4
549              
550             =item my $sentence = Treex::Tool::Parser::MSTperl::Sentence->new(
551             id => 12, nodes => [$node1, $node2, $node3, ...]);
552              
553             Creates a new sentence. The C<id> must be unique (but copies of the same
554             sentence are to share the same id). It is used for edge signature generation
555             (L<Treex::Tool::Parser::MSTperl::Edge/signature>) in edge features caching (and
556             therefore does not have to be set if caching is disabled).
557              
558             The order of the nodes denotes their order in the sentence, starting from the
559             node with C<ord> 1, i.e. the technical root
560             (L<Treex::Tool::Parser::MSTperl::RootNode>) is not to be included as it is
561             generated automatically in the constructor.
562             The C<ord>s of the nodes (L<Treex::Tool::Parser::MSTperl::Node/ord>) do not
563             have to (and actually shouldn't) be filled in. If they are, they are checked
564             and a warning on STDERR is issued if they do not correspond to the position of
565             the nodes in the array. If they are not, they are filled in automatically
566             during the sentence creation.
567              
568             Other fields (C<nodes_with_root>, C<edges> and C<features>) should usually not
569             be set. C<nodes_with_root> are set automatically during sentence creation (and
570             any value set to it is discarded). C<edges> and C<features> are to be set only
571             if the sentence is parsed (i.e. the nodes know their parents, see
572             L<Treex::Tool::Parser::MSTperl::Node/parent> and
573             L<Treex::Tool::Parser::MSTperl::Node/parentOrd>) by calling the
574             C<fill_fields_after_parse> method.
575              
576             So, if the sentence is already parsed, you should call the
577             C<fill_fields_after_parse> method immediately after creaion of the sentence.
578              
579             =item my $unparsed_sentence_copy = $sentence->copy_nonparsed();
580              
581             Creates a new instance of the same sentence with the same C<id> and with
582             copies of the nodes but without any parsing information (like after calling
583             C<clear_parse>). The nodes are copied by calling
584             L<Treex::Tool::Parser::MSTperl::Node/copy_nonparsed>.
585              
586             =back
587              
588             =head2 Action methods
589              
590             =over 4
591              
592             =item $sentence->setChildParent(5, 3)
593              
594             Sets the parent of the node with the first C<ord> to be the node with the second
595             C<ord> - eg. here, the 3rd node is the parent of the 5th node.
596             It only sets the C<parent> and C<parentOrd> fields in the child node
597             (i.e. it does not create or modify any edges).
598              
599             When all nodes' parents have been set, C<fill_fields_after_parse> can be called.
600              
601             =item $sentence->fill_fields_after_parse()
602              
603             Fills the fields of the sentence and fields of its nodes which can be filled
604             only for a sentence that has already been parsed (i.e. if the nodes' C<parent>
605             or C<parentOrd> fields are filled).
606              
607             The fields which are filled by this subroutine are C<edges> and C<features>
608             for the sentence and C<parent> or C<parentOrd> for each of the sentence nodes
609             which do not have the field set.
610              
611             =item $sentence->clear_parse()
612              
613             Is kind of an inversion of the C<fill_fields_after_parse> method. It clears
614             the C<edges> and C<features> fields and also unsets the parents of all nodes
615             (by setting their C<parent> field to C<undef> and C<parentOrd> to C<0>).
616              
617             =back
618              
619             =head2 Information methods
620              
621             =over 4
622              
623             =item $sentence->len()
624              
625             Returns length of the sentence, i.e. number of nodes in the sentence.
626             Each node corresponds to one word (one token to be more precise).
627              
628             =item $sentence->count_errors_attachement($correct_sentence)
629              
630             Compares the parse tree of the sentence with its correct parse tree,
631             represented by an instance of the same sentence containing its correct parse.
632              
633             An error is considered to be an incorrectly assigned governing node. So, the
634             parents of all nodes (obviously not including the root node) are compared and
635             if they are different, it is counted as an error. This leads to a minimum
636             number of errors equal to 0 and maximum number equal to the length of the
637             sentence.
638              
639             =item $sentence->count_errors_labelling($correct_sentence)
640              
641             Compares the labelling of the sentence with its correct labelling,
642             represented by an instance of the same sentence containing the correct labels.
643              
644             An error is considered to be an incorrectly assigned label. So, the
645             labels of all edges (technically stored in the child nodes) are compared and
646             if they are different, it is counted as an error. This leads to a minimum
647             number of errors equal to 0 and maximum number equal to the length of the
648             sentence.
649              
650             =item $sentence->getNodeByOrd(6)
651              
652             Returns the node with this C<ord> (it can also be the root node if the C<ord>
653             is 0) or C<undef> if the C<ord> is out of range.
654              
655             =item $sentence->toString()
656              
657             Returns forms of the nodes joined by spaces (i.e. the sentence as a text but
658             with a space between each two adjacent tokens).
659              
660             =item $sentence->toParentOrdsArray()
661              
662             Returns (a reference to) an array of node parent ords, i.e. for the sentence
663             "Tom is big", where "is" is a child of the root node and "Tom" and "big" are
664             children of "is", this method returns C<[2, 0, 2]>.
665              
666             =back
667              
668             =head1 AUTHORS
669              
670             Rudolf Rosa <rosa@ufal.mff.cuni.cz>
671              
672             =head1 COPYRIGHT AND LICENSE
673              
674             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles
675             University in Prague
676              
677             This module is free software; you can redistribute it and/or modify it under
678             the same terms as Perl itself.