File Coverage

blib/lib/Treex/Tool/Parser/MSTperl/Sentence.pm

Criterion	Covered	Total	%
statement	1	3	33.3
branch			n/a
condition			n/a
subroutine	1	1	100.0
pod			n/a
total	2	4	50.0

line	stmt	sub	time	code
1				package Treex::Tool::Parser::MSTperl::Sentence;
2				{
3				$Treex::Tool::Parser::MSTperl::Sentence::VERSION = '0.11949';
4				}
5
6	1	1	2216	use Moose;
	0
	0
7
8				use Treex::Tool::Parser::MSTperl::Node;
9				use Treex::Tool::Parser::MSTperl::RootNode;
10
11				has config => (
12				isa => 'Treex::Tool::Parser::MSTperl::Config',
13				is => 'ro',
14				required => '1',
15				);
16
17				# unique for each sentence, where sentence means sequence of words
18				# (i.e. stays the same for copies of the same sentence)
19				# needed for caching of features when training the parser
20				# (can be undef if not needed)
21				has id => (
22				is => 'ro',
23				isa => 'Maybe[Int]',
24				);
25
26				has nodes => (
27				is => 'rw',
28				isa => 'ArrayRef[Treex::Tool::Parser::MSTperl::Node]',
29				required => 1,
30				);
31
32				# root node added
33				has nodes_with_root => (
34				is => 'rw',
35				isa => 'ArrayRef[Treex::Tool::Parser::MSTperl::Node]',
36				);
37
38				# used only in unlabelled parsing
39				has features => (
40				is => 'rw',
41				isa => 'Maybe[ArrayRef[Str]]',
42				);
43
44				# TODO
45				# has betweenFeatureValues => (
46				# isa => 'HashRef',
47				# is => 'rw',
48				# default => sub { {} },
49				# );
50
51				has edges => (
52				is => 'rw',
53				isa => 'Maybe[ArrayRef[Treex::Tool::Parser::MSTperl::Edge]]',
54				);
55
56				sub BUILD {
57				my ($self) = @_;
58
59				#add root
60				my $root = Treex::Tool::Parser::MSTperl::RootNode->new(
61				fields => $self->config->root_field_values,
62				config => $self->config
63				);
64				my @nodes_with_root;
65				push @nodes_with_root, $root;
66				push @nodes_with_root, @{ $self->nodes };
67				$self->nodes_with_root( [@nodes_with_root] );
68
69				# fill node ords
70				my $ord = 1;
71				foreach my $node ( @{ $self->nodes } ) {
72				$node->ord($ord);
73				$ord++;
74				}
75
76				return;
77				}
78
79				sub fill_fields_after_parse {
80
81				my ($self) = @_;
82
83				#compute edges
84				$self->compute_edges();
85
86				#compute features
87				$self->compute_features( $self->config->unlabelledFeaturesControl );
88
89				return;
90				}
91
92				sub fill_fields_before_labelling {
93
94				my ($self) = @_;
95
96				if ( $self->config->DEBUG >= 3 ) {
97				print $self->id . " fill_fields_before_labelling()\n";
98				}
99
100				#compute edges
101				$self->compute_edges();
102
103				#compute features
104				$self->compute_features( $self->config->labelledFeaturesControl );
105
106				return;
107				}
108
109				# sub fill_fields_after_labelling {
110				#
111				# my ($self) = @_;
112				#
113				# seems there is nothing to compute here
114				# (provided that the labels are somewhat extra,
115				# i.e. not part of the feature values)
116				#
117				# return;
118				# }
119
120				# compute node parents and the array of edges
121				# used both in fill_fields_after_parse and fill_fields_before_labelling methods
122				sub compute_edges {
123				my ($self) = @_;
124
125				my @edges;
126				foreach my $node ( @{ $self->nodes } ) {
127
128				# fill node parent
129				# (it can be set either in parent or in parentOrd field)
130				if ( $node->parent ) {
131				$node->parentOrd( $node->parent->ord );
132				} else { # $node->parentOrd
133				$node->parent( $self->getNodeByOrd( $node->parentOrd ) );
134				}
135
136				if ( $self->config->DEBUG >= 3 ) {
137				print $node->parentOrd
138				. '(' . $node->parent->fields->[1] . ')'
139				. ' -> '
140				. $node->ord
141				. '(' . $node->fields->[1] . ')'
142				. "\n";
143				}
144
145				# add a new edge
146				my $edge = Treex::Tool::Parser::MSTperl::Edge->new(
147				child => $node,
148				parent => $node->parent,
149				sentence => $self
150				);
151				push @edges, $edge;
152
153				# add edge to the parent's list of children
154				push @{ $node->parent->children }, $edge;
155
156				}
157				$self->edges( [@edges] );
158
159				return;
160				}
161
162				# compute edge features and join them into sentence features
163				sub compute_features {
164
165				my ( $self, $featuresControl ) = @_;
166
167				my @features;
168				foreach my $edge ( @{ $self->edges } ) {
169				my $edge_features;
170				my $ALGORITHM = $self->config->labeller_algorithm;
171				if ( $ALGORITHM < 20 ) {
172				$edge_features = $featuresControl->get_all_features($edge);
173				} else {
174				$edge_features = $featuresControl->get_all_features( $edge, -1 );
175				}
176				$edge->features($edge_features);
177				push @features, @{$edge_features};
178				}
179
180				# (TODO) used only in unlabelled parsing
181				$self->features( [@features] );
182
183				return;
184				}
185
186				sub clear_parse {
187				my ($self) = @_;
188
189				#clear node parents and labels
190				foreach my $node ( @{ $self->nodes } ) {
191				$node->parent(undef);
192				$node->parentOrd(0);
193				$node->label('_');
194				}
195
196				#clear edges
197				$self->edges(undef);
198
199				#clear features
200				$self->features(undef);
201
202				return;
203				}
204
205				sub copy_nonparsed {
206				my ($self) = @_;
207
208				#copy nodes
209				my @nodes;
210				foreach my $node ( @{ $self->nodes } ) {
211				my $node_copy = $node->copy_nonparsed();
212				push @nodes, $node_copy;
213				}
214
215				#create a new instance
216				my $copy = Treex::Tool::Parser::MSTperl::Sentence->new(
217
218				# TODO: maybe should get a different ID for the sake of labelling
219				# (but this is curently not used anyway)
220				id => $self->id,
221				nodes => [@nodes],
222				config => $self->config,
223				);
224
225				return $copy;
226				}
227
228				sub copy_nonlabelled {
229				my ($self) = @_;
230
231				#copy nodes
232				my @nodes;
233				foreach my $node ( @{ $self->nodes } ) {
234				my $node_copy = $node->copy_nonlabelled();
235				push @nodes, $node_copy;
236				}
237
238				#create a new instance
239				my $copy = Treex::Tool::Parser::MSTperl::Sentence->new(
240
241				# TODO: maybe should get a different ID for the sake of labelling
242				# (but this is curently not used anyway)
243				id => $self->id,
244				nodes => [@nodes],
245				config => $self->config,
246				);
247
248				return $copy;
249				}
250
251				sub setChildParent {
252
253				# (Int $childOrd, Int $parentOrd)
254				my ( $self, $childOrd, $parentOrd ) = @_;
255
256				my $child = $self->getNodeByOrd($childOrd);
257				my $parent = $self->getNodeByOrd($parentOrd);
258
259				$child->parent($parent);
260				$child->parentOrd($parentOrd);
261
262				return;
263				}
264
265				sub len {
266				my ($self) = @_;
267				return scalar( @{ $self->nodes } )
268				}
269
270				sub getNodeByOrd {
271
272				# (Int $ord)
273				my ( $self, $ord ) = @_;
274
275				if ( $ord >= 0 && $ord <= $self->len() ) {
276				return $self->nodes_with_root->[$ord];
277				} else {
278				return; # undef
279				}
280				}
281
282				sub count_errors_attachement {
283
284				# (Treex::Tool::Parser::MSTperl::Sentence $correct_sentence)
285				my ( $self, $correct_sentence ) = @_;
286
287				my $errors = 0;
288
289				#assert that nodes in the sentences with the same ords
290				# are corresponding nodes
291				foreach my $my_node ( @{ $self->nodes } ) {
292				my $my_parent = $my_node->parentOrd;
293				my $correct_node = $correct_sentence->getNodeByOrd( $my_node->ord );
294				my $correct_parent = $correct_node->parentOrd;
295				if ( $my_parent != $correct_parent ) {
296				if ( $self->config->lossFunction ) {
297				$errors +=
298				$self->attachement_error(
299				$my_node, $my_node->parent, $correct_node->parent
300				);
301				}
302				else {
303				$errors++;
304				}
305				}
306				}
307
308				return $errors;
309				}
310
311				sub attachement_error {
312				my ( $self, $node, $assignedParent, $correctParent ) = @_;
313
314				# TODO how do the undefines happen?
315				# they only seem to occur during testing, not during training
316				return 1 if ( !defined $assignedParent \|\| !defined $correctParent );
317
318				my $error = 1;
319
320				my $lossFunction = $self->config->lossFunction;
321
322				if ( $lossFunction eq 'J' ) {
323				if ( defined $correctParent ) {
324				if ( $correctParent->fields->[4] =~ /^J/ ) {
325				$error = 10;
326				}
327				}
328				}
329				elsif ( $lossFunction eq 'A' ) {
330				if ( $node->fields->[4] =~ /^A/ ) {
331				$error = 10;
332				}
333				}
334				elsif ( $lossFunction eq 'NA' ) {
335				if ( defined $correctParent ) {
336				if ($node->fields->[4] =~ /^A/
337				&& $correctParent->fields->[4] =~ /^N/
338				)
339				{
340				$error = 10;
341				}
342				}
343				}
344				elsif ( $lossFunction eq 'NA2' ) {
345
346				# if the child is A
347				if ( $node->fields->[4] =~ /^A/ ) {
348
349				# and the assigned or correct parent is N
350				if ( $correctParent->fields->[4] =~ /^N/ \|\| $assignedParent->fields->[4] =~ /^N/ ) {
351				$error = 10;
352				}
353				}
354				}
355				elsif ( $lossFunction eq 'JNA' ) {
356				if ( defined $correctParent ) {
357				if ($node->fields->[4] =~ /^A/
358				&&
359				(
360				$correctParent->fields->[4] =~ /^N/
361				\|\| $correctParent->fields->[4] =~ /^J/
362				)
363				)
364				{
365				$error = 10;
366				}
367				}
368				}
369				elsif ( $lossFunction eq 'NR2' ) {
370
371				# if the child is N
372				if ( $node->fields->[4] =~ /^N/ ) {
373
374				# and the assigned or correct parent is R
375				if ( $correctParent->fields->[4] =~ /^R/ \|\| $assignedParent->fields->[4] =~ /^R/ ) {
376				$error = 10;
377				}
378				}
379				}
380				elsif ( $lossFunction eq 'JNR2' ) {
381
382				# if the child is N or J
383				if ( $node->fields->[4] =~ /^[NJ]/ ) {
384
385				# and the assigned or correct parent is R
386				if ( $correctParent->fields->[4] =~ /^R/ \|\| $assignedParent->fields->[4] =~ /^R/ ) {
387				$error = 10;
388				}
389				}
390				}
391				elsif ( $lossFunction eq 'J2' ) {
392
393				# if the assigned or correct parent is J
394				if ( $correctParent->fields->[4] =~ /^J/ \|\| $assignedParent->fields->[4] =~ /^J/ ) {
395				$error = 10;
396				}
397				}
398
399				return $error;
400				}
401
402				sub count_errors_labelling {
403
404				# (Treex::Tool::Parser::MSTperl::Sentence $correct_sentence)
405				my ( $self, $correct_sentence ) = @_;
406
407				my $errors = 0;
408
409				my @correct_labels =
410				map { $_->label } @{ $correct_sentence->nodes };
411				my @my_labels =
412				map { $_->label } @{ $self->nodes };
413				for ( my $i = 0; $i < @correct_labels; $i++ ) {
414				if ( $correct_labels[$i] ne $my_labels[$i] ) {
415				$errors++;
416				}
417				}
418
419				return $errors;
420				}
421
422				sub count_errors_attachement_and_labelling {
423
424				# (Treex::Tool::Parser::MSTperl::Sentence $correct_sentence)
425				my ( $self, $correct_sentence ) = @_;
426
427				my $errors = 0;
428
429				#assert that nodes in the sentences with the same ords
430				# are corresponding nodes
431				foreach my $my_node ( @{ $self->nodes } ) {
432				my $my_parent = $my_node->parentOrd;
433				my $my_label = $my_node->label;
434				my $correct_node = $correct_sentence->getNodeByOrd( $my_node->ord );
435				my $correct_parent = $correct_node->parentOrd;
436				my $correct_label = $correct_node->label;
437				if ( $my_parent != $correct_parent \|\| $my_label ne $correct_label ) {
438				$errors++;
439				}
440				}
441
442				return $errors;
443				}
444
445				sub toParentOrdsArray {
446				my ($self) = @_;
447
448				my @parents;
449				foreach my $node ( @{ $self->nodes } ) {
450				push @parents, $node->parentOrd;
451				}
452
453				return [@parents];
454				}
455
456				sub toLabelsArray {
457				my ($self) = @_;
458
459				my @labels;
460				foreach my $node ( @{ $self->nodes } ) {
461				push @labels, $node->label;
462				}
463
464				return [@labels];
465				}
466
467				1;
468
469				__END__
470
471				=pod
472
473				=for Pod::Coverage BUILD
474
475				=encoding utf-8
476
477				=head1 NAME
478
479				Treex::Tool::Parser::MSTperl::Sentence
480
481				=head1 VERSION
482
483				version 0.11949
484
485				=head1 DESCRIPTION
486
487				Represents a sentence, both parsed an unparsed.
488				Contains an array of nodes which represent the words in the sentence.
489
490				The nodes are ordered, their C<ord> is their 1-based position in the sentence.
491				The C<0 ord> value is reserved for the (technical) sentence root.
492
493				=head1 FIELDS
494
495				=over 4
496
497				=item id (Int)
498
499				An integer id unique for each sentence (in its proper sense, where sentence
500				is a sequence of tokens - i.e. C<id> stays the same for copies of the same
501				sentence).
502
503				=item nodes (ArrayRef[Treex::Tool::Parser::MSTperl::Node])
504
505				(A reference to) an array of nodes (C<Treex::Tool::Parser::MSTperl::Node>) of
506				the sentence.
507
508				A node represents both a token of the sentence (usually this is a word) and a
509				node in the parse tree of the sentence as well (if the sentence have been
510				parsed).
511
512				=item nodes_with_root (ArrayRef[Treex::Tool::Parser::MSTperl::Node])
513
514				Copy of C<nodes> field with a root node
515				(L<Treex::Tool::Parser::MSTperl::RootNode>) added at the beginning. As the
516				root node's C<ord> is C<0> by definition, the position of the nodes in this
517				array exactly corresponds to its C<ord>.
518
519				=item edges (Maybe[ArrayRef[Treex::Tool::Parser::MSTperl::Edge]])
520
521				If the sentence is parsed (i.e. the nodes know their parents), this field
522				contains (a reference to) an array of all edges
523				(L<Treex::Tool::Parser::MSTperl::Edge>) in the parse tree of the sentence.
524
525				This field is set by the C<sub> C<fill_fields_after_parse>.
526
527				If the sentence is not parsed, this field is C<undef>.
528
529				=item features (Maybe[ArrayRef[Str]])
530
531				If the sentence is parsed, this field
532				contains (a reference to) an array of all features of all edges in the parse
533				tree of the sentence. If some of the features are repeated in the sentence
534				(i.e. they are present in severeal edges or even repeated in one edge), they
535				are repeated here as well, i.e. this is not a set in mathematical sense but a
536				(generally unordered) list.
537
538				This field is set by the C<sub> C<fill_fields_after_parse>.
539
540				If the sentence is not parsed, this field is C<undef>.
541
542				=back
543
544				=head1 METHODS
545
546				=head2 Constructor
547
548				=over 4
549
550				=item my $sentence = Treex::Tool::Parser::MSTperl::Sentence->new(
551				id => 12, nodes => [$node1, $node2, $node3, ...]);
552
553				Creates a new sentence. The C<id> must be unique (but copies of the same
554				sentence are to share the same id). It is used for edge signature generation
555				(L<Treex::Tool::Parser::MSTperl::Edge/signature>) in edge features caching (and
556				therefore does not have to be set if caching is disabled).
557
558				The order of the nodes denotes their order in the sentence, starting from the
559				node with C<ord> 1, i.e. the technical root
560				(L<Treex::Tool::Parser::MSTperl::RootNode>) is not to be included as it is
561				generated automatically in the constructor.
562				The C<ord>s of the nodes (L<Treex::Tool::Parser::MSTperl::Node/ord>) do not
563				have to (and actually shouldn't) be filled in. If they are, they are checked
564				and a warning on STDERR is issued if they do not correspond to the position of
565				the nodes in the array. If they are not, they are filled in automatically
566				during the sentence creation.
567
568				Other fields (C<nodes_with_root>, C<edges> and C<features>) should usually not
569				be set. C<nodes_with_root> are set automatically during sentence creation (and
570				any value set to it is discarded). C<edges> and C<features> are to be set only
571				if the sentence is parsed (i.e. the nodes know their parents, see
572				L<Treex::Tool::Parser::MSTperl::Node/parent> and
573				L<Treex::Tool::Parser::MSTperl::Node/parentOrd>) by calling the
574				C<fill_fields_after_parse> method.
575
576				So, if the sentence is already parsed, you should call the
577				C<fill_fields_after_parse> method immediately after creaion of the sentence.
578
579				=item my $unparsed_sentence_copy = $sentence->copy_nonparsed();
580
581				Creates a new instance of the same sentence with the same C<id> and with
582				copies of the nodes but without any parsing information (like after calling
583				C<clear_parse>). The nodes are copied by calling
584				L<Treex::Tool::Parser::MSTperl::Node/copy_nonparsed>.
585
586				=back
587
588				=head2 Action methods
589
590				=over 4
591
592				=item $sentence->setChildParent(5, 3)
593
594				Sets the parent of the node with the first C<ord> to be the node with the second
595				C<ord> - eg. here, the 3rd node is the parent of the 5th node.
596				It only sets the C<parent> and C<parentOrd> fields in the child node
597				(i.e. it does not create or modify any edges).
598
599				When all nodes' parents have been set, C<fill_fields_after_parse> can be called.
600
601				=item $sentence->fill_fields_after_parse()
602
603				Fills the fields of the sentence and fields of its nodes which can be filled
604				only for a sentence that has already been parsed (i.e. if the nodes' C<parent>
605				or C<parentOrd> fields are filled).
606
607				The fields which are filled by this subroutine are C<edges> and C<features>
608				for the sentence and C<parent> or C<parentOrd> for each of the sentence nodes
609				which do not have the field set.
610
611				=item $sentence->clear_parse()
612
613				Is kind of an inversion of the C<fill_fields_after_parse> method. It clears
614				the C<edges> and C<features> fields and also unsets the parents of all nodes
615				(by setting their C<parent> field to C<undef> and C<parentOrd> to C<0>).
616
617				=back
618
619				=head2 Information methods
620
621				=over 4
622
623				=item $sentence->len()
624
625				Returns length of the sentence, i.e. number of nodes in the sentence.
626				Each node corresponds to one word (one token to be more precise).
627
628				=item $sentence->count_errors_attachement($correct_sentence)
629
630				Compares the parse tree of the sentence with its correct parse tree,
631				represented by an instance of the same sentence containing its correct parse.
632
633				An error is considered to be an incorrectly assigned governing node. So, the
634				parents of all nodes (obviously not including the root node) are compared and
635				if they are different, it is counted as an error. This leads to a minimum
636				number of errors equal to 0 and maximum number equal to the length of the
637				sentence.
638
639				=item $sentence->count_errors_labelling($correct_sentence)
640
641				Compares the labelling of the sentence with its correct labelling,
642				represented by an instance of the same sentence containing the correct labels.
643
644				An error is considered to be an incorrectly assigned label. So, the
645				labels of all edges (technically stored in the child nodes) are compared and
646				if they are different, it is counted as an error. This leads to a minimum
647				number of errors equal to 0 and maximum number equal to the length of the
648				sentence.
649
650				=item $sentence->getNodeByOrd(6)
651
652				Returns the node with this C<ord> (it can also be the root node if the C<ord>
653				is 0) or C<undef> if the C<ord> is out of range.
654
655				=item $sentence->toString()
656
657				Returns forms of the nodes joined by spaces (i.e. the sentence as a text but
658				with a space between each two adjacent tokens).
659
660				=item $sentence->toParentOrdsArray()
661
662				Returns (a reference to) an array of node parent ords, i.e. for the sentence
663				"Tom is big", where "is" is a child of the root node and "Tom" and "big" are
664				children of "is", this method returns C<[2, 0, 2]>.
665
666				=back
667
668				=head1 AUTHORS
669
670				Rudolf Rosa <rosa@ufal.mff.cuni.cz>
671
672				=head1 COPYRIGHT AND LICENSE
673
674				Copyright Â© 2011 by Institute of Formal and Applied Linguistics, Charles
675				University in Prague
676
677				This module is free software; you can redistribute it and/or modify it under
678				the same terms as Perl itself.