File Coverage

blib/lib/Treex/Core/Node/Interset.pm
Criterion Covered Total %
statement 36 105 34.2
branch 7 44 15.9
condition 2 27 7.4
subroutine 14 26 53.8
pod n/a
total 59 202 29.2


line stmt bran cond sub pod time code
1             package Treex::Core::Node::Interset;
2             $Treex::Core::Node::Interset::VERSION = '2.20210102';
3 24     24   32005 use MooseX::Role::Parameterized;
  24         1509174  
  24         126  
4              
5             parameter interset_attribute => (
6             isa => 'Str',
7             default => 'iset',
8             );
9              
10 24     24   935304 use Treex::Core::Log;
  24         72  
  24         2663  
11 24     24   280 use List::Util qw(first); # TODO: this wouldn't be needed if there was Treex::Core::Common for roles
  24         65  
  24         2020  
12 24     24   15827 use Lingua::Interset 3.006;
  24         461656  
  24         1577  
13 24     24   22748 use Lingua::Interset::FeatureStructure;
  24         6984407  
  24         2259  
14 24     24   278 use Data::Dumper;
  24         62  
  24         44765  
15              
16             # "role" is a semi-keyword imported from MooseX::Role::Parameterized
17             role {
18             my $role_parameters = shift;
19             my $interset_attribute = $role_parameters->interset_attribute;
20              
21             has $interset_attribute => (
22             # Unfortunatelly, the old interface uses $anode->set_iset('tense', 'past'),
23             # so set_iset cannot be used as a setter for the whole structure
24             # $anode->set_iset(Lingua::Interset::FeatureStructure->new(tense=>'past'))
25             is => 'ro',
26             isa => 'Lingua::Interset::FeatureStructure',
27             lazy_build => 1,
28             #builder => "_build_$interset_attribute",
29             handles => [qw(
30             matches
31             upos
32             set_upos
33             is_abbreviation
34             is_abessive
35             is_ablative
36             is_absolute_superlative
37             is_absolutive
38             is_accusative
39             is_active
40             is_additive
41             is_adessive
42             is_adjective
43             is_admirative
44             is_adposition
45             is_adverb
46             is_affirmative
47             is_allative
48             is_animate
49             is_antipassive
50             is_aorist
51             is_archaic
52             is_article
53             is_associative
54             is_benefactive
55             is_cardinal
56             is_colloquial
57             is_comitative
58             is_common_gender
59             is_comparative
60             is_conditional
61             is_conjunction
62             is_conjunctive
63             is_construct
64             is_converb
65             is_coordinator
66             is_count_plural
67             is_dative
68             is_definite
69             is_delative
70             is_demonstrative
71             is_desiderative
72             is_destinative
73             is_determiner
74             is_diminutive
75             is_direct_voice
76             is_distributive
77             is_dual
78             is_elative
79             is_elevating
80             is_equative
81             is_ergative
82             is_essive
83             is_exclamative
84             is_factive
85             is_feminine
86             is_finite_verb
87             is_first_hand
88             is_first_person
89             is_foreign
90             is_formal
91             is_fourth_person
92             is_future
93             is_genitive
94             is_gerund
95             is_gerundive
96             is_greater_paucal
97             is_greater_plural
98             is_habitual
99             is_human
100             is_humbling
101             is_hyph
102             is_illative
103             is_imperative
104             is_imperfect
105             is_impersonal
106             is_inanimate
107             is_indefinite
108             is_indicative
109             is_inessive
110             is_infinitive
111             is_informal
112             is_instructive
113             is_instrumental
114             is_interjection
115             is_interrogative
116             is_intransitive
117             is_inverse_number
118             is_iterative
119             is_jussive
120             is_lative
121             is_locative
122             is_masculine
123             is_mediopassive
124             is_middle_voice
125             is_modal
126             is_motivative
127             is_multiplicative
128             is_narrative
129             is_necessitative
130             is_negative
131             is_nominative
132             is_non_first_hand
133             is_nonhuman
134             is_neuter
135             is_noun
136             is_numeral
137             is_optative
138             is_ordinal
139             is_participle
140             is_particle
141             is_partitive
142             is_past
143             is_paucal
144             is_perfect
145             is_personal
146             is_personal_pronoun
147             is_pluperfect
148             is_plural
149             is_polite
150             is_positive
151             is_possessive
152             is_potential
153             is_present
154             is_prolative
155             is_pronominal
156             is_pronoun
157             is_proper_noun
158             is_progressive
159             is_prospective
160             is_punctuation
161             is_purposive
162             is_quotative
163             is_rare
164             is_reciprocal
165             is_reflexive
166             is_relative
167             is_second_person
168             is_singular
169             is_specific
170             is_subjunctive
171             is_sublative
172             is_subordinator
173             is_superessive
174             is_superlative
175             is_supine
176             is_symbol
177             is_temporal
178             is_terminative
179             is_third_person
180             is_total
181             is_transgressive
182             is_transitive
183             is_translative
184             is_trial
185             is_typo
186             is_verb
187             is_verbal_noun
188             is_vocative
189             is_wh
190             is_zero_person
191             )],
192             # Note that we cannot export
193             # $anode->iset->is_auxiliary as it would clash with the existing $anode->is_auxiliary
194             # $tnode->dset->is_passive as it would clash with the existing $tnode->is_passive
195              
196             );
197              
198             method "_build_$interset_attribute" => sub {
199 47     47   336 return Lingua::Interset::FeatureStructure->new();
        47      
200             };
201              
202             # Interset 1.0 legacy method (works with both Interset 1.0 and 2.0 feature structures)
203             method is_preposition => sub {
204 0     0   0 my $self = shift;
        0      
205 0         0 return $self->iset->pos =~ /^(prep|adp)$/;
206             };
207              
208              
209              
210             #------------------------------------------------------------------------------
211             # Takes the Interset feature structure as a hash reference (as output by an
212             # Interset decode() or get_iset_structure() function). For all hash keys that
213             # are known Interset feature names, sets the corresponding iset attribute.
214             #
215             # If the first argument is not a hash reference, the list of arguments is
216             # considered a list of features and values. Usage examples:
217             #
218             # set_iset(\%feature_structure);
219             # set_iset('pos', 'noun');
220             # set_iset('pos' => 'noun', 'gender' => 'masc', 'number' => 'sing');
221             #
222             # TODO: Note that this is not a proper setter method yet.
223             # For backward compatibility, it only *adds* features to the Interset feature structure.
224             # For example:
225             # $anode->set_iset(case=>'nom', gender=>'fem');
226             # $anode->set_iset(case=>'gen', pos=>'noun');
227             # Now $anode->get_iset_structure() would return
228             # {case=>'gen', pos=>'noun', gender=>'fem'}
229             # If you want to delete a feature, you must explicitely set it to an empty string
230             # $anode->set_iset(gender=>'');
231             # Now: {case=>'gen', pos=>'noun', gender=>''} which is equivalent to
232             # {case=>'gen', pos=>'noun'}
233             #------------------------------------------------------------------------------
234             method set_iset => sub {
235 28     28   69 my $self = shift;
        28      
236 28         65 my @assignments;
237 28 50       241 if ( ref( $_[0] ) =~ /(HASH|Lingua::Interset::FeatureStructure)/ ) {
238             # We cannot interpret the hash/object as a set of assignments for add() as below.
239             # Lingua::Interset::FeatureStructure may contain private attributes that are not features.
240             # Using merge_hash_hard() is safer because it only takes known features from the hash and ignores the rest.
241 28         765 return $self->$interset_attribute->merge_hash_hard($_[0]);
242             }
243             else {
244 0 0       0 log_fatal "No parameters for 'set_iset'" if @_ == 0;
245 0 0       0 log_fatal "Odd parameters for 'set_iset'" if @_%2;
246 0         0 @assignments = @_;;
247 0         0 return $self->$interset_attribute->add(@assignments);
248             }
249             };
250              
251              
252              
253             #------------------------------------------------------------------------------
254             # Gets the value of an Interset feature. Makes sure that the result is never
255             # undefined so the use/strict/warnings creature keeps quiet. It returns undef
256             # only if we ask for the value of an unknown feature.
257             #
258             # If there is a disjunction of values (such as "fem|neut"), this function
259             # returns just a string with vertical bars as delimiters. The caller can use
260             # a split() function to get an array, or call get_iset_structure() instead.
261             #------------------------------------------------------------------------------
262             method get_iset => sub {
263 0     0   0 my ($self, $feature) = @_;
        0      
264 0         0 my $value = $self->get_attr("$interset_attribute/$feature");
265             # convert arrayref to string, e.g. "fem|neut"
266 0 0       0 if ( ref($value) eq 'ARRAY' ) {
267 0         0 $value = join '|', @$value;
268             }
269 0 0       0 return $value if defined $value;
270              
271             # Check valid feature name only when the feature is missing.
272             # TODO: convert all Treex code to Interset 2.0, so that no checking is needed.
273 0 0       0 if (!Lingua::Interset::FeatureStructure::feature_valid($feature)) {
274 0         0 log_warn("Querying unknown Interset feature $feature");
275             }
276              
277             # Return empty string instead of undef.
278 0         0 return '';
279             };
280              
281              
282              
283             #------------------------------------------------------------------------------
284             # Gets the values of all Interset features and returns a hash. Any multivalues
285             # (such as "fem|neut") will be converted to arrays referenced from the hash
286             # (same as the result of decode() functions in Interset tagset drivers).
287             #------------------------------------------------------------------------------
288             method get_iset_structure => sub
289             {
290 0     0   0 my $self = shift;
        0      
291 0         0 my $iset = $self->$interset_attribute; # iset or dset
292 0         0 my %f;
293 0         0 foreach my $feature ( $iset->get_nonempty_features() )
294             {
295 0         0 $f{$feature} = $iset->get_joined($feature);
296 0 0       0 if ( $f{$feature} =~ m/\|/ )
297             {
298 0         0 my @values = split( /\|/, $f{$feature} );
299 0         0 $f{$feature} = \@values;
300             }
301             }
302 0         0 return \%f;
303             };
304              
305             #------------------------------------------------------------------------------
306             # Return the values of all non-empty Interset features (except for the "tagset" and "other" features).
307             #------------------------------------------------------------------------------
308             method get_iset_values => sub
309             {
310 0     0   0 my $self = shift;
        0      
311 0         0 return map {$self->get_iset($_)} grep {$_ !~ 'tagset|other'} $self->$interset_attribute->get_nonempty_features();
  0         0  
  0         0  
312             };
313              
314             #------------------------------------------------------------------------------
315             # The inverse of iset->as_string_conllx -- takes a feat string which is the
316             # result of calling iset->as_string_conllx, and sets Interset feature values
317             # according to that string.
318             #------------------------------------------------------------------------------
319             method set_iset_conll_feat => sub {
320 0     0   0 my ($self, $feat_string) = @_;
        0      
321 0         0 my @pairs = split /\|/, $feat_string;
322 0         0 foreach my $pair (@pairs) {
323 0         0 $pair =~ s/[;,]/|/g;
324 0         0 my ($feature, $value) = split /=/, $pair;
325 0         0 $self->set_iset($feature, $value);
326             }
327 0         0 return;
328             };
329              
330             #------------------------------------------------------------------------------
331             # Tests multiple Interset features simultaneously. Input is a list of feature-
332             # value pairs, return value is 1 if the node matches all these values. This
333             # function is an abbreviation for a series of get_iset() calls in an if
334             # statement:
335             #
336             # if($node->match_iset('pos' => 'noun', 'gender' => 'masc')) { ... }
337             #------------------------------------------------------------------------------
338             method match_iset => sub {
339 0     0   0 my $self = shift;
        0      
340 0         0 my @req = @_;
341 0         0 for ( my $i = 0; $i <= $#req; $i += 2 )
342             {
343 0         0 my $feature = $req[$i];
344 0         0 my $expected = $req[$i+1];
345 0 0       0 confess("Undefined feature") unless ($feature);
346 0         0 my $value = $self->get_iset($feature);
347 0 0       0 my $comp =
    0          
    0          
348             $expected =~ s/^\!\~// ? 'nr' :
349             $expected =~ s/^\!// ? 'ne' :
350             $expected =~ s/^\~// ? 're' : 'eq';
351 0 0 0     0 if (
      0        
      0        
      0        
      0        
      0        
      0        
352             $comp eq 'eq' && $value ne $expected ||
353             $comp eq 'ne' && $value eq $expected ||
354             $comp eq 're' && $value !~ m/$expected/ ||
355             $comp eq 'nr' && $value =~ m/$expected/
356             )
357             {
358 0         0 return 0;
359             }
360             }
361 0         0 return 1;
362             };
363              
364             #------------------------------------------------------------------------------
365             # Goal: convert multivalues from arrays to strings:
366             # e.g. iset/gender = ["fem", "neut"] becomes iset/gender = "fem|neut"
367             # to enable storing in a PML file.
368             # Features tagset and other are not serialized. If they are set, copy them as
369             # wild attributes so that they can be stored.
370             #------------------------------------------------------------------------------
371             method serialize_iset => sub
372             {
373 28     28   72 my ($self) = @_;
        28      
374 28         866 foreach my $feature ( $self->$interset_attribute->get_nonempty_features() )
375             {
376 0         0 my $value = $self->get_iset($feature);
377 0 0       0 unless ( $value eq '' )
378             {
379 0         0 $self->set_attr("$interset_attribute/$feature", $value);
380             }
381             }
382 28 50 33     4923 if ( defined($self->$interset_attribute->{other}) && ref($self->$interset_attribute->{other}) eq 'HASH' )
383             {
384             # We assume that 'other' is a simple set of attribute-value pairs and we create a shallow copy.
385 0         0 my $i = $self->$interset_attribute;
386 0         0 my $w = $self->wild();
387 0         0 my @keys = keys(%{$i->{other}});
  0         0  
388 0         0 foreach my $k (@keys)
389             {
390 0         0 $w->{isetother}{$k} = $i->{other}{$k};
391             }
392             # We must also save the original tagset identifier. Without it the values in other are meaningless.
393 0 0       0 if($i->tagset() ne '')
394             {
395 0         0 $w->{isettagset} = $i->tagset();
396             }
397             }
398             else
399             {
400 28         186 delete($self->wild()->{isetother});
401             }
402 28         99 return;
403             };
404              
405             #------------------------------------------------------------------------------
406             # Goal: convert multivalues from strings to arrays:
407             # e.g. iset/gender = "fem|neut" becomes iset/gender = ["fem", "neut"]
408             # Features tagset and other are not serialized with Interset but they may have
409             # been serialized as wild attributes.
410             #------------------------------------------------------------------------------
411             method deserialize_iset => sub
412             {
413 28     28   76 my ($self) = @_;
        28      
414 28 50       93 if (! $Treex::Core::Config::running_in_tred)
415             {
416             # iset
417             # ttred does not like arrayrefs so only unserilaize if not in ttred
418 28 50       854 if ($self->$interset_attribute)
419             {
420             # this looks a bit weird,
421             # but it ensures correct deserialization of multivalues,
422             # i.e. turning e.g. "fem|neut" into ["fem", "neut"]
423 28         804 $self->set_iset($self->$interset_attribute);
424 28 50 33     4732 if (exists($self->wild()->{isetother}) && ref($self->wild()->{isetother}) eq 'HASH')
425             {
426             # We assume that 'other' is a simple set of attribute-value pairs and we create a shallow copy.
427 0         0 my $i = $self->$interset_attribute;
428 0         0 my $w = $self->wild();
429 0 0       0 delete($i->{other}) if(exists($i->{other}));
430 0         0 my @keys = keys(%{$w->{isetother}});
  0         0  
431 0         0 foreach my $k (@keys)
432             {
433 0         0 $i->{other}{$k} = $w->{isetother}{$k};
434             }
435 0         0 delete($self->wild()->{isetother});
436             }
437             # We must also retreive the original tagset identifier. Without it the values in other are meaningless.
438 28 50       94 if (exists($self->wild()->{isettagset}))
439             {
440 0         0 $self->$interset_attribute->set_tagset($self->wild()->{isettagset});
441             }
442             }
443             }
444             # iset_dump
445             # (backward compatibility for files
446             # created when iset_dump was used to store iset)
447 28 50       98 if ( $self->{iset_dump} ) {
448 0         0 $self->set_iset( eval "my " . $self->{iset_dump} . '; return $VAR1' ); ## no critic (ProhibitStringyEval)
449             # iset_dump is deprecated
450 0         0 delete $self->{iset_dump};
451 0 0       0 if ($Treex::Core::Config::running_in_tred) {
452             # ttred does not like arrayrefs so serialize back to strings for it
453 0         0 $self->serialize_iset();
454             }
455             }
456 28         139 return;
457             };
458              
459              
460             }; # end of "role {"
461              
462             1;
463              
464             __END__
465              
466             =encoding utf-8
467              
468             =head1 NAME
469              
470             Treex::Core::Node::Interset
471              
472             =head1 VERSION
473              
474             version 2.20210102
475              
476             =head1 DESCRIPTION
477              
478             Moose role for nodes that have the Interset feature structure.
479              
480             =head1 ATTRIBUTES
481              
482             =over
483              
484             =item iset/*
485              
486             Attributes corresponding to Interset features.
487              
488             =back
489              
490             =head1 METHODS
491              
492             =head2 Access to Interset features
493              
494             =over
495              
496             =item my $boolean = $node->match_iset('pos' => 'noun', 'gender' => '!masc', ...);
497              
498             Do the feature values of this node match the specification?
499             (Values of other features do not matter.)
500             A value preceded by exclamation mark is tested on string inequality.
501             A value preceded by a tilde is tested on regex match.
502             A value preceded by exclamation mark and tilde is tested on regex mismatch.
503             Other values are tested on string equality.
504              
505             =back
506              
507              
508             =head1 AUTHOR
509              
510             Dan Zeman <zeman@ufal.mff.cuni.cz>
511              
512             Martin Popel <popel@ufal.mff.cuni.cz>
513              
514             =head1 COPYRIGHT AND LICENSE
515              
516             Copyright © 2011, 2013, 2014, 2015 by Institute of Formal and Applied Linguistics, Charles University in Prague
517              
518             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.