File Coverage

blib/lib/Treex/Block/W2A/ResegmentSentences.pm
Criterion Covered Total %
statement 39 100 39.0
branch 4 22 18.1
condition 0 5 0.0
subroutine 11 13 84.6
pod 1 1 100.0
total 55 141 39.0


line stmt bran cond sub pod time code
1             package Treex::Block::W2A::ResegmentSentences;
2             $Treex::Block::W2A::ResegmentSentences::VERSION = '2.20151102';
3 1     1   20553 use strict;
  1         2  
  1         25  
4 1     1   5 use warnings;
  1         2  
  1         25  
5 1     1   810 use Moose;
  1         472353  
  1         6  
6 1     1   7791 use Treex::Core::Common;
  1         596312  
  1         5  
7             extends 'Treex::Core::Block';
8              
9             has remove => (
10             is => 'ro',
11             isa => enum( [qw(no all diff)] ),
12             default => 'no',
13             documentation => 'remove=no ... Do not delete any bundles (default). '
14             . 'remove=all ... Delete bundles with multiple subsegments. '
15             . 'remove=diff ... Delete bundles with zones with different number of subsegments.',
16             );
17              
18             has 'segmenters' => (
19             is => 'rw',
20             isa => 'HashRef[Treex::Tool::Segment::RuleBased]',
21             default => sub { return {} },
22             );
23              
24             sub _get_segmenter {
25 9     9   5280 my $self = shift;
26 9         18 my $lang = uc shift;
27 9 100       400 if ( exists $self->segmenters->{$lang} ) {
28 6         245 return $self->segmenters->{$lang};
29             }
30 3         10 my $specific = "Treex::Tool::Segment::${lang}::RuleBased";
31 3         5 my $fallback = "Treex::Tool::Segment::RuleBased";
32 3         6 foreach my $class ( $specific, $fallback ) {
33 6         83 my $segmenter;
34             {
35 6         8 local $SIG{"__DIE__"};
  6         21  
36 1     1   456 $segmenter = eval "use $class; $class->new()"; ##no critic (BuiltinFunctions::ProhibitStringyEval) We want to use it, it is simpler and we check result
  0     1   0  
  0     1   0  
  1     1   773  
  1     1   2  
  1     1   38  
  1         433  
  0         0  
  0         0  
  1         6  
  1         2  
  1         23  
  1         374  
  0         0  
  0         0  
  1         7  
  1         2  
  1         29  
  6         498  
37             }
38 6 100       5368 if ($segmenter) {
39 3         130 $self->segmenters->{$lang} = $segmenter;
40 3         17 return $segmenter;
41             }
42             else {
43 3         20 log_info("Failed during creating segmenter $class: $@");
44             }
45             }
46 0           log_fatal("Cannot create segmenter for $lang");
47             }
48              
49             sub process_bundle {
50 0     0 1   my ( $self, $bundle ) = @_;
51              
52 0   0       my $my_label = $self->zone_label || '';
53 0           my %sentences;
54 0           my ( $my_segments, $max_segments ) = ( 0, 0 );
55 0           foreach my $zone ( $bundle->get_all_zones() ) {
56 0           my $lang = $zone->language;
57 0           my $label = $zone->get_label();
58 0           my $segmenter = $self->_get_segmenter($lang);
59 0           $sentences{$label} = [ $segmenter->get_segments( $zone->sentence ) ];
60 0           my $segments = @{ $sentences{$label} };
  0            
61 0 0         if ( $segments > $max_segments ) { $max_segments = $segments; }
  0            
62 0 0         if ( $label eq $my_label ) { $my_segments = $segments; }
  0            
63             }
64              
65             # If no language (and selector) were specified for this block
66             # resegment all zones
67 0 0         if ( $my_segments == 0 ) {
68 0           $my_segments = $max_segments;
69             }
70              
71             # We are finished if
72             # the zone to be processed contains just one sentence.
73 0 0         return if $my_segments == 1;
74              
75             # So we have more subsegments. Delete the bundle and exit if requested.
76 0 0         if ($self->remove eq 'all'){
77 0           $bundle->remove();
78 0           return;
79             }
80              
81             # TODO: If a zone contains less subsegments (e.g. just 1) than $segments
82             # we can try to split it to equally long chunks regardless of the real
83             # sentence boundaries. Anyway, all evaluation blocks should join the
84             # segments together again before measuring BLEU etc.
85 0           my $doc = $bundle->get_document;
86 0           my $orig_id = $bundle->id;
87 0           my $last_bundle = $bundle;
88 0           my @labels = keys %sentences;
89              
90             # If any zone has different number of subsegments than $my_segments
91             # and the user requested to delete such bundles, do it and exit.
92 0 0         if ($self->remove eq 'diff'){
93 0 0   0     if (any {$_ != $my_segments} map {scalar @{$sentences{$_}}} @labels) {
  0            
  0            
  0            
94 0           $bundle->remove();
95 0           return;
96             }
97             }
98              
99             # First subsegment will be saved into the original bundle (with renamed id)
100 0           $bundle->set_id("${orig_id}_1of$my_segments");
101 0           foreach my $zone ( $bundle->get_all_zones() ) {
102 0           my $label = $zone->get_label();
103 0           my $sent = shift @{ $sentences{$label} };
  0            
104 0           $zone->set_sentence($sent);
105             }
106              
107             # Other subsegments will be saved to new bundles
108 0           for my $i ( 2 .. $my_segments ) {
109 0           my $new_bundle = $doc->create_bundle( { after => $last_bundle } );
110 0           $last_bundle = $new_bundle;
111 0           $new_bundle->set_id("${orig_id}_${i}of$my_segments");
112 0           foreach my $label (@labels) {
113 0           my $sent = shift @{ $sentences{$label} };
  0            
114 0 0         if ( !defined $sent ) { $sent = ' '; }
  0            
115              
116             # If some zone contains more segments than the "current" zone,
117             # the remaining segments will be joined to the last bundle.
118 0 0 0       if ( $i == $my_segments && $max_segments > $my_segments ) {
119 0           $sent .= ' ' . join( ' ', @{ $sentences{$label} } );
  0            
120             }
121 0           my ( $lang, $selector ) = split /_/, $label;
122 0           my $new_zone = $new_bundle->create_zone( $lang, $selector );
123 0           $new_zone->set_sentence($sent);
124             }
125             }
126              
127 0           return;
128             }
129              
130             1;
131              
132             __END__
133              
134             =encoding utf-8
135              
136             =head1 NAME
137              
138             Treex::Block::W2A::ResegmentSentences - split bundles which contain more sentences
139              
140             =head1 VERSION
141              
142             version 2.20151102
143              
144             =head1 MOTIVATION
145              
146             Some resources (most notably WMT newstest) are segmented to chunks of text
147             which mostly correspond to sentences, but sometimes contain more than one sentence.
148             Sometimes we want to process such documents in Treex and output (Write::*)
149             the result in a format where one output segement correspond to one input segement.
150             (So e.g. for "one-sentence-per-line writers", we have the same number of input and output lines.)
151              
152             However, most Treex blocks expect exactly one (linguistic) sentence in each bundle.
153             The solution is to use block C<W2A::ResegmentSentences> after the reader
154             and C<Misc::JoinBundles> before the writer.
155              
156             =head1 DESCRIPTION
157              
158             If the sentence segmenter says that the current sentence is
159             actually composed of two or more sentences, then new bundles
160             are inserted after the current bundle, each containing just
161             one piece of the resegmented original sentence.
162              
163             This block should be executed before tokenization (and tagging etc).
164             It deals only with the (string) attribute C<sentence> in each zone,
165             it does not process any trees.
166              
167             All zones are processed.
168             The number of bundles created is determined by the number of subsegments
169             in the "current" zone (specified by the parameters C<language> and C<selector>).
170             If a zone contains less subsegments than the current one,
171             the remaining bundles will contain empty sentence.
172             If a zone contains more subsegments than the current one,
173             the remaining subsegments will be joined in the last bundle.
174              
175             In other words, it is granted that the current zone,
176             will not contain empty sentences.
177              
178             As a special case if parameters C<language> and C<selector> define a zone
179             which is not present in a bundle (this holds also for language=all),
180             the "current" zone is the one with most subsegments, i.e. no subsegments are joined.
181              
182             =head1 PARAMETERS
183              
184             =head2 remove (no|all|diff)
185             By setting parameter C<remove> you can delete some bundles.
186             Default is remove=no.
187             Setting remove=all will delete all bundles with more than one subsegments in the current zone.
188             Setting remove=diff will delete all bundles that have (at least) two zones with different number of subsegments.
189              
190             =head1 SEE ALSO
191              
192             L<Treex::Block::Misc::JoinBundles>
193              
194             =head1 AUTHOR
195              
196             ZdenÄ›k Žabokrtský <zabokrtsky@ufal.mff.cuni.cz>
197              
198             Martin Popel <popel@ufal.mff.cuni.cz>
199              
200             =head1 COPYRIGHT AND LICENSE
201              
202             Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
203              
204             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.