File Coverage

blib/lib/Treex/Block/W2A/ResegmentSentences.pm
Criterion Covered Total %
statement 2 4 50.0
branch n/a
condition n/a
subroutine 2 2 100.0
pod n/a
total 4 6 66.6


line stmt bran cond sub pod time code
1             package Treex::Block::W2A::ResegmentSentences;
2             BEGIN {
3 1     1   53413 $Treex::Block::W2A::ResegmentSentences::VERSION = '0.08170';
4             }
5 1     1   2163 use Moose;
  0            
  0            
6             use Treex::Core::Common;
7             extends 'Treex::Core::Block';
8              
9             has 'segmenters' => (
10             is => 'rw',
11             isa => 'HashRef[Treex::Tool::Segment::RuleBased]',
12             default => sub { return {} },
13             );
14              
15             sub _get_segmenter {
16             my $self = shift;
17             my $lang = uc shift;
18             if ( exists $self->segmenters->{$lang} ) {
19             return $self->segmenters->{$lang};
20             }
21             my $specific = "Treex::Tool::Segment::${lang}::RuleBased";
22             my $fallback = "Treex::Tool::Segment::RuleBased";
23             foreach my $class ( $specific, $fallback ) {
24             my $segmenter = eval "use $class; $class->new()"; ##no critic (BuiltinFunctions::ProhibitStringyEval) We want to use it, it is simpler and we check result
25             if ($segmenter) {
26             $self->segmenters->{$lang} = $segmenter;
27             return $segmenter;
28             }
29             else {
30             log_info("Failed during creating segmenter $class: $@");
31             }
32             }
33             log_fatal("Cannot create segmenter for $lang");
34             }
35              
36             sub process_bundle {
37             my ( $self, $bundle ) = @_;
38              
39             my $my_label = $self->zone_label || '';
40             my %sentences;
41             my ( $my_segments, $max_segments ) = ( 0, 0 );
42             foreach my $zone ( $bundle->get_all_zones() ) {
43             my $lang = $zone->language;
44             my $label = $zone->get_label();
45             my $segmenter = $self->_get_segmenter($lang);
46             $sentences{$label} = [ $segmenter->get_segments( $zone->sentence ) ];
47             my $segments = @{ $sentences{$label} };
48             if ( $segments > $max_segments ) { $max_segments = $segments; }
49             if ( $label eq $my_label ) { $my_segments = $segments; }
50             }
51              
52             # If no language (and selector) were specified for this block
53             # resegment all zones
54             if ( $my_segments == 0 ) {
55             $my_segments = $max_segments;
56             }
57              
58             # We are finished if
59             # the zone to be processed contains just one sentence.
60             return if $my_segments == 1;
61              
62             # TODO: If a zone contains less subsegments (e.g. just 1) than $segments
63             # we can try to split it to equally long chunks regardless of the real
64             # sentence boundaries. Anyway, all evaluation blocks should join the
65             # segments together again before measuring BLEU etc.
66             my $doc = $bundle->get_document;
67             my $orig_id = $bundle->id;
68             $bundle->set_id("${orig_id}_1of$my_segments");
69             foreach my $zone ( $bundle->get_all_zones() ) {
70             my $label = $zone->get_label();
71             my $sent = shift @{ $sentences{$label} };
72             $zone->set_sentence($sent);
73             }
74             my $last_bundle = $bundle;
75             my @labels = keys %sentences;
76              
77             # TODO parameter to set how many bundles should be created: $my_segments or $max_segments?
78             for my $i ( 2 .. $my_segments ) {
79             my $new_bundle = $doc->create_bundle( { after => $last_bundle } );
80             $last_bundle = $new_bundle;
81             $new_bundle->set_id("${orig_id}_${i}of$my_segments");
82             foreach my $label (@labels) {
83             my $sent = shift @{ $sentences{$label} };
84             if ( !defined $sent ) { $sent = ' '; }
85              
86             # If some zone contains more segments than the "current" zone,
87             # the remaining segments will be joined to the last bundle.
88             if ( $i == $my_segments && $max_segments > $my_segments ) {
89             $sent .= ' ' . join( ' ', @{ $sentences{$label} } );
90             }
91             my ( $lang, $selector ) = split /_/, $label;
92             my $new_zone = $new_bundle->create_zone( $lang, $selector );
93             $new_zone->set_sentence($sent);
94             }
95             }
96              
97             return;
98             }
99              
100             1;
101              
102             __END__
103              
104             TODO POD
105              
106             =encoding utf-8
107              
108             =head1 NAME
109              
110             Treex::Block::W2A::ResegmentSentences
111              
112             =head1 VERSION
113              
114             version 0.08170
115              
116             =head1 DESCRIPTION
117              
118             If the sentence segmenter says that the current sentence is
119             actually composed of two or more sentences, then new bundles
120             are inserted after the current bundle, each containing just
121             one piece of the resegmented original sentence.
122              
123             All zones are processed.
124             The number of bundles created is determined by the number of subsegments
125             in the "current" zone (specified by the parameters C<language> and C<selector>).
126             If a zone contains less subsegments than the current one,
127             the remaining bundles will contain empty sentence.
128             If a zone contains more subsegments than the current one,
129             the remaining subsegments will be joined in the last bundle.
130              
131             In other words, it is granted that the current zone,
132             will not contain empty sentences.
133              
134             =head1 AUTHOR
135              
136             ZdenÄ›k Žabokrtský <zabokrtsky@ufal.mff.cuni.cz>
137              
138             Martin Popel <popel@ufal.mff.cuni.cz>
139              
140             =head1 COPYRIGHT AND LICENSE
141              
142             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
143              
144             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.