File Coverage

blib/lib/Treex/Tool/Segment/RuleBased.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package Treex::Tool::Segment::RuleBased;
2             $Treex::Tool::Segment::RuleBased::VERSION = '0.13095';
3 1     1   13842 use strict;
  1         1  
  1         33  
4 1     1   3 use warnings;
  1         1  
  1         20  
5 1     1   473 use utf8;
  1         8  
  1         3  
6 1     1   180 use Moose;
  0            
  0            
7             use Treex::Core::Common;
8              
9             has use_paragraphs => (
10             is => 'ro',
11             isa => 'Bool',
12             default => 1,
13             documentation =>
14             'Should paragraph boundaries be preserved as sentence boundaries?'
15             . ' Paragraph boundary is defined as two or more consecutive newlines.',
16             );
17              
18             has use_lines => (
19             is => 'ro',
20             isa => 'Bool',
21             default => 0,
22             documentation =>
23             'Should newlines in the text be preserved as sentence boundaries?'
24             . '(But if you want to detect sentence boundaries just based on newlines'
25             . ' and nothing else, use rather W2A::SegmentOnNewlines.)',
26             );
27              
28             has limit_words => (
29             is => 'ro',
30             isa => 'Int',
31             default => 250,
32             documentation =>
33             'Should very long segments (longer than the given number of words) be split?'
34             . 'The number of words is only approximate; detected by counting whitespace only,'
35             . 'not by full tokenization. Set to zero to disable this function completely.',
36             );
37              
38             has detect_lists => (
39             is => 'ro',
40             isa => 'Int',
41             default => 100,
42             documentation =>
43             'Minimum (approx.) number of words to toggle list detection, 0 = never, 1 = always.'
44             );
45              
46             # Tokens that usually do not end a sentence even if they are followed by a period and a capital letter:
47             # * single uppercase letters serve usually as first name initials
48             # * in langauge-specific descendants consider adding
49             # * period-ending items that never indicate sentence breaks
50             # * titles before names of persons etc.
51             #
52             # Note, that we cannot write
53             # sub get_unbreakers { return qr{...}; }
54             # because we want the regex to be compiled just once, not on every method call.
55             my $UNBREAKERS = qr{\p{Upper}};
56              
57             sub unbreakers {
58             return $UNBREAKERS;
59             }
60              
61             # Characters that can appear after period (or other end-sentence symbol)
62             sub closings {
63             return '"”»)';
64             }
65              
66             # Characters that can appear before the first word of a sentence
67             sub openings {
68             return '"“«(';
69             }
70              
71             # Contextual rules for "un-breaking" (to be overridden)
72             sub apply_contextual_rules {
73             my ($self, $text) = @_;
74             return $text;
75             }
76              
77             sub get_segments {
78             my ( $self, $text ) = @_;
79              
80             # Pre-processing
81             $text = $self->apply_contextual_rules($text);
82              
83             my $unbreakers = $self->unbreakers;
84             $text =~ s/\b($unbreakers)\./$1<<>>/g;
85              
86             # two newlines usually separate paragraphs
87             if ( $self->use_paragraphs ) {
88             $text =~ s/([^.!?])\n\n+/$1<<>>/gsm;
89             }
90              
91             if ( $self->use_lines ) {
92             $text =~ s/\n/<<>>/gsm;
93             }
94              
95             # Normalize whitespaces
96             $text =~ s/\s+/ /gsm;
97              
98             # This is the main work
99             $text = $self->split_at_terminal_punctuation($text);
100              
101             # Post-processing
102             $text =~ s/<<>>/\n/gsmx;
103             $text =~ s/<<>>/./gsxm;
104             $text =~ s/\s+$//gsxm;
105             $text =~ s/^\s+//gsxm;
106              
107             # try to separate various list items (e.g. TV programmes, calendars)
108             my @segs = map { $self->split_at_list_items($_) } split /\n/, $text;
109              
110             # handle segments that are too long
111             return map { $self->segment_too_long($_) ? $self->handle_long_segment($_) : $_ } @segs;
112             }
113              
114             sub split_at_terminal_punctuation {
115             my ( $self, $text ) = @_;
116             my ( $openings, $closings ) = ( $self->openings, $self->closings );
117             $text =~ s{
118             ([.?!]) # $1 = end-sentence punctuation
119             ([$closings]?) # $2 = optional closing quote/bracket
120             \s # space
121             ([$openings]?\p{Upper}) # $3 = uppercase letter (optionally preceded by opening quote)
122             }{$1$2\n$3}gsxm;
123             return $text;
124             }
125              
126             sub handle_long_segment {
127             my ( $self, $seg ) = @_;
128              
129             # split at some other dividing punctuation characters (poems, unending speech)
130             my @split = map { $self->segment_too_long($_) ? $self->split_at_dividing_punctuation($_) : $_ } $seg;
131              
132             # split at any punctuation
133             @split = map { $self->segment_too_long($_) ? $self->split_at_any_punctuation($_) : $_ } @split;
134              
135             # split hard if still too long
136             return map { $self->segment_too_long($_) ? $self->split_hard($_) : $_ } @split;
137             }
138              
139             # Return 1 if the segment is too long
140             sub segment_too_long {
141             my ( $self, $seg ) = @_;
142              
143             # skip everything if the limit is infinity
144             return 0 if ( $self->limit_words == 0 );
145              
146             # return 1 if the number of space-separated segments exceeds the limit
147             my $wc = () = $seg =~ m/\s+/g;
148             return 1 if ( $wc >= $self->limit_words );
149             return 0;
150             }
151              
152             # "Non-final" punctuation that could divide segments (NB: single dot excluded due to abbreviations)
153             my $DIV_PUNCT = qr{(!|\.\.+|\?|\*|[–—-](\s*[–—-])+|;)};
154              
155             sub split_at_dividing_punctuation {
156             my ( $self, $text ) = @_;
157              
158             my $closings = $self->closings;
159             $text =~ s/($DIV_PUNCT\s*[$closings]?,?)/$1\n/g;
160              
161             return split /\n/, $self->_join_too_short_segments($text);
162             }
163              
164             # Universal list types (currently only semicolon-separated lists, to be overridden in language-specific blocks)
165             my $LIST_TYPES = [
166             {
167             name => ';', # a label for the list type (just for debugging)
168             sep => ';\h+', # separator regexp
169             sel_sep => undef, # separator regexp used only for the selection of this list (sep used if not set)
170             type => 'e', # type of separator (ending: e / staring: s)
171             max => 400, # maximum average list-item length (overrides the default)
172             min => 30, # minimum average list-item length (overrides the default)
173             # negative pre-context, not used if not set (here: skip semicolons separating just numbers)
174             neg_pre => '[0-9]\h*(?=;\h*[0-9]+(?:[^\.0-9]|\.[0-9]|$))',
175             },
176             ];
177              
178             # Language-specific blocks should override this method and provide usual list types for the given language
179             sub list_types {
180             return @{$LIST_TYPES};
181             }
182              
183             my $MAX_AVG_ITEM_LEN = 400; # default maximum average list item length, in characters
184             my $MIN_AVG_ITEM_LEN = 30; # default minimum average list item length, in characters
185             my $MIN_LIST_ITEMS = 3; # minimum number of items in a list
186             my $PRIORITY = 2.5; # multiple of list items a lower-rank list type must have over a higher-rank type
187              
188             sub split_at_list_items {
189              
190             my ( $self, $text ) = @_;
191              
192             # skip this if list detection is turned off
193             return $text if ( $self->detect_lists == 0 );
194              
195             # skip too short lines
196             my $wc = () = $text =~ m/\s+/g;
197             return $text if ( $self->detect_lists > $wc );
198              
199             my @list_types = $self->list_types;
200             my $sel_list_type;
201             my $sel_len;
202              
203             # find out which list type is the best for the given text
204             for ( my $i = 0; $i < @list_types; ++$i ) {
205              
206             my $cur_list_type = $list_types[$i];
207             my $sep = $cur_list_type->{sel_sep} || $cur_list_type->{sep};
208             my $neg = $cur_list_type->{neg_pre};
209             my $min = $cur_list_type->{min} || $MIN_AVG_ITEM_LEN;
210             my $max = $cur_list_type->{max} || $MAX_AVG_ITEM_LEN;
211              
212             my $items = () = $text =~ m/$sep/gi;
213              
214             # count number of items; exclude negative pre-context matches, if negative pre-context is specified
215             my $false = 0;
216             $false = () = $text =~ m/$neg(?=$sep)/gi if ($neg);
217             $items -= $false;
218              
219             my $len = $items > 0 ? ( length($text) / $items ) : 'NaN';
220              
221             # test if this type overrides the previously set one
222             if ( $items >= $MIN_LIST_ITEMS && $len < $max && $len > $min && ( !$sel_len || $len * $PRIORITY < $sel_len ) ) {
223             $sel_list_type = $cur_list_type;
224             $sel_len = $len;
225             }
226             }
227              
228             # return if no list type found
229             return $text if ( !$sel_list_type );
230              
231             # list type detected, split by the given list type
232             my $sep = $sel_list_type->{sep};
233             my $neg = $sel_list_type->{neg_pre};
234             my $name = $sel_list_type->{name};
235              
236             # protect negative pre-context, if any is specified
237             $text =~ s/($neg)(?=$sep)/$1<<>>/gi if ($neg);
238              
239             # split at the given list type
240             if ( $sel_list_type->{type} eq 'e' ) {
241             $text =~ s/(?>>)($sep)/$1\n/gi;
242             }
243             else {
244             $text =~ s/(?>>)($sep)/\n$1/gi;
245             }
246              
247             # remove negative pre-context protection
248             $text =~ s/<<>>//g;
249              
250             # delete too short splits
251             $text = $self->_join_too_short_segments($text);
252              
253             # return the split result
254             return split /\n/, $text;
255             }
256              
257             sub _join_too_short_segments {
258             my ( $self, $text ) = @_;
259              
260             $text =~ s/^\n//;
261             $text =~ s/\n$//;
262             $text =~ s/\n(?=\h*(\S+(\h+\S+){0,2})?\h*(\n|$))/ /g;
263             return $text;
264             }
265              
266             sub split_at_any_punctuation {
267             my ( $self, $text ) = @_;
268              
269             my $closings = $self->closings;
270              
271             # prefer punctuation followed by a letter
272             $text =~ s/([,;!?–—-]+\s*[$closings]?)\s+(\p{Alpha})/$1\n$2/g;
273              
274             # delete too short splits
275             $text = $self->_join_too_short_segments($text);
276              
277             my @split = split /\n/, $text;
278              
279             # split at any punctuation if the text is still too long
280             return map {
281             $_ =~ s/([,;!?–—-]+\s*[$closings]?)/$1\n/g if ( $self->segment_too_long($_) );
282             split /\n/, $self->_join_too_short_segments($_)
283             } @split;
284             }
285              
286             sub split_hard {
287             my ( $self, $text ) = @_;
288              
289             my @tokens = split /(\s+)/, $text;
290             my @result;
291             my $pos = 0;
292              
293             while ( $pos < @tokens ) {
294             my $limit = $pos + $self->limit_words * 2 - 1;
295             $limit = @tokens - 1 if ( $limit > @tokens - 1 );
296             push @result, join( '', @tokens[ $pos .. $limit ] );
297             $pos = $limit + 1;
298             }
299             return @result;
300             }
301              
302             1;
303              
304             __END__