File Coverage

blib/lib/Treex/Tool/Segment/RuleBased.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package Treex::Tool::Segment::RuleBased;
2				$Treex::Tool::Segment::RuleBased::VERSION = '0.13095';
3	1	1	13842	use strict;
	1		1
	1		33
4	1	1	3	use warnings;
	1		1
	1		20
5	1	1	473	use utf8;
	1		8
	1		3
6	1	1	180	use Moose;
	0
	0
7				use Treex::Core::Common;
8
9				has use_paragraphs => (
10				is => 'ro',
11				isa => 'Bool',
12				default => 1,
13				documentation =>
14				'Should paragraph boundaries be preserved as sentence boundaries?'
15				. ' Paragraph boundary is defined as two or more consecutive newlines.',
16				);
17
18				has use_lines => (
19				is => 'ro',
20				isa => 'Bool',
21				default => 0,
22				documentation =>
23				'Should newlines in the text be preserved as sentence boundaries?'
24				. '(But if you want to detect sentence boundaries just based on newlines'
25				. ' and nothing else, use rather W2A::SegmentOnNewlines.)',
26				);
27
28				has limit_words => (
29				is => 'ro',
30				isa => 'Int',
31				default => 250,
32				documentation =>
33				'Should very long segments (longer than the given number of words) be split?'
34				. 'The number of words is only approximate; detected by counting whitespace only,'
35				. 'not by full tokenization. Set to zero to disable this function completely.',
36				);
37
38				has detect_lists => (
39				is => 'ro',
40				isa => 'Int',
41				default => 100,
42				documentation =>
43				'Minimum (approx.) number of words to toggle list detection, 0 = never, 1 = always.'
44				);
45
46				# Tokens that usually do not end a sentence even if they are followed by a period and a capital letter:
47				# * single uppercase letters serve usually as first name initials
48				# * in langauge-specific descendants consider adding
49				# * period-ending items that never indicate sentence breaks
50				# * titles before names of persons etc.
51				#
52				# Note, that we cannot write
53				# sub get_unbreakers { return qr{...}; }
54				# because we want the regex to be compiled just once, not on every method call.
55				my $UNBREAKERS = qr{\p{Upper}};
56
57				sub unbreakers {
58				return $UNBREAKERS;
59				}
60
61				# Characters that can appear after period (or other end-sentence symbol)
62				sub closings {
63				return '"”»)';
64				}
65
66				# Characters that can appear before the first word of a sentence
67				sub openings {
68				return '"“«(';
69				}
70
71				# Contextual rules for "un-breaking" (to be overridden)
72				sub apply_contextual_rules {
73				my ($self, $text) = @_;
74				return $text;
75				}
76
77				sub get_segments {
78				my ( $self, $text ) = @_;
79
80				# Pre-processing
81				$text = $self->apply_contextual_rules($text);
82
83				my $unbreakers = $self->unbreakers;
84				$text =~ s/\b($unbreakers)\./$1<<>>/g;
85
86				# two newlines usually separate paragraphs
87				if ( $self->use_paragraphs ) {
88				$text =~ s/([^.!?])\n\n+/$1<<>>/gsm;
89				}
90
91				if ( $self->use_lines ) {
92				$text =~ s/\n/<<>>/gsm;
93				}
94
95				# Normalize whitespaces
96				$text =~ s/\s+/ /gsm;
97
98				# This is the main work
99				$text = $self->split_at_terminal_punctuation($text);
100
101				# Post-processing
102				$text =~ s/<<>>/\n/gsmx;
103				$text =~ s/<<>>/./gsxm;
104				$text =~ s/\s+$//gsxm;
105				$text =~ s/^\s+//gsxm;
106
107				# try to separate various list items (e.g. TV programmes, calendars)
108				my @segs = map { $self->split_at_list_items($_) } split /\n/, $text;
109
110				# handle segments that are too long
111				return map { $self->segment_too_long($_) ? $self->handle_long_segment($_) : $_ } @segs;
112				}
113
114				sub split_at_terminal_punctuation {
115				my ( $self, $text ) = @_;
116				my ( $openings, $closings ) = ( $self->openings, $self->closings );
117				$text =~ s{
118				([.?!]) # $1 = end-sentence punctuation
119				([$closings]?) # $2 = optional closing quote/bracket
120				\s # space
121				([$openings]?\p{Upper}) # $3 = uppercase letter (optionally preceded by opening quote)
122				}{$1$2\n$3}gsxm;
123				return $text;
124				}
125
126				sub handle_long_segment {
127				my ( $self, $seg ) = @_;
128
129				# split at some other dividing punctuation characters (poems, unending speech)
130				my @split = map { $self->segment_too_long($_) ? $self->split_at_dividing_punctuation($_) : $_ } $seg;
131
132				# split at any punctuation
133				@split = map { $self->segment_too_long($_) ? $self->split_at_any_punctuation($_) : $_ } @split;
134
135				# split hard if still too long
136				return map { $self->segment_too_long($_) ? $self->split_hard($_) : $_ } @split;
137				}
138
139				# Return 1 if the segment is too long
140				sub segment_too_long {
141				my ( $self, $seg ) = @_;
142
143				# skip everything if the limit is infinity
144				return 0 if ( $self->limit_words == 0 );
145
146				# return 1 if the number of space-separated segments exceeds the limit
147				my $wc = () = $seg =~ m/\s+/g;
148				return 1 if ( $wc >= $self->limit_words );
149				return 0;
150				}
151
152				# "Non-final" punctuation that could divide segments (NB: single dot excluded due to abbreviations)
153				my $DIV_PUNCT = qr{(!\|\.\.+\|\?\|\\|[–—-](\s[–—-])+\|;)};
154
155				sub split_at_dividing_punctuation {
156				my ( $self, $text ) = @_;
157
158				my $closings = $self->closings;
159				$text =~ s/($DIV_PUNCT\s*[$closings]?,?)/$1\n/g;
160
161				return split /\n/, $self->_join_too_short_segments($text);
162				}
163
164				# Universal list types (currently only semicolon-separated lists, to be overridden in language-specific blocks)
165				my $LIST_TYPES = [
166				{
167				name => ';', # a label for the list type (just for debugging)
168				sep => ';\h+', # separator regexp
169				sel_sep => undef, # separator regexp used only for the selection of this list (sep used if not set)
170				type => 'e', # type of separator (ending: e / staring: s)
171				max => 400, # maximum average list-item length (overrides the default)
172				min => 30, # minimum average list-item length (overrides the default)
173				# negative pre-context, not used if not set (here: skip semicolons separating just numbers)
174				neg_pre => '[0-9]\h(?=;\h[0-9]+(?:[^\.0-9]\|\.[0-9]\|$))',
175				},
176				];
177
178				# Language-specific blocks should override this method and provide usual list types for the given language
179				sub list_types {
180				return @{$LIST_TYPES};
181				}
182
183				my $MAX_AVG_ITEM_LEN = 400; # default maximum average list item length, in characters
184				my $MIN_AVG_ITEM_LEN = 30; # default minimum average list item length, in characters
185				my $MIN_LIST_ITEMS = 3; # minimum number of items in a list
186				my $PRIORITY = 2.5; # multiple of list items a lower-rank list type must have over a higher-rank type
187
188				sub split_at_list_items {
189
190				my ( $self, $text ) = @_;
191
192				# skip this if list detection is turned off
193				return $text if ( $self->detect_lists == 0 );
194
195				# skip too short lines
196				my $wc = () = $text =~ m/\s+/g;
197				return $text if ( $self->detect_lists > $wc );
198
199				my @list_types = $self->list_types;
200				my $sel_list_type;
201				my $sel_len;
202
203				# find out which list type is the best for the given text
204				for ( my $i = 0; $i < @list_types; ++$i ) {
205
206				my $cur_list_type = $list_types[$i];
207				my $sep = $cur_list_type->{sel_sep} \|\| $cur_list_type->{sep};
208				my $neg = $cur_list_type->{neg_pre};
209				my $min = $cur_list_type->{min} \|\| $MIN_AVG_ITEM_LEN;
210				my $max = $cur_list_type->{max} \|\| $MAX_AVG_ITEM_LEN;
211
212				my $items = () = $text =~ m/$sep/gi;
213
214				# count number of items; exclude negative pre-context matches, if negative pre-context is specified
215				my $false = 0;
216				$false = () = $text =~ m/$neg(?=$sep)/gi if ($neg);
217				$items -= $false;
218
219				my $len = $items > 0 ? ( length($text) / $items ) : 'NaN';
220
221				# test if this type overrides the previously set one
222				if ( $items >= $MIN_LIST_ITEMS && $len < $max && $len > $min && ( !$sel_len \|\| $len * $PRIORITY < $sel_len ) ) {
223				$sel_list_type = $cur_list_type;
224				$sel_len = $len;
225				}
226				}
227
228				# return if no list type found
229				return $text if ( !$sel_list_type );
230
231				# list type detected, split by the given list type
232				my $sep = $sel_list_type->{sep};
233				my $neg = $sel_list_type->{neg_pre};
234				my $name = $sel_list_type->{name};
235
236				# protect negative pre-context, if any is specified
237				$text =~ s/($neg)(?=$sep)/$1<<>>/gi if ($neg);
238
239				# split at the given list type
240				if ( $sel_list_type->{type} eq 'e' ) {
241				$text =~ s/(?>>)($sep)/$1\n/gi;
242				}
243				else {
244				$text =~ s/(?>>)($sep)/\n$1/gi;
245				}
246
247				# remove negative pre-context protection
248				$text =~ s/<<>>//g;
249
250				# delete too short splits
251				$text = $self->_join_too_short_segments($text);
252
253				# return the split result
254				return split /\n/, $text;
255				}
256
257				sub _join_too_short_segments {
258				my ( $self, $text ) = @_;
259
260				$text =~ s/^\n//;
261				$text =~ s/\n$//;
262				$text =~ s/\n(?=\h(\S+(\h+\S+){0,2})?\h(\n\|$))/ /g;
263				return $text;
264				}
265
266				sub split_at_any_punctuation {
267				my ( $self, $text ) = @_;
268
269				my $closings = $self->closings;
270
271				# prefer punctuation followed by a letter
272				$text =~ s/([,;!?–—-]+\s*[$closings]?)\s+(\p{Alpha})/$1\n$2/g;
273
274				# delete too short splits
275				$text = $self->_join_too_short_segments($text);
276
277				my @split = split /\n/, $text;
278
279				# split at any punctuation if the text is still too long
280				return map {
281				$_ =~ s/([,;!?–—-]+\s*[$closings]?)/$1\n/g if ( $self->segment_too_long($_) );
282				split /\n/, $self->_join_too_short_segments($_)
283				} @split;
284				}
285
286				sub split_hard {
287				my ( $self, $text ) = @_;
288
289				my @tokens = split /(\s+)/, $text;
290				my @result;
291				my $pos = 0;
292
293				while ( $pos < @tokens ) {
294				my $limit = $pos + $self->limit_words * 2 - 1;
295				$limit = @tokens - 1 if ( $limit > @tokens - 1 );
296				push @result, join( '', @tokens[ $pos .. $limit ] );
297				$pos = $limit + 1;
298				}
299				return @result;
300				}
301
302				1;
303
304				__END__