File Coverage

blib/lib/Treex/Tool/Segment/RuleBased.pm
Criterion Covered Total %
statement 5 7 71.4
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 8 10 80.0


line stmt bran cond sub pod time code
1             package Treex::Tool::Segment::RuleBased;
2             BEGIN {
3 1     1   24222 $Treex::Tool::Segment::RuleBased::VERSION = '0.08170';
4             }
5 1     1   1310 use utf8;
  1         11  
  1         6  
6 1     1   412 use Moose;
  0            
  0            
7             use Treex::Core::Common;
8              
9             has use_paragraphs => (    # read-only boolean switch, tested in get_segments
10             is => 'ro',
11             isa => 'Bool',
12             default => 1,    # paragraph handling is on unless the caller turns it off
13             documentation =>
14             'Should paragraph boundaries be preserved as sentence boundaries?'
15             . ' Paragraph boundary is defined as two or more consecutive newlines.',
16             );    # NOTE(review): get_segments only marks such a newline run when the character before it is not . ! or ?
17              
18             has use_lines => (    # read-only boolean switch, tested in get_segments
19             is => 'ro',
20             isa => 'Bool',
21             default => 0,    # off by default: single newlines are then collapsed like any other whitespace
22             documentation =>
23             'Should newlines in the text be preserved as sentence boundaries?'
24             . ' (But if you want to detect sentence boundaries just based on newlines'    # leading space keeps "(" apart from the "?" above
25             . ' and nothing else, use rather W2A::SegmentOnNewlines.)',
26             );
27              
28             # Tokens that usually do not end a sentence even if they are followed by a period and a capital letter:
29             # * single uppercase letters, which usually serve as first-name initials
30             # * in language-specific descendants consider adding
31             #   - period-ending items that never indicate sentence breaks
32             #   - titles before names of persons etc.
33             #
34             # Note that we cannot write
35             # sub unbreakers { return qr{...}; }
36             # because we want the regex to be compiled just once, not on every method call.
37             my $UNBREAKERS = qr{\p{Upper}};    # default: any single uppercase letter
38              
39             sub unbreakers {    # returns the precompiled regex; get_segments applies it as \b($unbreakers)\.
40             return $UNBREAKERS;
41             }
42              
43             # Characters that can appear after a period (or other end-sentence symbol), returned as one plain string
44             sub closings {    # split_at_terminal_punctuation interpolates this string into a character class
45             return '"”»)';
46             }
47              
48             # Characters that can appear before the first word of a sentence, returned as one plain string
49             sub openings {    # split_at_terminal_punctuation interpolates this string into a character class
50             return '"“«(';
51             }
52              
53             sub get_segments {    # splits $text into sentences and returns them as a list of strings
54             my ( $self, $text ) = @_;
55              
56             # Pre-processing: hide the period that follows an unbreaker (e.g. an initial) behind a <<<DOT>>> placeholder
57             my $unbreakers = $self->unbreakers;
58             $text =~ s/\b($unbreakers)\./$1<<<DOT>>>/g;    # so the main rule below cannot split there
59              
60             # Two or more newlines usually separate paragraphs; mark the run as a boundary unless the text before it already ends with . ! or ?
61             if ( $self->use_paragraphs ) {
62             $text =~ s/([^.!?])\n\n+/$1<<<SEP>>>/gsm;    # $1 keeps the character in front of the newline run
63             }
64              
65             if ( $self->use_lines ) {    # every newline still present becomes a boundary marker
66             $text =~ s/\n/<<<SEP>>>/gsm;
67             }
68              
69             # Normalize whitespace: every run (including newlines not marked above) collapses to a single space
70             $text =~ s/\s+/ /gsm;
71              
72             # The main work: put a newline after sentence-final punctuation
73             $text = $self->split_at_terminal_punctuation($text);
74              
75             # Post-processing: turn the placeholders back into newlines and periods, then trim whitespace
76             $text =~ s/<<<SEP>>>/\n/gsmx;
77             $text =~ s/<<<DOT>>>/./gsxm;
78             $text =~ s/\s+$//gsxm;    # /m makes $ match at the end of every line, not only at the end of the text
79             $text =~ s/^\s+//gsxm;    # likewise ^ matches at the start of every line
80              
81             return split /\n/, $text;    # one list element per line, i.e. per sentence
82             }
83              
84             sub split_at_terminal_punctuation {    # returns $text with "\n" in place of the whitespace character at each detected boundary
85             my ( $self, $text ) = @_;
86             my ( $openings, $closings ) = ( $self->openings, $self->closings );    # NOTE(review): interpolated unescaped into [...] below, so a ] or \ returned by an overriding method would alter the class
87             $text =~ s{
88             ([.?!]) # $1 = end-sentence punctuation
89             ([$closings]?) # $2 = optional closing quote/bracket
90             \s # space
91             ([$openings]?\p{Upper}) # $3 = uppercase letter (optionally preceded by opening quote)
92             }{$1$2\n$3}gsxm;
93             return $text;    # unchanged when no boundary pattern matched
94             }
95              
96             1;
97              
98             __END__
99              
100             =encoding utf-8
101              
102             =head1 NAME
103              
104             Treex::Tool::Segment::RuleBased - Rule-based pseudo language-independent sentence segmenter
105              
106             =head1 VERSION
107              
108             version 0.08170
109              
110             =head1 DESCRIPTION
111              
112             Sentence boundaries are detected by regex rules
113             that detect end-sentence punctuation ([.?!]) followed by an uppercase letter.
114             This class is implemented in a pseudo language-independent way,
115             but it can be used as an ancestor for language-specific segmentation
116             by overriding the method C<get_segments>
117             (using C<around>, see L<Moose::Manual::MethodModifiers>)
118             or just by overriding methods C<unbreakers>, C<openings> and C<closings>.
119              
120             See L<Treex::Block::W2A::EN::Segment>
121              
122             =head1 METHODS
123              
124             =over 4
125              
126             =item get_segments
127              
128             Returns a list of sentences (one string per sentence).
129              
130             =back
131              
132             =head1 METHODS TO OVERRIDE
133              
134             =over 4
135              
136             =item get_segments
137              
138             Do the segmentation (handling C<use_paragraphs> and C<use_lines>)
139              
140             =item $text = split_at_terminal_punctuation($text)
141              
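142             Replaces the whitespace character after terminal punctuation (and an optional closing quote or bracket) with a newline when an uppercase letter (optionally preceded by an opening quote or bracket) follows.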
143              
144             =item unbreakers
145              
146             Returns a regex that should match tokens that usually do not end a sentence even if they are followed by a period and a capital letter:
147             * single uppercase letters, which usually serve as first-name initials
148             * in language-specific descendants consider adding
149             * period-ending items that never indicate sentence breaks
150             * titles before names of persons etc.
151              
152             =item openings
153              
154             Returns a string with the characters that can appear before the first word of a sentence
155              
156             =item closings
157              
158             Returns a string with the characters that can appear after a period (or other end-sentence symbol)
159              
160             =back
161              
162             =head1 AUTHOR
163              
164             Martin Popel <popel@ufal.mff.cuni.cz>
165              
166             =head1 COPYRIGHT AND LICENSE
167              
168             Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague
169              
170             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.