File Coverage

blib/lib/Test/Text/Sentence.pm

Criterion	Covered	Total	%
statement	22	23	95.6
branch	4	6	66.6
condition			n/a
subroutine	4	4	100.0
pod	1	1	100.0
total	31	34	91.1

line	stmt	bran	sub	pod	time	code
1						package Test::Text::Sentence;
2
3						#==============================================================================
4						#
5						# Start of POD
6						#
7						#==============================================================================
8
9						=head1 NAME
10
11						Test::Text::Sentence - module for splitting text into sentences
12
13						=head1 SYNOPSIS
14
15						use Test::Text::Sentence qw( split_sentences );
16						use locale;
17						use POSIX qw( locale_h );
18
19						setlocale( LC_CTYPE, 'iso_8859_1' );
20						@sentences = split_sentences( $text );
21
22						=head1 DESCRIPTION
23
24						The C module contains the function split_sentences, which
25						splits text into its constituent sentences, based on a fairly approximate
26						regex. If you set the locale before calling it, it will deal correctly with
27						locale dependant capitalization to identify sentence boundaries. Certain well
28						know exceptions, such as abbreviations, may cause incorrect segmentations.
29
30						=head1 FUNCTIONS
31
32						=head2 split_sentences( $text )
33
34						The split sentences function takes a scalar containing ascii text as an
35						argument and returns an array of sentences that the text has been split into.
36
37						@sentences = split_sentences( $text );
38
39						=head1 SEE ALSO
40
41						locale
42						POSIX
43
44						=head1 REPOSITORY
45
46						L
47
48						=head1 AUTHOR
49
50						Ave Wrigley Ewrigley@cre.canon.co.ukE
51
52						=head1 COPYRIGHT
53
54						Copyright (c) 1997 Canon Research Centre Europe (CRE). All rights reserved.
55
56						This is free software; you can redistribute it and/or modify it under
57						the same terms as the Perl 5 programming language system itself.
58
59						=cut
60
61						#==============================================================================
62						#
63						# End of POD
64						#
65						#==============================================================================
66
67						#==============================================================================
68						#
69						# Pragmas
70						#
71						#==============================================================================
72
73						require 5.006;
74	4		4		95260	use strict;
	4				18
	4				109
75	4		4		19	use warnings;
	4				6
	4				133
76
77						#==============================================================================
78						#
79						# Modules
80						#
81						#==============================================================================
82
83						require Exporter;
84
85						#==============================================================================
86						#
87						# Public globals
88						#
89						#==============================================================================
90
91	4		4		25	use vars qw( @ISA @EXPORT_OK @PUNCTUATION );
	4				7
	4				1172
92
93						@ISA = qw( Exporter );
94						@EXPORT_OK = qw( split_sentences );
95						@PUNCTUATION = ( '\.', '\!', '\?' );
96
97						#==============================================================================
98						#
99						# Public methods
100						#
101						#==============================================================================
102
103						#------------------------------------------------------------------------------
104						#
105						# split_sentences - takes text input an splits it into sentences, based on a
106						# fairly approximate regex. Returns an array of the sentences.
107						#
108						#------------------------------------------------------------------------------
109
110						sub split_sentences
111						{
112	16		16	1	3284	my $text = shift;
113
114	16	50			35	return () unless $text;
115
116						# capital letter is a character set; to account for local, this includes
117						# all characters for which lc is different from that character
118
119						my $capital_letter =
120						'[' .
121						join( '',
122	3056				3708	grep { lc( $_ ) ne ( $_ ) }
123	16				44	map { chr( $_ ) } ord( "A" ) .. ord( "\xff" )
	3056				3914
124						) .
125						']'
126						;
127
128	16				150	my $punctuation = '(?:' . join( '\|', @PUNCTUATION ) . ')';
129
130						# this needs to be alternation, not character class, because of
131						# multibyte characters
132
133	16				33	my $opt_start_quote = q/['"]?/; # "'
134	16				27	my $opt_close_quote = q/['"]?/; # "'
135
136						# these are distinguished because (eventually!) I would like to do
137						# locale stuff on quote characters
138
139	16				19	my $opt_start_bracket = q/[[({]?/; # }{
140	16				18	my $opt_close_bracket = q/[\])}]?/;
141
142						# return $text if there is no punctuation ...
143
144	16	100			189	return $text unless $text =~ /$punctuation/;
145
146	13				617	my @sentences = $text =~ /
147						(
148						# sentences start with ...
149						$opt_start_quote # an optional start quote
150						$opt_start_bracket # an optional start bracket
151						$capital_letter # a capital letter ...
152						.+? # at least some (non-greedy) anything ...
153						$punctuation # ... followed by any one of !?.
154						$opt_close_quote # an optional close quote
155						$opt_close_bracket # and an optional close bracket
156						)
157						(?= # with lookahead that it is followed by ...
158						(?: # either ...
159						\s+ # some whitespace ...
160						$opt_start_quote # an optional start quote
161						$opt_start_bracket # an optional start bracket
162						$capital_letter # an uppercase word character (for locale
163						# sensitive matching)
164						\| # or ...
165						\n\n # a couple (or more) of CRs
166						\| # or ...
167						\s*$ # optional whitespace, followed by end of string
168						)
169						)
170						/gxs
171						;
172	13	50			82	return @sentences if @sentences;
173	0					return ( $text );
174						}
175
176						#==============================================================================
177						#
178						# Return TRUE
179						#
180						#==============================================================================
181
182						1;