File Coverage

blib/lib/Test/Text/Sentence.pm

Criterion	Covered	Total	%
statement	22	23	95.6
branch	3	6	50.0
condition			n/a
subroutine	4	4	100.0
pod	1	1	100.0
total	30	34	88.2

line	stmt	bran	sub	pod	time	code
1						package Test::Text::Sentence;
2
3						#==============================================================================
4						#
5						# Start of POD
6						#
7						#==============================================================================
8
9						=head1 NAME
10
11						Test::Text::Sentence - module for splitting text into sentences
12
13						=head1 SYNOPSIS
14
15						use Test::Text::Sentence qw( split_sentences );
16						use locale;
17						use POSIX qw( locale_h );
18
19						setlocale( LC_CTYPE, 'iso_8859_1' );
20						@sentences = split_sentences( $text );
21
22						=head1 DESCRIPTION
23
24						The C module contains the function split_sentences, which
25						splits text into its constituent sentences, based on a fairly approximate
26						regex. If you set the locale before calling it, it will deal correctly with
27						locale dependant capitalization to identify sentence boundaries. Certain well
28						know exceptions, such as abbreviations, may cause incorrect segmentations.
29
30						=head1 FUNCTIONS
31
32						=head2 split_sentences( $text )
33
34						The split sentences function takes a scalar containing ascii text as an
35						argument and returns an array of sentences that the text has been split into.
36
37						@sentences = split_sentences( $text );
38
39						=head1 SEE ALSO
40
41						locale
42						POSIX
43
44						=head1 REPOSITORY
45
46						L
47
48						=head1 AUTHOR
49
50						Ave Wrigley Ewrigley@cre.canon.co.ukE
51
52						=head1 COPYRIGHT
53
54						Copyright (c) 1997 Canon Research Centre Europe (CRE). All rights reserved.
55
56						This is free software; you can redistribute it and/or modify it under
57						the same terms as the Perl 5 programming language system itself.
58
59						=cut
60
61						#==============================================================================
62						#
63						# End of POD
64						#
65						#==============================================================================
66
67						#==============================================================================
68						#
69						# Pragmas
70						#
71						#==============================================================================
72
73						require 5.006;
74	4		4		94246	use strict;
	4				16
	4				118
75	4		4		18	use warnings;
	4				7
	4				149
76
77						#==============================================================================
78						#
79						# Modules
80						#
81						#==============================================================================
82
83						require Exporter;
84
85						#==============================================================================
86						#
87						# Public globals
88						#
89						#==============================================================================
90
91	4		4		17	use vars qw( @ISA @EXPORT_OK @PUNCTUATION );
	4				13
	4				1225
92
93						@ISA = qw( Exporter );
94						@EXPORT_OK = qw( split_sentences );
95						@PUNCTUATION = ( '\.', '\!', '\?' );
96
97						#==============================================================================
98						#
99						# Public methods
100						#
101						#==============================================================================
102
103						#------------------------------------------------------------------------------
104						#
105						# split_sentences - takes text input an splits it into sentences, based on a
106						# fairly approximate regex. Returns an array of the sentences.
107						#
108						#------------------------------------------------------------------------------
109
110						sub split_sentences
111						{
112	16		16	1	3287	my $text = shift;
113
114	16	50			34	return () unless $text;
115
116						# capital letter is a character set; to account for local, this includes
117						# all characters for which lc is different from that character
118
119						my $capital_letter =
120						'[' .
121						join( '',
122	3056				3709	grep { lc( $_ ) ne ( $_ ) }
123	16				43	map { chr( $_ ) } ord( "A" ) .. ord( "\xff" )
	3056				3932
124						) .
125						']'
126						;
127
128	16				157	my $punctuation = '(?:' . join( '\|', @PUNCTUATION ) . ')';
129
130						# this needs to be alternation, not character class, because of
131						# multibyte characters
132
133	16				24	my $opt_start_quote = q/['"]?/; # "'
134	16				23	my $opt_close_quote = q/['"]?/; # "'
135
136						# these are distinguished because (eventually!) I would like to do
137						# locale stuff on quote characters
138
139	16				19	my $opt_start_bracket = q/[[({]?/; # }{
140	16				21	my $opt_close_bracket = q/[\])}]?/;
141
142						# return $text if there is no punctuation ...
143
144	16	50			229	return $text unless $text =~ /$punctuation/;
145
146	16				658	my @sentences = $text =~ /
147						(
148						# sentences start with ...
149						$opt_start_quote # an optional start quote
150						$opt_start_bracket # an optional start bracket
151						$capital_letter # a capital letter ...
152						.+? # at least some (non-greedy) anything ...
153						$punctuation # ... followed by any one of !?.
154						$opt_close_quote # an optional close quote
155						$opt_close_bracket # and an optional close bracket
156						)
157						(?= # with lookahead that it is followed by ...
158						(?: # either ...
159						\s+ # some whitespace ...
160						$opt_start_quote # an optional start quote
161						$opt_start_bracket # an optional start bracket
162						$capital_letter # an uppercase word character (for locale
163						# sensitive matching)
164						\| # or ...
165						\n\n # a couple (or more) of CRs
166						\| # or ...
167						\s*$ # optional whitespace, followed by end of string
168						)
169						)
170						/gxs
171						;
172	16	50			103	return @sentences if @sentences;
173	0					return ( $text );
174						}
175
176						#==============================================================================
177						#
178						# Return TRUE
179						#
180						#==============================================================================
181
182						1;