File Coverage

blib/lib/Text/Sentence.pm

Criterion	Covered	Total	%
statement	23	23	100.0
branch	5	6	83.3
condition			n/a
subroutine	4	4	100.0
pod	1	1	100.0
total	33	34	97.0

line	stmt	bran	sub	pod	time	code
1						package Text::Sentence;
2
3						#==============================================================================
4						#
5						# Start of POD
6						#
7						#==============================================================================
8
9						=head1 NAME
10
11						Text::Sentence - module for splitting text into sentences
12
13						=head1 SYNOPSIS
14
15						use Text::Sentence qw( split_sentences );
16						use locale;
17						use POSIX qw( locale_h );
18
19						setlocale( LC_CTYPE, 'iso_8859_1' );
20						@sentences = split_sentences( $text );
21
22						=head1 DESCRIPTION
23
24						The C module contains the function split_sentences, which
25						splits text into its constituent sentences, based on a fairly approximate
26						regex. If you set the locale before calling it, it will deal correctly with
27						locale dependant capitalization to identify sentence boundaries. Certain well
28						know exceptions, such as abreviations, may cause incorrect segmentations.
29
30						=head1 FUNCTIONS
31
32						=head2 split_sentences( $text )
33
34						The split sentences function takes a scalar containing ascii text as an
35						argument and returns an array of sentences that the text has been split into.
36
37						@sentences = split_sentences( $text );
38
39						=head1 SEE ALSO
40
41						locale
42						POSIX
43
44						=head1 REPOSITORY
45
46						L
47
48						=head1 AUTHOR
49
50						Ave Wrigley Ewrigley@cre.canon.co.ukE
51
52						=head1 COPYRIGHT
53
54						Copyright (c) 1997 Canon Research Centre Europe (CRE). All rights reserved.
55
56						This is free software; you can redistribute it and/or modify it under
57						the same terms as the Perl 5 programming language system itself.
58
59						=cut
60
61						#==============================================================================
62						#
63						# End of POD
64						#
65						#==============================================================================
66
67						#==============================================================================
68						#
69						# Pragmas
70						#
71						#==============================================================================
72
73						require 5.006;
74	2		2		13	use strict;
	2				4
	2				43
75	2		2		9	use warnings;
	2				3
	2				72
76
77						#==============================================================================
78						#
79						# Modules
80						#
81						#==============================================================================
82
83						require Exporter;
84
85						#==============================================================================
86						#
87						# Public globals
88						#
89						#==============================================================================
90
91	2		2		8	use vars qw( $VERSION @ISA @EXPORT_OK @PUNCTUATION );
	2				3
	2				705
92
93						$VERSION = '0.020';
94						@ISA = qw( Exporter );
95						@EXPORT_OK = qw( split_sentences );
96						@PUNCTUATION = ( '\.', '\!', '\?' );
97
98						#==============================================================================
99						#
100						# Public methods
101						#
102						#==============================================================================
103
104						#------------------------------------------------------------------------------
105						#
106						# split_sentences - takes text input an splits it into sentences, based on a
107						# fairly approximate regex. Returns an array of the sentences.
108						#
109						#------------------------------------------------------------------------------
110
111						sub split_sentences
112						{
113	344		344	1	450	my $text = shift;
114
115	344	50			653	return () unless $text;
116
117						# capital letter is a character set; to account for local, this includes
118						# all characters for which lc is different from that character
119
120						my $capital_letter =
121						'[' .
122						join( '',
123	65704				102604	grep { lc( $_ ) ne ( $_ ) }
124	344				760	map { chr( $_ ) } ord( "A" ) .. ord( "\xff" )
	65704				103070
125						) .
126						']'
127						;
128
129	344				4256	my $punctuation = '(?:' . join( '\|', @PUNCTUATION ) . ')';
130
131						# this needs to be alternation, not character class, because of
132						# multibyte characters
133
134	344				419	my $opt_start_quote = q/['"]?/; # "'
135	344				375	my $opt_close_quote = q/['"]?/; # "'
136
137						# these are distinguished because (eventually!) I would like to do
138						# locale stuff on quote characters
139
140	344				354	my $opt_start_bracket = q/[[({]?/; # }{
141	344				378	my $opt_close_bracket = q/[\])}]?/;
142
143						# return $text if there is no punctuation ...
144
145	344	100			2171	return $text unless $text =~ /$punctuation/;
146
147	128				7894	my @sentences = $text =~ /
148						(
149						# sentences start with ...
150						$opt_start_quote # an optional start quote
151						$opt_start_bracket # an optional start bracket
152						$capital_letter # a capital letter ...
153						.+? # at least some (non-greedy) anything ...
154						$punctuation # ... followed by any one of !?.
155						$opt_close_quote # an optional close quote
156						$opt_close_bracket # and an optional close bracket
157						)
158						(?= # with lookahead that it is followed by ...
159						(?: # either ...
160						\s+ # some whitespace ...
161						$opt_start_quote # an optional start quote
162						$opt_start_bracket # an optional start bracket
163						$capital_letter # an uppercase word character (for locale
164						# sensitive matching)
165						\| # or ...
166						\n\n # a couple (or more) of CRs
167						\| # or ...
168						\s*$ # optional whitespace, followed by end of string
169						)
170						)
171						/gxs
172						;
173	128	100			622	return @sentences if @sentences;
174	42				164	return ( $text );
175						}
176
177						#==============================================================================
178						#
179						# Return TRUE
180						#
181						#==============================================================================
182
183						1;