File Coverage

blib/lib/Lingua/EN/Sentence.pm

Criterion	Covered	Total	%
statement	81	103	78.6
branch	11	26	42.3
condition			n/a
subroutine	15	20	75.0
pod	7	12	58.3
total	114	161	70.8

line	stmt	bran	sub	pod	time	code
1						package Lingua::EN::Sentence;
2
3
4						=head1 NAME
5
6						Lingua::EN::Sentence - split text into sentences
7
8						=head1 SYNOPSIS
9
10						use Lingua::EN::Sentence qw( get_sentences add_acronyms );
11
12						add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.'
13						my $text = q{
14						A sentence usually ends with a dot, exclamation or question mark optionally followed by a space!
15						A string followed by 2 carriage returns denotes a sentence, even though it doesn't end in a dot
16
17						Dots after single letters such as U.S.A. or in numbers like -12.34 will not cause a split
18						as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. Calif. Esq.
19						and (some text) ellipsis such as ... or . . are ignored.
20						Some valid cases canot be deteected, such as the answer is X. It cannot easily be
21						differentiated from the single letter-dot sequence to abbreviate a person's given name.
22						Numbered points within a sentence will not cause a split 1. Like this one.
23						See the code for all the rules that apply.
24						This string has 7 sentences.
25						};
26
27						if (defined($sentences))
28						{
29						my $sentences = get_sentences($text);
30						foreach my $sent (@$sentences)
31						{
32						$i++;
33						print("SENTENCE $i:$sent\n");
34						}
35						}
36
37						=head1 DESCRIPTION
38
39						The C module contains the function get_sentences, which
40						splits text into its constituent sentences, based on a regular expression and a
41						list of abbreviations (built in and given).
42
43						Certain well know exceptions, such as abbreviations, may cause incorrect
44						segmentations. But some of them are already integrated into this code and are
45						being taken care of. Still, if you see that there are words causing the
46						get_sentences function to fail, you can add those to the module, so it notices them.
47						Note that abbreviations are case sensitive, so 'Mrs.' is recognised but not 'mrs.'
48
49						=head1 ALGORITHM
50
51						The first step is to mark the dot ending an abbreviation by changing it to a special
52						character. Now it won't cause a sentence split. The original dot is restored after
53						the sentences are split
54
55						Basically, I use a 'brute' regular expression to split the text into sentences.
56						(Well, nothing is yet split - I just mark the end-of-sentence). Then I look into
57						a set of rules which decide when an end-of-sentence is justified and when it's a
58						mistake. In case of a mistake, the end-of-sentence mark is removed. What are
59						such mistakes?
60
61						Letter-dot sequences: U.S.A. , i.e. , e.g.
62						Dot sequences: '..' or '...' or 'text . . more text'
63						Two carriage returns denote the end of a sentence even if it doesn't end with a dot
64
65						=head1 LIMITATIONS
66
67						1) John F. Kennedy was a former president
68						2) The answer is F. That ends the quiz
69
70						In the first sentence, F. is detected as a persons initial and not the end of a sentence.
71						But this means we cannot detect the true end of sentence 2, which is after the 'F'. This
72						case is not common though.
73
74						=head1 FUNCTIONS
75
76						All functions used should be requested in the 'use' clause. None is exported by
77						default.
78
79						=over 4
80
81						=item get_sentences( $text )
82
83						The get_sentences function takes a scalar containing ascii text as an argument
84						and returns a reference to an array of sentences that the text has been split
85						into. Returned sentences will be trimmed (beginning and end of sentence) of
86						white space. Strings with no alpha-numeric characters in them, won't be
87						returned as sentences. If no text is supplied, a reference to an empty array
88						is returned.
89
90						=item add_acronyms( @acronyms )
91
92						This function is used for adding acronyms not supported by this code.
93						The input should be regular expressions for matching the desired acronyms,
94						but should not include the final period (C<.>). So, for example, C
95						matches C and C. C will match C. You do not
96						need to bother with acronyms consisting of single letters and dots
97						(e.g. "U.S.A."), as these are found automatically. Note also that acronyms
98						are searched for on a case insensitive basis.
99
100						Please see 'Acronym/Abbreviations list' section for the abbreviations already
101						supported by this module.
102
103						=item get_acronyms( )
104
105						This function will return the defined list of acronyms.
106
107						=item set_acronyms( @my_acronyms )
108
109						This function replaces the predefined acronym list with the given list. See
110						L for details on the input specifications.
111
112
113						=item get_EOS( )
114
115						This function returns the value of the string used to mark the end of sentence.
116						You might want to see what it is, and to make sure your text doesn't contain it.
117						You can use set_EOS() to alter the end-of-sentence string to whatever you
118						desire.
119
120						=item set_EOS( $new_EOS_string )
121
122						This function alters the end-of-sentence string used to mark the end of sentences.
123
124						=item set_locale( $new_locale )
125						Receives language locale in the form language.country.character-set
126						for example:
127						"fr_CA.ISO8859-1"
128						for Canadian French using character set ISO8859-1.
129
130						Returns a reference to a hash containing the current locale formatting values.
131						Returns undef if got undef.
132
133
134						The following will set the LC_COLLATE behaviour to Argentinian Spanish.
135						NOTE: The naming and availability of locales depends on your operating sysem.
136						Please consult the perllocale manpage for how to find out which locales are available in your system.
137
138						$loc = set_locale( "es_AR.ISO8859-1" );
139
140						This actually does this:
141
142						$loc = setlocale( LC_ALL, "es_AR.ISO8859-1" );
143
144						=back
145
146						=head1 Acronym/Abbreviations list
147
148						You can use the get_acronyms() function to get acronyms.
149						It has become too long to specify in the documentation.
150
151						If I come across a good general-purpose list - I'll incorporate it into this module.
152						Feel free to suggest such lists.
153
154						=head1 FUTURE WORK
155
156						[1] Object Oriented like usage
157						[2] Supporting more than just English/French
158						[3] Code optimization. Currently everything is RE based and not so optimized RE
159						[4] Possibly use more semantic heuristics for detecting a beginning of a sentence
160
161						=head1 SEE ALSO
162
163						Text::Sentence
164						Lingua::Sentence
165						Raku port of Lingua::EN::Sentence
166
167						=head1 REPOSITORY
168
169						L
170
171						=head1 AUTHOR
172
173						Shlomo Yona shlomo@cs.haifa.ac.il
174
175						Currently being maintained by Kim Ryan, kimryan at CPAN d o t org
176
177
178						=head1 COPYRIGHT AND LICENSE
179
180						Copyright (c) 2001-2016 Shlomo Yona. All rights reserved.
181						Copyright (c) 2022 Kim Ryan. All rights reserved.
182
183						This library is free software; you can redistribute it and/or modify
184						it under the same terms as Perl itself.
185
186						=cut
187
188						#==============================================================================
189						#
190						# Pragmas
191						#
192						#==============================================================================
193
194	1		1		70684	use strict;
	1				3
	1				32
195	1		1		6	use warnings;
	1				2
	1				29
196	1		1		521	use POSIX qw(locale_h setlocale);
	1				6644
	1				5
197						#==============================================================================
198						#
199						# Modules
200						#
201						#==============================================================================
202	1		1		1664	use Exporter;
	1				2
	1				53
203
204						#==============================================================================
205						#
206						# Public globals
207						#
208						#==============================================================================
209	1		1		7	use vars qw/$VERSION @ISA @EXPORT_OK $EOS $LOC $AP $P $PAP @ABBREVIATIONS/;
	1				2
	1				110
210	1		1		6	use Carp qw/cluck/;
	1				2
	1				45
211	1		1		632	use English;
	1				3681
	1				5
212
213						our $VERSION = '0.34';
214
215						our $LOC;
216						if ($OSNAME ne 'android') {
217						# Call POSIX function
218						$LOC= setlocale(LC_CTYPE, "en_US.UTF-8");
219						}
220
221
222	1		1		959	use locale;
	1				595
	1				5
223
224						@ISA = qw( Exporter );
225						@EXPORT_OK = qw( get_sentences add_acronyms get_acronyms set_acronyms get_EOS set_EOS set_locale);
226
227						our $VERBOSE = 0; # echo intermediate data transforms, useful for debugging
228						our $EOS = "\001";
229						our $EOA = '__EOA__';
230
231						our $P = q/[\.!?]/; # PUNCTUATION
232
233						$AP = q/(?:'\|"\|\?\|\)\|\]\|\})?/; # AFTER PUNCTUATION
234						our $PAP = $P.$AP;
235
236						# ACRONYMS AND ABBREVIATIONS
237						my @PEOPLE = qw( Mr Mrs Ms Dr Prof Mme Ms?gr Sens? Reps? Gov Attys? Supt Insp Const Det Revd? Ald Rt Hon);
238						my @TITLE_SUFFIXES = qw(PhD Jn?r Sn?r Esq MD LLB);
239						my @MILITARY = qw( Col Gen Lt Cm?dr Adm Capt Sgt Cpl Maj Pte);
240						my @INSTITUTES = qw( Dept Univ Assn Bros);
241						my @COMPANIES = qw( Inc Pty Ltd Co Corp);
242						my @PLACES =
243						qw(
244						Arc Al Ave Blv?d Cl Ct Cres Dr Expy? Fw?y Hwa?y La Pde? Pl Plz Rd St Tce
245						dist mt km in ft
246						Ala Ariz Ark Cal Calif Col Colo Conn Del Fed Fla Ga Ida Id Ill Ind Ia Kan Kans Ken Ky
247						La Me Md Is Mass Mich Minn Miss Mo Mont Neb Nebr Nev Mex Okla Ok Ore Penna Penn Pa Dak
248						Tenn Tex Ut Vt Va Wash Wis Wisc Wy Wyo USAFA Alta Man Ont Qu? Sask Yuk
249						Aust Vic Qld Tas
250						);
251						my @MONTHS = qw(Jan Feb Mar Apr May Jun Jul Aug Sept? Oct Nov Dec);
252						my @MISC = qw(no esp est); # Established
253						my @LATIN = qw(vs etc al ibid sic);
254						my @MATH = qw(fig eq sec cf Thm Def Conj resp);
255
256						our @ABBREVIATIONS = (@PEOPLE, @TITLE_SUFFIXES, @MILITARY, @INSTITUTES, @COMPANIES, @PLACES, @MONTHS, @MISC,@LATIN, @MATH);
257
258
259						#==============================================================================
260						#
261						# Public methods
262						#
263						#==============================================================================
264
265						#------------------------------------------------------------------------------
266						# get_sentences - takes text input and splits it into sentences.
267						# A regular expression viciously cuts the text into sentences,
268						# and then a list of rules (some of them consist of a list of abbreviations)
269						# are applied on the marked text in order to fix end-of-sentence markings in
270						# places which are not indeed end-of-sentence.
271						#------------------------------------------------------------------------------
272						sub get_sentences {
273	2		2	1	99	my ($text) = @_;
274
275	2	50			7	unless (defined($text))
276						{
277	0				0	return [];
278						}
279
280	2	50			6	$VERBOSE and print("ORIGINAL\n$text\n");
281
282	2				7	$text = mark_up_abbreviations($text);
283	2	50			7	$VERBOSE and print("mark_up_abbreviations\n$text\n");
284
285	2				20	$text = first_sentence_breaking($text);
286	2	50			7	$VERBOSE and print("first_sentence_breaking\n$text\n");
287
288	2				5	$text = remove_false_end_of_sentence($text);
289	2	50			7	$VERBOSE and print("remove_false_end_of_sentence\n$text\n");
290
291	2				6	$text = split_unsplit_stuff($text);
292	2	50			5	$VERBOSE and print("split_unsplit_stuff\n$text\n");
293
294	2				19	my @sentences = split(/$EOS/,$text);
295	2				6	my $cleaned_sentences = clean_sentences(\@sentences);
296	2	50			6	if ($VERBOSE) {
297	0				0	my $i;
298	0				0	foreach my $sent (@$cleaned_sentences) {
299	0				0	$i++;
300	0				0	print("SENTENCE $i >>>$sent<<<\n");
301						}
302						}
303	2				8	return $cleaned_sentences;
304						}
305
306						#------------------------------------------------------------------------------
307						# add_acronyms - user can add a list of acronyms/abbreviations.
308						#------------------------------------------------------------------------------
309						sub add_acronyms {
310	1		1	1	784	push @ABBREVIATIONS, @_;
311						}
312
313						#------------------------------------------------------------------------------
314						# get_acronyms - get list of defined acronyms.
315						#------------------------------------------------------------------------------
316						sub get_acronyms {
317	0		0	1	0	return @ABBREVIATIONS;
318						}
319
320						#------------------------------------------------------------------------------
321						# set_acronyms - replace the predefined acronyms list with your own list.
322						#------------------------------------------------------------------------------
323						sub set_acronyms {
324	0		0	1	0	@ABBREVIATIONS=@_;
325						}
326
327						#------------------------------------------------------------------------------
328						# get_EOS - get the value of the $EOS variable (end-of-sentence mark).
329						#------------------------------------------------------------------------------
330						sub get_EOS {
331	0		0	1	0	return $EOS;
332						}
333
334						#------------------------------------------------------------------------------
335						# set_EOS - set the value of the $EOS variable (end-of-sentence mark).
336						#------------------------------------------------------------------------------
337						sub set_EOS {
338	0		0	1	0	my ($new_EOS) = @_;
339	0	0			0	if (not defined $new_EOS) {
340	0				0	cluck "Won't set \$EOS to undefined value!\n";
341	0				0	return $EOS;
342						}
343	0				0	$EOS = $new_EOS;
344	0				0	return $EOS;
345						}
346
347						#------------------------------------------------------------------------------
348
349						# set_locale - set the value of the locale.
350						#
351						# Receieves language locale in the form
352						# language.country.character-set
353						# for example:
354						# "fr_CA.ISO8859-1"
355						# for Canadian French using character set ISO8859-1.
356						#
357						# Returns a reference to a hash containing the current locale formatting values.
358						# Returns undef if got undef.
359						#
360						# The following will set the LC_ALL behaviour to Argentinian Spanish.
361						# NOTE: The naming and availability of locales depends on your operating system.
362						# Please consult the perllocale manpage for how to find out which locales are
363						# available in your system.
364						#
365						# $loc = set_locale( "es_AR.ISO8859-1" );
366						#
367						# This actually does this:
368						#
369						# $loc = setlocale( LC_ALL, "es_AR.ISO8859-1" ); # NOTE, but actually does LC_CTYPE, should be LC_COLLATE?
370
371						#------------------------------------------------------------------------------
372						sub set_locale {
373	0		0	1	0	my ($new_locale) = @_;
374	0	0			0	if (not defined $new_locale) {
375	0				0	cluck "Won't set locale to undefined value!\n";
376	0				0	return undef;
377						}
378
379	0	0			0	if ($OSNAME ne 'android') {
380						# Call POSIX function
381	0				0	$LOC = setlocale(LC_CTYPE, $new_locale);
382	0				0	return $LOC;
383						}
384						else {
385	0				0	return undef;
386						}
387						}
388
389						#==============================================================================
390						#
391						# Private methods
392						#
393						#==============================================================================
394						sub remove_false_end_of_sentence {
395	2		2	0	5	my ($marked_segment) = @_;
396
397
398						# don't split U.S.A., U.K.
399	2				122	$marked_segment=~s/([^-\w]\w$PAP\s)$EOS/$1/sg;
400	2				64	$marked_segment=~s/([^-\w]\w$P)$EOS/$1/sg;
401
402						# don't split after a white-space followed by a single letter or number followed
403						# by a dot followed by another whitespace., such as "Something. 1. point one"
404						# Note: will fail for 12. Point 12
405	2				71	$marked_segment=~s/(\s[\w\d]\.\s+)$EOS/$1/sg;
406
407						# fix ellipsis: bla bla... yada yada
408	2				33	$marked_segment=~s/(\.\.\. )$EOS([[:lower:]])/$1$2/sg;
409
410						# fix quoted EOS such as "." "?" "!"
411	2				26	$marked_segment=~s/(['"]$P['"]\s+)$EOS/$1/sg;
412
413						# don't break after quote unless its a capital letter.
414	2				56	$marked_segment=~s/(["']\s)$EOS(\s[[:lower:]])/$1$2/sg;
415
416						# don't break: text . . some more text.
417	2				28	$marked_segment=~s/(\s\.\s)$EOS(\s*)/$1$2/sg;
418	2				47	$marked_segment=~s/(["']\s)$EOS(\s[[:lower:]])/$1$2/sg;
419
420	2				64	$marked_segment=~s/(\s$PAP\s)$EOS/$1/sg;
421
422	2				10	return $marked_segment;
423						}
424
425						sub split_unsplit_stuff {
426	2		2	0	11	my ($text) = @_;
427
428
429						# breaks numbered points, such as {EOL}1. point one
430	2				93	$text=~s/([\w $P]\d)($P)(\s+)/$1$2$EOS$3/sg;
431
432						# eg 'end. (' -> 'end. $EOS ('
433	2				44	$text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; # open bracket
434	2				18	$text=~s/('\w$P)(\s)/$1$EOS$2/gs;
435
436	2				46	$text=~s/(\sno\.)(\s+)(?!\d)/$1$EOS$2/gis;
437
438						# add EOS when you see "a.m." or "p.m." followed by a capital letter.
439	2				5	$text=~s/([ap]\.m\.\s+)([[:upper:]])/$1$EOS$2/gs;
440
441	2				6	return $text;
442						}
443
444						sub clean_sentences {
445	2		2	0	5	my ($sentences) = @_;
446	2				3	my $cleaned_sentences;
447	2				5	foreach my $s (@$sentences) {
448	15	50			55	next if not defined $s;
449	15	50			47	next if $s!~m/\w+/;
450	15				45	$s=~s/^\s*//;
451	15				195	$s=~s/\s*$//;
452						## $s=~s/\s+/ /g;
453						# replace end of abbrev marker with a dot
454	15				69	$s=~s/$EOA/\./g;
455	15				38	push @$cleaned_sentences,$s;
456						}
457	2				7	return $cleaned_sentences;
458						}
459
460						# Replace sequence such as Mr. A. Smith Jnr. with Mr__EOA__ A__EOA__ etc
461						# This simplifies the code that detects end of sentnees. The marker is
462						# replaced with the original dot adter sentence slitting
463
464						sub mark_up_abbreviations {
465	2		2	0	5	my ($text) = @_;
466
467	2				3	my %found_abbrevs;
468	2				15	foreach my $abbrev (@ABBREVIATIONS) {
469	317	100			13192	if ($text=~/\b($abbrev)\./i) {
470	13				42	$found_abbrevs{$abbrev} = 1;
471						}
472						}
473
474	2				13	foreach my $abbrev (keys %found_abbrevs) {
475	11				154	$text=~s/($abbrev)\./$1$EOA/gs;
476						}
477
478	2				19	return $text;
479						}
480
481						sub first_sentence_breaking {
482	2		2	0	8	my ($text) = @_;
483	2				22	$text=~s/\n\s*\n/$EOS/gs; ## double new-line means a different sentence.
484	2				78	$text=~s/($PAP\s)/$1$EOS/gs;
485	2				84	$text=~s/(\s\w$P)/$1$EOS/gs; # break also when single letter comes before punc.
486	2				9	return $text;
487						}
488
489						1;