File Coverage

blib/lib/Lingua/EN/Sentence.pm
Criterion Covered Total %
statement 81 103 78.6
branch 11 26 42.3
condition n/a
subroutine 15 20 75.0
pod 7 12 58.3
total 114 161 70.8


line stmt bran cond sub pod time code
1             package Lingua::EN::Sentence;
2              
3              
4             =head1 NAME
5              
6             Lingua::EN::Sentence - split text into sentences
7              
8             =head1 SYNOPSIS
9              
10             use Lingua::EN::Sentence qw( get_sentences add_acronyms );
11              
12             add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.'
13             my $text = q{
14             A sentence usually ends with a dot, exclamation or question mark optionally followed by a space!
15             A string followed by 2 carriage returns denotes a sentence, even though it doesn't end in a dot
16              
17             Dots after single letters such as U.S.A. or in numbers like -12.34 will not cause a split
18             as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. Calif. Esq.
19             and (some text) ellipsis such as ... or . . are ignored.
20             Some valid cases canot be deteected, such as the answer is X. It cannot easily be
21             differentiated from the single letter-dot sequence to abbreviate a person's given name.
22             Numbered points within a sentence will not cause a split 1. Like this one.
23             See the code for all the rules that apply.
24             This string has 7 sentences.
25             };
26              
27             if (defined($sentences))
28             {
29             my $sentences = get_sentences($text);
30             foreach my $sent (@$sentences)
31             {
32             $i++;
33             print("SENTENCE $i:$sent\n");
34             }
35             }
36              
37             =head1 DESCRIPTION
38              
39             The C module contains the function get_sentences, which
40             splits text into its constituent sentences, based on a regular expression and a
41             list of abbreviations (built in and given).
42              
43             Certain well know exceptions, such as abbreviations, may cause incorrect
44             segmentations. But some of them are already integrated into this code and are
45             being taken care of. Still, if you see that there are words causing the
46             get_sentences function to fail, you can add those to the module, so it notices them.
47             Note that abbreviations are case sensitive, so 'Mrs.' is recognised but not 'mrs.'
48              
49             =head1 ALGORITHM
50              
51             The first step is to mark the dot ending an abbreviation by changing it to a special
52             character. Now it won't cause a sentence split. The original dot is restored after
53             the sentences are split
54              
55             Basically, I use a 'brute' regular expression to split the text into sentences.
56             (Well, nothing is yet split - I just mark the end-of-sentence). Then I look into
57             a set of rules which decide when an end-of-sentence is justified and when it's a
58             mistake. In case of a mistake, the end-of-sentence mark is removed. What are
59             such mistakes?
60              
61             Letter-dot sequences: U.S.A. , i.e. , e.g.
62             Dot sequences: '..' or '...' or 'text . . more text'
63             Two carriage returns denote the end of a sentence even if it doesn't end with a dot
64              
65             =head1 LIMITATIONS
66              
67             1) John F. Kennedy was a former president
68             2) The answer is F. That ends the quiz
69              
70             In the first sentence, F. is detected as a persons initial and not the end of a sentence.
71             But this means we cannot detect the true end of sentence 2, which is after the 'F'. This
72             case is not common though.
73              
74             =head1 FUNCTIONS
75              
76             All functions used should be requested in the 'use' clause. None is exported by
77             default.
78              
79             =over 4
80              
81             =item get_sentences( $text )
82              
83             The get_sentences function takes a scalar containing ascii text as an argument
84             and returns a reference to an array of sentences that the text has been split
85             into. Returned sentences will be trimmed (beginning and end of sentence) of
86             white space. Strings with no alpha-numeric characters in them, won't be
87             returned as sentences. If no text is supplied, a reference to an empty array
88             is returned.
89              
90             =item add_acronyms( @acronyms )
91              
92             This function is used for adding acronyms not supported by this code.
93             The input should be regular expressions for matching the desired acronyms,
94             but should not include the final period (C<.>). So, for example, C
95             matches C and C. C will match C. You do not
96             need to bother with acronyms consisting of single letters and dots
97             (e.g. "U.S.A."), as these are found automatically. Note also that acronyms
98             are searched for on a case insensitive basis.
99              
100             Please see 'Acronym/Abbreviations list' section for the abbreviations already
101             supported by this module.
102              
103             =item get_acronyms( )
104              
105             This function will return the defined list of acronyms.
106              
107             =item set_acronyms( @my_acronyms )
108              
109             This function replaces the predefined acronym list with the given list. See
110             L for details on the input specifications.
111              
112              
113             =item get_EOS( )
114              
115             This function returns the value of the string used to mark the end of sentence.
116             You might want to see what it is, and to make sure your text doesn't contain it.
117             You can use set_EOS() to alter the end-of-sentence string to whatever you
118             desire.
119              
120             =item set_EOS( $new_EOS_string )
121              
122             This function alters the end-of-sentence string used to mark the end of sentences.
123              
124             =item set_locale( $new_locale )
125             Receives language locale in the form language.country.character-set
126             for example:
127             "fr_CA.ISO8859-1"
128             for Canadian French using character set ISO8859-1.
129              
130             Returns a reference to a hash containing the current locale formatting values.
131             Returns undef if got undef.
132              
133              
134             The following will set the LC_COLLATE behaviour to Argentinian Spanish.
135             NOTE: The naming and availability of locales depends on your operating sysem.
136             Please consult the perllocale manpage for how to find out which locales are available in your system.
137              
138             $loc = set_locale( "es_AR.ISO8859-1" );
139              
140             This actually does this:
141              
142             $loc = setlocale( LC_ALL, "es_AR.ISO8859-1" );
143              
144             =back
145              
146             =head1 Acronym/Abbreviations list
147              
148             You can use the get_acronyms() function to get acronyms.
149             It has become too long to specify in the documentation.
150              
151             If I come across a good general-purpose list - I'll incorporate it into this module.
152             Feel free to suggest such lists.
153              
154             =head1 FUTURE WORK
155              
156             [1] Object Oriented like usage
157             [2] Supporting more than just English/French
158             [3] Code optimization. Currently everything is RE based and not so optimized RE
159             [4] Possibly use more semantic heuristics for detecting a beginning of a sentence
160              
161             =head1 SEE ALSO
162              
163             Text::Sentence
164             Lingua::Sentence
165             Raku port of Lingua::EN::Sentence
166            
167             =head1 REPOSITORY
168              
169             L
170              
171             =head1 AUTHOR
172              
173             Shlomo Yona shlomo@cs.haifa.ac.il
174              
175             Currently being maintained by Kim Ryan, kimryan at CPAN d o t org
176              
177              
178             =head1 COPYRIGHT AND LICENSE
179              
180             Copyright (c) 2001-2016 Shlomo Yona. All rights reserved.
181             Copyright (c) 2022 Kim Ryan. All rights reserved.
182              
183             This library is free software; you can redistribute it and/or modify
184             it under the same terms as Perl itself.
185              
186             =cut
187              
188             #==============================================================================
189             #
190             # Pragmas
191             #
192             #==============================================================================
193              
194 1     1   70684 use strict;
  1         3  
  1         32  
195 1     1   6 use warnings;
  1         2  
  1         29  
196 1     1   521 use POSIX qw(locale_h setlocale);
  1         6644  
  1         5  
197             #==============================================================================
198             #
199             # Modules
200             #
201             #==============================================================================
202 1     1   1664 use Exporter;
  1         2  
  1         53  
203              
204             #==============================================================================
205             #
206             # Public globals
207             #
208             #==============================================================================
209 1     1   7 use vars qw/$VERSION @ISA @EXPORT_OK $EOS $LOC $AP $P $PAP @ABBREVIATIONS/;
  1         2  
  1         110  
210 1     1   6 use Carp qw/cluck/;
  1         2  
  1         45  
211 1     1   632 use English;
  1         3681  
  1         5  
212              
213             our $VERSION = '0.34';
214              
215             our $LOC;
216             if ($OSNAME ne 'android') {
217             # Call POSIX function
218             $LOC= setlocale(LC_CTYPE, "en_US.UTF-8");
219             }
220              
221            
222 1     1   959 use locale;
  1         595  
  1         5  
223              
224             @ISA = qw( Exporter );
225             @EXPORT_OK = qw( get_sentences add_acronyms get_acronyms set_acronyms get_EOS set_EOS set_locale);
226              
227             our $VERBOSE = 0; # echo intermediate data transforms, useful for debugging
228             our $EOS = "\001";
229             our $EOA = '__EOA__';
230              
231             our $P = q/[\.!?]/; # PUNCTUATION
232              
233             $AP = q/(?:'|"|\?|\)|\]|\})?/; # AFTER PUNCTUATION
234             our $PAP = $P.$AP;
235              
236             # ACRONYMS AND ABBREVIATIONS
237             my @PEOPLE = qw( Mr Mrs Ms Dr Prof Mme Ms?gr Sens? Reps? Gov Attys? Supt Insp Const Det Revd? Ald Rt Hon);
238             my @TITLE_SUFFIXES = qw(PhD Jn?r Sn?r Esq MD LLB);
239             my @MILITARY = qw( Col Gen Lt Cm?dr Adm Capt Sgt Cpl Maj Pte);
240             my @INSTITUTES = qw( Dept Univ Assn Bros);
241             my @COMPANIES = qw( Inc Pty Ltd Co Corp);
242             my @PLACES =
243             qw(
244             Arc Al Ave Blv?d Cl Ct Cres Dr Expy? Fw?y Hwa?y La Pde? Pl Plz Rd St Tce
245             dist mt km in ft
246             Ala Ariz Ark Cal Calif Col Colo Conn Del Fed Fla Ga Ida Id Ill Ind Ia Kan Kans Ken Ky
247             La Me Md Is Mass Mich Minn Miss Mo Mont Neb Nebr Nev Mex Okla Ok Ore Penna Penn Pa Dak
248             Tenn Tex Ut Vt Va Wash Wis Wisc Wy Wyo USAFA Alta Man Ont Qu? Sask Yuk
249             Aust Vic Qld Tas
250             );
251             my @MONTHS = qw(Jan Feb Mar Apr May Jun Jul Aug Sept? Oct Nov Dec);
252             my @MISC = qw(no esp est); # Established
253             my @LATIN = qw(vs etc al ibid sic);
254             my @MATH = qw(fig eq sec cf Thm Def Conj resp);
255              
256             our @ABBREVIATIONS = (@PEOPLE, @TITLE_SUFFIXES, @MILITARY, @INSTITUTES, @COMPANIES, @PLACES, @MONTHS, @MISC,@LATIN, @MATH);
257              
258              
259             #==============================================================================
260             #
261             # Public methods
262             #
263             #==============================================================================
264              
265             #------------------------------------------------------------------------------
266             # get_sentences - takes text input and splits it into sentences.
267             # A regular expression viciously cuts the text into sentences,
268             # and then a list of rules (some of them consist of a list of abbreviations)
269             # are applied on the marked text in order to fix end-of-sentence markings in
270             # places which are not indeed end-of-sentence.
271             #------------------------------------------------------------------------------
272             sub get_sentences {
273 2     2 1 99 my ($text) = @_;
274            
275 2 50       7 unless (defined($text))
276             {
277 0         0 return [];
278             }
279            
280 2 50       6 $VERBOSE and print("ORIGINAL\n$text\n");
281            
282 2         7 $text = mark_up_abbreviations($text);
283 2 50       7 $VERBOSE and print("mark_up_abbreviations\n$text\n");
284            
285 2         20 $text = first_sentence_breaking($text);
286 2 50       7 $VERBOSE and print("first_sentence_breaking\n$text\n");
287            
288 2         5 $text = remove_false_end_of_sentence($text);
289 2 50       7 $VERBOSE and print("remove_false_end_of_sentence\n$text\n");
290            
291 2         6 $text = split_unsplit_stuff($text);
292 2 50       5 $VERBOSE and print("split_unsplit_stuff\n$text\n");
293            
294 2         19 my @sentences = split(/$EOS/,$text);
295 2         6 my $cleaned_sentences = clean_sentences(\@sentences);
296 2 50       6 if ($VERBOSE) {
297 0         0 my $i;
298 0         0 foreach my $sent (@$cleaned_sentences) {
299 0         0 $i++;
300 0         0 print("SENTENCE $i >>>$sent<<<\n");
301             }
302             }
303 2         8 return $cleaned_sentences;
304             }
305              
306             #------------------------------------------------------------------------------
307             # add_acronyms - user can add a list of acronyms/abbreviations.
308             #------------------------------------------------------------------------------
309             sub add_acronyms {
310 1     1 1 784 push @ABBREVIATIONS, @_;
311             }
312              
313             #------------------------------------------------------------------------------
314             # get_acronyms - get list of defined acronyms.
315             #------------------------------------------------------------------------------
316             sub get_acronyms {
317 0     0 1 0 return @ABBREVIATIONS;
318             }
319              
320             #------------------------------------------------------------------------------
321             # set_acronyms - replace the predefined acronyms list with your own list.
322             #------------------------------------------------------------------------------
323             sub set_acronyms {
324 0     0 1 0 @ABBREVIATIONS=@_;
325             }
326              
327             #------------------------------------------------------------------------------
328             # get_EOS - get the value of the $EOS variable (end-of-sentence mark).
329             #------------------------------------------------------------------------------
330             sub get_EOS {
331 0     0 1 0 return $EOS;
332             }
333              
334             #------------------------------------------------------------------------------
335             # set_EOS - set the value of the $EOS variable (end-of-sentence mark).
336             #------------------------------------------------------------------------------
337             sub set_EOS {
338 0     0 1 0 my ($new_EOS) = @_;
339 0 0       0 if (not defined $new_EOS) {
340 0         0 cluck "Won't set \$EOS to undefined value!\n";
341 0         0 return $EOS;
342             }
343 0         0 $EOS = $new_EOS;
344 0         0 return $EOS;
345             }
346              
347             #------------------------------------------------------------------------------
348              
349             # set_locale - set the value of the locale.
350             #
351             # Receieves language locale in the form
352             # language.country.character-set
353             # for example:
354             # "fr_CA.ISO8859-1"
355             # for Canadian French using character set ISO8859-1.
356             #
357             # Returns a reference to a hash containing the current locale formatting values.
358             # Returns undef if got undef.
359             #
360             # The following will set the LC_ALL behaviour to Argentinian Spanish.
361             # NOTE: The naming and availability of locales depends on your operating system.
362             # Please consult the perllocale manpage for how to find out which locales are
363             # available in your system.
364             #
365             # $loc = set_locale( "es_AR.ISO8859-1" );
366             #
367             # This actually does this:
368             #
369             # $loc = setlocale( LC_ALL, "es_AR.ISO8859-1" ); # NOTE, but actually does LC_CTYPE, should be LC_COLLATE?
370              
371             #------------------------------------------------------------------------------
372             sub set_locale {
373 0     0 1 0 my ($new_locale) = @_;
374 0 0       0 if (not defined $new_locale) {
375 0         0 cluck "Won't set locale to undefined value!\n";
376 0         0 return undef;
377             }
378            
379 0 0       0 if ($OSNAME ne 'android') {
380             # Call POSIX function
381 0         0 $LOC = setlocale(LC_CTYPE, $new_locale);
382 0         0 return $LOC;
383             }
384             else {
385 0         0 return undef;
386             }
387             }
388              
389             #==============================================================================
390             #
391             # Private methods
392             #
393             #==============================================================================
394             sub remove_false_end_of_sentence {
395 2     2 0 5 my ($marked_segment) = @_;
396            
397            
398             # don't split U.S.A., U.K.
399 2         122 $marked_segment=~s/([^-\w]\w$PAP\s)$EOS/$1/sg;
400 2         64 $marked_segment=~s/([^-\w]\w$P)$EOS/$1/sg;
401              
402             # don't split after a white-space followed by a single letter or number followed
403             # by a dot followed by another whitespace., such as "Something. 1. point one"
404             # Note: will fail for 12. Point 12
405 2         71 $marked_segment=~s/(\s[\w\d]\.\s+)$EOS/$1/sg;
406              
407             # fix ellipsis: bla bla... yada yada
408 2         33 $marked_segment=~s/(\.\.\. )$EOS([[:lower:]])/$1$2/sg;
409            
410             # fix quoted EOS such as "." "?" "!"
411 2         26 $marked_segment=~s/(['"]$P['"]\s+)$EOS/$1/sg;
412            
413             # don't break after quote unless its a capital letter.
414 2         56 $marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg;
415              
416             # don't break: text . . some more text.
417 2         28 $marked_segment=~s/(\s\.\s)$EOS(\s*)/$1$2/sg;
418 2         47 $marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg;
419              
420 2         64 $marked_segment=~s/(\s$PAP\s)$EOS/$1/sg;
421            
422 2         10 return $marked_segment;
423             }
424              
425             sub split_unsplit_stuff {
426 2     2 0 11 my ($text) = @_;
427            
428              
429             # breaks numbered points, such as {EOL}1. point one
430 2         93 $text=~s/([\w $P]\d)($P)(\s+)/$1$2$EOS$3/sg;
431            
432             # eg 'end. (' -> 'end. $EOS ('
433 2         44 $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; # open bracket
434 2         18 $text=~s/('\w$P)(\s)/$1$EOS$2/gs;
435              
436 2         46 $text=~s/(\sno\.)(\s+)(?!\d)/$1$EOS$2/gis;
437              
438             # add EOS when you see "a.m." or "p.m." followed by a capital letter.
439 2         5 $text=~s/([ap]\.m\.\s+)([[:upper:]])/$1$EOS$2/gs;
440              
441 2         6 return $text;
442             }
443              
444             sub clean_sentences {
445 2     2 0 5 my ($sentences) = @_;
446 2         3 my $cleaned_sentences;
447 2         5 foreach my $s (@$sentences) {
448 15 50       55 next if not defined $s;
449 15 50       47 next if $s!~m/\w+/;
450 15         45 $s=~s/^\s*//;
451 15         195 $s=~s/\s*$//;
452             ## $s=~s/\s+/ /g;
453             # replace end of abbrev marker with a dot
454 15         69 $s=~s/$EOA/\./g;
455 15         38 push @$cleaned_sentences,$s;
456             }
457 2         7 return $cleaned_sentences;
458             }
459              
460             # Replace sequence such as Mr. A. Smith Jnr. with Mr__EOA__ A__EOA__ etc
461             # This simplifies the code that detects end of sentnees. The marker is
462             # replaced with the original dot adter sentence slitting
463              
464             sub mark_up_abbreviations {
465 2     2 0 5 my ($text) = @_;
466            
467 2         3 my %found_abbrevs;
468 2         15 foreach my $abbrev (@ABBREVIATIONS) {
469 317 100       13192 if ($text=~/\b($abbrev)\./i) {
470 13         42 $found_abbrevs{$abbrev} = 1;
471             }
472             }
473              
474 2         13 foreach my $abbrev (keys %found_abbrevs) {
475 11         154 $text=~s/($abbrev)\./$1$EOA/gs;
476             }
477            
478 2         19 return $text;
479             }
480              
481             sub first_sentence_breaking {
482 2     2 0 8 my ($text) = @_;
483 2         22 $text=~s/\n\s*\n/$EOS/gs; ## double new-line means a different sentence.
484 2         78 $text=~s/($PAP\s)/$1$EOS/gs;
485 2         84 $text=~s/(\s\w$P)/$1$EOS/gs; # break also when single letter comes before punc.
486 2         9 return $text;
487             }
488              
489             1;