File Coverage

blib/lib/Lingua/EN/Sentence.pm
Criterion Covered Total %
statement 81 102 79.4
branch 11 26 42.3
condition n/a
subroutine 15 20 75.0
pod 7 12 58.3
total 114 160 71.2


line stmt bran cond sub pod time code
1             package Lingua::EN::Sentence;
2              
3             #==============================================================================
4             #
5             # Start of POD
6             #
7             #==============================================================================
8              
9             =head1 NAME
10              
11             Lingua::EN::Sentence - split text into sentences
12              
13             =head1 SYNOPSIS
14              
15             use Lingua::EN::Sentence qw( get_sentences add_acronyms );
16            
17             add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.'
18             my $text = q{
19             A sentence usually ends with a dot, exclamation or question mark optionally followed by a space!
20             A string followed by 2 carriage returns denotes a sentence, even though it doesn't end in a dot
21            
22             Dots after single letters such as U.S.A. or in numbers like -12.34 will not cause a split
23             as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. Calif. Esq.
24             and (some text) ellipsis such as ... or . . are ignored.
25             Some valid cases canot be deteected, such as the answer is X. It cannot easily be
26             differentiated from the single letter-dot sequence to abbreviate a person's given name.
27             Numbered points within a sentence will not cause a split 1. Like this one.
28             See the code for all the rules that apply.
29             This string has 7 sentences.
30             };
31            
32             my $sentences=get_sentences($text); # Get the sentences.
33             foreach my $sent (@$sentences)
34             {
35             $i++;
36             print("SENTENCE $i:$sent\n");
37             }
38              
39              
40             =head1 DESCRIPTION
41              
42             The C module contains the function get_sentences, which
43             splits text into its constituent sentences, based on a regular expression and a
44             list of abbreviations (built in and given).
45              
46             Certain well know exceptions, such as abbreviations, may cause incorrect
47             segmentations. But some of them are already integrated into this code and are
48             being taken care of. Still, if you see that there are words causing the
49             get_sentences function to fail, you can add those to the module, so it notices them.
50             Note that abbreviations are case sensitive, so 'Mrs.' is recognised but not 'mrs.'
51              
52             =head1 ALGORITHM
53              
54             The first step is to mark the dot ending an abbreviation by changing it to a special
55             character. Now it won't cause a sentence split. The original dot is restored after
56             the sentences are split
57              
58             Basically, I use a 'brute' regular expression to split the text into sentences.
59             (Well, nothing is yet split - I just mark the end-of-sentence). Then I look into
60             a set of rules which decide when an end-of-sentence is justified and when it's a
61             mistake. In case of a mistake, the end-of-sentence mark is removed. What are
62             such mistakes?
63              
64             Letter-dot sequences: U.S.A. , i.e. , e.g.
65             Dot sequences: '..' or '...' or 'text . . more text'
66             Two carriage returns denote the end of a sentence even if it doesn't end with a dot
67              
68             =head1 LIMITATIONS
69              
70             1) John F. Kennedy was a former president
71             2) The answer is F. That ends the quiz
72              
73             In the first sentence, F. is detected as a persons initial and not the end of a sentence.
74             But this means we cannot detect the true end of sentence 2, which is after the 'F'. This
75             case is not common though.
76              
77             =head1 FUNCTIONS
78              
79             All functions used should be requested in the 'use' clause. None is exported by
80             default.
81              
82             =over 4
83              
84             =item get_sentences( $text )
85              
86             The get_sentences function takes a scalar containing ascii text as an argument
87             and returns a reference to an array of sentences that the text has been split
88             into. Returned sentences will be trimmed (beginning and end of sentence) of
89             white space. Strings with no alpha-numeric characters in them, won't be
90             returned as sentences.
91              
92             =item add_acronyms( @acronyms )
93              
94             This function is used for adding acronyms not supported by this code.
95             The input should be regular expressions for matching the desired acronyms,
96             but should not include the final period (C<.>). So, for example, C
97             matches C and C. C will match C. You do not
98             need to bother with acronyms consisting of single letters and dots
99             (e.g. "U.S.A."), as these are found automatically. Note also that acronyms
100             are searched for on a case insensitive basis.
101              
102             Please see 'Acronym/Abbreviations list' section for the abbreviations already
103             supported by this module.
104              
105             =item get_acronyms( )
106              
107             This function will return the defined list of acronyms.
108              
109             =item set_acronyms( @my_acronyms )
110              
111             This function replaces the predefined acronym list with the given list. See
112             L for details on the input specifications.
113              
114              
115             =item get_EOS( )
116              
117             This function returns the value of the string used to mark the end of sentence.
118             You might want to see what it is, and to make sure your text doesn't contain it.
119             You can use set_EOS() to alter the end-of-sentence string to whatever you
120             desire.
121              
122             =item set_EOS( $new_EOS_string )
123              
124             This function alters the end-of-sentence string used to mark the end of sentences.
125              
126             =item set_locale( $new_locale )
127             Receives language locale in the form language.country.character-set
128             for example:
129             "fr_CA.ISO8859-1"
130             for Canadian French using character set ISO8859-1.
131              
132             Returns a reference to a hash containing the current locale formatting values.
133             Returns undef if got undef.
134              
135              
136             The following will set the LC_COLLATE behaviour to Argentinian Spanish.
137             NOTE: The naming and availability of locales depends on your operating sysem.
138             Please consult the perllocale manpage for how to find out which locales are available in your system.
139              
140             $loc = set_locale( "es_AR.ISO8859-1" );
141              
142             This actually does this:
143              
144             $loc = setlocale( LC_ALL, "es_AR.ISO8859-1" );
145              
146             =back
147              
148             =head1 Acronym/Abbreviations list
149              
150             You can use the get_acronyms() function to get acronyms.
151             It has become too long to specify in the documentation.
152              
153             If I come across a good general-purpose list - I'll incorporate it into this module.
154             Feel free to suggest such lists.
155              
156             =head1 FUTURE WORK
157              
158             [1] Object Oriented like usage
159             [2] Supporting more than just English/French
160             [3] Code optimization. Currently everything is RE based and not so optimized RE
161             [4] Possibly use more semantic heuristics for detecting a beginning of a sentence
162              
163             =head1 SEE ALSO
164              
165             Text::Sentence
166             Lingua::Sentence
167             Raku port of Lingua::EN::Sentence
168            
169             =head1 REPOSITORY
170              
171             L
172              
173             =head1 AUTHOR
174              
175             Shlomo Yona shlomo@cs.haifa.ac.il
176              
177             Currently being maintained by Kim Ryan, kimryan at CPAN d o t org
178              
179              
180             =head1 COPYRIGHT AND LICENSE
181              
182             Copyright (c) 2001-2016 Shlomo Yona. All rights reserved.
183             Copyright (c) 2022 Kim Ryan. All rights reserved.
184              
185             This library is free software; you can redistribute it and/or modify
186             it under the same terms as Perl itself.
187              
188             =cut
189              
190             #==============================================================================
191             #
192             # End of POD
193             #
194             #==============================================================================
195              
196              
197             #==============================================================================
198             #
199             # Pragmas
200             #
201             #==============================================================================
202              
203 1     1   56133 use strict;
  1         2  
  1         24  
204 1     1   4 use warnings;
  1         1  
  1         23  
205 1     1   406 use POSIX qw(locale_h setlocale);
  1         5248  
  1         4  
206             #==============================================================================
207             #
208             # Modules
209             #
210             #==============================================================================
211 1     1   1634 use Exporter;
  1         1  
  1         50  
212              
213             #==============================================================================
214             #
215             # Public globals
216             #
217             #==============================================================================
218 1     1   6 use vars qw/$VERSION @ISA @EXPORT_OK $EOS $LOC $AP $P $PAP @ABBREVIATIONS/;
  1         1  
  1         88  
219 1     1   5 use Carp qw/cluck/;
  1         2  
  1         42  
220 1     1   519 use English;
  1         3088  
  1         4  
221              
222             our $VERSION = '0.33';
223              
224             our $LOC;
225             if ($OSNAME ne 'android') {
226             # Call POSIX function
227             $LOC= setlocale(LC_CTYPE, "en_US.UTF-8");
228             }
229              
230            
231 1     1   770 use locale;
  1         493  
  1         4  
232              
233             @ISA = qw( Exporter );
234             @EXPORT_OK = qw( get_sentences add_acronyms get_acronyms set_acronyms get_EOS set_EOS set_locale);
235              
236             our $VERBOSE = 0; # echo intermediate data transforms, useful for debugging
237             our $EOS = "\001"; #"__EOS__";
238             our $EOA = '__EOA__';
239              
240             our $P = q/[\.!?]/; # PUNCTUATION
241              
242             $AP = q/(?:'|"|\?|\)|\]|\})?/; # AFTER PUNCTUATION
243             our $PAP = $P.$AP;
244              
245             # ACRONYMS AND ABBREVIATIONS
246             my @PEOPLE = qw( Mr Mrs Ms Dr Prof Mme Ms?gr Sens? Reps? Gov Attys? Supt Insp Const Det Revd? Ald Rt Hon);
247             my @TITLE_SUFFIXES = qw(PhD Jn?r Sn?r Esq MD LLB);
248             my @MILITARY = qw( Col Gen Lt Cm?dr Adm Capt Sgt Cpl Maj Pte);
249             my @INSTITUTES = qw( Dept Univ Assn Bros);
250             my @COMPANIES = qw( Inc Pty Ltd Co Corp);
251             my @PLACES =
252             qw(
253             Arc Al Ave Blv?d Cl Ct Cres Dr Expy? Fw?y Hwa?y La Pde? Pl Plz Rd St Tce
254             dist mt km in ft
255             Ala Ariz Ark Cal Calif Col Colo Conn Del Fed Fla Ga Ida Id Ill Ind Ia Kan Kans Ken Ky
256             La Me Md Is Mass Mich Minn Miss Mo Mont Neb Nebr Nev Mex Okla Ok Ore Penna Penn Pa Dak
257             Tenn Tex Ut Vt Va Wash Wis Wisc Wy Wyo USAFA Alta Man Ont Qu? Sask Yuk
258             Aust Vic Qld Tas
259             );
260             my @MONTHS = qw(Jan Feb Mar Apr May Jun Jul Aug Sept? Oct Nov Dec);
261             my @MISC = qw(no esp est); # Established
262             my @LATIN = qw(vs etc al ibid sic);
263             my @MATH = qw(fig eq sec cf Thm Def Conj resp);
264              
265             our @ABBREVIATIONS = (@PEOPLE, @TITLE_SUFFIXES, @MILITARY, @INSTITUTES, @COMPANIES, @PLACES, @MONTHS, @MISC,@LATIN, @MATH);
266              
267              
268             #==============================================================================
269             #
270             # Public methods
271             #
272             #==============================================================================
273              
274             #------------------------------------------------------------------------------
275             # get_sentences - takes text input and splits it into sentences.
276             # A regular expression viciously cuts the text into sentences,
277             # and then a list of rules (some of them consist of a list of abbreviations)
278             # are applied on the marked text in order to fix end-of-sentence markings in
279             # places which are not indeed end-of-sentence.
280             #------------------------------------------------------------------------------
281             sub get_sentences {
282 2     2 1 70 my ($text) = @_;
283 2 50       5 return [] unless defined $text;
284 2 50       5 $VERBOSE and print("ORIGINAL\n$text\n");
285            
286 2         4 $text = mark_up_abbreviations($text);
287 2 50       11 $VERBOSE and print("mark_up_abbreviations\n$text\n");
288            
289 2         15 $text = first_sentence_breaking($text);
290 2 50       5 $VERBOSE and print("first_sentence_breaking\n$text\n");
291            
292 2         4 $text = remove_false_end_of_sentence($text);
293 2 50       5 $VERBOSE and print("remove_false_end_of_sentence\n$text\n");
294            
295 2         4 $text = split_unsplit_stuff($text);
296 2 50       6 $VERBOSE and print("split_unsplit_stuff\n$text\n");
297            
298 2         14 my @sentences = split(/$EOS/,$text);
299 2         5 my $cleaned_sentences = clean_sentences(\@sentences);
300 2 50       5 if ($VERBOSE) {
301 0         0 my $i;
302 0         0 foreach my $sent (@$cleaned_sentences) {
303 0         0 $i++;
304 0         0 print("SENTENCE $i >>>$sent<<<\n");
305             }
306             }
307 2         8 return $cleaned_sentences;
308             }
309              
310             #------------------------------------------------------------------------------
311             # add_acronyms - user can add a list of acronyms/abbreviations.
312             #------------------------------------------------------------------------------
313             sub add_acronyms {
314 1     1 1 585 push @ABBREVIATIONS, @_;
315             }
316              
317             #------------------------------------------------------------------------------
318             # get_acronyms - get list of defined acronyms.
319             #------------------------------------------------------------------------------
320             sub get_acronyms {
321 0     0 1 0 return @ABBREVIATIONS;
322             }
323              
324             #------------------------------------------------------------------------------
325             # set_acronyms - replace the predefined acronyms list with your own list.
326             #------------------------------------------------------------------------------
327             sub set_acronyms {
328 0     0 1 0 @ABBREVIATIONS=@_;
329             }
330              
331             #------------------------------------------------------------------------------
332             # get_EOS - get the value of the $EOS variable (end-of-sentence mark).
333             #------------------------------------------------------------------------------
334             sub get_EOS {
335 0     0 1 0 return $EOS;
336             }
337              
338             #------------------------------------------------------------------------------
339             # set_EOS - set the value of the $EOS variable (end-of-sentence mark).
340             #------------------------------------------------------------------------------
341             sub set_EOS {
342 0     0 1 0 my ($new_EOS) = @_;
343 0 0       0 if (not defined $new_EOS) {
344 0         0 cluck "Won't set \$EOS to undefined value!\n";
345 0         0 return $EOS;
346             }
347 0         0 $EOS = $new_EOS;
348 0         0 return $EOS;
349             }
350              
351             #------------------------------------------------------------------------------
352              
353             # set_locale - set the value of the locale.
354             #
355             # Receieves language locale in the form
356             # language.country.character-set
357             # for example:
358             # "fr_CA.ISO8859-1"
359             # for Canadian French using character set ISO8859-1.
360             #
361             # Returns a reference to a hash containing the current locale formatting values.
362             # Returns undef if got undef.
363             #
364             # The following will set the LC_ALL behaviour to Argentinian Spanish.
365             # NOTE: The naming and availability of locales depends on your operating system.
366             # Please consult the perllocale manpage for how to find out which locales are
367             # available in your system.
368             #
369             # $loc = set_locale( "es_AR.ISO8859-1" );
370             #
371             # This actually does this:
372             #
373             # $loc = setlocale( LC_ALL, "es_AR.ISO8859-1" ); # NOTE, but actually does LC_CTYPE, should be LC_COLLATE?
374              
375             #------------------------------------------------------------------------------
376             sub set_locale {
377 0     0 1 0 my ($new_locale) = @_;
378 0 0       0 if (not defined $new_locale) {
379 0         0 cluck "Won't set locale to undefined value!\n";
380 0         0 return undef;
381             }
382            
383 0 0       0 if ($OSNAME ne 'android') {
384             # Call POSIX function
385 0         0 $LOC = setlocale(LC_CTYPE, $new_locale);
386 0         0 return $LOC;
387             }
388             else {
389 0         0 return undef;
390             }
391             }
392              
393             #==============================================================================
394             #
395             # Private methods
396             #
397             #==============================================================================
398              
399              
400             ## Please email me any suggestions for optimizing these RegExps.
401             sub remove_false_end_of_sentence {
402 2     2 0 11 my ($marked_segment) = @_;
403            
404            
405             ## ## don't do U.S.A., U.K.
406             ## $marked_segment=~s/(\.\w$PAP)$EOS/$1/sg;
407 2         108 $marked_segment=~s/([^-\w]\w$PAP\s)$EOS/$1/sg;
408 2         53 $marked_segment=~s/([^-\w]\w$P)$EOS/$1/sg;
409              
410             # don't split after a white-space followed by a single letter or number followed
411             # by a dot followed by another whitespace., such as "Something. 1. point one"
412             # Note: will fail for 12. Point 12
413 2         54 $marked_segment=~s/(\s[\w\d]\.\s+)$EOS/$1/sg;
414              
415             # fix ellipsis: bla bla... yada yada
416 2         35 $marked_segment=~s/(\.\.\. )$EOS([[:lower:]])/$1$2/sg;
417            
418             # fix quoted EOS such as "." "?" "!"
419 2         22 $marked_segment=~s/(['"]$P['"]\s+)$EOS/$1/sg;
420            
421             # don't break after quote unless its a capital letter.
422 2         47 $marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg;
423              
424             # don't break: text . . some more text.
425 2         32 $marked_segment=~s/(\s\.\s)$EOS(\s*)/$1$2/sg;
426 2         39 $marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg;
427              
428 2         43 $marked_segment=~s/(\s$PAP\s)$EOS/$1/sg;
429            
430 2         8 return $marked_segment;
431             }
432              
433             sub split_unsplit_stuff {
434 2     2 0 4 my ($text) = @_;
435            
436              
437             # $text=~s/(\D\d+)($P)(\s+)/$1$2$EOS$3/sg; # breaks numbered points, such as {EOL}1. point one
438              
439 2         65 $text=~s/([\w $P]\d)($P)(\s+)/$1$2$EOS$3/sg;
440            
441             # eg 'end. (' -> 'end. $EOS ('
442 2         34 $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; # open bracket
443            
444 2         14 $text=~s/('\w$P)(\s)/$1$EOS$2/gs;
445              
446              
447 2         37 $text=~s/(\sno\.)(\s+)(?!\d)/$1$EOS$2/gis;
448              
449             # split where single capital letter followed by dot makes sense to break.
450             # notice these are exceptions to the general rule NOT to split on single
451             # letter.
452             # notice also that single letter M is missing here, due to French 'mister'
453             # which is represented as M.
454             #
455             # the rule will not split on names beginning or containing
456             # single capital letter dot in the first or second name
457             # assuming 2 or three word name.
458            
459             # NOT WORKING , it breaks up U.S.A. after U.
460             # Valid cases if single letter thrn dot are rare, such as 'The answer is F'.
461             # Can't decipher meaning of this regex
462             # $text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg;
463            
464              
465             # add EOS when you see "a.m." or "p.m." followed by a capital letter.
466 2         5 $text=~s/([ap]\.m\.\s+)([[:upper:]])/$1$EOS$2/gs;
467              
468 2         4 return $text;
469             }
470              
471             sub clean_sentences {
472 2     2 0 4 my ($sentences) = @_;
473 2         3 my $cleaned_sentences;
474 2         3 foreach my $s (@$sentences) {
475 15 50       21 next if not defined $s;
476 15 50       35 next if $s!~m/\w+/;
477 15         34 $s=~s/^\s*//;
478 15         160 $s=~s/\s*$//;
479             ## $s=~s/\s+/ /g;
480             # replace end of abbrev marker with a dot
481 15         36 $s=~s/$EOA/\./g;
482 15         27 push @$cleaned_sentences,$s;
483             }
484 2         4 return $cleaned_sentences;
485             }
486              
487             # Replace seuence such as Mr. A. Smith Jnr. with Mr__EOA__ A__EOA__ etc
488             # This simplifies the code that detects end of sentnees. The marker is
489             # replaced with the original dot adter sentence slitting
490              
491             sub mark_up_abbreviations {
492 2     2 0 4 my ($text) = @_;
493            
494 2         3 my %found_abbrevs;
495 2         5 foreach my $abbrev (@ABBREVIATIONS) {
496 317 100       10914 if ($text=~/\b($abbrev)\./i) {
497 13         34 $found_abbrevs{$abbrev} = 1;
498             }
499             }
500              
501 2         15 foreach my $abbrev (keys %found_abbrevs) {
502 11         123 $text=~s/($abbrev)\./$1$EOA/gs;
503             }
504            
505 2         10 return $text;
506             }
507              
508             sub first_sentence_breaking {
509 2     2 0 6 my ($text) = @_;
510 2         18 $text=~s/\n\s*\n/$EOS/gs; ## double new-line means a different sentence.
511 2         59 $text=~s/($PAP\s)/$1$EOS/gs;
512 2         61 $text=~s/(\s\w$P)/$1$EOS/gs; # break also when single letter comes before punc.
513 2         15 return $text;
514             }
515              
516              
517             #==============================================================================
518             #
519             # Return TRUE
520             #
521             #==============================================================================
522              
523             1;