File Coverage

blib/lib/Lingua/HE/Sentence.pm
Criterion Covered Total %
statement 45 51 88.2
branch 4 8 50.0
condition n/a
subroutine 10 12 83.3
pod 3 6 50.0
total 62 77 80.5


line stmt bran cond sub pod time code
1             package Lingua::HE::Sentence;
2              
3             #==============================================================================
4             #
5             # Start of POD
6             #
7             #==============================================================================
8              
9             =head1 NAME
10              
11             Lingua::HE::Sentence - Module for splitting Hebrew text into sentences.
12              
13             =head1 SYNOPSIS
14              
15             use Lingua::HE::Sentence qw( get_sentences );
16              
17             my $sentences=get_sentences($text); ## Get the sentences.
18             foreach my $sentence (@$sentences) {
19             ## do something with $sentence
20             }
21              
22              
23             =head1 DESCRIPTION
24              
25             The C<Lingua::HE::Sentence> module contains the function get_sentences, which splits Hebrew text into its constituent sentences, based on regular expressions.
26              
27             The module assumes text encoded in UTF-8. Supporting other input formats will be added upon request.
28              
29             =head1 HEBREW DETAILS
30              
31             Language: Hebrew
32             Language ID: he
33             MS Locale ID: 1037
34             ISO 639-1: he
35             ISO 639-2 (MARC): heb
36             ISO 8859 (charset): 8859-8
37             ANSI codepage: 1255
38             Unicode: 0590-05FF
39              
40             =head1 PROBLEM DESCRIPTION
41              
42             Many applications in natural language processing require some knowledge of sentence boundaries. The problem of properly locating sentence boundaries in Hebrew text is in many ways less severe than the same problem in other languages. The purpose of this module is to supply Perl users with a tool which can take plain text in Hebrew and return an ordered list of the sentences in the text.
43              
44             =head1 PROPERTIES OF HEBREW SENTENCES
45              
46             The following facts are part of the guidelines given by the 'academy of the Hebrew language'.
47              
48             Sentences usually end with one of the following punctuation symbols:
49             . a dot
50             ? a question mark
51             ! an exclamation mark
52              
53             No dot should be placed after sentences in titles (such as book names, chapter titles etc.)
54              
55             A dot can be placed after letters and numbers used for listing items, chapters etc., as long as these letters or numbers are not placed on a special line. When these letters or numbers appear alone, no dot should succeed them. Brackets or a closing bracket can be used instead of a dot in this case.
56              
57             Decimal point should be represented with a dot and not a comma in order to distinguish the number from its decimal fraction.
58              
59             In some rare cases semicolons also represent end of sentence, but usually the sentences separated by a semicolon are practically one long sentence. I chose not to split on semicolons at all.
60              
61              
62             =head1 ASSUMPTIONS
63              
64             Input text is assumed to be represented in UTF-8
65              
66             Input text is assumed to have some structure, i.e. titles are separated from the rest of the text with at least a couple of newline characters ('\n').
67              
68             Input is expected to follow the PROPERTIES listed above.
69              
70             Complex sentences should be further segmented using clause identification algorithms; this module will not provide (at least in this version) any support for clause identification and segmentation.
71              
72             =head1 FUNCTIONS
73              
74             All functions used should be requested in the 'use' clause. None is exported by default.
75              
76             =item get_sentences( $text )
77              
78             The get_sentences function takes a scalar containing UTF-8 text as an argument and returns a reference to an array of sentences that the text has been split into.
79             Returned sentences will be trimmed (beginning and end of sentence) of white-spaces.
80             Strings that are empty or contain only white-space won't be returned as sentences.
81              
82             =item get_EOS( )
83              
84             This function returns the value of the string used to mark the end of sentence. You might want to see what it is, and to make sure your text doesn't contain it. You can use set_EOS() to alter the end-of-sentence string to whatever you desire.
85              
86             =item set_EOS( $new_EOS_string )
87              
88             This function alters the end-of-sentence string used to mark the end of sentences.
89              
90             =head1 BUGS
91              
92             No proper handling of sentence boundaries within and in presence of quotes (either single or double). Please report bugs at http://rt.cpan.org/ and CC the author (see details below).
93              
94             =head1 FUTURE WORK (in no particular order)
95              
96             =item [0] Write tests!
97              
98             =item [1] Object Oriented like usage.
99              
100             =item [2] Supporting more encodings/charsets.
101              
102             =item [3] Code cleanup and optimization.
103              
104             =item [4] Fix bugs.
105              
106             =item [5] Generate sentencizer based on supervised learning. (requires tagged texts...)
107              
108             =head1 SEE ALSO
109              
110             Lingua::EN::Sentence
111              
112             =head1 AUTHOR
113              
114             Shlomo Yona shlomo@cs.haifa.ac.il
115              
116             =head1 COPYRIGHT
117              
118             Copyright (c) 2001-2005 Shlomo Yona. All rights reserved.
119              
120             =head1 LICENSE
121              
122             This library is free software.
123             You can redistribute it and/or modify it under the same terms as Perl itself.
124              
125             =cut
126              
127             #==============================================================================
128             #
129             # End of POD
130             #
131             #==============================================================================
132              
133              
134             #==============================================================================
135             #
136             # Pragmas
137             #
138             #==============================================================================
139              
140 4     4   116326 use 5.008_004; # due to utf8 support
  4         13  
  4         156  
141 4     4   23 use warnings;
  4         18  
  4         156  
142 4     4   19 use strict;
  4         10  
  4         203  
143             #==============================================================================
144             #
145             # Modules
146             #
147             #==============================================================================
148             require Exporter;
149              
150             #==============================================================================
151             #
152             # Public globals
153             #
154             #==============================================================================
155 4     4   25 use Carp qw/cluck/;
  4         7  
  4         371  
156 4     4   3338 use utf8;
  4         36  
  4         21  
157              
158             our $VERSION = '0.13';
159              
160             our @ISA = qw( Exporter ); # inherit from Exporter
161             our @EXPORT_OK = qw( get_sentences get_EOS set_EOS); # nothing is exported by default; these names are available on request
162              
163             our $EOS="\001"; # default end-of-sentence marker; read via get_EOS(), changed via set_EOS()
164              
165             #==============================================================================
166             #
167             # Public methods
168             #
169             #==============================================================================
170              
171             #------------------------------------------------------------------------------
172             # get_sentences - takes text input and returns a reference to an array of sentences.
173             # sentence_breaking() marks candidate boundaries in the text with $EOS and then
174             # applies regular-expression rules that remove markers from places which are
175             # not really an end of sentence; the marked text is split on $EOS and the pieces
176             # are trimmed and filtered by clean_sentences().
177             #------------------------------------------------------------------------------
178             sub get_sentences {
179 1     1 1 789 my ($text)=@_;
180 1 50       4 return [] unless defined $text; # undefined input yields a reference to an empty array
181              
182 1         4 my $marked_text = sentence_breaking($text);
183 1         82 my @sentences = split(/$EOS/,$marked_text); # NOTE(review): $EOS is interpolated here as a regex, unquoted - a marker containing regex metacharacters would act as a pattern
184 1         7 my $cleaned_sentences = clean_sentences(\@sentences);
185 1         7 return $cleaned_sentences;
186             }
187              
188             #------------------------------------------------------------------------------
189             # get_EOS - returns the current value of the package variable $EOS, the string inserted into the text to mark an end of sentence.
190             #------------------------------------------------------------------------------
191             sub get_EOS {
192 0     0 1 0 return $EOS;
193             }
194              
195             #------------------------------------------------------------------------------
196             # set_EOS - set the value of the $EOS (end-of-sentence mark) and return the value now in effect.
197             #------------------------------------------------------------------------------
198             sub set_EOS {
199 0     0 1 0 my ($new_EOS) = @_;
200 0 0       0 if (not defined $new_EOS) { # an undefined marker is rejected
201 0         0 cluck "Won't set \$EOS to undefined value!\n"; # Carp::cluck: warning with a stack backtrace
202 0         0 return $EOS; # $EOS is left unchanged
203             }
204 0         0 return $EOS = $new_EOS;
205             }
206              
207             #==============================================================================
208             #
209             # Private methods
210             #
211             #==============================================================================
212              
213             sub clean_sentences {
                # clean_sentences - takes a reference to an array of strings and returns a
                # reference to a new array holding the non-blank ones, trimmed of leading
                # and trailing whitespace.
                # NOTE: the loop variable $s aliases the elements of @$sentences, so the
                # trimming substitutions below also modify the caller's array in place.
214 1     1 0 3 my ($sentences) = @_;
215 1         2 my $cleaned_sentences = [];
216 1         4 foreach my $s (@$sentences) {
217 61 50       109 next if not defined $s; # skip undefined entries
218 61 100       147 next if $s=~m/^\s*$/; # skip empty or whitespace-only entries
219 56         166 $s=~s/^\s*//; # trim leading whitespace
220 56         860 $s=~s/\s*$//; # trim trailing whitespace
221 56         108 push @$cleaned_sentences,$s;
222             }
223 1         3 return $cleaned_sentences;
224             }
225              
226             sub sentence_breaking {
                # sentence_breaking - works on a copy of the given text and returns it with
                # $EOS inserted at the detected sentence boundaries. The substitutions run
                # in order: the first ones add markers, the later ones remove or re-add
                # them, so their sequence matters.
                # NOTE(review): $EOS is interpolated unquoted into several patterns below, so
                # a marker containing regex metacharacters would be treated as a pattern.
227 1     1 0 3 my ($text) = @_;
228             ## two newlines separated only by whitespace mean a different sentence; the whole gap is replaced by $EOS.
229 1         98 $text=~s/\n\s*\n/$EOS/gs;
230             ## break after a run of end-of-sentence characters, optionally followed by one closing quote/punct., and then whitespace.
231 4     4   2434 $text=~s/(\p{IsEndOfSentenceCharacter}+(['"\p{ClosePunctuation}])?\s+)/$1$EOS/gs;
  4         36  
  4         49  
  1         30  
232 1         670 $text=~s/(['"\p{ClosePunctuation}]\s*\p{IsEndOfSentenceCharacter}+\s+)/$1$EOS/gs; ## break also when the closing quote/punct. comes before the end-of-sentence characters
233              
234             # break also when a single word character, preceded by whitespace, comes before an end-of-sentence character.
235 1         575 $text=~s/(\s\w\p{IsEndOfSentenceCharacter}\s+)/$1$EOS/gs;
236              
237             ## unbreak a single alphanum/end-of-sentence character enclosed in punctuation, e.g. (!), just before an EOS
238 1         438 $text=~s/(\p{Punctuation}[\w\p{IsEndOfSentenceCharacter}]['"\p{ClosePunctuation}]\s*)$EOS/$1/gs;
239             ## re-break: add EOS after end-of-sentence characters (plus optional closing quote/punct.) and whitespace; the lookahead is intended to skip places already followed by EOS. NOTE(review): \s+ can backtrack, so the lookahead may not prevent a second marker when several whitespace characters precede EOS - confirm
240 1         897 $text=~s/(\p{IsEndOfSentenceCharacter}+['"\p{ClosePunctuation}]?\s+)(?!$EOS)/$1$EOS/gs;
241              
242              
243             ## unbreak stuff like: VAV-(!)  (VAV is the Hebrew letter U+05D5; the EOS preceding such a group is removed)
244 1         483 $text=~s/$EOS(\s*(?:\x{05D5}-?(?:\w|\s)*)?['"\p{OpenPunctuation}]\s*\p{IsEndOfSentenceCharacter}+['"\p{ClosePunctuation}]\s*)/$1/gs;
245             ## unbreak stuff like: '?!'  (the EOS following a quoted/bracketed run of end-of-sentence characters is removed)
246 1         287 $text=~s/(['"\p{OpenPunctuation}]\s*\p{IsEndOfSentenceCharacter}+['"\p{ClosePunctuation}]\s*)$EOS/$1/gs;
247             ## unbreak stuff like: 'i.b.m.' followed by text
248 1         737 $text=~s/(\p{IsEndOfSentenceCharacter}\w+\p{IsEndOfSentenceCharacter}\p{Punctuation}*\s*)$EOS/$1/gs;
249              
250 1         384 return $text;
251             }
252              
253             # End of Sentence characters: user-defined character property, referenced in the regexes of sentence_breaking as \p{IsEndOfSentenceCharacter}. Returns the hexadecimal code points of the member characters, one per line:
254             # 21 !
255             # 2E .
256             # 3F ?
257             sub IsEndOfSentenceCharacter {
258 9     9 0 2899 return <<'END';
259             21
260             2E
261             3F
262             END
263             }
264              
265             #==============================================================================
266             #
267             # Return TRUE
268             #
269             #==============================================================================
270              
271             1;