File Coverage

lib/Text/Ispell.pm
Criterion Covered Total %
statement 18 129 13.9
branch 0 32 0.0
condition 0 7 0.0
subroutine 6 24 25.0
pod 11 12 91.6
total 35 204 17.1


line stmt bran cond sub pod time code
1              
2             #(@) Text::Ispell.pm - a module encapsulating access to the Ispell program.
3              
4             =head1 NAME
5              
6             Text::Ispell.pm - a module encapsulating access to the Ispell program.
7              
8             =cut
9              
10              
11             package Text::Ispell;
12 1     1   458 use Exporter;
  1         2  
  1         142  
13             @Text::Ispell::ISA = qw(Exporter);
14             @Text::Ispell::EXPORT_OK = qw(
15             spellcheck
16             add_word
17             add_word_lc
18             accept_word
19             parse_according_to
20             set_params_by_language
21             save_dictionary
22             terse_mode
23             allow_compounds
24             make_wild_guesses
25             use_dictionaries
26             use_personal_dictionaries
27             );
28             %Text::Ispell::EXPORT_TAGS = (
29             'all' => \@Text::Ispell::EXPORT_OK,
30             );
31              
32              
33 1     1   806 use FileHandle;
  1         11336  
  1         7  
34 1     1   1254 use IPC::Open2;
  1         3824  
  1         57  
35 1     1   8 use Carp;
  1         2  
  1         51  
36              
37 1     1   5 use strict;
  1         1  
  1         37  
38              
39 1     1   5 use vars qw( $VERSION );
  1         2  
  1         1701  
40             $VERSION = '0.04';
41              
42              
43             =head1 SYNOPSIS
44              
45             # Brief:
46             use Text::Ispell;
47             Text::Ispell::spellcheck( $string );
48             # or
49             use Text::Ispell qw( spellcheck ); # import the function
50             spellcheck( $string );
51              
52             # Useful:
53             use Text::Ispell qw( :all ); # import all symbols
54             for my $r ( spellcheck( "hello hacking perl shrdlu 42" ) ) {
55             print "$r->{'type'}: $r->{'term'}\n";
56             }
57              
58              
59             =head1 DESCRIPTION
60              
61             Text::Ispell::spellcheck() takes one argument. It must be a
62             string, and it should contain only printable characters.
63             One allowable exception is a terminal newline, which will be
64             chomped off anyway. The line is fed to a coprocess running
65             ispell for analysis. The line is parsed on non-wordchars
66             into a sequence of terms. By default, the set of wordchars
67             is defined in ispell as letters, digits, and the apostrophe.
68             In other words, the line is subjected to the equivalent of
69              
70             split /[^a-zA-Z0-9']+/
71              
72             (ispell has a means to add characters to the default set,
73             but currently Text::Ispell does not provide access to that
74             feature.)
75              
76             The result of ispell's analysis of each term is a categorization
77             of the term into one of six types: ok, root, miss, none, compound,
78             and guess. Some of these carry additional information.
79              
80             Text::Ispell::spellcheck returns a list of objects, each
81             corresponding to a term in the spellchecked string. Each object
82             is a hash (hash-ref) with at least two entries: 'term' and 'type'.
83             The former contains the term ispell is reporting on, and the latter
84             is ispell's determination of that term's type (see above).
85             For types 'ok' and 'none', that is all the information there is.
86             For the type 'root', an additional hash entry is present: 'root'.
87             Its value is the word which ispell identified in the dictionary
88             as being the likely root of the current term.
89             For the type 'miss', an additional hash entry is present: 'misses'.
90             Its value is a string of words, space-separated, which ispell
91             identified as being "near-misses" of the current term, when
92             scanning the dictionary.
93              
94             A quickie example:
95              
96             use Text::Ispell qw( spellcheck );
97             Text::Ispell::allow_compounds(1);
98             for my $r ( spellcheck( "hello hacking perl salmoning fruithammer shrdlu 42" ) ) {
99             if ( $r->{'type'} eq 'ok' ) {
100             # as in the case of 'hello'
101             print "'$r->{'term'}' was found in the dictionary.\n";
102             }
103             elsif ( $r->{'type'} eq 'root' ) {
104             # as in the case of 'hacking'
105             print "'$r->{'term'}' can be formed from root '$r->{'root'}'\n";
106             }
107             elsif ( $r->{'type'} eq 'miss' ) {
108             # as in the case of 'perl'
109             print "'$r->{'term'}' was not found in the dictionary;\n";
110             print "Near misses: $r->{'misses'}\n";
111             }
112             elsif ( $r->{'type'} eq 'guess' ) {
113             # as in the case of 'salmoning'
114             print "'$r->{'term'}' was not found in the dictionary;\n";
115             print "Root/affix Guesses: $r->{'guesses'}\n";
116             }
117             elsif ( $r->{'type'} eq 'compound' ) {
118             # as in the case of 'fruithammer'
119             print "'$r->{'term'}' is a valid compound word.\n";
120             }
121             elsif ( $r->{'type'} eq 'none' ) {
122             # as in the case of 'shrdlu'
123             print "No match for term '$r->{'term'}'\n";
124             }
125             # and numbers are skipped entirely, as in the case of 42.
126             }
127              
128              
129             =head2 ERRORS
130              
131             C<Text::Ispell::spellcheck()> starts the ispell coprocess
132             if the coprocess seems not to exist. Ordinarily this is simply
133             the first time it's called.
134              
135             ispell is spawned via the C<IPC::Open2::open2()> function, which
136             throws an exception (i.e. dies) if the spawn fails. The caller
137             should be prepared to catch this exception -- unless, of course,
138             the default behavior of die is acceptable.
139              
140             =head2 Nota Bene
141              
142             The full location of the ispell executable is stored
143             in the variable C<$Text::Ispell::path>. The default
144             value is F</usr/local/bin/ispell>.
145             If your ispell executable has some name other than
146             this, then you must set C<$Text::Ispell::path> accordingly
147             before you call C<Text::Ispell::spellcheck()> (or any other function
148             in the module) for the first time!
149              
150             =cut
151              
152              
153             sub _init {
154 0 0   0     unless ( $Text::Ispell::pid ) {
155 0           my @options;
156 0           while ( my( $k, $ar ) = each %Text::Ispell::options ) {
157 0 0         if ( @$ar ) {
158 0           for ( @$ar ) {
159             #push @options, "$k $_";
160 0           push @options, $k, $_;
161             }
162             }
163             else {
164 0           push @options, $k;
165             }
166             }
167              
168 0   0       $Text::Ispell::path ||= '/usr/local/bin/ispell';
169              
170 0           $Text::Ispell::pid = undef; # so that it's still undef if open2 fails.
171 0           $Text::Ispell::pid = open2( # if open2 fails, it throws, but doesn't return.
172             *Reader,
173             *Writer,
174             $Text::Ispell::path,
175             '-a', '-S',
176             @options,
177             );
178              
179 0           my $hdr = scalar(<Reader>);
180              
181 0           $Text::Ispell::terse = 0; # must be the same as ispell.
182 0           $Text::Ispell::word_chars = "'0-9A-Za-z";
183             }
184             $Text::Ispell::pid
185 0           }
186              
187             sub _exit {
188 0 0   0     if ( $Text::Ispell::pid ) {
189 0           close Reader;
190 0           close Writer;
191 0           kill $Text::Ispell::pid;
192 0           $Text::Ispell::pid = undef;
193             }
194             }
195              
196              
197             sub spellcheck {
198 0 0   0 0   _init() or return(); # caller should really catch the exception from a failed open2.
199 0           my $line = shift;
200 0           local $/ = "\n"; local $\ = '';
  0            
201 0           chomp $line;
202 0           $line =~ s/\r//g; # kill the hate
203 0 0         $line =~ /\n/ and croak "newlines not allowed in arguments to Text::Ispell::spellcheck!";
204 0           print Writer "^$line\n";
205 0           my @commentary;
206 0           local $_;
207 0           while ( <Reader> ) {
208 0           chomp;
209 0 0         last unless $_ gt '';
210 0           push @commentary, $_;
211             }
212              
213             #
214             # it doth appear that ispell simply skips, without comment,
215             # any terms that consist solely of digits.
216             #
217 0           my $split_pattern = "[^$Text::Ispell::word_chars]+";
218 0           my @terms = grep { /\D/ } split /$split_pattern/, $line;
  0            
219              
220 0 0         unless ( $Text::Ispell::terse ) {
221 0 0         @terms == @commentary or die "terms: ".join(',',@terms)."\ncommentary:\n".join("\n",@commentary)."\n\n";
222             }
223              
224 0           my %types = (
225             '*' => 'ok',
226             '-' => 'compound',
227             '+' => 'root',
228             '#' => 'none',
229             '&' => 'miss',
230             '?' => 'guess',
231             );
232             # and there's one more type, unknown, which is
233             # used when the first char is not in the above set.
234              
235             my %modisp = (
236             'root' => sub {
237 0     0     my $h = shift;
238 0           $h->{'root'} = shift;
239             },
240             'none' => sub {
241 0     0     my $h = shift;
242 0           $h->{'original'} = shift;
243 0           $h->{'offset'} = shift;
244             },
245             'miss' => sub {
246 0     0     my $h = shift;
247 0           $h->{'original'} = shift;
248 0           $h->{'count'} = shift; # count will always be 0, when $c eq '?'.
249 0           $h->{'offset'} = shift;
250 0           $h->{'offset'} =~ s/:$//; # offset has trailing colon.
251              
252 0           my @misses = map { s/,$//; $_ } splice @_, 0, $h->{'count'};
  0            
  0            
253 0           my @guesses = map { s/,$//; $_ } @_;
  0            
  0            
254 0           $h->{'misses'} = join ' ', @misses;
255 0           $h->{'guesses'} = join ' ', @guesses;
256             },
257 0           );
258 0           $modisp{'guess'} = $modisp{'miss'};
259              
260 0           my @results;
261 0           for my $i ( 0 .. $#commentary ) {
262 0           my %h = (
263             'term' => $terms[$i],
264             'commentary' => $commentary[$i],
265             );
266 0           my( $c, @args ) = split ' ', $h{'commentary'};
267            
268 0   0       my $type = $types{$c} || 'unknown';
269              
270 0 0         $modisp{$type} and $modisp{$type}->( \%h, @args );
271              
272 0 0 0       if ( $Text::Ispell::terse && $h{'offset'} ) {
273             # need to recalculate the 'term':
274 0           my @terms = grep { /\D/ } split /$split_pattern/, substr $line, $h{'offset'}-1;
  0            
275 0           $h{'term'} = $terms[0];
276             }
277              
278 0           $h{'type'} = $type;
279 0           push @results, \%h;
280             }
281              
282             @results
283 0           }
284              
285             sub _send_command($$) {
286 0     0     my( $cmd, $arg ) = @_;
287 0 0         defined $arg or $arg = '';
288 0           local $/ = "\n"; local $\ = '';
  0            
289 0           chomp $arg;
290 0           _init();
291 0           print Writer "$cmd$arg\n";
292             }
293              
294              
295             =head1 AUX FUNCTIONS
296              
297             =head2 add_word(word)
298              
299             Adds a word to the personal dictionary. Be careful of capitalization.
300             If you want the word to be added "case-insensitively", you should
301             call C<add_word_lc()> instead.
302              
303             =cut
304              
305             sub add_word($) {
306 0     0 1   _send_command "\*", $_[0];
307             }
308              
309             =head2 add_word_lc(word)
310              
311             Adds a word to the personal dictionary, in lower-case form.
312             This allows ispell to match it in a case-insensitive manner.
313              
314             =cut
315              
316             sub add_word_lc($) {
317 0     0 1   _send_command "\&", $_[0];
318             }
319              
320             =head2 accept_word(word)
321              
322             Similar to adding a word to the dictionary, in that it causes
323             ispell to accept the word as valid, but it does not actually
324             add it to the dictionary. Presumably the effects of this only
325             last for the current ispell session, which will mysteriously
326             end if any of the coprocess-restarting functions are called...
327              
328             =cut
329              
330             sub accept_word($) {
331 0     0 1   _send_command "\@", $_[0];
332             }
333              
334             =head2 parse_according_to(formatter)
335              
336             Causes ispell to parse subsequent input lines according to
337             the specified formatter. As of ispell v. 3.1.20, only
338             'tex' and 'nroff' are supported.
339              
340             =cut
341              
342             sub parse_according_to($) {
343             # must be one of 'tex' or 'nroff'
344 0     0 1   _send_command "\-", $_[0];
345             }
346              
347             =head2 set_params_by_language(language)
348              
349             Causes ispell to set its internal operational parameters
350             according to the given language. Legal arguments to this
351             function, and its effects, are currently unknown by the
352             author of Text::Ispell.
353              
354             =cut
355              
356             sub set_params_by_language($) {
357 0     0 1   _send_command "\~", $_[0];
358             }
359              
360             =head2 save_dictionary()
361              
362             Causes ispell to save the current state of the dictionary
363             to its disk file. Presumably ispell would ordinarily
364             only do this upon exit.
365              
366             =cut
367              
368             sub save_dictionary() {
369 0     0 1   _send_command "\#", '';
370             }
371              
372             =head2 terse_mode(bool:terse)
373              
374             In terse mode, ispell will not produce reports for "correct" words.
375             This means that the calling program will not receive results of the
376             types 'ok', 'root', and 'compound'.
377              
378             ispell starts up in NON-terse mode, i.e. reports are produced for
379             all terms, not just "incorrect" ones.
380              
381             =cut
382              
383             sub terse_mode($) {
384 0     0 1   my $bool = shift;
385 0 0         my $cmd = $bool ? "\!" : "\%";
386 0           _send_command $cmd, '';
387 0           $Text::Ispell::terse = $bool;
388             }
389              
390              
391             =head1 FUNCTIONS THAT RESTART ISPELL
392              
393             The following functions cause the current ispell coprocess, if any, to terminate.
394             This means that all the changes to the state of ispell made by the above
395             functions will be lost, and their respective values reset to their defaults.
396             The only function above whose effect is persistent is C<save_dictionary()>.
397              
398             Perhaps in the future we will figure out a good way to make this
399             state information carry over from one instantiation of the coprocess
400             to the next.
401              
402             =head2 allow_compounds(bool)
403              
404             When this value is set to True, compound words are
405             accepted as legal -- as long as both words are found in the
406             dictionary; more than two words are always illegal.
407             When this value is set to False, run-together words are
408             considered spelling errors.
409              
410             The default value of this setting is dictionary-dependent,
411             so the caller should set it explicitly if it really matters.
412              
413             =cut
414              
415             sub allow_compounds {
416 0     0 1   my $bool = shift;
417 0           _exit();
418 0 0         if ( $bool ) {
419 0           $Text::Ispell::options{'-C'} = [];
420 0           delete $Text::Ispell::options{'-B'};
421             }
422             else {
423 0           $Text::Ispell::options{'-B'} = [];
424 0           delete $Text::Ispell::options{'-C'};
425             }
426             }
427              
428             =head2 make_wild_guesses(bool)
429              
430             This setting controls when ispell makes "wild" guesses.
431              
432             If False, ispell only makes "sane" guesses, i.e. possible
433             root/affix combinations that match the current dictionary;
434             only if it can find none will it make "wild" guesses,
435             which don't match the dictionary, and might in fact
436             be illegal words.
437              
438             If True, wild guesses are always made, along with any "sane" guesses.
439             This feature can be useful if the dictionary has a limited word list,
440             or a word list with few suffixes.
441              
442             The default value of this setting is dictionary-dependent,
443             so the caller should set it explicitly if it really matters.
444              
445             =cut
446              
447             sub make_wild_guesses {
448 0     0 1   my $bool = shift;
449 0           _exit();
450 0 0         if ( $bool ) {
451 0           $Text::Ispell::options{'-m'} = [];
452 0           delete $Text::Ispell::options{'-P'};
453             }
454             else {
455 0           $Text::Ispell::options{'-P'} = [];
456 0           delete $Text::Ispell::options{'-m'};
457             }
458             }
459              
460             =head2 use_dictionary([dictionary])
461              
462             Specifies what dictionary to use instead of the
463             default. Dictionary names are actually file
464             names, and are searched for according to the
465             following rule: if the name does not contain a slash,
466             it is looked for in the directory containing the
467             default dictionary, typically /usr/local/lib.
468             Otherwise, it is used as is: if it does not begin
469             with a slash, it is construed from the current
470             directory.
471              
472             If no argument is given, the default dictionary will be used.
473              
474             =cut
475              
476             sub use_dictionary($) {
477 0     0 1   _exit();
478 0 0         if ( @_ ) {
479 0           $Text::Ispell::options{'-d'} = [ @_ ];
480             }
481             else {
482 0           delete $Text::Ispell::options{'-d'};
483             }
484             }
485              
486             =head2 use_personal_dictionary([dictionary])
487              
488             Specifies what personal dictionary to use
489             instead of the default.
490              
491             Dictionary names are actually file names, and are
492             searched for according to the following rule:
493             if the name begins with a slash, it is used as
494             is (i.e. it is an absolute path name). Otherwise,
495             it is construed as relative to the user's home
496             directory ($HOME).
497              
498             If no argument is given, the default personal
499             dictionary will be used.
500              
501             =cut
502              
503             sub use_personal_dictionary($) {
504 0     0 1   _exit();
505 0 0         if ( @_ ) {
506 0           $Text::Ispell::options{'-p'} = [ @_ ];
507             }
508             else {
509 0           delete $Text::Ispell::options{'-p'};
510             }
511             }
512              
513              
514              
515             1;
516              
517              
518             =head1 FUTURE ENHANCEMENTS
519              
520             ispell options:
521              
522             -w chars
523             Specify additional characters that can be part of a word.
524              
525             =head1 DEPENDENCIES
526              
527             Text::Ispell uses the external program ispell, which is
528             the "International Ispell", available at
529              
530             http://fmg-www.cs.ucla.edu/geoff/ispell.html
531              
532             as well as various archives and mirrors, such as
533              
534             ftp://ftp.math.orst.edu/pub/ispell-3.1/
535              
536             This is a very popular program, and may already be
537             installed on your system.
538              
539             Text::Ispell also uses the standard perl modules FileHandle,
540             IPC::Open2, and Carp.
541              
542             =head1 AUTHOR
543              
544             jdporter@min.net (John Porter)
545              
546             This module is free software; you may redistribute it and/or
547             modify it under the same terms as Perl itself.
548              
549             =cut
550