File Coverage

blib/lib/Lingua/EN/CommonMistakes.pm
Criterion Covered Total %
statement 63 66 95.4
branch 19 22 86.3
condition 2 3 66.6
subroutine 9 9 100.0
pod n/a
total 93 100 93.0


line stmt bran cond sub pod time code
1             package Lingua::EN::CommonMistakes;
2              
3 2     2   21023 use 5.006;
  2         8  
  2         73  
4 2     2   18 use strict;
  2         4  
  2         69  
5 2     2   9 use warnings FATAL => 'all';
  2         8  
  2         85  
6 2     2   11 use warnings::register;
  2         3  
  2         283  
7 2     2   11 use Carp;
  2         3  
  2         1336  
8              
9             our $VERSION = 20130425;
10              
11             my %MISTAKES;
12              
13             # reads data from __DATA__ section into %MISTAKES
14             sub _read_mistakes {
15 2     2   4 my $in_tag = ':common';
16              
17 2         10 while ( my $line = <DATA> ) {
18 4406         4693 chomp $line;
19 4406         5341 $line =~ s{#.*\z}{};
20 4406         7219 $line =~ s{\s+\z}{};
21 4406         5973 $line =~ s{\A\s+}{};
22 4406         8046 $line =~ s{ {2,}}{ };
23 4406 100       7101 next unless $line;
24              
25 4400 100       7900 if ( $line =~ m{\A:[^\s]+\z} ) {
26 6         10 $in_tag = $line;
27 6         17 next;
28             }
29              
30 4394         10874 my ( $word, $correction ) = split( /\s/, $line, 2 );
31 4394         24630 $MISTAKES{$in_tag}{$word} = $correction;
32             }
33 2         77 close(Lingua::EN::CommonMistakes::DATA);
34              
35 2         7 return;
36             }
37              
38             sub import {
39 7     7   4853 my ( $package, @args ) = @_;
40 7         13 my @out_name;
41 7         17 my %tags = map { $_ => 1 } qw(:common :punct);
  14         49  
42 7         17 foreach my $arg (@args) {
43 10 100       35 if ( substr( $arg, 0, 1 ) eq '%' ) {
    50          
44 5         17 push @out_name, substr( $arg, 1 );
45             }
46             elsif ( substr( $arg, 0, 1 ) eq ':' ) {
47 5 50       26 if ($arg eq ':no-defaults') {
    100          
48 0         0 %tags = ();
49             } elsif ($arg =~ m{\A:no-(.+)\z}) {
50 1         8 delete $tags{ ":$1" };
51             } else {
52 4         10 $tags{$arg}++;
53             }
54             }
55             else {
56 0         0 croak __PACKAGE__ . ": import argument $arg is not understood";
57             }
58             }
59              
60 7 100       25 if ( !@out_name ) {
61 2         4 push @out_name, 'MISTAKES';
62             }
63              
64 7 50 66     30 if ( $tags{':american'} && $tags{':british'} ) {
65 0         0 croak __PACKAGE__ . ": can't use both :american and :british";
66             }
67              
68 7 100       35 if ( !%MISTAKES ) {
69 2         5 _read_mistakes();
70             }
71              
72 7         11 my %out;
73 7         26 foreach my $tag ( keys %tags ) {
74 17 100       640 if ( !$MISTAKES{$tag} ) {
75 2 100       82 if (warnings::enabled( __PACKAGE__ )) {
76 1         275 carp __PACKAGE__ . ": import argument $tag is not understood";
77             }
78             }
79             else {
80 15         947 (%out) = ( %out, %{ $MISTAKES{$tag} } );
  15         16659  
81             }
82             }
83              
84 7         275 my ($caller_package) = caller();
85 7         18 foreach my $out_name (@out_name) {
86 2     2   12 no strict 'refs';
  2         4  
  2         64  
87 2     2   11 no warnings 'once';
  2         4  
  2         252  
88 7         14 *{ $caller_package . '::' . $out_name } = \%out;
  7         64  
89             }
90 7         224 return;
91             }
92              
93             =head1 NAME
94              
95             Lingua::EN::CommonMistakes - map of common English spelling errors
96              
97             =head1 SYNOPSIS
98              
99             use Lingua::EN::CommonMistakes qw(%MISTAKES);
100              
101             foreach my $word (split /\b/, $text) {
102             if (my $correction = $MISTAKES{lc $word}) {
103             warn "Likely spelling error: $word (-> $correction)\n";
104             }
105             }
106              
107             # or use a different flavor of English
108             use Lingua::EN::CommonMistakes qw(:no-punct :british %MISTAKES);
109             ...
110              
111             Provides a customizable map of common English spelling errors with their
112             respective corrections.
113              
114             =head1 USAGE
115              
116             The behavior of this package is customized at import time.
117              
118             By default, importing this package will create a hash named
119             C<%MISTAKES> in the calling package, containing most corrections, but
120             not containing either American English or British English corrections.
121              
122             This behavior may be customized by providing the following parameters
123             when importing:
124              
125             =over
126              
127             =item %I<name> [default: C<%MISTAKES>]
128              
129             The map will be imported with the given name.
130              
131             =item C<:common>, C<:no-common> [default: C<:common>]
132              
133             If enabled, include the base set of corrections common among all
134             English variants. This is the largest set of corrections.
135              
136             =item C<:american>, C<:no-american> [default: C<:no-american>]
137              
138             If enabled, American English is desirable; include corrections from
139             British English to American English. For example, "colour" should be
140             replaced with "color".
141              
142             =item C<:british>, C<:no-british> [default: C<:no-british>]
143              
144             If enabled, British English is desirable; include corrections from
145             American English to British English. For example, "recognized" should
146             be replaced with "recognised".
147              
148             =item C<:punct>, C<:no-punct> [default: C<:punct>]
149              
150             If enabled, include corrections which introduce punctuation characters;
151             for example, "dont" should be replaced with "don't".
152              
153             C<:no-punct> is often useful when scanning input text where
154             punctuation characters have special meaning, such as in most
155             programming languages.
156              
157             =item C<:no-defaults>
158              
159             If set, the corrections map only includes sets which have been
160             explicitly enabled.
161              
162             =back
163              
164             It's possible to C<use> the package several times if multiple mappings are
165             needed, as in the following example:
166              
167             # one map for common mistakes, another for british->american only
168             use Lingua::EN::CommonMistakes qw(%MISTAKES_COMMON);
169             use Lingua::EN::CommonMistakes qw(:no-defaults :american %MISTAKES_GB_TO_US);
170              
171             =head1 WHY?
172              
173             One might justifiably wonder why it would make sense to use a list of
174             mistakes rather than a full dictionary when spell checking.
175              
176             Spell checking typically uses a whitelist approach: all words are
177             considered incorrect unless they can be found in the whitelist
178             (dictionary). This module instead facilitates a blacklist approach:
179             words are considered correct unless they can be found in the blacklist
180             (map of mistakes).
181              
182             A blacklist approach to spell-checking is often more suitable than a
183             whitelist approach when scanning text which is partly but not entirely
184             English.
185              
186             Computer programs are a prime example of semi-English documents;
187             comments and identifiers may be written in English, with additional
188             restrictions (such as no punctuation characters permitted in
189             identifiers) and often contain words which are intentionally not
190             spelled correctly (abbreviations or corruptions of valid English
191             words, e.g. "int" for "integer").
192              
193             Other examples include mixed language documents or documents which are
194             ostensibly English but contain a lot of domain-specific jargon
195             unlikely to be found in an English dictionary.
196              
197             Despite the fact that such bodies of text are only partly English, any
198             occurrences of words in the blacklist are likely to be genuine errors.
199              
200             A blacklist approach also makes sense when it is more important to
201             have a low rate of false positives than it is to find every error (for
202             example, an automated system which risks being ignored if it generates
203             too many reports of dubious value).
204              
205             =head1 AUTHOR
206              
207             Rohan McGovern, C
208              
209             =head1 BUGS
210              
211             Please view and report any bugs here:
212             L
213              
214             =head1 ACKNOWLEDGEMENTS
215              
216             Most of the word list has been sourced from other projects, including:
217              
218             =over
219              
220             =item *
221              
222             I<krazy> code checker tool, written for KDE:
223             L
224              
225             =item *
226              
227             I<lintian> package checker tool, written for Debian:
228             L
229              
230             =back
231              
232             =head1 LICENSE AND COPYRIGHT
233              
234             Copyright 2012 Rohan McGovern.
235              
236             Incorporated word lists may be Copyright their respective authors.
237              
238             This program is free software; you can redistribute it and/or modify
239             it under the terms of the GNU General Public License as published by
240             the Free Software Foundation; version 2 dated June, 1991 or at your option
241             any later version.
242              
243             This program is distributed in the hope that it will be useful,
244             but WITHOUT ANY WARRANTY; without even the implied warranty of
245             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
246             GNU General Public License for more details.
247              
248             A copy of the GNU General Public License is available in the source tree;
249             if not, write to the Free Software Foundation, Inc.,
250             59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
251              
252              
253             =cut
254              
255             1;
256              
257             __DATA__