File Coverage

blib/lib/Pod/Wordlist.pm
Criterion Covered Total %
statement 96 97 98.9
branch 39 44 88.6
condition 13 15 86.6
subroutine 14 14 100.0
pod 3 3 100.0
total 165 173 95.3


line stmt bran cond sub pod time code
1             package Pod::Wordlist;
2 6     6   59385 use 5.008;
  6         21  
3 6     6   27 use strict;
  6         11  
  6         115  
4 6     6   25 use warnings;
  6         10  
  6         246  
5              
6             our $VERSION = '1.25';
7              
8 6     6   4138 use Lingua::EN::Inflect 'PL';
  6         125238  
  6         612  
9 6     6   58 use File::Spec ();
  6         34  
  6         129  
10             use constant {
11 6         600 MAXWORDLENGTH => 50,
12 6     6   27 };
  6         8  
13              
14             use Class::Tiny {
15 6         41 wordlist => \&_copy_wordlist,
16             _is_debug => 0,
17             no_wide_chars => 0,
18 6     6   2862 };
  6         9476  
19              
20             our %Wordlist; ## no critic ( Variables::ProhibitPackageVars )
21              
22             sub _copy_wordlist {
23 8     8   66 my %copy;
24              
25             # %Wordlist can be accessed externally, and users will often add terms in
26             # encoded form
27 8         5028 for my $word ( keys %Wordlist ) {
28 19448         19746 my $decoded_word = $word;
29             # if it was already decoded, this should do nothing
30 19448         27399 utf8::decode($decoded_word);
31 19448         32276 $copy{$decoded_word} = 1;
32             }
33              
34 8         881 return \%copy;
35             }
36              
37             BEGIN {
38 6     6   3953 my $file;
39              
40             # try to find wordlist in non-installed dist
41 6         117 my ($d, $p) = File::Spec->splitpath(__FILE__);
42 6         94 $p = File::Spec->catdir($p, (File::Spec->updir) x 2, 'share');
43 6         80 my $full_path = File::Spec->catpath($d, $p, 'wordlist');
44 6 50 33     150 if ($full_path && -e $full_path) {
45 0         0 $file = $full_path;
46             }
47              
48 6 50       23 if ( not defined $file ) {
49 6         2823 require File::ShareDir;
50 6         135450 $file = File::ShareDir::dist_file('Pod-Spell', 'wordlist');
51             }
52              
53 6 50   6   1192 open my $fh, '<:encoding(UTF-8)', $file
  6         38  
  6         9  
  6         47  
54             or die "Cannot read $file: $!"; ## no critic (ErrorHandling::RequireCarping)
55 6         63098 while ( defined( my $line = readline $fh ) ) {
56 7320         1523301 chomp $line;
57 7320         19927 $Wordlist{$line} = 1;
58 7320         12309 $Wordlist{PL($line)} = 1;
59             }
60 6         6767 close $fh;
61             }
62              
63             sub learn_stopwords {
64 11     11 1 2335 my ( $self, $text ) = @_;
65 11         250 my $stopwords = $self->wordlist;
66              
67 11         135 while ( $text =~ m<(\S+)>g ) {
68 25         586 my $word = $1;
69 25         66 utf8::decode($word);
70 25 100       79 if ( $word =~ m/^!(.+)/s ) {
71             # "!word" deletes from the stopword list
72 2         8 my $negation = $1;
73             # different $1 from above
74 2         5 delete $stopwords->{$negation};
75 2         9 delete $stopwords->{PL($negation)};
76 2 100       400 print "Unlearning stopword <$negation>\n" if $self->_is_debug;
77             }
78             else {
79 23         49 $word =~ s{'s$}{}; # we strip 's when checking so strip here, too
80 23         49 $stopwords->{$word} = 1;
81 23         87 $stopwords->{PL($word)} = 1;
82 23 100       6206 print "Learning stopword <$word>\n" if $self->_is_debug;
83             }
84             }
85 11         266 return;
86             }
87              
88             sub is_stopword {
89 56     56 1 80 my ($self, $word) = @_;
90 56         853 my $stopwords = $self->wordlist;
91 56 100 100     369 if ( exists $stopwords->{$word} or exists $stopwords->{ lc $word } ) {
92 21 100       269 print " Rejecting <$word>\n" if $self->_is_debug;
93 21         268 return 1;
94             }
95 35         152 return;
96             }
97              
98             sub strip_stopwords {
99 18     18 1 108 my ($self, $text) = @_;
100              
101             # Count the things in $text
102 18 100       333 print "Content: <", $text, ">\n" if $self->_is_debug;
103              
104 18         632 my @words = grep { length($_) < MAXWORDLENGTH } split " ", $text;
  71         170  
105              
106 18         49 for ( @words ) {
107 71 100       1459 print "Parsing word: <$_>\n" if $self->_is_debug;
108             # some spellcheckers can't cope with anything but Latin1
109 71 100 100     1824 $_ = '' if $self->no_wide_chars && /[^\x00-\xFF]/;
110              
111             # strip leading punctuation
112 71         411 s/^[\(\[\{\'\"\:\;\,\?\!\.]+//;
113              
114             # keep everything up to trailing punctuation, not counting
115             # periods (for abbreviations like "Ph.D."), single-quotes
116             # (for contractions like "don't") or colons (for package
117             # names like "Foo::Bar")
118 71         248 s/^([^\)\]\}\"\;\,\?\!]+).*$/$1/;
119              
120             # strip trailing single-quote, periods or colons; after this
121             # we have a word that could have internal periods or quotes
122 71         132 s/[\.\'\:]+$//;
123              
124             # strip possessive
125 71         131 s/'s$//i;
126              
127             # zero out variable names or things with internal symbols,
128             # since those are probably code expressions outside a C<>
129 71         115 my $is_sigil = /^[\&\%\$\@\:\<\*\\\_]/;
130 71         111 my $is_strange = /[\%\^\&\#\$\@\_\<\>\(\)\[\]\{\}\\\*\:\+\/\=\|\`\~]/;
131 71 100 100     210 $_ = '' if $is_sigil || $is_strange;
132              
133             # stop if there are no "word" characters left; if it's just
134             # punctuation that we didn't happen to strip or it's weird glyphs,
135             # the spellchecker won't do any good anyway
136 71 100       158 next unless /\w/;
137              
138 51 100       670 print " Checking as <$_>\n" if $self->_is_debug;
139              
140             # replace it with any stopword or stopword parts stripped
141 51         582 $_ = $self->_strip_a_word($_);
142              
143 51 100 100     556 print " Keeping as <$_>\n" if $_ && $self->_is_debug;
144             }
145              
146 18 50       99 return join(" ", grep { defined && length } @words );
  71         246  
147             }
148              
149             sub _strip_a_word {
150 51     51   89 my ($self, $word) = @_;
151 51         54 my $remainder;
152              
153             # try word as-is, including possible hyphenation vs stoplist
154 51 100       100 if ($self->is_stopword($word) ) {
    100          
    100          
155 19         32 $remainder = '';
156             }
157             # internal period could be abbreviations, so check with
158             # trailing period restored and drop or keep on that basis
159             elsif ( index($word, '.') >= 0 ) {
160 2         6 my $abbr = "$word.";
161 2 100       5 $remainder = $self->is_stopword($abbr) ? '' : $abbr;
162             }
163             # check individual parts of hyphenated word, keep whatever isn't a
164             # stopword as individual words
165             elsif ( index($word, '-') >= 0 ) {
166 1         3 my @keep;
167 1         8 for my $part ( split /-/, $word ) {
168 3 100       7 push @keep, $part if ! $self->is_stopword( $part );
169             }
170 1 50       19 $remainder = join(" ", @keep) if @keep;
171             }
172             # otherwise, we just keep it
173             else {
174 29         52 $remainder = $word;
175             }
176 51         78 return $remainder;
177             }
178              
179             1;
180              
181             __END__