File Coverage

blib/lib/Pod/Wordlist.pm
Criterion Covered Total %
statement 96 97 98.9
branch 39 44 88.6
condition 13 15 86.6
subroutine 14 14 100.0
pod 3 3 100.0
total 165 173 95.3


line stmt bran cond sub pod time code
1             package Pod::Wordlist;
2 6     6   57488 use 5.008;
  6         24  
3 6     6   28 use strict;
  6         11  
  6         127  
4 6     6   29 use warnings;
  6         11  
  6         269  
5              
6             our $VERSION = '1.24'; # TRIAL
7              
8 6     6   4080 use Lingua::EN::Inflect 'PL';
  6         124430  
  6         679  
9 6     6   63 use File::Spec ();
  6         37  
  6         134  
10             use constant {
11 6         628 MAXWORDLENGTH => 50,
12 6     6   32 };
  6         9  
13              
14             use Class::Tiny {
15 6         46 wordlist => \&_copy_wordlist,
16             _is_debug => 0,
17             no_wide_chars => 0,
18 6     6   2985 };
  6         9460  
19              
20             our %Wordlist; ## no critic ( Variables::ProhibitPackageVars )
21              
22             sub _copy_wordlist {
23 8     8   79 my %copy;
24              
25             # %Wordlist can be accessed externally, and users will often add terms in
26             # encoded form
27 8         4867 for my $word ( keys %Wordlist ) {
28 19448         19364 my $decoded_word = $word;
29             # if it was already decoded, this should do nothing
30 19448         27243 utf8::decode($decoded_word);
31 19448         30258 $copy{$decoded_word} = 1;
32             }
33              
34 8         789 return \%copy;
35             }
36              
37             BEGIN {
38 6     6   3946 my $file;
39              
40             # try to find wordlist in non-installed dist
41 6         128 my ($d, $p) = File::Spec->splitpath(__FILE__);
42 6         85 $p = File::Spec->catdir($p, (File::Spec->updir) x 2, 'share');
43 6         76 my $full_path = File::Spec->catpath($d, $p, 'wordlist');
44 6 50 33     156 if ($full_path && -e $full_path) {
45 0         0 $file = $full_path;
46             }
47              
48 6 50       26 if ( not defined $file ) {
49 6         2753 require File::ShareDir;
50 6         135362 $file = File::ShareDir::dist_file('Pod-Spell', 'wordlist');
51             }
52              
53 6 50   6   1229 open my $fh, '<:encoding(UTF-8)', $file
  6         37  
  6         10  
  6         62  
54             or die "Cannot read $file: $!"; ## no critic (ErrorHandling::RequireCarping)
55 6         62931 while ( defined( my $line = readline $fh ) ) {
56 7320         1498139 chomp $line;
57 7320         21276 $Wordlist{$line} = 1;
58 7320         12083 $Wordlist{PL($line)} = 1;
59             }
60 6         6464 close $fh;
61             }
62              
63             sub learn_stopwords {
64 11     11 1 2304 my ( $self, $text ) = @_;
65 11         245 my $stopwords = $self->wordlist;
66              
67 11         92 while ( $text =~ m<(\S+)>g ) {
68 25         487 my $word = $1;
69 25         65 utf8::decode($word);
70 25 100       62 if ( $word =~ m/^!(.+)/s ) {
71             # "!word" deletes from the stopword list
72 2         6 my $negation = $1;
73             # different $1 from above
74 2         5 delete $stopwords->{$negation};
75 2         8 delete $stopwords->{PL($negation)};
76 2 100       419 print "Unlearning stopword <$negation>\n" if $self->_is_debug;
77             }
78             else {
79 23         41 $word =~ s{'s$}{}; # we strip 's when checking so strip here, too
80 23         55 $stopwords->{$word} = 1;
81 23         73 $stopwords->{PL($word)} = 1;
82 23 100       5795 print "Learning stopword <$word>\n" if $self->_is_debug;
83             }
84             }
85 11         234 return;
86             }
87              
88             sub is_stopword {
89 56     56 1 77 my ($self, $word) = @_;
90 56         863 my $stopwords = $self->wordlist;
91 56 100 100     363 if ( exists $stopwords->{$word} or exists $stopwords->{ lc $word } ) {
92 21 100       264 print " Rejecting <$word>\n" if $self->_is_debug;
93 21         255 return 1;
94             }
95 35         143 return;
96             }
97              
98             sub strip_stopwords {
99 18     18 1 99 my ($self, $text) = @_;
100              
101             # Count the things in $text
102 18 100       383 print "Content: <", $text, ">\n" if $self->_is_debug;
103              
104 18         537 my @words = grep { length($_) < MAXWORDLENGTH } split " ", $text;
  71         154  
105              
106 18         44 for ( @words ) {
107 71 100       1463 print "Parsing word: <$_>\n" if $self->_is_debug;
108             # some spellcheckers can't cope with anything but Latin1
109 71 100 100     1776 $_ = '' if $self->no_wide_chars && /[^\x00-\xFF]/;
110              
111             # strip leading punctuation
112 71         414 s/^[\(\[\{\'\"\:\;\,\?\!\.]+//;
113              
114             # keep everything up to trailing punctuation, not counting
115             # periods (for abbreviations like "Ph.D."), single-quotes
116             # (for contractions like "don't") or colons (for package
117             # names like "Foo::Bar")
118 71         243 s/^([^\)\]\}\"\;\,\?\!]+).*$/$1/;
119              
120             # strip trailing single-quote, periods or colons; after this
121             # we have a word that could have internal periods or quotes
122 71         146 s/[\.\'\:]+$//;
123              
124             # strip possessive
125 71         115 s/'s$//i;
126              
127             # zero out variable names or things with internal symbols,
128             # since those are probably code expressions outside a C<>
129 71         125 my $is_sigil = /^[\&\%\$\@\:\<\*\\\_]/;
130 71         102 my $is_strange = /[\%\^\&\#\$\@\_\<\>\(\)\[\]\{\}\\\*\:\+\/\=\|\`\~]/;
131 71 100 100     207 $_ = '' if $is_sigil || $is_strange;
132              
133             # stop if there are no "word" characters left; if it's just
134             # punctuation that we didn't happen to strip or it's weird glyphs,
135             # the spellchecker won't do any good anyway
136 71 100       148 next unless /\w/;
137              
138 51 100       684 print " Checking as <$_>\n" if $self->_is_debug;
139              
140             # replace it with any stopword or stopword parts stripped
141 51         580 $_ = $self->_strip_a_word($_);
142              
143 51 100 100     591 print " Keeping as <$_>\n" if $_ && $self->_is_debug;
144             }
145              
146 18 50       97 return join(" ", grep { defined && length } @words );
  71         214  
147             }
148              
149             sub _strip_a_word {
150 51     51   89 my ($self, $word) = @_;
151 51         55 my $remainder;
152              
153             # try word as-is, including possible hyphenation vs stoplist
154 51 100       98 if ($self->is_stopword($word) ) {
    100          
    100          
155 19         29 $remainder = '';
156             }
157             # internal period could be abbreviations, so check with
158             # trailing period restored and drop or keep on that basis
159             elsif ( index($word, '.') >= 0 ) {
160 2         6 my $abbr = "$word.";
161 2 100       6 $remainder = $self->is_stopword($abbr) ? '' : $abbr;
162             }
163             # check individual parts of hyphenated word, keep whatever isn't a
164             # stopword as individual words
165             elsif ( index($word, '-') >= 0 ) {
166 1         2 my @keep;
167 1         5 for my $part ( split /-/, $word ) {
168 3 100       7 push @keep, $part if ! $self->is_stopword( $part );
169             }
170 1 50       18 $remainder = join(" ", @keep) if @keep;
171             }
172             # otherwise, we just keep it
173             else {
174 29         40 $remainder = $word;
175             }
176 51         81 return $remainder;
177             }
178              
179             1;
180              
181             __END__