| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Pod::Wordlist; |
|
2
|
6
|
|
|
6
|
|
57488
|
use 5.008; |
|
|
6
|
|
|
|
|
24
|
|
|
3
|
6
|
|
|
6
|
|
28
|
use strict; |
|
|
6
|
|
|
|
|
11
|
|
|
|
6
|
|
|
|
|
127
|
|
|
4
|
6
|
|
|
6
|
|
29
|
use warnings; |
|
|
6
|
|
|
|
|
11
|
|
|
|
6
|
|
|
|
|
269
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $VERSION = '1.24'; # TRIAL |
|
7
|
|
|
|
|
|
|
|
|
8
|
6
|
|
|
6
|
|
4080
|
use Lingua::EN::Inflect 'PL'; |
|
|
6
|
|
|
|
|
124430
|
|
|
|
6
|
|
|
|
|
679
|
|
|
9
|
6
|
|
|
6
|
|
63
|
use File::Spec (); |
|
|
6
|
|
|
|
|
37
|
|
|
|
6
|
|
|
|
|
134
|
|
|
10
|
|
|
|
|
|
|
use constant { |
|
11
|
6
|
|
|
|
|
628
|
MAXWORDLENGTH => 50, |
|
12
|
6
|
|
|
6
|
|
32
|
}; |
|
|
6
|
|
|
|
|
9
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use Class::Tiny { |
|
15
|
6
|
|
|
|
|
46
|
wordlist => \&_copy_wordlist, |
|
16
|
|
|
|
|
|
|
_is_debug => 0, |
|
17
|
|
|
|
|
|
|
no_wide_chars => 0, |
|
18
|
6
|
|
|
6
|
|
2985
|
}; |
|
|
6
|
|
|
|
|
9460
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our %Wordlist; ## no critic ( Variables::ProhibitPackageVars ) |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
sub _copy_wordlist { |
|
23
|
8
|
|
|
8
|
|
79
|
my %copy; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# %Wordlist can be accessed externally, and users will often add terms in |
|
26
|
|
|
|
|
|
|
# encoded form |
|
27
|
8
|
|
|
|
|
4867
|
for my $word ( keys %Wordlist ) { |
|
28
|
19448
|
|
|
|
|
19364
|
my $decoded_word = $word; |
|
29
|
|
|
|
|
|
|
# if it was already decoded, this should do nothing |
|
30
|
19448
|
|
|
|
|
27243
|
utf8::decode($decoded_word); |
|
31
|
19448
|
|
|
|
|
30258
|
$copy{$decoded_word} = 1; |
|
32
|
|
|
|
|
|
|
} |
|
33
|
|
|
|
|
|
|
|
|
34
|
8
|
|
|
|
|
789
|
return \%copy; |
|
35
|
|
|
|
|
|
|
} |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
BEGIN { |
|
38
|
6
|
|
|
6
|
|
3946
|
my $file; |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# try to find wordlist in non-installed dist |
|
41
|
6
|
|
|
|
|
128
|
my ($d, $p) = File::Spec->splitpath(__FILE__); |
|
42
|
6
|
|
|
|
|
85
|
$p = File::Spec->catdir($p, (File::Spec->updir) x 2, 'share'); |
|
43
|
6
|
|
|
|
|
76
|
my $full_path = File::Spec->catpath($d, $p, 'wordlist'); |
|
44
|
6
|
50
|
33
|
|
|
156
|
if ($full_path && -e $full_path) { |
|
45
|
0
|
|
|
|
|
0
|
$file = $full_path; |
|
46
|
|
|
|
|
|
|
} |
|
47
|
|
|
|
|
|
|
|
|
48
|
6
|
50
|
|
|
|
26
|
if ( not defined $file ) { |
|
49
|
6
|
|
|
|
|
2753
|
require File::ShareDir; |
|
50
|
6
|
|
|
|
|
135362
|
$file = File::ShareDir::dist_file('Pod-Spell', 'wordlist'); |
|
51
|
|
|
|
|
|
|
} |
|
52
|
|
|
|
|
|
|
|
|
53
|
6
|
50
|
|
6
|
|
1229
|
open my $fh, '<:encoding(UTF-8)', $file |
|
|
6
|
|
|
|
|
37
|
|
|
|
6
|
|
|
|
|
10
|
|
|
|
6
|
|
|
|
|
62
|
|
|
54
|
|
|
|
|
|
|
or die "Cannot read $file: $!"; ## no critic (ErrorHandling::RequireCarping) |
|
55
|
6
|
|
|
|
|
62931
|
while ( defined( my $line = readline $fh ) ) { |
|
56
|
7320
|
|
|
|
|
1498139
|
chomp $line; |
|
57
|
7320
|
|
|
|
|
21276
|
$Wordlist{$line} = 1; |
|
58
|
7320
|
|
|
|
|
12083
|
$Wordlist{PL($line)} = 1; |
|
59
|
|
|
|
|
|
|
} |
|
60
|
6
|
|
|
|
|
6464
|
close $fh; |
|
61
|
|
|
|
|
|
|
} |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub learn_stopwords { |
|
64
|
11
|
|
|
11
|
1
|
2304
|
my ( $self, $text ) = @_; |
|
65
|
11
|
|
|
|
|
245
|
my $stopwords = $self->wordlist; |
|
66
|
|
|
|
|
|
|
|
|
67
|
11
|
|
|
|
|
92
|
while ( $text =~ m<(\S+)>g ) { |
|
68
|
25
|
|
|
|
|
487
|
my $word = $1; |
|
69
|
25
|
|
|
|
|
65
|
utf8::decode($word); |
|
70
|
25
|
100
|
|
|
|
62
|
if ( $word =~ m/^!(.+)/s ) { |
|
71
|
|
|
|
|
|
|
# "!word" deletes from the stopword list |
|
72
|
2
|
|
|
|
|
6
|
my $negation = $1; |
|
73
|
|
|
|
|
|
|
# different $1 from above |
|
74
|
2
|
|
|
|
|
5
|
delete $stopwords->{$negation}; |
|
75
|
2
|
|
|
|
|
8
|
delete $stopwords->{PL($negation)}; |
|
76
|
2
|
100
|
|
|
|
419
|
print "Unlearning stopword <$negation>\n" if $self->_is_debug; |
|
77
|
|
|
|
|
|
|
} |
|
78
|
|
|
|
|
|
|
else { |
|
79
|
23
|
|
|
|
|
41
|
$word =~ s{'s$}{}; # we strip 's when checking so strip here, too |
|
80
|
23
|
|
|
|
|
55
|
$stopwords->{$word} = 1; |
|
81
|
23
|
|
|
|
|
73
|
$stopwords->{PL($word)} = 1; |
|
82
|
23
|
100
|
|
|
|
5795
|
print "Learning stopword <$word>\n" if $self->_is_debug; |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
} |
|
85
|
11
|
|
|
|
|
234
|
return; |
|
86
|
|
|
|
|
|
|
} |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub is_stopword { |
|
89
|
56
|
|
|
56
|
1
|
77
|
my ($self, $word) = @_; |
|
90
|
56
|
|
|
|
|
863
|
my $stopwords = $self->wordlist; |
|
91
|
56
|
100
|
100
|
|
|
363
|
if ( exists $stopwords->{$word} or exists $stopwords->{ lc $word } ) { |
|
92
|
21
|
100
|
|
|
|
264
|
print " Rejecting <$word>\n" if $self->_is_debug; |
|
93
|
21
|
|
|
|
|
255
|
return 1; |
|
94
|
|
|
|
|
|
|
} |
|
95
|
35
|
|
|
|
|
143
|
return; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub strip_stopwords { |
|
99
|
18
|
|
|
18
|
1
|
99
|
my ($self, $text) = @_; |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Count the things in $text |
|
102
|
18
|
100
|
|
|
|
383
|
print "Content: <", $text, ">\n" if $self->_is_debug; |
|
103
|
|
|
|
|
|
|
|
|
104
|
18
|
|
|
|
|
537
|
my @words = grep { length($_) < MAXWORDLENGTH } split " ", $text; |
|
|
71
|
|
|
|
|
154
|
|
|
105
|
|
|
|
|
|
|
|
|
106
|
18
|
|
|
|
|
44
|
for ( @words ) { |
|
107
|
71
|
100
|
|
|
|
1463
|
print "Parsing word: <$_>\n" if $self->_is_debug; |
|
108
|
|
|
|
|
|
|
# some spellcheckers can't cope with anything but Latin1 |
|
109
|
71
|
100
|
100
|
|
|
1776
|
$_ = '' if $self->no_wide_chars && /[^\x00-\xFF]/; |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# strip leading punctuation |
|
112
|
71
|
|
|
|
|
414
|
s/^[\(\[\{\'\"\:\;\,\?\!\.]+//; |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# keep everything up to trailing punctuation, not counting |
|
115
|
|
|
|
|
|
|
# periods (for abbreviations like "Ph.D."), single-quotes |
|
116
|
|
|
|
|
|
|
# (for contractions like "don't") or colons (for package |
|
117
|
|
|
|
|
|
|
# names like "Foo::Bar") |
|
118
|
71
|
|
|
|
|
243
|
s/^([^\)\]\}\"\;\,\?\!]+).*$/$1/; |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# strip trailing single-quote, periods or colons; after this |
|
121
|
|
|
|
|
|
|
# we have a word that could have internal periods or quotes |
|
122
|
71
|
|
|
|
|
146
|
s/[\.\'\:]+$//; |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# strip possessive |
|
125
|
71
|
|
|
|
|
115
|
s/'s$//i; |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# zero out variable names or things with internal symbols, |
|
128
|
|
|
|
|
|
|
# since those are probably code expressions outside a C<> |
|
129
|
71
|
|
|
|
|
125
|
my $is_sigil = /^[\&\%\$\@\:\<\*\\\_]/; |
|
130
|
71
|
|
|
|
|
102
|
my $is_strange = /[\%\^\&\#\$\@\_\<\>\(\)\[\]\{\}\\\*\:\+\/\=\|\`\~]/; |
|
131
|
71
|
100
|
100
|
|
|
207
|
$_ = '' if $is_sigil || $is_strange; |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# stop if there are no "word" characters left; if it's just |
|
134
|
|
|
|
|
|
|
# punctuation that we didn't happen to strip or it's weird glyphs, |
|
135
|
|
|
|
|
|
|
# the spellchecker won't do any good anyway |
|
136
|
71
|
100
|
|
|
|
148
|
next unless /\w/; |
|
137
|
|
|
|
|
|
|
|
|
138
|
51
|
100
|
|
|
|
684
|
print " Checking as <$_>\n" if $self->_is_debug; |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# replace it with any stopword or stopword parts stripped |
|
141
|
51
|
|
|
|
|
580
|
$_ = $self->_strip_a_word($_); |
|
142
|
|
|
|
|
|
|
|
|
143
|
51
|
100
|
100
|
|
|
591
|
print " Keeping as <$_>\n" if $_ && $self->_is_debug; |
|
144
|
|
|
|
|
|
|
} |
|
145
|
|
|
|
|
|
|
|
|
146
|
18
|
50
|
|
|
|
97
|
return join(" ", grep { defined && length } @words ); |
|
|
71
|
|
|
|
|
214
|
|
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub _strip_a_word { |
|
150
|
51
|
|
|
51
|
|
89
|
my ($self, $word) = @_; |
|
151
|
51
|
|
|
|
|
55
|
my $remainder; |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# try word as-is, including possible hyphenation vs stoplist |
|
154
|
51
|
100
|
|
|
|
98
|
if ($self->is_stopword($word) ) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
155
|
19
|
|
|
|
|
29
|
$remainder = ''; |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
# internal period could be abbreviations, so check with |
|
158
|
|
|
|
|
|
|
# trailing period restored and drop or keep on that basis |
|
159
|
|
|
|
|
|
|
elsif ( index($word, '.') >= 0 ) { |
|
160
|
2
|
|
|
|
|
6
|
my $abbr = "$word."; |
|
161
|
2
|
100
|
|
|
|
6
|
$remainder = $self->is_stopword($abbr) ? '' : $abbr; |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
# check individual parts of hyphenated word, keep whatever isn't a |
|
164
|
|
|
|
|
|
|
# stopword as individual words |
|
165
|
|
|
|
|
|
|
elsif ( index($word, '-') >= 0 ) { |
|
166
|
1
|
|
|
|
|
2
|
my @keep; |
|
167
|
1
|
|
|
|
|
5
|
for my $part ( split /-/, $word ) { |
|
168
|
3
|
100
|
|
|
|
7
|
push @keep, $part if ! $self->is_stopword( $part ); |
|
169
|
|
|
|
|
|
|
} |
|
170
|
1
|
50
|
|
|
|
18
|
$remainder = join(" ", @keep) if @keep; |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
# otherwise, we just keep it |
|
173
|
|
|
|
|
|
|
else { |
|
174
|
29
|
|
|
|
|
40
|
$remainder = $word; |
|
175
|
|
|
|
|
|
|
} |
|
176
|
51
|
|
|
|
|
81
|
return $remainder; |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
1; |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
__END__ |