File Coverage

blib/lib/Pod/Wordlist.pm

Criterion	Covered	Total	%
statement	96	97	98.9
branch	39	44	88.6
condition	13	15	86.6
subroutine	14	14	100.0
pod	3	3	100.0
total	165	173	95.3

line	stmt	bran	cond	sub	pod	time	code
1							package Pod::Wordlist;
2	6			6		57488	use 5.008;
	6					24
3	6			6		28	use strict;
	6					11
	6					127
4	6			6		29	use warnings;
	6					11
	6					269
5
6							our $VERSION = '1.24'; # TRIAL
7
8	6			6		4080	use Lingua::EN::Inflect 'PL';
	6					124430
	6					679
9	6			6		63	use File::Spec ();
	6					37
	6					134
10							use constant {
11	6					628	MAXWORDLENGTH => 50,
12	6			6		32	};
	6					9
13
14							use Class::Tiny {
15	6					46	wordlist => \&_copy_wordlist,
16							_is_debug => 0,
17							no_wide_chars => 0,
18	6			6		2985	};
	6					9460
19
20							our %Wordlist; ## no critic ( Variables::ProhibitPackageVars )
21
22							sub _copy_wordlist {
23	8			8		79	my %copy;
24
25							# %Wordlist can be accessed externally, and users will often add terms in
26							# encoded form
27	8					4867	for my $word ( keys %Wordlist ) {
28	19448					19364	my $decoded_word = $word;
29							# if it was already decoded, this should do nothing
30	19448					27243	utf8::decode($decoded_word);
31	19448					30258	$copy{$decoded_word} = 1;
32							}
33
34	8					789	return \%copy;
35							}
36
37							BEGIN {
38	6			6		3946	my $file;
39
40							# try to find wordlist in non-installed dist
41	6					128	my ($d, $p) = File::Spec->splitpath(__FILE__);
42	6					85	$p = File::Spec->catdir($p, (File::Spec->updir) x 2, 'share');
43	6					76	my $full_path = File::Spec->catpath($d, $p, 'wordlist');
44	6	50	33			156	if ($full_path && -e $full_path) {
45	0					0	$file = $full_path;
46							}
47
48	6	50				26	if ( not defined $file ) {
49	6					2753	require File::ShareDir;
50	6					135362	$file = File::ShareDir::dist_file('Pod-Spell', 'wordlist');
51							}
52
53	6	50		6		1229	open my $fh, '<:encoding(UTF-8)', $file
	6					37
	6					10
	6					62
54							or die "Cannot read $file: $!"; ## no critic (ErrorHandling::RequireCarping)
55	6					62931	while ( defined( my $line = readline $fh ) ) {
56	7320					1498139	chomp $line;
57	7320					21276	$Wordlist{$line} = 1;
58	7320					12083	$Wordlist{PL($line)} = 1;
59							}
60	6					6464	close $fh;
61							}
62
63							sub learn_stopwords {
64	11			11	1	2304	my ( $self, $text ) = @_;
65	11					245	my $stopwords = $self->wordlist;
66
67	11					92	while ( $text =~ m<(\S+)>g ) {
68	25					487	my $word = $1;
69	25					65	utf8::decode($word);
70	25	100				62	if ( $word =~ m/^!(.+)/s ) {
71							# "!word" deletes from the stopword list
72	2					6	my $negation = $1;
73							# different $1 from above
74	2					5	delete $stopwords->{$negation};
75	2					8	delete $stopwords->{PL($negation)};
76	2	100				419	print "Unlearning stopword <$negation>\n" if $self->_is_debug;
77							}
78							else {
79	23					41	$word =~ s{'s$}{}; # we strip 's when checking so strip here, too
80	23					55	$stopwords->{$word} = 1;
81	23					73	$stopwords->{PL($word)} = 1;
82	23	100				5795	print "Learning stopword <$word>\n" if $self->_is_debug;
83							}
84							}
85	11					234	return;
86							}
87
88							sub is_stopword {
89	56			56	1	77	my ($self, $word) = @_;
90	56					863	my $stopwords = $self->wordlist;
91	56	100	100			363	if ( exists $stopwords->{$word} or exists $stopwords->{ lc $word } ) {
92	21	100				264	print " Rejecting <$word>\n" if $self->_is_debug;
93	21					255	return 1;
94							}
95	35					143	return;
96							}
97
98							sub strip_stopwords {
99	18			18	1	99	my ($self, $text) = @_;
100
101							# Count the things in $text
102	18	100				383	print "Content: <", $text, ">\n" if $self->_is_debug;
103
104	18					537	my @words = grep { length($_) < MAXWORDLENGTH } split " ", $text;
	71					154
105
106	18					44	for ( @words ) {
107	71	100				1463	print "Parsing word: <$_>\n" if $self->_is_debug;
108							# some spellcheckers can't cope with anything but Latin1
109	71	100	100			1776	$_ = '' if $self->no_wide_chars && /[^\x00-\xFF]/;
110
111							# strip leading punctuation
112	71					414	s/^[\(\[\{\'\"\:\;\,\?\!\.]+//;
113
114							# keep everything up to trailing punctuation, not counting
115							# periods (for abbreviations like "Ph.D."), single-quotes
116							# (for contractions like "don't") or colons (for package
117							# names like "Foo::Bar")
118	71					243	s/^([^\)\]\}\"\;\,\?\!]+).*$/$1/;
119
120							# strip trailing single-quote, periods or colons; after this
121							# we have a word that could have internal periods or quotes
122	71					146	s/[\.\'\:]+$//;
123
124							# strip possessive
125	71					115	s/'s$//i;
126
127							# zero out variable names or things with internal symbols,
128							# since those are probably code expressions outside a C<>
129	71					125	my $is_sigil = /^[\&\%\$\@\:\<\*\\\_]/;
130	71					102	my $is_strange = /[\%\^\&\#\$\@\_\<\>\[\]\{\}\\\*\:\+\/\=\\|\`\~]/;
131	71	100	100			207	$_ = '' if $is_sigil \|\| $is_strange;
132
133							# stop if there are no "word" characters left; if it's just
134							# punctuation that we didn't happen to strip or it's weird glyphs,
135							# the spellchecker won't do any good anyway
136	71	100				148	next unless /\w/;
137
138	51	100				684	print " Checking as <$_>\n" if $self->_is_debug;
139
140							# replace it with any stopword or stopword parts stripped
141	51					580	$_ = $self->_strip_a_word($_);
142
143	51	100	100			591	print " Keeping as <$_>\n" if $_ && $self->_is_debug;
144							}
145
146	18	50				97	return join(" ", grep { defined && length } @words );
	71					214
147							}
148
149							sub _strip_a_word {
150	51			51		89	my ($self, $word) = @_;
151	51					55	my $remainder;
152
153							# try word as-is, including possible hyphenation vs stoplist
154	51	100				98	if ($self->is_stopword($word) ) {
		100
		100
155	19					29	$remainder = '';
156							}
157							# internal period could be abbreviations, so check with
158							# trailing period restored and drop or keep on that basis
159							elsif ( index($word, '.') >= 0 ) {
160	2					6	my $abbr = "$word.";
161	2	100				6	$remainder = $self->is_stopword($abbr) ? '' : $abbr;
162							}
163							# check individual parts of hyphenated word, keep whatever isn't a
164							# stopword as individual words
165							elsif ( index($word, '-') >= 0 ) {
166	1					2	my @keep;
167	1					5	for my $part ( split /-/, $word ) {
168	3	100				7	push @keep, $part if ! $self->is_stopword( $part );
169							}
170	1	50				18	$remainder = join(" ", @keep) if @keep;
171							}
172							# otherwise, we just keep it
173							else {
174	29					40	$remainder = $word;
175							}
176	51					81	return $remainder;
177							}
178
179							1;
180
181							__END__