File Coverage

blib/lib/WordLists/Common.pm

Criterion	Covered	Total	%
statement	68	133	51.1
branch	4	30	13.3
condition			n/a
subroutine	10	16	62.5
pod	9	10	90.0
total	91	189	48.1

line	stmt	bran	sub	pod	time	code
1						package WordLists::Common;
2	8		8		5349	use strict;
	8				17
	8				291
3	8		8		41	use warnings;
	8				18
	8				269
4	8		8		20332	use Unicode::Normalize; #provides NFD
	8				28149
	8				712
5	8		8		2236	use utf8;
	8				35
	8				61
6	8		8		2403	use WordLists::Base;
	8				53
	8				15278
7						our $VERSION = $WordLists::Base::VERSION;
8
9						our $AUTOLOAD;
10						require Exporter;
11						our @ISA = qw (Exporter);
12						our @EXPORT = ();
13						our @EXPORT_OK = qw(
14						pretty_doubles
15						pretty_singles
16						pretty_endash
17						norm_spacing
18						custom_norm
19						generic_norm_hw
20						generic_norm_pos
21						generic_minimal_pos
22						uniques
23						@sPosWords
24						@sDefaultAttList
25						@sDefiningAttlist
26						@sParsingParameters
27						reverse_punct
28						);
29						our @sDefaultAttList = qw(hw pos def eg);
30						our @sDefiningAttlist = qw(hw pos);
31						our @sParsingParameters = qw(is_header field_sep attlist default_attlist header_marker);
32						our @sPosWords = (qw(
33						n
34						noun
35						v
36						verb
37						adj
38						adjective
39						adv
40						advb
41						adverb
42						conj
43						conjunction
44						excl
45						exclamation
46						expression
47						pref
48						prefix
49						suffix
50						det
51						determiner
52						quant
53						quantifier
54						postmodifier
55						predeterminer
56						abbreviation
57						pv
58						mv
59						auxiliary
60						aux
61						prep
62						preposition
63						number
64						ordinal
65						cardinal
66
67						),
68						'ordinal number',
69						'cardinal number',
70						'plural noun',
71						'compound noun',
72						'phrasal verb',
73						'modal verb',
74						'auxiliary verb',
75						);
76						sub pretty_doubles($)
77						{
78	0		0	1	0	my $s = shift;
79	0				0	$s =~ s/[“"”]/"/g;
80	0				0	$s =~ s/"$/”/g;
81	0				0	$s =~ s/^"/“/g;
82	0				0	$s =~ s/"([\s\t\r\n])/”$1/g;
83	0				0	$s =~ s/([\s\t\r\n])"/$1“/g;
84	0				0	$s =~ s/([\(\{\[])"/$1“/g;
85	0				0	$s =~ s/"([\)\}\]])/”$1/g;
86	0				0	$s =~ s/([\w\.\?\!])"/$1”/g;
87	0				0	$s =~ s/"/“/g;
88	0				0	return $s;
89						}
90						sub reverse_punct ($)
91						{
92	4		4	0	13	my $s = shift;
93	4				35	my %sReversal = (qw`
94						( )
95						[ ]
96						{ }
97						< >
98						‘ ’
99						“ ”
100						‹ ›
101						« »
102						¡ !
103						¿ ?
104						`);
105	4	100			11	if (defined $sReversal{$s})
106						{
107	2				14	return $sReversal{$s};
108						}
109	2				4	foreach (qw`< [ { (`)
110						{
111	3				10	my $sToFind = quotemeta ($_) . "([^" . quotemeta ($sReversal{$_}) . "]+)". quotemeta $sReversal{$_};
112	3	100			109	if ($s =~ m/^$sToFind$/)
113						{
114	2				3	my $sR = $s;
115	2				38	$sR =~ s/^$sToFind/$_\/$1$sReversal{$_}/g;
116	2				14	return $sR;
117						}
118						}
119	0				0	return $s;
120						}
121						sub pretty_singles($)
122						{
123	0		0	1	0	my $s = shift;
124	0				0	$s =~ s/[‘'’]/'/g;
125	0				0	$s =~ s/'$/’/g;
126	0				0	$s =~ s/^'/‘/g;
127	0				0	$s =~ s/'([\s\t\r\n])/’$1/g;
128	0				0	$s =~ s/([\s\t\r\n])'/$1‘/g;
129	0				0	$s =~ s/([\(\{\[])'/$1‘/g;
130	0				0	$s =~ s/'([\)\}\]])/’$1/g;
131	0				0	$s =~ s/([\w\.\?\!])'/$1’/g;
132	0				0	$s =~ s/'/“/g;
133	0				0	return $s;
134						}
135
136						sub pretty_endash($)
137						{
138	0		0	1	0	my $s = shift;
139	0				0	$s =~ s/([\s\t\r\n])-([\s\t\r\n])/$1–$2/g;
140	0				0	$s =~ s/([\s\t\r\n])-$/$1–/g;
141	0				0	$s =~ s/^-([\s\t\r\n])/–$1/g;
142	0				0	return $s;
143						}
144
145						sub norm_spacing($)
146						{
147	13		13	1	15	my $s = shift;
148	13				23	$s =~ s/^\s+//;
149	13				23	$s =~ s/\s+$//;
150	13				18	$s =~ s/\s+/ /;
151	13				25	return $s;
152						}
153
154						sub custom_norm
155						{
156	0		0	1	0	my $s = shift;
157	0				0	my $args = shift;
158	0	0			0	return $s unless ref $args eq ref {};
159	0	0			0	$s = lc $s if $args->{'lc'};
160	0	0			0	$s = uc $s if $args->{'uc'};
161	0	0			0	if ($args->{'trim_space'})
162						{
163	0				0	$s =~ s/^\s+//;
164	0				0	$s =~ s/\s+$//;
165	0				0	$s =~ s/[\t\r\n\s]+/ /g;
166						}
167	0	0			0	$s =~ s/\bsb\b/someone/g if $args->{'sb'};
168	0	0			0	$s =~ s/\bsth\b/something/g if $args->{'sth'};
169	0	0			0	$s =~ s/$.*$//g if $args->{'brackets'} eq 'kill';
170	0	0			0	$s =~ tr/()//d if $args->{'brackets'} eq 'ignore';
171	0	0			0	$s =~ s/\[.*\]//g if $args->{'squares'} eq 'kill';
172	0	0			0	$s =~ tr/[]//d if $args->{'squares'} eq 'ignore';
173	0	0			0	if ($args->{'accents'})
174						{
175	0				0	$s = NFD($s); # These two lines use Unicode::Normalize::NFD to
176	7		7		56	$s =~ s/\pM//og; # remove accents but keep the underlying characters
	7				14
	7				108
	0				0
177						}
178
179	0	0			0	$s =~ s/[^[:alpha:][:digit:]]//g if $args->{'alnum_only'};
180						# can't and can`t should match. So, unfortunately, does cant
181	0	0			0	$s =~ s/_//g if $args->{'alnum_only'};
182	0				0	return $s;
183						}
184
185						sub generic_norm_hw($)
186						{
187	15		15	1	24	my $s = lc shift;
188	15				31	$s =~ s/$.*$//g;
189	15				27	$s =~ s/\bsb\b/someone/g;
190	15				23	$s =~ s/\bsth\b/something/g;
191	15				21	$s =~ s/^the //g;
192	15				78	$s = NFD($s); # These two lines use Unicode::Normalize::NFD to
193	15				40	$s =~ s/\pM//og; # remove accents but keep the underlying characters
194	15				27	$s =~ s/[^[:alpha:][:digit:]]//g;
195						# can't and can`t should match. So, unfortunately, does cant
196	15				19	$s =~ s/_//g;
197	15				48	return $s;
198						}
199
200						sub generic_norm_pos($)
201						{
202	13		13	1	18	my $sPos = lc shift;
203	13				17	$sPos =~ tr/\-\t\r\n \./ /;
204
205	13				24	$sPos = norm_spacing ($sPos);
206
207	13				17	$sPos =~ s<\b(pl\|plural)\b>
208						;
209
210	13				21	$sPos =~ s<\b(comp\|compound)\b>
211						;
212
213	13				30	$sPos =~ s<\b(n\|noun)\b>
214						;
215
216	13				24	$sPos =~ s<\b(a\|adj\|adjective)\b>
217						;
218
219	13				19	$sPos =~ s<\b(adv\|advb\|adverb)\b>
220						;
221
222	13				22	$sPos =~ s<\b(preposition\|prep)\b>
223						;
224
225	13				15	$sPos =~ s<\b(quant\|quantifier\|q)\b>
226						;
227
228	13				15	$sPos =~ s<\b(pre)(det\|determiner\|d)\b>
229						<$1determiner>;
230
231	13				13	$sPos =~ s<\b(pronoun\|pron)\b>
232						;
233
234	13				50	$sPos =~ s<\b(v\|verb)\b>
235						;
236
237	13				23	$sPos =~ s<\b(phr\|phrase)\b>
238						;
239
240	13				15	$sPos =~ s<\b(exp\|expr\|expression)\b>
241						;
242
243	13				15	$sPos =~ s<\b(mod\|modal)\b>
244						;
245
246	13				15	$sPos =~ s<\bphrase\s+verb\b>
247						;
248
249	13				13	$sPos =~ s<\bp\s*verb\b>
250						;
251
252	13				14	$sPos =~ s<\b(prefix\|pref)\b>
253						;
254
255	13				15	$sPos =~ s<\b(suffix\|suff)\b>
256						;
257
258	13				23	$sPos =~ s<\b(short\|abbreviated\|abbreviation\|abbrev\|abbr)( form)?\b>
259						;
260
261	13				21	$sPos =~ s<\b(conj\|conjunction)\b>
262						;
263
264	13				14	$sPos =~ s<\b(int\|interj\|inter\|interjection)\b>
265						;
266
267	13				18	$sPos =~ s<\b(ex\|excl\|exclam\|exclamation)\b>
268						;
269
270	13				43	return $sPos;
271						}
272
273
274						sub generic_minimal_pos($)
275						{
276	0		0	1		my $sPos = generic_norm_pos(shift);
277
278	0					$sPos =~ s<(adverb)>
279						;
280	0					$sPos =~ s<(adjective)>
281						;
282	0					$sPos =~ s<(phrasal)>
283						;
284	0					$sPos =~ s<(modal)>
285						<>;
286	0					$sPos =~ s<(verb)>
287						;
288	0					$sPos =~ s<(noun)>
289						;
290	0					$sPos =~ s<(adjective)>
291						;
292	0					$sPos =~ s<(preposition)>
293						;
294	0					$sPos =~ s<(exclamation)>
295						;
296	0					$sPos =~tr/ //d;
297	0					return $sPos;
298						}
299
300						sub uniques
301						{
302	0		0	1		my %bSeen;
303	0					return grep {$bSeen{$_}++; $bSeen{$_} ==1;} @_;
	0
	0
304						}
305
306						1;
307
308
309						=pod
310
311						=head1 NAME
312
313						WordLists::Common
314
315						=head1 SYNOPSIS
316
317						use WordLists::Common qw(pretty_doubles pretty_singles);
318						print pretty_doubles (pretty_singles (
319						qq{"That's right," she said, "I was told to 'get lost!'".}
320						) );
321
322						=head1 DESCRIPTION
323
324						This provides common functions and values of relevance to wordlists - such as normalising parts of speech and typographic dashes and quotes. Exportable functions and values include:
325
326						=over
327
328						=item *
329						C<@sPosWords>, a list of things which look like parts of speech (to help parsing things like "head verb", "head up", "head noun")
330
331						=item *
332						A function C replacing space + hyphen + space with space + en-dash + space.
333
334						=item *
335						A function C replacing double quotes with 'smart' double quotes.
336
337						=item *
338						A function C replacing apostrophe/single-quote with 'smart' single quotes.
339
340						=item *
341						A function C
342
343						=item *
344						A function C which takes several options:
345
346						=over
347
348						=item *
349						C - if true, lowercases the string.
350
351						=item *
352						C - if true, uppercases the string. Overrides C.
353
354						=item *
355						C - if true, removes initial and final space, and also condenses repeating white space to a single C<\x20>.
356
357						=item *
358						C - if true, removes characters other than alphabetic ones or digits.
359
360						=item *
361						C - if this is 'kill', removes the contents of any C<()> brackets; if 'ignore', removes the brackets themselves.
362
363						=item *
364						C - if this is 'kill', removes the contents of any C<[]> brackets; if 'ignore', removes the brackets themselves.
365
366						=item *
367						C - if true, removes accents and modifier characters from letters.
368
369						=item *
370						C - if true, replaces 'sb' with 'someone'.
371
372						=item *
373						C - if true, replaces 'sth' with 'something'.
374
375						=back
376
377						=item *
378						A function C which returns a word without accents or characters other than [a-z0-9].
379
380						=item *
381						A function C for normalising parts of speech so that 'v' and 'verb' match.
382
383						=item *
384						A function C which will normalise parts of speech and reduce them to 'minimal' ones.
385
386						=item *
387						A function C which will reduce a list to the unique members.
388
389						=back
390
391						=head1 BUGS
392
393						Please use the Github issues tracker.
394
395						=head1 LICENSE
396
397						Copyright 2011-2012 © Cambridge University Press. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
398
399						=cut