File Coverage

blib/lib/Tree/Lexicon.pm

Criterion	Covered	Total	%
statement	151	165	91.5
branch	71	90	78.8
condition	29	44	65.9
subroutine	24	24	100.0
pod	9	9	100.0
total	284	332	85.5

line	stmt	bran	cond	sub	pod	time	code
1							package Tree::Lexicon;
2
3	2			2		50534	use 5.006;
	2					9
	2					102
4	2			2		15	use strict;
	2					3
	2					79
5	2			2		11	use warnings FATAL => 'all';
	2					10
	2					131
6	2			2		1998	use integer;
	2					23
	2					10
7	2			2		55	use Carp;
	2					3
	2					6992
8
9							=head1 NAME
10
11							Tree::Lexicon - Object class for storing and retrieving a lexicon in a tree of affixes
12
13							=cut
14
15							require Exporter;
16
17							our @ISA = qw( Exporter );
18							our @EXPORT_OK = qw( cs_regexp ci_regexp );
19
20							=head1 VERSION
21
22							Version 0.01
23
24							=cut
25
26							our $VERSION = '0.01';
27
28							=head1 SYNOPSIS
29
30							use Tree::Lexicon;
31
32							my $lexicon = Tree::Lexicon->new();
33
34							$lexicon->insert( 'apply', '', 'Apple', 'Windows', 'Linux', 'app', 'all day' );
35							# Warns of strings not matching /^\w+/ without inserting
36
37							if ($lexicon->contains( 'WiNdOwS' )) {
38							$lexicon->remove( 'wInDoWs' );
39							$lexicon->insert( 'Vista' );
40							}
41
42							my @words = $lexicon->vocabulary;
43							# Same as:
44							@words = ( 'Apple', 'Linux', 'Windows', 'app', 'apply' );
45
46							@words = $lexicon->auto_complete( 'ap' );
47							# Same as:
48							@words = ( 'app', 'apply' );
49
50							my $regexp = $lexicon->as_regexp();
51							# Same as:
52							$regexp = qr/\b(?:Apple\|Linux\|Windows\|app(?:ly)?)\b/;
53
54							my $caseless->Tree::Lexicon->new( 0 )->insert( 'apply', '', 'Apple', 'Windows', 'Linux', 'app', 'all day' );
55							# Warns of strings not matching /^\w+/ without inserting
56
57							if ($caseless->contains( 'WiNdOwS' )) {
58							$caseless->remove( 'wInDoWs' );
59							$caseless->insert( 'Vista' );
60							}
61
62							@words = $caseless->vocabulary;
63							# Same as:
64							@words = ( 'APP', 'APPLE', 'APPLY', 'LINUX', 'VISTA' );
65
66							@words = $caseless->auto_complete( 'ap' );
67							# Same as:
68							@words = ( 'APP', 'APPLE', 'APPLY' );
69
70							my $regexp = $caseless->as_regexp();
71							# Same as:
72							$regexp = qr/\b(?:[Aa][Pp[Pp](?:[Ll](?:[Ee]\|[Yy]))?\|[Ll][Ii][Nn][Uu][X]\|[Vv][Ii][Ss][Tt][Aa])\b/;
73
74							use Tree::Lexicon qw( cs_regexp ci_regexp );
75
76							my $cs_regexp = cs_regexp( @words );
77							# Same as:
78							$cs_regexp = Tree::Lexicon->new()->insert( @words )->as_regexp();
79
80							my $ci_regexp = ci_regexp( @words );
81							# Same as:
82							$ci_regexp = Tree::Lexicon->new( 0 )->insert( @words )->as_regexp();
83
84
85							=head1 DESCRIPTION
86
87							The purpose of this module is to provide a simple and effective means to store a lexicon. It is intended to aid parsers in identifying keywords and interactive applications in identifying user-provided words.
88
89							=head1 EXPORT
90
91							=head2 cs_regexp
92
93							Convenience function for generating a case sensitive regular expression from list of words.
94
95							my $cs_regexp = cs_regexp( @words );
96							# Same as:
97							$cs_regexp = Tree::Lexicon->new( 1 )->insert( @words )->as_regexp();
98
99							=cut
100
101							sub cs_regexp {
102	2			2	1	4142	return Tree::Lexicon->new()->insert( @_ )->as_regexp();
103							}
104
105							=head2 ci_regexp
106
107							Convenience function for generating a case insensitive regular expression from list of words.
108
109							my $ci_regexp = cs_regexp( @words );
110							# Same as:
111							$ci_regexp = Tree::Lexicon->new( 0 )->insert( @words )->as_regexp();
112
113							=cut
114
115							sub ci_regexp {
116	2			2	1	269	return Tree::Lexicon->new( 0 )->insert( @_ )->as_regexp();
117							}
118
119							=head1 METHODS
120
121							Passing a string not matching C as an argument to L\|/insert>, L\|/remove>, L\|/contains> or L\|/auto_complete> yields a warning to STDERR and nothing else.
122
123							=head2 new
124
125							Returns a new empty C object. By default, the tree's contents are case-sensitive. Passing a single I argument to the constuctor makes its contents case-insensitive.
126
127							$lexicon = Tree::Lexicon->new();
128							# Same as:
129							$lexicon = Tree::Lexicon->new( 1 );
130
131							# or #
132
133							$lexicon = Tree::Lexicon->new( 0 );
134
135							=cut
136
137							# Constructor
138
139							sub new {
140	10			10	1	2840	my $class = shift;
141	10					17	my $cs = shift;
142	10	50				44	$cs = (defined $cs) ? $cs ? 1 : '' : 1;
		100
143
144	10					98	return bless { CASE => $cs, NODES => [] };
145							}
146
147							=head2 insert
148
149							Inserts zero or more words into the lexicon tree and returns the object.
150
151							$lexicon->insert( 'list', 'of', 'words' );
152
153							If you already have an initial list of words, then you can chain this method up with the constructor.
154
155							my $lexicon = Tree::Lexicon->new()->insert( @words );
156
157							=cut
158
159							# Insert words
160
161							sub insert {
162	372			372	1	8023	my $self = shift;
163
164	372	100				933	if ($self->{CASE}) {
165	252					594	foreach (@_) { _insert( $self->{NODES}, $_ ); }
	1108					2613
166							}
167							else {
168	120					257	foreach (@_) { _insert( $self->{NODES}, uc( $_ ) ); }
	712					2251
169							}
170
171	372					992	return $self;
172							}
173
174							=head2 remove
175
176							Removes zero or more words from the lexicon tree and returns them (or C if not found).
177
178							@removed = $lexicon->remove( 'these', 'words' );
179
180							=cut
181
182							# Remove words
183
184							sub remove {
185	364			364	1	8349	my $self = shift;
186
187	0					0	return (wantarray or (@_ > 1)) ?
188							($self->{CASE}) ?
189	0					0	map { _remove( $self->{NODES}, $_ ) } @_ :
190	364	0	33			2827	map { _remove( $self->{NODES}, uc( $_ ) ) } @_ :
		100
		50
191							($self->{CASE}) ?
192							_remove( $self->{NODES}, shift ) :
193							_remove( $self->{NODES}, uc( shift ));
194							}
195
196							=head2 contains
197
198							Returns C<1> or C<''> for each word as to its presence or absense, respectively.
199
200							@verify = $lexicon->contains( 'these', 'words' );
201
202							=cut
203
204							# Verify words
205
206							sub contains {
207	1456			1456	1	21756	my $self = shift;
208
209	0					0	return (wantarray or (@_ > 1)) ?
210							($self->{CASE}) ?
211	0					0	map { _contains( $self->{NODES}, $_ ) } @_ :
212	1456	0	33			12154	map { _contains( $self->{NODES}, uc( $_ ) ) } @_ :
		100
		50
213							($self->{CASE}) ?
214							_contains( $self->{NODES}, shift ) :
215							_contains( $self->{NODES}, uc( shift ));
216							}
217
218							=head2 auto_complete
219
220							Returns all words beginning with the string passed.
221
222							@words = $lexicon->auto_complete( 'a' );
223
224							=cut
225
226							# Words beginning with
227
228							sub auto_complete {
229	26			26	1	1192	my $self = shift;
230	26					36	my $prefix = shift;
231
232	26	50	33			143	unless ($prefix and $prefix =~ /^\w+/) {
233	0					0	carp "Cannot auto-complete non-word string.";
234	0					0	return ();
235							}
236
237	26	50				70	($self->{CASE}) or $prefix = uc $prefix;
238
239	26					54	my ($node, $root) = _ac_first( $self->{NODES}, $prefix );
240
241	26	100				69	(defined $node) or return ();
242
243	21					48	my @words = _vocabulary( $node->[-1], $root );
244	21	100				58	($node->[1]) and unshift @words, $root;
245
246	21					84	return @words;
247							}
248
249							=head2 vocabulary
250
251							Returns all words in the lexicon.
252
253							@words = $lexicon->vocabulary();
254
255							=cut
256
257							# All words
258
259							sub vocabulary {
260	2			2	1	1679	my $self = shift;
261
262	2					11	return _vocabulary( $self->{NODES} );
263							}
264
265							=head2 as_regexp
266
267							Returns a regular expression equivalent to the lexicon tree. The regular expression has the form C.
268
269							$regexp = $lexicon->as_regexp();
270
271							=cut
272
273							# Lexicon as regular expression
274
275							sub as_regexp {
276	736			736	1	3337	my $self = shift;
277	736	100				2650	my $regexp = $self->{CASE} ?
278							_cs_regexp( $self->{NODES} ) :
279							_ci_regexp( $self->{NODES} );
280
281	736					333920	return qr/\b(?:$regexp)\b/;
282							}
283
284							## Begin Private Functions ##
285
286							# Recursive backend for 'insert()'
287							sub _insert {
288	4320			4320		6811	my ($nodes,
289							$string) = @_;
290
291	4320	50	33			34896	unless ($string and $string =~ /^\w+$/) {
292	0					0	carp "Cannot insert non-word string into lexicon tree.";
293	0					0	return;
294							}
295
296							# Node location and possible common root
297	4320					7963	my ($node,
298							$pos,
299							$root) = _locate( $nodes, $string );
300
301							# Is there a common root to node's string and passed string?
302	4320	100				9180	if ($root) {
303							# Are they equal?
304	2585	100				5553	if ($string eq $node->[0]) {
305	21					71	$node->[1] = 1;
306							}
307							else {
308							# Strip the common root from $string
309	2564					21821	$string =~ s/^$root//;
310							# Is common root same as node's string?
311	2564	100				7361	unless ($node->[0] eq $root) {
312							# No: split node upwards
313	626					5432	$node->[0] =~ s/^$root//;
314	626					2285	$node = [ $root, !$string, [ $node ] ];
315	626					1320	$nodes->[$pos] = $node;
316							}
317							# Recurse with what's left of $string
318	2564	100				8385	_insert( $node->[-1], $string ) if ($string);
319							}
320							}
321							else {
322							# This is a node with no root in common with its neighbors
323	1735					1958	splice( @{$nodes}, $pos, 0, [ $string, 1, [] ] );
	1735					11046
324							}
325							}
326
327							# Backend for 'remove()'
328							sub _remove {
329	364			364		581	my ($nodes,
330							$sought) = @_;
331	364					416	my $found;
332	364					621	my @stack = ();
333	364					405	my ($node,
334							$pos,
335							$root);
336
337	364	50	33			2374	unless ($sought and $sought =~ /^\w+$/) {
338	0					0	carp "Cannot remove non-word string from lexicon tree.";
339	0					0	return undef;
340							}
341
342							# Search tree, stripping sought of roots and appending to found
343	364		66			481	do {
344	1055					1955	($node, $pos, $root) = _locate( $nodes, $sought );
345							# Is there a node whose string can be stripped from string?
346	1055	50				3012	if ($root eq $node->[0]) {
347							# Add to what was found
348	1055					1488	$found .= $root;
349							# Strip what was found
350	1055					10141	$sought =~ s/^$root//;
351							# Is there more to search?
352	1055	100				2702	if ($sought) {
353							# Record visit
354	691					1674	push @stack, [ $node, $nodes, $pos ];
355							# Recurse
356	691					4244	$nodes = $node->[-1];
357							}
358							else {
359							# Verify that $found is a "hit"
360	364	50				1851	($node->[1]) or
361							$found = undef;
362							}
363							}
364							else {
365	0					0	$found = undef;
366							}
367							} while ($sought and $found);
368
369	364	50				829	if ($found) {
370	364					650	$node->[1] = '';
371	364		100			1004	until ($node->[1] \|\| @{$node->[-1]}) {
	738					2877
372	473					655	splice( @{$nodes}, $pos, 1 );
	473					885
373	473	100				1151	last unless (@stack);
374	419					552	($node, $nodes, $pos) = @{pop @stack};
	419					2245
375							}
376							}
377
378	364					1768	return $found;
379							}
380
381							# Backend for 'contains()'
382							sub _contains {
383	1456			1456		2571	my ($nodes,
384							$sought) = @_;
385	1456					2005	my ($node,
386							$pos,
387							$root);
388
389	1456	50	33			9985	unless ($sought and $sought =~ /^\w+$/) {
390	0					0	carp "Cannot find non-word string in lexicon tree.";
391	0					0	return undef;
392							}
393
394							# Search tree, stripping string of roots
395	1456					3469	while ($sought) {
396	3622					7358	($node, $pos, $root) = _locate( $nodes, $sought );
397							# Is there a node whose string can be stripped from string?
398	3622	100	100			18692	last unless ($node and $root eq $node->[0]);
399	2922					33441	$sought =~ s/^$root//;
400							# Recurse?
401	2922	100				12910	($sought) and $nodes = $node->[-1];
402							}
403
404	1456		100			7893	return (not $sought and $node->[1]);
405							}
406
407							# Recursive backend for 'vocabulary()'
408							sub _vocabulary {
409	624			624		2623	my $nodes = shift;
410	624		100			2171	my $root = shift \|\| '';
411	624					1241	my @vocab;
412
413	624					804	foreach my $node (@{$nodes}) {
	624					1415
414	601					1656	my $ext_root = $root.$node->[0];
415	601	100				1927	($node->[1]) and push @vocab, $ext_root;
416	601					1356	push @vocab, _vocabulary( $node->[-1], $ext_root );
417							}
418
419	624					2960	return @vocab;
420							}
421
422							# Case sensitive recursive backend for 'as_regexp()'
423							sub _cs_regexp {
424	89045					400100	join( '\|', map {
425	32257					55926	$_->[0].(
426	89045	100				130671	(@{$_->[-1]}) ?
		100
427							'(?:'._cs_regexp( $_->[-1] ).')'.(
428							($_->[1]) ? '?' : ''
429							) : ''
430	32257			32257		49469	) } @{$_[0]} )
431							}
432
433							# Case insensitive recursive backend for 'as_regexp()'
434							sub _ci_regexp {
435	18990					92419	join( '\|', map {
436	6718					12053	_ci_seq( $_->[0] ).(
437	18990	100				36410	(@{$_->[-1]}) ?
		100
438							'(?:'._ci_regexp( $_->[-1] ).')'.(
439							($_->[1]) ? '?' : ''
440							) : ''
441	6718			6718		7563	) } @{$_[0]} )
442							}
443
444							# Begin Helper Functions #
445
446							# Greatest common root of two strings (called by '_locate()')
447							sub _gc_root {
448	9551			9551		16103	my ($string1,
449							$string2) = @_; # ( $string1 le $string2 )
450
451							# Does $string2 begin with $string1?
452	9551	100				121179	($string2 =~ /^$string1/) and
453							return $string1;
454
455							# First character of $string1
456	3512					7270	my $root = substr( $string1, 0, 1 );
457
458							# Does $string2 begin with the same character?
459	3512	100				35504	($string2 =~ /^$root/) or
460							return '';
461
462							# Append characters from $string1 to root ...
463	677					1988	for (my $i = 1; $i < length( $string1 ); $i++) {
464	968					1583	$root .= substr( $string1, $i, 1 );
465							# ... until it no longer matches $string2.
466	968	100				8895	unless ($string2 =~ /^$root/) {
467	677					1179	$root = substr( $root, 0, $i );
468	677					1283	last;
469							}
470							}
471
472	677					2875	return $root;
473							}
474
475							# Get position within array and common root of node string and passed string
476							sub _locate {
477	9023			9023		13694	my ($nodes,
478							$sought) = @_;
479	9023					16733	my ($min,
480	9023					10487	$pos) = ( -1, @{$nodes} - 1 );
481	9023					13117	my $root = '';
482	9023					10778	my $node;
483
484							# Binary search from above
485	9023		100			49858	while ($pos > $min and
486							$sought lt $nodes->[$pos]->[0]) {
487	16746					23972	my $mid = $pos + ($min - $pos) / 2;
488	16746	100				36566	if ($sought lt $nodes->[$mid]->[0]) {
489	9852					40213	$pos = $mid - 1;
490							}
491							else {
492	6894					33838	$min = $mid;
493							}
494							}
495
496							# Value of $pos is position of greatest
497							# less-than-or-equal node, possibly -1
498
499							# Is there a less-than-or-equal node with a common root?
500	9023	100	66			53613	unless ($pos >= 0 and $node = $nodes->[$pos] and
			100
501							$root = _gc_root( $node->[0], $sought )) {
502							# No
503	2700					3116	$pos++;
504							# Value of $pos is position of least
505							# greater-than-or-equal node, possibly scalar( @{$nodes} )
506
507							# Is there a greater-than-or-equal node? with a common root?
508	2700	100	66			2968	($pos < scalar( @{$nodes} )) and
	2700					12392
509							$node = $nodes->[$pos] and
510							$root = _gc_root( $sought, $node->[0] );
511							}
512
513	9023					32500	return ($node, $pos, $root);
514							}
515
516							# Recursive backend to seed 'auto_complete()'
517							sub _ac_first {
518	26			26		35	my ($nodes,
519							$string,
520							$ext_root) = @_;
521	26					40	my ($node,
522							$pos,
523							$root) = _locate( $nodes, $string );
524	26	50				62	($ext_root) or
525							$ext_root = '';
526
527							# Is there a node in common to string?
528	26	100				74	if ($root) {
529							# Yes: extened the recorded root thus far
530	21					30	$ext_root .= $node->[0];
531
532							# Does string terminate in node's string?
533	21	50				200	($node->[0] =~ /^$string/) and
534							return ( $node, $ext_root );
535
536							# Does string exceed node's string?
537	0	0				0	($string =~ s/^$node->[0]//) and
538							return _ac_first( $node->[-1], $string, $ext_root );
539
540							# Else, fall through
541							}
542
543	5					10	return ();
544							}
545
546							# Convert string to case insensistive sequence (called by '_ci_regexp()')
547							sub _ci_seq {
548	18990			18990		23876	my $str = shift;
549	18990					24151	my $lc = lc $str;
550	18990					18979	my $ci_seq;
551
552	18990	100				38598	($lc eq $str) and
553							return $str;
554
555	18818					48957	my @lc_chars = split( //, $lc, -1 );
556
557	18818					44774	foreach my $uc_char (split( //, $str, -1 )) {
558	67140					86998	my $lc_char = shift @lc_chars;
559	67140	100				174532	$ci_seq .= ($lc_char eq $uc_char) ? $uc_char : "[$uc_char$lc_char]";
560							}
561
562	18818					47368	return $ci_seq;
563							}
564
565							# End Helper Functions #
566
567							## End Private Functions ##
568
569							=head1 AUTHOR
570
571							S. Randall Sawyer, C<< >>
572
573							=head1 BUGS
574
575							Please report any bugs or feature requests to C, or through
576							the web interface at L. I will be notified, and then you'll
577							automatically be notified of progress on your bug as I make changes.
578
579							=head1 SUPPORT
580
581							You can find documentation for this module with the perldoc command.
582
583							perldoc Tree::Lexicon
584
585
586							You can also look for information at:
587
588							=over 4
589
590							=item * RT: CPAN's request tracker (report bugs here)
591
592							L
593
594							=item * AnnoCPAN: Annotated CPAN documentation
595
596							L
597
598							=item * CPAN Ratings
599
600							L
601
602							=item * Search CPAN
603
604							L
605
606							=back
607
608
609							=head1 ACKNOWLEDGMENTS
610
611							This module's framework generated with L\|Module::Starter>.
612
613							=head1 LICENSE AND COPYRIGHT
614
615							Copyright 2013 S. Randall Sawyer.
616
617							This program is free software; you can redistribute it and/or modify it
618							under the terms of the the Artistic License (2.0). You may obtain a
619							copy of the full license at:
620
621							L
622
623							=cut
624
625							1;
626
627							__END__