File Coverage

blib/lib/Lingua/Anagrams.pm

Criterion	Covered	Total	%
statement	332	334	99.4
branch	114	140	81.4
condition	35	47	74.4
subroutine	29	29	100.0
pod	5	5	100.0
total	515	555	92.7

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Anagrams;
2							$Lingua::Anagrams::VERSION = '0.017';
3							# ABSTRACT: pure Perl anagram finder
4
5	1			1		712	use strict;
	1					2
	1					31
6	1			1		5	use warnings;
	1					2
	1					26
7
8	1			1		894	use List::MoreUtils qw(uniq);
	1					1283
	1					4599
9
10
11							# don't cache anagrams for bigger character counts than this
12							our $LIMIT = 20;
13
14							# some global variables to be localized
15							# used to limit time spent copying values
16							our ( $limit, $known, $trie, %cache, $cleaner, @jumps, $word_cache, @indices );
17
18
19							sub new {
20	7			7	1	5346	my $class = shift;
21	7					13	my $wl = shift;
22	7	50				26	die 'first parameter expected to be an array reference'
23							unless ref $wl eq 'ARRAY';
24	7					25	my %params = _make_opts(@_);
25	7		33			36	$class = ref $class \|\| $class;
26	7		50			38	local $cleaner = $params{clean} // \&_clean;
27	7					11	my @word_lists;
28	7	100				41	if ( ref $wl->[0] eq 'ARRAY' ) {
29	1					4	@word_lists = @$wl;
30							}
31							else {
32	6					14	@word_lists = ($wl);
33							}
34	7					10	my ( @tries, @all_words );
35	7					15	for my $words (@word_lists) {
36	8	50				21	next unless @$words;
37	8	50				16	die 'items in lists expected to be words' if ref $words->[0];
38	8					25	$cleaner->($_) for @$words;
39	8					15	my $s1 = @all_words;
40	8					19	push @all_words, @$words;
41	8					100	@all_words = uniq @all_words;
42	8	50				33	next unless @all_words > $s1;
43	8					23	my ( $trie, $known ) = _trieify( \@all_words );
44	8					36	push @tries, [ $trie, $known ];
45							}
46	7	50				17	die 'no words' unless @tries;
47	7		66			133	return bless {
			100
48							limit => $params{limit} // $LIMIT,
49							sorted => $params{sorted} // 0,
50							min => $params{min},
51							clean => $cleaner,
52							tries => \@tries,
53							},
54							$class;
55							}
56
57							sub _trieify {
58	8			8		11	my $words = shift;
59	8					12	my $base = [];
60	8					10	my @known;
61	8					11	my $terminal = [];
62	8					16	for my $word (@$words) {
63	32	50	50			89	next unless length( $word // '' );
64	32					135	my @chars = map ord, split //, $word;
65	32					74	_learn( \@known, \@chars );
66	32					65	_add( $base, \@chars, $terminal );
67							}
68	8					21	return $base, \@known;
69							}
70
71							sub _learn {
72	32			32		36	my ( $known, $new ) = @_;
73	32					44	for my $i (@$new) {
74	47		100			186	$known->[$i] \|\|= 1;
75							}
76							}
77
78							sub _add {
79	79			79		125	my ( $base, $chars, $terminal ) = @_;
80	79					84	my $i = shift @$chars;
81	79	100				118	if ($i) {
82	47		100			190	my $next = $base->[$i] //= [];
83	47					92	_add( $next, $chars, $terminal );
84							}
85							else {
86	32		33			177	$base->[0] //= $terminal;
87							}
88							}
89
90							# walk the trie looking for words you can make out of the current character count
91							sub _words_in {
92	71			71		111	my ( $counts, $total ) = @_;
93	71					77	my @words;
94	71					185	my @stack = ( [ 0, $trie ] );
95	71					89	while (1) {
96	698					667	my ( $c, $level ) = @{ $stack[-1] };
	698					7847
97	698	100	100			5408	if ( $c == -1 \|\| $c >= @$level ) {
98	160	100				347	last if @stack == 1;
99	89					104	pop @stack;
100	89					200	++$total;
101	89					138	$c = \( $stack[-1][0] );
102	89					125	++$counts->[$$c];
103	89					139	$$c = $jumps[$$c];
104							}
105							else {
106	538					695	my $l = $level->[$c];
107	538	100				804	if ($l) { # trie holds corresponding node
108	388	100				3418	if ($c) { # character
109	245	100				888	if ( $counts->[$c] ) {
110	149					297	push @stack, [ 0, $l ];
111	149					574	--$counts->[$c];
112	149					249	--$total;
113							}
114							else {
115	96					656	$stack[-1][0] = $jumps[$c];
116							}
117							}
118							else { # terminal
119	198					580	my $w = join '',
120	143					322	map { chr( $_->[0] ) } @stack[ 0 .. $#stack - 1 ];
121	143		100			595	$w = $word_cache->{$w} //= scalar keys %$word_cache;
122	143					2335	push @words, [ $w, [@$counts] ];
123	143	100				263	if ($total) {
124	83					193	$stack[-1][0] = $jumps[$c];
125							}
126							else {
127	60					1110	pop @stack;
128	60					92	++$total;
129	60					103	$c = \( $stack[-1][0] );
130	60					81	++$counts->[$$c];
131	60					128	$$c = $jumps[$$c];
132							}
133							}
134							}
135							else {
136	150					287	$stack[-1][0] = $jumps[$c];
137							}
138							}
139							}
140	71					214	\@words;
141							}
142
143
144							sub anagrams {
145	8			8	1	1729	my $self = shift;
146	8					15	my $phrase = shift;
147	8					21	my %opts = _make_opts(@_);
148	8					32	local ( $limit, $cleaner ) = @$self{qw(limit clean)};
149	8					20	$cleaner->($phrase);
150	8	50				24	return () unless length $phrase;
151	8					12	my ( $sort, $min );
152
153	8	100				20	if ( exists $opts{sorted} ) {
154	3					5	$sort = $opts{sorted};
155							}
156							else {
157	5					10	$sort = $self->{sorted};
158							}
159	8	100				17	if ( exists $opts{min} ) {
160	2					5	$min = $opts{min};
161							}
162							else {
163	6					12	$min = $self->{min};
164							}
165	8		100			33	my $i = $opts{start_list} // 0;
166	8					10	my @pairs = @{ $self->{tries} };
	8					26
167	8	100				24	if ($i) {
168	1	50				6	die "impossible index for start list: $i" unless defined $pairs[$i];
169	1	50				6	$i = @pairs + $i if $i < 0;
170	1					4	@pairs = @pairs[ $i .. $#pairs ];
171							}
172	8					19	my $counts = _counts($phrase);
173	8					32	local @jumps = _jumps($counts);
174	8					42	local @indices = _indices($counts);
175	8					13	my @anagrams;
176	8					14	local $word_cache = {};
177	8					15	for my $pair (@pairs) {
178	8					19	local ( $trie, $known ) = @$pair;
179	8	50				24	next unless _all_known($counts);
180	8					18	local %cache = ();
181	8					16	%$word_cache = ();
182	8					21	@anagrams = _anagramize($counts);
183	8	50				24	next unless @anagrams;
184	8	50	66			26	next if $min and @anagrams < $min;
185	8					35	last;
186							}
187	8					60	my %r = reverse %$word_cache;
188	42					199	@anagrams = map {
189	8					19	[ map { $r{$_} } @$_ ]
	22					29
190							} @anagrams;
191	8	100				24	if ($sort) {
192	24	100				48	@anagrams = sort {
193	16					49	my $ordered = @$a <= @$b ? 1 : -1;
194	24	100				50	my ( $d, $e ) = $ordered == 1 ? ( $a, $b ) : ( $b, $a );
195	24					53	for ( 0 .. $#$d ) {
196	27					48	my $c = $d->[$_] cmp $e->[$_];
197	27	100				80	return $ordered * $c if $c;
198							}
199	0					0	-$ordered;
200	4					6	} map { [ sort @$_ ] } @anagrams;
201							}
202	8					146	return @anagrams;
203							}
204
205							sub _make_opts {
206	22	100		22		58	if ( @_ == 1 ) {
207	1					3	my $r = shift;
208	1	50				7	die 'options expected to be key value pairs or a hash ref'
209							unless 'HASH' eq ref $r;
210	1					6	return %$r;
211							}
212							else {
213	21					65	return @_;
214							}
215							}
216
217
218							our $null = sub { };
219
220							sub iterator {
221	7			7	1	10291	my $self = shift;
222	7					15	my $phrase = shift;
223	7					28	my %opts = _make_opts(@_);
224	7		66			51	$opts{sorted} //= $self->{sorted};
225	7					26	$self->{clean}->($phrase);
226	7		100			38	my $i = $opts{start_list} // 0;
227	7					13	my @pairs = @{ $self->{tries} };
	7					20
228	7	100				35	if ($i) {
229	1	50				6	die "impossible index for start list: $i" unless defined $pairs[$i];
230	1	50				5	$i = @pairs + $i if $i < 0;
231	1					4	@pairs = @pairs[ $i .. $#pairs ];
232							}
233	7	50				23	return $null unless length $phrase;
234	7					24	return _super_iterator( \@pairs, $phrase, \%opts );
235							}
236
237							# iterator that converts word indices back to words
238							sub _super_iterator {
239	7			7		16	my ( $tries, $phrase, $opts ) = @_;
240	7					25	my $counts = _counts($phrase);
241	7					28	my @j = _jumps($counts);
242	7					38	my @ix = _indices($counts);
243	7					15	my $wc = {};
244	7					20	my $i = _iterator( $tries, $counts, $opts );
245	7					9	my ( %reverse_cache, %c );
246							return sub {
247	28			28		1550	my $rv;
248	28					252	local @jumps = @j;
249	28					66	local @indices = @ix;
250	28					37	local $word_cache = $wc;
251							{
252	28					48	$rv = $i->();
	54					112
253	54	100				153	return unless $rv;
254	47					146	my $key = join ',', sort { $a <=> $b } @$rv;
	73					173
255	47	100				1202	redo if $c{$key}++;
256							}
257	21					41	for my $j (@$rv) {
258	33	100				87	if ( !$reverse_cache{$j} ) {
259	7					61	%reverse_cache = reverse %$word_cache;
260	7					19	last;
261							}
262							}
263	21					41	$rv = [ map { $reverse_cache{$_} } @$rv ];
	40					605
264	21	100				68	if ( $opts->{sorted} ) {
265	5					14	$rv = [ sort @$rv ];
266							}
267	21					137	$rv;
268	7					60	};
269							}
270
271							# iterator that manages the trie list
272							sub _iterator {
273	49			49		81	my ( $tries, $counts, $opts ) = @_;
274	49					63	my $total = 0;
275	49					199	$total += $_ for @$counts[@indices];
276	49					117	my @t = @$tries;
277	49					61	my $i;
278							my $s = sub {
279	154			154		150	my $rv;
280							{
281	154	100				148	unless ($i) {
	199					376
282	96	100				162	if (@t) {
283	49					61	my $pair = shift @t;
284	49					113	local ( $trie, $known ) = @$pair;
285	49	50				96	redo unless _all_known($counts);
286	49					133	my $words = _words_in( $counts, $total );
287	49	50				115	redo unless _worth_pursuing( $counts, $words );
288	49					206	$i = _sub_iterator( $tries, $words, $opts );
289							}
290							else {
291	47					309	return $rv;
292							}
293							}
294	152					253	$rv = $i->();
295	152	100				318	unless ($rv) {
296	45					49	undef $i;
297	45					298	redo;
298							}
299							}
300	107					357	$rv;
301	49					215	};
302	49					123	$s;
303							}
304
305							# iterator that actually walks tries looking for anagrams
306							sub _sub_iterator {
307	49			49		79	my ( $tries, $words, $opts ) = @_;
308	49					109	my @pairs = @$words;
309							return sub {
310							{
311	152	100		152		162	return unless @pairs;
	237					489
312	192	100				446	if ( $opts->{random} ) {
313	112					305	my $i = int rand scalar @pairs;
314	112	100				269	if ($i) {
315	47					63	my $p = $pairs[0];
316	47					59	$pairs[0] = $pairs[$i];
317	47					73	$pairs[$i] = $p;
318							}
319							}
320	192					220	my ( $w, $s ) = @{ $pairs[0] };
	192					357
321	192	100				475	unless ( ref $s eq 'CODE' ) {
322	89	100				296	if ( _any($s) ) {
323	42					170	$s = _iterator( $tries, $s, $opts );
324							}
325							else {
326	47					250	my $next = [];
327							$s = sub {
328	92					110	my $rv = $next;
329	92					100	undef $next;
330	92					248	$rv;
331	47					187	};
332							}
333	89					1257	$pairs[0][1] = $s;
334							}
335	192					443	my $remainder = $s->();
336	192	100				355	unless ($remainder) {
337	85					92	shift @pairs;
338	85					505	redo;
339							}
340	107					338	return [ $w, @$remainder ];
341							}
342	49					330	};
343							}
344
345							# all character counts decremented
346							sub _worth_pursuing {
347	49			49		69	my ( $counts, $words ) = @_;
348
349	49					61	my $c;
350
351							# if any letter count didn't change, there's no hope
352	49					85	OUTER: for my $i (@indices) {
353	137	100				324	next unless $c = $counts->[$i];
354	68					118	for (@$words) {
355	89	100				293	next OUTER if $_->[1][$i] < $c;
356							}
357	0					0	return;
358							}
359	49					134	return 1;
360							}
361
362							sub _indices {
363	15			15		21	my $counts = shift;
364	15					17	my @indices;
365	15					33	for my $i ( 0 .. $#$counts ) {
366	1545	100				2464	push @indices, $i if $counts->[$i];
367							}
368	15					55	return @indices;
369							}
370
371							sub _jumps {
372	15			15		20	my $counts = shift;
373	15					155	my @jumps = (0) x @$counts;
374	15					23	my $j = 0;
375	15					39	while ( my $n = _next_jump( $counts, $j ) ) {
376	36					42	$jumps[$j] = $n;
377	36					84	$j = $n;
378							}
379	15					21	$jumps[-1] = -1;
380	15					252	return @jumps;
381							}
382
383							sub _next_jump {
384	51			51		70	my ( $counts, $j ) = @_;
385	51					120	for my $i ( $j + 1 .. $#$counts ) {
386	1530	100				2616	return $i if $counts->[$i];
387							}
388	15					38	return;
389							}
390
391							sub _clean {
392	52			52		94	$_[0] =~ s/\W+//g;
393	52					115	$_[0] = lc $_[0];
394							}
395
396							sub _all_known {
397	57			57		72	my $counts = shift;
398	57	50				145	return if @$counts > @$known;
399	57					146	for my $i ( 0 .. $#$counts ) {
400	5767	50	66			18174	return if $counts->[$i] && !$known->[$i];
401							}
402	57					206	return 1;
403							}
404
405
406							sub key {
407	6			6	1	15202	my ( $self, $phrase ) = @_;
408	6					15	$self->{clean}->($phrase);
409	6					7	my ( @counts, $lowest );
410	6					30	for my $c ( map ord, split //, $phrase ) {
411	18	100				32	if ( defined $lowest ) {
412	12	100				27	$lowest = $c if $c < $lowest;
413							}
414							else {
415	6					7	$lowest = $c;
416							}
417	18					30	$counts[$c]++;
418							}
419	6					30	@counts = @counts[ $lowest .. $#counts ];
420	6		100			97	$_ //= '' for @counts;
421	6					21	my $suffix = join '.', @counts;
422	6					33	$suffix =~ s/\.(\.+)\./'('.length($1).')'/ge;
	7					34
423	6					42	return "$lowest:$suffix";
424							}
425
426
427							sub lists {
428	1			1	1	2	my $self = shift;
429	1					3	return scalar @{ $self->{tries} };
	1					8
430							}
431
432							sub _counts {
433	15			15		27	my $phrase = shift;
434	15					19	my @counts;
435	15					77	for my $c ( map ord, split //, $phrase ) {
436	42					102	$counts[$c]++;
437							}
438	15		100			4468	$_ //= 0 for @counts;
439	15					45	\@counts;
440							}
441
442							sub _any {
443	125			125		153	for ( @{ $_[0] } ) {
	125					259
444	12548	100				32386	return 1 if $_;
445							}
446	66					191	'';
447							}
448
449							sub _anagramize {
450	25			25		44	my $counts = shift;
451	25					31	my $total = 0;
452	25					97	$total += $_ for @$counts[@indices];
453	25					33	my $key;
454	25	50				51	if ( $total <= $limit ) {
455	25					84	$key = join ',', @$counts[@indices];
456	25					41	my $cached = $cache{$key};
457	25	100				66	return @$cached if $cached;
458							}
459	22					23	my @anagrams;
460	22					43	my $words = _words_in( $counts, $total );
461	22	50				48	if ( _all_touched( $counts, $words ) ) {
462	22					46	for (@$words) {
463	36					59	my ( $word, $c ) = @$_;
464	36	100				65	if ( _any($c) ) {
465	17					41	push @anagrams, [ $word, @$_ ] for _anagramize($c);
466							}
467							else {
468	19					68	push @anagrams, [$word];
469							}
470							}
471	22					35	my %seen;
472	28					118	@anagrams = map {
473	22	50				31	$seen{ join ' ', sort { $a <=> $b } @$_ }++
	39					180
474							? ()
475							: $_
476							} @anagrams;
477							}
478	22	50				84	$cache{$key} = \@anagrams if $key;
479	22					226	@anagrams;
480							}
481
482							sub _all_touched {
483	22			22		28	my ( $counts, $words ) = @_;
484
485	22					20	my $c;
486
487	22					28	my ( @tallies, @good_indices );
488	22					39	for (@$words) {
489	52					71	my $wc = $_->[1];
490	52					66	for (@indices) {
491	140	100				267	next unless $c = $counts->[$_];
492	114		66			277	$good_indices[$_] //= $_;
493	114	100				344	$tallies[$_]++ if $wc->[$_] < $c;
494							}
495							}
496
497							# if any letter count didn't change, there's no hope
498	22	50				47	return unless @good_indices;
499	22					49	for (@good_indices) {
500	2241	100				4810	next unless $_;
501	36	50				83	return unless $tallies[$_];
502							}
503
504							# find the letter with the fewest possibilities
505	22					29	my ( $best, $min, $n );
506	22					36	for (@good_indices) {
507	2241	100				4985	next unless $_;
508	36					42	$n = $tallies[$_];
509	36	100	66			119	if ( !$best \|\| $n < $min ) {
510	22					19	$best = $_;
511	22					40	$min = $n;
512							}
513							}
514
515							# we only need consider all the branches which affected a
516							# particular letter; we will find all possibilities in their
517							# ramifications
518	22					38	$c = $counts->[$best];
519	22					34	@$words = grep { $_->[1][$best] < $c } @$words;
	52					187
520	22					92	return 1;
521							}
522
523							1;
524
525							__END__