File Coverage

blib/lib/WordLists/Sort.pm

Criterion	Covered	Total	%
statement	147	164	89.6
branch	55	86	63.9
condition	24	57	42.1
subroutine	12	12	100.0
pod	5	6	83.3
total	243	325	74.7

line	stmt	bran	cond	sub	pod	time	code
1							package WordLists::Sort;
2	3			3		89801	use utf8;
	3					5
	3					22
3	3			3		92	use strict;
	3					6
	3					99
4	3			3		18	use warnings;
	3					6
	3					109
5							require Exporter;
6	3			3		993	use WordLists::Base;
	3					6
	3					6911
7							our $VERSION = $WordLists::Base::VERSION;
8
9							our @ISA = qw(Exporter);
10							our @EXPORT_OK = qw(
11							complex_compare
12							atomic_compare
13							sorted_collate
14							schwartzian_collate
15							);
16
17							sub complex_compare
18							{
19	12			12	1	20	my $args = $_[2];
20	12	50	33			74	if ( (defined $args) && (ref $args eq ref {}) )
21							{
22	12	0	33			58	return 0 if (!$args->{'override_undef'} and !defined($_[0]) and !defined ($_[1])); # avoid excessive execution of code if possible
			33
23	12	50	33			64	return (defined($_[0]) <=> defined ($_[1])) if (!$args->{'override_undef'} and (!defined($_[0]) or !defined ($_[1])) ); # avoid excessive execution of code if possible
			33
24	12	100	66			71	return 0 if (!$args->{'override_eq'} and ($_[0] eq $_[1])); # avoid excessive execution of code if possible
25	11	50	33			66	if ( (defined $args->{'functions'}) && (ref $args->{'functions'} eq ref []) )
26							{
27	11					14	my @functions = @{$args->{'functions'}};
	11					31
28	11					21	foreach (@functions)
29							{
30	25					56	my $r = atomic_compare($_[0], $_[1], $_);
31	25	100				127	return $r unless $r == 0;
32							}
33	0					0	return 0;
34							}
35							else
36							{
37	0					0	warn 'Expected: $a, $b, { functions => [...]}';
38	0	0				0	return $_[0] cmp $_[1] unless defined $args;
39							}
40							}
41							else
42							{
43	0	0				0	warn 'Expected: $a, $b, {...}' if defined $args;
44	0	0	0			0	return 0 if (!defined($_[0]) and !defined ($_[1])); # avoid excessive execution of code if possible
45	0	0	0			0	return (defined($_[0]) <=> defined ($_[1])) if ((!defined($_[0]) or !defined ($_[1])) ); # avoid excessive execution of code if possible
46	0					0	return $_[0] cmp $_[1];
47							}
48	0					0	return 0;
49							}
50							sub debug_compare
51							{
52	260	50	66	260	0	2178	if (defined $_[2] and $_[2])
53							{
54	0	0				0	print "\n". (' ' x $_[2]) . 'Comparing `' . (defined $_[0]? $_[0]:'') . '` and `' . (defined $_[1]? $_[1]:'') .'`';
		0
55							}
56	260	50				646	if (defined $_[3])
57							{
58	0					0	print "-- Result = $_[3]";
59							}
60							}
61							sub atomic_compare
62							{
63	90			90	1	1949	my @s = ($_[0], $_[1]);
64	90					108	my $args = $_[2];
65	90	100	66			418	if ( (defined $args) && (ref $args eq ref {}) )
66							{
67	80	0	33			292	return 0 if (!$args->{'override_undef'} and !defined($_[0]) and !defined ($_[1])); # avoid excessive execution of code if possible
			33
68	80	50	33			353	return (defined($_[0]) <=> defined ($_[1])) if (!$args->{'override_undef'} and (!defined($_[0]) or !defined ($_[1])) ); # avoid excessive execution of code if possible
			33
69	80	100	66			2143	return 0 if (!$args->{'override_eq'} and ($_[0] eq $_[1])); # avoid excessive execution of code if possible if ( (defined $args) && (ref $args eq ref {}) )
70							my %arg = (
71	54			54		87	'c' => sub { $_[0] cmp $_[1]; },
72	75					366	't' => [],
73							'n' => [],
74							'd' => 0,
75	75					240	%{$args},
76							);
77	75					291	debug_compare ($s[0], $s[1], $args->{'d'});
78	75	100				260	if (ref $args->{'n'} ne ref [])
79							{
80	74					137	$arg{'n'} = [];
81	74	100		8		178	$args->{'n'} = sub {$_[0];} unless defined $args->{'n'};
	8					18
82	74					138	$arg{'n'}[0] = $args->{'n'};
83	74					125	$arg{'n'}[1] = $args->{'n'};
84							}
85
86							#push (@{$arg{'t'}}, {re=> qr/.+/, c=> $arg{'c'} }) unless defined${$arg{'t'}}[0];
87	75					98	my @t = (@{$arg{'t'}}, {re=> qr/./, c=> $arg{'c'} });
	75					482
88	75					110	my @sToken;
89							my @sTokenType;
90	75					139	foreach my $i (0..1)
91							{
92	150					173	$s[$i] = &{ $arg{'n'}[$i] }($s[$i]);
	150					499
93							do
94	150					254	{
95	386					1234	foreach (0..$#t)
96							{
97	563					818	my $re = $t[$_]{'re'};
98
99	563	100				7058	if ($s[$i] =~ s/^($re)//)
100							{
101							#print "\n($1)$s[$i] matches $re";
102	386					497	push @{$sToken[$i]}, $1;
	386					1033
103	386					427	push @{$sTokenType[$i]}, $_;
	386					629
104	386					1433	last;
105							}
106							else
107							{
108							#print "\n$s[$i] doesn't match $re";
109							}
110							}
111							} until $s[$i] eq '';
112
113							}
114	75					176	$arg{'d'}=$arg{'d'} * 2;
115	75	100				84	foreach ($#{$sTokenType[0]} >= $#{$sTokenType[1]} ? 0..$#{$sTokenType[0]} : 0..$#{$sTokenType[1]})
	75					124
	75					169
	62					127
	13					34
116							{
117	185					432	debug_compare ($sToken[0][$_], $sToken[1][$_], $arg{'d'});
118	185	100	66			739	if (defined $sTokenType[0][$_] and defined $sTokenType[1][$_])
		50
		50
119							{
120	174	100				330	if ($sTokenType[0][$_] == $sTokenType[1][$_])
121							{
122	172					264	my $c = $t[$sTokenType[0][$_]]{'c'};
123	172	100				309	if ((ref $c eq ref ''))
124							{
125							# todo: dwimmery code - dp - what dwimmery?
126	17	50				45	return $c unless $c ==0;
127							# return undef;
128							}
129							else
130							{
131	155					245	my $r = &{$c}($sToken[0][$_], $sToken[1][$_]);
	155					364
132	155	100				920	return $r unless $r ==0;
133							}
134							}
135							else
136							{
137	2					24	return ($sTokenType[1][$_] <=> $sTokenType[0][$_]);
138							}
139							}
140							elsif (defined $sTokenType[0][$_])
141							{
142	0					0	return 1;
143							}
144							elsif (defined $sTokenType[1][$_])
145							{
146	11					131	return -1;
147							}
148							}
149							}
150							else
151							{
152	10	50				20	warn 'Expected: $a, $b, {...}' if defined $args;
153	10	100	66			38	return 0 if (!defined($_[0]) and !defined ($_[1])); # avoid excessive execution of code if possible
154	9	100	66			43	return (defined($_[0]) <=> defined ($_[1])) if ((!defined($_[0]) or !defined ($_[1])) ); # avoid excessive execution of code if possible
155	8					47	return $_[0] cmp $_[1];
156							}
157	41					175	return 0;
158							}
159
160							sub sorted_collate # Sorted Collation - hopefully O n log (n)
161							{
162	4			4	1	15	my ( $aIn, $cmp, $merge) = @_;
163							# ^ + $self
164	4					12	my $iEnum=0;
165	4					12	my $aEnum = [map {[ $iEnum++ , $_]; } @$aIn];
	8					35
166	4					29	my $aSorted = [sort {&{$cmp}($a->[1],$b->[1])} @$aEnum];
	5					20
	5					16
167
168	4					32	for (my $i = 0; $i<$#{$aSorted}; $i++)
	8					32
169							{
170	4	100				19	next unless defined $aSorted->[$i][1] ;
171	3					7	for (my $j = 1; $j<=$#{$aSorted}-$i; $j++)
	6					22
172							{
173	4	50				18	if (defined $aSorted->[$i+$j][1])
174							{
175	4	100				89	if (0 == &{$cmp}($aSorted->[$i][1], $aSorted->[$i+$j][1]))
	4					13
176							{
177	3					22	&{$merge}($aSorted->[$i][1], $aSorted->[$i+$j][1]);
	3					11
178	3					20	$aSorted->[$i+$j][1] = undef;
179							}
180							else
181	1					9	{ $i += $j - 1;
182	1					3	last; # last j === next i
183							}
184							}
185							}
186							}
187	4					10	return [map {$_->[1]} sort { $a->[0] <=> $b->[0] } grep { defined $_->[1] } @$aSorted];
	5					39
	1					5
	8					25
188							}
189							sub schwartzian_collate # Schwartzian Collation - hopefully O n log (n), but less than sorted collation, if $norm is slow
190							{
191	4			4	1	16	my ( $aIn, $cmp, $norm, $merge) = @_;
192							# ^ + $self
193	4					13	my $iEnum=0;
194	4					13	my $aEnum;
195							my $aSorted;
196	4	50				19	if (defined $norm)
197							{
198	4					12	$aEnum = [map {[ $iEnum++ , $_, &{$norm}($_)]; } @$aIn];
	8					36
	8					28
199	4					43	$aSorted = [sort {&{$cmp}($a->[2],$b->[2])} @$aEnum];
	5					17
	5					15
200							}
201							else
202							{
203	0					0	$aEnum = [map {[ $iEnum++ , $_]; } @$aIn];
	0					0
204	0					0	$aSorted = [sort {&{$cmp}($a->[1],$b->[1])} @$aEnum];
	0					0
	0					0
205							}
206	4					22	for (my $i = 0; $i<$#{$aSorted}; $i++)
	8					31
207							{
208	4	100				15	next unless defined $aSorted->[$i][1] ;
209	3					7	for (my $j = 1; $j<=$#{$aSorted}-$i; $j++)
	6					24
210							{
211	4	50				14	if (defined $aSorted->[$i+$j][1])
212							{
213	4	50				16	if (
		100
214	4					171	(defined $norm) ?
215	0					0	( 0 == &{$cmp}($aSorted->[$i][2], $aSorted->[$i+$j][2]) ) :
216							( 0 == &{$cmp}($aSorted->[$i][1], $aSorted->[$i+$j][1]) )
217							)
218							{
219	3					18	&{$merge}($aSorted->[$i][1], $aSorted->[$i+$j][1]);
	3					10
220	3					19	$aSorted->[$i+$j][1] = undef;
221							}
222							else
223	1					6	{ $i += $j - 1;
224	1					42	last; # last j === next i
225							}
226							}
227							}
228							}
229	4					12	return [map {$_->[1]} sort { $a->[0] <=> $b->[0] } grep { defined $_->[1] } @$aSorted];
	5					43
	1					4
	8					22
230							}
231							sub naive_collate # Naive Collation - probably O n**2
232							{
233	4			4	1	10	my ( $aIn, $cmp, $merge) = @_;
234							# ^ + $self
235	4					7	my $iEnum = 0;
236	4					13	my $aEnum = [map {[$iEnum++,$_]} @$aIn];
	8					28
237	4					12	for (my $i = 0; $i<$#{$aEnum}; $i++)
	8					32
238							{
239	4	100				17	next unless defined $aEnum->[$i][1] ;
240	3					6	for (my $j = 1; $j<=$#{$aEnum}-$i; $j++)
	7					32
241							{
242	4	50				15	if (defined $aEnum->[$i+$j][1])
243							{
244	4	100				11	if (0 == &{$cmp}($aEnum->[$i][1], $aEnum->[$i+$j][1]))
	4					16
245							{
246	3					26	&{$merge}($aEnum->[$i][1], $aEnum->[$i+$j][1]);
	3					9
247	3					21	$aEnum->[$i+$j][1] = undef;
248							}
249							}
250							}
251							}
252	4					9	return [map {$_->[1]} grep { defined $_->[1] } @$aEnum];
	5					116
	8					23
253							}
254
255							return 1;
256
257							=pod
258
259
260							=head1 NAME
261
262							WordLists::Sort
263
264
265							=head1 SYNOPSIS
266
267							Provides a structure for comparison functions, generally for complex sort.
268
269							# The following sorts "No6" "No.7" "no 8" in that order - ignoring punctuation.
270							@sorted = sort { atomic_compare (
271							$a,$b,{ n => sub{ $_[0]=~s/[^[:alnum:]]//g; lc $_[0]; } }
272							) } @unsorted;
273
274							# The following sorts A9 before A10.
275							@sorted = sort { atomic_compare (
276							$a,$b,{ t => [ { re => qr/[0-9]+/, c => sub { $_[0] <=> $_[1]; } }, ], } }
277							) } @unsorted;
278
279
280							=head1 DESCRIPTION
281
282							This is by far and away the most evil member of the L family (it's also pretty much unrelated to all the others). It is basically a terse way of writing complex comparison/sort functions as one liners (if you want to).
283
284							The intention is to be able to sort by several different criteria, e.g. so "the UN" sorts after "un-" and before "unabashed", and/or so that "F\x{E9}" sorts after "Fe" but before "FE".
285
286							Once you've written/cribbed a sort algorithm, it's easy to use - just put it in a subroutine and call it. (Actually, what you're writing is a comparison algrithm, which perl's C then calls).
287
288							Writing it is a bit harder, though: the framework involves (potentially) anonymous coderefs sprinkled amidst the hashrefs - it's much easier with indentation.
289
290
291							=head1 FUNCTIONS
292
293
294							=head2 atomic_compare
295
296							C: This provides most of the functionality in the module. It allows normalisation of the arguments, tokenisation so that different sections can be compared with different criteria, and, if so desired, flipping of the result.
297
298							=head3 Function arguments
299
300							C: Normalise. This should be a coderef. If present, runs the code on each argument before comparison.
301							Note that this only happens locally to the function, so lowercasing in functions[1]{n} will not prevent functions[2] putting VAT before vat.
302							(If you want to keep them, nest the original function in the c).
303							If C is an arrayref, it runs the first code on C<$a>, the second on C<$b>.
304
305							C: Tokenize. An arrayref containing hashrefs, each of which is attempted in order. In each hashref should be a regex keyed to C which will match in case you want do different comparisons on different types of data. Permitted values, other than coderefs, are 0 (e.g. C<< {re=>qr/\d/, 'c'=>0} >> means 1 and 9 are equivalent), -1 or 1 (meaning that if this token is discovered at the same location, $a or $b always wins - NB that this is to be avoided in sort functions).
306
307							C: Flip. if set to 1, then the result is reversed (-1 becomes 1 and vice versa but 0 stays the same).
308
309							C: Comparison to use for text which doesn't match a token. Default behaviour is to use a wrapper for C.
310
311							Below is an C function for sorting names of units in a workbook. You want them to appear in the order Welcome, Unit 1, Unit 2, ... Unit 11, but perl's C would put "Welcome" at the end and sorts "Unit 11" before "Unit 2". The normalisation is a hack to pretend 'Welcome' is equivalent to "Unit 0", and the tokenisation instructs that series of digits should be compared as numbers and not as strings, so 10 and 11 now sort after 9.
312
313							atomic_compare (
314							$a, $b,
315							{
316							n => sub{
317							$_[0] =~ s/^Welcome$/Unit 0/i;
318							lc $_[0];
319							},
320							t =>
321							[
322							{
323							re => qr/[0-9]+/,
324							c => sub { $_[0] <=> $_[1]; }
325							},
326							],
327							}
328							);
329
330
331							=head2 complex_compare
332
333							C allows a user to perform several successive comparisons, returning a value as soon as but only when a nonzero result is achieved. This is useful for situations such as:
334
335							=over
336
337							=item *
338
339							For user-facing sorting, such as dictionary headwords where "the Internet" should normally sort after "internet" and not after "theft".
340
341							=item *
342
343							For sorting which requires heavy processing only in some cases, e.g. an identifier C always sorts before C, whatever the numerical values, but to compare C and C an external resource (lookup table, AJAX data, etc.) must be consulted.
344
345							=item *
346
347							Where certain values have high priority, e.g. if you'd like to see the string 'undef' appear before the sting '0'.
348
349							=back
350
351							C is pretty much equivalent to C, but is potentially less confusing than using the C<\|\|> or C operators, and may be easier to code than repeating the C, C<$a>, C<$b>.
352
353							=head3 Function arguments
354
355							C: Prevents the function returning 0 immediately when the strings are identical. (The arguments are ordinarily tested for string equality at the beginning, in order to prevent unnecessary processing.) Setting this flag is only necessary if you have a condition which forces C<$a> or C<$b> to win for certain strings which are equal.
356
357							C: In , an arrayref containing a hashref equivalent to the third value in C. Each function performs a comparison which executes and returns 0, 1, or -1. If the result of any function is nonzero, that is the return value. If it is zero, the next function is tried.
358
359							complex_compare ($a, $b, {
360							functions =>
361							[
362							{
363							n => sub{lc $_[0];}, #is \&lc possible?
364							},
365							{
366							n => sub{$_[0] =~ s/^the\s//; $_[0];},
367							t =>
368							[
369							{ qr/[^[:alpha:]]/ => 0 },
370							],
371							f => 1,
372							c => sub { $_[0] cmp $ [1] },
373							},
374							]
375							})
376
377							=head2 naive_collate
378
379							Performs a collation on an arrayref. Provide a) the data, b) a comparison function which returns 0 if the two comparands are duplicates, and c) a merge function which takes the first and later duplicate.
380
381							NB: This is slow, use either C or C unless you have a good reason for using this function (e.g. your comparison function is unstable and doesn't function like C). To discourage casual use, it is not exported.
382
383							=head2 sorted_collate
384
385							Like C, but faster.
386
387							It is faster because rather than comparing every element against every other element that hasn't already been collated, it sorts once first, then compares against following elements until it finds one which doesn't match, then stops. The list is then returned to its original order (except without the duplicates).
388
389							=head2 schwartzian_collate
390
391							Like C, but uses a Schwartzian transform: after the comparison function, provide a normalisation function.
392
393							It is about as fast as the C, but can be several times faster when the normalisation function is complex.
394
395							=head1 TODO
396
397							=over
398
399							=item *
400
401							Add lots more dwimmery so that C expressions can be used wherever they're likely to be useful and coderefs can be substituted in for regexes where their function is to test.
402
403							=item *
404
405							Make priority lists a bit easier, e.g. by allowing regexes - or even strings - amongst the hashrefs in C.
406
407							=item *
408
409							Write some good, sensible examples for C.
410
411							=item *
412
413							Gather useful common comparison functions which can be imported, studied, borrowed, etc. and offer them as a module.
414
415							=item *
416
417							Write more test cases.
418
419							=item *
420
421							Possibly remove assumptions about the comparanda, i.e. permit comparison of objects, references, etc. (But then: how does tokenisation work? Maybe it only works if C<< n=>sub{$_[0]->toString} >>? Wouldn't we want to compare hashrefs and arrayrefs more intelligently?)
422
423							=item *
424
425							Figure out how to get C and C to work with Schwartzian transforms.
426
427							=back
428
429							=head1 BUGS
430
431							Please use the Github issues tracker.
432
433							=head1 LICENSE
434
435							Copyright 2011-2012 © Cambridge University Press. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
436
437							=cut