File Coverage

blib/lib/WordLists/Sort.pm
Criterion Covered Total %
statement 147 164 89.6
branch 55 86 63.9
condition 24 57 42.1
subroutine 12 12 100.0
pod 5 6 83.3
total 243 325 74.7


line stmt bran cond sub pod time code
1             package WordLists::Sort;
2 3     3   89801 use utf8;
  3         5  
  3         22  
3 3     3   92 use strict;
  3         6  
  3         99  
4 3     3   18 use warnings;
  3         6  
  3         109  
5             require Exporter;
6 3     3   993 use WordLists::Base;
  3         6  
  3         6911  
7             our $VERSION = $WordLists::Base::VERSION;
8            
9             our @ISA = qw(Exporter);
10             our @EXPORT_OK = qw(
11             complex_compare
12             atomic_compare
13             sorted_collate
14             schwartzian_collate
15             );
16            
17             sub complex_compare
18             {
19 12     12 1 20 my $args = $_[2];
20 12 50 33     74 if ( (defined $args) && (ref $args eq ref {}) )
21             {
22 12 0 33     58 return 0 if (!$args->{'override_undef'} and !defined($_[0]) and !defined ($_[1])); # avoid excessive execution of code if possible
      33        
23 12 50 33     64 return (defined($_[0]) <=> defined ($_[1])) if (!$args->{'override_undef'} and (!defined($_[0]) or !defined ($_[1])) ); # avoid excessive execution of code if possible
      33        
24 12 100 66     71 return 0 if (!$args->{'override_eq'} and ($_[0] eq $_[1])); # avoid excessive execution of code if possible
25 11 50 33     66 if ( (defined $args->{'functions'}) && (ref $args->{'functions'} eq ref []) )
26             {
27 11         14 my @functions = @{$args->{'functions'}};
  11         31  
28 11         21 foreach (@functions)
29             {
30 25         56 my $r = atomic_compare($_[0], $_[1], $_);
31 25 100       127 return $r unless $r == 0;
32             }
33 0         0 return 0;
34             }
35             else
36             {
37 0         0 warn 'Expected: $a, $b, { functions => [...]}';
38 0 0       0 return $_[0] cmp $_[1] unless defined $args;
39             }
40             }
41             else
42             {
43 0 0       0 warn 'Expected: $a, $b, {...}' if defined $args;
44 0 0 0     0 return 0 if (!defined($_[0]) and !defined ($_[1])); # avoid excessive execution of code if possible
45 0 0 0     0 return (defined($_[0]) <=> defined ($_[1])) if ((!defined($_[0]) or !defined ($_[1])) ); # avoid excessive execution of code if possible
46 0         0 return $_[0] cmp $_[1];
47             }
48 0         0 return 0;
49             }
50             sub debug_compare
51             {
52 260 50 66 260 0 2178 if (defined $_[2] and $_[2])
53             {
54 0 0       0 print "\n". (' ' x $_[2]) . 'Comparing `' . (defined $_[0]? $_[0]:'') . '` and `' . (defined $_[1]? $_[1]:'') .'`';
    0          
55             }
56 260 50       646 if (defined $_[3])
57             {
58 0         0 print "-- Result = $_[3]";
59             }
60             }
61             sub atomic_compare
62             {
63 90     90 1 1949 my @s = ($_[0], $_[1]);
64 90         108 my $args = $_[2];
65 90 100 66     418 if ( (defined $args) && (ref $args eq ref {}) )
66             {
67 80 0 33     292 return 0 if (!$args->{'override_undef'} and !defined($_[0]) and !defined ($_[1])); # avoid excessive execution of code if possible
      33        
68 80 50 33     353 return (defined($_[0]) <=> defined ($_[1])) if (!$args->{'override_undef'} and (!defined($_[0]) or !defined ($_[1])) ); # avoid excessive execution of code if possible
      33        
69 80 100 66     2143 return 0 if (!$args->{'override_eq'} and ($_[0] eq $_[1])); # avoid excessive execution of code if possible if ( (defined $args) && (ref $args eq ref {}) )
70             my %arg = (
71 54     54   87 'c' => sub { $_[0] cmp $_[1]; },
72 75         366 't' => [],
73             'n' => [],
74             'd' => 0,
75 75         240 %{$args},
76             );
77 75         291 debug_compare ($s[0], $s[1], $args->{'d'});
78 75 100       260 if (ref $args->{'n'} ne ref [])
79             {
80 74         137 $arg{'n'} = [];
81 74 100   8   178 $args->{'n'} = sub {$_[0];} unless defined $args->{'n'};
  8         18  
82 74         138 $arg{'n'}[0] = $args->{'n'};
83 74         125 $arg{'n'}[1] = $args->{'n'};
84             }
85            
86             #push (@{$arg{'t'}}, {re=> qr/.+/, c=> $arg{'c'} }) unless defined${$arg{'t'}}[0];
87 75         98 my @t = (@{$arg{'t'}}, {re=> qr/./, c=> $arg{'c'} });
  75         482  
88 75         110 my @sToken;
89             my @sTokenType;
90 75         139 foreach my $i (0..1)
91             {
92 150         173 $s[$i] = &{ $arg{'n'}[$i] }($s[$i]);
  150         499  
93             do
94 150         254 {
95 386         1234 foreach (0..$#t)
96             {
97 563         818 my $re = $t[$_]{'re'};
98            
99 563 100       7058 if ($s[$i] =~ s/^($re)//)
100             {
101             #print "\n($1)$s[$i] matches $re";
102 386         497 push @{$sToken[$i]}, $1;
  386         1033  
103 386         427 push @{$sTokenType[$i]}, $_;
  386         629  
104 386         1433 last;
105             }
106             else
107             {
108             #print "\n$s[$i] doesn't match $re";
109             }
110             }
111             } until $s[$i] eq '';
112            
113             }
114 75         176 $arg{'d'}=$arg{'d'} * 2;
115 75 100       84 foreach ($#{$sTokenType[0]} >= $#{$sTokenType[1]} ? 0..$#{$sTokenType[0]} : 0..$#{$sTokenType[1]})
  75         124  
  75         169  
  62         127  
  13         34  
116             {
117 185         432 debug_compare ($sToken[0][$_], $sToken[1][$_], $arg{'d'});
118 185 100 66     739 if (defined $sTokenType[0][$_] and defined $sTokenType[1][$_])
    50          
    50          
119             {
120 174 100       330 if ($sTokenType[0][$_] == $sTokenType[1][$_])
121             {
122 172         264 my $c = $t[$sTokenType[0][$_]]{'c'};
123 172 100       309 if ((ref $c eq ref ''))
124             {
125             # todo: dwimmery code - dp - what dwimmery?
126 17 50       45 return $c unless $c ==0;
127             # return undef;
128             }
129             else
130             {
131 155         245 my $r = &{$c}($sToken[0][$_], $sToken[1][$_]);
  155         364  
132 155 100       920 return $r unless $r ==0;
133             }
134             }
135             else
136             {
137 2         24 return ($sTokenType[1][$_] <=> $sTokenType[0][$_]);
138             }
139             }
140             elsif (defined $sTokenType[0][$_])
141             {
142 0         0 return 1;
143             }
144             elsif (defined $sTokenType[1][$_])
145             {
146 11         131 return -1;
147             }
148             }
149             }
150             else
151             {
152 10 50       20 warn 'Expected: $a, $b, {...}' if defined $args;
153 10 100 66     38 return 0 if (!defined($_[0]) and !defined ($_[1])); # avoid excessive execution of code if possible
154 9 100 66     43 return (defined($_[0]) <=> defined ($_[1])) if ((!defined($_[0]) or !defined ($_[1])) ); # avoid excessive execution of code if possible
155 8         47 return $_[0] cmp $_[1];
156             }
157 41         175 return 0;
158             }
159            
160             sub sorted_collate # Sorted Collation - hopefully O n log (n)
161             {
162 4     4 1 15 my ( $aIn, $cmp, $merge) = @_;
163             # ^ + $self
164 4         12 my $iEnum=0;
165 4         12 my $aEnum = [map {[ $iEnum++ , $_]; } @$aIn];
  8         35  
166 4         29 my $aSorted = [sort {&{$cmp}($a->[1],$b->[1])} @$aEnum];
  5         20  
  5         16  
167            
168 4         32 for (my $i = 0; $i<$#{$aSorted}; $i++)
  8         32  
169             {
170 4 100       19 next unless defined $aSorted->[$i][1] ;
171 3         7 for (my $j = 1; $j<=$#{$aSorted}-$i; $j++)
  6         22  
172             {
173 4 50       18 if (defined $aSorted->[$i+$j][1])
174             {
175 4 100       89 if (0 == &{$cmp}($aSorted->[$i][1], $aSorted->[$i+$j][1]))
  4         13  
176             {
177 3         22 &{$merge}($aSorted->[$i][1], $aSorted->[$i+$j][1]);
  3         11  
178 3         20 $aSorted->[$i+$j][1] = undef;
179             }
180             else
181 1         9 { $i += $j - 1;
182 1         3 last; # last j === next i
183             }
184             }
185             }
186             }
187 4         10 return [map {$_->[1]} sort { $a->[0] <=> $b->[0] } grep { defined $_->[1] } @$aSorted];
  5         39  
  1         5  
  8         25  
188             }
189             sub schwartzian_collate # Schwartzian Collation - hopefully O n log (n), but less than sorted collation, if $norm is slow
190             {
191 4     4 1 16 my ( $aIn, $cmp, $norm, $merge) = @_;
192             # ^ + $self
193 4         13 my $iEnum=0;
194 4         13 my $aEnum;
195             my $aSorted;
196 4 50       19 if (defined $norm)
197             {
198 4         12 $aEnum = [map {[ $iEnum++ , $_, &{$norm}($_)]; } @$aIn];
  8         36  
  8         28  
199 4         43 $aSorted = [sort {&{$cmp}($a->[2],$b->[2])} @$aEnum];
  5         17  
  5         15  
200             }
201             else
202             {
203 0         0 $aEnum = [map {[ $iEnum++ , $_]; } @$aIn];
  0         0  
204 0         0 $aSorted = [sort {&{$cmp}($a->[1],$b->[1])} @$aEnum];
  0         0  
  0         0  
205             }
206 4         22 for (my $i = 0; $i<$#{$aSorted}; $i++)
  8         31  
207             {
208 4 100       15 next unless defined $aSorted->[$i][1] ;
209 3         7 for (my $j = 1; $j<=$#{$aSorted}-$i; $j++)
  6         24  
210             {
211 4 50       14 if (defined $aSorted->[$i+$j][1])
212             {
213 4 50       16 if (
    100          
214 4         171 (defined $norm) ?
215 0         0 ( 0 == &{$cmp}($aSorted->[$i][2], $aSorted->[$i+$j][2]) ) :
216             ( 0 == &{$cmp}($aSorted->[$i][1], $aSorted->[$i+$j][1]) )
217             )
218             {
219 3         18 &{$merge}($aSorted->[$i][1], $aSorted->[$i+$j][1]);
  3         10  
220 3         19 $aSorted->[$i+$j][1] = undef;
221             }
222             else
223 1         6 { $i += $j - 1;
224 1         42 last; # last j === next i
225             }
226             }
227             }
228             }
229 4         12 return [map {$_->[1]} sort { $a->[0] <=> $b->[0] } grep { defined $_->[1] } @$aSorted];
  5         43  
  1         4  
  8         22  
230             }
231             sub naive_collate # Naive Collation - probably O n**2
232             {
233 4     4 1 10 my ( $aIn, $cmp, $merge) = @_;
234             # ^ + $self
235 4         7 my $iEnum = 0;
236 4         13 my $aEnum = [map {[$iEnum++,$_]} @$aIn];
  8         28  
237 4         12 for (my $i = 0; $i<$#{$aEnum}; $i++)
  8         32  
238             {
239 4 100       17 next unless defined $aEnum->[$i][1] ;
240 3         6 for (my $j = 1; $j<=$#{$aEnum}-$i; $j++)
  7         32  
241             {
242 4 50       15 if (defined $aEnum->[$i+$j][1])
243             {
244 4 100       11 if (0 == &{$cmp}($aEnum->[$i][1], $aEnum->[$i+$j][1]))
  4         16  
245             {
246 3         26 &{$merge}($aEnum->[$i][1], $aEnum->[$i+$j][1]);
  3         9  
247 3         21 $aEnum->[$i+$j][1] = undef;
248             }
249             }
250             }
251             }
252 4         9 return [map {$_->[1]} grep { defined $_->[1] } @$aEnum];
  5         116  
  8         23  
253             }
254            
255             return 1;
256            
257             =pod
258            
259            
260             =head1 NAME
261            
262             WordLists::Sort
263            
264            
265             =head1 SYNOPSIS
266            
267             Provides a structure for comparison functions, generally for complex sort.
268            
269             # The following sorts "No6" "No.7" "no 8" in that order - ignoring punctuation.
270             @sorted = sort { atomic_compare (
271             $a,$b,{ n => sub{ $_[0]=~s/[^[:alnum:]]//g; lc $_[0]; } }
272             ) } @unsorted;
273            
274             # The following sorts A9 before A10.
275             @sorted = sort { atomic_compare (
276             $a,$b,{ t => [ { re => qr/[0-9]+/, c => sub { $_[0] <=> $_[1]; } }, ], } }
277             ) } @unsorted;
278            
279            
280             =head1 DESCRIPTION
281            
282             This is by far and away the most evil member of the L family (it's also pretty much unrelated to all the others). It is basically a terse way of writing complex comparison/sort functions as one liners (if you want to).
283            
284             The intention is to be able to sort by several different criteria, e.g. so "the UN" sorts after "un-" and before "unabashed", and/or so that "F\x{E9}" sorts after "Fe" but before "FE".
285            
286             Once you've written/cribbed a sort algorithm, it's easy to use - just put it in a subroutine and call it. (Actually, what you're writing is a comparison algrithm, which perl's C then calls).
287            
288             Writing it is a bit harder, though: the framework involves (potentially) anonymous coderefs sprinkled amidst the hashrefs - it's much easier with indentation.
289            
290            
291             =head1 FUNCTIONS
292            
293            
294             =head2 atomic_compare
295            
296             C: This provides most of the functionality in the module. It allows normalisation of the arguments, tokenisation so that different sections can be compared with different criteria, and, if so desired, flipping of the result.
297            
298             =head3 Function arguments
299            
300             C: Normalise. This should be a coderef. If present, runs the code on each argument before comparison.
301             Note that this only happens locally to the function, so lowercasing in functions[1]{n} will not prevent functions[2] putting VAT before vat.
302             (If you want to keep them, nest the original function in the c).
303             If C is an arrayref, it runs the first code on C<$a>, the second on C<$b>.
304            
305             C: Tokenize. An arrayref containing hashrefs, each of which is attempted in order. In each hashref should be a regex keyed to C which will match in case you want do different comparisons on different types of data. Permitted values, other than coderefs, are 0 (e.g. C<< {re=>qr/\d/, 'c'=>0} >> means 1 and 9 are equivalent), -1 or 1 (meaning that if this token is discovered at the same location, $a or $b always wins - NB that this is to be avoided in sort functions).
306            
307             C: Flip. if set to 1, then the result is reversed (-1 becomes 1 and vice versa but 0 stays the same).
308            
309             C: Comparison to use for text which doesn't match a token. Default behaviour is to use a wrapper for C.
310            
311             Below is an C function for sorting names of units in a workbook. You want them to appear in the order Welcome, Unit 1, Unit 2, ... Unit 11, but perl's C would put "Welcome" at the end and sorts "Unit 11" before "Unit 2". The normalisation is a hack to pretend 'Welcome' is equivalent to "Unit 0", and the tokenisation instructs that series of digits should be compared as numbers and not as strings, so 10 and 11 now sort after 9.
312            
313             atomic_compare (
314             $a, $b,
315             {
316             n => sub{
317             $_[0] =~ s/^Welcome$/Unit 0/i;
318             lc $_[0];
319             },
320             t =>
321             [
322             {
323             re => qr/[0-9]+/,
324             c => sub { $_[0] <=> $_[1]; }
325             },
326             ],
327             }
328             );
329            
330            
331             =head2 complex_compare
332            
333             C allows a user to perform several successive comparisons, returning a value as soon as but only when a nonzero result is achieved. This is useful for situations such as:
334            
335             =over
336            
337             =item *
338            
339             For user-facing sorting, such as dictionary headwords where "the Internet" should normally sort after "internet" and not after "theft".
340            
341             =item *
342            
343             For sorting which requires heavy processing only in some cases, e.g. an identifier C always sorts before C, whatever the numerical values, but to compare C and C an external resource (lookup table, AJAX data, etc.) must be consulted.
344            
345             =item *
346            
347             Where certain values have high priority, e.g. if you'd like to see the string 'undef' appear before the sting '0'.
348            
349             =back
350            
351             C is pretty much equivalent to C, but is potentially less confusing than using the C<||> or C operators, and may be easier to code than repeating the C, C<$a>, C<$b>.
352            
353             =head3 Function arguments
354            
355             C: Prevents the function returning 0 immediately when the strings are identical. (The arguments are ordinarily tested for string equality at the beginning, in order to prevent unnecessary processing.) Setting this flag is only necessary if you have a condition which forces C<$a> or C<$b> to win for certain strings which are equal.
356            
357             C: In , an arrayref containing a hashref equivalent to the third value in C. Each function performs a comparison which executes and returns 0, 1, or -1. If the result of any function is nonzero, that is the return value. If it is zero, the next function is tried.
358            
359             complex_compare ($a, $b, {
360             functions =>
361             [
362             {
363             n => sub{lc $_[0];}, #is \&lc possible?
364             },
365             {
366             n => sub{$_[0] =~ s/^the\s//; $_[0];},
367             t =>
368             [
369             { qr/[^[:alpha:]]/ => 0 },
370             ],
371             f => 1,
372             c => sub { $_[0] cmp $ [1] },
373             },
374             ]
375             })
376            
377             =head2 naive_collate
378            
379             Performs a collation on an arrayref. Provide a) the data, b) a comparison function which returns 0 if the two comparands are duplicates, and c) a merge function which takes the first and later duplicate.
380            
381             NB: This is slow, use either C or C unless you have a good reason for using this function (e.g. your comparison function is unstable and doesn't function like C). To discourage casual use, it is not exported.
382            
383             =head2 sorted_collate
384            
385             Like C, but faster.
386            
387             It is faster because rather than comparing every element against every other element that hasn't already been collated, it sorts once first, then compares against following elements until it finds one which doesn't match, then stops. The list is then returned to its original order (except without the duplicates).
388            
389             =head2 schwartzian_collate
390            
391             Like C, but uses a Schwartzian transform: after the comparison function, provide a normalisation function.
392            
393             It is about as fast as the C, but can be several times faster when the normalisation function is complex.
394            
395             =head1 TODO
396            
397             =over
398            
399             =item *
400            
401             Add lots more dwimmery so that C expressions can be used wherever they're likely to be useful and coderefs can be substituted in for regexes where their function is to test.
402            
403             =item *
404            
405             Make priority lists a bit easier, e.g. by allowing regexes - or even strings - amongst the hashrefs in C.
406            
407             =item *
408            
409             Write some good, sensible examples for C.
410            
411             =item *
412            
413             Gather useful common comparison functions which can be imported, studied, borrowed, etc. and offer them as a module.
414            
415             =item *
416            
417             Write more test cases.
418            
419             =item *
420            
421             Possibly remove assumptions about the comparanda, i.e. permit comparison of objects, references, etc. (But then: how does tokenisation work? Maybe it only works if C<< n=>sub{$_[0]->toString} >>? Wouldn't we want to compare hashrefs and arrayrefs more intelligently?)
422            
423             =item *
424            
425             Figure out how to get C and C to work with Schwartzian transforms.
426            
427             =back
428            
429             =head1 BUGS
430            
431             Please use the Github issues tracker.
432            
433             =head1 LICENSE
434            
435             Copyright 2011-2012 © Cambridge University Press. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
436            
437             =cut