File Coverage

blib/lib/Sort/ArbBiLex.pm
Criterion Covered Total %
statement 148 148 100.0
branch 66 92 71.7
condition 7 21 33.3
subroutine 21 21 100.0
pod 2 7 28.5
total 244 289 84.4


line stmt bran cond sub pod time code
1              
2             # -*-Fundamental-*-
3             require 5; # Time-stamp: "2004-03-27 17:19:11 AST"
4             package Sort::ArbBiLex;
5 6     6   50348 use strict;
  6         14  
  6         531  
6 5     5   29 use vars qw(@ISA $Debug $VERSION);
  5         9  
  5         402  
7             $VERSION = "4.01";
8             $Debug = 0;
9 5     5   32 use Carp;
  5         13  
  5         351  
10 5     5   5318 use integer; # vroom vroom
  5         54  
  5         24  
11              
12 5 50   5   673 BEGIN { *UNICODE = eval('chr(256)') ? sub(){1} : sub(){0} }
13              
14             #POD at end
15             ###########################################################################
16              
17             sub import {
18 7     7   1514 my $class_name = shift(@_);
19 7         20 my $into = scalar caller;
20 7 100       6621 return unless @_;
21 4 50       78 croak "Argument list in 'use $class_name' must be list of pairs" if @_ % 2;
22 4         9 my($sym, $spec);
23 4         16 while(@_) {
24 4         20 ($sym, $spec) = splice(@_,0,2);
25 4 50       13 defined $sym or croak "Can't use undef as the name of a sub to make";
26 4 50       15 length $sym or croak "Can't use \"\" as the name of a sub to make";
27 4 50       11 defined $spec or croak "Can't use undef as a sort-order spec";
28 4 50       12 length $sym or croak "Can't use \"\" as a sort-order spec";
29 4 50 33     45 $sym = $into . '::' . $sym unless $sym =~ m/::/ or $sym =~ m/'/;
30 5     5   30 no strict 'refs';
  5         8  
  5         705  
31 4         13 *{$sym} = maker($spec);
  4         37  
32             }
33 4         7329 return;
34             }
35              
36             #--------------------------------------------------------------------------
37              
38             sub maker {
39 22     22 1 116 my $subr = eval(&source_maker(@_));
  2     2   14  
  2     2   4  
  2     2   10  
  2     2   111  
  2         5  
  2         885  
  2         1907  
  2         40  
  2         24  
  2         70  
  2         3  
  2         263  
40 46 100       151 die "Compile error <$@> in eval!?!" if $@; # shouldn't be possible!
41 37         113 return $subr;
42             }
43              
44             # Implementation note: I didn't /need/ to use eval(). I could just return
45             # an appropriate closure. But one can't do tr/$foo/$bar/ -- eval is the
46             # only way to get things to (so to speak) interpolate there; and the
47             # efficiency cost of requiring that Perl parse more code is offset by
48             # the efficiency benefit of being able to use tr/// (instead of s///) in
49             # appropriate cases.
50              
51             #--------------------------------------------------------------------------
52              
53             sub source_maker {
54 5     5   5365 no locale;
  5         1297  
  5         26  
55 26     18 1 12425 my($decl) = $_[0];
56 34 100       394 croak "usage: Sort::ArbBiLex::maker(DECLARATION). See the docs."
57             unless @_ == 1;
58              
59 34         49 my $one_level_mode = 0;
60 34         67 my @decl;
61 34 100       88 if(ref $decl) { # It's a rLoL declaration
62 24 100       77 croak "Sort order declaration must be a string or a listref"
63             unless ref($decl) eq 'ARRAY';
64 58 100       89 print "rLoL-decl mode\n" if $Debug > 1;
65             # Make @decl into a list of families
66 58         96 @decl = @$decl;
67             # and each one of the items in @decl must be a ref to a list of scalars
68 58         116 foreach my $f (@decl) {
69 116 50       221 croak "Each family must be a listref" unless ref($f) eq 'ARRAY';
70 116   33     1906 @$f = grep(defined($_) && length($_), @$f); # sanity
71 92         211 foreach my $g (@$f) { # more sanity.
72 170 50       503 croak "A reference found where a glyph was expected" if ref($g);
73             }
74             }
75              
76             } else { # It's a string-style declaration
77 35 50       137 print "string-decl mode\n" if $Debug > 1;
78             # Make @decl into a list of families
79 10 50       97 if($decl =~ /[\cm\cj\n]/) { # It contains majors and minors
80 18         103 @decl = grep /\S/, split( /[\cm\cj]+/, $decl );
81             } else { # It's all majors, on one line
82 16 0       99 print "Strangeness trap 1.\n" if $Debug;
83 14         67 @decl = grep /\S/, split( /\s+/, $decl );
84 20         195 $one_level_mode = 1;
85             }
86              
87             # Now turn @decl into a list of lists, where each element is a
88             # family -- i.e., a ref to a list of glyphs in that family.
89              
90 8 50       74 print "Glyph map:\n", map(" {<$_>}\n", @decl) if $Debug > 1;
91 36         52 foreach my $d (@decl) { # in place changing
92             #print " d $d -> ", map("<$_> ",grep($_ ne '',split(/\s+/, $d))), "\n";
93 40         85 $d = [ grep($_ ne '', split(/\s+/, $d)) ];
94             #print " d $d -> ", map("<$_> ", @$d), "\n";
95             }
96             }
97              
98 52         86 @decl = grep( scalar(@{$_}), @decl); # nix empty families
  114         182  
99 52 50       1672 croak "No glyphs in sort order declaration!?" unless @decl;
100              
101 18 100       50 @decl = map [$_], @{$decl[0]} if @decl == 1;
  6         55  
102             # Change it from a family of N glyphs into N families of one glyph each
103            
104             # Iterate thru the families and their glyphs and build the tables
105 18         29 my(@glyphs, @major_out, @minor_out);
106 18         23 my $max_glyph_length = 0;
107 18         23 my $max_family_length = 0;
108 18         42 my %seen;
109 18         23 my($glyph, $minor); # scratch
110 18         74 for (my $major = 0; $major < @decl; $major++) {
111 122 50       247 print "Family $major\n" if $Debug;
112 122         113 croak "Too many major glyphs" if !UNICODE and $major > 255;
113 24         35 $max_family_length = @{ $decl[$major] }
  122         283  
114 122 100       114 if @{ $decl[$major] } > $max_family_length;
115              
116 122         180 for ($minor = 0; $minor < @{ $decl[$major] }; $minor++) {
  262         800  
117 140         323 $glyph = $decl[$major][$minor];
118 140 50       363 print " Glyph ($major)\:$minor (", $glyph, ")\n" if $Debug;
119 140 50       415 croak "Glyph <$glyph> appears twice in the sort order declaration!"
120             if $seen{$glyph}++;
121 140         160 croak "Too many minor glyphs" if !UNICODE and $minor > 255;
122              
123 140 100       266 $max_glyph_length = length($glyph) if length($glyph) > $max_glyph_length;
124              
125 140         305 $glyph =~ s/([^a-zA-Z0-9])/_char2esc($1)/eg;
  22         51  
126 140         341 push @glyphs, $glyph;
127 140         240 push @major_out, _num2esc($major);
128 140         245 push @minor_out, _num2esc($minor);
129             # or unpack 'H2', pack 'C', 12 or unpack 'H2', chr 12; ?
130             }
131             }
132 18 50       54 die "Unexpected error: No glyphs?!?" if $max_glyph_length == 0; # sanity
133 18 100       53 $one_level_mode = 1 if $max_family_length == 1;
134              
135             #########################################################################
136             # Now start building the code.
137              
138 18         28 my($prelude, $coda, $code, $minor_code, $major_code);
139 18 100       40 if($max_glyph_length == 1) {
140             # All glyphs are single characters, so we can do this all with tr's
141 14         20 $prelude = "# Single character mode.";
142 14         20 $coda = '';
143 14         41 my $glyphs = join '', @glyphs;
144 14         56 my $major_out = join '', @major_out;
145 14         30 my $minor_out = join '', @minor_out;
146              
147 14         54 $minor_code = <<"EOMN"; # contents of a FOR block mapping $$x[0] => $$x[2]
148             \$x->[2] = \$x->[0];
149             \$x->[2] =~ tr[$glyphs][]cd;
150             \$x->[2] =~ tr[$glyphs]
151             [$minor_out];
152             EOMN
153              
154 14         68 $major_code = <<"EOMJ"; # expression returning a scalar as a major key
155             do { # major keymaker
156             my(\$key) = \$_;
157             \$key =~ tr[$glyphs][]cd;
158             \$key =~ tr[$glyphs]
159             [$major_out];
160             scalar(\$key);
161             }
162             EOMJ
163              
164             # End of single-glyph stuff.
165              
166             } else {
167             # There are glyphs over 2 characters long -- gotta use s's.
168             # End of multi-glyph stuff.
169 4         42 my $glyphs = join ',', map "\"$_\"", @glyphs;
170 4         42 my $major_out = join ',', map "\"$_\"", @major_out;
171 4         34 my $minor_out = join ',', map "\"$_\"", @minor_out;
172              
173 4 100       40 if(!$one_level_mode) {
174 2         11 $prelude = <<"EOPRELUDE";
175             { # Multi-character mode. So we need a closure for these variables.
176             my(\%major, \%minor);
177             \@major{$glyphs}
178             = ($major_out);
179             \@minor{$glyphs}
180             = ($minor_out);
181             my \$glyph_re = join "|", map(quotemeta,
182             sort {length(\$b) <=> length(\$a)} keys \%major);
183             # put the longest glyphs first
184             EOPRELUDE
185             } else { # Multi-character mode
186 2         9 $prelude = <<"EOPRELUDE2";
187             { # Multi-character mode. So we need a closure for these variables.
188             my(\%major); # just one-level mode, tho.
189             \@major{$glyphs}
190             = ($major_out);
191             my \$glyph_re = join "|", map(quotemeta,
192             sort {length(\$b) <=> length(\$a)} keys \%major);
193             # put the longest glyphs first
194             EOPRELUDE2
195             }
196 4         9 $coda = "} # end of closure.";
197              
198 4         6 $minor_code = <<"EOMN2"; # contents of a FOR block mapping $$x[0] => $$x[2]
199             \$x->[2] = join '',
200             map \$minor{\$_},
201             \$x->[0] =~ m<(\$glyph_re)>go;
202             EOMN2
203              
204 4         14 $major_code = <<"EOMJ2"; # expression returning a scalar as a major key
205             join('', map \$major{\$_}, m<(\$glyph_re)>go) # major keymaker
206             EOMJ2
207              
208             }
209              
210             ###
211             # Now finish cobbling the code together.
212              
213 18         152 my $now = scalar(gmtime);
214              
215 18 100       46 if(!$one_level_mode) { # 2-level mode
216 8         94 $code = <<"EOVOODOO";
217             \# Generated by Sort::ArbBiLex v$VERSION at $now GMT
218             $prelude
219             # Two-level mode
220             sub { # change that to "sub whatever {" to name this function
221             no locale; # we need the real 8-bit ASCIIbetical sort()
222             use strict;
223             return
224             # map sort map is the Schwartzian Transform. See perlfaq4.
225             map { \$_->[0] }
226             sort {
227             \$a->[1] cmp \$b->[1] ||
228             do {
229             foreach my \$x (\$a, \$b) {
230             if( !defined(\$x->[2]) and defined(\$x->[0]) ) {
231             $minor_code
232             }
233             }
234             \$a->[2] cmp \$b->[2]; # return value of this do-block
235             }
236             }
237             map { [ \$_,
238             $major_code
239             , undef
240             ]
241             }
242             \@_;
243             }
244             $coda
245              
246             EOVOODOO
247              
248             } else { # one-level mode
249              
250 10         67 $code = <<"EOVOODOO2";
251             \# Generated by Sort::ArbBiLex v$VERSION at $now GMT
252             $prelude
253             # One-level mode
254             sub { # change that to "sub whatever {" to name this function
255             no locale; # we need the real 8-bit ASCIIbetical sort()
256             use strict;
257             return
258             # map sort map is the Schwartzian Transform. See perlfaq4.
259             map { \$_->[0] }
260             sort { \$a->[1] cmp \$b->[1] }
261             map { [ \$_,
262             $major_code
263             ]
264             }
265             \@_;
266             }
267             $coda
268              
269             EOVOODOO2
270              
271             }
272              
273 18 50       49 print "\nCode to eval:\n", $code, "__ENDCODE__\n\n" if $Debug;
274              
275 18         767 return $code;
276             }
277              
278             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
279              
280             sub _char2esc {
281 22     22   46 my $in = ord( $_[0] );
282 22 100       73 return sprintf "\\x{%x}", $in if $in > 255;
283 18         103 return sprintf "\\x%02x", $in;
284             }
285              
286             sub _num2esc {
287 280     280   309 my $in = $_[0];
288 280 50       512 return sprintf "\\x{%x}", $in if $in > 255;
289 280         823 return sprintf "\\x%02x", $in;
290             }
291              
292             ###########################################################################
293              
294             # "cmp" returns -1, 0, or 1 depending on whether the left argument is
295             # stringwise less than, equal to, or greater than the right argument.
296              
297             sub xcmp {
298 8 50 33 8 0 47 carp "usage: xcmp(\\&sorter,$a,$b)" unless @_ and ref($_[0]);
299 8 100       55 return 0 if $_[1] eq $_[2]; # We have to trap this early.
300 6 100       181 return 1 if $_[1] ne ( $_[0]->($_[1], $_[2]) )[0];
301             # If they were switched when sorted, then the original-first was
302             # lexically GT than the original-second.
303 4 100       113 return -1 if $_[1] eq ( $_[0]->($_[2], $_[1]) )[0];
304             # If they were switched BACK when REVERSED and sorted, then the
305             # original-first was lexically LT than the original-second.
306 2         12 return 0;
307             # Otherwise they were lexically identical.
308             }
309              
310             # And two actually simpler ones:
311              
312             sub xlt {
313 14 50 33 14 0 9541 carp "usage: xlt(\\&sorter,$a,$b)" unless @_ and ref($_[0]);
314             #AKA: xcmp(@_) == -1;
315 14 100       50 return 0 if $_[1] eq $_[2]; # We have to trap this early.
316 12 100       417 return 1 if $_[1] eq ( $_[0]->($_[2], $_[1]) )[0];
317             # If they were switched BACK when REVERSED and sorted, then the
318             # original-first was lexically LT than the original-second.
319 8         46 return 0;
320             }
321              
322             sub xgt {
323 14 50 33 14 0 82 carp "usage: xgt(\\&sorter,$a,$b)" unless @_ and ref($_[0]);
324             #AKA: xcmp(@_) == -1;
325 14 100       44 return 0 if $_[1] eq $_[2]; # We have to trap this early.
326 12 100       371 return 1 if $_[1] ne ( $_[0]->($_[1], $_[2]) )[0];
327             # If they were switched when sorted, then the original-first was
328             # lexically GT than the original-second.
329 8         43 return 0;
330             }
331              
332             # And then two easy ones:
333              
334             sub xle {
335 8 50 33 8 0 656 carp "usage: xle(\\&sorter,$a,$b)" unless @_ and ref($_[0]);
336 8         18 !xgt(@_); #AKA: xcmp(@_) < 1;
337             }
338              
339             sub xge {
340 8 50 33 8 0 49 carp "usage: xge(\\&sorter,$a,$b)" unless @_ and ref($_[0]);
341 8         20 !xlt(@_); #AKA: xcmp(@_) > -1;
342             }
343              
344             ###########################################################################
345             1;
346              
347             __END__