File Coverage

blib/lib/Mock/Data/Charset.pm
Criterion Covered Total %
statement 312 374 83.4
branch 190 258 73.6
condition 78 139 56.1
subroutine 40 46 86.9
pod 16 16 100.0
total 636 833 76.3


line stmt bran cond sub pod time code
1             package Mock::Data::Charset;
2 9     9   252426 use strict;
  9         27  
  9         282  
3 9     9   46 use warnings;
  9         18  
  9         270  
4 9     9   1020 use Mock::Data::Util qw( _parse_context _escape_str );
  9         22  
  9         59717  
5             require Carp;
6             our @CARP_NOT= ('Mock::Data::Util');
7             require Mock::Data::Generator;
8             our @ISA= ( 'Mock::Data::Generator' );
9              
10             # ABSTRACT: Generator of strings from a set of characters
11             our $VERSION = '0.02'; # VERSION
12              
13              
14             our @generator_attrs= qw( str_len min_codepoint max_codepoint );
15              
16             sub new {
17 45     45 1 132949 my $class= shift;
18 45         85 my (%self, %parse);
19             # make the common case fast
20 45 100 100     202 if (@_ == 1 && !ref $_[0]) {
21 7         653 qr/[$_[0]]/;
22 7         200 %self= ( notation => $_[0] );
23 7 50       18 if (ref $class) {
24 0   0     0 $self{generator_opts} ||= { %{ $class->{generator_opts} } };
  0         0  
25 0   0     0 $self{max_codepoint} //= $class->{max_codepoint};
26 0         0 $class= ref $class;
27             }
28 7         47 return bless \%self, $class;
29             }
30              
31 38 100       146 %self= @_ != 1? @_ : %{$_[0]};
  14         50  
32              
33             # Look for fields from the parser
34 38 100       227 $parse{classes}= delete $self{classes} if defined $self{classes};
35 38 100       106 $parse{codepoints}= delete $self{codepoints} if defined $self{codepoints};
36 38 50       92 $parse{codepoint_ranges}= delete $self{codepoint_ranges} if defined $self{codepoint_ranges};
37 38 50       93 $parse{negate}= delete $self{negate} if defined $self{negate};
38 38 100       86 if (defined $self{chars}) {
39 3         6 push @{$parse{codepoints}}, map ord, @{$self{chars}};
  3         8  
  3         11  
40 3         7 delete $self{chars};
41             }
42 38 50       92 if (defined $self{ranges}) {
43 0         0 push @{$parse{codepoint_ranges}},
44             map +( ref $_? ( ord $_->[0], ord $_->[1] ) : ord ),
45 0 0       0 @{$self{ranges}};
  0         0  
46 0         0 delete $self{ranges};
47             }
48              
49             # If called on an object, carry over some settings
50 38 50       91 if (ref $class) {
51 0 0 0     0 if (!keys %parse && !defined $self{notation} && !$self{members} && !$self{member_invlist}) {
      0        
      0        
52             @self{'_parse','notation','members','member_invlist'}=
53 0         0 @{$class}{'_parse','notation','members','member_invlist'};
  0         0  
54             }
55 0   0     0 $self{$_} //= $class->{$_} for @generator_attrs;
56 0         0 $class= ref $class;
57             }
58              
59 38 100 66     181 if (defined $self{notation} && !keys %parse) {
    100          
60             # want to trigger the syntax error exception now, not lazily later on
61 21         431 qr/[$self{notation}]/;
62             }
63             elsif (keys %parse) {
64 11         27 $self{_parse}= \%parse;
65             Carp::croak("Charset-building options (classes, chars, codepoints, ranges, codepoint_ranges, negate)"
66             ." cannot be combined with members, member_invlist or notation attributes")
67 11 50 33     96 if $self{members} or $self{member_invlist}; # allow notation to preserve original text
68             }
69             else {
70             # At least one of members, member_invlist, notation, or _parse must be specified
71             Carp::croak("Require at least one of members, member_invlist, notation, or charset-building options")
72 6 50 33     27 unless $self{members} or $self{member_invlist};
73             }
74            
75 38         2135 return bless \%self, $class;
76             }
77              
78             sub _parse {
79             # If the '_parse' wasn't initialized, it can be derived from members or member_invlist or notation
80 26 100   26   98 $_[0]{_parse} || do {
81 15         28 my $self= shift;
82 15 50       36 if (defined $self->{notation}) {
    0          
    0          
83 15         52 $self->{_parse}= $self->parse($self->{notation});
84             }
85             elsif ($self->{members}) {
86 0         0 $self->{_parse}{codepoints}= [ map ord, @{$self->{members}} ];
  0         0  
87             }
88             elsif (my $inv= $self->{member_invlist}) {
89 0         0 my $i;
90 0         0 for ($i= 0; $i < $#$inv; $i+= 2) {
91 0 0       0 if ($inv->[$i] + 1 == $inv->[$i+1]) { push @{$self->{_parse}{codepoints}}, $inv->[$i] }
  0         0  
  0         0  
92 0         0 else { push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], $inv->[$i+1] - 1; }
  0         0  
93             }
94 0 0       0 if ($i == $#$inv) {
95 0   0     0 push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], ($self->max_codepoint || 0x10FFFF);
  0         0  
96             }
97             }
98 0         0 else { die "Unhandled lazy-build scenario" }
99 15         53 $self->{_parse};
100             };
101             }
102              
103              
104             sub notation {
105 14   66 14 1 1286 $_[0]{notation} //= _deparse_charset($_[0]->_parse);
106             }
107              
108              
109             sub min_codepoint {
110 112 50   112 1 202 $_[0]{min_codepoint}= $_[1] if @_ > 1;
111             $_[0]{min_codepoint}
112 112         243 }
113             sub max_codepoint {
114             $_[0]{max_codepoint}
115 146     146 1 288 }
116              
117              
118             sub str_len {
119 112 50   112 1 231 $_[0]{str_len}= $_[1] if @_ > 1;
120 112         293 $_[0]{str_len};
121             }
122              
123              
124             sub count {
125 861 100   861 1 296991 $_[0]{members}? scalar @{$_[0]{members}}
  83         179  
126             : $_[0]->_invlist_index->[-1];
127             }
128              
129              
130             sub members {
131 103   66 103 1 313 $_[0]{members} ||= $_[0]->_build_members;
132             }
133              
134             sub _build_members {
135 20     20   34 my $self= shift;
136 20         42 my $invlist= $self->member_invlist;
137 20         33 my @members;
138 20 50       55 if (@$invlist > 1) {
139             push @members, map chr, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1)
140 20         472 for 0 .. (($#$invlist-1)>>1);
141             }
142             # an odd number of elements means the list ends with an "include-all"
143 20 50       63 push @members, map chr, $invlist->[-1] .. 0x10FFFF
144             if 1 & @$invlist;
145 20         75 return \@members;
146             }
147              
148             sub Mock::Data::Charset::Util::expand_invlist {
149 6     6   10263 my $invlist= shift;
150 6         11 my @members;
151 6 100       19 if (@$invlist > 1) {
152             push @members, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1)
153 5         32 for 0 .. (($#$invlist-1)>>1);
154             }
155             # an odd number of elements means the list ends with an "include-all"
156 6 100       19 push @members, $invlist->[-1] .. 0x10FFFF
157             if 1 & @$invlist;
158 6         14 return \@members;
159             }
160              
161             # The index is private because there's not a good way to explain it to the user
162             sub _invlist_index {
163 2329     2329   3427 my $self= shift;
164 2329   66     6623 $self->{_invlist_index} ||= Mock::Data::Charset::Util::create_invlist_index($self->member_invlist);
165             }
166              
167             sub Mock::Data::Charset::Util::create_invlist_index {
168 24     24   5012 my $invlist= shift;
169 24         59 my $n_spans= (@$invlist + 1) >> 1;
170 24         32 my @index;
171 24         87 $#index= $n_spans-1;
172 24         39 my $total= 0;
173             $index[$_]= $total += $invlist->[$_*2+1] - $invlist->[$_*2]
174 24         312 for 0 .. (@$invlist >> 1)-1;
175 24 100       83 if (@$invlist & 1) { # In the case that the final range is infinite
176 6         19 $index[$n_spans-1]= $total + 0x110000 - $invlist->[-1];
177             }
178 24         117 \@index;
179             }
180              
181              
182             sub member_invlist {
183 1605 50   1605 1 3336 if (@_ > 1) {
184 0         0 $_[0]{member_invlist}= $_[1];
185 0         0 delete $_[0]{_invlist_index};
186 0         0 delete $_[0]{members};
187 0         0 delete $_[0]{notation};
188             }
189 1605   66     4013 $_[0]{member_invlist} //= _build_member_invlist(@_);
190             }
191              
192             sub _build_member_invlist {
193 34     34   63 my $self= shift;
194 34         83 my $max_codepoint= $self->max_codepoint;
195             # If the search space is small, and there is already a regex notation, it is probably faster
196             # to iterate and let perl do the work than to parse the charset.
197 34         51 my $invlist;
198 34 100 66     216 if (!defined $max_codepoint || $max_codepoint > 1000 || !defined $self->{notation}) {
      100        
199 25   100     110 $max_codepoint ||= 0x10FFFF;
200 25         45 $invlist= eval {
201 25         65 _parsed_charset_to_invlist($self->_parse, $max_codepoint);
202             }# or main::diag $@
203             }
204 34   66     169 $invlist ||= _charset_invlist_brute_force($self->notation, $max_codepoint);
205             # If a user writes to the invlist, it will become out of sync with the Index,
206             # leading to confusing bugs.
207 34 50       241 if (Internals->can('SvREADONLY')) {
208 34         446 Internals::SvREADONLY($_,1) for @$invlist;
209 34         80 Internals::SvREADONLY(@$invlist,1);
210             }
211 34         171 return $invlist;
212             }
213              
214             # Lazy-built string of all basic-multilingual-plane characters
215             our $_ascii_chars;
216             our $_unicode_chars;
217             sub _build_unicode_chars {
218 1 50   1   5 unless (defined $_unicode_chars) {
219             # Construct ranges of valid characters separated by NUL.
220             # Older perls die when the regex engine encounters an invalid character
221             # but newer perls just treat the invalid character as "not a member",
222             # unless the set is a negation in which case non-characters *are* a member.
223             # This makes the assumption that if a non-char isn't a member then \0 won't
224             # be either.
225 1         3 $_unicode_chars= '';
226 1         10820 $_unicode_chars .= chr($_) for 0 .. 0xD7FF;
227 1         12 $_unicode_chars .= "\0";
228 1         109 $_unicode_chars .= chr($_) for 0xFDF0 .. 0xFFFD;
229 1         4 for (1..16) {
230 16         177 $_unicode_chars .= "\0";
231 16         215436 $_unicode_chars .= chr($_) for ($_<<16) .. (($_<<16)|0xFFFD);
232             }
233             }
234 1         9 \$_unicode_chars;
235             }
236              
237             sub _charset_invlist_brute_force {
238 9     9   26 my ($set, $max_codepoint)= @_;
239 9 100       41 my $inv= (ord $set == ord '^')? substr($set,1) : '^'.$set;
240 9         13 my @invlist;
241            
242             # optimize common case
243 9 100       28 if ($max_codepoint < 256) {
244             # Find first character of every match and first character of every non-match
245             # and convert to codepoints.
246 8 100 66     438 @invlist= map +(defined $_? ord($_) : ()),
247             ($_ascii_chars //= join('', map chr($_), 0..255))
248             =~ / ( [$set] ) (?> [$set]* ) (?: \z | ( [$inv] ) (?> [$inv]* ) )/gx;
249             }
250             else {
251 1 50       7 _build_unicode_chars() unless defined $_unicode_chars;
252             # This got complicated while trying to support perls that can't match against non-characters.
253             # The non-characters have been replaced by NULs, so need to capture the char before and after
254             # each transition in case one of them is a NUL.
255 1 50       5924 my @endpoints=
256             ($max_codepoint < 0x10FFFF? substr($_unicode_chars,0,$max_codepoint+1) : $_unicode_chars)
257             =~ /( [$set] ) ( [$set] )* ( \z | [$inv] ) ( [$inv] )* /gx;
258 1 50       9 if (@endpoints) {
259             # List is a multiple of 4 elements: (first-member,last-member,first-non-member,last-non-member)
260             # We're not interested in the span of non-members at the end, so just remove those.
261 1         2 pop @endpoints; pop @endpoints;
  1         3  
262             # Iterate every transition of member/nonmember, and use the second character if present
263             # and isn't a NUL, else use the first character and add 1.
264 1         7 push @invlist, ord $endpoints[0];
265 1         6 for (my $i= 1; $i < @endpoints; $i+= 2) {
266 19 100 66     64 if (defined $endpoints[$i+1] && ord $endpoints[$i+1]) {
    50          
267 18         41 push @invlist, ord $endpoints[$i+1];
268             } elsif (defined $endpoints[$i]) {
269 1         5 push @invlist, 1 + ord $endpoints[$i];
270             } else {
271 0         0 push @invlist, 1 + $invlist[-1];
272             }
273             }
274             # substr is an estimate, because string skips characters, so remove any spurrous
275             # codepoints beyond the max
276 1   66     12 pop @invlist while @invlist && $invlist[-1] > $max_codepoint;
277             }
278             }
279             # If an "infinite" range would be returned, but the user set a maximum codepoint,
280             # list the max codepoint as the end of the invlist.
281 9 100 66     916 if ($max_codepoint < 0x10FFFF and 1 & @invlist) {
282 4         11 push @invlist, $max_codepoint+1;
283             }
284 9         41 return \@invlist;
285             }
286              
287             sub _parsed_charset_to_invlist {
288 25     25   92 my ($parse, $max_codepoint)= @_;
289 25         41 my @invlists;
290             # convert the character list into an inversion list
291 25 100       72 if (defined (my $cp= $parse->{codepoints})) {
292 5         22 my @chars= sort { $a <=> $b } @$cp;
  6         18  
293 5         13 my @invlist= (shift @chars);
294 5         11 push @invlist, $invlist[0] + 1;
295 5         19 for (my $i= 0; $i <= $#chars; $i++) {
296             # If the next char is adjacent, extend the span
297 5 100       20 if ($invlist[-1] == $chars[$i]) {
298 2         7 ++$invlist[-1];
299             } else {
300 3         9 push @invlist, $chars[$i], $chars[$i]+1;
301             }
302             }
303 5         16 push @invlists, \@invlist;
304             }
305             # Each range is an inversion list already
306 25 100       67 if (my $r= $parse->{codepoint_ranges}) {
307 9         32 for (my $i= 0; $i < (@$r >> 1); $i++) {
308 10         32 my ($start, $limit)= ($r->[$i*2], $r->[$i*2+1]+1);
309             # Try to combine the range with the most recent inversion list, if possible,
310 10 100 66     54 if (@invlists && $invlists[-1][-1] < $start) {
    50 33        
311 1         4 push @{ $invlists[-1] }, $start, $limit;
  1         5  
312             } elsif (@invlists && $invlists[-1][0] > $limit) {
313 0         0 unshift @{ $invlists[-1] }, $start, $limit;
  0         0  
314             } else {
315             # else just start a new inversion list
316 9         34 push @invlists, [ $start, $limit ]
317             }
318             }
319             }
320             # Convert each character class to an inversion list.
321 25 100       59 if ($parse->{classes}) {
322             push @invlists, _class_invlist($_)
323 11         35 for @{ $parse->{classes} };
  11         44  
324             }
325 25         2417 my $invlist= Mock::Data::Charset::Util::merge_invlists(\@invlists, $max_codepoint);
326             # Perform negation of inversion list by either starting at char 0 or removing char 0
327 25 100       74 if ($parse->{negate}) {
328 1 50       4 if ($invlist->[0]) { unshift @$invlist, 0 }
  1         4  
329 0         0 else { shift @$invlist; }
330             }
331 25         140 return $invlist;
332             }
333              
334              
335             our $_compile;
336             sub compile {
337 3     3 1 20 local $_compile= 1;
338 3         12 shift->generate(@_);
339             }
340             sub generate {
341 112     112 1 207 my ($self, $mock)= (shift, shift);
342 112         230 my ($len, $cp_min, $cp_max, $member_count)
343             = ($self->str_len, $self->min_codepoint, $self->max_codepoint, $self->count);
344 112 100       245 if (@_) {
345 82 100       187 my %opts= ref $_[0] eq 'HASH'? %{ shift() } : ();
  79         302  
346 82 100 33     211 $len= @_? shift : $opts{str_len} // $opts{len} // $opts{size}; # allow some aliases for length
      33        
347 82   66     243 $cp_min= $opts{min_codepoint} // $cp_min;
348 82   66     241 $cp_max= $opts{max_codepoint} // $cp_max;
349             }
350 112 100 100     415 my ($memb_min, $memb_span)= !defined $cp_min && !defined $cp_max? (0,$member_count)
351             : $self->_codepoint_minmax_to_member_range($cp_min, $cp_max);
352              
353             # If compiling, $len will be a function, else it will be an integer
354 0     0   0 $len= !defined $len? ($_compile? sub { 1 } : 1 )
355 0     0   0 : !ref $len? ($_compile? sub { $len } : $len)
356             : ref $len eq 'ARRAY'? (
357 0     0   0 $_compile? sub { $len->[0] + int rand ($len->[1] - $len->[0]) }
358 112 50       353 : $len->[0] + int rand ($len->[1] - $len->[0])
    50          
    50          
    100          
    50          
    100          
    100          
    100          
359             )
360             : ref $len eq 'CODE'? ($_compile? $len : $len->($mock))
361             : Carp::croak("Unknown str_len specification '$len'");
362              
363             # If member list is small-ish, use faster direct array access
364 112 100 100     367 if ($self->{members} || $member_count < 500) {
365 96         187 my $members= $self->members;
366             return sub {
367 3     3   9 my $buf= '';
368             $buf .= $members->[$memb_min + int rand $memb_span]
369 3         10 for 1..$len->($_[0]);
370 3         35 return $buf;
371 96 100       218 } if $_compile;
372 93         150 my $buf= '';
373             $buf .= $members->[$memb_min + int rand $memb_span]
374 93         433 for 1..$len;
375 93         402 return $buf;
376             }
377             else {
378 16         34 my $invlist= $self->member_invlist;
379 16         27 my $index= $self->_invlist_index;
380             return sub {
381 0     0   0 my $ret= '';
382             $ret .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index)
383 0         0 for 1..$len->($_[0]);
384 16 50       38 } if $_compile;
385 16         27 my $buf= '';
386             $buf .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index)
387 16         62 for 1..$len;
388 16         66 return $buf;
389             }
390             }
391              
392             sub _codepoint_minmax_to_member_range {
393 27     27   42 my $self= shift;
394 27         48 my ($cp_min, $cp_max)= @_;
395             my $memb_min= !defined $cp_min? 0
396 27 100       53 : do {
397 6         15 my ($at, $ins)= _find_invlist_element($cp_min, $self->member_invlist, $self->_invlist_index);
398 6   33     19 $at // $ins
399             };
400             my $memb_lim= !defined $cp_max? $self->count
401 27 50       50 : do {
402 27         57 my ($at, $ins)= _find_invlist_element($cp_max, $self->member_invlist, $self->_invlist_index);
403 27 100       68 defined $at? $at + 1 : $ins;
404             };
405 27         65 return ($memb_min, $memb_lim-$memb_min);
406             }
407              
408              
409             sub parse {
410 31     31 1 22332 my ($self, $notation)= @_;
411 31 50       100 return { codepoints => [] } unless length $notation;
412 31 50       123 return { classes => ['All'] } if $notation eq '^';
413 31         75 $notation .= ']';
414             # parse function needs $_ to be the input string
415 31         98 pos($notation)= 0;
416 31         140 return _parse_charset() for $notation;
417             }
418              
419             our $have_prop_invlist;
420             our %_parse_charset_backslash= (
421             a => ord "\a",
422             b => ord "\b",
423             c => sub { die "Unimplemented: \\c" },
424             d => sub { push @{$_[0]{classes}}, 'digit'; undef; },
425             D => sub { push @{$_[0]{classes}}, '^digit'; undef; },
426             e => ord "\e",
427             f => ord "\f",
428             h => sub { push @{$_[0]{classes}}, 'horizspace'; undef; },
429             H => sub { push @{$_[0]{classes}}, '^horizspace'; undef; },
430             n => ord "\n",
431             N => \&_parse_charset_namedchar,
432             o => \&_parse_charset_oct,
433             p => \&_parse_charset_classname,
434             P => sub { _parse_charset_classname(shift, 1) },
435             r => ord "\r",
436             s => sub { push @{$_[0]{classes}}, 'space'; undef; },
437             S => sub { push @{$_[0]{classes}}, '^space'; undef; },
438             t => ord "\t",
439             v => sub { push @{$_[0]{classes}}, 'vertspace'; undef; },
440             V => sub { push @{$_[0]{classes}}, '^vertspace'; undef; },
441             w => sub { push @{$_[0]{classes}}, 'word'; undef; },
442             W => sub { push @{$_[0]{classes}}, '^word'; undef; },
443             x => \&_parse_charset_hex,
444             0 => \&_parse_charset_oct,
445             1 => \&_parse_charset_oct,
446             2 => \&_parse_charset_oct,
447             3 => \&_parse_charset_oct,
448             4 => \&_parse_charset_oct,
449             5 => \&_parse_charset_oct,
450             6 => \&_parse_charset_oct,
451             7 => \&_parse_charset_oct,
452             8 => \&_parse_charset_oct,
453             9 => \&_parse_charset_oct,
454             );
455             our %_class_invlist_cache= (
456             'Any' => [ 0 ],
457             '\\N' => [ 0, ord("\n"), 1+ord("\n") ],
458             );
459             sub _class_invlist {
460 22     22   10809 my $class= shift;
461 22 100       51 if (ord $class == ord '^') {
462 5         25 return Mock::Data::Charset::Util::negate_invlist(
463             _class_invlist(substr($class,1))
464             );
465             }
466 17   66     82 return $_class_invlist_cache{$class} ||= do {
467 8 100       26 $have_prop_invlist= do { require Unicode::UCD; !!Unicode::UCD->can('prop_invlist') }
  2         2673  
  2         46089  
468             unless defined $have_prop_invlist;
469 8 50       41 $have_prop_invlist? [ Unicode::UCD::prop_invlist($class) ]
470             : _charset_invlist_brute_force("\\p{$class}", 0x10FFFF);
471             };
472             }
473             sub _parse_charset_hex {
474 3 50   3   17 /\G( [0-9A-Fa-f]{2} | \{ ([0-9A-Fa-f]+) \} )/gcx
475             or die "Invalid hex escape at "._parse_context;
476 3 100       18 return hex(defined $2? $2 : $1);
477             }
478             sub _parse_charset_oct {
479 5     5   17 --pos; # The caller ate one of the characters we need to parse
480 5 50       31 /\G( [0-7]{3} | 0 | o\{ ([0-7]+) \} ) /gcx
481             or die "Invalid octal escape at "._parse_context;
482 5 100       30 return oct(defined $2? $2 : $1);
483             }
484             sub _parse_charset_namedchar {
485 2     2   1245 require charnames;
486 2 50       12322 /\G \{ ([^}]+) \} /gcx
487             # or die "Invalid named char following \\N at '".substr($_,pos,10)."'";
488             and return charnames::vianame($1);
489             # Plain "\N" means every character except \n
490 0         0 push @{ $_[0]{classes} }, '\\N';
  0         0  
491 0         0 return;
492             }
493             sub _parse_charset_classname {
494 7     7   20 my ($result, $negate)= @_;
495 7 50       39 /\G \{ ([^}]+) \} /gcx
496             or die "Invalid class name following \\p at "._parse_context;
497 7 100       14 push @{$result->{classes}}, lc($negate? "^$1" : $1);
  7         39  
498             undef
499 7         18 }
500             sub _parse_charset {
501 36     36   68 my $flags= shift;
502             # argument is in $_, starting from pos($_)
503 36         65 my %parse;
504             my @range;
505 36         109 $parse{codepoints}= \my @chars;
506 36 100       139 $parse{negate}= 1 if /\G \^ /gcx;
507 36 50       97 if (/\G]/gc) { push @chars, ord ']' }
  0         0  
508 36         64 while (1) {
509 121         155 my $cp; # literal codepoint to be added
510             # Check for special cases
511 121 100 50     490 if (/\G ( \\ | - | \[: | \] ) /gcx) {
    50 66        
      33        
512 78 100       288 if ($1 eq '\\') {
    100          
    100          
513 19 50       66 /\G(.)/gc or die "Unexpected end of input";
514 19   33     76 $cp= $_parse_charset_backslash{$1} || ord $1;
515 19 100       99 $cp= $cp->(\%parse)
516             if ref $cp;
517             }
518             elsif ($1 eq '-') {
519 18 100       48 if (@range == 1) {
520 17         31 push @range, ord '-';
521 17         38 next;
522             }
523             else {
524 1         4 $cp= ord '-';
525             }
526             }
527             elsif ($1 eq '[:') {
528 5 50       34 /\G ( [^:]+ ) :] /gcx
529             or die "Invalid character class at "._parse_context;
530 5         12 push @{$parse{classes}}, $1;
  5         24  
531             }
532             else {
533 36         112 last; # $1 eq ']';
534             }
535             }
536             elsif ($flags && ($flags->{x}||0) >= 2 && /\G[ \t]/gc) {
537 0         0 next; # ignore space and tab under /xx
538             }
539             else {
540 43 50       130 /\G(.)/gc or die "Unexpected end of input";
541 43         81 $cp= ord $1;
542             }
543             # If no single character was found, any range-in-progress needs converted to
544             # charcters
545 68 100       13418 if (!defined $cp) {
    100          
    100          
546 16         34 push @chars, @range;
547 16         25 @range= ();
548             }
549             # At this point, $cp will contain the next ordinal of the character to include,
550             # but it might also be starting or finishing a range.
551             elsif (@range == 1) {
552 8         18 push @chars, $range[0];
553 8         16 $range[0]= $cp;
554             }
555             elsif (@range == 2) {
556 16         22 push @{$parse{codepoint_ranges}}, $range[0], $cp;
  16         46  
557 16         31 @range= ();
558             }
559             else {
560 28         62 push @range, $cp;
561             }
562             #printf "# pos %d cp %d range %s %s include %s\n", pos $_, $cp, $range[0] // '(null)', $range[1] // '(null)', join(',', @include);
563             }
564 36         68 push @chars, @range;
565 36 100       89 if (@chars) {
566 13         65 @chars= sort { $a <=> $b } @chars;
  11         31  
567             } else {
568 23         48 delete $parse{codepoints};
569             }
570 36         187 return \%parse;
571             }
572              
573             sub _ord_to_safe_regex_char {
574 3 0   3   33 return chr($_[0]) =~ /[\w]/? chr $_[0]
    50          
575             : $_[0] <= 0xFF? sprintf('\x%02X',$_[0])
576             : sprintf('\x{%X}',$_[0])
577             }
578              
579             sub _deparse_charset {
580 1     1   2 my $parse= shift;
581 1         3 my $str= '';
582 1 50       5 if (my $cp= $parse->{codepoints}) {
583             $str .= _ord_to_safe_regex_char($_)
584 1         5 for @$cp;
585             }
586 1 50       4 if (my $r= $parse->{codepoint_ranges}) {
587 0         0 for (my $i= 0; $i < (@$r << 1); $i++) {
588 0         0 $str .= _ord_to_safe_regex_char($r->[$i*2]) . '-' . _ord_to_safe_regex_char($r->[$i*2+1]);
589             }
590             }
591 1 50       5 if (my $cl= $parse->{classes}) {
592             # TODO: reverse conversions to \h \v etc.
593 0         0 for (@$cl) {
594 0 0       0 $str .= $_ eq '\N'? '\0-\x09\x0B-\x{10FFFF}'
    0          
595             : ord == ord '^'? '\P{'.substr($_,1).'}'
596             : '\p{'.$_.'}';
597             }
598             }
599 1         6 return $str;
600             }
601              
602              
603             sub get_member {
604 749 100   749 1 3564 $_[0]{members}? $_[0]{members}[$_[1]]
605             : chr _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index);
606             }
607              
608             sub get_member_codepoint {
609 6 50   6 1 2687 $_[0]{members}? ord $_[0]{members}[$_[1]]
610             : _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index);
611             }
612              
613             sub _get_invlist_element {
614 1108     1108   1977 my ($ofs, $invlist, $invlist_index)= @_;
615 1108 50       2112 $ofs += @$invlist_index if $ofs < 0;
616 1108 50 33     3944 return undef if $ofs >= $invlist_index->[-1] || $ofs < 0;
617             # Binary Search to find the range that contains this numbered element
618 1108         2072 my ($min, $max, $mid)= (0, $#$invlist_index);
619 1108         1594 while (1) {
620 5772         7211 $mid= ($min+$max) >> 1;
621 5772 100 100     13656 if ($ofs >= $invlist_index->[$mid]) {
    100          
622 2758         3469 $min= $mid+1
623             }
624             elsif ($mid > 0 && $ofs < $invlist_index->[$mid-1]) {
625 1906         2594 $max= $mid-1
626             }
627             else {
628 1108 100       2127 $ofs -= $invlist_index->[$mid-1] if $mid > 0;
629 1108         4343 return $invlist->[$mid*2] + $ofs;
630             }
631             }
632             }
633              
634              
635             sub find_member {
636 754     754 1 2673 my ($self, $char)= @_;
637 754         1509 return _find_invlist_element(ord $char, $self->member_invlist, $self->_invlist_index);
638             }
639              
640             sub _find_invlist_element {
641 787     787   1491 my ($codepoint, $invlist, $index)= @_;
642             # Binary Search to find the range that contains this numbered element
643 787         1389 my ($min, $max, $mid)= (0, $#$invlist);
644 787         1193 while (1) {
645 5739         6960 $mid= ($min+$max) >> 1;
646 5739 100 100     18815 if ($mid > 0 && $codepoint < $invlist->[$mid]) {
    100 100        
647 2331         3019 $max= $mid-1
648             }
649             elsif ($mid < $#$invlist && $codepoint >= $invlist->[$mid+1]) {
650 2621         3630 $min= $mid+1;
651             }
652             else {
653 787 100       1543 return (undef, 0) unless $codepoint >= $invlist->[$mid];
654 786 100       1454 return $codepoint - $invlist->[$mid] unless $mid > 0;
655 778 100       4728 return $codepoint - $invlist->[$mid] + $index->[($mid >> 1) - 1] unless $mid & 1;
656             # if $mid is an odd number, the range is excluded, and there is no match
657 13 100       46 return undef unless wantarray;
658 11         40 return (undef, $index->[($mid-1)>>1]) # return insertion point as second val
659             }
660             }
661             }
662              
663              
664             sub negate {
665 0     0 1 0 my $self= shift;
666 0         0 my $neg= Mock::Data::Charset::Util::negate_invlist($self->member_invlist, $self->max_codepoint);
667 0         0 return $self->new(member_invlist => $neg);
668             }
669             sub Mock::Data::Charset::Util::negate_invlist {
670 5     5   1246 my ($invlist, $max_codepoint)= @_;
671             # Toggle first char of 0
672 5 50       247 $invlist= $invlist->[0]? [ 0, @$invlist ] : [ @{$invlist}[1..$#$invlist] ];
  0         0  
673             # If max_codepoint is defined, and was the final char, remove the range starting at max_codepoint+1
674 5 50 33     38 if (@$invlist & 1 and defined $max_codepoint and $invlist->[-1] == $max_codepoint+1) {
      33        
675 0         0 pop @$invlist;
676             }
677 5         23 return $invlist;
678             }
679              
680              
681             sub union {
682 0     0 1 0 my $self= $_[0];
683 0         0 my @invlists= @_;
684             ref eq 'ARRAY' || ($_= $_->member_invlist)
685 0   0     0 for @invlists;
686 0         0 my $combined= Mock::Data::Charset::Util::merge_invlists(\@invlists, $self->max_codepoint);
687 0         0 return $self->new(member_invlist => $combined);
688             }
689              
690             #=head2 merge_invlists
691             #
692             # my $combined_invlist= $charset->merge_invlist( \@list2, \@list3, ... );
693             # my $combined_invlist= merge_invlist( \@list1, \@list2, ... );
694             #
695             #Merge one or more inversion lists into a superset of all of them.
696             #If called as a method, the L is used as the first list.
697             #
698             #The return value is an inversion list, which can be wrapped in a Charset object by passing it
699             #as the C attribute.
700             #
701             #The current L applies to the result. If called as a plain function, the
702             #C is assumed to be the Unicode maximum of C<0x10FFFF>.
703             #
704             #=cut
705              
706             sub Mock::Data::Charset::Util::merge_invlists {
707 33     33   9948 my @invlists= @{shift()};
  33         78  
708 33   100     99 my $max_codepoint= shift // 0x10FFFF;
709              
710 33 50       80 return [] unless @invlists;
711 33 100       135 return [@{$invlists[0]}] unless @invlists > 1;
  19         101  
712 14         26 my @combined= ();
713             # Repeatedly select the minimum range among the input lists and add it to the result
714 14         42 my @pos= (0)x@invlists;
715 14         35 while (@invlists) {
716 3293         5552 my ($min_ch, $min_i)= ($invlists[0][$pos[0]], 0);
717             # Find which inversion list contains the lowest range
718 3293         5627 for (my $i= 1; $i < @invlists; $i++) {
719 3284 100       7549 if ($invlists[$i][$pos[$i]] < $min_ch) {
720 814         1168 $min_ch= $invlists[$i][$pos[$i]];
721 814         1472 $min_i= $i;
722             }
723             }
724 3293 100       5427 last if $min_ch > $max_codepoint;
725             # Check for overlap of this new inclusion range with the previous
726 3291 100 100     8048 if (@combined && $combined[-1] >= $min_ch) {
727             # they overlap, so just replace the end-codepoint of the range
728             # if the new endpoint is larger
729 3111         4492 my $new_end= $invlists[$min_i][$pos[$min_i]+1];
730 3111 100 100     8754 $combined[-1]= $new_end if !defined $new_end || $combined[-1] < $new_end;
731             }
732             else {
733             # else, simply append the range
734 180         287 push @combined, @{$invlists[$min_i]}[$pos[$min_i] .. $pos[$min_i]+1];
  180         427  
735             }
736             # If the list is empty now, remove it from consideration
737 3291 100       4524 if (($pos[$min_i] += 2) >= @{$invlists[$min_i]}) {
  3291 50       8620  
738 21         36 splice @invlists, $min_i, 1;
739 21         30 splice @pos, $min_i, 1;
740             # If the invlist ends with an infinite range now, we are done
741 21 100       53 if (!defined $combined[-1]) {
742 6         12 pop @combined;
743 6         17 last;
744             }
745             }
746             # If this is the only list remaining, append the rest and done
747             elsif (@invlists == 1) {
748 0         0 push @combined, @{$invlists[0]}[$pos[0] .. $#{$invlists[0]}];
  0         0  
  0         0  
749 0         0 last;
750             }
751             }
752 14         34 while ($combined[-1] > $max_codepoint) {
753 1         3 pop @combined;
754             }
755             # If the list ends with inclusion, and the max_codepoint is less than unicode max,
756             # end the list with it.
757 14 100 100     53 if (1 & @combined and $max_codepoint < 0x10FFFF) {
758 1         2 push @combined, $max_codepoint+1;
759             }
760 14         47 return \@combined;
761             }
762              
763             1;
764              
765             __END__