File Coverage

blib/lib/Mock/Data/Charset.pm
Criterion Covered Total %
statement 312 374 83.4
branch 190 258 73.6
condition 78 139 56.1
subroutine 40 46 86.9
pod 16 16 100.0
total 636 833 76.3


line stmt bran cond sub pod time code
1             package Mock::Data::Charset;
2 12     12   212371 use strict;
  12         29  
  12         343  
3 12     12   60 use warnings;
  12         26  
  12         339  
4 12     12   888 use Mock::Data::Util qw( _parse_context _escape_str );
  12         26  
  12         107  
5             require Carp;
6             our @CARP_NOT= ('Mock::Data::Util');
7             require Mock::Data::Generator;
8             our @ISA= ( 'Mock::Data::Generator' );
9              
10             # ABSTRACT: Generator of strings from a set of characters
11             our $VERSION = '0.03'; # VERSION
12              
13              
14             our @generator_attrs= qw( str_len min_codepoint max_codepoint );
15              
16             sub new {
17 46     46 1 127036 my $class= shift;
18 46         91 my (%self, %parse);
19             # make the common case fast
20 46 100 100     178 if (@_ == 1 && !ref $_[0]) {
21 7         584 qr/[$_[0]]/;
22 7         194 %self= ( notation => $_[0] );
23 7 50       19 if (ref $class) {
24 0   0     0 $self{generator_opts} ||= { %{ $class->{generator_opts} } };
  0         0  
25 0   0     0 $self{max_codepoint} //= $class->{max_codepoint};
26 0         0 $class= ref $class;
27             }
28 7         43 return bless \%self, $class;
29             }
30              
31 39 100       137 %self= @_ != 1? @_ : %{$_[0]};
  14         50  
32              
33             # Look for fields from the parser
34 39 100       113 $parse{classes}= delete $self{classes} if defined $self{classes};
35 39 100       88 $parse{codepoints}= delete $self{codepoints} if defined $self{codepoints};
36 39 50       109 $parse{codepoint_ranges}= delete $self{codepoint_ranges} if defined $self{codepoint_ranges};
37 39 50       85 $parse{negate}= delete $self{negate} if defined $self{negate};
38 39 100       88 if (defined $self{chars}) {
39 3         4 push @{$parse{codepoints}}, map ord, @{$self{chars}};
  3         7  
  3         12  
40 3         7 delete $self{chars};
41             }
42 39 50       115 if (defined $self{ranges}) {
43 0         0 push @{$parse{codepoint_ranges}},
44             map +( ref $_? ( ord $_->[0], ord $_->[1] ) : ord ),
45 0 0       0 @{$self{ranges}};
  0         0  
46 0         0 delete $self{ranges};
47             }
48              
49             # If called on an object, carry over some settings
50 39 50       81 if (ref $class) {
51 0 0 0     0 if (!keys %parse && !defined $self{notation} && !$self{members} && !$self{member_invlist}) {
      0        
      0        
52             @self{'_parse','notation','members','member_invlist'}=
53 0         0 @{$class}{'_parse','notation','members','member_invlist'};
  0         0  
54             }
55 0   0     0 $self{$_} //= $class->{$_} for @generator_attrs;
56 0         0 $class= ref $class;
57             }
58              
59 39 100 66     179 if (defined $self{notation} && !keys %parse) {
    100          
60             # want to trigger the syntax error exception now, not lazily later on
61 22         426 qr/[$self{notation}]/;
62             }
63             elsif (keys %parse) {
64 11         37 $self{_parse}= \%parse;
65             Carp::croak("Charset-building options (classes, chars, codepoints, ranges, codepoint_ranges, negate)"
66             ." cannot be combined with members, member_invlist or notation attributes")
67 11 50 33     46 if $self{members} or $self{member_invlist}; # allow notation to preserve original text
68             }
69             else {
70             # At least one of members, member_invlist, notation, or _parse must be specified
71             Carp::croak("Require at least one of members, member_invlist, notation, or charset-building options")
72 6 50 33     28 unless $self{members} or $self{member_invlist};
73             }
74            
75 39         1875 return bless \%self, $class;
76             }
77              
78             sub _parse {
79             # If the '_parse' wasn't initialized, it can be derived from members or member_invlist or notation
80 27 100   27   104 $_[0]{_parse} || do {
81 16         25 my $self= shift;
82 16 50       51 if (defined $self->{notation}) {
    0          
    0          
83 16         44 $self->{_parse}= $self->parse($self->{notation});
84             }
85             elsif ($self->{members}) {
86 0         0 $self->{_parse}{codepoints}= [ map ord, @{$self->{members}} ];
  0         0  
87             }
88             elsif (my $inv= $self->{member_invlist}) {
89 0         0 my $i;
90 0         0 for ($i= 0; $i < $#$inv; $i+= 2) {
91 0 0       0 if ($inv->[$i] + 1 == $inv->[$i+1]) { push @{$self->{_parse}{codepoints}}, $inv->[$i] }
  0         0  
  0         0  
92 0         0 else { push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], $inv->[$i+1] - 1; }
  0         0  
93             }
94 0 0       0 if ($i == $#$inv) {
95 0   0     0 push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], ($self->max_codepoint || 0x10FFFF);
  0         0  
96             }
97             }
98 0         0 else { die "Unhandled lazy-build scenario" }
99 16         43 $self->{_parse};
100             };
101             }
102              
103              
104             sub notation {
105 14   66 14 1 1168 $_[0]{notation} //= _deparse_charset($_[0]->_parse);
106             }
107              
108              
109             sub min_codepoint {
110 113 50   113 1 213 $_[0]{min_codepoint}= $_[1] if @_ > 1;
111             $_[0]{min_codepoint}
112 113         214 }
113             sub max_codepoint {
114             $_[0]{max_codepoint}
115 148     148 1 286 }
116              
117              
118             sub str_len {
119 113 50   113 1 233 $_[0]{str_len}= $_[1] if @_ > 1;
120 113         238 $_[0]{str_len};
121             }
122              
123              
124             sub count {
125 862 100   862 1 281933 $_[0]{members}? scalar @{$_[0]{members}}
  83         178  
126             : $_[0]->_invlist_index->[-1];
127             }
128              
129              
130             sub members {
131 104   66 104 1 266 $_[0]{members} ||= $_[0]->_build_members;
132             }
133              
134             sub _build_members {
135 21     21   34 my $self= shift;
136 21         42 my $invlist= $self->member_invlist;
137 21         31 my @members;
138 21 50       51 if (@$invlist > 1) {
139             push @members, map chr, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1)
140 21         450 for 0 .. (($#$invlist-1)>>1);
141             }
142             # an odd number of elements means the list ends with an "include-all"
143 21 50       62 push @members, map chr, $invlist->[-1] .. 0x10FFFF
144             if 1 & @$invlist;
145 21         75 return \@members;
146             }
147              
148             sub Mock::Data::Charset::Util::expand_invlist {
149 6     6   9821 my $invlist= shift;
150 6         12 my @members;
151 6 100       17 if (@$invlist > 1) {
152             push @members, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1)
153 5         32 for 0 .. (($#$invlist-1)>>1);
154             }
155             # an odd number of elements means the list ends with an "include-all"
156 6 100       21 push @members, $invlist->[-1] .. 0x10FFFF
157             if 1 & @$invlist;
158 6         15 return \@members;
159             }
160              
161             # The index is private because there's not a good way to explain it to the user
162             sub _invlist_index {
163 2330     2330   3325 my $self= shift;
164 2330   66     6380 $self->{_invlist_index} ||= Mock::Data::Charset::Util::create_invlist_index($self->member_invlist);
165             }
166              
167             sub Mock::Data::Charset::Util::create_invlist_index {
168 25     25   4841 my $invlist= shift;
169 25         53 my $n_spans= (@$invlist + 1) >> 1;
170 25         34 my @index;
171 25         82 $#index= $n_spans-1;
172 25         40 my $total= 0;
173             $index[$_]= $total += $invlist->[$_*2+1] - $invlist->[$_*2]
174 25         289 for 0 .. (@$invlist >> 1)-1;
175 25 100       71 if (@$invlist & 1) { # In the case that the final range is infinite
176 6         15 $index[$n_spans-1]= $total + 0x110000 - $invlist->[-1];
177             }
178 25         104 \@index;
179             }
180              
181              
182             sub member_invlist {
183 1607 50   1607 1 3242 if (@_ > 1) {
184 0         0 $_[0]{member_invlist}= $_[1];
185 0         0 delete $_[0]{_invlist_index};
186 0         0 delete $_[0]{members};
187 0         0 delete $_[0]{notation};
188             }
189 1607   66     3845 $_[0]{member_invlist} //= _build_member_invlist(@_);
190             }
191              
192             sub _build_member_invlist {
193 35     35   64 my $self= shift;
194 35         76 my $max_codepoint= $self->max_codepoint;
195             # If the search space is small, and there is already a regex notation, it is probably faster
196             # to iterate and let perl do the work than to parse the charset.
197 35         53 my $invlist;
198 35 100 66     136 if (!defined $max_codepoint || $max_codepoint > 1000 || !defined $self->{notation}) {
      100        
199 26   100     101 $max_codepoint ||= 0x10FFFF;
200 26         39 $invlist= eval {
201 26         67 _parsed_charset_to_invlist($self->_parse, $max_codepoint);
202             }# or main::diag $@
203             }
204 35   66     99 $invlist ||= _charset_invlist_brute_force($self->notation, $max_codepoint);
205             # If a user writes to the invlist, it will become out of sync with the Index,
206             # leading to confusing bugs.
207 35 50       213 if (Internals->can('SvREADONLY')) {
208 35         392 Internals::SvREADONLY($_,1) for @$invlist;
209 35         111 Internals::SvREADONLY(@$invlist,1);
210             }
211 35         199 return $invlist;
212             }
213              
214             # Lazy-built string of all basic-multilingual-plane characters
215             our $_ascii_chars;
216             our $_unicode_chars;
217             sub _build_unicode_chars {
218 1 50   1   4 unless (defined $_unicode_chars) {
219             # Construct ranges of valid characters separated by NUL.
220             # Older perls die when the regex engine encounters an invalid character
221             # but newer perls just treat the invalid character as "not a member",
222             # unless the set is a negation in which case non-characters *are* a member.
223             # This makes the assumption that if a non-char isn't a member then \0 won't
224             # be either.
225 1         3 $_unicode_chars= '';
226 1         10703 $_unicode_chars .= chr($_) for 0 .. 0xD7FF;
227 1         5 $_unicode_chars .= "\0";
228 1         114 $_unicode_chars .= chr($_) for 0xFDF0 .. 0xFFFD;
229 1         4 for (1..16) {
230 16         67 $_unicode_chars .= "\0";
231 16         210223 $_unicode_chars .= chr($_) for ($_<<16) .. (($_<<16)|0xFFFD);
232             }
233             }
234 1         8 \$_unicode_chars;
235             }
236              
237             sub _charset_invlist_brute_force {
238 9     9   24 my ($set, $max_codepoint)= @_;
239 9 100       32 my $inv= (ord $set == ord '^')? substr($set,1) : '^'.$set;
240 9         14 my @invlist;
241            
242             # optimize common case
243 9 100       22 if ($max_codepoint < 256) {
244             # Find first character of every match and first character of every non-match
245             # and convert to codepoints.
246 8 100 66     396 @invlist= map +(defined $_? ord($_) : ()),
247             ($_ascii_chars //= join('', map chr($_), 0..255))
248             =~ / ( [$set] ) (?> [$set]* ) (?: \z | ( [$inv] ) (?> [$inv]* ) )/gx;
249             }
250             else {
251 1 50       6 _build_unicode_chars() unless defined $_unicode_chars;
252             # This got complicated while trying to support perls that can't match against non-characters.
253             # The non-characters have been replaced by NULs, so need to capture the char before and after
254             # each transition in case one of them is a NUL.
255 1 50       5855 my @endpoints=
256             ($max_codepoint < 0x10FFFF? substr($_unicode_chars,0,$max_codepoint+1) : $_unicode_chars)
257             =~ /( [$set] ) ( [$set] )* ( \z | [$inv] ) ( [$inv] )* /gx;
258 1 50       9 if (@endpoints) {
259             # List is a multiple of 4 elements: (first-member,last-member,first-non-member,last-non-member)
260             # We're not interested in the span of non-members at the end, so just remove those.
261 1         3 pop @endpoints; pop @endpoints;
  1         2  
262             # Iterate every transition of member/nonmember, and use the second character if present
263             # and isn't a NUL, else use the first character and add 1.
264 1         5 push @invlist, ord $endpoints[0];
265 1         5 for (my $i= 1; $i < @endpoints; $i+= 2) {
266 19 100 66     61 if (defined $endpoints[$i+1] && ord $endpoints[$i+1]) {
    50          
267 18         41 push @invlist, ord $endpoints[$i+1];
268             } elsif (defined $endpoints[$i]) {
269 1         4 push @invlist, 1 + ord $endpoints[$i];
270             } else {
271 0         0 push @invlist, 1 + $invlist[-1];
272             }
273             }
274             # substr is an estimate, because string skips characters, so remove any spurrous
275             # codepoints beyond the max
276 1   66     11 pop @invlist while @invlist && $invlist[-1] > $max_codepoint;
277             }
278             }
279             # If an "infinite" range would be returned, but the user set a maximum codepoint,
280             # list the max codepoint as the end of the invlist.
281 9 100 66     884 if ($max_codepoint < 0x10FFFF and 1 & @invlist) {
282 4         10 push @invlist, $max_codepoint+1;
283             }
284 9         39 return \@invlist;
285             }
286              
287             sub _parsed_charset_to_invlist {
288 26     26   59 my ($parse, $max_codepoint)= @_;
289 26         39 my @invlists;
290             # convert the character list into an inversion list
291 26 100       78 if (defined (my $cp= $parse->{codepoints})) {
292 5         20 my @chars= sort { $a <=> $b } @$cp;
  6         17  
293 5         15 my @invlist= (shift @chars);
294 5         10 push @invlist, $invlist[0] + 1;
295 5         19 for (my $i= 0; $i <= $#chars; $i++) {
296             # If the next char is adjacent, extend the span
297 5 100       15 if ($invlist[-1] == $chars[$i]) {
298 2         5 ++$invlist[-1];
299             } else {
300 3         10 push @invlist, $chars[$i], $chars[$i]+1;
301             }
302             }
303 5         28 push @invlists, \@invlist;
304             }
305             # Each range is an inversion list already
306 26 100       70 if (my $r= $parse->{codepoint_ranges}) {
307 10         39 for (my $i= 0; $i < (@$r >> 1); $i++) {
308 11         32 my ($start, $limit)= ($r->[$i*2], $r->[$i*2+1]+1);
309             # Try to combine the range with the most recent inversion list, if possible,
310 11 100 66     49 if (@invlists && $invlists[-1][-1] < $start) {
    50 33        
311 1         2 push @{ $invlists[-1] }, $start, $limit;
  1         6  
312             } elsif (@invlists && $invlists[-1][0] > $limit) {
313 0         0 unshift @{ $invlists[-1] }, $start, $limit;
  0         0  
314             } else {
315             # else just start a new inversion list
316 10         36 push @invlists, [ $start, $limit ]
317             }
318             }
319             }
320             # Convert each character class to an inversion list.
321 26 100       60 if ($parse->{classes}) {
322             push @invlists, _class_invlist($_)
323 11         16 for @{ $parse->{classes} };
  11         40  
324             }
325 26         2111 my $invlist= Mock::Data::Charset::Util::merge_invlists(\@invlists, $max_codepoint);
326             # Perform negation of inversion list by either starting at char 0 or removing char 0
327 26 100       64 if ($parse->{negate}) {
328 1 50       4 if ($invlist->[0]) { unshift @$invlist, 0 }
  1         3  
329 0         0 else { shift @$invlist; }
330             }
331 26         134 return $invlist;
332             }
333              
334              
335             our $_compile;
336             sub compile {
337 4     4 1 14 local $_compile= 1;
338 4         14 shift->generate(@_);
339             }
340             sub generate {
341 113     113 1 198 my ($self, $mock)= (shift, shift);
342 113         249 my ($len, $cp_min, $cp_max, $member_count)
343             = ($self->str_len, $self->min_codepoint, $self->max_codepoint, $self->count);
344 113 100       233 if (@_) {
345 82 100       189 my %opts= ref $_[0] eq 'HASH'? %{ shift() } : ();
  79         268  
346 82 100 33     219 $len= @_? shift : $opts{str_len} // $opts{len} // $opts{size}; # allow some aliases for length
      33        
347 82   66     242 $cp_min= $opts{min_codepoint} // $cp_min;
348 82   66     247 $cp_max= $opts{max_codepoint} // $cp_max;
349             }
350 113 100 100     419 my ($memb_min, $memb_span)= !defined $cp_min && !defined $cp_max? (0,$member_count)
351             : $self->_codepoint_minmax_to_member_range($cp_min, $cp_max);
352              
353             # If compiling, $len will be a function, else it will be an integer
354 0     0   0 $len= !defined $len? ($_compile? sub { 1 } : 1 )
355 0     0   0 : !ref $len? ($_compile? sub { $len } : $len)
356             : ref $len eq 'ARRAY'? (
357 0     0   0 $_compile? sub { $len->[0] + int rand ($len->[1] - $len->[0]) }
358 113 50       369 : $len->[0] + int rand ($len->[1] - $len->[0])
    50          
    50          
    100          
    50          
    100          
    100          
    100          
359             )
360             : ref $len eq 'CODE'? ($_compile? $len : $len->($mock))
361             : Carp::croak("Unknown str_len specification '$len'");
362              
363             # If member list is small-ish, use faster direct array access
364 113 100 100     322 if ($self->{members} || $member_count < 500) {
365 97         176 my $members= $self->members;
366             return sub {
367 3     3   8 my $buf= '';
368             $buf .= $members->[$memb_min + int rand $memb_span]
369 3         19 for 1..$len->($_[0]);
370 3         35 return $buf;
371 97 100       210 } if $_compile;
372 93         180 my $buf= '';
373             $buf .= $members->[$memb_min + int rand $memb_span]
374 93         433 for 1..$len;
375 93         401 return $buf;
376             }
377             else {
378 16         36 my $invlist= $self->member_invlist;
379 16         30 my $index= $self->_invlist_index;
380             return sub {
381 0     0   0 my $ret= '';
382             $ret .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index)
383 0         0 for 1..$len->($_[0]);
384 16 50       32 } if $_compile;
385 16         23 my $buf= '';
386             $buf .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index)
387 16         64 for 1..$len;
388 16         82 return $buf;
389             }
390             }
391              
392             sub _codepoint_minmax_to_member_range {
393 27     27   41 my $self= shift;
394 27         56 my ($cp_min, $cp_max)= @_;
395             my $memb_min= !defined $cp_min? 0
396 27 100       56 : do {
397 6         14 my ($at, $ins)= _find_invlist_element($cp_min, $self->member_invlist, $self->_invlist_index);
398 6   33     18 $at // $ins
399             };
400             my $memb_lim= !defined $cp_max? $self->count
401 27 50       52 : do {
402 27         55 my ($at, $ins)= _find_invlist_element($cp_max, $self->member_invlist, $self->_invlist_index);
403 27 100       63 defined $at? $at + 1 : $ins;
404             };
405 27         56 return ($memb_min, $memb_lim-$memb_min);
406             }
407              
408              
409             sub parse {
410 32     32 1 19429 my ($self, $notation)= @_;
411 32 50       83 return { codepoints => [] } unless length $notation;
412 32 50       71 return { classes => ['All'] } if $notation eq '^';
413 32         64 $notation .= ']';
414             # parse function needs $_ to be the input string
415 32         91 pos($notation)= 0;
416 32         109 return _parse_charset() for $notation;
417             }
418              
419             our $have_prop_invlist;
420             our %_parse_charset_backslash= (
421             a => ord "\a",
422             b => ord "\b",
423             c => sub { die "Unimplemented: \\c" },
424             d => sub { push @{$_[0]{classes}}, 'digit'; undef; },
425             D => sub { push @{$_[0]{classes}}, '^digit'; undef; },
426             e => ord "\e",
427             f => ord "\f",
428             h => sub { push @{$_[0]{classes}}, 'horizspace'; undef; },
429             H => sub { push @{$_[0]{classes}}, '^horizspace'; undef; },
430             n => ord "\n",
431             N => \&_parse_charset_namedchar,
432             o => \&_parse_charset_oct,
433             p => \&_parse_charset_classname,
434             P => sub { _parse_charset_classname(shift, 1) },
435             r => ord "\r",
436             s => sub { push @{$_[0]{classes}}, 'space'; undef; },
437             S => sub { push @{$_[0]{classes}}, '^space'; undef; },
438             t => ord "\t",
439             v => sub { push @{$_[0]{classes}}, 'vertspace'; undef; },
440             V => sub { push @{$_[0]{classes}}, '^vertspace'; undef; },
441             w => sub { push @{$_[0]{classes}}, 'word'; undef; },
442             W => sub { push @{$_[0]{classes}}, '^word'; undef; },
443             x => \&_parse_charset_hex,
444             0 => \&_parse_charset_oct,
445             1 => \&_parse_charset_oct,
446             2 => \&_parse_charset_oct,
447             3 => \&_parse_charset_oct,
448             4 => \&_parse_charset_oct,
449             5 => \&_parse_charset_oct,
450             6 => \&_parse_charset_oct,
451             7 => \&_parse_charset_oct,
452             8 => \&_parse_charset_oct,
453             9 => \&_parse_charset_oct,
454             );
455             our %_class_invlist_cache= (
456             'Any' => [ 0 ],
457             '\\N' => [ 0, ord("\n"), 1+ord("\n") ],
458             );
459             sub _class_invlist {
460 22     22   10496 my $class= shift;
461 22 100       53 if (ord $class == ord '^') {
462 5         24 return Mock::Data::Charset::Util::negate_invlist(
463             _class_invlist(substr($class,1))
464             );
465             }
466 17   66     66 return $_class_invlist_cache{$class} ||= do {
467 8 100       21 $have_prop_invlist= do { require Unicode::UCD; !!Unicode::UCD->can('prop_invlist') }
  2         2168  
  2         44439  
468             unless defined $have_prop_invlist;
469 8 50       37 $have_prop_invlist? [ Unicode::UCD::prop_invlist($class) ]
470             : _charset_invlist_brute_force("\\p{$class}", 0x10FFFF);
471             };
472             }
473             sub _parse_charset_hex {
474 3 50   3   16 /\G( [0-9A-Fa-f]{2} | \{ ([0-9A-Fa-f]+) \} )/gcx
475             or die "Invalid hex escape at "._parse_context;
476 3 100       14 return hex(defined $2? $2 : $1);
477             }
478             sub _parse_charset_oct {
479 5     5   14 --pos; # The caller ate one of the characters we need to parse
480 5 50       35 /\G( [0-7]{3} | 0 | o\{ ([0-7]+) \} ) /gcx
481             or die "Invalid octal escape at "._parse_context;
482 5 100       24 return oct(defined $2? $2 : $1);
483             }
484             sub _parse_charset_namedchar {
485 2     2   600 require charnames;
486 2 50       10575 /\G \{ ([^}]+) \} /gcx
487             # or die "Invalid named char following \\N at '".substr($_,pos,10)."'";
488             and return charnames::vianame($1);
489             # Plain "\N" means every character except \n
490 0         0 push @{ $_[0]{classes} }, '\\N';
  0         0  
491 0         0 return;
492             }
493             sub _parse_charset_classname {
494 7     7   18 my ($result, $negate)= @_;
495 7 50       36 /\G \{ ([^}]+) \} /gcx
496             or die "Invalid class name following \\p at "._parse_context;
497 7 100       21 push @{$result->{classes}}, lc($negate? "^$1" : $1);
  7         45  
498             undef
499 7         16 }
500             sub _parse_charset {
501 37     37   61 my $flags= shift;
502             # argument is in $_, starting from pos($_)
503 37         68 my %parse;
504             my @range;
505 37         81 $parse{codepoints}= \my @chars;
506 37 100       127 $parse{negate}= 1 if /\G \^ /gcx;
507 37 50       83 if (/\G]/gc) { push @chars, ord ']' }
  0         0  
508 37         52 while (1) {
509 125         158 my $cp; # literal codepoint to be added
510             # Check for special cases
511 125 100 50     453 if (/\G ( \\ | - | \[: | \] ) /gcx) {
    50 66        
      33        
512 80 100       275 if ($1 eq '\\') {
    100          
    100          
513 19 50       59 /\G(.)/gc or die "Unexpected end of input";
514 19   33     83 $cp= $_parse_charset_backslash{$1} || ord $1;
515 19 100       67 $cp= $cp->(\%parse)
516             if ref $cp;
517             }
518             elsif ($1 eq '-') {
519 19 100       66 if (@range == 1) {
520 18         29 push @range, ord '-';
521 18         34 next;
522             }
523             else {
524 1         2 $cp= ord '-';
525             }
526             }
527             elsif ($1 eq '[:') {
528 5 50       28 /\G ( [^:]+ ) :] /gcx
529             or die "Invalid character class at "._parse_context;
530 5         9 push @{$parse{classes}}, $1;
  5         18  
531             }
532             else {
533 37         70 last; # $1 eq ']';
534             }
535             }
536             elsif ($flags && ($flags->{x}||0) >= 2 && /\G[ \t]/gc) {
537 0         0 next; # ignore space and tab under /xx
538             }
539             else {
540 45 50       142 /\G(.)/gc or die "Unexpected end of input";
541 45         85 $cp= ord $1;
542             }
543             # If no single character was found, any range-in-progress needs converted to
544             # charcters
545 70 100       12064 if (!defined $cp) {
    100          
    100          
546 16         32 push @chars, @range;
547 16         25 @range= ();
548             }
549             # At this point, $cp will contain the next ordinal of the character to include,
550             # but it might also be starting or finishing a range.
551             elsif (@range == 1) {
552 8         37 push @chars, $range[0];
553 8         17 $range[0]= $cp;
554             }
555             elsif (@range == 2) {
556 17         26 push @{$parse{codepoint_ranges}}, $range[0], $cp;
  17         59  
557 17         33 @range= ();
558             }
559             else {
560 29         59 push @range, $cp;
561             }
562             #printf "# pos %d cp %d range %s %s include %s\n", pos $_, $cp, $range[0] // '(null)', $range[1] // '(null)', join(',', @include);
563             }
564 37         55 push @chars, @range;
565 37 100       79 if (@chars) {
566 13         66 @chars= sort { $a <=> $b } @chars;
  11         27  
567             } else {
568 24         52 delete $parse{codepoints};
569             }
570 37         227 return \%parse;
571             }
572              
573             sub _ord_to_safe_regex_char {
574 3 0   3   20 return chr($_[0]) =~ /[\w]/? chr $_[0]
    50          
575             : $_[0] <= 0xFF? sprintf('\x%02X',$_[0])
576             : sprintf('\x{%X}',$_[0])
577             }
578              
579             sub _deparse_charset {
580 1     1   3 my $parse= shift;
581 1         3 my $str= '';
582 1 50       3 if (my $cp= $parse->{codepoints}) {
583             $str .= _ord_to_safe_regex_char($_)
584 1         10 for @$cp;
585             }
586 1 50       3 if (my $r= $parse->{codepoint_ranges}) {
587 0         0 for (my $i= 0; $i < (@$r << 1); $i++) {
588 0         0 $str .= _ord_to_safe_regex_char($r->[$i*2]) . '-' . _ord_to_safe_regex_char($r->[$i*2+1]);
589             }
590             }
591 1 50       4 if (my $cl= $parse->{classes}) {
592             # TODO: reverse conversions to \h \v etc.
593 0         0 for (@$cl) {
594 0 0       0 $str .= $_ eq '\N'? '\0-\x09\x0B-\x{10FFFF}'
    0          
595             : ord == ord '^'? '\P{'.substr($_,1).'}'
596             : '\p{'.$_.'}';
597             }
598             }
599 1         5 return $str;
600             }
601              
602              
603             sub get_member {
604 749 100   749 1 3626 $_[0]{members}? $_[0]{members}[$_[1]]
605             : chr _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index);
606             }
607              
608             sub get_member_codepoint {
609 6 50   6 1 2173 $_[0]{members}? ord $_[0]{members}[$_[1]]
610             : _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index);
611             }
612              
613             sub _get_invlist_element {
614 1109     1109   1964 my ($ofs, $invlist, $invlist_index)= @_;
615 1109 50       2047 $ofs += @$invlist_index if $ofs < 0;
616 1109 50 33     3630 return undef if $ofs >= $invlist_index->[-1] || $ofs < 0;
617             # Binary Search to find the range that contains this numbered element
618 1109         2046 my ($min, $max, $mid)= (0, $#$invlist_index);
619 1109         1575 while (1) {
620 5781         7196 $mid= ($min+$max) >> 1;
621 5781 100 100     13075 if ($ofs >= $invlist_index->[$mid]) {
    100          
622 2761         3425 $min= $mid+1
623             }
624             elsif ($mid > 0 && $ofs < $invlist_index->[$mid-1]) {
625 1911         2536 $max= $mid-1
626             }
627             else {
628 1109 100       2276 $ofs -= $invlist_index->[$mid-1] if $mid > 0;
629 1109         4104 return $invlist->[$mid*2] + $ofs;
630             }
631             }
632             }
633              
634              
635             sub find_member {
636 754     754 1 2700 my ($self, $char)= @_;
637 754         1547 return _find_invlist_element(ord $char, $self->member_invlist, $self->_invlist_index);
638             }
639              
640             sub _find_invlist_element {
641 787     787   1361 my ($codepoint, $invlist, $index)= @_;
642             # Binary Search to find the range that contains this numbered element
643 787         1337 my ($min, $max, $mid)= (0, $#$invlist);
644 787         1179 while (1) {
645 5739         7015 $mid= ($min+$max) >> 1;
646 5739 100 100     18400 if ($mid > 0 && $codepoint < $invlist->[$mid]) {
    100 100        
647 2331         2997 $max= $mid-1
648             }
649             elsif ($mid < $#$invlist && $codepoint >= $invlist->[$mid+1]) {
650 2621         3868 $min= $mid+1;
651             }
652             else {
653 787 100       1499 return (undef, 0) unless $codepoint >= $invlist->[$mid];
654 786 100       1474 return $codepoint - $invlist->[$mid] unless $mid > 0;
655 778 100       4517 return $codepoint - $invlist->[$mid] + $index->[($mid >> 1) - 1] unless $mid & 1;
656             # if $mid is an odd number, the range is excluded, and there is no match
657 13 100       40 return undef unless wantarray;
658 11         38 return (undef, $index->[($mid-1)>>1]) # return insertion point as second val
659             }
660             }
661             }
662              
663              
664             sub negate {
665 0     0 1 0 my $self= shift;
666 0         0 my $neg= Mock::Data::Charset::Util::negate_invlist($self->member_invlist, $self->max_codepoint);
667 0         0 return $self->new(member_invlist => $neg);
668             }
669             sub Mock::Data::Charset::Util::negate_invlist {
670 5     5   1200 my ($invlist, $max_codepoint)= @_;
671             # Toggle first char of 0
672 5 50       250 $invlist= $invlist->[0]? [ 0, @$invlist ] : [ @{$invlist}[1..$#$invlist] ];
  0         0  
673             # If max_codepoint is defined, and was the final char, remove the range starting at max_codepoint+1
674 5 50 33     37 if (@$invlist & 1 and defined $max_codepoint and $invlist->[-1] == $max_codepoint+1) {
      33        
675 0         0 pop @$invlist;
676             }
677 5         24 return $invlist;
678             }
679              
680              
681             sub union {
682 0     0 1 0 my $self= $_[0];
683 0         0 my @invlists= @_;
684             ref eq 'ARRAY' || ($_= $_->member_invlist)
685 0   0     0 for @invlists;
686 0         0 my $combined= Mock::Data::Charset::Util::merge_invlists(\@invlists, $self->max_codepoint);
687 0         0 return $self->new(member_invlist => $combined);
688             }
689              
690             #=head2 merge_invlists
691             #
692             # my $combined_invlist= $charset->merge_invlist( \@list2, \@list3, ... );
693             # my $combined_invlist= merge_invlist( \@list1, \@list2, ... );
694             #
695             #Merge one or more inversion lists into a superset of all of them.
696             #If called as a method, the L is used as the first list.
697             #
698             #The return value is an inversion list, which can be wrapped in a Charset object by passing it
699             #as the C attribute.
700             #
701             #The current L applies to the result. If called as a plain function, the
702             #C is assumed to be the Unicode maximum of C<0x10FFFF>.
703             #
704             #=cut
705              
706             sub Mock::Data::Charset::Util::merge_invlists {
707 34     34   9737 my @invlists= @{shift()};
  34         78  
708 34   100     98 my $max_codepoint= shift // 0x10FFFF;
709              
710 34 50       97 return [] unless @invlists;
711 34 100       94 return [@{$invlists[0]}] unless @invlists > 1;
  20         107  
712 14         25 my @combined= ();
713             # Repeatedly select the minimum range among the input lists and add it to the result
714 14         36 my @pos= (0)x@invlists;
715 14         34 while (@invlists) {
716 3293         5415 my ($min_ch, $min_i)= ($invlists[0][$pos[0]], 0);
717             # Find which inversion list contains the lowest range
718 3293         5924 for (my $i= 1; $i < @invlists; $i++) {
719 3284 100       7534 if ($invlists[$i][$pos[$i]] < $min_ch) {
720 814         1479 $min_ch= $invlists[$i][$pos[$i]];
721 814         1492 $min_i= $i;
722             }
723             }
724 3293 100       5415 last if $min_ch > $max_codepoint;
725             # Check for overlap of this new inclusion range with the previous
726 3291 100 100     7895 if (@combined && $combined[-1] >= $min_ch) {
727             # they overlap, so just replace the end-codepoint of the range
728             # if the new endpoint is larger
729 3111         4352 my $new_end= $invlists[$min_i][$pos[$min_i]+1];
730 3111 100 100     8684 $combined[-1]= $new_end if !defined $new_end || $combined[-1] < $new_end;
731             }
732             else {
733             # else, simply append the range
734 180         286 push @combined, @{$invlists[$min_i]}[$pos[$min_i] .. $pos[$min_i]+1];
  180         393  
735             }
736             # If the list is empty now, remove it from consideration
737 3291 100       4528 if (($pos[$min_i] += 2) >= @{$invlists[$min_i]}) {
  3291 50       8736  
738 21         33 splice @invlists, $min_i, 1;
739 21         27 splice @pos, $min_i, 1;
740             # If the invlist ends with an infinite range now, we are done
741 21 100       57 if (!defined $combined[-1]) {
742 6         10 pop @combined;
743 6         13 last;
744             }
745             }
746             # If this is the only list remaining, append the rest and done
747             elsif (@invlists == 1) {
748 0         0 push @combined, @{$invlists[0]}[$pos[0] .. $#{$invlists[0]}];
  0         0  
  0         0  
749 0         0 last;
750             }
751             }
752 14         34 while ($combined[-1] > $max_codepoint) {
753 1         3 pop @combined;
754             }
755             # If the list ends with inclusion, and the max_codepoint is less than unicode max,
756             # end the list with it.
757 14 100 100     47 if (1 & @combined and $max_codepoint < 0x10FFFF) {
758 1         4 push @combined, $max_codepoint+1;
759             }
760 14         47 return \@combined;
761             }
762              
763             1;
764              
765             __END__