File Coverage

blib/lib/JE/Object/RegExp.pm
Criterion Covered Total %
statement 213 237 89.8
branch 117 148 79.0
condition 20 51 39.2
subroutine 35 36 97.2
pod 3 6 50.0
total 388 478 81.1


line stmt bran cond sub pod time code
1             package JE::Object::RegExp;
2              
3             our $VERSION = '0.064';
4              
5              
6 11     11   4763 use strict;
  11         18  
  11         469  
7 11     11   47 use warnings; no warnings 'utf8';
  11     11   15  
  11         320  
  11         38  
  11         13  
  11         428  
8              
9 11         86 use overload fallback => 1,
10 11     11   49 '""'=> 'value';
  11         12  
11              
12             # This constant is true if we need to work around perl bug #122460 to keep
13             # the ‘aardvark’ tests (in t/15.05-string-objects.t) passing. This should
14             # only apply to 5.20.0, but this comment was written before the release of
15             # 5.20.1, so whether it applies to that version remains to be seen. Basic-
16             # ally, (?=...) can result in buggy optimisations that cause a faulty
17             # rejection of the match at some locations, because it is assumed that it
18             # cannot match in some spots.
19             use constant aardvark_bug =>
20             # This test should match the empty string. If it advances (pos returns
21             # true), then we have the bug.
22 11     11   1351 do { my $a = "rdvark"; $a =~ /(?{})(?=.)a*?/g; pos $a };
  11         19  
  11         16  
  11         18  
  11         132  
  11         689  
23              
24 11     11   53 use Scalar::Util 'blessed';
  11         18  
  11         2750  
25              
26             our @ISA = 'JE::Object';
27              
28             require JE::Boolean;
29             require JE::Code;
30             require JE::Object;
31             require JE::String;
32              
33             import JE::Code 'add_line_number';
34             sub add_line_number;
35              
36             our @Match;
37             our @EraseCapture;
38              
39             #import JE::String 'desurrogify';
40             #sub desurrogify($);
41             # Only need to turn these on when Perl starts adding regexp modifiers
42             # outside the BMP.
43              
44             # JS regexp features that Perl doesn't have, or which differ from Perl's,
45             # along with their Perl equivalents
46             # ^ with /m \A|(?<=[\cm\cj\x{2028}\x{2029}]) (^ with the /m modifier
47             # matches whenever a Unicode
48             # line break (not just \n)
49             # precedes the current position,
50             # even at the end of the string. In
51             # Perl, /^/m matches \A|(?<=\n)(?!\z) .)
52             # $ \z
53             # $ with /m (?:\z|(?=[\cm\cj\x{2028}\x{2029}]))
54             # \b (?:(?<=$w)(?!$w)|(?
55             # \B (?:(?<=$w)(?=$w)|(?
56             # doesn't include non-ASCII
57             # word chars in \w)
58             # . [^\cm\cj\x{2028}\x{2029}]
59             # \v \cK
60             # \n \cj (whether \n matches \cj in Perl is system-dependent)
61             # \r \cm
62             # \uHHHH \x{HHHH}
63             # \d [0-9]
64             # \D [^0-9]
65             # \s [\p{Zs}\s\ck]
66             # \S [^\p{Zs}\s\ck]
67             # \w [A-Za-z0-9_]
68             # \W [^A-Za-z0-9_]
69             # [^] (?s:.)
70             # [] (?!)
71              
72             # Other differences
73             #
74             # A quantifier in a JS regexp will, when repeated, clear all values cap-
75             # tured by capturing parentheses in the term that it quantifies. This means
76             # that /((a)?b)+/, when matched against "abb" will leave $2 undefined, even
77             # though the second () matched "a" the first time the first () matched.
78             # (The ECMAScript spec says to do it this way, but Safari leaves $2 with
79             # "a" in it and doesn't clear it on the second iteration of the '+'.) Perl
80             # does it both ways, and the rules aren't quite clear to me:
81             #
82             # $ perl5.8.8 -le '$, = ",";print "abb" =~ /((a)?b)+/;'
83             # b,
84             # $ perl5.8.8 -le '$, = ",";print "abb" =~ /((a+)?b)+/;'
85             # b,a
86             #
87             # perl5.9.4 produces the same. perl5.002_01 crashes quite nicely.
88             #
89             #
90             # In ECMAScript, when the pattern inside a (?! ... ) fails (in which case
91             # the (?!) succeeds), values captured by parentheses within the negative
92             # lookahead are cleared, such that subsequent backreferences *outside* the
93             # lookahead are equivalent to (?:) (zero-width always-match assertion). In
94             # Perl, the captured values are left as they are when the pattern inside
95             # the lookahead fails:
96             #
97             # $ perl5.8.8 -le 'print "a" =~ /(?!(a)b)a/;'
98             # a
99             # $ perl5.9.4 -le 'print "a" =~ /(?!(a)b)a/;'
100             # a
101             #
102             #
103             # In ECMAScript, as in Perl, a pair of capturing parentheses will produce
104             # the undefined value if the parens were not part of the final match.
105             # Undefined will still be produced if there is a \digit backreference
106             # reference to those parens. In ECMAScript, such a back-reference is equiv-
107             # alent to (?:); in Perl it is equivalent to (?!). Therefore, ECMAScript’s
108             # \1 is equivalent to Perl’s (?(1)\1). (It would seem, upon testing
109             # /(?:|())/ vs. /(?:|())\1/ in perl, that the \1 back-reference always suc-
110             # ceeds, and ends up setting $1 to "" [as opposed to undef]. What is actu-
111             # ally happening is that the failed \1 causes backtracking, so the second
112             # alternative in (?:|()) matches, setting $1 to the empty string. Safari,
113             # incidentally, does what Perl *appears* to do at first glance, *if* the
114             # backreference itself is within capturing parentheses (as in
115             # /(?:|())(\1)/).
116             #
117             # These issues are solved with embedded code snippets, as explained below,
118             # where the actual code is.
119             #
120             #
121             # In ECMAScript, case-folding inside the regular expression engine is not
122             # allowed to change the length of a string. Therefore, "ß" never matches
123             # /ss/i, and vice versa. I’m disinclined to be ECMAScript compliant in this
124             # regard though, because it would affect performance. The inefficient solu-
125             # tion I have in mind is to change /x/i to /(?-i:x)/ for every character
126             # that has a multi-character uppercase equivalent; and to change /xx/i to
127             # /(?-i:[Xx][Xx])/ where xx represents a multi-character sequence that
128             # could match a single character in Perl. The latter is the main problem.
129             # How are we to find out which character sequences need this? We could
130             # change /x/i to /[xX]/ for every literal character in the string, but how
131             # would we take /Σ/ -> /[Σσς]/ into account? And does perl’s regexp engine
132             # slow down if we feed it a ton of character classes instead of literal
133             # text? (Need to do some benchmarks.) (If we do fix this, we need to re-
134             # enable the skipped tests.)
135              
136              
137              
138             =head1 NAME
139              
140             JE::Object::RegExp - JavaScript regular expression (RegExp object) class
141              
142             =head1 SYNOPSIS
143              
144             use JE;
145             use JE::Object::RegExp;
146              
147             $j = new JE;
148              
149             $js_regexp = new JE::Object::RegExp $j, "(.*)", 'ims';
150              
151             $perl_qr = $js_regexp->value;
152              
153             $some_string =~ $js_regexp; # You can use it as a qr//
154              
155             =head1 DESCRIPTION
156              
157             This class implements JavaScript regular expressions for JE.
158              
159             See L for a description of most of the interface. Only what
160             is specific to JE::Object::RegExp is explained here.
161              
162             A RegExp object will stringify the same way as a C, so that you can
163             use C<=~> on it. This is different from the return value of the
164             C method (the way it stringifies in JS).
165              
166             Since JE's regular expressions use Perl's engine underneath, the
167             features that Perl provides that are not part of the ECMAScript spec are
168             supported, except for C<(?s)>
169             and C<(?m)>, which don't do anything, and C<(?|...)>, which is
170             unpredictable.
171              
172             In versions prior to 0.042, a hyphen adjacent to C<\d>, C<\s> or C<\w> in a
173             character class would be unpredictable (sometimes a syntax error). Now it
174             is interpreted literally. This matches what most implementations do, which
175             happens to be the same as Perl's behaviour. (It is a syntax error
176             in ECMAScript.)
177              
178             =head1 METHODS
179              
180             =over 4
181              
182             =cut
183              
184             # ~~~ How should surrogates work??? To make regexps work with JS strings
185             # properly, we need to use the surrogified string so that /../ will
186             # correctly match two surrogates. In this case it won't work properly
187             # with Perl strings, so what is the point of Perl-style stringification?
188             # Perhaps we should allow this anyway, but warn about code points outside
189             # the BMP in the documentation. (Should we also produce a Perl warning?
190             # Though I'm not that it's possible to catch this: "\x{10000}" =~ $re).
191             #
192             # But it would be nice if this would work:
193             # $j->eval("'\x{10000}'") =~ $j->eval('/../')
194             # ~~~ We might be able to make this work with perl 5.12’s qr overloading.
195              
196             our %_patterns = qw/
197             \b (?:(?<=[A-Za-z0-9_])(?![A-Za-z0-9_])|(?
198             \B (?:(?<=[A-Za-z0-9_])(?=[A-Za-z0-9_])|(?
199             . [^\cm\cj\x{2028}\x{2029}]
200             \v \cK
201             \n \cj
202             \r \cm
203             \d [0-9]
204             \D [^0-9]
205             \s [\p{Zs}\s\ck]
206             \S [^\p{Zs}\s\ck]
207             \w [A-Za-z0-9_]
208             \W [^A-Za-z0-9_]
209             /;
210              
211             our %_class_patterns = qw/
212             \v \cK
213             \n \cj
214             \r \cm
215             \d 0-9
216             \s \p{Zs}\s\ck
217             \w A-Za-z0-9_
218             /;
219              
220             my $clear_captures = qr/(?{@Match=@EraseCapture=()})/;
221 11     11   109 my $save_captures = do { no strict 'refs';
  11         13  
  11         6284  
222             qr/(?{$Match[$_]=$EraseCapture[$_]?undef:$$_ for 1..$#+})/; };
223             # These are pretty scary, aren’t they?
224             my $plain_regexp =
225             qr/^((?:[^\\[()]|\\.|\((?:\?#|\*)[^)]*\))[^\\[()]*(?:(?:\\.|\((?:\?#|\*)[^)]*\))[^\\[()]*)*)/s;
226             my $plain_regexp_x_mode =
227             qr/^((?:[^\\[()]|\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()]*(?:(?:\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()]*)*)/s;
228             my $plain_regexp_wo_pipe =
229             qr/^((?:[^\\[()|]|\\.|\((?:\?#|\*)[^)]*\))[^\\[()|]*(?:(?:\\.|\((?:\?#|\*)[^)]*\))[^\\[()|]*)*)/s;
230             my $plain_regexp_x_mode_wo_pipe =
231             qr/^((?:[^\\[()|]|\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()|]*(?:(?:\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()|]*)*)/s;
232              
233             sub _capture_erasure_stuff {
234 21         59 "(?{local\@EraseCapture[" . join(',',@{$_[0]}) . "]=(1)x"
  21         78  
235 21     21   30 . @{$_[0]} . '})'
236             }
237              
238             sub new {
239 375     375 1 1029 my ($class, $global, $re, $flags) = @_;
240 375   33     1293 my $self = $class->SUPER::new($global, {
241             prototype => $global->prototype_for('RegExp')
242             || $global->prop('RegExp')->prop('prototype')
243             });
244              
245 375         738 my $qr;
246              
247 375 100       874 if(defined blessed $re) {
248 56 50 66     593 if ($re->isa(__PACKAGE__)) {
    100          
    50          
249 0 0 0     0 defined $flags && eval{$flags->id} ne 'undef' and
  0         0  
250             die JE::Object::Error::TypeError->new(
251             $global, add_line_number
252             'Second argument to ' .
253             'RegExp() must be undefined if ' .
254             'first arg is a RegExp');
255 0         0 $flags = $$$re{regexp_flags};
256 0         0 $qr = $$$re{value};
257 0         0 $re = $re->prop('source')->[0];
258             }
259             elsif(can $re 'id' and $re->id eq 'undef') {
260 2         12 $re = '';
261             }
262             elsif(can $re 'to_string') {
263 54         161 $re = $re->to_string->value16;
264             }
265             }
266             else {
267 319 100       679 defined $re or $re = '';
268             }
269              
270 375 50       833 if(defined blessed $flags) {
271 0 0 0     0 if(can $flags 'id' and $flags->id eq 'undef') {
    0          
272 0         0 $flags = '';
273             }
274             elsif(can $flags 'to_string') {
275 0         0 $flags = $flags->to_string->value;
276             }
277             }
278             else {
279 375 100       684 defined $flags or $flags = '';
280             }
281              
282              
283             # Let's begin by processing the flags:
284              
285             # Save the flags before we start mangling them
286 375         1291 $$$self{regexp_flags} = $flags;
287              
288 375         1530 $self->prop({
289             name => global =>
290             value => JE::Boolean->new($global, $flags =~ y/g//d),
291             dontenum => 1,
292             readonly => 1,
293             dontdel => 1,
294             });
295              
296             # $flags = desurrogify $flags;
297             # Not necessary, until Perl adds a /𐐢 modifier (not likely)
298              
299             # I'm not supporting /s (at least not for now)
300 11     11   64 no warnings 'syntax'; # so syntax errors in the eval are kept quiet
  11         20  
  11         493  
301 375 100 66     26086 $flags =~ /^((?:(?!s)[\$_\p{ID_Continue}])*)\z/ and eval "qr//$1"
302             or die new JE::Object::Error::SyntaxError $global,
303             add_line_number "Invalid regexp modifiers: '$flags'";
304              
305 373         1376 my $m = $flags =~ /m/;
306 373         1397 $self->prop({
307             name => ignoreCase =>
308             value => JE::Boolean->new($global, $flags =~ /i/),
309             dontenum => 1,
310             readonly => 1,
311             dontdel => 1,
312             });
313 373         1245 $self->prop({
314             name => multiline =>
315             value => JE::Boolean->new($global, $m),
316             dontenum => 1,
317             readonly => 1,
318             dontdel => 1,
319             });
320              
321              
322             # Now we'll deal with the pattern itself.
323              
324             # Save it before we go and mangle it
325 373         1390 $self->prop({
326             name => source =>
327             # ~~~ Can we use ->_new here?
328             value => JE::String->new($global, $re),
329             dontenum => 1,
330             readonly => 1,
331             dontdel => 1,
332             });
333              
334 373 50       906 unless (defined $qr) { # processing begins here
335              
336             # This horrific piece of code converts an ECMAScript regular
337             # expression into a Perl one, more or less.
338              
339             # Since Perl sometimes fills in $1, etc., where they are supposed
340             # to be undefined in ECMAScript, we use embedded code snippets to
341             # put the values into @Match[1..whatever] instead.
342              
343             # The cases we have to take into account are
344             # 1) quantified captures; i.e., (...)+ or (?:()?)+ ; and
345             # 2) captures within interrobang groups: (?!())
346              
347             # The solution is to mark captures as erasure candidates with the
348             # @EraseCapture array.
349              
350             # To solve case 1, we have to put (?{}) markers at the begin-
351             # ning of each grouping construct that has captures in it,
352             # and a quantifier within each pair of capturing parenthe-
353             # ses before the closing paren. (?:(a+)?b)+ will become
354             # (?: (?{...}) ( a+ (?{...}) )? b )+ (spaced out for reada-
355             # bility). The first code interpolation sets $EraseCapture[n]
356             # to 1 for all the captures within that group. The sec-
357             # ond code interpolation will only be triggered if the a+
358             # matches, and there we set $EraseCapture[n] to 0. It’s actu-
359             # ally slightly more complicated than that, because we may
360             # have alternatives directly inside the outer grouping; e.g.,
361             # (?:a|(b))+, so we have to wrap the contents thereof within
362             # (?:), making ‘(?:(?{...})(?:a|(b(?{...}))))+’. Whew!
363              
364             # For case 2 we change (?!...) to (?:(?!...)(?{...})). The embedded
365             # code marks the captures inside (?!) for erasure. The (?: is
366             # needed because the (?!) might be quantified. (We used not to add
367             # the extra (?:), but put the (?{}) at the end of the innermost
368             # enclosing group, but that causes the same \1 problem men-
369             # tioned above.
370              
371             use constant 1.03 # multiple
372             { # Make sure any changes to these constants are also
373             # made at the end
374             # of the subroutine
375             # array indices within each item on the @stack:
376 11         18661 posi => 0, # position within $new_re where the current
377             # group’s contents start, or before the opening
378             # paren for interrobang groups
379             type => 1, # type of group; see constants below
380             xmod => 2, # whether /x mode is active
381             capn => 3, # array ref of capture numbers within this group
382              
383             # types of parens:
384             reg => 0, cap => 1, itrb => 2, brch => 3, cond => 4
385 11     11   9747 };
  11         271  
386              
387 373         431 my $new_re = '';
388 373         332 my $sub_pat;
389 373         822 my @stack = [0,0,$flags =~ /x/];
390 373         397 my $capture_num; # number of the most recently started capture
391             my @capture_nums; # numbers of the captures we’re inside
392             #my $warn;
393             #++$warn if $re eq '(?p{})';
394             {
395 373 100       325 @stack or die new JE::Object::Error::SyntaxError $global,
  1214         2002  
396             add_line_number "Unmatched ) in regexp";
397              
398             # no parens or char classes:
399 1213 50 33     13707 if( $stack[-1][xmod]
    100 66        
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    50          
400             ? $stack[-1][type] == cond || $stack[-1][type] == brch
401             ? $re =~ s/$plain_regexp_x_mode_wo_pipe//
402             : $re =~ s/$plain_regexp_x_mode//
403             : $stack[-1][type] == cond || $stack[-1][type] == brch
404             ? $re =~ s/$plain_regexp_wo_pipe//
405             : $re =~ s/$plain_regexp//
406             ) {
407 566         2546 ($sub_pat = $1) =~
408             s/
409             ([\^\$])
410             |
411             (\.|\\[bBvnrdDsSwW])
412             |
413             \\u([A-Fa-f0-9]{4})
414             |
415             \\([1-9][0-9]*)
416             |
417             \\?([\x{d800}-\x{dfff}])
418             |
419             (\\(?:[^c]|c.))
420             /
421 440 100       2571 defined $1
    100          
    100          
    100          
    100          
    100          
    100          
    100          
422             ? $1 eq '^'
423             ? $m
424             ? '(?:\A|(?<=[\cm\cj\x{2028}\x{2029}]))'
425             : '^'
426             : $m
427             ? '(?:\z|(?=[\cm\cj\x{2028}\x{2029}]))'
428             : '\z'
429             : defined $2 ? $_patterns{$2} :
430             defined $3 ? "\\x{$3}" :
431             defined $4 ? "(?(?{defined\$$4&&"
432             ."!\$EraseCapture[$4]})\\$4)" :
433             # work around a bug in perl:
434             defined $5 ? sprintf '\\x{%x}', ord $5 :
435             $6
436             /egxs;
437 566         787 $new_re .= $sub_pat;
438             }
439              
440             # char class:
441             elsif($re=~s/^\[([^]\\]*(?:\\.[^]\\]*)*)]//s){
442 114 100       409 if($1 eq '') {
    100          
443 2         4 $new_re .= '(?!)';
444             }
445             elsif($1 eq '^') {
446 11         17 $new_re .= '(?s:.)';
447             }
448             else {
449 101         125 my @full_classes;
450 101         505 ($sub_pat = $1) =~ s/
451             (\\[vnr])
452             |
453             (-?)(\\[dsw])(-?)
454             |
455             (\\[DSW])
456             |
457             \\u([A-Fa-f0-9]{4})
458             |
459             \\?([\x{d800}-\x{dfff}])
460             |
461             (\\(?:[^c]|c.))
462             /
463 90 50       571 defined $1 ? $_class_patterns{$1} :
    100          
    100          
    100          
    100          
    100          
    100          
464             defined $3 ?
465             ($2 ? '\-' : '')
466             .$_class_patterns{$3}
467             .($4 ? '\-' : '') :
468             defined $5 ? ((push @full_classes,
469             $_patterns{$5}),'') :
470             defined $6 ? "\\x{$6}" :
471             # work around a bug in perl:
472             defined $7 ? sprintf '\\x{%x}', ord $7 :
473             $8
474             /egxs;
475              
476 101 100       463 $new_re .= length $sub_pat
    100          
    100          
477             ? @full_classes
478             ? '(?:' .
479             join('|', @full_classes,
480             "[$sub_pat]")
481             . ')'
482             : "[$sub_pat]"
483             : @full_classes == 1
484             ? $full_classes[0]
485             : '(?:' . join('|', @full_classes) .
486             ')';
487             }
488             }
489              
490             # (?mods) construct (no colon) :
491             elsif( $stack[-1][xmod]
492             ? $re =~ s/^(\(\s*\?([\w]*)(?:-([\w]*))?\))//
493             : $re =~ s/^(\( \?([\w]*)(?:-([\w]*))?\))//x
494             ) {
495 1         3 $new_re .= $1;
496 1 50 33     18 defined $3 && index($3,'x')+1
      0        
497             ? $stack[-1][xmod]=0
498             : $2 =~ /x/ && ++$stack[-1][xmod];
499             }
500              
501             # start of grouping construct:
502             elsif( $stack[-1][xmod]
503             ? $re=~s/^(\((?:\s*\?([\w-]*:|[^:{?
504             : $re=~s/^(\((?: \?([\w-]*:|[^:{?
505             ) {
506             # warn "$new_re-$1-$2-$3-$re" if $warn;
507 261 50       545 $3 and die JE'Object'Error'SyntaxError->new(
508             $global, add_line_number
509             "Embedded code in regexps is not "
510             . "supported"
511             );
512 261         337 my $pos_b4_parn = length $new_re;
513 261         359 $new_re .= $1;
514 261         313 my $caq = $2; # char(s) after question mark
515 261         205 my @current;
516 261 100       372 if(defined $caq) { # (?...) patterns
517 62 100       219 if($caq eq '(') {
    50          
518 4         18 $re =~ s/^([^)]*\))//;
519 4         7 $new_re .= $1;
520 4 50       17 $1 =~ /^\?[?p]?\{/ && die
521             JE'Object'Error'SyntaxError->new(
522             $global, add_line_number
523             "Embedded code in regexps is not "
524             . "supported"
525             );
526 4         5 $current[type] = cond;
527             }
528             elsif($caq =~ /^[<'P](?![!=])/) {
529 0         0 ++$capture_num;
530 0 0       0 $caq eq "'" ? $re =~ s/^(.*?')//
531             : $re =~ s/^(.*?>)//;
532 0         0 $new_re .= $1;
533 0         0 $current[type] = reg;
534             }
535             else {
536 58         133 $current[type] = (reg,itrb)[$caq eq '!'];
537             }
538 62 100       157 $current[posi] = $caq eq '!' ? $pos_b4_parn
539             : length $new_re;
540             }else{ # capture
541 199         183 ++$capture_num;
542 199         201 push @capture_nums, $capture_num;
543 199         296 push @{$$_[capn]}, $capture_num for @stack;
  241         541  
544 199         346 $current[posi] = length $new_re;
545 199         210 $current[type] = cap;
546             }
547 261         281 $current[xmod] = $stack[-1][xmod];
548 261         398 push @stack, \@current;
549             }
550              
551             # closing paren:
552             elsif($re =~ s/^\)//) {
553 263         229 my @commands;
554 263         253 my $cur = $stack[-1];
555 263 100       420 if($$cur[type] != itrb) {
  26         38  
556 237 100       409 if($$cur[type] == cap) {
557             # we are exiting a capturing group
558 199         465 $new_re .= "(?{local" .
559             "\$EraseCapture[$capture_nums[-1]]=0"
560             ."})";
561 199         246 pop @capture_nums;
562             }
563 237 100 100     526 if($$cur[capn] && @{$$cur[capn]} &&
  29   100     168  
564             $re =~ /^[+{*?]/) { # quantified group
565 13         31 substr $new_re,$$cur[posi],0 =>=
566             _capture_erasure_stuff($$cur[capn])
567             . "(?:";
568 13         17 $new_re .= ")";
569             }
570 237         320 $new_re .= ')';
571             }
572             else {{ # ?!
573 26         30 $new_re .= ')';
574 26 100 66     109 last unless($$cur[capn] && @{$$cur[capn]});
  7         22  
575              
576             # change (?!...) to (?!...)(?{...})
577 7         21 $new_re .= _capture_erasure_stuff(
578             $$cur[capn]
579             );
580              
581             # wrap (?!)(?{}) in (?:) if necessary
582 7 100       30 $re =~ /^[+{*?]/ and
583             substr $new_re,$$cur[posi],0
584             =>= '(?:',
585             $new_re .= ')';
586             }}
587 263         346 pop @stack;
588             }
589              
590             # pipe within (?()|) or (?|) (the latter doesn’t work yet):
591             elsif($re =~ s/^\|//) {
592 2         3 my $cur = $stack[-1];
593 2 100 66     9 if($$cur[capn] && @{$$cur[capn]}
  1         5  
594             #&& $re =~ /^[+{*?]/ # We can’t actually tell
595             ) { # at this point whether the enclosing
596             # group is quantified. Does anyone have any ideas?
597 1         3 substr $new_re,$$cur[posi],0 =>=
598             _capture_erasure_stuff(
599             $$cur[capn]
600             );
601 1         2 @{$$cur[capn]} = ();
  1         3  
602             }
603 2         4 $new_re .= '|';
604 2         4 $$cur[posi] = length $new_re;
605             }
606              
607             # something invalid left over:
608             elsif($re) {
609             #warn $re;
610 0 0       0 die JE::Object::Error::SyntaxError->new($global,
611             add_line_number
612             $re =~ /^\[/
613             ? "Unterminated character class $re in regexp"
614             : 'Trailing \ in regexp');
615             }
616 1213 100       2718 length $re and redo;
617             }
618 372 100       724 @stack or die new JE::Object::Error::SyntaxError $global,
619             add_line_number "Unmatched ) in regexp";
620              
621 371 100       972 aardvark_bug && $new_re =~ /\(\?=/
622             and substr $new_re,0,0, = '(??{""})';
623              
624             #warn $new_re;
625 371 100       527 $qr = eval {
626 11     11   69 use re 'eval'; no warnings 'regexp'; no strict;
  11     11   15  
  11     11   460  
  11         47  
  11         18  
  11         308  
  11         46  
  11         14  
  11         2889  
627              
628             # The warnings pragma doesn’t make it into the re-eval, so
629             # we have to localise $^W, in case the string contains
630             # @EraseCapture[1]=(1)x1 and someone is using -w.
631 371         973 local $^W;
632              
633             # We have to put (?:) around $new_re in the first case,
634             # because it may contain a top-level disjunction, but
635             # not in the second, because the array modifica-
636 371 100       23021 $capture_num # tions in $clear_captures are not localised.
637             ? qr/(?$flags:$clear_captures(?:$new_re)$save_captures)/
638             : qr/(?$flags:$clear_captures$new_re)/
639             } or $@ =~ s/\.?$ \n//x,
640             die JE::Object::Error::SyntaxError->new($global,
641             add_line_number $@);
642              
643             } # end of pattern processing
644              
645 369         4053 $$$self{value} = $qr;
646              
647 369         1472 $self->prop({
648             name => lastIndex =>
649             value => JE::Number->new($global, 0),
650             dontdel => 1,
651             dontenum => 1,
652             });
653              
654 369         1912 $self;
655             }
656             BEGIN {
657 11     11   53 no strict;
  11         15  
  11         502  
658 11     11   21 delete @{__PACKAGE__.'::'}{qw[posi type xmod capn reg cap itrb brch cond]}
  11         5241  
659             }
660              
661              
662              
663             =item value
664              
665             Returns a Perl C regular expression.
666              
667             If the regular expression
668             or the string that is being matched against it contains characters outside
669             the Basic Multilingual Plane (whose character codes exceed 0xffff), the
670             behavior is undefined--for now at least. I still need to solve the problem
671             caused by JS's unintuitive use of raw surrogates. (In JS, C will
672             match a
673             surrogate pair, which is considered to be one character in Perl. This means
674             that the same regexp matched against the same string will produce different
675             results in Perl and JS.)
676              
677             =cut
678              
679             sub value {
680 196     196 1 3014 $${$_[0]}{value};
  196         907  
681             }
682              
683              
684              
685              
686             =item class
687              
688             Returns the string 'RegExp'.
689              
690             =cut
691              
692 346     346 1 1124 sub class { 'RegExp' }
693              
694              
695             sub call {
696 170     170 0 218 my ($self,$str) = @_;
697              
698 170 50       272 die JE::Object::Error::TypeError->new(
699             $self->global, add_line_number
700             "Argument to exec is not a " .
701             "RegExp object"
702             ) unless $self->class eq 'RegExp';
703              
704 170         175 my $je_str;
705 170 50       263 if (defined $str) {
706 170         390 $str =
707             ($je_str=$str->to_string)->value16;
708             }
709             else {
710 0         0 $str = 'undefined';
711             }
712              
713 170         204 my(@ary,$indx);
714 170         266 my $global = $$$self{global};
715              
716 170         379 my $g = $self->prop('global')->value;
717 170 50       287 if ($g) {
718 0         0 my $pos =
719             $self->prop('lastIndex')
720             ->to_number->value;
721 0 0 0     0 $pos < 0 || $pos > length $str
      0        
722             ||
723             (
724             pos $str = $pos,
725             $str !~ /$$$self{value}/g
726             )
727             and goto phail;
728              
729 0         0 @ary = @Match;
730 0         0 $ary[0] = substr($str, $-[0],
731             $+[0] - $-[0]);
732 0         0 $indx = $-[0];
733              
734 0         0 $self->prop(lastIndex =>
735             JE::Number->new(
736             $global,
737             pos $str
738             ));
739 0         0 $global->prototype_for('RegExp')
740             ->prop('constructor')
741             ->capture_re_vars($str);
742             }
743             else {
744 170 100       1682 $str =~ /$$$self{value}/
745             or goto phail;
746              
747 149         282 @ary = @Match;
748 149         755 $ary[0] = substr($str, $-[0],
749             $+[0] - $-[0]);
750 149         307 $indx = $-[0];
751 149         647 $global->prototype_for('RegExp')
752             ->prop('constructor')
753             ->capture_re_vars($str);
754             }
755            
756 149         619 my $ary = JE::Object::Array->new(
757             $global,
758             \@ary
759             );
760 149         508 $ary->prop(index =>
761             JE::Number->new($global,$indx));
762 149 50       451 $ary->prop(input => defined $je_str
763             ? $je_str :
764             JE::String->_new(
765             $global, $str
766             ));
767            
768 149         591 return $ary;
769              
770 21         75 phail:
771             $self->prop(lastIndex =>
772             JE::Number->new(
773             $global,
774             0
775             ));
776 21         66 return $global->null;
777             }
778              
779 0     0 0 0 sub apply { splice @'_, 1, 1; goto &call }
  0         0  
780              
781             @JE::Object::Function::RegExpConstructor::ISA = 'JE::Object::Function';
782             sub JE::Object::Function::RegExpConstructor::capture_re_vars {
783 242     242   267 my $self = shift;
784 242         350 my $global = $$$self{global};
785 242         1388 $self->prop(
786             'lastMatch',
787             JE::String->new($global, substr $_[0], $-[0], $+[0]-$-[0])
788             );
789             {
790 11     11   58 no warnings 'uninitialized';
  11         19  
  11         953  
  242         385  
791 242         884 $self->prop('lastParen', new JE::String $global, "$+")
792             }
793             $self->prop(
794 242         959 'leftContext',
795             new JE'String $global, substr $_[0], 0, $-[0]
796             );
797 242         903 $self->prop('rightContext', new JE'String $global, substr $_[0], $+[0]);
798 11     11   93 no warnings 'uninitialized';
  11         19  
  11         7391  
799 242         1142 $self->prop("\$$_", new JE'String $global, "$Match[$_]") for 1..9;
800             }
801             sub new_constructor {
802 13     13 0 25 my($package,$global) = @_;
803             my $f = JE::Object::Function::RegExpConstructor->new({
804             name => 'RegExp',
805             scope => $global,
806             argnames => [qw/pattern flags/],
807             function => sub {
808 20     20   30 my (undef, $re, $flags) = @_;
809 20 0 0     63 if ($re->class eq 'RegExp' and !defined $flags
      33        
810             || $flags->id eq 'undef') {
811 0         0 return $re
812             }
813 20         49 unshift @_, __PACKAGE__;
814 20         61 goto &new;
815             },
816             function_args => ['scope','args'],
817             constructor => sub {
818 3     3   9 unshift @_, $package;
819 3         10 goto &new;
820             },
821 13         229 constructor_args => ['scope','args'],
822             });
823              
824 13         72 my $proto = $f->prop({
825             name => 'prototype',
826             dontenum => 1,
827             readonly => 1,
828             });
829 13         49 $global->prototype_for('RegExp', $proto);
830              
831             $f->prop({
832             name => '$&',
833             dontdel => 1,
834 3     3   9 fetch => sub { shift->prop('lastMatch') },
835 1     1   4 store => sub { shift->prop('lastMatch', shift) },
836 13         93 });
837             $f->prop({
838             name => '$`',
839             dontdel => 1,
840 4     4   12 fetch => sub { shift->prop('leftContext') },
841 2     2   5 store => sub { shift->prop('leftContext', shift) },
842 13         83 });
843             $f->prop({
844             name => '$\'',
845             dontdel => 1,
846 3     3   8 fetch => sub { shift->prop('rightContext') },
847 1     1   4 store => sub { shift->prop('rightContext', shift) },
848 13         92 });
849             $f->prop({
850             name => '$+',
851             dontdel => 1,
852 3     3   8 fetch => sub { shift->prop('lastParen') },
853 1     1   5 store => sub { shift->prop('lastParen', shift) },
854 13         85 });
855 13         77 my $empty = JE::String->new($global,"");
856 13         104 for(
857             qw(lastParen lastMatch leftContext rightContext),
858             map "\$$_", 1..9
859             ) {
860 169         419 $f->prop({ name => $_, dontdel => 1, value => $empty});
861             }
862            
863             $proto->prop({
864 13         118 name => 'exec',
865             value => JE::Object::Function->new({
866             scope => $global,
867             name => 'exec',
868             argnames => ['string'],
869             no_proto => 1,
870             function_args => ['this','args'],
871             function => \&call,
872             }),
873             dontenum => 1,
874             });
875              
876             $proto->prop({
877             name => 'test',
878             value => JE::Object::Function->new({
879             scope => $global,
880             name => 'test',
881             argnames => ['string'],
882             no_proto => 1,
883             function_args => ['this','args'],
884             function => sub {
885 14     14   16 my ($self,$str) = @_;
886 14 50       35 die JE::Object::Error::TypeError->new(
887             $global, add_line_number
888             "Argument to test is not a " .
889             "RegExp object"
890             ) unless $self->class eq 'RegExp';
891 14         38 my $ret = call($self,$str);
892 14         40 JE::Boolean->new(
893             $global, $ret->id ne 'null'
894             );
895             },
896 13         124 }),
897             dontenum => 1,
898             });
899              
900             $proto->prop({
901             name => 'toString',
902             value => JE::Object::Function->new({
903             scope => $global,
904             name => 'toString',
905             no_proto => 1,
906             function_args => ['this'],
907             function => sub {
908 1     1   2 my ($self,) = @_;
909 1 50       4 die JE::Object::Error::TypeError->new(
910             $global, add_line_number
911             "Argument to toString is not a " .
912             "RegExp object"
913             ) unless $self->class eq 'RegExp';
914 1         3 JE::String->_new(
915             $global,
916             "/" . $self->prop('source')->value
917             . "/$$$self{regexp_flags}"
918             );
919             },
920 13         111 }),
921             dontenum => 1,
922             });
923              
924              
925 13         119 $f;
926             }
927              
928              
929             =back
930              
931             =head1 SEE ALSO
932              
933             =over 4
934              
935             =item JE
936              
937             =item JE::Types
938              
939             =item JE::Object
940              
941             =back
942              
943             =cut
944              
945