File Coverage

blib/lib/JE/Object/RegExp.pm
Criterion Covered Total %
statement 216 240 90.0
branch 119 150 79.3
condition 20 51 39.2
subroutine 35 36 97.2
pod 3 6 50.0
total 393 483 81.3


line stmt bran cond sub pod time code
1             package JE::Object::RegExp;
2              
3             our $VERSION = '0.066';
4              
5              
6 11     11   5041 use strict;
  11         19  
  11         440  
7 11     11   51 use warnings; no warnings 'utf8';
  11     11   19  
  11         353  
  11         42  
  11         16  
  11         496  
8              
9 11         90 use overload fallback => 1,
10 11     11   52 '""'=> 'value';
  11         16  
11              
12             # This constant is true if we need to work around perl bug #122460 to keep
13             # the ‘aardvark’ tests (in t/15.05-string-objects.t) passing. This only
14             # applies to perl 5.20.0. (perl 5.20.1 includes a fix.) Basic-
15             # ally, (?=...) can result in buggy optimisations that cause a faulty
16             # rejection of the match at some locations, because it is assumed that it
17             # cannot match in some spots.
18             use constant aardvark_bug =>
19             # This test should match the empty string. If it advances (pos returns
20             # true), then we have the bug.
21 11     11   1490 do { my $a = "rdvark"; $a =~ /(?{})(?=.)a*?/g; pos $a };
  11         20  
  11         19  
  11         18  
  11         104  
  11         737  
22              
23 11     11   65 use Scalar::Util 'blessed';
  11         27  
  11         2972  
24              
25             our @ISA = 'JE::Object';
26              
27             require JE::Boolean;
28             require JE::Code;
29             require JE::Object;
30             require JE::String;
31              
32             import JE::Code 'add_line_number';
33             sub add_line_number;
34              
35             our @Match;
36             our @EraseCapture;
37              
38             #import JE::String 'desurrogify';
39             #sub desurrogify($);
40             # Only need to turn these on when Perl starts adding regexp modifiers
41             # outside the BMP.
42              
43             # JS regexp features that Perl doesn't have, or which differ from Perl's,
44             # along with their Perl equivalents
45             # ^ with /m \A|(?<=[\cm\cj\x{2028}\x{2029}]) (^ with the /m modifier
46             # matches whenever a Unicode
47             # line break (not just \n)
48             # precedes the current position,
49             # even at the end of the string. In
50             # Perl, /^/m matches \A|(?<=\n)(?!\z) .)
51             # $ \z
52             # $ with /m (?:\z|(?=[\cm\cj\x{2028}\x{2029}]))
53             # \b (?:(?<=$w)(?!$w)|(?
54             # \B (?:(?<=$w)(?=$w)|(?
55             # doesn't include non-ASCII
56             # word chars in \w)
57             # . [^\cm\cj\x{2028}\x{2029}]
58             # \v \cK
59             # \n \cj (whether \n matches \cj in Perl is system-dependent)
60             # \r \cm
61             # \uHHHH \x{HHHH}
62             # \d [0-9]
63             # \D [^0-9]
64             # \s [\p{Zs}\s\ck]
65             # \S [^\p{Zs}\s\ck]
66             # \w [A-Za-z0-9_]
67             # \W [^A-Za-z0-9_]
68             # [^] (?s:.)
69             # [] (?!)
70              
71             # Other differences
72             #
73             # A quantifier in a JS regexp will, when repeated, clear all values cap-
74             # tured by capturing parentheses in the term that it quantifies. This means
75             # that /((a)?b)+/, when matched against "abb" will leave $2 undefined, even
76             # though the second () matched "a" the first time the first () matched.
77             # (The ECMAScript spec says to do it this way, but Safari leaves $2 with
78             # "a" in it and doesn't clear it on the second iteration of the '+'.) Perl
79             # does it both ways, and the rules aren't quite clear to me:
80             #
81             # $ perl5.8.8 -le '$, = ",";print "abb" =~ /((a)?b)+/;'
82             # b,
83             # $ perl5.8.8 -le '$, = ",";print "abb" =~ /((a+)?b)+/;'
84             # b,a
85             #
86             # perl5.9.4 produces the same. perl5.002_01 crashes quite nicely.
87             #
88             #
89             # In ECMAScript, when the pattern inside a (?! ... ) fails (in which case
90             # the (?!) succeeds), values captured by parentheses within the negative
91             # lookahead are cleared, such that subsequent backreferences *outside* the
92             # lookahead are equivalent to (?:) (zero-width always-match assertion). In
93             # Perl, the captured values are left as they are when the pattern inside
94             # the lookahead fails:
95             #
96             # $ perl5.8.8 -le 'print "a" =~ /(?!(a)b)a/;'
97             # a
98             # $ perl5.9.4 -le 'print "a" =~ /(?!(a)b)a/;'
99             # a
100             #
101             #
102             # In ECMAScript, as in Perl, a pair of capturing parentheses will produce
103             # the undefined value if the parens were not part of the final match.
104             # Undefined will still be produced if there is a \digit backreference
105             # reference to those parens. In ECMAScript, such a back-reference is equiv-
106             # alent to (?:); in Perl it is equivalent to (?!). Therefore, ECMAScript’s
107             # \1 is equivalent to Perl’s (?(1)\1). (It would seem, upon testing
108             # /(?:|())/ vs. /(?:|())\1/ in perl, that the \1 back-reference always suc-
109             # ceeds, and ends up setting $1 to "" [as opposed to undef]. What is actu-
110             # ally happening is that the failed \1 causes backtracking, so the second
111             # alternative in (?:|()) matches, setting $1 to the empty string. Safari,
112             # incidentally, does what Perl *appears* to do at first glance, *if* the
113             # backreference itself is within capturing parentheses (as in
114             # /(?:|())(\1)/).
115             #
116             # These issues are solved with embedded code snippets, as explained below,
117             # where the actual code is.
118             #
119             #
120             # In ECMAScript, case-folding inside the regular expression engine is not
121             # allowed to change the length of a string. Therefore, "ß" never matches
122             # /ss/i, and vice versa. I’m disinclined to be ECMAScript compliant in this
123             # regard though, because it would affect performance. The inefficient solu-
124             # tion I have in mind is to change /x/i to /(?-i:x)/ for every character
125             # that has a multi-character uppercase equivalent; and to change /xx/i to
126             # /(?-i:[Xx][Xx])/ where xx represents a multi-character sequence that
127             # could match a single character in Perl. The latter is the main problem.
128             # How are we to find out which character sequences need this? We could
129             # change /x/i to /[xX]/ for every literal character in the string, but how
130             # would we take /Σ/ -> /[Σσς]/ into account? And does perl’s regexp engine
131             # slow down if we feed it a ton of character classes instead of literal
132             # text? (Need to do some benchmarks.) (If we do fix this, we need to re-
133             # enable the skipped tests.)
134              
135              
136              
137             =head1 NAME
138              
139             JE::Object::RegExp - JavaScript regular expression (RegExp object) class
140              
141             =head1 SYNOPSIS
142              
143             use JE;
144             use JE::Object::RegExp;
145              
146             $j = new JE;
147              
148             $js_regexp = new JE::Object::RegExp $j, "(.*)", 'ims';
149              
150             $perl_qr = $js_regexp->value;
151              
152             $some_string =~ $js_regexp; # You can use it as a qr//
153              
154             =head1 DESCRIPTION
155              
156             This class implements JavaScript regular expressions for JE.
157              
158             See L for a description of most of the interface. Only what
159             is specific to JE::Object::RegExp is explained here.
160              
161             A RegExp object will stringify the same way as a C, so that you can
162             use C<=~> on it. This is different from the return value of the
163             C method (the way it stringifies in JS).
164              
165             Since JE's regular expressions use Perl's engine underneath, the
166             features that Perl provides that are not part of the ECMAScript spec are
167             supported, except for C<(?s)>
168             and C<(?m)>, which don't do anything, and C<(?|...)>, which is
169             unpredictable.
170              
171             In versions prior to 0.042, a hyphen adjacent to C<\d>, C<\s> or C<\w> in a
172             character class would be unpredictable (sometimes a syntax error). Now it
173             is interpreted literally. This matches what most implementations do, which
174             happens to be the same as Perl's behaviour. (It is a syntax error
175             in ECMAScript.)
176              
177             =head1 METHODS
178              
179             =over 4
180              
181             =cut
182              
183             # ~~~ How should surrogates work??? To make regexps work with JS strings
184             # properly, we need to use the surrogified string so that /../ will
185             # correctly match two surrogates. In this case it won't work properly
186             # with Perl strings, so what is the point of Perl-style stringification?
187             # Perhaps we should allow this anyway, but warn about code points outside
188             # the BMP in the documentation. (Should we also produce a Perl warning?
189             # Though I'm not that it's possible to catch this: "\x{10000}" =~ $re).
190             #
191             # But it would be nice if this would work:
192             # $j->eval("'\x{10000}'") =~ $j->eval('/../')
193             # ~~~ We might be able to make this work with perl 5.12’s qr overloading.
194              
195             our %_patterns = qw/
196             \b (?:(?<=[A-Za-z0-9_])(?![A-Za-z0-9_])|(?
197             \B (?:(?<=[A-Za-z0-9_])(?=[A-Za-z0-9_])|(?
198             . [^\cm\cj\x{2028}\x{2029}]
199             \v \cK
200             \n \cj
201             \r \cm
202             \d [0-9]
203             \D [^0-9]
204             \s [\p{Zs}\s\ck]
205             \S [^\p{Zs}\s\ck]
206             \w [A-Za-z0-9_]
207             \W [^A-Za-z0-9_]
208             /;
209              
210             our %_class_patterns = qw/
211             \v \cK
212             \n \cj
213             \r \cm
214             \d 0-9
215             \s \p{Zs}\s\ck
216             \w A-Za-z0-9_
217             /;
218              
219             my $clear_captures = qr/(?{@Match=@EraseCapture=()})/;
220 11     11   65 my $save_captures = do { no strict 'refs';
  11         21  
  11         7427  
221             qr/(?{$Match[$_]=$EraseCapture[$_]?undef:$$_ for 1..$#+})/; };
222             # These are pretty scary, aren’t they?
223             my $plain_regexp =
224             qr/^((?:[^\\[()]|\\.|\((?:\?#|\*)[^)]*\))[^\\[()]*(?:(?:\\.|\((?:\?#|\*)[^)]*\))[^\\[()]*)*)/s;
225             my $plain_regexp_x_mode =
226             qr/^((?:[^\\[()]|\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()]*(?:(?:\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()]*)*)/s;
227             my $plain_regexp_wo_pipe =
228             qr/^((?:[^\\[()|]|\\.|\((?:\?#|\*)[^)]*\))[^\\[()|]*(?:(?:\\.|\((?:\?#|\*)[^)]*\))[^\\[()|]*)*)/s;
229             my $plain_regexp_x_mode_wo_pipe =
230             qr/^((?:[^\\[()|]|\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()|]*(?:(?:\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()|]*)*)/s;
231              
232             sub _capture_erasure_stuff {
233 21         100 "(?{local\@EraseCapture[" . join(',',@{$_[0]}) . "]=(1)x"
  21         115  
234 21     21   34 . @{$_[0]} . '})'
235             }
236              
237             sub new {
238 375     375 1 1154 my ($class, $global, $re, $flags) = @_;
239 375   33     1527 my $self = $class->SUPER::new($global, {
240             prototype => $global->prototype_for('RegExp')
241             || $global->prop('RegExp')->prop('prototype')
242             });
243              
244 375         882 my $qr;
245              
246 375 100       1086 if(defined blessed $re) {
247 56 50 66     641 if ($re->isa(__PACKAGE__)) {
    100          
    50          
248 0 0 0     0 defined $flags && eval{$flags->id} ne 'undef' and
  0         0  
249             die JE::Object::Error::TypeError->new(
250             $global, add_line_number
251             'Second argument to ' .
252             'RegExp() must be undefined if ' .
253             'first arg is a RegExp');
254 0         0 $flags = $$$re{regexp_flags};
255 0         0 $qr = $$$re{value};
256 0         0 $re = $re->prop('source')->[0];
257             }
258             elsif(can $re 'id' and $re->id eq 'undef') {
259 2         8 $re = '';
260             }
261             elsif(can $re 'to_string') {
262 54         163 $re = $re->to_string->value16;
263             }
264             }
265             else {
266 319 100       713 defined $re or $re = '';
267             }
268              
269 375 50       943 if(defined blessed $flags) {
270 0 0 0     0 if(can $flags 'id' and $flags->id eq 'undef') {
    0          
271 0         0 $flags = '';
272             }
273             elsif(can $flags 'to_string') {
274 0         0 $flags = $flags->to_string->value;
275             }
276             }
277             else {
278 375 100       816 defined $flags or $flags = '';
279             }
280              
281              
282             # Let's begin by processing the flags:
283              
284             # Save the flags before we start mangling them
285 375         1551 $$$self{regexp_flags} = $flags;
286              
287 375         1709 $self->prop({
288             name => global =>
289             value => JE::Boolean->new($global, $flags =~ y/g//d),
290             dontenum => 1,
291             readonly => 1,
292             dontdel => 1,
293             });
294              
295             # $flags = desurrogify $flags;
296             # Not necessary, until Perl adds a /𐐢 modifier (not likely)
297              
298             # I'm not supporting /s (at least not for now)
299 11     11   79 no warnings 'syntax'; # so syntax errors in the eval are kept quiet
  11         22  
  11         700  
300 375 100 66     28015 $flags =~ /^((?:(?!s)[\$_\p{ID_Continue}])*)\z/ and eval "qr//$1"
301             or die new JE::Object::Error::SyntaxError $global,
302             add_line_number "Invalid regexp modifiers: '$flags'";
303              
304 373         1419 my $m = $flags =~ /m/;
305 373         1467 $self->prop({
306             name => ignoreCase =>
307             value => JE::Boolean->new($global, $flags =~ /i/),
308             dontenum => 1,
309             readonly => 1,
310             dontdel => 1,
311             });
312 373         1403 $self->prop({
313             name => multiline =>
314             value => JE::Boolean->new($global, $m),
315             dontenum => 1,
316             readonly => 1,
317             dontdel => 1,
318             });
319              
320              
321             # Now we'll deal with the pattern itself.
322              
323             # Save it before we go and mangle it
324             $self->prop({
325             name => source =>
326             # ~~~ Can we use ->_new here?
327 373         770 value => JE::String->new($global, do {
328 373         1643 (my $tmp = $re) =~
329             s<(\\.)|/>
330 376 100       1379 egg;
331 373         1584 $tmp
332             }),
333             dontenum => 1,
334             readonly => 1,
335             dontdel => 1,
336             });
337              
338 373 50       1092 unless (defined $qr) { # processing begins here
339              
340             # This horrific piece of code converts an ECMAScript regular
341             # expression into a Perl one, more or less.
342              
343             # Since Perl sometimes fills in $1, etc., where they are supposed
344             # to be undefined in ECMAScript, we use embedded code snippets to
345             # put the values into @Match[1..whatever] instead.
346              
347             # The cases we have to take into account are
348             # 1) quantified captures; i.e., (...)+ or (?:()?)+ ; and
349             # 2) captures within interrobang groups: (?!())
350              
351             # The solution is to mark captures as erasure candidates with the
352             # @EraseCapture array.
353              
354             # To solve case 1, we have to put (?{}) markers at the begin-
355             # ning of each grouping construct that has captures in it,
356             # and a quantifier within each pair of capturing parenthe-
357             # ses before the closing paren. (?:(a+)?b)+ will become
358             # (?: (?{...}) ( a+ (?{...}) )? b )+ (spaced out for reada-
359             # bility). The first code interpolation sets $EraseCapture[n]
360             # to 1 for all the captures within that group. The sec-
361             # ond code interpolation will only be triggered if the a+
362             # matches, and there we set $EraseCapture[n] to 0. It’s actu-
363             # ally slightly more complicated than that, because we may
364             # have alternatives directly inside the outer grouping; e.g.,
365             # (?:a|(b))+, so we have to wrap the contents thereof within
366             # (?:), making ‘(?:(?{...})(?:a|(b(?{...}))))+’. Whew!
367              
368             # For case 2 we change (?!...) to (?:(?!...)(?{...})). The embedded
369             # code marks the captures inside (?!) for erasure. The (?: is
370             # needed because the (?!) might be quantified. (We used not to add
371             # the extra (?:), but put the (?{}) at the end of the innermost
372             # enclosing group, but that causes the same \1 problem men-
373             # tioned above.
374              
375             use constant 1.03 # multiple
376             { # Make sure any changes to these constants are also
377             # made at the end
378             # of the subroutine
379             # array indices within each item on the @stack:
380 11         21337 posi => 0, # position within $new_re where the current
381             # group’s contents start, or before the opening
382             # paren for interrobang groups
383             type => 1, # type of group; see constants below
384             xmod => 2, # whether /x mode is active
385             capn => 3, # array ref of capture numbers within this group
386              
387             # types of parens:
388             reg => 0, cap => 1, itrb => 2, brch => 3, cond => 4
389 11     11   11785 };
  11         304  
390              
391 373         473 my $new_re = '';
392 373         369 my $sub_pat;
393 373         888 my @stack = [0,0,$flags =~ /x/];
394 373         385 my $capture_num; # number of the most recently started capture
395             my @capture_nums; # numbers of the captures we’re inside
396             #my $warn;
397             #++$warn if $re eq '(?p{})';
398             {
399 373 100       434 @stack or die new JE::Object::Error::SyntaxError $global,
  1214         2239  
400             add_line_number "Unmatched ) in regexp";
401              
402             # no parens or char classes:
403 1213 50 33     15486 if( $stack[-1][xmod]
    100 66        
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    50          
404             ? $stack[-1][type] == cond || $stack[-1][type] == brch
405             ? $re =~ s/$plain_regexp_x_mode_wo_pipe//
406             : $re =~ s/$plain_regexp_x_mode//
407             : $stack[-1][type] == cond || $stack[-1][type] == brch
408             ? $re =~ s/$plain_regexp_wo_pipe//
409             : $re =~ s/$plain_regexp//
410             ) {
411 566         2895 ($sub_pat = $1) =~
412             s/
413             ([\^\$])
414             |
415             (\.|\\[bBvnrdDsSwW])
416             |
417             \\u([A-Fa-f0-9]{4})
418             |
419             \\([1-9][0-9]*)
420             |
421             \\?([\x{d800}-\x{dfff}])
422             |
423             (\\(?:[^c]|c.))
424             /
425 440 100       2983 defined $1
    100          
    100          
    100          
    100          
    100          
    100          
    100          
426             ? $1 eq '^'
427             ? $m
428             ? '(?:\A|(?<=[\cm\cj\x{2028}\x{2029}]))'
429             : '^'
430             : $m
431             ? '(?:\z|(?=[\cm\cj\x{2028}\x{2029}]))'
432             : '\z'
433             : defined $2 ? $_patterns{$2} :
434             defined $3 ? "\\x{$3}" :
435             defined $4 ? "(?(?{defined\$$4&&"
436             ."!\$EraseCapture[$4]})\\$4)" :
437             # work around a bug in perl:
438             defined $5 ? sprintf '\\x{%x}', ord $5 :
439             $6
440             /egxs;
441 566         848 $new_re .= $sub_pat;
442             }
443              
444             # char class:
445             elsif($re=~s/^\[([^]\\]*(?:\\.[^]\\]*)*)]//s){
446 114 100       418 if($1 eq '') {
    100          
447 2         4 $new_re .= '(?!)';
448             }
449             elsif($1 eq '^') {
450 11         29 $new_re .= '(?s:.)';
451             }
452             else {
453 101         115 my @full_classes;
454 101         575 ($sub_pat = $1) =~ s/
455             (\\[vnr])
456             |
457             (-?)(\\[dsw])(-?)
458             |
459             (\\[DSW])
460             |
461             \\u([A-Fa-f0-9]{4})
462             |
463             \\?([\x{d800}-\x{dfff}])
464             |
465             (\\(?:[^c]|c.))
466             /
467 90 50       596 defined $1 ? $_class_patterns{$1} :
    100          
    100          
    100          
    100          
    100          
    100          
468             defined $3 ?
469             ($2 ? '\-' : '')
470             .$_class_patterns{$3}
471             .($4 ? '\-' : '') :
472             defined $5 ? ((push @full_classes,
473             $_patterns{$5}),'') :
474             defined $6 ? "\\x{$6}" :
475             # work around a bug in perl:
476             defined $7 ? sprintf '\\x{%x}', ord $7 :
477             $8
478             /egxs;
479              
480 101 100       488 $new_re .= length $sub_pat
    100          
    100          
481             ? @full_classes
482             ? '(?:' .
483             join('|', @full_classes,
484             "[$sub_pat]")
485             . ')'
486             : "[$sub_pat]"
487             : @full_classes == 1
488             ? $full_classes[0]
489             : '(?:' . join('|', @full_classes) .
490             ')';
491             }
492             }
493              
494             # (?mods) construct (no colon) :
495             elsif( $stack[-1][xmod]
496             ? $re =~ s/^(\(\s*\?([\w]*)(?:-([\w]*))?\))//
497             : $re =~ s/^(\( \?([\w]*)(?:-([\w]*))?\))//x
498             ) {
499 1         3 $new_re .= $1;
500 1 50 33     19 defined $3 && index($3,'x')+1
      0        
501             ? $stack[-1][xmod]=0
502             : $2 =~ /x/ && ++$stack[-1][xmod];
503             }
504              
505             # start of grouping construct:
506             elsif( $stack[-1][xmod]
507             ? $re=~s/^(\((?:\s*\?([\w-]*:|[^:{?
508             : $re=~s/^(\((?: \?([\w-]*:|[^:{?
509             ) {
510             # warn "$new_re-$1-$2-$3-$re" if $warn;
511 261 50       925 $3 and die JE'Object'Error'SyntaxError->new(
512             $global, add_line_number
513             "Embedded code in regexps is not "
514             . "supported"
515             );
516 261         390 my $pos_b4_parn = length $new_re;
517 261         402 $new_re .= $1;
518 261         352 my $caq = $2; # char(s) after question mark
519 261         239 my @current;
520 261 100       439 if(defined $caq) { # (?...) patterns
521 62 100       265 if($caq eq '(') {
    50          
522 4         14 $re =~ s/^([^)]*\))//;
523 4         7 $new_re .= $1;
524 4 50       13 $1 =~ /^\?[?p]?\{/ && die
525             JE'Object'Error'SyntaxError->new(
526             $global, add_line_number
527             "Embedded code in regexps is not "
528             . "supported"
529             );
530 4         5 $current[type] = cond;
531             }
532             elsif($caq =~ /^[<'P](?![!=])/) {
533 0         0 ++$capture_num;
534 0 0       0 $caq eq "'" ? $re =~ s/^(.*?')//
535             : $re =~ s/^(.*?>)//;
536 0         0 $new_re .= $1;
537 0         0 $current[type] = reg;
538             }
539             else {
540 58         169 $current[type] = (reg,itrb)[$caq eq '!'];
541             }
542 62 100       191 $current[posi] = $caq eq '!' ? $pos_b4_parn
543             : length $new_re;
544             }else{ # capture
545 199         241 ++$capture_num;
546 199         272 push @capture_nums, $capture_num;
547 199         329 push @{$$_[capn]}, $capture_num for @stack;
  241         629  
548 199         434 $current[posi] = length $new_re;
549 199         258 $current[type] = cap;
550             }
551 261         341 $current[xmod] = $stack[-1][xmod];
552 261         453 push @stack, \@current;
553             }
554              
555             # closing paren:
556             elsif($re =~ s/^\)//) {
557 263         240 my @commands;
558 263         285 my $cur = $stack[-1];
559 263 100       464 if($$cur[type] != itrb) {
  26         44  
560 237 100       487 if($$cur[type] == cap) {
561             # we are exiting a capturing group
562 199         551 $new_re .= "(?{local" .
563             "\$EraseCapture[$capture_nums[-1]]=0"
564             ."})";
565 199         274 pop @capture_nums;
566             }
567 237 100 100     642 if($$cur[capn] && @{$$cur[capn]} &&
  29   100     202  
568             $re =~ /^[+{*?]/) { # quantified group
569 13         49 substr $new_re,$$cur[posi],0 =>=
570             _capture_erasure_stuff($$cur[capn])
571             . "(?:";
572 13         38 $new_re .= ")";
573             }
574 237         384 $new_re .= ')';
575             }
576             else {{ # ?!
577 26         32 $new_re .= ')';
578 26 100 66     118 last unless($$cur[capn] && @{$$cur[capn]});
  7         25  
579              
580             # change (?!...) to (?!...)(?{...})
581 7         22 $new_re .= _capture_erasure_stuff(
582             $$cur[capn]
583             );
584              
585             # wrap (?!)(?{}) in (?:) if necessary
586 7 100       35 $re =~ /^[+{*?]/ and
587             substr $new_re,$$cur[posi],0
588             =>= '(?:',
589             $new_re .= ')';
590             }}
591 263         382 pop @stack;
592             }
593              
594             # pipe within (?()|) or (?|) (the latter doesn’t work yet):
595             elsif($re =~ s/^\|//) {
596 2         3 my $cur = $stack[-1];
597 2 100 66     7 if($$cur[capn] && @{$$cur[capn]}
  1         4  
598             #&& $re =~ /^[+{*?]/ # We can’t actually tell
599             ) { # at this point whether the enclosing
600             # group is quantified. Does anyone have any ideas?
601 1         4 substr $new_re,$$cur[posi],0 =>=
602             _capture_erasure_stuff(
603             $$cur[capn]
604             );
605 1         2 @{$$cur[capn]} = ();
  1         2  
606             }
607 2         4 $new_re .= '|';
608 2         3 $$cur[posi] = length $new_re;
609             }
610              
611             # something invalid left over:
612             elsif($re) {
613             #warn $re;
614 0 0       0 die JE::Object::Error::SyntaxError->new($global,
615             add_line_number
616             $re =~ /^\[/
617             ? "Unterminated character class $re in regexp"
618             : 'Trailing \ in regexp');
619             }
620 1213 100       3271 length $re and redo;
621             }
622 372 100       954 @stack or die new JE::Object::Error::SyntaxError $global,
623             add_line_number "Unmatched ) in regexp";
624              
625 371 100       1168 aardvark_bug && $new_re =~ /\(\?=/
626             and substr $new_re,0,0, = '(??{""})';
627              
628             #warn $new_re;
629 371 100       467 $qr = eval {
630 11     11   87 use re 'eval'; no warnings 'regexp'; no strict;
  11     11   17  
  11     11   498  
  11         58  
  11         18  
  11         355  
  11         44  
  11         18  
  11         3041  
631              
632             # The warnings pragma doesn’t make it into the re-eval, so
633             # we have to localise $^W, in case the string contains
634             # @EraseCapture[1]=(1)x1 and someone is using -w.
635 371         999 local $^W;
636              
637             # We have to put (?:) around $new_re in the first case,
638             # because it may contain a top-level disjunction, but
639             # not in the second, because the array modifica-
640 371 100       24430 $capture_num # tions in $clear_captures are not localised.
641             ? qr/(?$flags:$clear_captures(?:$new_re)$save_captures)/
642             : qr/(?$flags:$clear_captures$new_re)/
643             } or $@ =~ s/\.?$ \n//x,
644             die JE::Object::Error::SyntaxError->new($global,
645             add_line_number $@);
646              
647             } # end of pattern processing
648              
649 369         4211 $$$self{value} = $qr;
650              
651 369         1727 $self->prop({
652             name => lastIndex =>
653             value => JE::Number->new($global, 0),
654             dontdel => 1,
655             dontenum => 1,
656             });
657              
658 369         2056 $self;
659             }
660             BEGIN {
661 11     11   63 no strict;
  11         41  
  11         560  
662 11     11   20 delete @{__PACKAGE__.'::'}{qw[posi type xmod capn reg cap itrb brch cond]}
  11         5881  
663             }
664              
665              
666              
667             =item value
668              
669             Returns a Perl C regular expression.
670              
671             If the regular expression
672             or the string that is being matched against it contains characters outside
673             the Basic Multilingual Plane (whose character codes exceed 0xffff), the
674             behavior is undefined--for now at least. I still need to solve the problem
675             caused by JS's unintuitive use of raw surrogates. (In JS, C will
676             match a
677             surrogate pair, which is considered to be one character in Perl. This means
678             that the same regexp matched against the same string will produce different
679             results in Perl and JS.)
680              
681             =cut
682              
683             sub value {
684 196     196 1 2900 $${$_[0]}{value};
  196         1123  
685             }
686              
687              
688              
689              
690             =item class
691              
692             Returns the string 'RegExp'.
693              
694             =cut
695              
696 346     346 1 1508 sub class { 'RegExp' }
697              
698              
699             sub call {
700 170     170 0 292 my ($self,$str) = @_;
701              
702 170 50       456 die JE::Object::Error::TypeError->new(
703             $self->global, add_line_number
704             "Argument to exec is not a " .
705             "RegExp object"
706             ) unless $self->class eq 'RegExp';
707              
708 170         277 my $je_str;
709 170 50       353 if (defined $str) {
710 170         536 $str =
711             ($je_str=$str->to_string)->value16;
712             }
713             else {
714 0         0 $str = 'undefined';
715             }
716              
717 170         274 my(@ary,$indx);
718 170         422 my $global = $$$self{global};
719              
720 170         536 my $g = $self->prop('global')->value;
721 170 50       415 if ($g) {
722 0         0 my $pos =
723             $self->prop('lastIndex')
724             ->to_number->value;
725 0 0 0     0 $pos < 0 || $pos > length $str
      0        
726             ||
727             (
728             pos $str = $pos,
729             $str !~ /$$$self{value}/g
730             )
731             and goto phail;
732              
733 0         0 @ary = @Match;
734 0         0 $ary[0] = substr($str, $-[0],
735             $+[0] - $-[0]);
736 0         0 $indx = $-[0];
737              
738 0         0 $self->prop(lastIndex =>
739             JE::Number->new(
740             $global,
741             pos $str
742             ));
743 0         0 $global->prototype_for('RegExp')
744             ->prop('constructor')
745             ->capture_re_vars($str);
746             }
747             else {
748 170 100       2139 $str =~ /$$$self{value}/
749             or goto phail;
750              
751 149         411 @ary = @Match;
752 149         1036 $ary[0] = substr($str, $-[0],
753             $+[0] - $-[0]);
754 149         402 $indx = $-[0];
755 149         637 $global->prototype_for('RegExp')
756             ->prop('constructor')
757             ->capture_re_vars($str);
758             }
759            
760 149         930 my $ary = JE::Object::Array->new(
761             $global,
762             \@ary
763             );
764 149         714 $ary->prop(index =>
765             JE::Number->new($global,$indx));
766 149 50       702 $ary->prop(input => defined $je_str
767             ? $je_str :
768             JE::String->_new(
769             $global, $str
770             ));
771            
772 149         853 return $ary;
773              
774 21         119 phail:
775             $self->prop(lastIndex =>
776             JE::Number->new(
777             $global,
778             0
779             ));
780 21         99 return $global->null;
781             }
782              
783 0     0 0 0 sub apply { splice @'_, 1, 1; goto &call }
  0         0  
784              
785             @JE::Object::Function::RegExpConstructor::ISA = 'JE::Object::Function';
786             sub JE::Object::Function::RegExpConstructor::capture_re_vars {
787 242     242   329 my $self = shift;
788 242         515 my $global = $$$self{global};
789 242         1815 $self->prop(
790             'lastMatch',
791             JE::String->new($global, substr $_[0], $-[0], $+[0]-$-[0])
792             );
793             {
794 11     11   64 no warnings 'uninitialized';
  11         23  
  11         1019  
  242         515  
795 242         1180 $self->prop('lastParen', new JE::String $global, "$+")
796             }
797             $self->prop(
798 242         1239 'leftContext',
799             new JE'String $global, substr $_[0], 0, $-[0]
800             );
801 242         1217 $self->prop('rightContext', new JE'String $global, substr $_[0], $+[0]);
802 11     11   53 no warnings 'uninitialized';
  11         22  
  11         8240  
803 242         1674 $self->prop("\$$_", new JE'String $global, "$Match[$_]") for 1..9;
804             }
805             sub new_constructor {
806 13     13 0 24 my($package,$global) = @_;
807             my $f = JE::Object::Function::RegExpConstructor->new({
808             name => 'RegExp',
809             scope => $global,
810             argnames => [qw/pattern flags/],
811             function => sub {
812 20     20   45 my (undef, $re, $flags) = @_;
813 20 0 0     69 if ($re->class eq 'RegExp' and !defined $flags
      33        
814             || $flags->id eq 'undef') {
815 0         0 return $re
816             }
817 20         63 unshift @_, __PACKAGE__;
818 20         67 goto &new;
819             },
820             function_args => ['scope','args'],
821             constructor => sub {
822 3     3   11 unshift @_, $package;
823 3         15 goto &new;
824             },
825 13         255 constructor_args => ['scope','args'],
826             });
827              
828 13         84 my $proto = $f->prop({
829             name => 'prototype',
830             dontenum => 1,
831             readonly => 1,
832             });
833 13         56 $global->prototype_for('RegExp', $proto);
834              
835             $f->prop({
836             name => '$&',
837             dontdel => 1,
838 3     3   17 fetch => sub { shift->prop('lastMatch') },
839 1     1   4 store => sub { shift->prop('lastMatch', shift) },
840 13         97 });
841             $f->prop({
842             name => '$`',
843             dontdel => 1,
844 4     4   19 fetch => sub { shift->prop('leftContext') },
845 2     2   11 store => sub { shift->prop('leftContext', shift) },
846 13         109 });
847             $f->prop({
848             name => '$\'',
849             dontdel => 1,
850 3     3   12 fetch => sub { shift->prop('rightContext') },
851 1     1   4 store => sub { shift->prop('rightContext', shift) },
852 13         102 });
853             $f->prop({
854             name => '$+',
855             dontdel => 1,
856 3     3   14 fetch => sub { shift->prop('lastParen') },
857 1     1   4 store => sub { shift->prop('lastParen', shift) },
858 13         86 });
859 13         90 my $empty = JE::String->new($global,"");
860 13         114 for(
861             qw(lastParen lastMatch leftContext rightContext),
862             map "\$$_", 1..9
863             ) {
864 169         466 $f->prop({ name => $_, dontdel => 1, value => $empty});
865             }
866            
867             $proto->prop({
868 13         136 name => 'exec',
869             value => JE::Object::Function->new({
870             scope => $global,
871             name => 'exec',
872             argnames => ['string'],
873             no_proto => 1,
874             function_args => ['this','args'],
875             function => \&call,
876             }),
877             dontenum => 1,
878             });
879              
880             $proto->prop({
881             name => 'test',
882             value => JE::Object::Function->new({
883             scope => $global,
884             name => 'test',
885             argnames => ['string'],
886             no_proto => 1,
887             function_args => ['this','args'],
888             function => sub {
889 14     14   30 my ($self,$str) = @_;
890 14 50       56 die JE::Object::Error::TypeError->new(
891             $global, add_line_number
892             "Argument to test is not a " .
893             "RegExp object"
894             ) unless $self->class eq 'RegExp';
895 14         41 my $ret = call($self,$str);
896 14         63 JE::Boolean->new(
897             $global, $ret->id ne 'null'
898             );
899             },
900 13         159 }),
901             dontenum => 1,
902             });
903              
904             $proto->prop({
905             name => 'toString',
906             value => JE::Object::Function->new({
907             scope => $global,
908             name => 'toString',
909             no_proto => 1,
910             function_args => ['this'],
911             function => sub {
912 1     1   2 my ($self,) = @_;
913 1 50       5 die JE::Object::Error::TypeError->new(
914             $global, add_line_number
915             "Argument to toString is not a " .
916             "RegExp object"
917             ) unless $self->class eq 'RegExp';
918 1         5 JE::String->_new(
919             $global,
920             "/" . $self->prop('source')->value
921             . "/$$$self{regexp_flags}"
922             );
923             },
924 13         123 }),
925             dontenum => 1,
926             });
927              
928              
929 13         134 $f;
930             }
931              
932              
933             =back
934              
935             =head1 SEE ALSO
936              
937             =over 4
938              
939             =item JE
940              
941             =item JE::Types
942              
943             =item JE::Object
944              
945             =back
946              
947             =cut
948              
949