File Coverage

blib/lib/JE/Object/RegExp.pm
Criterion Covered Total %
statement 216 240 90.0
branch 119 150 79.3
condition 20 51 39.2
subroutine 35 36 97.2
pod 3 6 50.0
total 393 483 81.3


line stmt bran cond sub pod time code
1             package JE::Object::RegExp;
2              
3             our $VERSION = '0.065';
4              
5              
6 11     11   5072 use strict;
  11         20  
  11         415  
7 11     11   48 use warnings; no warnings 'utf8';
  11     11   14  
  11         336  
  11         39  
  11         15  
  11         444  
8              
9 11         81 use overload fallback => 1,
10 11     11   46 '""'=> 'value';
  11         14  
11              
12             # This constant is true if we need to work around perl bug #122460 to keep
13             # the ‘aardvark’ tests (in t/15.05-string-objects.t) passing. This only
14             # applies to perl 5.20.0. (perl 5.20.1 includes a fix.) Basic-
15             # ally, (?=...) can result in buggy optimisations that cause a faulty
16             # rejection of the match at some locations, because it is assumed that it
17             # cannot match in some spots.
18             use constant aardvark_bug =>
19             # This test should match the empty string. If it advances (pos returns
20             # true), then we have the bug.
21 11     11   1435 do { my $a = "rdvark"; $a =~ /(?{})(?=.)a*?/g; pos $a };
  11         18  
  11         14  
  11         20  
  11         93  
  11         765  
22              
23 11     11   52 use Scalar::Util 'blessed';
  11         19  
  11         2513  
24              
25             our @ISA = 'JE::Object';
26              
27             require JE::Boolean;
28             require JE::Code;
29             require JE::Object;
30             require JE::String;
31              
32             import JE::Code 'add_line_number';
33             sub add_line_number;
34              
35             our @Match;
36             our @EraseCapture;
37              
38             #import JE::String 'desurrogify';
39             #sub desurrogify($);
40             # Only need to turn these on when Perl starts adding regexp modifiers
41             # outside the BMP.
42              
43             # JS regexp features that Perl doesn't have, or which differ from Perl's,
44             # along with their Perl equivalents
45             # ^ with /m \A|(?<=[\cm\cj\x{2028}\x{2029}]) (^ with the /m modifier
46             # matches whenever a Unicode
47             # line break (not just \n)
48             # precedes the current position,
49             # even at the end of the string. In
50             # Perl, /^/m matches \A|(?<=\n)(?!\z) .)
51             # $ \z
52             # $ with /m (?:\z|(?=[\cm\cj\x{2028}\x{2029}]))
53             # \b (?:(?<=$w)(?!$w)|(?
54             # \B (?:(?<=$w)(?=$w)|(?
55             # doesn't include non-ASCII
56             # word chars in \w)
57             # . [^\cm\cj\x{2028}\x{2029}]
58             # \v \cK
59             # \n \cj (whether \n matches \cj in Perl is system-dependent)
60             # \r \cm
61             # \uHHHH \x{HHHH}
62             # \d [0-9]
63             # \D [^0-9]
64             # \s [\p{Zs}\s\ck]
65             # \S [^\p{Zs}\s\ck]
66             # \w [A-Za-z0-9_]
67             # \W [^A-Za-z0-9_]
68             # [^] (?s:.)
69             # [] (?!)
70              
71             # Other differences
72             #
73             # A quantifier in a JS regexp will, when repeated, clear all values cap-
74             # tured by capturing parentheses in the term that it quantifies. This means
75             # that /((a)?b)+/, when matched against "abb" will leave $2 undefined, even
76             # though the second () matched "a" the first time the first () matched.
77             # (The ECMAScript spec says to do it this way, but Safari leaves $2 with
78             # "a" in it and doesn't clear it on the second iteration of the '+'.) Perl
79             # does it both ways, and the rules aren't quite clear to me:
80             #
81             # $ perl5.8.8 -le '$, = ",";print "abb" =~ /((a)?b)+/;'
82             # b,
83             # $ perl5.8.8 -le '$, = ",";print "abb" =~ /((a+)?b)+/;'
84             # b,a
85             #
86             # perl5.9.4 produces the same. perl5.002_01 crashes quite nicely.
87             #
88             #
89             # In ECMAScript, when the pattern inside a (?! ... ) fails (in which case
90             # the (?!) succeeds), values captured by parentheses within the negative
91             # lookahead are cleared, such that subsequent backreferences *outside* the
92             # lookahead are equivalent to (?:) (zero-width always-match assertion). In
93             # Perl, the captured values are left as they are when the pattern inside
94             # the lookahead fails:
95             #
96             # $ perl5.8.8 -le 'print "a" =~ /(?!(a)b)a/;'
97             # a
98             # $ perl5.9.4 -le 'print "a" =~ /(?!(a)b)a/;'
99             # a
100             #
101             #
102             # In ECMAScript, as in Perl, a pair of capturing parentheses will produce
103             # the undefined value if the parens were not part of the final match.
104             # Undefined will still be produced if there is a \digit backreference
105             # reference to those parens. In ECMAScript, such a back-reference is equiv-
106             # alent to (?:); in Perl it is equivalent to (?!). Therefore, ECMAScript’s
107             # \1 is equivalent to Perl’s (?(1)\1). (It would seem, upon testing
108             # /(?:|())/ vs. /(?:|())\1/ in perl, that the \1 back-reference always suc-
109             # ceeds, and ends up setting $1 to "" [as opposed to undef]. What is actu-
110             # ally happening is that the failed \1 causes backtracking, so the second
111             # alternative in (?:|()) matches, setting $1 to the empty string. Safari,
112             # incidentally, does what Perl *appears* to do at first glance, *if* the
113             # backreference itself is within capturing parentheses (as in
114             # /(?:|())(\1)/).
115             #
116             # These issues are solved with embedded code snippets, as explained below,
117             # where the actual code is.
118             #
119             #
120             # In ECMAScript, case-folding inside the regular expression engine is not
121             # allowed to change the length of a string. Therefore, "ß" never matches
122             # /ss/i, and vice versa. I’m disinclined to be ECMAScript compliant in this
123             # regard though, because it would affect performance. The inefficient solu-
124             # tion I have in mind is to change /x/i to /(?-i:x)/ for every character
125             # that has a multi-character uppercase equivalent; and to change /xx/i to
126             # /(?-i:[Xx][Xx])/ where xx represents a multi-character sequence that
127             # could match a single character in Perl. The latter is the main problem.
128             # How are we to find out which character sequences need this? We could
129             # change /x/i to /[xX]/ for every literal character in the string, but how
130             # would we take /Σ/ -> /[Σσς]/ into account? And does perl’s regexp engine
131             # slow down if we feed it a ton of character classes instead of literal
132             # text? (Need to do some benchmarks.) (If we do fix this, we need to re-
133             # enable the skipped tests.)
134              
135              
136              
137             =head1 NAME
138              
139             JE::Object::RegExp - JavaScript regular expression (RegExp object) class
140              
141             =head1 SYNOPSIS
142              
143             use JE;
144             use JE::Object::RegExp;
145              
146             $j = new JE;
147              
148             $js_regexp = new JE::Object::RegExp $j, "(.*)", 'ims';
149              
150             $perl_qr = $js_regexp->value;
151              
152             $some_string =~ $js_regexp; # You can use it as a qr//
153              
154             =head1 DESCRIPTION
155              
156             This class implements JavaScript regular expressions for JE.
157              
158             See L for a description of most of the interface. Only what
159             is specific to JE::Object::RegExp is explained here.
160              
161             A RegExp object will stringify the same way as a C, so that you can
162             use C<=~> on it. This is different from the return value of the
163             C method (the way it stringifies in JS).
164              
165             Since JE's regular expressions use Perl's engine underneath, the
166             features that Perl provides that are not part of the ECMAScript spec are
167             supported, except for C<(?s)>
168             and C<(?m)>, which don't do anything, and C<(?|...)>, which is
169             unpredictable.
170              
171             In versions prior to 0.042, a hyphen adjacent to C<\d>, C<\s> or C<\w> in a
172             character class would be unpredictable (sometimes a syntax error). Now it
173             is interpreted literally. This matches what most implementations do, which
174             happens to be the same as Perl's behaviour. (It is a syntax error
175             in ECMAScript.)
176              
177             =head1 METHODS
178              
179             =over 4
180              
181             =cut
182              
183             # ~~~ How should surrogates work??? To make regexps work with JS strings
184             # properly, we need to use the surrogified string so that /../ will
185             # correctly match two surrogates. In this case it won't work properly
186             # with Perl strings, so what is the point of Perl-style stringification?
187             # Perhaps we should allow this anyway, but warn about code points outside
188             # the BMP in the documentation. (Should we also produce a Perl warning?
189             # Though I'm not that it's possible to catch this: "\x{10000}" =~ $re).
190             #
191             # But it would be nice if this would work:
192             # $j->eval("'\x{10000}'") =~ $j->eval('/../')
193             # ~~~ We might be able to make this work with perl 5.12’s qr overloading.
194              
195             our %_patterns = qw/
196             \b (?:(?<=[A-Za-z0-9_])(?![A-Za-z0-9_])|(?
197             \B (?:(?<=[A-Za-z0-9_])(?=[A-Za-z0-9_])|(?
198             . [^\cm\cj\x{2028}\x{2029}]
199             \v \cK
200             \n \cj
201             \r \cm
202             \d [0-9]
203             \D [^0-9]
204             \s [\p{Zs}\s\ck]
205             \S [^\p{Zs}\s\ck]
206             \w [A-Za-z0-9_]
207             \W [^A-Za-z0-9_]
208             /;
209              
210             our %_class_patterns = qw/
211             \v \cK
212             \n \cj
213             \r \cm
214             \d 0-9
215             \s \p{Zs}\s\ck
216             \w A-Za-z0-9_
217             /;
218              
219             my $clear_captures = qr/(?{@Match=@EraseCapture=()})/;
220 11     11   53 my $save_captures = do { no strict 'refs';
  11         15  
  11         6592  
221             qr/(?{$Match[$_]=$EraseCapture[$_]?undef:$$_ for 1..$#+})/; };
222             # These are pretty scary, aren’t they?
223             my $plain_regexp =
224             qr/^((?:[^\\[()]|\\.|\((?:\?#|\*)[^)]*\))[^\\[()]*(?:(?:\\.|\((?:\?#|\*)[^)]*\))[^\\[()]*)*)/s;
225             my $plain_regexp_x_mode =
226             qr/^((?:[^\\[()]|\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()]*(?:(?:\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()]*)*)/s;
227             my $plain_regexp_wo_pipe =
228             qr/^((?:[^\\[()|]|\\.|\((?:\?#|\*)[^)]*\))[^\\[()|]*(?:(?:\\.|\((?:\?#|\*)[^)]*\))[^\\[()|]*)*)/s;
229             my $plain_regexp_x_mode_wo_pipe =
230             qr/^((?:[^\\[()|]|\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()|]*(?:(?:\\.|\(\s*(?:\?#|\*)[^)]*\))[^\\[()|]*)*)/s;
231              
232             sub _capture_erasure_stuff {
233 21         61 "(?{local\@EraseCapture[" . join(',',@{$_[0]}) . "]=(1)x"
  21         85  
234 21     21   27 . @{$_[0]} . '})'
235             }
236              
237             sub new {
238 375     375 1 963 my ($class, $global, $re, $flags) = @_;
239 375   33     1274 my $self = $class->SUPER::new($global, {
240             prototype => $global->prototype_for('RegExp')
241             || $global->prop('RegExp')->prop('prototype')
242             });
243              
244 375         750 my $qr;
245              
246 375 100       914 if(defined blessed $re) {
247 56 50 66     521 if ($re->isa(__PACKAGE__)) {
    100          
    50          
248 0 0 0     0 defined $flags && eval{$flags->id} ne 'undef' and
  0         0  
249             die JE::Object::Error::TypeError->new(
250             $global, add_line_number
251             'Second argument to ' .
252             'RegExp() must be undefined if ' .
253             'first arg is a RegExp');
254 0         0 $flags = $$$re{regexp_flags};
255 0         0 $qr = $$$re{value};
256 0         0 $re = $re->prop('source')->[0];
257             }
258             elsif(can $re 'id' and $re->id eq 'undef') {
259 2         5 $re = '';
260             }
261             elsif(can $re 'to_string') {
262 54         119 $re = $re->to_string->value16;
263             }
264             }
265             else {
266 319 100       634 defined $re or $re = '';
267             }
268              
269 375 50       722 if(defined blessed $flags) {
270 0 0 0     0 if(can $flags 'id' and $flags->id eq 'undef') {
    0          
271 0         0 $flags = '';
272             }
273             elsif(can $flags 'to_string') {
274 0         0 $flags = $flags->to_string->value;
275             }
276             }
277             else {
278 375 100       684 defined $flags or $flags = '';
279             }
280              
281              
282             # Let's begin by processing the flags:
283              
284             # Save the flags before we start mangling them
285 375         1263 $$$self{regexp_flags} = $flags;
286              
287 375         1446 $self->prop({
288             name => global =>
289             value => JE::Boolean->new($global, $flags =~ y/g//d),
290             dontenum => 1,
291             readonly => 1,
292             dontdel => 1,
293             });
294              
295             # $flags = desurrogify $flags;
296             # Not necessary, until Perl adds a /𐐢 modifier (not likely)
297              
298             # I'm not supporting /s (at least not for now)
299 11     11   61 no warnings 'syntax'; # so syntax errors in the eval are kept quiet
  11         18  
  11         501  
300 375 100 66     25911 $flags =~ /^((?:(?!s)[\$_\p{ID_Continue}])*)\z/ and eval "qr//$1"
301             or die new JE::Object::Error::SyntaxError $global,
302             add_line_number "Invalid regexp modifiers: '$flags'";
303              
304 373         1322 my $m = $flags =~ /m/;
305 373         1346 $self->prop({
306             name => ignoreCase =>
307             value => JE::Boolean->new($global, $flags =~ /i/),
308             dontenum => 1,
309             readonly => 1,
310             dontdel => 1,
311             });
312 373         1222 $self->prop({
313             name => multiline =>
314             value => JE::Boolean->new($global, $m),
315             dontenum => 1,
316             readonly => 1,
317             dontdel => 1,
318             });
319              
320              
321             # Now we'll deal with the pattern itself.
322              
323             # Save it before we go and mangle it
324             $self->prop({
325             name => source =>
326             # ~~~ Can we use ->_new here?
327 373         616 value => JE::String->new($global, do {
328 373         1568 (my $tmp = $re) =~
329             s<(\\.)|/>
330 376 100       1179 egg;
331 373         1316 $tmp
332             }),
333             dontenum => 1,
334             readonly => 1,
335             dontdel => 1,
336             });
337              
338 373 50       949 unless (defined $qr) { # processing begins here
339              
340             # This horrific piece of code converts an ECMAScript regular
341             # expression into a Perl one, more or less.
342              
343             # Since Perl sometimes fills in $1, etc., where they are supposed
344             # to be undefined in ECMAScript, we use embedded code snippets to
345             # put the values into @Match[1..whatever] instead.
346              
347             # The cases we have to take into account are
348             # 1) quantified captures; i.e., (...)+ or (?:()?)+ ; and
349             # 2) captures within interrobang groups: (?!())
350              
351             # The solution is to mark captures as erasure candidates with the
352             # @EraseCapture array.
353              
354             # To solve case 1, we have to put (?{}) markers at the begin-
355             # ning of each grouping construct that has captures in it,
356             # and a quantifier within each pair of capturing parenthe-
357             # ses before the closing paren. (?:(a+)?b)+ will become
358             # (?: (?{...}) ( a+ (?{...}) )? b )+ (spaced out for reada-
359             # bility). The first code interpolation sets $EraseCapture[n]
360             # to 1 for all the captures within that group. The sec-
361             # ond code interpolation will only be triggered if the a+
362             # matches, and there we set $EraseCapture[n] to 0. It’s actu-
363             # ally slightly more complicated than that, because we may
364             # have alternatives directly inside the outer grouping; e.g.,
365             # (?:a|(b))+, so we have to wrap the contents thereof within
366             # (?:), making ‘(?:(?{...})(?:a|(b(?{...}))))+’. Whew!
367              
368             # For case 2 we change (?!...) to (?:(?!...)(?{...})). The embedded
369             # code marks the captures inside (?!) for erasure. The (?: is
370             # needed because the (?!) might be quantified. (We used not to add
371             # the extra (?:), but put the (?{}) at the end of the innermost
372             # enclosing group, but that causes the same \1 problem men-
373             # tioned above.
374              
375             use constant 1.03 # multiple
376             { # Make sure any changes to these constants are also
377             # made at the end
378             # of the subroutine
379             # array indices within each item on the @stack:
380 11         19728 posi => 0, # position within $new_re where the current
381             # group’s contents start, or before the opening
382             # paren for interrobang groups
383             type => 1, # type of group; see constants below
384             xmod => 2, # whether /x mode is active
385             capn => 3, # array ref of capture numbers within this group
386              
387             # types of parens:
388             reg => 0, cap => 1, itrb => 2, brch => 3, cond => 4
389 11     11   10212 };
  11         289  
390              
391 373         408 my $new_re = '';
392 373         308 my $sub_pat;
393 373         861 my @stack = [0,0,$flags =~ /x/];
394 373         406 my $capture_num; # number of the most recently started capture
395             my @capture_nums; # numbers of the captures we’re inside
396             #my $warn;
397             #++$warn if $re eq '(?p{})';
398             {
399 373 100       319 @stack or die new JE::Object::Error::SyntaxError $global,
  1214         1875  
400             add_line_number "Unmatched ) in regexp";
401              
402             # no parens or char classes:
403 1213 50 33     13386 if( $stack[-1][xmod]
    100 66        
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    50          
404             ? $stack[-1][type] == cond || $stack[-1][type] == brch
405             ? $re =~ s/$plain_regexp_x_mode_wo_pipe//
406             : $re =~ s/$plain_regexp_x_mode//
407             : $stack[-1][type] == cond || $stack[-1][type] == brch
408             ? $re =~ s/$plain_regexp_wo_pipe//
409             : $re =~ s/$plain_regexp//
410             ) {
411 566         2486 ($sub_pat = $1) =~
412             s/
413             ([\^\$])
414             |
415             (\.|\\[bBvnrdDsSwW])
416             |
417             \\u([A-Fa-f0-9]{4})
418             |
419             \\([1-9][0-9]*)
420             |
421             \\?([\x{d800}-\x{dfff}])
422             |
423             (\\(?:[^c]|c.))
424             /
425 440 100       2504 defined $1
    100          
    100          
    100          
    100          
    100          
    100          
    100          
426             ? $1 eq '^'
427             ? $m
428             ? '(?:\A|(?<=[\cm\cj\x{2028}\x{2029}]))'
429             : '^'
430             : $m
431             ? '(?:\z|(?=[\cm\cj\x{2028}\x{2029}]))'
432             : '\z'
433             : defined $2 ? $_patterns{$2} :
434             defined $3 ? "\\x{$3}" :
435             defined $4 ? "(?(?{defined\$$4&&"
436             ."!\$EraseCapture[$4]})\\$4)" :
437             # work around a bug in perl:
438             defined $5 ? sprintf '\\x{%x}', ord $5 :
439             $6
440             /egxs;
441 566         786 $new_re .= $sub_pat;
442             }
443              
444             # char class:
445             elsif($re=~s/^\[([^]\\]*(?:\\.[^]\\]*)*)]//s){
446 114 100       371 if($1 eq '') {
    100          
447 2         6 $new_re .= '(?!)';
448             }
449             elsif($1 eq '^') {
450 11         18 $new_re .= '(?s:.)';
451             }
452             else {
453 101         110 my @full_classes;
454 101         478 ($sub_pat = $1) =~ s/
455             (\\[vnr])
456             |
457             (-?)(\\[dsw])(-?)
458             |
459             (\\[DSW])
460             |
461             \\u([A-Fa-f0-9]{4})
462             |
463             \\?([\x{d800}-\x{dfff}])
464             |
465             (\\(?:[^c]|c.))
466             /
467 90 50       543 defined $1 ? $_class_patterns{$1} :
    100          
    100          
    100          
    100          
    100          
    100          
468             defined $3 ?
469             ($2 ? '\-' : '')
470             .$_class_patterns{$3}
471             .($4 ? '\-' : '') :
472             defined $5 ? ((push @full_classes,
473             $_patterns{$5}),'') :
474             defined $6 ? "\\x{$6}" :
475             # work around a bug in perl:
476             defined $7 ? sprintf '\\x{%x}', ord $7 :
477             $8
478             /egxs;
479              
480 101 100       445 $new_re .= length $sub_pat
    100          
    100          
481             ? @full_classes
482             ? '(?:' .
483             join('|', @full_classes,
484             "[$sub_pat]")
485             . ')'
486             : "[$sub_pat]"
487             : @full_classes == 1
488             ? $full_classes[0]
489             : '(?:' . join('|', @full_classes) .
490             ')';
491             }
492             }
493              
494             # (?mods) construct (no colon) :
495             elsif( $stack[-1][xmod]
496             ? $re =~ s/^(\(\s*\?([\w]*)(?:-([\w]*))?\))//
497             : $re =~ s/^(\( \?([\w]*)(?:-([\w]*))?\))//x
498             ) {
499 1         3 $new_re .= $1;
500 1 50 33     12 defined $3 && index($3,'x')+1
      0        
501             ? $stack[-1][xmod]=0
502             : $2 =~ /x/ && ++$stack[-1][xmod];
503             }
504              
505             # start of grouping construct:
506             elsif( $stack[-1][xmod]
507             ? $re=~s/^(\((?:\s*\?([\w-]*:|[^:{?
508             : $re=~s/^(\((?: \?([\w-]*:|[^:{?
509             ) {
510             # warn "$new_re-$1-$2-$3-$re" if $warn;
511 261 50       564 $3 and die JE'Object'Error'SyntaxError->new(
512             $global, add_line_number
513             "Embedded code in regexps is not "
514             . "supported"
515             );
516 261         360 my $pos_b4_parn = length $new_re;
517 261         356 $new_re .= $1;
518 261         314 my $caq = $2; # char(s) after question mark
519 261         199 my @current;
520 261 100       338 if(defined $caq) { # (?...) patterns
521 62 100       198 if($caq eq '(') {
    50          
522 4         15 $re =~ s/^([^)]*\))//;
523 4         5 $new_re .= $1;
524 4 50       11 $1 =~ /^\?[?p]?\{/ && die
525             JE'Object'Error'SyntaxError->new(
526             $global, add_line_number
527             "Embedded code in regexps is not "
528             . "supported"
529             );
530 4         10 $current[type] = cond;
531             }
532             elsif($caq =~ /^[<'P](?![!=])/) {
533 0         0 ++$capture_num;
534 0 0       0 $caq eq "'" ? $re =~ s/^(.*?')//
535             : $re =~ s/^(.*?>)//;
536 0         0 $new_re .= $1;
537 0         0 $current[type] = reg;
538             }
539             else {
540 58         155 $current[type] = (reg,itrb)[$caq eq '!'];
541             }
542 62 100       144 $current[posi] = $caq eq '!' ? $pos_b4_parn
543             : length $new_re;
544             }else{ # capture
545 199         212 ++$capture_num;
546 199         217 push @capture_nums, $capture_num;
547 199         294 push @{$$_[capn]}, $capture_num for @stack;
  241         539  
548 199         328 $current[posi] = length $new_re;
549 199         227 $current[type] = cap;
550             }
551 261         332 $current[xmod] = $stack[-1][xmod];
552 261         417 push @stack, \@current;
553             }
554              
555             # closing paren:
556             elsif($re =~ s/^\)//) {
557 263         213 my @commands;
558 263         281 my $cur = $stack[-1];
559 263 100       422 if($$cur[type] != itrb) {
  26         38  
560 237 100       406 if($$cur[type] == cap) {
561             # we are exiting a capturing group
562 199         461 $new_re .= "(?{local" .
563             "\$EraseCapture[$capture_nums[-1]]=0"
564             ."})";
565 199         216 pop @capture_nums;
566             }
567 237 100 100     526 if($$cur[capn] && @{$$cur[capn]} &&
  29   100     167  
568             $re =~ /^[+{*?]/) { # quantified group
569 13         33 substr $new_re,$$cur[posi],0 =>=
570             _capture_erasure_stuff($$cur[capn])
571             . "(?:";
572 13         29 $new_re .= ")";
573             }
574 237         297 $new_re .= ')';
575             }
576             else {{ # ?!
577 26         34 $new_re .= ')';
578 26 100 66     95 last unless($$cur[capn] && @{$$cur[capn]});
  7         21  
579              
580             # change (?!...) to (?!...)(?{...})
581 7         17 $new_re .= _capture_erasure_stuff(
582             $$cur[capn]
583             );
584              
585             # wrap (?!)(?{}) in (?:) if necessary
586 7 100       30 $re =~ /^[+{*?]/ and
587             substr $new_re,$$cur[posi],0
588             =>= '(?:',
589             $new_re .= ')';
590             }}
591 263         345 pop @stack;
592             }
593              
594             # pipe within (?()|) or (?|) (the latter doesn’t work yet):
595             elsif($re =~ s/^\|//) {
596 2         4 my $cur = $stack[-1];
597 2 100 66     8 if($$cur[capn] && @{$$cur[capn]}
  1         4  
598             #&& $re =~ /^[+{*?]/ # We can’t actually tell
599             ) { # at this point whether the enclosing
600             # group is quantified. Does anyone have any ideas?
601 1         4 substr $new_re,$$cur[posi],0 =>=
602             _capture_erasure_stuff(
603             $$cur[capn]
604             );
605 1         2 @{$$cur[capn]} = ();
  1         3  
606             }
607 2         3 $new_re .= '|';
608 2         4 $$cur[posi] = length $new_re;
609             }
610              
611             # something invalid left over:
612             elsif($re) {
613             #warn $re;
614 0 0       0 die JE::Object::Error::SyntaxError->new($global,
615             add_line_number
616             $re =~ /^\[/
617             ? "Unterminated character class $re in regexp"
618             : 'Trailing \ in regexp');
619             }
620 1213 100       2746 length $re and redo;
621             }
622 372 100       702 @stack or die new JE::Object::Error::SyntaxError $global,
623             add_line_number "Unmatched ) in regexp";
624              
625 371 100       923 aardvark_bug && $new_re =~ /\(\?=/
626             and substr $new_re,0,0, = '(??{""})';
627              
628             #warn $new_re;
629 371 100       469 $qr = eval {
630 11     11   73 use re 'eval'; no warnings 'regexp'; no strict;
  11     11   17  
  11     11   435  
  11         42  
  11         16  
  11         339  
  11         47  
  11         11  
  11         2733  
631              
632             # The warnings pragma doesn’t make it into the re-eval, so
633             # we have to localise $^W, in case the string contains
634             # @EraseCapture[1]=(1)x1 and someone is using -w.
635 371         912 local $^W;
636              
637             # We have to put (?:) around $new_re in the first case,
638             # because it may contain a top-level disjunction, but
639             # not in the second, because the array modifica-
640 371 100       23085 $capture_num # tions in $clear_captures are not localised.
641             ? qr/(?$flags:$clear_captures(?:$new_re)$save_captures)/
642             : qr/(?$flags:$clear_captures$new_re)/
643             } or $@ =~ s/\.?$ \n//x,
644             die JE::Object::Error::SyntaxError->new($global,
645             add_line_number $@);
646              
647             } # end of pattern processing
648              
649 369         3992 $$$self{value} = $qr;
650              
651 369         1527 $self->prop({
652             name => lastIndex =>
653             value => JE::Number->new($global, 0),
654             dontdel => 1,
655             dontenum => 1,
656             });
657              
658 369         1940 $self;
659             }
660             BEGIN {
661 11     11   60 no strict;
  11         16  
  11         491  
662 11     11   20 delete @{__PACKAGE__.'::'}{qw[posi type xmod capn reg cap itrb brch cond]}
  11         5239  
663             }
664              
665              
666              
667             =item value
668              
669             Returns a Perl C regular expression.
670              
671             If the regular expression
672             or the string that is being matched against it contains characters outside
673             the Basic Multilingual Plane (whose character codes exceed 0xffff), the
674             behavior is undefined--for now at least. I still need to solve the problem
675             caused by JS's unintuitive use of raw surrogates. (In JS, C will
676             match a
677             surrogate pair, which is considered to be one character in Perl. This means
678             that the same regexp matched against the same string will produce different
679             results in Perl and JS.)
680              
681             =cut
682              
683             sub value {
684 196     196 1 3027 $${$_[0]}{value};
  196         939  
685             }
686              
687              
688              
689              
690             =item class
691              
692             Returns the string 'RegExp'.
693              
694             =cut
695              
696 346     346 1 1163 sub class { 'RegExp' }
697              
698              
699             sub call {
700 170     170 0 194 my ($self,$str) = @_;
701              
702 170 50       288 die JE::Object::Error::TypeError->new(
703             $self->global, add_line_number
704             "Argument to exec is not a " .
705             "RegExp object"
706             ) unless $self->class eq 'RegExp';
707              
708 170         167 my $je_str;
709 170 50       340 if (defined $str) {
710 170         367 $str =
711             ($je_str=$str->to_string)->value16;
712             }
713             else {
714 0         0 $str = 'undefined';
715             }
716              
717 170         191 my(@ary,$indx);
718 170         270 my $global = $$$self{global};
719              
720 170         356 my $g = $self->prop('global')->value;
721 170 50       288 if ($g) {
722 0         0 my $pos =
723             $self->prop('lastIndex')
724             ->to_number->value;
725 0 0 0     0 $pos < 0 || $pos > length $str
      0        
726             ||
727             (
728             pos $str = $pos,
729             $str !~ /$$$self{value}/g
730             )
731             and goto phail;
732              
733 0         0 @ary = @Match;
734 0         0 $ary[0] = substr($str, $-[0],
735             $+[0] - $-[0]);
736 0         0 $indx = $-[0];
737              
738 0         0 $self->prop(lastIndex =>
739             JE::Number->new(
740             $global,
741             pos $str
742             ));
743 0         0 $global->prototype_for('RegExp')
744             ->prop('constructor')
745             ->capture_re_vars($str);
746             }
747             else {
748 170 100       1519 $str =~ /$$$self{value}/
749             or goto phail;
750              
751 149         284 @ary = @Match;
752 149         747 $ary[0] = substr($str, $-[0],
753             $+[0] - $-[0]);
754 149         302 $indx = $-[0];
755 149         471 $global->prototype_for('RegExp')
756             ->prop('constructor')
757             ->capture_re_vars($str);
758             }
759            
760 149         690 my $ary = JE::Object::Array->new(
761             $global,
762             \@ary
763             );
764 149         540 $ary->prop(index =>
765             JE::Number->new($global,$indx));
766 149 50       471 $ary->prop(input => defined $je_str
767             ? $je_str :
768             JE::String->_new(
769             $global, $str
770             ));
771            
772 149         619 return $ary;
773              
774 21         87 phail:
775             $self->prop(lastIndex =>
776             JE::Number->new(
777             $global,
778             0
779             ));
780 21         62 return $global->null;
781             }
782              
783 0     0 0 0 sub apply { splice @'_, 1, 1; goto &call }
  0         0  
784              
785             @JE::Object::Function::RegExpConstructor::ISA = 'JE::Object::Function';
786             sub JE::Object::Function::RegExpConstructor::capture_re_vars {
787 242     242   262 my $self = shift;
788 242         367 my $global = $$$self{global};
789 242         1342 $self->prop(
790             'lastMatch',
791             JE::String->new($global, substr $_[0], $-[0], $+[0]-$-[0])
792             );
793             {
794 11     11   64 no warnings 'uninitialized';
  11         17  
  11         956  
  242         396  
795 242         889 $self->prop('lastParen', new JE::String $global, "$+")
796             }
797             $self->prop(
798 242         938 'leftContext',
799             new JE'String $global, substr $_[0], 0, $-[0]
800             );
801 242         914 $self->prop('rightContext', new JE'String $global, substr $_[0], $+[0]);
802 11     11   47 no warnings 'uninitialized';
  11         20  
  11         7693  
803 242         1197 $self->prop("\$$_", new JE'String $global, "$Match[$_]") for 1..9;
804             }
805             sub new_constructor {
806 13     13 0 21 my($package,$global) = @_;
807             my $f = JE::Object::Function::RegExpConstructor->new({
808             name => 'RegExp',
809             scope => $global,
810             argnames => [qw/pattern flags/],
811             function => sub {
812 20     20   47 my (undef, $re, $flags) = @_;
813 20 0 0     61 if ($re->class eq 'RegExp' and !defined $flags
      33        
814             || $flags->id eq 'undef') {
815 0         0 return $re
816             }
817 20         53 unshift @_, __PACKAGE__;
818 20         56 goto &new;
819             },
820             function_args => ['scope','args'],
821             constructor => sub {
822 3     3   10 unshift @_, $package;
823 3         10 goto &new;
824             },
825 13         235 constructor_args => ['scope','args'],
826             });
827              
828 13         73 my $proto = $f->prop({
829             name => 'prototype',
830             dontenum => 1,
831             readonly => 1,
832             });
833 13         58 $global->prototype_for('RegExp', $proto);
834              
835             $f->prop({
836             name => '$&',
837             dontdel => 1,
838 3     3   11 fetch => sub { shift->prop('lastMatch') },
839 1     1   4 store => sub { shift->prop('lastMatch', shift) },
840 13         108 });
841             $f->prop({
842             name => '$`',
843             dontdel => 1,
844 4     4   18 fetch => sub { shift->prop('leftContext') },
845 2     2   6 store => sub { shift->prop('leftContext', shift) },
846 13         98 });
847             $f->prop({
848             name => '$\'',
849             dontdel => 1,
850 3     3   11 fetch => sub { shift->prop('rightContext') },
851 1     1   6 store => sub { shift->prop('rightContext', shift) },
852 13         85 });
853             $f->prop({
854             name => '$+',
855             dontdel => 1,
856 3     3   13 fetch => sub { shift->prop('lastParen') },
857 1     1   5 store => sub { shift->prop('lastParen', shift) },
858 13         80 });
859 13         85 my $empty = JE::String->new($global,"");
860 13         106 for(
861             qw(lastParen lastMatch leftContext rightContext),
862             map "\$$_", 1..9
863             ) {
864 169         421 $f->prop({ name => $_, dontdel => 1, value => $empty});
865             }
866            
867             $proto->prop({
868 13         119 name => 'exec',
869             value => JE::Object::Function->new({
870             scope => $global,
871             name => 'exec',
872             argnames => ['string'],
873             no_proto => 1,
874             function_args => ['this','args'],
875             function => \&call,
876             }),
877             dontenum => 1,
878             });
879              
880             $proto->prop({
881             name => 'test',
882             value => JE::Object::Function->new({
883             scope => $global,
884             name => 'test',
885             argnames => ['string'],
886             no_proto => 1,
887             function_args => ['this','args'],
888             function => sub {
889 14     14   21 my ($self,$str) = @_;
890 14 50       37 die JE::Object::Error::TypeError->new(
891             $global, add_line_number
892             "Argument to test is not a " .
893             "RegExp object"
894             ) unless $self->class eq 'RegExp';
895 14         30 my $ret = call($self,$str);
896 14         36 JE::Boolean->new(
897             $global, $ret->id ne 'null'
898             );
899             },
900 13         128 }),
901             dontenum => 1,
902             });
903              
904             $proto->prop({
905             name => 'toString',
906             value => JE::Object::Function->new({
907             scope => $global,
908             name => 'toString',
909             no_proto => 1,
910             function_args => ['this'],
911             function => sub {
912 1     1   2 my ($self,) = @_;
913 1 50       4 die JE::Object::Error::TypeError->new(
914             $global, add_line_number
915             "Argument to toString is not a " .
916             "RegExp object"
917             ) unless $self->class eq 'RegExp';
918 1         4 JE::String->_new(
919             $global,
920             "/" . $self->prop('source')->value
921             . "/$$$self{regexp_flags}"
922             );
923             },
924 13         129 }),
925             dontenum => 1,
926             });
927              
928              
929 13         126 $f;
930             }
931              
932              
933             =back
934              
935             =head1 SEE ALSO
936              
937             =over 4
938              
939             =item JE
940              
941             =item JE::Types
942              
943             =item JE::Object
944              
945             =back
946              
947             =cut
948              
949