File Coverage

blib/lib/YAPE/Regex.pm
Criterion Covered Total %
statement 268 469 57.1
branch 99 220 45.0
condition 43 102 42.1
subroutine 16 23 69.5
pod 11 13 84.6
total 437 827 52.8


line stmt bran cond sub pod time code
1             package YAPE::Regex;
2              
3 1     1   59050 use YAPE::Regex::Element;
  1         1105  
  1         48  
4 1     1   6732 use Text::Balanced 'extract_codeblock';
  1         27521  
  1         151  
5 1     1   11 use Carp;
  1         2  
  1         64  
6 1     1   6 use strict;
  1         2  
  1         40  
7 1     1   6 use vars '$VERSION';
  1         2  
  1         967  
8              
9              
10             $VERSION = '4.00';
11              
12              
13             my $valid_POSIX = qr{
14             alpha | alnum | ascii | cntrl | digit | graph |
15             lower | print | punct | space | upper | word | xdigit
16             }x;
17              
18              
19             my $ok_cc_REx = qr{
20             \\([0-3][0-7]{2}) | # octal escapes
21             \\x([a-fA-F0-9]{2}|\{[a-fA-F0-9]+\}) | # hex escapes
22             \\c(.) | # control characters
23             \\([nrftbae]) | # known \X sequences
24             \\N\{([^\}]+)\} | # named characters
25             (\\[wWdDsS]) | # regex macros
26             \\[Pp]([A-Za-z]|\{[a-zA-Z]+\}) | # utf8 macros
27             \[:\^?([a-zA-Z]+):\] | # POSIX macros
28             \\?(.) # anything else
29             }xs;
30              
31              
32             my %pat = (
33             Pcomment => qr{ \( \? \# ( [^)]* ) \) }x,
34             Xcomment => qr{ \# [^\S\n]* ( .* \n ) }x,
35             Pflags => qr{ \( \? ([isxm]*)-?([ismx]*) \) }x,
36              
37             Pahead => qr{ \( \? ( [=!] ) }x,
38             Pbehind => qr{ \( \? < ( [=!] ) }x,
39             Pcond => qr{ \( \? (?: \( (\d+) \) | (?= \( \? (?:
40             Pcut => qr{ \( \? > }x,
41             Pgroup => qr{ \( \? ([isxm]*)-?([ismx]*) : }x,
42             Pcapture => qr{ \( (?! \? ) }x,
43             Pcode => qr{ \( \? (?= \{ ) }x,
44             Plater => qr{ \( \? \? (?= \{ ) }x,
45             Pclose => qr{ \) }x,
46              
47             quant => qr{ ( (?: [+*?] | \{ \d+ ,? \d* \} ) ) }x,
48             ngreed => qr{ ( \? ) }x,
49              
50             anchor => qr{ ( \\ [ABbGZz] | [\^\$] ) }x,
51             macro => qr{ \\ ( [dDwWsS] ) }x,
52             oct => qr{ \\ ( [0-3] [0-7] [0-7] ) }x,
53             hex => qr{ \\ x ( [a-fA-F0-9]{2} ) }x,
54             utf8hex => qr{ \\ x \{ ( [a-fA-F0-9]+ ) \} }x,
55             backref => qr{ \\ ( [1-9] \d* ) }x,
56             ctrl => qr{ \\ c ( . ) }x,
57             named => qr{ \\ N \{ ( [^\}]+ ) \} }x,
58             Cchar => qr{ \\ C }x,
59             slash => qr{ \\ ( . ) }xs,
60             any => qr{ \. }x,
61             class => qr{ \\ ([Pp]) ( [A-Za-z] | \{ [a-zA-Z]+ \} ) | \[ ( \^? ) ( \]? [^][\\]* (?: (?: \[:\w+:\] | \[ (?!:) | \\. ) [^][\\]* )* ) \] }x,
62             nws => qr{ ( (?: [^\s^\$|\\+*?()\[.\{]+ | \{ (?! \d+ ,? \d* \} ) )+ ) }x,
63             reg => qr{ ( (?: [^^\$|\\+*?()\[.\{] | \{ (?! \d+ ,? \d* \} ) )+ ) }x,
64              
65             alt => qr{ \| }x,
66             );
67              
68              
69             sub import {
70 1     1   11 shift;
71 1         8 my @obj = qw(
72             anchor macro oct hex utf8hex backref ctrl named Cchar slash
73             any class text alt comment whitespace flags lookahead lookbehind
74             conditional group capture code later close cut
75             );
76 1     1   6 no strict 'refs';
  1         2  
  1         8043  
77 1         3 for my $class ('YAPE::Regex', @_) {
78 1         7 (my $file = $class . ".pm") =~ s!::!/!g;
79 1 50 0     6 require $file and $class->import if not $INC{$file};
80 1 50       3 if ($class ne 'YAPE::Regex') {
81 0         0 push @{"${class}::ISA"}, 'YAPE::Regex';
  0         0  
82 0         0 push @{"${class}::${_}::ISA"},
83 0         0 "YAPE::Regex::$_", "${class}::Element" for @obj;
84             }
85 1         3 push @{"${class}::${_}::ISA"}, 'YAPE::Regex::Element' for @obj;
  26         289  
86             }
87             }
88              
89              
90             sub new {
91 12     12 1 2082 my ($class, $regex) = @_;
92              
93 12 50 33     13433 croak "no regex given to $class->new"
94             if not defined $regex or length($regex) == 0;
95              
96 12 50       51 eval { local $^W; $regex = qr/$regex/ } if ref($regex) ne 'Regexp';
  12         39  
  12         326  
97              
98 12 100       39 $regex = "(?-imsx:$regex)" if $@;
99              
100             # Make the qr stringification introduced in 5.13.6 look like the old
101             # qr stringification.
102 12 50       91 if ($regex =~ / ^ \( \? \^ ([imsx]+) (: .*) $ /x) {
    100          
103 0         0 my $switches = $1;
104 0         0 my $rest = $2;
105 0         0 my $inverted = invert($switches);
106 0 0       0 if (length $inverted) {
107 0         0 $regex = "(?$switches-$inverted$rest";
108             }
109             else {
110 0         0 $regex = "(?imsx$rest";
111             }
112             }
113             elsif ($regex =~ / ^ \( \? \^ : /x) {
114 9         44 $regex =~ s/\^/-imsx/;
115             }
116              
117 12         99 my $self = bless {
118             TREE => [],
119             TREE_STACK => [],
120             CAPTURE => [],
121             CONTENT => "$regex",
122             DEPTH => 0,
123             }, $class;
124 12         47 $self->{CURRENT} = $self->{TREE};
125              
126 12         195 return $self;
127             }
128              
129              
130             sub invert {
131             # Given an input string which looks like modifiers (ismx),
132             # return the inverse string.
133             # For example, if the input is 'ix', return 'ms'.
134 0     0 0 0 my %mods = map { $_ => 1 } qw(i m s x);
  0         0  
135 0         0 delete $mods{$_} for (split //, shift);
136 0         0 return join '', keys %mods;
137             }
138              
139              
140 0     0 1 0 sub state { $_[0]{STATE} }
141 2     2 1 32 sub error { $_[0]{ERROR} }
142 0     0 0 0 sub depth { $_[0]{DEPTH} }
143 1   50 1 1 10 sub chunk { substr $_[0]{CONTENT}, 0, $_[1] || 30 }
144 10     10 1 68 sub done { $_[0]{STATE} eq 'done' }
145 0     0 1 0 sub root { $_[0]{TREE}[0] }
146 0     0 1 0 sub top { $_[0]{TREE}[0] }
147 12     12 1 37 sub parse { 1 while $_[0]->next }
148 10 50   10 1 52 sub display { $_[0]->parse; $_[0]{TREE}[0]->fullstring if $_[0]->done; }
  10         25  
149              
150              
151             sub next {
152 111     111 1 147 my $self = shift;
153 111 100       276 $self->{STATE} = 'done', return unless length $self->{CONTENT};
154              
155 101 50 66     97 if (
      33        
156 101         548 @{$self->{TREE_STACK}} and
157             $self->{TREE_STACK}[-1]{MODE}{x} and
158             $self->{CONTENT} =~ s/^(\s+)//
159             ) {
160 0         0 my $node = (ref($self) . "::whitespace")->new($1);
161 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
162 0         0 return $node;
163             }
164              
165 101 50       520 if ($self->{CONTENT} =~ s/^$pat{Pcomment}//) {
166 0         0 my $node = (ref($self) . "::comment")->new($1,0);
167 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
168 0         0 $self->{STATE} = 'comment';
169 0         0 return $node;
170             }
171              
172 101 50 66     100 if (
      33        
173 101         521 @{ $self->{TREE_STACK} } and
174             $self->{TREE_STACK}[-1]{MODE}{x} and
175             $self->{CONTENT} =~ s/^$pat{Xcomment}//
176             ) {
177 0         0 my $node = (ref($self) . "::comment")->new($1,1);
178 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
179 0         0 $self->{STATE} = 'comment';
180 0         0 return $node;
181             }
182              
183 101 50       531 if ($self->{CONTENT} =~ s/^$pat{anchor}//) {
184 0         0 my $match = $1;
185 0         0 my ($quant,$ngreed) = $self->_get_quant;
186 0 0       0 return if $quant eq -1;
187 0         0 my $node = (ref($self) . "::anchor")->new($match,$quant,$ngreed);
188 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
189 0         0 $self->{STATE} = 'anchor';
190 0         0 return $node;
191             }
192              
193 101 50       353 if ($self->{CONTENT} =~ s/^$pat{macro}//) {
194 0         0 my $match = $1;
195 0         0 my ($quant,$ngreed) = $self->_get_quant;
196 0 0       0 return if $quant eq -1;
197 0         0 my $node = (ref($self) . "::macro")->new($match,$quant,$ngreed);
198 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
199 0         0 $self->{STATE} = 'macro';
200 0         0 return $node;
201             }
202              
203 101 50       339 if ($self->{CONTENT} =~ s/^$pat{oct}//) {
204 0         0 my $match = $1;
205 0         0 my ($quant,$ngreed) = $self->_get_quant;
206 0 0       0 return if $quant eq -1;
207 0         0 my $node = (ref($self) . "::oct")->new($match,$quant,$ngreed);
208 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
209 0         0 $self->{STATE} = 'oct';
210 0         0 return $node;
211             }
212              
213 101 100       476 if ($self->{CONTENT} =~ s/^$pat{hex}//) {
214 1         3 my $match = $1;
215 1         4 my ($quant,$ngreed) = $self->_get_quant;
216 1         14 my $node = (ref($self) . "::hex")->new($match,$quant,$ngreed);
217 1         3 push @{ $self->{CURRENT} }, $node;
  1         3  
218 1         3 $self->{STATE} = 'hex';
219 1         11 return $node;
220             }
221              
222 100 50       330 if ($self->{CONTENT} =~ s/^$pat{utf8hex}//) {
223 0         0 my $match = $1;
224 0         0 my ($quant,$ngreed) = $self->_get_quant;
225 0         0 my $node = (ref($self) . "::utf8hex")->new($match,$quant,$ngreed);
226 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
227 0         0 $self->{STATE} = 'utf8hex';
228 0         0 return $node;
229             }
230              
231 100 100       325 if ($self->{CONTENT} =~ s/^$pat{backref}//) {
232 1         3 my $match = $1;
233 1         3 my ($quant,$ngreed) = $self->_get_quant;
234 1 50       4 return if $quant eq -1;
235 1         9 my $node = (ref($self) . "::backref")->new($match,$quant,$ngreed);
236              
237             # this code is special for YAPE::Regex::Reverse
238 1 50       8 if ($self->isa('YAPE::Regex::Reverse')) {
239 0 0 0     0 if ($quant eq '*' or $quant eq '+') {
    0 0        
240 0         0 $node = (ref($self) . "::group")->new;
241 0 0       0 $node->{NGREED} = '?' if $quant eq '*';
242 0         0 $node->{CONTENT} = [
243             (ref($self) . "::backref")->new($match,'*'),
244             (ref($self) . "::backref")->new($match),
245             ];
246             }
247             elsif ($quant and $quant ne '?') {
248 0         0 my ($l,$u) = $quant =~ /(\d+),(\d*)/;
249 0         0 $node = (ref($self) . "::group")->new;
250 0 0       0 $node->{NGREED} = '?' if not $l;
251 0 0       0 $l-- if $l; $u-- if $u;
  0 0       0  
252 0         0 $node->{CONTENT} = [
253             (ref($self) . "::backref")->new($match,"{$l,$u}"),
254             (ref($self) . "::backref")->new($match),
255             ];
256             }
257             }
258 1         1 push @{ $self->{CURRENT} }, $node;
  1         3  
259 1         2 $self->{STATE} = 'backref';
260 1         10 return $node;
261             }
262              
263 99 50       298 if ($self->{CONTENT} =~ s/^$pat{ctrl}//) {
264 0         0 my $match = $1;
265 0         0 my ($quant,$ngreed) = $self->_get_quant;
266 0 0       0 return if $quant eq -1;
267 0         0 my $node = (ref($self) . "::ctrl")->new($match,$quant,$ngreed);
268 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
269 0         0 $self->{STATE} = 'ctrl';
270 0         0 return $node;
271             }
272              
273 99 50       323 if ($self->{CONTENT} =~ s/^$pat{named}//) {
274 0         0 my $match = $1;
275 0         0 my ($quant,$ngreed) = $self->_get_quant;
276 0 0       0 return if $quant eq -1;
277 0         0 my $node = (ref($self) . "::named")->new($match,$quant,$ngreed);
278 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
279 0         0 $self->{STATE} = 'named';
280 0         0 return $node;
281             }
282              
283 99 50       296 if ($self->{CONTENT} =~ s/^$pat{Cchar}//) {
284 0         0 my ($quant,$ngreed) = $self->_get_quant;
285 0 0       0 return if $quant eq -1;
286 0         0 my $node = (ref($self) . "::Cchar")->new($quant,$ngreed);
287 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
288 0         0 $self->{STATE} = 'Cchar';
289 0         0 return $node;
290             }
291              
292 99 100       431 if ($self->{CONTENT} =~ s/^$pat{class}//) {
293 6 50       22 my ($neg,$match) = defined($1) ? ($1,$2) : ($3,$4);
294 6 50       13 $match =~ tr/{}//d if defined $1;
295            
296 6         13 my ($quant,$ngreed) = $self->_get_quant;
297 6 50       14 return if $quant eq -1;
298 6 50       12 return unless $self->_ok_class($match);
299 6         26 my $node = (ref($self) . "::class")->new($match,$neg,$quant,$ngreed);
300 6         7 push @{ $self->{CURRENT} }, $node;
  6         12  
301 6         8 $self->{STATE} = 'class';
302 6         35 return $node;
303             }
304              
305 93 50       324 if ($self->{CONTENT} =~ s/^$pat{slash}//) {
306 0         0 my $match = $1;
307 0         0 my ($quant,$ngreed) = $self->_get_quant;
308 0 0       0 return if $quant eq -1;
309 0         0 my $node = (ref($self) . "::slash")->new($match,$quant,$ngreed);
310 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
311 0         0 $self->{STATE} = 'slash';
312 0         0 return $node;
313             }
314              
315 93 50       287 if ($self->{CONTENT} =~ s/^$pat{any}//) {
316 0         0 my ($quant,$ngreed) = $self->_get_quant;
317 0 0       0 return if $quant eq -1;
318 0         0 my $node = (ref($self) . "::any")->new($quant,$ngreed);
319 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
320 0         0 $self->{STATE} = 'any';
321 0         0 return $node;
322             }
323              
324 93 50 66     110 if (
      33        
325 93         456 @{ $self->{TREE_STACK} } and
326             $self->{TREE_STACK}[-1]{MODE}{x} and
327             $self->{CONTENT} =~ s/^$pat{nws}//
328             ) {
329 0         0 my $match = $1;
330 0         0 my $node;
331              
332 0 0 0     0 if (length($match) > 1 and $self->{CONTENT} =~ /^(?:$pat{Pcomment}|$pat{Xcomment}|\s+)*$pat{quant}/) {
333 0         0 $self->{CONTENT} = chop($match) . $self->{CONTENT};
334 0         0 $node = (ref($self) . "::text")->new($match,"","");
335             }
336             else {
337 0         0 my ($quant,$ngreed) = $self->_get_quant;
338 0 0       0 return if $quant eq -1;
339 0         0 $node = (ref($self) . "::text")->new($match,$quant,$ngreed);
340             }
341              
342 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
343 0         0 $self->{STATE} = 'text';
344 0         0 return $node;
345             }
346              
347 93 100       506 if ($self->{CONTENT} =~ s/^$pat{reg}//) {
348 26         54 my $match = $1;
349 26         27 my $node;
350 26 50 66     154 if (length($match) > 1 and $self->{CONTENT} =~ /^$pat{quant}/) {
351 0         0 $self->{CONTENT} = chop($match) . $self->{CONTENT};
352 0         0 $node = (ref($self) . "::text")->new($match,"","");
353             }
354             else {
355 26         61 my ($quant,$ngreed) = $self->_get_quant;
356 26 100       75 return if $quant eq -1;
357 25         123 $node = (ref($self) . "::text")->new($match,$quant,$ngreed);
358             }
359 25         30 push @{ $self->{CURRENT} }, $node;
  25         53  
360 25         35 $self->{STATE} = 'text';
361 25         148 return $node;
362             }
363              
364 67 100       220 if ($self->{CONTENT} =~ s/^$pat{alt}//) {{
  4         21  
365 4 50 66     4 if (
  4   66     6  
366             @{ $self->{TREE_STACK} } and
367             $self->{TREE_STACK}[-1]->type eq 'cond' and
368             $self->{TREE_STACK}[-1]{OPTS} == 2
369             ) {
370 0         0 $self->{CONTENT} = '|' . $self->{CONTENT};
371 0         0 last;
372             }
373 4         18 my $node = (ref($self) . "::alt")->new;
374 4 100 66     5 if (
375 4         26 @{ $self->{TREE_STACK} } and
376             $self->{TREE_STACK}[-1]->type eq 'cond'
377             ) {
378 1         3 $self->{TREE_STACK}[-1]{OPTS}++;
379 1         3 $self->{CURRENT} = $self->{TREE_STACK}[-1]{FALSE};
380             }
381             else {
382 3         3 push @{ $self->{CURRENT} }, $node;
  3         7  
383             }
384 4         7 $self->{STATE} = 'alt';
385 4         40 return $node;
386             }}
387              
388 63 50       285 if ($self->{CONTENT} =~ s/^$pat{Pflags}//) {
389 0         0 my ($add,$sub) = ($1,$2);
390 0         0 my $mode = $self->{TREE_STACK}[-1]{MODE};
391 0         0 @{$mode}{split //, $add} = (1) x length($add);
  0         0  
392 0         0 delete @{$mode}{split //, $sub};
  0         0  
393 0         0 my $node = (ref($self) . "::flags")->new($add,$sub);
394 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
395 0         0 $self->{STATE} = 'flags';
396 0         0 return $node;
397             }
398              
399 63 100       297 if ($self->{CONTENT} =~ s/^$pat{Pcond}//) {
400 7 100       19 if (defined $1) {
401 1         10 my $node = (ref($self) . "::conditional")->new($1);
402 1         12 $node->{MODE} = { %{ $self->{TREE_STACK}[-1]{MODE} } } if
  1         5  
403 1 50       2 @{ $self->{TREE_STACK} };
404 1         2 push @{ $self->{TREE_STACK} }, $node;
  1         2  
405 1         2 push @{ $self->{CURRENT} }, $node;
  1         2  
406 1         2 $self->{CURRENT} = $node->{TRUE};
407 1         2 $self->{DEPTH}++;
408 1         4 $self->{STATE} = "cond($1)";
409 1         7 return $node;
410             }
411             else {
412 6         32 my $node = (ref($self) . "::conditional")->new;
413 6         26 $node->{MODE} = { %{ $self->{TREE_STACK}[-1]{MODE} } } if
  6         20  
414 6 50       11 @{ $self->{TREE_STACK} };
415 6         12 push @{ $self->{TREE_STACK} }, $node;
  6         10  
416 6         9 push @{ $self->{CURRENT} }, $node;
  6         12  
417 6         13 $self->{CURRENT} = $node->{CONTENT};
418 6         8 $self->{DEPTH}++;
419 6         11 $self->{STATE} = "cond(assert)";
420 6         36 return $node;
421             }
422             }
423              
424 56 50       226 if ($self->{CONTENT} =~ s/^$pat{Pcut}//) {
425 0         0 my ($quant,$ngreed) = $self->_get_quant;
426 0 0       0 return if $quant eq -1;
427 0         0 my $node = (ref($self) . "::cut")->new([],$quant,$ngreed);
428 0         0 $node->{MODE} = { %{ $self->{TREE_STACK}[-1]{MODE} } };
  0         0  
429 0         0 push @{ $self->{TREE_STACK} }, $node;
  0         0  
430 0         0 push @{ $self->{CURRENT} }, $node;
  0         0  
431 0         0 $self->{CURRENT} = $node->{CONTENT};
432 0         0 $self->{DEPTH}++;
433 0         0 $self->{STATE} = 'cut';
434 0         0 return $node;
435             }
436              
437 56 100       214 if ($self->{CONTENT} =~ s/^$pat{Pahead}//) {
438 3 100       37 my $node = (ref($self) . "::lookahead")->new($1 eq '=' ? 1 : 0);
439 3         4 $node->{MODE} = { %{ $self->{TREE_STACK}[-1]{MODE} } };
  3         16  
440 3         4 push @{ $self->{TREE_STACK} }, $node;
  3         5  
441 3         4 push @{ $self->{CURRENT} }, $node;
  3         5  
442 3         6 $self->{CURRENT} = $node->{CONTENT};
443 3         4 $self->{DEPTH}++;
444 3         11 $self->{STATE} = 'lookahead(' . ('neg','pos')[$1 eq '='] . ')';
445 3         19 return $node;
446             }
447              
448 53 100       190 if ($self->{CONTENT} =~ s/^$pat{Pbehind}//) {
449 3 100       26 my $node = (ref($self) . "::lookbehind")->new($1 eq '=' ? 1 : 0);
450 3         4 $node->{MODE} = { %{ $self->{TREE_STACK}[-1]{MODE} } };
  3         19  
451 3         5 push @{ $self->{TREE_STACK} }, $node;
  3         7  
452 3         5 push @{ $self->{CURRENT} }, $node;
  3         5  
453 3         6 $self->{CURRENT} = $node->{CONTENT};
454 3         36 $self->{DEPTH}++;
455 3         11 $self->{STATE} = 'lookbehind(' . ('neg','pos')[$1 eq '='] . ')';
456 3         19 return $node;
457             }
458              
459 50 100       216 if ($self->{CONTENT} =~ s/^$pat{Pgroup}//) {
460 13         37 my ($add,$sub) = ($1,$2);
461 13   100     106 my $node = (ref($self) . "::group")->new($add || "",$sub || "");
      100        
462 1         4 $node->{MODE} = { %{ $self->{TREE_STACK}[-1]{MODE} } } if
  13         34  
463 13 100       25 @{ $self->{TREE_STACK} };
464 13         27 @{$node->{MODE}}{split //, $add} = (1) x length($add);
  13         39  
465 13         27 delete @{$node->{MODE}}{split //, $sub};
  13         23  
466 13         20 push @{ $self->{TREE_STACK} }, $node;
  13         23  
467 13         20 push @{ $self->{CURRENT} }, $node;
  13         24  
468 13         23 $self->{CURRENT} = $node->{CONTENT};
469 13         14 $self->{DEPTH}++;
470 13         21 $self->{STATE} = 'group';
471 13         107 return $node;
472             }
473              
474 37 100       142 if ($self->{CONTENT} =~ s/^$pat{Pcapture}//) {
475 2         13 my $node = (ref($self) . "::capture")->new;
476 2         12 $node->{MODE} = { %{ $self->{TREE_STACK}[-1]{MODE} } } if
  2         7  
477 2 50       3 @{ $self->{TREE_STACK} };
478 2         3 push @{ $self->{TREE_STACK} }, $node;
  2         4  
479 2         4 push @{ $self->{CURRENT} }, $node;
  2         3  
480 2         4 push @{ $self->{CAPTURE} }, $node;
  2         3  
481 2         4 $self->{CURRENT} = $node->{CONTENT};
482 2         3 $self->{DEPTH}++;
483 2         3 $self->{STATE} = 'capture(' . @{ $self->{CAPTURE} } . ')';
  2         17  
484 2         14 return $node;
485             }
486              
487 35 100       144 if ($self->{CONTENT} =~ s/^$pat{Pcode}//) {
488 2 50       15 my ($code,$left) = extract_codeblock($self->{CONTENT}) or do {
489 0         0 $self->{ERROR} = 'bad code in (?{ ... }) assertion';
490 0         0 $self->{STATE} = 'error';
491 0         0 return;
492             };
493            
494 2         958 $self->{CONTENT} = $left;
495 2         16 my $node = (ref($self) . "::code")->new($code);
496 2         4 push @{ $self->{TREE_STACK} }, $node;
  2         5  
497 2         4 push @{ $self->{CURRENT} }, $node;
  2         4  
498 2         5 $self->{DEPTH}++;
499 2         3 $self->{STATE} = 'code';
500 2         19 return $node;
501             }
502            
503 33 100       133 if ($self->{CONTENT} =~ s/^$pat{Plater}//) {
504 2 50       9 my ($code,$left) = extract_codeblock($self->{CONTENT}) or do {
505 0         0 $self->{ERROR} = 'bad code in (??{ ... }) assertion';
506 0         0 $self->{STATE} = 'error';
507 0         0 return;
508             };
509            
510 2         586 $self->{CONTENT} = $left;
511 2         16 my $node = (ref($self) . "::later")->new($code);
512 2         4 push @{ $self->{TREE_STACK} }, $node;
  2         4  
513 2         3 push @{ $self->{CURRENT} }, $node;
  2         4  
514 2         4 $self->{DEPTH}++;
515 2         4 $self->{STATE} = 'later';
516 2         18 return $node;
517             }
518            
519 31 100 66     244 if ($self->{DEPTH}-- and $self->{CONTENT} =~ s/^$pat{Pclose}//) {
520 30         62 my ($quant,$ngreed) = $self->_get_quant;
521 30 50       64 return if $quant eq -1;
522 30         113 my $node = (ref($self) . "::close")->new;
523            
524 30         39 $self->{CURRENT} = pop @{ $self->{TREE_STACK} };
  30         69  
525 30         83 $node->{QUANT} = $self->{CURRENT}{QUANT} = $quant;
526 30         58 $node->{NGREED} = $self->{CURRENT}{NGREED} = $ngreed;
527              
528             # this code is special to YAPE::Regex::Reverse
529 30 50       184 if ($self->isa('YAPE::Regex::Reverse')) {
530 0 0 0     0 if ($quant eq '*' or $quant eq '+') {
    0 0        
531 0         0 my $old = $self->{CURRENT}{CONTENT};
532 0         0 $self->{CURRENT}{CONTENT} = [
533             (ref($self) . "::group")->new,
534             (ref($self) . "::capture")->new,
535             ];
536 0 0       0 $self->{CURRENT}{NGREED} = '?' if $quant eq '*';
537 0         0 $self->{CURRENT}{CONTENT}[0]{CONTENT} = $old;
538 0         0 $self->{CURRENT}{CONTENT}[0]{QUANT} = '*';
539 0         0 $self->{CURRENT}{CONTENT}[1]{CONTENT} = $old;
540 0         0 $self->{CAPTURE}[-1] = $self->{CURRENT}{CONTENT}[1];
541 0         0 bless $self->{CURRENT}, (ref($self) . '::group');
542             }
543             elsif ($quant and $quant ne '?') {
544 0         0 my ($l,$u) = $quant =~ /(\d+),(\d*)/;
545 0         0 my $old = $self->{CURRENT}{CONTENT};
546 0         0 $self->{CURRENT}{CONTENT} = [
547             (ref($self) . "::group")->new,
548             (ref($self) . "::capture")->new,
549             ];
550 0 0       0 $self->{CURRENT}{NGREED} = '?' if not $l;
551 0 0       0 $l-- if $l; $u-- if $u;
  0 0       0  
552 0         0 $self->{CURRENT}{CONTENT}[0]{CONTENT} = $old;
553 0         0 $self->{CURRENT}{CONTENT}[0]{QUANT} = "{$l,$u}";
554 0         0 $self->{CURRENT}{CONTENT}[1]{CONTENT} = $old;
555 0         0 $self->{CAPTURE}[-1] = $self->{CURRENT}{CONTENT}[1];
556 0         0 bless $self->{CURRENT}, (ref($self) . '::group');
557             }
558             }
559              
560 30 100 100     34 if (
      66        
561 30         135 @{ $self->{TREE_STACK} } and
562             $self->{TREE_STACK}[-1]->type eq 'cond' and
563             $self->{TREE_STACK}[-1]{OPTS} == 1
564             ) {
565 6         15 $self->{CURRENT} = $self->{TREE_STACK}[-1]{TRUE};
566             }
567             else {
568 24         37 $self->{CURRENT} = $self->{TREE_STACK}[-1];
569 24         43 $self->{CURRENT} = $self->{CURRENT}{CONTENT};
570             }
571            
572 30         48 $self->{STATE} = 'close';
573 30         230 return $node;
574             }
575              
576 1         6 my $token = $self->chunk(1);
577 1         5 $self->{ERROR} = "unexpected token '$token' during '$self->{STATE}'";
578 1         2 $self->{STATE} = 'error';
579              
580 1         21 return;
581             }
582              
583              
584             sub extract {
585 0     0 1 0 my $self = shift;
586 0         0 $self->parse;
587            
588 0         0 my @nodes = @{ $self->{CAPTURE} };
  0         0  
589            
590 0     0   0 return sub { shift @nodes };
  0         0  
591             }
592              
593              
594             sub _get_quant {
595 64     64   74 my $self = shift;
596 64         106 my ($quant,$ngreed) = ("","");
597              
598 64 100 33     413 if (
      33        
      66        
599 57         307 $self->{CONTENT} =~ s/^($pat{Pcomment})?$pat{quant}// or
600             (@{ $self->{TREE_STACK} } and $self->{TREE_STACK}[-1]{MODE}{x} and
601             $self->{CONTENT} =~ s/^($pat{Xcomment}?\s*)?$pat{quant}//)
602             ) {
603 7         18 $quant = $+;
604             {
605 7 100 100     7 if ($quant =~ /^\{(\d+),(\d+)\}/ and $1 > $2) {
  7         39  
606 1         4 $self->{ERROR} = "upper bound lower than lower bound ($quant)";
607 1         3 $self->{STATE} = 'error';
608 1         3 return -1;
609             }
610             }
611 6 50       14 $self->{CONTENT} = $1 . $self->{CONTENT} if $1;
612             }
613              
614 63         306 my ($ws) = $1 if
615 63 50 33     73 @{ $self->{TREE_STACK} } and
      33        
616             $self->{TREE_STACK}[-1]{MODE}{x} and
617             $self->{CONTENT} =~ s/^(\s+)//;
618              
619 63 50 33     74 if (
      33        
      33        
620 63         651 (@{ $self->{TREE_STACK} } and $self->{TREE_STACK}[-1]{MODE}{x} and
621             $self->{CONTENT} =~ s/^($pat{Xcomment}?\s*)?$pat{ngreed}//) or
622             $self->{CONTENT} =~ s/^($pat{Pcomment})?$pat{ngreed}//
623             ) {
624 0         0 $ngreed = $+;
625 0 0       0 $self->{CONTENT} = $1 . $self->{CONTENT} if $1;
626             }
627              
628 63 50       116 $self->{CONTENT} = $ws . $self->{CONTENT} if $ws;
629              
630 63         201 return ($quant,$ngreed);
631             }
632              
633              
634             sub _ok_class {
635 6     6   8 my ($self,$class) = @_;
636              
637 6         110 while ($class =~ s/^($ok_cc_REx)//) {
638 30         51 my $c1 = $1;
639              
640 30 50       173 my $a =
    50          
    50          
    50          
    50          
    50          
641             defined($2) ? oct($2) :
642             defined($3) ? hex(($3 =~ /(\w+)/)[0]) :
643             defined($4) ? ord($4) - 64 :
644             defined($5) ? ord(eval qq{"\\$5"}) :
645             defined($6) ? ord(eval qq{use charnames ':full'; "\\N{$6}"}) :
646             defined($10) ? ord($10) :
647             -1;
648              
649 30         42 my ($utf8,$posix) = ($8,$9);
650            
651 30 50       67 $utf8 =~ tr/{}//d if defined $utf8;
652              
653 30 50 33     58 if (defined($posix) and $posix !~ $valid_POSIX) {
654 0         0 $self->{ERROR} = "unknown POSIX class $c1";
655 0         0 $self->{STATE} = 'error';
656 0         0 return;
657             }
658              
659 30 50       313 if ($class =~ s/^-($ok_cc_REx)//) {
660 0         0 my $c2 = $1;
661 0 0       0 my $b =
    0          
    0          
    0          
    0          
    0          
662             defined($2) ? oct($2) :
663             defined($3) ? hex(($3 =~ /(\w+)/)[0]) :
664             defined($4) ? ord($4) - 64 :
665             defined($5) ? ord(eval qq{"\\$5"}) :
666             defined($6) ? ord(eval qq{use charnames ':full'; "\\N{$6}"}) :
667             defined($10) ? ord($10) :
668             -1;
669              
670 0         0 my ($utf8,$posix) = ($8,$9);
671            
672 0 0       0 $utf8 =~ tr/{}//d if defined $utf8;
673            
674 0 0 0     0 if (defined($posix) and $posix !~ $valid_POSIX) {
675 0         0 $self->{ERROR} = "unknown POSIX class $c2";
676 0         0 $self->{STATE} = 'error';
677 0         0 return;
678             }
679            
680 0 0 0     0 if ($a == -1 or $b == -1) {
    0          
681 0 0       0 carp qq{false [] range "$c1-$c2"} if $^W;
682             }
683             elsif ($a > $b) {
684 0         0 $self->{ERROR} = "invalid [] range $c1-$c2";
685 0         0 $self->{STATE} = 'error';
686 0         0 return;
687             }
688             }
689             }
690              
691 6         15 return 1;
692             }
693              
694              
695             1;
696              
697              
698             =head1 NAME
699              
700             YAPE::Regex - Yet Another Parser/Extractor for Regular Expressions
701              
702             =head1 VERSION
703              
704             This document refers to YAPE::Regex version 4.00.
705              
706             =head1 SYNOPSIS
707              
708             use YAPE::Regex;
709             use strict;
710            
711             my $regex = qr/reg(ular\s+)?exp?(ression)?/i;
712             my $parser = YAPE::Regex->new($regex);
713            
714             # here is the tokenizing part
715             while (my $chunk = $parser->next) {
716             # ...
717             }
718              
719             =head1 C MODULES
720              
721             The C hierarchy of modules is an attempt at a unified means of parsing
722             and extracting content. It attempts to maintain a generic interface, to
723             promote simplicity and reusability. The API is powerful, yet simple. The
724             modules do tokenization (which can be intercepted) and build trees, so that
725             extraction of specific nodes is doable.
726              
727             =head1 DESCRIPTION
728              
729             This module is yet another (?) parser and tree-builder for Perl regular
730             expressions. It builds a tree out of a regex, but at the moment, the extent of
731             the extraction tool for the tree is quite limited (see L).
732             However, the tree can be useful to extension modules.
733              
734             =head1 USAGE
735              
736             In addition to the base class, C, there is the auxiliary class
737             C (common to all C base classes) that holds the
738             individual nodes' classes. There is documentation for the node classes in
739             that module's documentation.
740              
741             =head2 Methods for C
742              
743             =over 4
744              
745             =item * C
746              
747             =item * C
748              
749             If supplied no arguments, the module is loaded normally, and the node classes
750             are given the proper inheritence (from C). If you supply
751             a module (or list of modules), C will automatically include them (if
752             needed) and set up I node classes with the proper inheritence -- that is,
753             it will append C to C<@MyExt::Mod::ISA>, and C
754             to each node class's C<@ISA> (where C is the name of the specific node
755             class).
756              
757             package MyExt::Mod;
758             use YAPE::Regex 'MyExt::Mod';
759            
760             # does the work of:
761             # @MyExt::Mod::ISA = 'YAPE::Regex'
762             # @MyExt::Mod::text::ISA = 'YAPE::Regex::text'
763             # ...
764              
765             =item * Cnew($REx);>
766              
767             Creates a C object, using the contents of C<$REx> as a regular
768             expression. The C method will I to convert C<$REx> to a compiled
769             regex (using C) if C<$REx> isn't already one. If there is an error in
770             the regex, this will fail, but the parser will pretend it was ok. It will then
771             report the bad token when it gets to it, in the course of parsing.
772              
773             =item * Cchunk($len);>
774              
775             Returns the next C<$len> characters in the input string; C<$len> defaults to
776             30 characters. This is useful for figuring out why a parsing error occurs.
777              
778             =item * Cdone;>
779              
780             Returns true if the parser is done with the input string, and false otherwise.
781              
782             =item * Cerror;>
783              
784             Returns the parser error message.
785              
786             =item * Cextract;>
787              
788             Returns a code reference that returns the next back-reference in the regex.
789             For more information on enhancements in upcoming versions of this module, check
790             L.
791              
792             =item * Cdisplay(...);>
793              
794             Returns a string representation of the entire content. It calls the C
795             method in case there is more data that has not yet been parsed. This calls the
796             C method on the root nodes. Check the C docs
797             on the arguments to C.
798              
799             =item * Cnext;>
800              
801             Returns the next token, or C if there is no valid token. There will be
802             an error message (accessible with the C method) if there was a problem in
803             the parsing.
804              
805             =item * Cparse;>
806              
807             Calls C until all the data has been parsed.
808              
809             =item * Croot;>
810              
811             Returns the root node of the tree structure.
812              
813             =item * Cstate;>
814              
815             Returns the current state of the parser. It is one of the following values:
816             C, C, C, C, C, C, C, C,
817             C, C, C, C, C, C, C, C,
818             C, C, C, C, C,
819             C, C, C, C, C, and C.
820              
821             For C, I will be the number the captured pattern represents.
822              
823             For C, I will either be a number representing the
824             back-reference that the conditional depends on, or the string C.
825              
826             For C and C, one of C and C will be there,
827             depending on the type of assertion.
828              
829             =item * Ctop;>
830              
831             Synonymous to C.
832              
833             =back
834              
835             =head2 Extracting Sections
836              
837             While extraction of nodes is the goal of the C modules, the author is at
838             a loss for words as to what needs to be extracted from a regex. At the current
839             time, all the C method does is allow you access to the regex's set of
840             back-references:
841              
842             my $extor = $parser->extract;
843             while (my $backref = $extor->()) {
844             # ...
845             }
846              
847             C is very open to suggestions as to the approach to node extraction (in
848             how the API should look, in addition to what should be proffered). Preliminary
849             ideas include extraction keywords like the output of B<-Dr> (or the C
850             module's C option).
851              
852             =head1 EXTENSIONS
853              
854             =over 4
855              
856             =item * C
857              
858             Presents an explanation of a regular expression, node by node.
859              
860             =item * C (Not released)
861              
862             Reverses the nodes of a regular expression.
863              
864             =back
865              
866             =head1 TO DO
867              
868             This is a listing of things to add to future versions of this module.
869              
870             =head2 API
871              
872             =over 4
873              
874             =item * Create a robust C method
875              
876             Open to suggestions.
877              
878             =back
879              
880             =head1 BUGS
881              
882             Following is a list of known or reported bugs.
883              
884             =head2 Pending
885              
886             =over 4
887              
888             =item * C
889              
890             To understand C<\N{...}> properly, you must be using 5.6.0 or higher. However,
891             the parser only knows how to resolve full names (those made using C
892             ':full'>). There might be an option in the future to specify a class name.
893              
894             =back
895              
896             =head1 SEE ALSO
897              
898             The C documentation, for information on the node classes.
899             Also, C, Damian Conway's excellent module, used for the matching
900             of C<(?{ ... })> and C<(??{ ... })> blocks.
901              
902             =head1 AUTHOR
903              
904             The original author is Jeff "japhy" Pinyan (CPAN ID: PINYAN).
905              
906             Gene Sullivan (gsullivan@cpan.org) is a co-maintainer.
907              
908             =head1 LICENSE
909              
910             This module is free software; you can redistribute it and/or modify
911             it under the same terms as Perl itself. See L.
912              
913             =cut