File Coverage

blib/lib/Regexp/Parsertron.pm
Criterion Covered Total %
statement 213 243 87.6
branch 74 102 72.5
condition 7 12 58.3
subroutine 27 33 81.8
pod 11 13 84.6
total 332 403 82.3


line stmt bran cond sub pod time code
1             package Regexp::Parsertron;
2              
3 6     6   48414 use strict;
  6         35  
  6         132  
4 6     6   24 use warnings;
  6         8  
  6         130  
5             #use warnings qw(FATAL utf8); # Fatalize encoding glitches.
6              
7 6     6   2213 use Data::Section::Simple 'get_data_section';
  6         2934  
  6         281  
8              
9 6     6   2130 use Marpa::R2;
  6         737094  
  6         216  
10              
11 6     6   2592 use Moo;
  6         59700  
  6         23  
12              
13 6     6   9336 use Scalar::Does '-constants'; # For does().
  6         536415  
  6         61  
14              
15 6     6   14304 use Tree;
  6         27697  
  6         162  
16              
17 6     6   33 use Try::Tiny;
  6         10  
  6         276  
18              
19 6     6   30 use Types::Standard qw/Any Int Str/;
  6         14  
  6         37  
20              
21             has bnf =>
22             (
23             default => sub{return ''},
24             is => 'rw',
25             isa => Any,
26             required => 0,
27             );
28              
29             has current_node =>
30             (
31             default => sub{return ''},
32             is => 'rw',
33             isa => Any,
34             required => 0,
35             );
36              
37             has grammar =>
38             (
39             default => sub {return ''},
40             is => 'rw',
41             isa => Any,
42             required => 0,
43             );
44              
45             has re =>
46             (
47             default => sub {return ''},
48             is => 'rw',
49             isa => Any,
50             required => 0,
51             );
52              
53             has recce =>
54             (
55             default => sub{return ''},
56             is => 'rw',
57             isa => Any,
58             required => 0,
59             );
60              
61             has test_count =>
62             (
63             default => sub{return 0},
64             is => 'rw',
65             isa => Int,
66             required => 0,
67             );
68              
69             has tree =>
70             (
71             default => sub{return Tree -> new('Root')},
72             is => 'rw',
73             isa => Any,
74             required => 0,
75             );
76              
77             has uid =>
78             (
79             default => sub {return 0},
80             is => 'rw',
81             isa => Int,
82             required => 0,
83             );
84              
85             has verbose =>
86             (
87             default => sub {return 0},
88             is => 'rw',
89             isa => Int,
90             required => 0,
91             );
92              
93             has warning_str =>
94             (
95             default => sub {return ''},
96             is => 'rw',
97             isa => Str,
98             required => 0,
99             );
100              
101             our $VERSION = '1.03';
102              
103             # ------------------------------------------------
104              
105             sub BUILD
106             {
107 5     5 0 99 my($self) = @_;
108 5         23 my($bnf) = get_data_section('V 5.20');
109              
110 5         5924 $self -> bnf($bnf);
111 5         200 $self -> grammar
112             (
113             Marpa::R2::Scanless::G -> new
114             ({
115             source => \$self -> bnf
116             })
117             );
118 5         2017600 $self -> reset;
119              
120             } # End of BUILD.
121              
122             # ------------------------------------------------
123              
124             sub append
125             {
126 2     2 1 23 my($self, %opts) = @_;
127              
128 2         6 for my $param (qw/text uid/)
129             {
130 4 50       19 die "Method append() takes a hash with these keys: text, uid\n" if (! defined($opts{$param}) );
131             }
132              
133 2         5 my($meta);
134             my($uid);
135              
136 2         40 for my $node ($self -> tree -> traverse)
137             {
138 14 100       268 next if ($node -> is_root);
139              
140 12         75 $meta = $node -> meta;
141 12         67 $uid = $$meta{uid};
142              
143 12 100       25 if ($opts{uid} == $uid)
144             {
145 2         7 $$meta{text} .= $opts{text};
146             }
147             }
148              
149             } # End of append.
150              
151             # ------------------------------------------------
152              
153             sub _add_daughter
154             {
155 14841     14841   26962 my($self, $event_name, $attributes) = @_;
156 14841         227402 $$attributes{uid} = $self -> uid($self -> uid + 1);
157 14841         384243 my($node) = Tree -> new($event_name);
158              
159 14841         531561 $node -> meta($attributes);
160              
161 14841 100       175833 if ($event_name =~ /^close_(?:bracket|parenthesis)$/)
162             {
163 3089         48998 $self -> current_node($self -> current_node -> parent);
164             }
165              
166 14841         322566 $self -> current_node -> add_child($node);
167              
168 14841 100 100     3322057 if ( ($event_name =~ /^open_(?:bracket|parenthesis)$/) || ($event_name =~ /_prefix$/) )
169             {
170 3308         54622 $self -> current_node($node);
171             }
172              
173             } # End of _add_daughter.
174              
175             # ------------------------------------------------
176              
177             sub as_string
178             {
179 872     872 1 6560 my($self) = @_;
180 872         1889 my($string) = '';
181              
182 872         1576 my($meta);
183              
184 872         14877 for my $node ($self -> tree -> traverse)
185             {
186 8382 100       123145 next if ($node -> is_root);
187              
188 7510         45367 $meta = $node -> meta;
189 7510         41029 $string .= $$meta{text};
190             }
191              
192 872         2515 return $string;
193              
194             } # End of as_string.
195              
196             # ------------------------------------------------
197              
198             sub find
199             {
200 2     2 1 260 my($self, $target) = @_;
201              
202 2 50       7 die "Method find() takes a defined value as the parameter\n" if (! defined $target);
203              
204 2         7 my(@found);
205             my($meta);
206              
207 2         37 for my $node ($self -> tree -> traverse)
208             {
209 26 100       319 next if ($node -> is_root);
210              
211 24         139 $meta = $node -> meta;
212              
213 24 100       128 if (index($$meta{text}, $target) >= 0)
214             {
215 3         6 push @found, $$meta{uid};
216             }
217             }
218              
219 2         8 return [@found];
220              
221             } # End of find.
222              
223             # ------------------------------------------------
224              
225             sub get
226             {
227 7     7 1 1333 my($self, $wanted_uid) = @_;
228 7         128 my($max_uid) = $self -> uid;
229              
230 7 50 33     145 if (! defined($wanted_uid) || ($wanted_uid < 1) || ($wanted_uid > $self -> uid) )
      33        
231             {
232 0         0 die "Method get() takes a uid parameter in the range 1 .. $max_uid\n";
233             }
234              
235 7         46 my($meta);
236             my($text);
237 7         0 my($uid);
238              
239 7         92 for my $node ($self -> tree -> traverse)
240             {
241 49 100       697 next if ($node -> is_root);
242              
243 42         257 $meta = $node -> meta;
244 42         205 $uid = $$meta{uid};
245              
246 42 100       66 if ($wanted_uid == $uid)
247             {
248 7         13 $text = $$meta{text};
249             }
250             }
251              
252 7         19 return $text;
253              
254             } # End of get.
255              
256             # ------------------------------------------------
257              
258             sub _next_few_chars
259             {
260 14833     14833   23862 my($self, $stringref, $offset) = @_;
261 14833         29350 my($s) = substr($$stringref, $offset, 20);
262 14833         25988 $s =~ tr/\n/ /;
263 14833         29711 $s =~ s/^\s+//;
264 14833         25330 $s =~ s/\s+$//;
265              
266 14833         29830 return $s;
267              
268             } # End of _next_few_chars.
269              
270             # ------------------------------------------------
271              
272             sub parse
273             {
274 1525     1525 1 3171860 my($self, %opts) = @_;
275              
276             # Emulate parts of new(), which makes things a bit earier for the caller.
277              
278 1525 50       38236 $self -> re($opts{re}) if (defined $opts{re});
279 1525 50       41976 $self -> verbose($opts{verbose}) if (defined $opts{verbose});
280 1525         23900 $self -> warning_str('');
281              
282 1525         54109 $self -> recce
283             (
284             Marpa::R2::Scanless::R -> new
285             ({
286             exhaustion => 'event',
287             grammar => $self -> grammar,
288             })
289             );
290              
291             # Return 0 for success and 1 for failure.
292              
293 1525         540029 my($result) = 0;
294              
295 1525         2957 my($message);
296              
297             try
298             {
299 1525 100   1525   53844 if (defined (my $value = $self -> _process) )
300             {
301 874 50       19573008 $self -> print_cooked_tree if ($self -> verbose > 1);
302             }
303             else
304             {
305 10         581 $result = 1;
306              
307 10         20 my($message) = 'Error: Marpa parse failed. ';
308              
309 10 50       147 print $message, "\n" if ($self -> verbose);
310              
311 10         117 die $message;
312             }
313             }
314             catch
315             {
316 651     651   190842 die $_;
317 1525         10522 };
318              
319             # Return 0 for success and 1 for failure.
320              
321 874         27834 return $result;
322              
323             } # End of parse.
324              
325             # ------------------------------------------------
326              
327             sub prepend
328             {
329 2     2 1 504 my($self, %opts) = @_;
330              
331 2         5 for my $param (qw/text uid/)
332             {
333 4 50       11 die "Method append() takes a hash with these keys: text, uid\n" if (! defined($opts{$param}) );
334             }
335              
336 2         5 my($meta);
337             my($uid);
338              
339 2         39 for my $node ($self -> tree -> traverse)
340             {
341 14 100       203 next if ($node -> is_root);
342              
343 12         75 $meta = $node -> meta;
344 12         57 $uid = $$meta{uid};
345              
346 12 100       25 if ($opts{uid} == $uid)
347             {
348 2         5 $$meta{text} = "$opts{text}$$meta{text}";
349             }
350             }
351              
352             } # End of prepend.
353              
354             # ------------------------------------------------
355              
356             sub _process
357             {
358 1525     1525   3067 my($self) = @_;
359 1525         27752 my($raw_re) = $self -> re;
360 1525         30046 my($test_count) = $self -> test_count($self -> test_count + 1);
361              
362             # This line is 'print', not 'say'!
363              
364 1525 50       62858 print "Test count: $test_count. Parsing (in qr/.../ form): " if ($self -> verbose);
365              
366 1525         11500 my($string_re) = $self -> _string2re($raw_re);
367              
368 1525 50       4773 if ($string_re eq '')
369             {
370 0 0       0 print "\n" if ($self -> verbose);
371              
372 0         0 return undef;
373             }
374              
375 1525 50       26143 print "'$string_re'. \n" if ($self -> verbose);
376              
377 1525 50       25865 if ($self -> verbose > 1)
378             {
379 0         0 my($format) = "%-10s %-5s %-20s %-6s %-30s %s \n";
380              
381 0         0 print sprintf($format, ' Location', 'Width', 'Lexeme', 'Events', 'Names', 'Next few chars');
382              
383             }
384              
385 1525         10412 my($ref_re) = \"$string_re"; # Use " in comment for UltraEdit.
386 1525         3556 my($length) = length($string_re);
387              
388 1525         6968 my($child);
389             my($event_name);
390 1525         0 my($lexeme);
391 1525         0 my($pos);
392 1525         0 my($span, $start);
393              
394             # We use read()/lexeme_read()/resume() because we pause at each lexeme.
395              
396 1525         20517 for
397             (
398             $pos = $self -> recce -> read($ref_re);
399             ($pos < $length);
400             $pos = $self -> recce -> resume($pos)
401             )
402             {
403 14843         1621671 ($start, $span) = $self -> recce -> pause_span;
404 14843         165271 ($event_name, $span, $pos) = $self -> _validate_event($ref_re, $start, $span, $pos,);
405              
406             # If the input is exhausted, we exit immediately so we don't try to use
407             # the values of $start, $span or $pos. They are ignored upon exit.
408              
409 14843 100       29192 last if ($event_name eq "'exhausted"); # Yes, it has a leading quote.
410              
411 14833         182145 $lexeme = $self -> recce -> literal($start, $span);
412 14833         316321 $pos = $self -> recce -> lexeme_read($event_name);
413              
414 14833 50       774466 die "Marpa lexeme_read($event_name) rejected lexeme '$lexeme'\n" if (! defined $pos);
415              
416 14833         49941 $self -> _add_daughter($event_name, {text => $lexeme});
417             }
418              
419 1425         37934 my($message);
420              
421 1425 100       20564 if (my $status = $self -> recce -> ambiguous)
    100          
422             {
423 551         10611213 my($terminals) = $self -> recce -> terminals_expected;
424 551 100       20368 $terminals = ['(None)'] if ($#$terminals < 0);
425 551         2639 $message = "Marpa warning. Parse ambiguous. Status: $status. Terminals expected: " . join(', ', @$terminals);
426             }
427             elsif ($self -> recce -> exhausted)
428             {
429             # Special case. Sigh. I need to patch the BNF to do this. TODO.
430              
431 860 100 66     95026 if ( ($pos + 1 == $length) && (substr($string_re, $pos, 1) eq ')') )
432             {
433 8         35 $self -> _add_daughter('close_parenthesis', {text => ')'});
434             }
435              
436             # See https://metacpan.org/pod/distribution/Marpa-R2/pod/Exhaustion.pod#Exhaustion
437             # for why this code is exhaustion-loving.
438              
439 860 50       12047 $message = 'Marpa parse exhausted' if ($self -> verbose > 1);
440             }
441              
442 1425 100       9414 if ($message)
443             {
444 551         9483 $self -> warning_str($message);
445              
446 551 50       23470 print $message, "\n" if ($self -> verbose);
447             }
448              
449 1425 50       20977 $self -> print_raw_tree if ($self -> verbose);
450              
451             # Return a defined value for success and undef for failure.
452              
453 1425         22798 return $self -> recce -> value;
454              
455             } # End of _process.
456              
457             # ------------------------------------------------
458              
459             sub print_cooked_tree
460             {
461 0     0 1 0 my($self) = @_;
462 0         0 my($format) = "%-30s %3s %s \n";
463              
464 0         0 print sprintf($format, 'Name', 'Uid', 'Text');
465 0         0 print sprintf($format, '----', '---', '----');
466              
467 0         0 my($meta);
468              
469 0         0 for my $node ($self -> tree -> traverse)
470             {
471 0 0       0 next if ($node -> is_root);
472              
473 0         0 $meta = $node -> meta;
474              
475 0         0 print sprintf($format, $node -> value, $$meta{uid}, $$meta{text});
476             }
477              
478             } # End of print_cooked_tree.
479              
480             # ------------------------------------------------
481              
482             sub print_raw_tree
483             {
484 0     0 1 0 my($self) = @_;
485              
486 0         0 print map("$_\n", @{$self -> tree -> tree2string});
  0         0  
487              
488             } # End of print_raw_tree.
489              
490             # ------------------------------------------------
491              
492             sub reset
493             {
494 1528     1528 1 574402 my($self) = @_;
495              
496 1528         6859 $self -> tree(Tree -> new('Root') );
497 1528         146397 $self -> tree -> meta({text => 'Root', uid => 0});
498 1528         44977 $self -> current_node($self -> tree);
499 1528         115087 $self -> uid(0);
500 1528         56407 $self -> warning_str('');
501              
502             } # End of reset.
503              
504             # ------------------------------------------------
505              
506             sub search
507             {
508 2     2 1 20 my($self, $target) = @_;
509              
510 2 50       8 die "Method search() takes a defined value as the parameter\n" if (! defined $target);
511              
512 2         6 my($re) = $self -> _string2re($target);
513              
514 2         32 my(@found);
515             my($meta);
516              
517 2         40 for my $node ($self -> tree -> traverse)
518             {
519 38 100       481 next if ($node -> is_root);
520              
521 36         201 $meta = $node -> meta;
522              
523 36 100       233 if ($$meta{text} =~ $re)
524             {
525 4         9 push @found, $$meta{uid};
526             }
527             }
528              
529 2         8 return [@found];
530              
531             } # End of search.
532              
533             # ------------------------------------------------
534              
535             sub set
536             {
537 1     1 1 274 my($self, %opts) = @_;
538              
539 1         17 for my $param (qw/text uid/)
540             {
541 2 50       8 die "Method set() takes a hash with these keys: text, uid\n" if (! defined($opts{$param}) );
542             }
543              
544 1         2 my($meta);
545             my($uid);
546              
547 1         22 for my $node ($self -> tree -> traverse)
548             {
549 7 100       101 next if ($node -> is_root);
550              
551 6         38 $meta = $node -> meta;
552 6         30 $uid = $$meta{uid};
553              
554 6 100       15 if ($opts{uid} == $uid)
555             {
556 1         2 $$meta{text} = $opts{text};
557             }
558             }
559              
560             } # End of set.
561              
562             # ------------------------------------------------
563              
564             sub _string2re
565             {
566 1527     1527   3948 my($self, $raw_re) = @_;
567              
568 1527         2519 my($re);
569              
570             try
571             {
572 1527 100   1527   50741 $re = does($raw_re, 'Regexp') ? $raw_re : qr/$raw_re/;
573             }
574             catch
575             {
576 0     0   0 my($message) = "Error: Perl cannot convert $raw_re into qr/.../ form";
577              
578 0 0       0 print $message, "\n" if ($self -> verbose);
579              
580 0         0 die $message;
581 1527         9300 };
582              
583 1527         62317 return $re;
584              
585             } # End of _string2re.
586              
587             # ------------------------------------------------
588              
589             sub validate
590             {
591 0     0 0 0 my($self) = @_;
592 0         0 my($re) = $self -> as_string;
593              
594 0         0 my($result);
595              
596             try
597             {
598 0 0   0   0 $result = ('x' =~ $re) ? 0 : 0; # Use any test to force Perl to process the Regexp.
599             }
600             catch
601             {
602 0     0   0 $result = 1; # Failure.
603 0         0 };
604              
605             # Return 0 for success and 1 for failure.
606              
607 0         0 return $result;
608              
609             } # End of validate.
610              
611             # ------------------------------------------------
612              
613             sub _validate_event
614             {
615 14843     14843   26465 my($self, $stringref, $start, $span, $pos) = @_;
616 14843         20228 my(@event) = @{$self -> recce -> events};
  14843         187679  
617 14843         125406 my($event_count) = scalar @event;
618 14843         25146 my(@event_names) = sort map{$$_[0]} @event;
  15027         40743  
619 14843         24471 my($event_name) = $event_names[0]; # Default.
620              
621             # Handle some special cases.
622              
623 14843 100       28755 if ($event_count > 1)
624             {
625 184         523 my($event_list) = join(', ', @event_names);
626              
627 184 100       734 if ($event_list eq 'caret, string')
    100          
    50          
628             {
629 121         188 $event_count = 1;
630 121         212 $event_name = 'caret';
631 121         267 @event_names = $event_name;
632 121         212 $pos = $start;
633 121         235 $span = 1;
634             }
635             elsif ($event_list eq 'query, string')
636             {
637 23         44 $event_count = 1;
638 23         62 $event_name = 'query';
639 23         50 @event_names = $event_name;
640 23         34 $pos = $start;
641 23         35 $span = 1;
642             }
643             elsif ($event_list eq 'string, vertical_bar')
644             {
645 40         72 $event_count = 1;
646 40         58 $event_name = 'vertical_bar';
647 40         74 @event_names = $event_name;
648 40         58 $pos = $start;
649 40         59 $span = 1;
650             }
651             else
652             {
653             #$self -> print_cooked_tree;
654              
655 0         0 die "event_count: $event_count. " . $event_list;
656             }
657             }
658              
659             # If the input is exhausted, we return immediately so we don't try to use
660             # the values of $start, $span or $pos. They are ignored upon return.
661              
662 14843 100       26544 if ($event_name eq "'exhausted") # Yes, it has a leading quote.
663             {
664 10         31 return ($event_name, $span, $pos);
665             }
666              
667 14833         31949 my($lexeme) = substr($$stringref, $start, $span);
668 14833         194927 my($line, $column) = $self -> recce -> line_column($start);
669 14833         182210 my($literal) = $self -> _next_few_chars($stringref, $start + $span);
670 14833         40832 my($message) = "Location: ($line, $column). Lexeme: $lexeme. Events: $event_count. Names: ";
671 14833         27848 my($name_list) = join(', ', @event_names);
672 14833         21081 $message .= ". Next few chars: $literal";
673              
674 14833 50       206085 if ($self -> verbose > 1)
675             {
676 0         0 my($format) = "%4d, %4d %5d %-20s %6d %-30s %s \n";
677              
678 0         0 print sprintf($format, $line, $column, length($lexeme), $lexeme, $event_count, $name_list, $literal);
679              
680             }
681              
682 14833         97421 return ($event_name, $span, $pos);
683              
684             } # End of _validate_event.
685              
686             # ------------------------------------------------
687              
688             1;
689              
690             =pod
691              
692             =head1 NAME
693              
694             C - Parse a Perl regexp into a data structure of type L
695              
696             Warning: Development version. See L for details.
697              
698             =head1 Synopsis
699              
700             =head2 Sample Code
701              
702             This is scripts/synopsis.pl:
703              
704             #!/usr/bin/env perl
705              
706             use v5.10;
707             use strict;
708             use warnings;
709              
710             use Regexp::Parsertron;
711              
712             # ---------------------
713              
714             my($re) = qr/Perl|JavaScript/i;
715             my($parser) = Regexp::Parsertron -> new(verbose => 1);
716              
717             # Return 0 for success and 1 for failure.
718              
719             my($result) = $parser -> parse(re => $re);
720             my($node_id) = 5; # Obtained from displaying and inspecting the tree.
721              
722             print "Calling append(text => '|C++', uid => $node_id) \n";
723              
724             $parser -> append(text => '|C++', uid => $node_id);
725             $parser -> print_raw_tree;
726             $parser -> print_cooked_tree;
727              
728             my($as_string) = $parser -> as_string;
729              
730             print "Original: $re. Result: $result (0 is success) \n";
731             print "as_string(): $as_string \n";
732              
733             $result = $parser -> validate;
734              
735             print "validate(): Result: $result (0 is success) \n";
736              
737             # Return 0 for success and 1 for failure.
738              
739             $parser -> reset;
740             $parser -> verbose(0);
741              
742             $re = qr/Perl|JavaScript|(?:Flub|BCPL)/i;
743             $result = $parser -> parse(re => $re);
744              
745             print "\nAdd complexity to the regexp by parsing a new regexp \n";
746              
747             $parser -> print_raw_tree;
748              
749             And its output:
750              
751             Test count: 1. Parsing (in qr/.../ form): '(?^i:Perl|JavaScript)'.
752             Root. Attributes: {text => "Root", uid => "0"}
753             |--- open_parenthesis. Attributes: {text => "(", uid => "1"}
754             | |--- query_caret. Attributes: {text => "?^", uid => "2"}
755             | |--- flag_set. Attributes: {text => "i", uid => "3"}
756             | |--- colon. Attributes: {text => ":", uid => "4"}
757             | |--- string. Attributes: {text => "Perl|JavaScript", uid => "5"}
758             |--- close_parenthesis. Attributes: {text => ")", uid => "6"}
759              
760             Calling append(text => '|C++', uid => 5)
761             Root. Attributes: {text => "Root", uid => "0"}
762             |--- open_parenthesis. Attributes: {text => "(", uid => "1"}
763             | |--- query_caret. Attributes: {text => "?^", uid => "2"}
764             | |--- flag_set. Attributes: {text => "i", uid => "3"}
765             | |--- colon. Attributes: {text => ":", uid => "4"}
766             | |--- string. Attributes: {text => "Perl|JavaScript|C++", uid => "5"}
767             |--- close_parenthesis. Attributes: {text => ")", uid => "6"}
768              
769             Name Uid Text
770             ---- --- ----
771             open_parenthesis 1 (
772             query_caret 2 ?^
773             flag_set 3 i
774             colon 4 :
775             string 5 Perl|JavaScript|C++
776             close_parenthesis 6 )
777             Original: (?^i:Perl|JavaScript). Result: 0 (0 is success)
778             as_string(): (?^i:Perl|JavaScript|C++)
779             validate(): Result: 0 (0 is success)
780              
781             Adding complexity to the regexp by parsing a new regexp:
782             Root. Attributes: {text => "Root", uid => "0"}
783             |--- open_parenthesis. Attributes: {text => "(", uid => "1"}
784             | |--- query_caret. Attributes: {text => "?^", uid => "2"}
785             | |--- flag_set. Attributes: {text => "i", uid => "3"}
786             | |--- colon. Attributes: {text => ":", uid => "4"}
787             | |--- string. Attributes: {text => "Perl|JavaScript|", uid => "5"}
788             | |--- colon_prefix. Attributes: {text => "(?:", uid => "6"}
789             | | |--- string. Attributes: {text => "Flub|BCPL", uid => "7"}
790             | |--- close_parenthesis. Attributes: {text => ")", uid => "8"}
791             |--- close_parenthesis. Attributes: {text => ")", uid => "9"}
792              
793              
794             Note: The 1st tree is printed due to verbose => 1 in the call to L, while the 2nd
795             is due to the call to L. The columnar output is due to the call to
796             L.
797              
798             =head2 Tutorial
799              
800             =over 4
801              
802             =item o Start with a simple program and a simple regexp
803              
804             This code, scripts/tutorial.pl, is a cut-down version of scripts/synopsis.pl:
805              
806             #!/usr/bin/env perl
807              
808             use v5.10;
809             use strict;
810             use warnings;
811              
812             use Regexp::Parsertron;
813              
814             # ---------------------
815              
816             my($re) = qr/Perl|JavaScript/i;
817             my($parser) = Regexp::Parsertron -> new(verbose => 1);
818              
819             # Return 0 for success and 1 for failure.
820              
821             my($result) = $parser -> parse(re => $re);
822              
823             print "Original: $re. Result: $result. (0 is success) \n";
824              
825             Running it outputs:
826              
827             Test count: 1. Parsing (in qr/.../ form): '(?^i:Perl|JavaScript)'.
828             Root. Attributes: {text => "Root", uid => "0"}
829             |--- open_parenthesis. Attributes: {text => "(", uid => "1"}
830             | |--- query_caret. Attributes: {text => "?^", uid => "2"}
831             | |--- flag_set. Attributes: {text => "i", uid => "3"}
832             | |--- colon. Attributes: {text => ":", uid => "4"}
833             | |--- string. Attributes: {text => "Perl|JavaScript", uid => "5"}
834             |--- close_parenthesis. Attributes: {text => ")", uid => "6"}
835              
836             Original: (?^i:Perl|JavaScript). Result: 0. (0 is success)
837              
838             =item o Examine the tree and determine which nodes you wish to edit
839              
840             The nodes are uniquely identified by their uids.
841              
842             =item o Proceed as does scripts/synopsis.pl
843              
844             Add these lines to the end of the tutorial code, and re-run:
845              
846             my($node_id) = 5; # Obtained from displaying and inspecting the tree.
847              
848             $parser -> append(text => '|C++', uid => $node_id);
849             $parser -> print_raw_tree;
850              
851             The extra output, showing the change to node uid == 5, is:
852              
853             Root. Attributes: {text => "Root", uid => "0"}
854             |--- open_parenthesis. Attributes: {text => "(", uid => "1"}
855             | |--- query_caret. Attributes: {text => "?^", uid => "2"}
856             | |--- flag_set. Attributes: {text => "i", uid => "3"}
857             | |--- colon. Attributes: {text => ":", uid => "4"}
858             | |--- string. Attributes: {text => "Perl|JavaScript|C++", uid => "5"}
859             |--- close_parenthesis. Attributes: {text => ")", uid => "6"}
860              
861             =item o Test also with L and L
862              
863             See t/get.set.t for sample code.
864              
865             =item o Since everything works, make a cup of tea
866              
867             =back
868              
869             =head2 The Edit Methods
870              
871             The I simply means any one or more of these methods, which can all change the text of
872             a node:
873              
874             =over 4
875              
876             =item o L
877              
878             =item o L
879              
880             =item o L
881              
882             =back
883              
884             The edit methods are exercised in t/get.set.t, as well as scripts/synopsis.pl (above).
885              
886             =head1 Description
887              
888             Parses a regexp into a tree object managed by the L module, and provides various methods for
889             updating and retrieving that tree's contents.
890              
891             This module uses L and L.
892              
893             =head1 Distributions
894              
895             This module is available as a Unix-style distro (*.tgz).
896              
897             See L
898             for help on unpacking and installing distros.
899              
900             =head1 Installation
901              
902             Install C as you would any C module:
903              
904             Run:
905              
906             cpanm Regexp::Parsertron
907              
908             or run:
909              
910             sudo cpan Regexp::Parsertron
911              
912             or unpack the distro, and then use:
913              
914             perl Makefile.PL
915             make (or dmake or nmake)
916             make test
917             make install
918              
919             =head1 Constructor and Initialization
920              
921             C is called as C<< my($parser) = Regexp::Parsertron -> new(k1 => v1, k2 => v2, ...) >>.
922              
923             It returns a new object of type C.
924              
925             Key-value pairs accepted in the parameter list (see corresponding methods for details
926             [e.g. L]):
927              
928             =over 4
929              
930             =item o re => $regexp
931              
932             The C method of L is called to see what C is. If it's already of the
933             form C, then it's processed as is, but if it's not, then it's transformed using C.
934              
935             Warning: Currently, the input is expected to have been pre-processed by Perl via qr/$regexp/.
936              
937             Default: ''.
938              
939             =item o verbose => $integer
940              
941             Takes values 0, 1 or 2, which print more and more progress reports.
942              
943             Used for debugging.
944              
945             Default: 0 (print nothing).
946              
947             =back
948              
949             =head1 Methods
950              
951             =head2 append(%opts)
952              
953             Append some text to the text of a node.
954              
955             %opts is a hash with these (key => value) pairs:
956              
957             =over 4
958              
959             =item o text => $string
960              
961             The text to append.
962              
963             =item o uid => $uid
964              
965             The uid of the node to update.
966              
967             =back
968              
969             The code calls C if %opts does not have these 2 keys, or if either value is undef.
970              
971             See scripts/synopsis.pl for sample code.
972              
973             Note: Calling C never changes the uids of nodes, so repeated calling of C with
974             the same C will apply more and more updates to the same node.
975              
976             See also L, L and t/get.set.t.
977              
978             =head2 as_string()
979              
980             Returns the parsed regexp as a string. The string contains all edits applied with
981             L.
982              
983             =head2 find($target)
984              
985             Returns an arrayref of node uids whose text contains the given string.
986              
987             If the arrayref is empty, there were no matches.
988              
989             The Perl function C is used here to test for $target being a substring of the text
990             associated with each node.
991              
992             The code calls C if $target is undef.
993              
994             See t/get.set.t for sample usage of C.
995              
996             See L for a regexp-based test. See also L.
997              
998             =head2 get($uid)
999              
1000             Get the text of the node with the given $uid.
1001              
1002             The code calls C if $uid is undef, or outside the range 1 .. $self -> uid. The latter value
1003             is the highest uid so far assigned to any node.
1004              
1005             Returns undef if the given $uid is not found.
1006              
1007             See also L.
1008              
1009             =head2 new([%opts])
1010              
1011             Here, '[]' indicate an optional parameter.
1012              
1013             See L for details on the parameters accepted by L.
1014              
1015             =head2 parse([%opts])
1016              
1017             Here, '[]' indicate an optional parameter.
1018              
1019             Parses the regexp supplied with the parameter C in the call to L or in the call to
1020             L, or in the call to C<< parse(re => $regexp) >> itself. The latter takes precedence.
1021              
1022             The hash C<%opts> takes the same (key => value) pairs as L does.
1023              
1024             See L for details.
1025              
1026             =head2 prepend(%opts)
1027              
1028             Prepend some text to the text of a node.
1029              
1030             %opts is a hash with these (key => value) pairs:
1031              
1032             =over 4
1033              
1034             =item o text => $string
1035              
1036             The text to prepend.
1037              
1038             =item o uid => $uid
1039              
1040             The uid of the node to update.
1041              
1042             =back
1043              
1044             The code calls C if %opts does not have these 2 keys, or if either value is undef.
1045              
1046             Note: Calling C never changes the uids of nodes, so repeated calling of C with
1047             the same C will apply more and more updates to the same node.
1048              
1049             See also L, L, and t/get.set.t.
1050              
1051             =head2 print_cooked_tree()
1052              
1053             Prints, in a pretty format, the tree built from parsing.
1054              
1055             See the for sample output.
1056              
1057             See also L.
1058              
1059             =head2 print_raw_tree()
1060              
1061             Prints, in a simple format, the tree built from parsing.
1062              
1063             See the for sample output.
1064              
1065             See also L.
1066              
1067             =head2 re([$regexp])
1068              
1069             Here, '[]' indicate an optional parameter.
1070              
1071             Gets or sets the regexp to be processed.
1072              
1073             Note: C is a parameter to L.
1074              
1075             =head2 reset()
1076              
1077             Resets various internal things, except test_count.
1078              
1079             Used basically for debugging.
1080              
1081             =head2 search($target)
1082              
1083             Returns an arrayref of node uids whose text contains the given string.
1084              
1085             If the arrayref is empty, there were no matches.
1086              
1087             $target is converted to a regexp if a simple string is passed in.
1088              
1089             The code calls C if $target is undef.
1090              
1091             See t/search.t for sample usage of C.
1092              
1093             See L for a non-regexp search. See also L.
1094              
1095             =head2 set(%opts)
1096              
1097             Set the text of a node to $opt{text}.
1098              
1099             %opts is a hash with these (key => value) pairs:
1100              
1101             =over 4
1102              
1103             =item o text => $string
1104              
1105             The text to use to overwrite the text of the node.
1106              
1107             =item o uid => $uid
1108              
1109             The uid of the node to update.
1110              
1111             =back
1112              
1113             The code calls C if %opts does not have these 2 keys, or if either value is undef.
1114              
1115             See also L and L.
1116              
1117             =head2 tree()
1118              
1119             Returns an object of type L. Ignore the root node.
1120              
1121             Each node's C method returns a hashref of information about the node. See the
1122             L for details.
1123              
1124             See also the source code for L and L for ideas on how to
1125             use this object.
1126              
1127             =head2 uid()
1128              
1129             Returns the last-used uid.
1130              
1131             Each node in the tree is given a uid, which allows methods like L to work.
1132              
1133             =head2 verbose([$integer])
1134              
1135             Here, '[]' indicate an optional parameter.
1136              
1137             Gets or sets the verbosity level, within the range 0 .. 2. Higher numbers print more progress
1138             reports.
1139              
1140             Used basically for debugging.
1141              
1142             Note: C is a parameter to L.
1143              
1144             =head2 warning_str()
1145              
1146             Returns the last Marpa warning.
1147              
1148             In short, Marpa will always report 'Marpa parse exhausted' in warning_str() if the parse is not
1149             ambiguous, but do not worry - I.
1150              
1151             See L and
1152             L.
1153              
1154             =head1 FAQ
1155              
1156             =head2 Can I add a subtree to the tree?
1157              
1158             Not yet.
1159              
1160             There is a private method, C<_add_daughter()>, which I could make public, if I felt it was safe to
1161             do so.
1162              
1163             =head2 Why does the BNF not accept an empty regexp?
1164              
1165             Simple answer: Changing the BNF to handle this creates a massive problem elsewhere in the BNF.
1166              
1167             Complex answer:
1168              
1169             The BNF contains this countable rule to allow patterns to be juxtaposed without '|', say, to
1170             separate them:
1171              
1172             global_sequence ::= pattern_type+
1173              
1174             And in turn (further toward the leaves of the tree of BNF), I then use:
1175              
1176             pattern_sequence ::= pattern_set+
1177              
1178             To allow an empty regexp would mean changing this rule to:
1179              
1180             pattern_sequence ::= pattern_set*
1181              
1182             But that makes this rule nullable, and Marpa rejects the C rule on the grounds that
1183             a countable rule is not allowed to be nullable. ATM I cannot see a way of
1184             rewriting the rules to avoid this problem. But I'm hopeful such a rewrite is possible.
1185              
1186             =head2 Why does the code sometimes not store '|' - as in qr/(Perl|JavaScript/) - in its own node?
1187              
1188             It could be done by, for example, splitting such a string into three nodes, 'Perl', '|',
1189             'Javascript'. But does that offer any benefit?
1190              
1191             It makes processing by the user more complex because then if they wish to edit the list of
1192             alternatives, they might have to edit two or three nodes instead of one. Here, editing means perhaps
1193             replacing any existing string with the empty string.
1194              
1195             Further, to extend the list of alternatives, the user will be confused by not being sure if they
1196             should change 'Javascript' to 'Javascript|C' or if they have to add two nodes, containing '|' and
1197             'C'. And ATM adding nodes is contraindicated!
1198              
1199             Despite this, when the input stream triggers two events, C and C,
1200             simultaneously because the '|' is at the start of a string, special code in the private method
1201             C<_validate_event()> does put '|' in its own node. IOW the BNF does not do the work, which is really
1202             what I would prefer.
1203              
1204             =head2 Does this module ever use \Q...\E to quote regexp metacharacters?
1205              
1206             No.
1207              
1208             =head2 What is the format of the nodes in the tree built by this module?
1209              
1210             Each node's C is the name of the Marpa-style event which was triggered by detection of
1211             some C within the regexp.
1212              
1213             Each node's C method returns a hashref with these (key => value) pairs:
1214              
1215             =over 4
1216              
1217             =item o text => $string
1218              
1219             This is the text within the regexp which triggered the event just mentioned.
1220              
1221             =item o uid => $integer
1222              
1223             This is the unique id of the 'current' node.
1224              
1225             This C is often used by you to specify which node to work on.
1226              
1227             See t/get.set.t and t/simple.t for sample code.
1228              
1229             The code never changes the uid of a node.
1230              
1231             =back
1232              
1233             See also the source code for L and L for ideas on how to
1234             use the tree.
1235              
1236             See the L for sample code and a report after parsing a tiny regexp.
1237              
1238             =head2 Does the root node in the tree ever hold useful information?
1239              
1240             No. Always ignore it.
1241              
1242             =head2 Why does the BNF never use the lexeme adverb C?
1243              
1244             Because with Marpa::R2 the priority is only used when lexemes are the same length.
1245              
1246             L.
1247              
1248             =head2 Does this module interpret regexps in any way?
1249              
1250             No. You have to run your own Perl code to do that. This module just parses them into a data
1251             structure.
1252              
1253             And that really means this module does not match the regexp against anything. If I appear to do that
1254             while debugging new code, you can't rely on that appearing in production versions of the module.
1255              
1256             =head2 Does this module rewrite regexps?
1257              
1258             No, unless you call one of L.
1259              
1260             =head2 Does this module handle both Perl 5 and Perl 6?
1261              
1262             No. It will only handle Perl 5 syntax.
1263              
1264             =head2 Does this module handle regexps for various versions of Perl5?
1265              
1266             Not yet. Version-dependent regexp syntax will be supported for recent versions of Perl. This is
1267             done by having tokens within the BNF which are replaced at start-up time with version-dependent
1268             details.
1269              
1270             There are no such tokens at the moment.
1271              
1272             All debugging is done assuming the regexp syntax as documented online. See L for the
1273             urls in question.
1274              
1275             =head2 So which version of Perl is supported?
1276              
1277             The code is expected to work for Perls back to V 5.14.0, which is when stringification of regexps
1278             changed. See L below for more.
1279              
1280             I'm (2018-01-14) using Perl V 5.20.2 and making the BNF match the Perl regexp docs listed in
1281             L below.
1282              
1283             The program t/perl-5.21.11.t reads the file 'xt/author/re_tests' which I copied from the source code
1284             of Perl V 5.21.11. This test is the one which currently provides 858 passing tests out of the 1027
1285             tests which pass for me using prove -lv t.
1286              
1287             =head2 Could Perl and this module generate different parses of the same regexp?
1288              
1289             Absolutely! There is no escape from this fact simply because the code used in each program bears no
1290             relationship to the code in the other one.
1291              
1292             The real question is: How do we make the code in each program accept and reject exactly the same
1293             regexps as the code in the other program. I think trial-and-error is all we have available to us for
1294             dealing with this issue.
1295              
1296             =head2 After calling parse(), warning_str() contains the string '... Parse ambiguous ...'
1297              
1298             This is almost certainly a error with the BNF, although of course it may be an error will an
1299             exceptionally-badly formed regexp.
1300              
1301             Report it via L, and please
1302             include the regexp in the report. Thanx!
1303              
1304             =head2 Is this a (Marpa) exhaustion-hating or exhaustion-loving app?
1305              
1306             Exhaustion-loving.
1307              
1308             See L
1309              
1310             =head2 Will this code be modified to run under Marpa::R3 when the latter is stable?
1311              
1312             Yes.
1313              
1314             =head2 What is the purpose of this module?
1315              
1316             =over 4
1317              
1318             =item o To provide a stand-alone parser for regexps
1319              
1320             =item o To help me learn more about regexps
1321              
1322             =item o To become, I hope, a replacement for the horrendously complex L
1323              
1324             =back
1325              
1326             =head2 Who crafted the BNF?
1327              
1328             I did.
1329              
1330             =head1 Scripts
1331              
1332             This diagram indicates the flow of logic from script to script:
1333              
1334             xt/author/re_tests
1335             |
1336             V
1337             xt/author/generate.tests.pl
1338             |
1339             V
1340             xt/authors/perl-5.21.11.tests
1341             |
1342             V
1343             perl -Ilib t/perl-5.21.11.t > xt/author/perl-5.21.11.log 2>&1
1344              
1345             If xt/author/perl-5.21.11.log only contains lines starting with 'ok', then all Perl and Marpa
1346             errors have been hidden, so t/perl-5.21.11.t is ready to live in t/. Before that time it lives in
1347             xt/author/.
1348              
1349             =head1 TODO
1350              
1351             =over 4
1352              
1353             =item o How to best define 'code' in the BNF.
1354              
1355             =item o I could traverse the tree and store a pointer to each node in an array
1356              
1357             This would mean fast access to nodes in random order. But is there any point? Yes, it would speed up
1358             various methods. Specifically, any module which calls C on the tree object would
1359             benefit.
1360              
1361             =item o Allow users to add nodes and hence subtrees to the tree
1362              
1363             =back
1364              
1365             =head1 References
1366              
1367             L. PCRE - Perl Compatible Regular Expressions.
1368              
1369             L. This is the definitive document.
1370              
1371             L.
1372              
1373             L. Samples with commentary.
1374              
1375             L
1376              
1377             L
1378              
1379             L
1380              
1381             L. This is when stringification
1382             changed to return (?^...) rather than (?-xism...).
1383              
1384             L
1385              
1386             L. Regular Expression
1387             Inconsistencies With Unicode.
1388              
1389             L
1390              
1391             L
1392              
1393             L
1394              
1395             =head1 See Also
1396              
1397             L
1398              
1399             L
1400              
1401             L
1402              
1403             L
1404              
1405             L
1406              
1407             L
1408              
1409             L
1410              
1411             L
1412              
1413             L
1414              
1415             L. This is vaguely a version of L.
1416              
1417             L
1418              
1419             L
1420              
1421             And many others...
1422              
1423             =head1 Machine-Readable Change Log
1424              
1425             The file Changes was converted into Changelog.ini by L.
1426              
1427             =head1 Version Numbers
1428              
1429             Version numbers < 1.00 represent development versions. From 1.00 up, they are production versions.
1430              
1431             =head1 CPAN Tester Results
1432              
1433             L
1434              
1435             =head1 Repository
1436              
1437             L
1438              
1439             =head1 Support
1440              
1441             Email the author, or log a bug on RT:
1442              
1443             L.
1444              
1445             =head1 Author
1446              
1447             L was written by Ron Savage Iron@savage.net.auE> in 2011.
1448              
1449             Marpa's homepage: L.
1450              
1451             L.
1452              
1453             =head1 Copyright
1454              
1455             Australian copyright (c) 2016, Ron Savage.
1456              
1457             All Programs of mine are 'OSI Certified Open Source Software';
1458             you can redistribute them and/or modify them under the terms of
1459             The Artistic License 2.0, a copy of which is available at:
1460             http://opensource.org/licenses/alphabetical.
1461              
1462             =cut
1463              
1464             __DATA__