File Coverage

blib/lib/Text/Balanced/Marpa.pm
Criterion Covered Total %
statement 19 21 90.4
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 26 28 92.8


line stmt bran cond sub pod time code
1             package Text::Balanced::Marpa;
2              
3 16     16   337857 use strict;
  16         36  
  16         703  
4 16     16   8764 use utf8;
  16         127  
  16         152  
5 16     16   528 use warnings;
  16         29  
  16         594  
6 16     16   72 use warnings qw(FATAL utf8); # Fatalize encoding glitches.
  16         23  
  16         732  
7 16     16   6779 use open qw(:std :utf8); # Undeclared streams in UTF-8.
  16         15861  
  16         87  
8              
9 16         187 use Const::Exporter constants =>
10             [
11             nothing_is_fatal => 0, # The default.
12             print_errors => 1,
13             print_warnings => 2,
14             print_debugs => 4,
15             overlap_is_fatal => 8,
16             nesting_is_fatal => 16,
17             ambiguity_is_fatal => 32,
18             exhaustion_is_fatal => 64,
19 16     16   12315 ];
  16         273228  
20              
21 16     16   28246 use Marpa::R2;
  0            
  0            
22              
23             use Moo;
24              
25             use Tree;
26              
27             use Types::Standard qw/Any ArrayRef HashRef Int ScalarRef Str/;
28              
29             use Try::Tiny;
30              
31             has bnf =>
32             (
33             default => sub{return ''},
34             is => 'rw',
35             isa => Any,
36             required => 0,
37             );
38              
39             has close =>
40             (
41             default => sub{return []},
42             is => 'rw',
43             isa => ArrayRef,
44             required => 0,
45             );
46              
47             has delimiter_action =>
48             (
49             default => sub{return {} },
50             is => 'rw',
51             isa => HashRef,
52             required => 0,
53             );
54              
55             has delimiter_stack =>
56             (
57             default => sub{return []},
58             is => 'rw',
59             isa => ArrayRef,
60             required => 0,
61             );
62              
63             has delimiter_frequency =>
64             (
65             default => sub{return {} },
66             is => 'rw',
67             isa => HashRef,
68             required => 0,
69             );
70              
71             has error_message =>
72             (
73             default => sub{return ''},
74             is => 'rw',
75             isa => Str,
76             required => 0,
77             );
78              
79             has error_number =>
80             (
81             default => sub{return 0},
82             is => 'rw',
83             isa => Int,
84             required => 0,
85             );
86              
87             has escape_char =>
88             (
89             default => sub{return '\\'},
90             is => 'rw',
91             isa => Str,
92             required => 0,
93             );
94              
95             has grammar =>
96             (
97             default => sub {return ''},
98             is => 'rw',
99             isa => Any,
100             required => 0,
101             );
102              
103             has known_events =>
104             (
105             default => sub{return {} },
106             is => 'rw',
107             isa => HashRef,
108             required => 0,
109             );
110              
111             has length =>
112             (
113             default => sub{return 0},
114             is => 'rw',
115             isa => Int,
116             required => 0,
117             );
118              
119             has matching_delimiter =>
120             (
121             default => sub{return {} },
122             is => 'rw',
123             isa => HashRef,
124             required => 0,
125             );
126              
127             has next_few_limit =>
128             (
129             default => sub{return 20},
130             is => 'rw',
131             isa => Int,
132             required => 0,
133             );
134              
135             has node_stack =>
136             (
137             default => sub{return []},
138             is => 'rw',
139             isa => ArrayRef,
140             required => 0,
141             );
142              
143             has open =>
144             (
145             default => sub{return []},
146             is => 'rw',
147             isa => ArrayRef,
148             required => 0,
149             );
150              
151             has options =>
152             (
153             default => sub{return 0},
154             is => 'rw',
155             isa => Int,
156             required => 0,
157             );
158              
159             has pos =>
160             (
161             default => sub{return 0},
162             is => 'rw',
163             isa => Int,
164             required => 0,
165             );
166              
167             has recce =>
168             (
169             default => sub{return ''},
170             is => 'rw',
171             isa => Any,
172             required => 0,
173             );
174              
175             has tree =>
176             (
177             default => sub{return ''},
178             is => 'rw',
179             isa => Any,
180             required => 0,
181             );
182              
183             has text =>
184             (
185             default => sub{return \''}, # Use ' in comment for UltraEdit.
186             is => 'rw',
187             isa => ScalarRef[Str],
188             required => 0,
189             );
190              
191             has uid =>
192             (
193             default => sub{return 0},
194             is => 'rw',
195             isa => Int,
196             required => 0,
197             );
198              
199             our $VERSION = '1.06';
200              
201             # ------------------------------------------------
202              
203             sub BUILD
204             {
205             my($self) = @_;
206              
207             # Policy: Event names are always the same as the name of the corresponding lexeme.
208             #
209             # Note: Tokens of the form '_xxx_' are replaced just below, with values returned
210             # by the call to validate_open_close().
211              
212             my($bnf) = <<'END_OF_GRAMMAR';
213              
214             :default ::= action => [values]
215              
216             lexeme default = latm => 1
217              
218             :start ::= input_text
219              
220             input_text ::= input_string*
221              
222             input_string ::= quoted_text
223             | unquoted_text
224              
225             quoted_text ::= open_delim input_text close_delim
226              
227             unquoted_text ::= text
228              
229             # Lexemes in alphabetical order.
230              
231             delimiter_char ~ [_delimiter_]
232              
233             :lexeme ~ close_delim pause => before event => close_delim
234             _close_
235              
236             escaped_char ~ '_escape_char_' delimiter_char # Use ' in comment for UltraEdit.
237              
238             # Warning: Do not add '+' to this set, even though it speeds up things.
239             # The problem is that the set then gobbles up any '\', so the following
240             # character is no longer recognized as being escaped.
241             # Trapping the exception then generated would be possible.
242              
243             non_quote_char ~ [^_delimiter_] # Use " in comment for UltraEdit.
244              
245             :lexeme ~ open_delim pause => before event => open_delim
246             _open_
247              
248             :lexeme ~ text pause => before event => text
249             text ~ escaped_char
250             | non_quote_char
251             END_OF_GRAMMAR
252              
253             my($hashref) = $self -> _validate_open_close;
254             $bnf =~ s/_open_/$$hashref{_open_}/;
255             $bnf =~ s/_close_/$$hashref{_close_}/;
256             $bnf =~ s/_delimiter_/$$hashref{_delimiter_}/g;
257             my($escape_char) = $self -> escape_char;
258              
259             if ($escape_char eq "'")
260             {
261             my($message) = 'Single-quote is forbidden as an escape character';
262              
263             $self -> error_message($message);
264             $self -> error_number(7);
265              
266             # This 'die' is not inside try{}catch{}, so we add the prefix 'Error: '.
267              
268             die "Error: $message\n";
269             }
270              
271             $bnf =~ s/_escape_char_/$escape_char/g;
272              
273             $self -> bnf($bnf);
274             $self -> grammar
275             (
276             Marpa::R2::Scanless::G -> new
277             ({
278             source => \$self -> bnf
279             })
280             );
281              
282             # This hash does not contain the key "'exhausted" because the exhaustion
283             # event is everywhere handled explicitly. Yes, it has a leading quote.
284              
285             my(%event);
286              
287             for my $line (split(/\n/, $self -> bnf) )
288             {
289             $event{$1} = 1 if ($line =~ /event\s+=>\s+(\w+)/);
290             }
291              
292             $self -> known_events(\%event);
293              
294             } # End of BUILD.
295              
296             # ------------------------------------------------
297              
298             sub _add_daughter
299             {
300             my($self, $name, $text) = @_;
301             my($attributes) = {text => $text, uid => $self -> uid($self -> uid + 1)};
302             my($stack) = $self -> node_stack;
303             my($node) = Tree -> new($name);
304              
305             $node -> meta($attributes);
306              
307             $$stack[$#$stack] -> add_child({}, $node);
308              
309             } # End of _add_daughter.
310              
311             # ------------------------------------------------
312              
313             sub next_few_chars
314             {
315             my($self, $stringref, $offset) = @_;
316             my($s) = substr($$stringref, $offset, $self -> next_few_limit);
317             $s =~ tr/\n/ /;
318             $s =~ s/^\s+//;
319             $s =~ s/\s+$//;
320              
321             return $s;
322              
323             } # End of next_few_chars.
324              
325             # ------------------------------------------------
326              
327             sub parse
328             {
329             my($self, %opts) = @_;
330              
331             # Emulate parts of new(), which makes things a bit earier for the caller.
332              
333             $self -> options($opts{options}) if (defined $opts{options});
334             $self -> text($opts{text}) if (defined $opts{text});
335             $self -> pos($opts{pos}) if (defined $opts{pos});
336             $self -> length($opts{length}) if (defined $opts{length});
337              
338             $self -> recce
339             (
340             Marpa::R2::Scanless::R -> new
341             ({
342             exhaustion => 'event',
343             grammar => $self -> grammar,
344             ranking_method => 'high_rule_only',
345             })
346             );
347              
348             # Since $self -> node_stack has not been initialized yet,
349             # we can't call _add_daughter() until after this statement.
350              
351             $self -> uid(0);
352             $self -> tree(Tree -> new('root') );
353             $self -> tree -> meta({text => '', uid => $self -> uid});
354             $self -> node_stack([$self -> tree -> root]);
355              
356             # Return 0 for success and 1 for failure.
357              
358             my($result) = 0;
359              
360             my($message);
361              
362             try
363             {
364             if (defined (my $value = $self -> _process) )
365             {
366             }
367             else
368             {
369             $result = 1;
370              
371             print "Error: Parse failed\n" if ($self -> options & print_errors);
372             }
373             }
374             catch
375             {
376             $result = 1;
377              
378             print "Error: Parse failed. ${_}" if ($self -> options & print_errors);
379             };
380              
381             # Return 0 for success and 1 for failure.
382              
383             return $result;
384              
385             } # End of parse.
386              
387             # ------------------------------------------------
388              
389             sub _pop_node_stack
390             {
391             my($self) = @_;
392             my($stack) = $self -> node_stack;
393              
394             pop @$stack;
395              
396             $self -> node_stack($stack);
397              
398             } # End of _pop_node_stack.
399              
400             # ------------------------------------------------
401              
402             sub _process
403             {
404             my($self) = @_;
405             my($stringref) = $self -> text || \''; # Allow for undef. Use ' in comment for UltraEdit.
406             my($pos) = $self -> pos;
407             my($first_pos) = $pos;
408             my($total_length) = length($$stringref);
409             my($length) = $self -> length || $total_length;
410             my($text) = '';
411             my($format) = "%-20s %5s %5s %5s %-20s %-20s\n";
412             my($last_event) = '';
413             my($matching_delimiter) = $self -> matching_delimiter;
414              
415             if ($self -> options & print_debugs)
416             {
417             print "Length of input: $length. Input |$$stringref|\n";
418             print sprintf($format, 'Event', 'Start', 'Span', 'Pos', 'Lexeme', 'Comment');
419             }
420              
421             my($delimiter_frequency, $delimiter_stack);
422             my($event_name);
423             my($lexeme);
424             my($message);
425             my($original_lexeme);
426             my($span, $start);
427             my($tos);
428              
429             # We use read()/lexeme_read()/resume() because we pause at each lexeme.
430             # Also, in read(), we use $pos and $length to avoid reading Ruby Slippers tokens (if any).
431             # For the latter, see scripts/match.parentheses.02.pl in MarpaX::Demo::SampleScripts.
432              
433             for
434             (
435             $pos = $self -> recce -> read($stringref, $pos, $length);
436             ($pos < $total_length) && ( ($pos - $first_pos) <= $length);
437             $pos = $self -> recce -> resume($pos)
438             )
439             {
440             $delimiter_frequency = $self -> delimiter_frequency;
441             $delimiter_stack = $self -> delimiter_stack;
442             ($start, $span) = $self -> recce -> pause_span;
443             ($event_name, $span, $pos) = $self -> _validate_event($stringref, $start, $span, $pos, $delimiter_frequency);
444              
445             # If the input is exhausted, we exit immediately so we don't try to use
446             # the values of $start, $span or $pos. They are ignored upon exit.
447              
448             last if ($event_name eq "'exhausted"); # Yes, it has a leading quote.
449              
450             $lexeme = $self -> recce -> literal($start, $span);
451             $original_lexeme = $lexeme;
452             $pos = $self -> recce -> lexeme_read($event_name);
453              
454             die "lexeme_read($event_name) rejected lexeme |$lexeme|\n" if (! defined $pos);
455              
456             print sprintf($format, $event_name, $start, $span, $pos, $lexeme, '-') if ($self -> options & print_debugs);
457              
458             if ($event_name ne 'text')
459             {
460             $self -> _save_text($text);
461              
462             $text = '';
463             }
464              
465             if ($event_name eq 'close_delim')
466             {
467             $$delimiter_frequency{$lexeme}--;
468              
469             $self -> delimiter_frequency($delimiter_frequency);
470              
471             $tos = pop @$delimiter_stack;
472              
473             $self -> delimiter_stack($delimiter_stack);
474              
475             # If the top of the delimiter stack is not the lexeme corresponding to the
476             # opening delimiter of the current closing delimiter, then there's an error.
477              
478             if ($$matching_delimiter{$$tos{lexeme} } ne $lexeme)
479             {
480             $message = "Last open delimiter: $$tos{lexeme}. Unexpected closing delimiter: $lexeme";
481              
482             $self -> error_message($message);
483             $self -> error_number(1);
484              
485             # This 'die' is inside try{}catch{}, which adds the prefix 'Error: '.
486              
487             die "$message\n" if ($self -> options & overlap_is_fatal);
488              
489             # If we did not die, then it's a warning message.
490              
491             $self -> error_number(-1);
492              
493             print "Warning: $message\n" if ($self -> options & print_warnings);
494             }
495              
496             $self -> _pop_node_stack;
497             $self -> _add_daughter('close', $lexeme);
498             }
499             elsif ($event_name eq 'open_delim')
500             {
501             $$delimiter_frequency{$$matching_delimiter{$lexeme} }++;
502              
503             # If the top of the delimiter stack reaches 2, then there's an error.
504             # Unlink mismatched delimiters (just above), this is never gets a warning.
505              
506             if ($$delimiter_frequency{$$matching_delimiter{$lexeme} } > 1)
507             {
508             $message = "Opened delimiter $lexeme again before closing previous one";
509              
510             $self -> error_message($message);
511             $self -> error_number(2);
512              
513             # This 'die' is inside try{}catch{}, which adds the prefix 'Error: '.
514              
515             die "$message\n" if ($self -> options & nesting_is_fatal);
516              
517             # If we did not die, then it's a warning message.
518              
519             $self -> error_number(-2);
520              
521             print "Warning: $message\n" if ($self -> options & print_warnings);
522             }
523              
524             $self -> delimiter_frequency($delimiter_frequency);
525              
526             push @$delimiter_stack,
527             {
528             count => $$delimiter_frequency{$$matching_delimiter{$lexeme} },
529             lexeme => $lexeme,
530             };
531              
532             $self -> delimiter_stack($delimiter_stack);
533              
534             $self -> _add_daughter('open', $lexeme);
535             $self -> _push_node_stack;
536             }
537             elsif ($event_name eq 'text')
538             {
539             $text .= $lexeme;
540             }
541              
542             $last_event = $event_name;
543             }
544              
545             # Mop up any left-over chars.
546              
547             $self -> _save_text($text);
548              
549             if ($self -> recce -> exhausted)
550             {
551             $message = 'Parse exhausted';
552              
553             $self -> error_message($message);
554             $self -> error_number(6);
555              
556             if ($self -> options & exhaustion_is_fatal)
557             {
558             # This 'die' is inside try{}catch{}, which adds the prefix 'Error: '.
559              
560             die "$message\n";
561             }
562             else
563             {
564             $self -> error_number(-6);
565              
566             print "Warning: $message\n" if ($self -> options & print_warnings);
567             }
568             }
569             elsif (my $status = $self -> recce -> ambiguous)
570             {
571             my($terminals) = $self -> recce -> terminals_expected;
572             $terminals = ['(None)'] if ($#$terminals < 0);
573             $message = "Ambiguous parse. Status: $status. Terminals expected: " . join(', ', @$terminals);
574              
575             $self -> error_message($message);
576             $self -> error_number(3);
577              
578             if ($self -> options & ambiguity_is_fatal)
579             {
580             # This 'die' is inside try{}catch{}, which adds the prefix 'Error: '.
581              
582             die "$message\n";
583             }
584             elsif ($self -> options & print_warnings)
585             {
586             $self -> error_number(-3);
587              
588             print "Warning: $message\n";
589             }
590             }
591              
592             # Return a defined value for success and undef for failure.
593              
594             return $self -> recce -> value;
595              
596             } # End of _process.
597              
598             # ------------------------------------------------
599              
600             sub _push_node_stack
601             {
602             my($self) = @_;
603             my($stack) = $self -> node_stack;
604             my(@daughters) = $$stack[$#$stack] -> children;
605              
606             push @$stack, $daughters[$#daughters];
607              
608             } # End of _push_node_stack.
609              
610             # ------------------------------------------------
611              
612             sub _save_text
613             {
614             my($self, $text) = @_;
615              
616             $self -> _add_daughter('text', $text) if (length($text) );
617              
618             return '';
619              
620             } # End of _save_text.
621              
622             # ------------------------------------------------
623              
624             sub _validate_event
625             {
626             my($self, $stringref, $start, $span, $pos, $delimiter_frequency) = @_;
627             my(@event) = @{$self -> recce -> events};
628             my($event_count) = scalar @event;
629             my(@event_name) = sort map{$$_[0]} @event;
630             my($event_name) = $event_name[0]; # Default.
631              
632             # If the input is exhausted, we return immediately so we don't try to use
633             # the values of $start, $span or $pos. They are ignored upon return.
634              
635             if ($event_name eq "'exhausted") # Yes, it has a leading quote.
636             {
637             return ($event_name, $span, $pos);
638             }
639              
640             my($lexeme) = substr($$stringref, $start, $span);
641             my($line, $column) = $self -> recce -> line_column($start);
642             my($literal) = $self -> next_few_chars($stringref, $start + $span);
643             my($message) = "Location: ($line, $column). Lexeme: |$lexeme|. Next few chars: |$literal|";
644             $message = "$message. Events: $event_count. Names: ";
645              
646             print $message, join(', ', @event_name), "\n" if ($self -> options & print_debugs);
647              
648             my(%event_name);
649              
650             @event_name{@event_name} = (1) x @event_name;
651              
652             for (@event_name)
653             {
654             if (! ${$self -> known_events}{$_})
655             {
656             $message = "Unexpected event name '$_'";
657              
658             $self -> error_message($message);
659             $self -> error_number(10);
660              
661             # This 'die' is inside try{}catch{}, which adds the prefix 'Error: '.
662              
663             die "$message\n";
664             }
665             }
666              
667             if ($event_count > 1)
668             {
669             my($delimiter_action) = $self -> delimiter_action;
670              
671             if (defined $event_name{string})
672             {
673             $event_name = $$delimiter_action{$lexeme};
674              
675             print "Disambiguated lexeme |$lexeme| as '$event_name'\n" if ($self -> options & print_debugs);
676             }
677             elsif ( ($lexeme =~ /["']/) && (join(', ', @event_name) eq 'close_delim, open_delim') ) # ".
678             {
679             # At the time _validate_event() is called, the quote count has not yet been bumped.
680             # If this is the 1st quote, then it's an open_delim.
681             # If this is the 2nd quote, them it's a close delim.
682              
683             if ($$delimiter_frequency{$lexeme} % 2 == 0)
684             {
685             $event_name = 'open_delim';
686              
687             print "Disambiguated lexeme |$lexeme| as '$event_name'\n" if ($self -> options & print_debugs);
688             }
689             else
690             {
691             $event_name = 'close_delim';
692              
693             print "Disambiguated lexeme |$lexeme| as '$event_name'\n" if ($self -> options & print_debugs);
694             }
695             }
696             else
697             {
698             $message = join(', ', @event_name);
699             $message = "The code does not handle these events simultaneously: $message";
700              
701             $self -> error_message($message);
702             $self -> error_number(11);
703              
704             # This 'die' is inside try{}catch{}, which adds the prefix 'Error: '.
705              
706             die "$message\n";
707             }
708             }
709              
710             return ($event_name, $span, $pos);
711              
712             } # End of _validate_event.
713              
714             # ------------------------------------------------
715              
716             sub _validate_open_close
717             {
718             my($self) = @_;
719             my($open) = $self -> open;
720             my($close) = $self -> close;
721              
722             my($message);
723              
724             if ( ($#$open < 0) || ($#$close < 0) )
725             {
726             $message = 'There must be at least 1 pair of open/close delimiters';
727              
728             $self -> error_message($message);
729             $self -> error_number(8);
730              
731             # This 'die' is not inside try{}catch{}, so we add the prefix 'Error: '.
732              
733             die "Error: $message\n";
734             }
735              
736             if ($#$open != $#$close)
737             {
738             $message = 'The # of open delimiters must match the # of close delimiters';
739              
740             $self -> error_message($message);
741             $self -> error_number(9);
742              
743             # This 'die' is not inside try{}catch{}, so we add the prefix 'Error: '.
744              
745             die "Error: $message\n";
746             }
747              
748             my(%substitute) = (_close_ => '', _delimiter_ => '', _open_ => '');
749             my($matching_delimiter) = {};
750             my(%seen) = (close => {}, open => {});
751              
752             my($close_quote);
753             my(%delimiter_action, %delimiter_frequency);
754             my($open_quote);
755             my($prefix, %prefix);
756              
757             for my $i (0 .. $#$open)
758             {
759             if ( ($$open[$i] =~ /\\/) || ($$close[$i] =~ /\\/) )
760             {
761             $message = 'Backslash is forbidden as a delimiter character';
762              
763             $self -> error_message($message);
764             $self -> error_number(4);
765              
766             # This 'die' is not inside try{}catch{}, so we add the prefix 'Error: '.
767              
768             die "Error: $message\n";
769             }
770              
771             if ( ( (length($$open[$i]) > 1) && ($$open[$i] =~ /'/) ) || ( (length($$close[$i]) > 1) && ($$close[$i] =~ /'/) ) )
772             {
773             $message = 'Single-quotes are forbidden in multi-character delimiters';
774              
775             $self -> error_message($message);
776             $self -> error_number(5);
777              
778             # This 'die' is not inside try{}catch{}, so we add the prefix 'Error: '.
779              
780             die "Error: $message\n";
781             }
782              
783             $seen{open}{$$open[$i]} = 0 if (! $seen{open}{$$open[$i]});
784             $seen{close}{$$close[$i]} = 0 if (! $seen{close}{$$close[$i]});
785              
786             $seen{open}{$$open[$i]}++;
787             $seen{close}{$$close[$i]}++;
788              
789             $delimiter_action{$$open[$i]} = 'open';
790             $delimiter_action{$$close[$i]} = 'close';
791             $$matching_delimiter{$$open[$i]} = $$close[$i];
792             $delimiter_frequency{$$open[$i]} = 0;
793             $delimiter_frequency{$$close[$i]} = 0;
794              
795             if (length($$open[$i]) == 1)
796             {
797             $open_quote = $$open[$i] eq '[' ? "[\\$$open[$i]]" : "[$$open[$i]]";
798             }
799             else
800             {
801             # This fails if length > 1 and open contains a single quote.
802              
803             $open_quote = "'$$open[$i]'";
804             }
805              
806             if (length($$close[$i]) == 1)
807             {
808             $close_quote = $$close[$i] eq ']' ? "[\\$$close[$i]]" : "[$$close[$i]]";
809             }
810             else
811             {
812             # This fails if length > 1 and close contains a single quote.
813              
814             $close_quote = "'$$close[$i]'";
815             }
816              
817             $substitute{_open_} .= "open_delim\t\t\t\~ $open_quote\n" if ($seen{open}{$$open[$i]} <= 1);
818             $substitute{_close_} .= "close_delim\t\t\t\~ $close_quote\n" if ($seen{close}{$$close[$i]} <= 1);
819             $prefix = substr($$open[$i], 0, 1);
820             $prefix = "\\$prefix" if ($prefix =~ /[\[\]]/);
821             $prefix{$prefix} = 0 if (! $prefix{$prefix});
822              
823             $prefix{$prefix}++;
824              
825             $substitute{_delimiter_} .= $prefix if ($prefix{$prefix} == 1);
826             $prefix = substr($$close[$i], 0, 1);
827             $prefix = "\\$prefix" if ($prefix =~ /[\[\]]/);
828             $prefix{$prefix} = 0 if (! $prefix{$prefix});
829              
830             $prefix{$prefix}++;
831              
832             $substitute{_delimiter_} .= $prefix if ($prefix{$prefix} == 1);
833             }
834              
835             $self -> delimiter_action(\%delimiter_action);
836             $self -> delimiter_frequency(\%delimiter_frequency);
837             $self -> matching_delimiter($matching_delimiter);
838              
839             return \%substitute;
840              
841             } # End of _validate_open_close.
842              
843             # ------------------------------------------------
844              
845             1;
846              
847             =pod
848              
849             =head1 NAME
850              
851             C - Extract delimited text sequences from strings
852              
853             =head1 Synopsis
854              
855             #!/usr/bin/env perl
856              
857             use strict;
858             use warnings;
859              
860             use Text::Balanced::Marpa ':constants';
861              
862             # -----------
863              
864             my($count) = 0;
865             my($parser) = Text::Balanced::Marpa -> new
866             (
867             open => ['<:' ,'[%'],
868             close => [':>', '%]'],
869             options => nesting_is_fatal | print_warnings,
870             );
871             my(@text) =
872             (
873             q|<: a :>|,
874             q|a [% b <: c :> d %] e|,
875             q|a <: b <: c :> d :> e|, # nesting_is_fatal triggers an error here.
876             );
877              
878             my($result);
879              
880             for my $text (@text)
881             {
882             $count++;
883              
884             print "Parsing |$text|\n";
885              
886             $result = $parser -> parse(text => \$text);
887              
888             print join("\n", @{$parser -> tree -> tree2string}), "\n";
889             print "Parse result: $result (0 is success)\n";
890              
891             if ($count == 3)
892             {
893             print "Deliberate error: Failed to parse |$text|\n";
894             print 'Error number: ', $parser -> error_number, '. Error message: ',
895             $parser -> error_message, "\n";
896             }
897              
898             print '-' x 50, "\n";
899             }
900              
901             See scripts/synopsis.pl.
902              
903             This is the printout of synopsis.pl:
904              
905             Parsing |<: a :>|
906             Parsed text:
907             root. Attributes: {}
908             |--- open. Attributes: {text => "<:"}
909             | |--- string. Attributes: {text => " a "}
910             |--- close. Attributes: {text => ":>"}
911             Parse result: 0 (0 is success)
912             --------------------------------------------------
913             Parsing |a [% b <: c :> d %] e|
914             Parsed text:
915             root. Attributes: {}
916             |--- string. Attributes: {text => "a "}
917             |--- open. Attributes: {text => "[%"}
918             | |--- string. Attributes: {text => " b "}
919             | |--- open. Attributes: {text => "<:"}
920             | | |--- string. Attributes: {text => " c "}
921             | |--- close. Attributes: {text => ":>"}
922             | |--- string. Attributes: {text => " d "}
923             |--- close. Attributes: {text => "%]"}
924             |--- string. Attributes: {text => " e"}
925             Parse result: 0 (0 is success)
926             --------------------------------------------------
927             Parsing |a <: b <: c :> d :> e|
928             Error: Parse failed. Opened delimiter <: again before closing previous one
929             Text parsed so far:
930             root. Attributes: {}
931             |--- string. Attributes: {text => "a "}
932             |--- open. Attributes: {text => "<:"}
933             |--- string. Attributes: {text => " b "}
934             Parse result: 1 (0 is success)
935             Deliberate error: Failed to parse |a <: b <: c :> d :> e|
936             Error number: 2. Error message: Opened delimiter <: again before closing previous one
937             --------------------------------------------------
938              
939             =head1 Description
940              
941             L provides a L-based parser for extracting delimited text
942             sequences from strings.
943              
944             See the L for various topics, including:
945              
946             =over 4
947              
948             =item o UFT8 handling
949              
950             See t/utf8.t.
951              
952             =item o Escaping delimiters within the text
953              
954             See t/escapes.t.
955              
956             =item o Options to make nested and/or overlapped delimiters fatal errors
957              
958             See t/colons.t.
959              
960             =item o Using delimiters which are part of another delimiter
961              
962             See t/escapes.t and t/perl.delimiters.
963              
964             =item o Processing the tree-structured output
965              
966             See scripts/traverse.pl.
967              
968             =item o Emulating L's use of '<:' and ':>
969              
970             See t/colons.t and t/percents.t.
971              
972             =item o Implementing a really trivial HTML parser
973              
974             See scripts/traverse.pl and t/html.t.
975              
976             In the same vein, see t/angle.brackets.t, for code where the delimiters are just '<' and '>'.
977              
978             =item o Handling multiple sets of delimiters
979              
980             See t/multiple.delimiters.t.
981              
982             =item o Skipping (leading) characters in the input string
983              
984             See t/skip.prefix.t.
985              
986             =item o Implementing hard-to-read text strings as delimiters
987              
988             See t/silly.delimiters.
989              
990             =back
991              
992             =head1 Distributions
993              
994             This module is available as a Unix-style distro (*.tgz).
995              
996             See L
997             for help on unpacking and installing distros.
998              
999             =head1 Installation
1000              
1001             Install L as you would any C module:
1002              
1003             Run:
1004              
1005             cpanm Text::Balanced::Marpa
1006              
1007             or run:
1008              
1009             sudo cpan Text::Balanced::Marpa
1010              
1011             or unpack the distro, and then either:
1012              
1013             perl Build.PL
1014             ./Build
1015             ./Build test
1016             sudo ./Build install
1017              
1018             or:
1019              
1020             perl Makefile.PL
1021             make (or dmake or nmake)
1022             make test
1023             make install
1024              
1025             =head1 Constructor and Initialization
1026              
1027             C is called as C<< my($parser) = Text::Balanced::Marpa -> new(k1 => v1, k2 => v2, ...) >>.
1028              
1029             It returns a new object of type C.
1030              
1031             Key-value pairs accepted in the parameter list (see corresponding methods for details
1032             [e.g. L]):
1033              
1034             =over 4
1035              
1036             =item o close => $arrayref
1037              
1038             An arrayref of strings, each one a closing delimiter.
1039              
1040             The # of elements must match the # of elements in the 'open' arrayref.
1041              
1042             See the L for details and warnings.
1043              
1044             A value for this option is mandatory.
1045              
1046             Default: None.
1047              
1048             =item o length => $integer
1049              
1050             The maxiumum length of the input string to process.
1051              
1052             This parameter works in conjunction with the C parameter.
1053              
1054             C can also be used as a key in the hash passed to L.
1055              
1056             See the L for details.
1057              
1058             Default: Calls Perl's length() function on the input string.
1059              
1060             =item o next_few_limit => $integer
1061              
1062             This controls how many characters are printed when displaying 'the next few chars'.
1063              
1064             It only affects debug output.
1065              
1066             Default: 20.
1067              
1068             =item o open => $arrayref
1069              
1070             An arrayref of strings, each one an opening delimiter.
1071              
1072             The # of elements must match the # of elements in the 'open' arrayref.
1073              
1074             See the L for details and warnings.
1075              
1076             A value for this option is mandatory.
1077              
1078             Default: None.
1079              
1080             =item o options => $bit_string
1081              
1082             This allows you to turn on various options.
1083              
1084             C can also be used as a key in the hash passed to L.
1085              
1086             Default: 0 (nothing is fatal).
1087              
1088             See the L for details.
1089              
1090             =item o pos => $integer
1091              
1092             The offset within the input string at which to start processing.
1093              
1094             This parameter works in conjunction with the C parameter.
1095              
1096             C can also be used as a key in the hash passed to L.
1097              
1098             See the L for details.
1099              
1100             Note: The first character in the input string is at pos == 0.
1101              
1102             Default: 0.
1103              
1104             =item o text => $stringref
1105              
1106             This is a reference to the string to be parsed. A stringref is used to avoid copying what could
1107             potentially be a very long string.
1108              
1109             C can also be used as a key in the hash passed to L.
1110              
1111             Default: \''.
1112              
1113             =back
1114              
1115             =head1 Methods
1116              
1117             =head2 bnf()
1118              
1119             Returns a string containing the grammar constructed based on user input.
1120              
1121             =head2 close()
1122              
1123             Get the arrayref of closing delimiters.
1124              
1125             See also L.
1126              
1127             See the L for details and warnings.
1128              
1129             'close' is a parameter to L. See L for details.
1130              
1131             =head2 delimiter_action()
1132              
1133             Returns a hashref, where the keys are delimiters and the values are either 'open' or 'close'.
1134              
1135             =head2 delimiter_frequency()
1136              
1137             Returns a hashref where the keys are opening and closing delimiters, and the values are the # of
1138             times each delimiter appears in the input stream.
1139              
1140             The value is incremented for each opening delimiter and decremented for each closing delimiter.
1141              
1142             =head2 error_message()
1143              
1144             Returns the last error or warning message set.
1145              
1146             Error messages always start with 'Error: '. Messages never end with "\n".
1147              
1148             Parsing error strings is not a good idea, ever though this module's format for them is fixed.
1149              
1150             See L.
1151              
1152             =head2 error_number()
1153              
1154             Returns the last error or warning number set.
1155              
1156             Warnings have values < 0, and errors have values > 0.
1157              
1158             If the value is > 0, the message has the prefix 'Error: ', and if the value is < 0, it has the
1159             prefix 'Warning: '. If this is not the case, it's a reportable bug.
1160              
1161             Possible values for error_number() and error_message():
1162              
1163             =over 4
1164              
1165             =item o 0 => ""
1166              
1167             This is the default value.
1168              
1169             =item o 1/-1 => "Last open delimiter: $lexeme_1. Unexpected closing delimiter: $lexeme_2"
1170              
1171             If L returns 1, it's an error, and if it returns -1 it's a warning.
1172              
1173             You can set the option C to make it fatal.
1174              
1175             =item o 2/-2 => "Opened delimiter $lexeme again before closing previous one"
1176              
1177             If L returns 2, it's an error, and if it returns -2 it's a warning.
1178              
1179             You can set the option C to make it fatal.
1180              
1181             =item o 3/-3 => "Ambiguous parse. Status: $status. Terminals expected: a, b, ..."
1182              
1183             This message is only produced when the parse is ambiguous.
1184              
1185             If L returns 3, it's an error, and if it returns -3 it's a warning.
1186              
1187             You can set the option C to make it fatal.
1188              
1189             =item o 4 => "Backslash is forbidden as a delimiter character"
1190              
1191             This preempts some types of sabotage.
1192              
1193             This message can never be just a warning message.
1194              
1195             =item o 5 => "Single-quotes are forbidden in multi-character delimiters"
1196              
1197             This limitation is due to the syntax of
1198             L.
1199              
1200             This message can never be just a warning message.
1201              
1202             =item o 6/-6 => "Parse exhausted"
1203              
1204             If L returns 6, it's an error, and if it returns -6 it's a warning.
1205              
1206             You can set the option C to make it fatal.
1207              
1208             =item o 7 => 'Single-quote is forbidden as an escape character'
1209              
1210             This limitation is due to the syntax of
1211             L.
1212              
1213             This message can never be just a warning message.
1214              
1215             =item o 8 => "There must be at least 1 pair of open/close delimiters"
1216              
1217             This message can never be just a warning message.
1218              
1219             =item o 9 => "The # of open delimiters must match the # of close delimiters"
1220              
1221             This message can never be just a warning message.
1222              
1223             =item o 10 => "Unexpected event name 'xyz'"
1224              
1225             Marpa has trigged an event and it's name is not in the hash of event names derived from the BNF.
1226              
1227             This message can never be just a warning message.
1228              
1229             =item o 11 => "The code does not handle these events simultaneously: a, b, ..."
1230              
1231             The code is written to handle single events at a time, or in rare cases, 2 events at the same time.
1232             But here, multiple events have been triggered and the code cannot handle the given combination.
1233              
1234             This message can never be just a warning message.
1235              
1236             =back
1237              
1238             See L.
1239              
1240             =head2 escape_char()
1241              
1242             Get the escape char.
1243              
1244             =head2 known_events()
1245              
1246             Returns a hashref where the keys are event names and the values are 1.
1247              
1248             =head2 length([$integer])
1249              
1250             Here, the [] indicate an optional parameter.
1251              
1252             Get or set the length of the input string to process.
1253              
1254             See also the L and L.
1255              
1256             'length' is a parameter to L. See L for details.
1257              
1258             =head2 matching_delimiter()
1259              
1260             Returns a hashref where the keys are opening delimiters and the values are the corresponding closing
1261             delimiters.
1262              
1263             =head2 new()
1264              
1265             See L for details on the parameters accepted by L.
1266              
1267             =head2 next_few_chars($stringref, $offset)
1268              
1269             Returns a substring of $s, starting at $offset, for use in debug messages.
1270              
1271             See L.
1272              
1273             =head2 next_few_limit([$integer])
1274              
1275             Here, the [] indicate an optional parameter.
1276              
1277             Get or set the number of characters called 'the next few chars', which are printed during debugging.
1278              
1279             'next_few_limit' is a parameter to L. See L for details.
1280              
1281             =head2 open()
1282              
1283             Get the arrayref of opening delimiters.
1284              
1285             See also L.
1286              
1287             See the L for details and warnings.
1288              
1289             'open' is a parameter to L. See L for details.
1290              
1291             =head2 options([$bit_string])
1292              
1293             Here, the [] indicate an optional parameter.
1294              
1295             Get or set the option flags.
1296              
1297             For typical usage, see scripts/synopsis.pl.
1298              
1299             See the L for details.
1300              
1301             'options' is a parameter to L. See L for details.
1302              
1303             =head2 parse([%hash])
1304              
1305             Here, the [] indicate an optional parameter.
1306              
1307             This is the only method the user needs to call. All data can be supplied when calling L.
1308              
1309             You can of course call other methods (e.g. L ) after calling L but
1310             before calling C.
1311              
1312             The optional hash takes these ($key => $value) pairs (exactly the same as for L):
1313              
1314             =over 4
1315              
1316             =item o length => $integer
1317              
1318             =item o options => $bit_string
1319              
1320             =item o pos => $integer
1321              
1322             =item o text => $stringref
1323              
1324             =back
1325              
1326             Note: If a value is passed to C, it takes precedence over any value with the same
1327             key passed to L, and over any value previously passed to the method whose name is $key.
1328             Further, the value passed to C is always passed to the corresponding method (i.e. whose
1329             name is $key), meaning any subsequent call to that method returns the value passed to C.
1330              
1331             See scripts/samples.pl.
1332              
1333             Returns 0 for success and 1 for failure.
1334              
1335             If the value is 1, you should call L to find out what happened.
1336              
1337             =head2 pos([$integer])
1338              
1339             Here, the [] indicate an optional parameter.
1340              
1341             Get or set the offset within the input string at which to start processing.
1342              
1343             See also the L and L.
1344              
1345             'pos' is a parameter to L. See L for details.
1346              
1347             =head2 text([$stringref])
1348              
1349             Here, the [] indicate an optional parameter.
1350              
1351             Get or set a reference to the string to be parsed.
1352              
1353             'text' is a parameter to L. See L for details.
1354              
1355             =head2 tree()
1356              
1357             Returns an object of type L, which holds the parsed data.
1358              
1359             Obviously, it only makes sense to call C after calling C.
1360              
1361             See scripts/traverse.pl for sample code which processes this tree's nodes.
1362              
1363             =head1 FAQ
1364              
1365             =head2 Where are the error messages and numbers described?
1366              
1367             See L and L.
1368              
1369             =head2 How do I escape delimiters?
1370              
1371             By backslash-escaping the first character of all open and close delimiters which appear in the
1372             text.
1373              
1374             As an example, if the delimiters are '<:' and ':>', this means you have to escape I the '<'
1375             chars and I the colons in the text.
1376              
1377             The backslash is preserved in the output.
1378              
1379             If you don't want to use backslash for escaping, or can't, you can pass a different escape character
1380             to L.
1381              
1382             See t/escapes.t.
1383              
1384             =head2 How do the length and pos parameters to new() work?
1385              
1386             The recognizer - an object of type Marpa::R2::Scanless::R - is called in a loop, like this:
1387              
1388             for
1389             (
1390             $pos = $self -> recce -> read($stringref, $pos, $length);
1391             $pos < $length;
1392             $pos = $self -> recce -> resume($pos)
1393             )
1394              
1395             L and L can be used to initialize $pos and $length.
1396              
1397             Note: The first character in the input string is at pos == 0.
1398              
1399             See L for details.
1400              
1401             =head2 Does this package support Unicode/UTF8?
1402              
1403             Yes. See t/escapes.t, t/multiple.quotes.t and t/utf8.t.
1404              
1405             =head2 Does this package handler Perl delimiters (e.g. q|..|, qq|..|, qr/../, qw/../)?
1406              
1407             See t/perl.delimiters.t.
1408              
1409             =head2 Warning: Calling mutators after calling new()
1410              
1411             The only mutator which works after calling new() is L.
1412              
1413             In particular, you can't call L, L or L after calling L.
1414             This is because parameters passed to C are interpolated into the grammar before parsing
1415             begins. And that's why the docs for those methods all say 'Get the...' and not 'Get and set the...'.
1416              
1417             To make the code work, you would have to manually call _validate_open_close(). But even then
1418             a lot of things would have to be re-initialized to give the code any hope of working.
1419              
1420             =head2 What is the format of the 'open' and 'close' parameters to new()?
1421              
1422             Each of these parameters takes an arrayref as a value.
1423              
1424             The # of elements in the 2 arrayrefs must be the same.
1425              
1426             The 1st element in the 'open' arrayref is the 1st user-chosen opening delimiter, and the 1st
1427             element in the 'close' arrayref must be the corresponding closing delimiter.
1428              
1429             It is possible to use a delimiter which is part of another delimiter.
1430              
1431             See scripts/samples.pl. It uses both '<' and '<:' as opening delimiters and their corresponding
1432             closing delimiters are '>' and ':>'. Neat, huh?
1433              
1434             =head2 What are the possible values for the 'options' parameter to new()?
1435              
1436             Firstly, to make these constants available, you must say:
1437              
1438             use Text::Balanced::Marpa ':constants';
1439              
1440             Secondly, more detail on errors and warnings can be found at L.
1441              
1442             Thirdly, for usage of these option flags, see t/angle.brackets.t, t/colons.t, t/escapes.t,
1443             t/multiple.quotes.t, t/percents.t and scripts/samples.pl.
1444              
1445             Now the flags themselves:
1446              
1447             =over 4
1448              
1449             =item o nothing_is_fatal
1450              
1451             This is the default.
1452              
1453             C has the value of 0.
1454              
1455             =item o print_error
1456              
1457             Print errors if this flag is set.
1458              
1459             C has the value of 1.
1460              
1461             =item o print_warnings
1462              
1463             Print various warnings if this flag is set:
1464              
1465             =over 4
1466              
1467             =item o The ambiguity status and terminals expected, if the parse is ambiguous
1468              
1469             =item o See L for other warnings which might be printed
1470              
1471             Ambiguity is not, in and of itself, an error. But see the C option, below.
1472              
1473             =back
1474              
1475             It's tempting to call this option C, but Perl already has C, so I didn't.
1476              
1477             C has the value of 2.
1478              
1479             =item o print_debugs
1480              
1481             Print extra stuff if this flag is set.
1482              
1483             C has the value of 4.
1484              
1485             =item o overlap_is_fatal
1486              
1487             This means overlapping delimiters cause a fatal error.
1488              
1489             So, setting C means '{Bold [Italic}]' would be a fatal error.
1490              
1491             I use this example since it gives me the opportunity to warn you, this will I do what you want
1492             if you try to use the delimiters of '<' and '>' for HTML. That is, 'Bold Italic' is
1493             not an error because what overlap are '' and '' BUT THEY ARE NOT TAGS. The tags are '<' and
1494             '>', ok? See also t/html.t.
1495              
1496             C has the value of 8.
1497              
1498             =item o nesting_is_fatal
1499              
1500             This means nesting of identical opening delimiters is fatal.
1501              
1502             So, using C means 'a <: b <: c :> d :> e' would be a fatal error.
1503              
1504             C has the value of 16.
1505              
1506             =item o ambiguity_is_fatal
1507              
1508             This makes L return 3 rather than -3.
1509              
1510             C has the value of 32.
1511              
1512             =item o exhaustion_is_fatal
1513              
1514             This makes L return 6 rather than -6.
1515              
1516             C has the value of 64.
1517              
1518             =back
1519              
1520             =head2 How do I print the tree built by the parser?
1521              
1522             See L.
1523              
1524             =head2 How do I make use of the tree built by the parser?
1525              
1526             See scripts/traverse.pl. It is a copy of t/html.t with tree-walking code instead of test code.
1527              
1528             =head2 How is the parsed data held in RAM?
1529              
1530             The parsed output is held in a tree managed by L.
1531              
1532             The tree always has a root node, which has nothing to do with the input data. So, even an empty
1533             imput string will produce a tree with 1 node. This root has an empty hashref associated with it.
1534              
1535             Nodes have a name and a hashref of attributes.
1536              
1537             The name indicates the type of node. Names are one of these literals:
1538              
1539             =over 4
1540              
1541             =item o close
1542              
1543             =item o open
1544              
1545             =item o root
1546              
1547             =item o text
1548              
1549             =back
1550              
1551             For 'open' and 'close', the delimiter is given by the value of the 'text' key in the hashref.
1552              
1553             The (key => value) pairs in the hashref are:
1554              
1555             =over 4
1556              
1557             =item o text => $string
1558              
1559             If the node name is 'open' or 'close', $string is the delimiter.
1560              
1561             If the node name is 'text', $string is the verbatim text from the document.
1562              
1563             Verbatim means, for example, that backslashes in the input are preserved.
1564              
1565             =back
1566              
1567             Try:
1568              
1569             perl -Ilib scripts/samples.pl info
1570              
1571             =head2 How is HTML/XML handled?
1572              
1573             The tree does not preserve the nested nature of HTML/XML.
1574              
1575             Post-processing (valid) HTML could easily generate another view of the data.
1576              
1577             But anyway, to get perfect HTML you'd be grabbing the output of L, right?
1578              
1579             See scripts/traverse.pl and t/html.t for a trivial HTML parser.
1580              
1581             =head2 What is the homepage of Marpa?
1582              
1583             L.
1584              
1585             That page has a long list of links.
1586              
1587             =head2 How do I run author tests?
1588              
1589             This runs both standard and author tests:
1590              
1591             shell> perl Build.PL; ./Build; ./Build authortest
1592              
1593             =head1 TODO
1594              
1595             =over 4
1596              
1597             =item o Advanced error reporting
1598              
1599             See L.
1600              
1601             Perhaps this could be a sub-class?
1602              
1603             =item o I8N support for error messages
1604              
1605             =item o An explicit test program for parse exhaustion
1606              
1607             =back
1608              
1609             =head1 See Also
1610              
1611             L.
1612              
1613             L and L.
1614              
1615             L - for various usages of L, but not of this module.
1616              
1617             =head1 Machine-Readable Change Log
1618              
1619             The file Changes was converted into Changelog.ini by L.
1620              
1621             =head1 Version Numbers
1622              
1623             Version numbers < 1.00 represent development versions. From 1.00 up, they are production versions.
1624              
1625             =head1 Thanks
1626              
1627             Thanks to Jeffrey Kegler, who wrote Marpa and L.
1628              
1629             And thanks to rns (Ruslan Shvedov) for writing the grammar for double-quoted strings used in
1630             L's scripts/quoted.strings.02.pl. I adapted it to HTML (see
1631             scripts/quoted.strings.05.pl in that module), and then incorporated the grammar into
1632             L, and - after more extensions - into this module.
1633              
1634             Lastly, thanks to Robert Rothenberg for L, a module which works the same way
1635             Perl does.
1636              
1637             =head1 Repository
1638              
1639             L
1640              
1641             =head1 Support
1642              
1643             Email the author, or log a bug on RT:
1644              
1645             L.
1646              
1647             =head1 Author
1648              
1649             L was written by Ron Savage Iron@savage.net.auE> in 2014.
1650              
1651             Marpa's homepage: L.
1652              
1653             My homepage: L.
1654              
1655             =head1 Copyright
1656              
1657             Australian copyright (c) 2014, Ron Savage.
1658              
1659             All Programs of mine are 'OSI Certified Open Source Software';
1660             you can redistribute them and/or modify them under the terms of
1661             The Artistic License 2.0, a copy of which is available at:
1662             http://opensource.org/licenses/alphabetical.
1663              
1664             =cut