File Coverage

blib/lib/Marpa/HTML.pm
Criterion Covered Total %
statement 432 551 78.4
branch 104 202 51.4
condition 34 68 50.0
subroutine 35 38 92.1
pod 0 1 0.0
total 605 860 70.3


line stmt bran cond sub pod time code
1             # This software is copyright (c) 2011 by Jeffrey Kegler
2             # This is free software; you can redistribute it and/or modify it
3             # under the same terms as the Perl 5 programming language system
4             # itself.
5              
6             package Marpa::HTML;
7              
8 6     6   66524 use 5.010;
  6         24  
  6         265  
9 6     6   32 use strict;
  6         11  
  6         198  
10 6     6   42 use warnings;
  6         21  
  6         199  
11              
12 6     6   37 use vars qw( $VERSION $STRING_VERSION );
  6         10  
  6         519  
13             $VERSION = '0.112000';
14             $STRING_VERSION = $VERSION;
15             {
16             ## no critic (BuiltinFunctions::ProhibitStringyEval)
17             ## no critic (ValuesAndExpressions::RequireConstantVersion)
18             $VERSION = eval $VERSION;
19             }
20              
21 6     6   3356 use Marpa::HTML::Version;
  6         15  
  6         275  
22              
23             our @EXPORT_OK;
24 6     6   35 use base qw(Exporter);
  6         22  
  6         655  
25 6     6   154 BEGIN { @EXPORT_OK = qw(html); }
26              
27             package Marpa::HTML::Internal;
28              
29 6     6   30 use Carp;
  6         12  
  6         465  
30 6     6   32 use HTML::PullParser;
  6         11  
  6         139  
31 6     6   36 use HTML::Entities qw(decode_entities);
  6         11  
  6         385  
32 6     6   5145 use HTML::Tagset ();
  6         10529  
  6         737  
33              
34             # versions below must be coordinated with
35             # those required in Build.PL
36             BEGIN {
37 6     6   12 my $using_xs = eval {
38 6         2609 require Marpa::XS::Installed;
39 0 0       0 defined $Marpa::XS::Installed::VERSION
40             and $Marpa::XS::Installed::VERSION >= $Marpa::HTML::MARPA_XS_VERSION;
41             };
42 6 50       37 if ($using_xs) {
43 0         0 require Marpa::XS;
44 0         0 Marpa::XS->VERSION($Marpa::HTML::MARPA_XS_VERSION); # double check
45 0         0 $Marpa::HTML::MARPA_MODULE = 'Marpa::XS';
46 6     6   39 no strict 'refs';
  6         12  
  6         788  
47 0         0 *Marpa::HTML::Recognizer::new = \&Marpa::XS::Recognizer::new;
48 0         0 *Marpa::HTML::Grammar::new = \&Marpa::XS::Grammar::new;
49             } ## end if ($using_xs)
50             else {
51 6         40023 require Marpa::PP;
52 6         582241 Marpa::PP->VERSION($Marpa::HTML::MARPA_PP_VERSION);
53 6         27 $Marpa::HTML::MARPA_MODULE = 'Marpa::PP';
54 6     6   29 no strict 'refs';
  6         11  
  6         414  
55 6         27 *Marpa::HTML::Recognizer::new = \&Marpa::PP::Recognizer::new;
56 6         227 *Marpa::HTML::Grammar::new = \&Marpa::PP::Grammar::new;
57             } ## end else [ if ($using_xs) ]
58             } ## end BEGIN
59              
60             # use Smart::Comments '-ENV';
61              
62             ### Using smart comments ...
63              
64 6     6   81 use English qw( -no_match_vars );
  6         12  
  6         56  
65              
66 6         40 use Marpa::HTML::Offset qw(
67             :package=Marpa::HTML::Internal::TDesc
68             TYPE
69             START_TOKEN
70             END_TOKEN
71 6     6   6425 );
  6         17  
72              
73 6         29 use Marpa::HTML::Offset qw(
74             :package=Marpa::HTML::Internal::TDesc::Element
75             TYPE
76             START_TOKEN
77             END_TOKEN
78             VALUE
79             NODE_DATA
80 6     6   33 );
  6         11  
81              
82             %Marpa::HTML::PULL_PARSER_OPTIONS = (
83             start => q{'S',line,column,offset,offset_end,tagname,attr},
84             end => q{'E',line,column,offset,offset_end,tagname},
85             text => q{'T',line,column,offset,offset_end,is_cdata},
86             comment => q{'C',line,column,offset,offset_end},
87             declaration => q{'D',line,column,offset,offset_end},
88             process => q{'PI',line,column,offset,offset_end},
89              
90             # options that default on
91             unbroken_text => 1,
92             );
93              
94 6         28 use Marpa::HTML::Offset qw(
95             :package=Marpa::HTML::Internal::Token
96             TYPE
97             LINE
98             COL
99             =COLUMN
100             START_OFFSET
101             END_OFFSET
102             TAGNAME
103             =IS_CDATA
104             ATTR
105 6     6   34 );
  6         11  
106              
107 6     6   3731 use Marpa::HTML::Callback;
  6         16  
  6         26508  
108              
109             sub per_element_handlers {
110 1386     1386   2420 my ( $element, $user_handlers ) = @_;
111 1386 100       3032 return {} if not $element;
112 1374 100       3708 return {} if not $user_handlers;
113 687   50     1758 my $wildcard_handlers = $user_handlers->{ANY} // {};
114 687         866 my %handlers = %{$wildcard_handlers};
  687         2189  
115 687   100     10601 my $per_element_handlers = $user_handlers->{$element} // {};
116 687         1142 @handlers{ keys %{$per_element_handlers} } =
  687         1501  
117 687         1003 values %{$per_element_handlers};
118 687         2094 return \%handlers;
119             } ## end sub per_element_handlers
120              
121             sub tdesc_list_to_literal {
122 843     843   1308 my ( $self, $tdesc_list ) = @_;
123              
124 843         1186 my $text = q{};
125 843         1499 my $document = $self->{document};
126 843         1174 my $tokens = $self->{tokens};
127 843         953 TDESC: for my $tdesc ( @{$tdesc_list} ) {
  843         1660  
128 981         1500 given ( $tdesc->[Marpa::HTML::Internal::TDesc::TYPE] ) {
129 981         1597 when ('POINT') { break; }
  107         283  
130 874         1246 when ('VALUED_SPAN') {
131 528 50       1455 if (defined(
132             my $value =
133             $tdesc
134             ->[Marpa::HTML::Internal::TDesc::Element::VALUE]
135             )
136             )
137             {
138 528         1195 $text .= $value;
139 528         1575 break; # next TDESC;
140             } ## end if ( defined( my $value = $tdesc->[...]))
141              
142             # next TDESC if no first token id
143             #<<< As of 2009-11-22 perltidy cycles on this code
144             break
145 0 0       0 if not defined( my $first_token_id = $tdesc
146             ->[ Marpa::HTML::Internal::TDesc::START_TOKEN ] );
147             #>>>
148              
149             # next TDESC if no last token id
150             #<<< As of 2009-11-22 perltidy cycles on this code
151             break
152 0 0       0 if not defined( my $last_token_id =
153             $tdesc->[Marpa::HTML::Internal::TDesc::END_TOKEN] );
154             #>>>
155              
156 0         0 my $offset =
157             $tokens->[$first_token_id]
158             ->[Marpa::HTML::Internal::Token::START_OFFSET];
159 0         0 my $end_offset =
160             $tokens->[$last_token_id]
161             ->[Marpa::HTML::Internal::Token::END_OFFSET];
162 0         0 $text .= substr ${$document}, $offset,
  0         0  
163             ( $end_offset - $offset );
164             } ## end when ('VALUED_SPAN')
165 346         536 when ('UNVALUED_SPAN') {
166 346         534 my $first_token_id =
167             $tdesc->[Marpa::HTML::Internal::TDesc::START_TOKEN];
168 346         512 my $last_token_id =
169             $tdesc->[Marpa::HTML::Internal::TDesc::END_TOKEN];
170 346         722 my $offset =
171             $tokens->[$first_token_id]
172             ->[Marpa::HTML::Internal::Token::START_OFFSET];
173 346         520 my $end_offset =
174             $tokens->[$last_token_id]
175             ->[Marpa::HTML::Internal::Token::END_OFFSET];
176              
177 346         525 $text .= substr ${$document}, $offset,
  346         2014  
178             ( $end_offset - $offset );
179             } ## end when ('UNVALUED_SPAN')
180 0         0 default {
181 0         0 Carp::croak(qq{Internal error: unknown tdesc type "$_"});
182             }
183             } ## end given
184             } ## end for my $tdesc ( @{$tdesc_list} )
185 843         4122 return \$text;
186             } ## end sub tdesc_list_to_literal
187              
188             # Convert a list of text descriptions to text
189             sub default_top_handler {
190 87     87   23930 my ( $dummy, @tdesc_lists ) = @_;
191 87         180 my $self = $Marpa::HTML::Internal::PARSE_INSTANCE;
192 87         210 my @tdesc_list = map { @{$_} } grep {defined} @tdesc_lists;
  179         218  
  179         467  
  348         927  
193 87         267 return tdesc_list_to_literal( $self, \@tdesc_list );
194              
195             } ## end sub default_top_handler
196              
197             sub wrap_user_top_handler {
198 4     4   9 my ($user_handler) = @_;
199             return sub {
200 4     4   230 my ( $dummy, @tdesc_lists ) = @_;
201 4         8 my @tdesc_list = map { @{$_} } grep {defined} @tdesc_lists;
  9         10  
  9         21  
  16         30  
202 4         7 local $Marpa::HTML::Internal::TDESC_LIST = \@tdesc_list;
203 4         1027 local $Marpa::HTML::Internal::PER_NODE_DATA =
204             { pseudoclass => 'TOP' };
205 4         22 return scalar $user_handler->();
206 4         38 };
207             } ## end sub wrap_user_top_handler
208              
209             # Convert a list of text descriptions to a
210             # single, shortened text description
211             sub create_tdesc_handler {
212 693     693   1092 my ( $self, $element ) = @_;
213 693 100       2231 my $handlers_by_class =
214             per_element_handlers( $element,
215             ( $self ? $self->{user_handlers_by_class} : {} ) );
216 693 100       3083 my $handlers_by_id =
217             per_element_handlers( $element,
218             ( $self ? $self->{user_handlers_by_id} : {} ) );
219              
220             return sub {
221 3265     3265   1997221 my ( $dummy, @tdesc_lists ) = @_;
222              
223 3265         5114 my @tdesc_list = map { @{$_} } grep {defined} @tdesc_lists;
  5200         5244  
  5200         11479  
  5581         11282  
224 3265         4755 local $Marpa::HTML::Internal::TDESC_LIST = \@tdesc_list;
225              
226 9248         14079 my @token_ids = sort { $a <=> $b } grep {defined} map {
  10852         21677  
  5426         12612  
227 3265         4368 @{$_}[
  5426         5634  
228             Marpa::HTML::Internal::TDesc::START_TOKEN,
229             Marpa::HTML::Internal::TDesc::END_TOKEN
230             ]
231             } @tdesc_list;
232              
233 3265         4578 my $first_token_id_in_node = $token_ids[0];
234 3265         3451 my $last_token_id_in_node = $token_ids[-1];
235 3265         9995 my $per_node_data = {
236             element => $element,
237             first_token_id => $first_token_id_in_node,
238             last_token_id => $last_token_id_in_node,
239             };
240              
241 3265 100       8945 if ( $tdesc_list[0]->[Marpa::HTML::Internal::TDesc::TYPE] ne 'POINT' )
242             {
243 2962         4850 $per_node_data->{start_tag_token_id} = $first_token_id_in_node;
244             }
245              
246 3265 100       7173 if ($tdesc_list[-1]->[Marpa::HTML::Internal::TDesc::TYPE] ne 'POINT' )
247             {
248 2895         4481 $per_node_data->{end_tag_token_id} = $last_token_id_in_node;
249             }
250              
251 3265         4159 local $Marpa::HTML::Internal::PER_NODE_DATA = $per_node_data;
252              
253 3265         3645 my $self = $Marpa::HTML::Internal::PARSE_INSTANCE;
254 3265         4603 my $trace_fh = $self->{trace_fh};
255 3265         4630 my $trace_handlers = $self->{trace_handlers};
256              
257 3265         5884 my $tokens = $self->{tokens};
258              
259 3265         3607 my $user_handler;
260             GET_USER_HANDLER: {
261 3265 50       3347 if ( my $id = Marpa::HTML::id() ) {
  3265         7496  
262 0 0       0 if ( $user_handler = $handlers_by_id->{$id} ) {
263 0 0       0 if ($trace_handlers) {
264 0 0       0 say {$trace_fh}
  0         0  
265             "Resolved to user handler by element ($element) and id ($id)"
266             or Carp::croak("Cannot print: $ERRNO");
267             }
268 0         0 last GET_USER_HANDLER;
269             } ## end if ( $user_handler = $handlers_by_id->{$id} )
270             } ## end if ( my $id = Marpa::HTML::id() )
271 3265 100       7317 if ( my $class = Marpa::HTML::class() ) {
272 744 100       1725 if ( $user_handler = $handlers_by_class->{$class} ) {
273 8 50       32 if ($trace_handlers) {
274 0 0       0 say {$trace_fh}
  0         0  
275             "Resolved to user handler by element ($element) and class ($class)"
276             or Carp::croak("Cannot print: $ERRNO");
277             }
278 8         15 last GET_USER_HANDLER;
279             } ## end if ( $user_handler = $handlers_by_class->{$class} )
280             } ## end if ( my $class = Marpa::HTML::class() )
281 3257         4703 $user_handler = $handlers_by_class->{ANY};
282 3257 50 33     7934 if ( $trace_handlers and $user_handler ) {
283 0 0       0 say {$trace_fh} +(
  0 0       0  
284             defined $element
285             ? "Resolved to user handler by element ($element)"
286             : 'Resolved to default user handler'
287             ) or Carp::croak("Cannot print: $ERRNO");
288             } ## end if ( $trace_handlers and $user_handler )
289             } ## end GET_USER_HANDLER:
290              
291 3265 100       6063 if ( defined $user_handler ) {
292              
293             # scalar context needed for the user handler
294             # because so that a bare return returns undef
295             # and not an empty list.
296             return [
297 429         1244 [ VALUED_SPAN => $first_token_id_in_node,
298             $last_token_id_in_node, ( scalar $user_handler->() ),
299             $per_node_data
300             ]
301             ];
302             } ## end if ( defined $user_handler )
303              
304 2836         3616 my $doc = $self->{doc};
305 2836         3797 my @tdesc_result = ();
306              
307 2836         2824 my $first_token_id_in_current_span;
308             my $last_token_id_in_current_span;
309              
310 2836         6443 TDESC: for my $tdesc ( @tdesc_list, ['FINAL'] ) {
311              
312 6933         6703 my $next_tdesc;
313             my $first_token_id;
314 0         0 my $last_token_id;
315 6933         9788 PARSE_TDESC: {
316 6933         6991 my $ref_type = ref $tdesc;
317 6933 50 33     34239 if ( not $ref_type or $ref_type ne 'ARRAY' ) {
318 0         0 $next_tdesc = $tdesc;
319 0         0 last PARSE_TDESC;
320             }
321 6933         9424 given ( $tdesc->[Marpa::HTML::Internal::TDesc::TYPE] ) {
322 6933         9441 when ('POINT') { break; }
  63         115  
323 6870         8280 when ('VALUED_SPAN') {
324 913 50       2375 if (not defined(
325             my $value = $tdesc->[
326             Marpa::HTML::Internal::TDesc::Element::VALUE
327             ]
328             )
329             )
330             {
331             #<<< As of 2009-11-22 pertidy cycles on this
332 0         0 $first_token_id = $tdesc->[
333             Marpa::HTML::Internal::TDesc::START_TOKEN ];
334 0         0 $last_token_id =
335             $tdesc
336             ->[ Marpa::HTML::Internal::TDesc::END_TOKEN
337             ];
338             #>>>
339 0         0 break; # last PARSE_TDESC;
340             } ## end if ( not defined( my $value = $tdesc->[ ...]))
341 913         2013 $next_tdesc = $tdesc;
342             } ## end when ('VALUED_SPAN')
343 5957         6922 when ('FINAL') {
344 2836         11352 $next_tdesc = $tdesc;
345             }
346 3121         4019 when ('UNVALUED_SPAN') {
347 3121         3474 $first_token_id = $tdesc
348             ->[Marpa::HTML::Internal::TDesc::START_TOKEN];
349 3121         6201 $last_token_id =
350             $tdesc->[Marpa::HTML::Internal::TDesc::END_TOKEN];
351             } ## end when ('UNVALUED_SPAN')
352 0         0 default {
353 0         0 Carp::croak("Unknown text description type: $_");
354             }
355             } ## end given
356             } ## end PARSE_TDESC:
357              
358 6933 100 66     21000 if ( defined $first_token_id and defined $last_token_id ) {
359 3121 100       5479 if ( defined $first_token_id_in_current_span ) {
360 952 50       1845 if ( $first_token_id
361             <= $last_token_id_in_current_span + 1 )
362             {
363 952         945 $last_token_id_in_current_span = $last_token_id;
364 952         1447 next TDESC;
365             } ## end if ( $first_token_id <= ...)
366 0         0 push @tdesc_result,
367             [
368             'UNVALUED_SPAN',
369             $first_token_id_in_current_span,
370             $last_token_id_in_current_span
371             ];
372             } ## end if ( defined $first_token_id_in_current_span )
373 2169         2085 $first_token_id_in_current_span = $first_token_id;
374 2169         2052 $last_token_id_in_current_span = $last_token_id;
375 2169         3414 next TDESC;
376             } ## end if ( defined $first_token_id and defined $last_token_id)
377              
378 3812 100       11865 if ( defined $next_tdesc ) {
379 3749 100       6617 if ( defined $first_token_id_in_current_span ) {
380 2169         5528 push @tdesc_result,
381             [
382             'UNVALUED_SPAN',
383             $first_token_id_in_current_span,
384             $last_token_id_in_current_span
385             ];
386              
387 2169         3063 $first_token_id_in_current_span =
388             $last_token_id_in_current_span = undef;
389             } ## end if ( defined $first_token_id_in_current_span )
390 3749         5013 my $ref_type = ref $next_tdesc;
391              
392             last TDESC
393 3749 100 66     18696 if $ref_type eq 'ARRAY'
394             and $next_tdesc->[Marpa::HTML::Internal::TDesc::TYPE]
395             eq 'FINAL';
396 913         2089 push @tdesc_result, $next_tdesc;
397             } ## end if ( defined $next_tdesc )
398              
399             } ## end for my $tdesc ( @tdesc_list, ['FINAL'] )
400              
401 2836         16345 return \@tdesc_result;
402 693         12344 };
403             } ## end sub create_tdesc_handler
404              
405             sub wrap_user_tdesc_handler {
406 313     313   461 my ( $user_handler, $per_node_data ) = @_;
407              
408             return sub {
409 109     109   50005 my ( $dummy, @tdesc_lists ) = @_;
410 109         266 my @tdesc_list = map { @{$_} } grep {defined} @tdesc_lists;
  109         197  
  109         419  
  109         299  
411 109         205 local $Marpa::HTML::Internal::TDESC_LIST = \@tdesc_list;
412 112         319 my @token_ids = sort { $a <=> $b } grep {defined} map {
  220         550  
  110         342  
413 109         183 @{$_}[
  110         163  
414             Marpa::HTML::Internal::TDesc::START_TOKEN,
415             Marpa::HTML::Internal::TDesc::END_TOKEN
416             ]
417             } @tdesc_list;
418              
419 109         194 my $first_token_id = $token_ids[0];
420 109         181 my $last_token_id = $token_ids[-1];
421 109   50     300 $per_node_data //= {};
422 109         229 $per_node_data->{first_token_id} = $first_token_id;
423 109         241 $per_node_data->{last_token_id} = $last_token_id;
424 109         165 local $Marpa::HTML::Internal::PER_NODE_DATA = $per_node_data;
425              
426             # scalar context needed for the user handler
427             # because so that a bare return returns undef
428             # and not an empty list.
429             return [
430 109         419 [ VALUED_SPAN => $first_token_id,
431             $last_token_id, ( scalar $user_handler->() ),
432             $per_node_data
433             ]
434             ];
435              
436 313         2913 };
437             } ## end sub wrap_user_tdesc_handler
438              
439             sub earleme_to_linecol {
440 0     0   0 my ( $self, $token_offset ) = @_;
441 0         0 my $html_parser_tokens = $self->{tokens};
442              
443             # Special start of file for undefined offset
444 0 0       0 if ( not defined $token_offset ) {
445 0         0 return ( 1, 0 );
446             }
447              
448             # Special case needed for a token offset after the last
449             # token. This happens with the EOF.
450 0 0 0     0 if ( $token_offset < 0 or $token_offset > $#{$html_parser_tokens} ) {
  0         0  
451 0         0 $token_offset = $#{$html_parser_tokens};
  0         0  
452             }
453              
454 0         0 return @{ $html_parser_tokens->[$token_offset] }[
  0         0  
455             Marpa::HTML::Internal::Token::LINE,
456             Marpa::HTML::Internal::Token::COLUMN,
457             ];
458              
459             } ## end sub earleme_to_linecol
460              
461             sub earleme_to_offset {
462              
463 0     0   0 my ( $self, $token_offset ) = @_;
464 0         0 my $html_parser_tokens = $self->{tokens};
465              
466             # Special start of file for undefined offset
467 0 0       0 if ( not defined $token_offset ) {
468 0         0 return 0;
469             }
470              
471             # Special case needed for a token offset after the last
472             # token. This happens with the EOF.
473 0         0 my $offset;
474 0 0 0     0 if ( $token_offset < 0 or $token_offset > $#{$html_parser_tokens} ) {
  0         0  
475 0         0 $offset = length ${ $self->{document} };
  0         0  
476             }
477             else {
478 0         0 $offset =
479             $html_parser_tokens->[$token_offset]
480             ->[Marpa::HTML::Internal::Token::END_OFFSET];
481             }
482 0         0 return $offset;
483              
484             } ## end sub earleme_to_offset
485              
486             my %ARGS = (
487             start => q{'S',offset,offset_end,tagname,attr},
488             end => q{'E',offset,offset_end,tagname},
489             text => q{'T',offset,offset_end,is_cdata},
490             process => q{'PI',offset,offset_end},
491             comment => q{'C',offset,offset_end},
492             declaration => q{'D',offset,offset_end},
493              
494             # options that default on
495             unbroken_text => 1,
496             );
497              
498             sub add_handler {
499 413     413   674 my ( $self, $handler_description ) = @_;
500 413   50     1064 my $ref_type = ref $handler_description || 'not a reference';
501 413 50       957 Carp::croak(
502             "Long form handler description should be ref to hash, but it is $ref_type"
503             ) if $ref_type ne 'HASH';
504 413         866 my $element = delete $handler_description->{element};
505 413         674 my $id = delete $handler_description->{id};
506 413         763 my $class = delete $handler_description->{class};
507 413         705 my $pseudoclass = delete $handler_description->{pseudoclass};
508 413         644 my $action = delete $handler_description->{action};
509 0         0 Carp::croak(
510             'Unknown option(s) in Long form handler description: ',
511 413         1254 ( join q{ }, keys %{$handler_description} )
512 413 50       549 ) if scalar keys %{$handler_description};
513              
514 413 50       1076 Carp::croak('Handler action must be CODE ref')
515             if ref $action ne 'CODE';
516              
517 413 100 100     1491 $element = ( not $element or $element eq q{*} ) ? 'ANY' : lc $element;
518 413 100       957 if ( defined $pseudoclass ) {
519 317         913 $self->{user_handlers_by_pseudoclass}->{$element}->{$pseudoclass} =
520             $action;
521 317         1302 return 1;
522             }
523              
524 96 50       199 if ( defined $id ) {
525 0         0 $self->{user_handlers_by_id}->{$element}->{ lc $id } = $action;
526 0         0 return 1;
527             }
528 96 100       328 $class = defined $class ? lc $class : 'ANY';
529 96         441 $self->{user_handlers_by_class}->{$element}->{$class} = $action;
530 96         354 return 1;
531             } ## end sub add_handler
532              
533             sub add_handlers_from_hashes {
534 0     0   0 my ( $self, $handler_specs ) = @_;
535 0   0     0 my $ref_type = ref $handler_specs || 'not a reference';
536 0 0       0 Carp::croak("handlers arg must must be ref to ARRAY, it is $ref_type")
537             if $ref_type ne 'ARRAY';
538 0         0 for my $handler_spec ( keys %{$handler_specs} ) {
  0         0  
539 0         0 add_handler( $self, $handler_spec );
540             }
541 0         0 return 1;
542             } ## end sub add_handlers_from_hashes
543              
544             sub add_handlers {
545 90     90   171 my ( $self, $handler_specs ) = @_;
546 90         171 HANDLER_SPEC: for my $specifier ( keys %{$handler_specs} ) {
  90         472  
547 413         552 my ( $element, $id, $class, $pseudoclass );
548 413         728 my $action = $handler_specs->{$specifier};
549 413 100 66     5167 ( $element, $id ) = ( $specifier =~ /\A ([^#]*) [#] (.*) \z/xms )
      100        
550             or ( $element, $class ) =
551             ( $specifier =~ /\A ([^.]*) [.] (.*) \z/xms )
552             or ( $element, $pseudoclass ) =
553             ( $specifier =~ /\A ([^:]*) [:] (.*) \z/xms )
554             or $element = $specifier;
555 413 50 66     2942 if ($pseudoclass
556             and not $pseudoclass ~~ [
557             qw(TOP PI DECL COMMENT PROLOG TRAILER WHITESPACE CDATA PCDATA CRUFT)
558             ]
559             )
560             {
561 0         0 Carp::croak( qq{pseudoclass "$pseudoclass" is not known:\n},
562             "Specifier was $specifier\n" );
563             } ## end if ( $pseudoclass and not $pseudoclass ~~ [ ...])
564 413 50 66     1762 if ( $pseudoclass and $element ) {
565 0         0 Carp::croak(
566             qq{pseudoclass "$pseudoclass" may not have an element specified:\n},
567             "Specifier was $specifier\n"
568             );
569             } ## end if ( $pseudoclass and $element )
570             add_handler(
571 413         2161 $self,
572             { element => $element,
573             id => $id,
574             class => $class,
575             pseudoclass => $pseudoclass,
576             action => $action
577             }
578             );
579             } ## end for my $specifier ( keys %{$handler_specs} )
580              
581 90         252 return 1;
582             } ## end sub add_handlers
583              
584             # If we factor this package, this will be the constructor.
585             ## no critic (Subroutines::RequireArgUnpacking)
586             sub create {
587              
588             ## use critic
589 91     91   203 my $self = {};
590 91         360 $self->{trace_fh} = \*STDERR;
591 91         289 ARG: for my $arg (@_) {
592 90   50     482 my $ref_type = ref $arg || 'not a reference';
593 90 50       358 if ( $ref_type eq 'HASH' ) {
594 90         375 Marpa::HTML::Internal::add_handlers( $self, $arg );
595 90         293 next ARG;
596             }
597 0 0       0 Carp::croak("Argument must be hash or refs to hash: it is $ref_type")
598             if $ref_type ne 'REF';
599 0         0 my $option_hash = ${$arg};
  0         0  
600 0   0     0 $ref_type = ref $option_hash || 'not a reference';
601 0 0       0 Carp::croak(
602             "Argument must be hash or refs to hash: it is ref to $ref_type")
603             if $ref_type ne 'HASH';
604 0         0 OPTION: for my $option ( keys %{$option_hash} ) {
  0         0  
605 0 0       0 if ( $option eq 'handlers' ) {
606 0         0 add_handlers_from_hashes( $self, $option_hash->{$option} );
607             }
608 0 0       0 if (not $option ~~ [
609             qw(trace_fh trace_values trace_handlers trace_actions
610             trace_conflicts trace_ambiguity trace_rules trace_QDFA
611             trace_earley_sets trace_terminals trace_cruft)
612             ]
613             )
614             {
615 0         0 Carp::croak("unknown option: $option");
616             } ## end if ( not $option ~~ [ ...])
617 0         0 $self->{$option} = $option_hash->{$option};
618             } ## end for my $option ( keys %{$option_hash} )
619             } ## end for my $arg (@_)
620 91         228 return $self;
621             } ## end sub create
622              
623             # block_element is for block-level ONLY elements.
624             # head is for anything legal inside the HTML header.
625             # Note that isindex can be both a head element and
626             # and block level element in the body.
627             # ISINDEX is classified as a header_element
628             %Marpa::HTML::Internal::ELEMENT_TYPE = (
629             ( map { $_ => 'block_element' }
630             qw(
631             h1 h2 h3 h4 h5 h6
632             ul ol dir menu
633             pre
634             p dl div center
635             noscript noframes
636             blockquote form hr
637             table fieldset address
638             )
639             ),
640             ( map { $_ => 'header_element' }
641             qw(
642             script style meta link object title isindex base
643             )
644             ),
645             ( map { $_ => 'list_item_element' } qw( li dd dt ) ),
646             ( map { $_ => 'table_cell_element' } qw( td th ) ),
647             ( map { $_ => 'table_row_element' } qw( tr ) ),
648             );
649              
650             @Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS = qw(
651             E_html
652             E_body
653             S_table
654             E_head
655             E_table
656             E_tbody
657             E_tr
658             E_td
659             S_td
660             S_tr
661             S_tbody
662             S_head
663             S_body
664             S_html
665             );
666              
667             %Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS = ();
668             for my $rank ( 0 .. $#Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS ) {
669             $Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS{
670             $Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS[$rank] } = $rank;
671             }
672              
673             %Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY = ();
674             {
675             my $hierarchy = <<'END_OF_STRING';
676             th td
677             tr
678             col
679             caption colgroup tfoot thead tbody
680             table
681             body head
682             html
683             END_OF_STRING
684              
685             my $iota = 0;
686             my @hierarchy;
687             for my $level ( split /\n/xms, $hierarchy ) {
688             push @hierarchy,
689             map { ( "S_$_" => $iota, "E_$_" => $iota ) }
690             ( split q{ }, $level );
691             $iota++;
692             } ## end for my $level ( split /\n/xms, $hierarchy )
693             %Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY = @hierarchy;
694             $Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY{EOF} =
695             $Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY{E_tbody};
696             }
697              
698             # This display set to be ignored
699             # until the HTML::Implementation doc
700             # is ready.
701              
702             # Marpa::HTML::Display
703             # name: HTML BNF
704             # ignore: 1
705             # start-after-line: END_OF_BNF
706             # end-before-line: '^END_OF_BNF$'
707              
708             my $BNF = <<'END_OF_BNF';
709             cruft ::= CRUFT
710             comment ::= C
711             pi ::= PI
712             decl ::= D
713             pcdata ::= PCDATA
714             cdata ::= CDATA
715             whitespace ::= WHITESPACE
716             SGML_item ::= comment
717             SGML_item ::= pi
718             SGML_item ::= decl
719             SGML_flow_item ::= SGML_item
720             SGML_flow_item ::= whitespace
721             SGML_flow_item ::= cruft
722             SGML_flow ::= SGML_flow_item*
723             document ::= prolog ELE_html trailer EOF
724             prolog ::= SGML_flow
725             trailer ::= SGML_flow
726             ELE_html ::= S_html Contents_html E_html
727             Contents_html ::= SGML_flow ELE_head SGML_flow ELE_body SGML_flow
728             ELE_head ::= S_head Contents_head E_head
729             Contents_head ::= head_item*
730             ELE_body ::= S_body flow E_body
731             ELE_table ::= S_table table_flow E_table
732             ELE_tbody ::= S_tbody table_section_flow E_tbody
733             ELE_tr ::= S_tr table_row_flow E_tr
734             ELE_td ::= S_td flow E_td
735             flow ::= flow_item*
736             flow_item ::= cruft
737             flow_item ::= SGML_item
738             flow_item ::= ELE_table
739             flow_item ::= list_item_element
740             flow_item ::= header_element
741             flow_item ::= block_element
742             flow_item ::= inline_element
743             flow_item ::= whitespace
744             flow_item ::= cdata
745             flow_item ::= pcdata
746             head_item ::= header_element
747             head_item ::= cruft
748             head_item ::= whitespace
749             head_item ::= SGML_item
750             inline_flow ::= inline_flow_item*
751             inline_flow_item ::= pcdata_flow_item
752             inline_flow_item ::= inline_element
753             pcdata_flow ::= pcdata_flow_item*
754             pcdata_flow_item ::= cdata
755             pcdata_flow_item ::= pcdata
756             pcdata_flow_item ::= cruft
757             pcdata_flow_item ::= whitespace
758             pcdata_flow_item ::= SGML_item
759             Contents_select ::= select_flow_item*
760             select_flow_item ::= ELE_optgroup
761             select_flow_item ::= ELE_option
762             select_flow_item ::= SGML_flow_item
763             Contents_optgroup ::= optgroup_flow_item*
764             optgroup_flow_item ::= ELE_option
765             optgroup_flow_item ::= SGML_flow_item
766             list_item_flow ::= list_item_flow_item*
767             list_item_flow_item ::= cruft
768             list_item_flow_item ::= SGML_item
769             list_item_flow_item ::= header_element
770             list_item_flow_item ::= block_element
771             list_item_flow_item ::= inline_element
772             list_item_flow_item ::= whitespace
773             list_item_flow_item ::= cdata
774             list_item_flow_item ::= pcdata
775             Contents_colgroup ::= colgroup_flow_item*
776             colgroup_flow_item ::= ELE_col
777             colgroup_flow_item ::= SGML_flow_item
778             table_row_flow ::= table_row_flow_item*
779             table_row_flow_item ::= ELE_th
780             table_row_flow_item ::= ELE_td
781             table_row_flow_item ::= SGML_flow_item
782             table_section_flow ::= table_section_flow_item*
783             table_section_flow_item ::= table_row_element
784             table_section_flow_item ::= SGML_flow_item
785             table_row_element ::= ELE_tr
786             table_flow ::= table_flow_item*
787             table_flow_item ::= ELE_colgroup
788             table_flow_item ::= ELE_thead
789             table_flow_item ::= ELE_tfoot
790             table_flow_item ::= ELE_tbody
791             table_flow_item ::= ELE_caption
792             table_flow_item ::= ELE_col
793             table_flow_item ::= SGML_flow_item
794             empty ::=
795             END_OF_BNF
796              
797             @Marpa::HTML::Internal::CORE_RULES = ();
798              
799             my %handler = (
800             cruft => '!CRUFT_handler',
801             comment => '!COMMENT_handler',
802             pi => '!PI_handler',
803             decl => '!DECL_handler',
804             document => '!TOP_handler',
805             whitespace => '!WHITESPACE_handler',
806             pcdata => '!PCDATA_handler',
807             cdata => '!CDATA_handler',
808             prolog => '!PROLOG_handler',
809             trailer => '!TRAILER_handler',
810             );
811              
812             for my $bnf_production ( split /\n/xms, $BNF ) {
813             my $sequence = ( $bnf_production =~ s/ [*] \s* $//xms );
814             $bnf_production =~ s/ \s* [:][:][=] \s* / /xms;
815             my @symbols = ( split q{ }, $bnf_production );
816             my $lhs = shift @symbols;
817             my %rule_descriptor = (
818             lhs => $lhs,
819             rhs => \@symbols,
820             );
821             if ($sequence) {
822             $rule_descriptor{min} = 0;
823             }
824             if ( my $handler = $handler{$lhs} ) {
825             $rule_descriptor{action} = $handler;
826             }
827             elsif ( $lhs =~ /^ELE_/xms ) {
828             $rule_descriptor{action} = "!$lhs";
829             }
830             push @Marpa::HTML::Internal::CORE_RULES, \%rule_descriptor;
831             } ## end for my $bnf_production ( split /\n/xms, $BNF )
832              
833             @Marpa::HTML::Internal::CORE_TERMINALS =
834             qw(C D PI CRUFT CDATA PCDATA WHITESPACE EOF );
835              
836             push @Marpa::HTML::Internal::CORE_TERMINALS,
837             keys %Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS;
838              
839 6     6   60 no strict 'refs';
  6         14  
  6         304  
840             *{'Marpa::HTML::Internal::default_action'} = create_tdesc_handler();
841 6     6   45 use strict;
  6         12  
  6         25537  
842              
843             %Marpa::HTML::Internal::EMPTY_ELEMENT = map { $_ => 1 } qw(
844             area base basefont br col frame hr
845             img input isindex link meta param);
846              
847             %Marpa::HTML::Internal::CONTENTS = (
848             'p' => 'inline_flow',
849             'select' => 'Contents_select',
850             'option' => 'pcdata_flow',
851             'optgroup' => 'Contents_optgroup',
852             'dt' => 'inline_flow',
853             'dd' => 'list_item_flow',
854             'li' => 'list_item_flow',
855             'colgroup' => 'Contents_colgroup',
856             'thead' => 'table_section_flow',
857             'tfoot' => 'table_section_flow',
858             'tbody' => 'table_section_flow',
859             'table' => 'table_flow',
860             ( map { $_ => 'empty' } keys %Marpa::HTML::Internal::EMPTY_ELEMENT ),
861             );
862              
863             sub parse {
864 91     91   244 my ( $self, $document_ref ) = @_;
865              
866 91         221 my %start_tags = ();
867 91         169 my %end_tags = ();
868              
869 91 50       350 Carp::croak(
870             "parse() already run on this object\n",
871             'For a new parse, create a new object'
872             ) if $self->{document};
873              
874 91         193 my $trace_cruft = $self->{trace_cruft};
875 91   50     514 my $trace_terminals = $self->{trace_terminals} // 0;
876 91         183 my $trace_conflicts = $self->{trace_conflicts};
877 91         164 my $trace_fh = $self->{trace_fh};
878 91         224 my $ref_type = ref $document_ref;
879 91         311 Carp::croak('Arg to parse() must be ref to string')
880             if not $ref_type
881             or $ref_type ne 'SCALAR'
882 91 50 33     578 or not defined ${$document_ref};
      33        
883              
884 91         147 my %pull_parser_args;
885 91         376 my $document = $pull_parser_args{doc} = $self->{document} = $document_ref;
886 91   33     1074 my $pull_parser =
887             HTML::PullParser->new( %pull_parser_args,
888             %Marpa::HTML::PULL_PARSER_OPTIONS )
889             || Carp::croak('Could not create pull parser');
890              
891 91         16295 my @tokens = ();
892              
893 91         283 my %terminals = map { $_ => 1 } @Marpa::HTML::Internal::CORE_TERMINALS;
  2002         4387  
894 91         1147 my %optional_terminals = %Marpa::HTML::Internal::CORE_OPTIONAL_TERMINALS;
895 91         267 my @html_parser_tokens = ();
896 91         203 my @marpa_tokens = ();
897             HTML_PARSER_TOKEN:
898 91         469 while ( my $html_parser_token = $pull_parser->get_token ) {
899 1361         2485 my ( $token_type, $line, $column, $offset, $offset_end ) =
900 1361         17865 @{$html_parser_token};
901              
902             # If it's a virtual token from HTML::Parser,
903             # pretend it never existed.
904             # We figure out where the missing tags are,
905             # and HTML::Parser's guesses are not helpful.
906 1361 100       2975 next HTML_PARSER_TOKEN if $offset_end <= $offset;
907              
908 1359         2262 my $token_number = scalar @html_parser_tokens;
909 1359         1893 push @html_parser_tokens, $html_parser_token;
910              
911 1359         1610 given ($token_type) {
912 1359         2444 when ('T') {
913 626         905 my $is_cdata = $html_parser_token
914             ->[Marpa::HTML::Internal::Token::IS_CDATA];
915 626         10661 push @marpa_tokens,
916             [
917             ( substr(
918 626 50       737 ${$document}, $offset,
    100          
919             ( $offset_end - $offset )
920             ) =~ / \A \s* \z /xms ? 'WHITESPACE'
921             : $is_cdata ? 'CDATA'
922             : 'PCDATA'
923             ),
924             [ [ 'UNVALUED_SPAN', $token_number, $token_number ] ],
925             ];
926             } ## end when ('T')
927 733         1012 when ('S') {
928 398         580 my $tag_name = $html_parser_token
929             ->[Marpa::HTML::Internal::Token::TAGNAME];
930 398         699 $start_tags{$tag_name}++;
931 398         678 my $terminal = "S_$tag_name";
932 398         606 $terminals{$terminal}++;
933 398         2748 push @marpa_tokens,
934             [
935             $terminal,
936             [ [ 'UNVALUED_SPAN', $token_number, $token_number ] ],
937             ];
938             } ## end when ('S')
939 335         500 when ('E') {
940 331         505 my $tag_name = $html_parser_token
941             ->[Marpa::HTML::Internal::Token::TAGNAME];
942 331         566 $end_tags{$tag_name}++;
943 331         506 my $terminal = "E_$tag_name";
944 331         674 $terminals{$terminal}++;
945 331         2428 push @marpa_tokens,
946             [
947             $terminal,
948             [ [ 'UNVALUED_SPAN', $token_number, $token_number ] ],
949             ];
950             } ## end when ('E')
951 4         15 when ( [qw(C D)] ) {
952 4         34 push @marpa_tokens,
953             [
954             $_, [ [ 'UNVALUED_SPAN', $token_number, $token_number ] ],
955             ];
956             } ## end when ( [qw(C D)] )
957 0         0 when ( ['PI'] ) {
958 0         0 push @marpa_tokens,
959             [
960             $_, [ [ 'UNVALUED_SPAN', $token_number, $token_number ] ],
961             ];
962             } ## end when ( ['PI'] )
963 0         0 default { Carp::croak("Unprovided-for event: $_") }
  0         0  
964             } ## end given
965             } ## end while ( my $html_parser_token = $pull_parser->get_token)
966              
967             # Points AFTER the last HTML
968             # Parser token.
969             # The other logic needs to be ready for this.
970 91         1379 push @marpa_tokens, [ 'EOF', [ ['POINT'] ] ];
971              
972 91         184 $pull_parser = undef; # conserve memory
973              
974 91         1617 my @rules = @Marpa::HTML::Internal::CORE_RULES;
975 91         872 my @terminals = keys %terminals;
976              
977 91         560 my %pseudoclass_element_actions = ();
978 91         262 my %element_actions = ();
979              
980             # Special cases which are dealt with elsewhere.
981             # As of now the only special cases are elements with optional
982             # start and end tags
983 91         227 for my $special_element (qw(html head body table tbody tr td)) {
984 637         767 delete $start_tags{$special_element};
985 637         1600 $element_actions{"!ELE_$special_element"} = $special_element;
986             }
987              
988 91         335 ELEMENT: for ( keys %start_tags ) {
989 50         113 my $start_tag = "S_$_";
990 50         102 my $end_tag = "E_$_";
991 50   100     258 my $contents = $Marpa::HTML::Internal::CONTENTS{$_} // 'flow';
992 50   100     331 my $element_type = $Marpa::HTML::Internal::ELEMENT_TYPE{$_}
993             // 'inline_element';
994              
995 50         3270 push @rules,
996             {
997             lhs => $element_type,
998             rhs => ["ELE_$_"],
999             },
1000             {
1001             lhs => "ELE_$_",
1002             rhs => [ $start_tag, $contents, $end_tag ],
1003             action => "!ELE_$_",
1004             };
1005              
1006             # There may be no
1007             # end tag in the input.
1008             # This silences the warning.
1009 50 100       185 if ( not $terminals{$end_tag} ) {
1010 21         41 push @terminals, $end_tag;
1011 21         52 $terminals{$end_tag}++;
1012             }
1013              
1014             # Make each new optional terminal the highest ranking
1015 50         113 $optional_terminals{$end_tag} = keys %optional_terminals;
1016              
1017 50         249 $element_actions{"!ELE_$_"} = $_;
1018             } ## end for ( keys %start_tags )
1019              
1020             # The question is where to put cruft -- in the current element,
1021             # or at a higher level. As a first step, we set up a system of
1022             # levels for specific elements, going from the lowest, where no
1023             # cruft is allowed, to the highest, where everything is
1024             # acceptable as cruft, if only because it has nowhere else to go.
1025              
1026             # First step, set up the levels, using specific elements.
1027             # Some of these elements will are stand-ins for large category.
1028             # For example, the HR element stands in for those elements
1029             # such as empty elements,
1030             # which tolerate zero cruft, while SPAN stands in for
1031             # inline elements and DIV stands in for the class of
1032             # block-level elements
1033              
1034 91         228 my %ok_as_cruft = ();
1035 91         221 DECIDE_CRUFT_TREATMENT: {
1036 91         146 my %level = ();
1037 91         1434 my @elements_by_level = (
1038             [qw( HR HEAD )],
1039             [qw( SPAN OPTION )],
1040             [qw( LI OPTGROUP DD DT )],
1041             [qw( DIR MENU )],
1042             [qw( DIV )],
1043             [qw( UL OL DL )],
1044             [qw( TH TD )],
1045             [qw( TR )],
1046             [qw( COL )],
1047             [qw( CAPTION COLGROUP THEAD TFOOT TBODY )],
1048             [qw( TABLE )],
1049             [qw( BODY )],
1050             [qw( HTML )],
1051             );
1052              
1053             # EOF comes after everything -- it is
1054             # the highest level of all
1055 91         279 $level{EOF} = scalar @elements_by_level;
1056              
1057             # Assign levels to the end tags of the elements
1058             # in the above table.
1059 91         339 for my $level ( 0 .. $#elements_by_level ) {
1060 1183         1307 for my $element ( @{ $elements_by_level[$level] } ) {
  1183         1981  
1061 2366         8527 $level{ 'S_' . lc $element } = $level{ 'E_' . lc $element } =
1062             $level;
1063             }
1064             } ## end for my $level ( 0 .. $#elements_by_level )
1065              
1066 91         245 my $no_cruft_allowed = $level{E_hr};
1067 91         168 my $block_level = $level{E_div};
1068 91         165 my $inline_level = $level{E_span};
1069              
1070             # Now that we have set out the structure of levels
1071             # fill it in for all the terminals we have yet to
1072             # define.
1073 3426         5909 TERMINAL:
1074 91         489 for my $terminal ( grep { not defined $level{$_} }
1075             ( @terminals, keys %optional_terminals ) )
1076             {
1077              
1078             # With the exception of EOF,
1079             # only tags can have levels because only they really
1080             # tell us anyting about "state" --
1081             # whether we are awaiting something
1082             # or are inside something.
1083 721 100       1715 if ( $terminal !~ /^[SE]_/xms ) {
1084 637         942 $level{$terminal} = $no_cruft_allowed;
1085 637         927 next TERMINAL;
1086             }
1087 84         151 my $element = substr $terminal, 2;
1088 84 100       213 if ( $Marpa::HTML::Internal::EMPTY_ELEMENT{$element} ) {
1089 24         37 $level{$terminal} = $no_cruft_allowed;
1090 24         49 next TERMINAL;
1091             }
1092              
1093 60         98 my $element_type = $Marpa::HTML::Internal::ELEMENT_TYPE{$element};
1094 60 50 33     405 if ( defined $element_type
1095             and $element_type ~~ [qw(block_element header_element)] )
1096             {
1097 60         112 $level{$terminal} = $block_level;
1098 60         165 next TERMINAL;
1099             } ## end if ( defined $element_type and $element_type ~~ [...])
1100              
1101 0         0 $level{$terminal} = $inline_level;
1102              
1103             } ## end for my $terminal ( grep { not defined $level{$_} } ( ...))
1104              
1105             EXPECTED_TERMINAL:
1106 91         520 for my $expected_terminal ( keys %optional_terminals ) {
1107              
1108             # Regardless of levels, allow no cruft before a start tag.
1109             # Start whatever it is, then deal with the cruft.
1110 1324 100       4460 next EXPECTED_TERMINAL if $expected_terminal =~ /^S_/xms;
1111              
1112             # For end tags, use the levels
1113 687         900 TERMINAL: for my $actual_terminal (@terminals) {
1114 15970         38925 $ok_as_cruft{$expected_terminal}{$actual_terminal} =
1115             $level{$actual_terminal} < $level{$expected_terminal};
1116             }
1117             } ## end for my $expected_terminal ( keys %optional_terminals )
1118              
1119             } ## end DECIDE_CRUFT_TREATMENT:
1120              
1121 91         1573 my $grammar = Marpa::HTML::Grammar->new(
1122             { rules => \@rules,
1123             start => 'document',
1124             terminals => \@terminals,
1125             inaccessible_ok => 1,
1126             unproductive_ok => 1,
1127             default_action => 'Marpa::HTML::Internal::default_action',
1128             strip => 0,
1129             }
1130             );
1131 91         2106424 $grammar->precompute();
1132              
1133 91 50       27728292 if ( $self->{trace_rules} ) {
1134 0 0       0 say {$trace_fh} $grammar->show_rules()
  0         0  
1135             or Carp::croak("Cannot print: $ERRNO");
1136             }
1137 91 50       464 if ( $self->{trace_QDFA} ) {
1138 0 0       0 say {$trace_fh} $grammar->show_QDFA()
  0         0  
1139             or Carp::croak("Cannot print: $ERRNO");
1140             }
1141              
1142 91         1661 my $recce = Marpa::HTML::Recognizer->new(
1143             { grammar => $grammar,
1144             trace_terminals => $self->{trace_terminals},
1145             trace_earley_sets => $self->{trace_earley_sets},
1146             mode => 'stream',
1147             }
1148             );
1149              
1150 91         49745 $self->{recce} = $recce;
1151 91         374 $self->{tokens} = \@html_parser_tokens;
1152              
1153             # These variables track virtual start tokens as
1154             # a protection against infinite loops.
1155 91         247 my %start_virtuals_used = ();
1156 91         189 my $earleme_of_last_start_virtual = -1;
1157              
1158 91         277 my $marpa_token = shift @marpa_tokens;
1159 91         382 RECCE_RESPONSE: while ( defined $marpa_token ) {
1160              
1161 2125         2630 my $read_result = $recce->read( @{$marpa_token} );
  2125         8355  
1162 2125 100       1254697 if ( defined $read_result ) {
1163 1450         2797 $marpa_token = shift @marpa_tokens;
1164 1450         4796 next RECCE_RESPONSE;
1165             }
1166              
1167 675         1246 my $actual_terminal = $marpa_token->[0];
1168 675 50       1849 if ($trace_terminals) {
1169 0 0       0 say {$trace_fh} 'Literal Token not accepted: ', $actual_terminal
  0         0  
1170             or Carp::croak("Cannot print: $ERRNO");
1171             }
1172              
1173 675         1024 my $virtual_token_to_add;
1174              
1175 675         751 FIND_VIRTUAL_TOKEN: {
1176 675         1013 my $virtual_terminal;
1177 214         1157 my @virtuals_expected =
1178 4561         12731 sort { $optional_terminals{$a} <=> $optional_terminals{$b} }
1179 675         2093 grep { defined $optional_terminals{$_} }
1180 675         861 @{ $recce->terminals_expected() };
1181 675 50       1668 if ($trace_conflicts) {
1182 0 0       0 say {$trace_fh} 'Conflict of virtual choices'
  0         0  
1183             or Carp::croak("Cannot print: $ERRNO");
1184 0 0       0 say {$trace_fh} "Actual Token is $actual_terminal"
  0         0  
1185             or Carp::croak("Cannot print: $ERRNO");
1186 0 0       0 say {$trace_fh} +( scalar @virtuals_expected ),
  0         0  
1187             ' virtual terminals expected: ', join q{ },
1188             @virtuals_expected
1189             or Carp::croak("Cannot print: $ERRNO");
1190             } ## end if ($trace_conflicts)
1191              
1192             LOOKAHEAD_VIRTUAL_TERMINAL:
1193 675         1996 while ( my $candidate = pop @virtuals_expected ) {
1194              
1195             # Start an implied table only if the next token is one which
1196             # can only occur inside a table
1197 817 100       2393 if ( $candidate eq 'S_table' ) {
1198 90 100       906 if (not $actual_terminal ~~ [
1199             qw(
1200             S_caption S_col S_colgroup S_thead S_tfoot
1201             S_tbody S_tr S_th S_td
1202             E_caption E_col E_colgroup E_thead E_tfoot
1203             E_tbody E_tr E_th E_td
1204             E_table
1205             )
1206             ]
1207             )
1208             {
1209 86         457 next LOOKAHEAD_VIRTUAL_TERMINAL;
1210             } ## end if ( not $actual_terminal ~~ [ qw(...)])
1211              
1212             # The above test implies the others below, so
1213             # this virtual table start terminal is OK.
1214 4         23 $virtual_terminal = $candidate;
1215 4         13 last LOOKAHEAD_VIRTUAL_TERMINAL;
1216             } ## end if ( $candidate eq 'S_table' )
1217              
1218             # For other than , we are permissive.
1219             # Unless the lookahead gives us
1220             # a specific reason to
1221             # reject the virtual terminal, we accept it.
1222              
1223             # No need to check lookahead, unless we are starting
1224             # an element
1225 727 100       3384 if ( $candidate !~ /^S_/xms ) {
1226 372         826 $virtual_terminal = $candidate;
1227 372         755 last LOOKAHEAD_VIRTUAL_TERMINAL;
1228             }
1229              
1230             #<<< no perltidy cycles as of 12 Mar 2010
1231              
1232             my $candidate_level =
1233             $Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY{
1234 355         956 $candidate };
1235              
1236             #>>>
1237             # If the candidate is not part of the hierarchy, no need to check
1238             # lookahead
1239 355 50       804 if ( not defined $candidate_level ) {
1240 0         0 $virtual_terminal = $candidate;
1241 0         0 last LOOKAHEAD_VIRTUAL_TERMINAL;
1242             }
1243              
1244             my $actual_terminal_level =
1245             $Marpa::HTML::Internal::VIRTUAL_TOKEN_HIERARCHY{
1246 355         708 $actual_terminal};
1247              
1248             # If the actual terminal is not part of the hierarchy, no need to check
1249             # lookahead, either
1250 355 100       1013 if ( not defined $actual_terminal_level ) {
1251 179         240 $virtual_terminal = $candidate;
1252 179         359 last LOOKAHEAD_VIRTUAL_TERMINAL;
1253             }
1254              
1255             # Here we are trying to deal with a higher-level element's
1256             # start or end, by starting a new lower level element.
1257             # This won't work, because we'll have to close it
1258             # immediately with another virtual terminal.
1259             # At best this means useless, empty elements.
1260             # At worst, it means an infinite loop where
1261             # empty lower-level elements are repeatedly added.
1262             #
1263             next LOOKAHEAD_VIRTUAL_TERMINAL
1264 176 100       620 if $candidate_level <= $actual_terminal_level;
1265              
1266 120         195 $virtual_terminal = $candidate;
1267 120         241 last LOOKAHEAD_VIRTUAL_TERMINAL;
1268              
1269             } ## end while ( my $candidate = pop @virtuals_expected )
1270              
1271 675 50       1570 if ($trace_terminals) {
1272 0 0       0 say {$trace_fh} 'Converting Token: ', $actual_terminal
  0         0  
1273             or Carp::croak("Cannot print: $ERRNO");
1274 0 0       0 if ( defined $virtual_terminal ) {
1275 0 0       0 say {$trace_fh} 'Candidate as Virtual Token: ',
  0         0  
1276             $virtual_terminal
1277             or Carp::croak("Cannot print: $ERRNO");
1278             }
1279             } ## end if ($trace_terminals)
1280              
1281             # Depending on the expected (optional or virtual)
1282             # terminal and the actual
1283             # terminal, we either want to add the actual one as cruft, or add
1284             # the virtual one to move on in the parse.
1285              
1286 675 50 33     1832 if ( $trace_terminals > 1 and defined $virtual_terminal ) {
1287 0         0 say {$trace_fh}
  0         0  
1288             "OK as cruft when expecting $virtual_terminal: ",
1289 0 0       0 join q{ }, keys %{ $ok_as_cruft{$virtual_terminal} }
1290             or Carp::croak("Cannot print: $ERRNO");
1291             } ## end if ( $trace_terminals > 1 and defined $virtual_terminal)
1292              
1293 675 50       1413 last FIND_VIRTUAL_TOKEN if not defined $virtual_terminal;
1294             last FIND_VIRTUAL_TOKEN
1295 675 100       2829 if $ok_as_cruft{$virtual_terminal}{$actual_terminal};
1296              
1297 673 100       2040 CHECK_FOR_INFINITE_LOOP: {
1298              
1299             # It is sufficient to check for start tags.
1300             # Just ending things will never cause an infinite loop.
1301 673         744 last CHECK_FOR_INFINITE_LOOP if $virtual_terminal !~ /^S_/xms;
1302              
1303             # Are we at the same earleme as we were when the last
1304             # virtual start was added? If not, no problem.
1305             # But we need to reinitialize.
1306 303         1186 my $current_earleme = $recce->current_earleme();
1307 303 50       1571 if ( $current_earleme != $earleme_of_last_start_virtual ) {
1308 303         492 $earleme_of_last_start_virtual = $current_earleme;
1309 303         508 %start_virtuals_used = ();
1310 303         697 last CHECK_FOR_INFINITE_LOOP;
1311             }
1312              
1313             # Is this the first time we've added this start
1314             # terminal? If so, we're OK.
1315             last CHECK_FOR_INFINITE_LOOP
1316 0 0       0 if $start_virtuals_used{$virtual_terminal}++ <= 1;
1317              
1318             # Attempt to add duplicate.
1319             # Give up on adding virtual at this location,
1320             # and warn the user.
1321 0         0 ( my $tagname = $virtual_terminal ) =~ s/^S_//xms;
1322 0 0       0 say {$trace_fh}
  0         0  
1323             "Warning: attempt to add <$tagname> twice at the same place"
1324             or Carp::croak("Cannot print: $ERRNO");
1325 0         0 last FIND_VIRTUAL_TOKEN;
1326              
1327             } ## end CHECK_FOR_INFINITE_LOOP:
1328              
1329 673         1085 my $tdesc_list = $marpa_token->[1];
1330 673         1421 my $first_tdesc_start_token =
1331             $tdesc_list->[0]->[Marpa::HTML::Internal::TDesc::START_TOKEN];
1332 673         3325 $virtual_token_to_add = [
1333             $virtual_terminal, [ [ 'POINT', $first_tdesc_start_token ] ]
1334             ];
1335              
1336             } ## end FIND_VIRTUAL_TOKEN:
1337              
1338 675 100       1830 if ( defined $virtual_token_to_add ) {
1339 673         813 $recce->read( @{$virtual_token_to_add} );
  673         2474  
1340 673         470858 next RECCE_RESPONSE;
1341             }
1342              
1343             # If we didn't find a token to add, add the
1344             # current physical token as CRUFT.
1345              
1346 2 50       6 if ($trace_terminals) {
1347 0 0       0 say {$trace_fh} 'Adding actual token as cruft: ', $actual_terminal
  0         0  
1348             or Carp::croak("Cannot print: $ERRNO");
1349             }
1350              
1351             # Cruft tokens are not virtual.
1352             # They are the real things, hacked up.
1353 2         4 $marpa_token->[0] = 'CRUFT';
1354 2 50       8 if ($trace_cruft) {
1355 0         0 my ( $line, $col ) =
1356             earleme_to_linecol( $self, $recce->current_earleme() );
1357              
1358             # HTML::Parser uses one-based line numbers,
1359             # but zero-based column numbers
1360             # The convention (in vi and cut) is that
1361             # columns are also one-based.
1362 0         0 $col++;
1363              
1364 0         0 say {$trace_fh} qq{Cruft at line $line, column $col: "},
  0         0  
1365 0 0       0 ${ tdesc_list_to_literal( $self, $marpa_token->[1] ) }, q{"}
1366             or Carp::croak("Cannot print: $ERRNO");
1367             } ## end if ($trace_cruft)
1368              
1369             } ## end while ( defined $marpa_token )
1370              
1371 91 50       448 if ($trace_terminals) {
1372 0 0       0 say {$trace_fh} 'at end of tokens'
  0         0  
1373             or Carp::croak("Cannot print: $ERRNO");
1374             }
1375              
1376 91         465 $recce->end_input();
1377              
1378 91         930 my %closure = ();
1379             {
1380 91         179 my $user_top_handler =
  91         406  
1381             $self->{user_handlers_by_pseudoclass}->{ANY}->{TOP};
1382 91 100       524 $closure{'!TOP_handler'} =
1383             defined $user_top_handler
1384             ? wrap_user_top_handler($user_top_handler)
1385             : \&Marpa::HTML::Internal::default_top_handler;
1386             } ## end if ( defined( my $user_top_handler = $self->{...}))
1387              
1388 91 100       554 if ( defined $self->{user_handlers_by_class}->{ANY}->{ANY} ) {
1389 83         287 $closure{'!DEFAULT_ELE_handler'} =
1390             $self->{user_handlers_by_class}->{ANY}->{ANY};
1391             }
1392              
1393             PSEUDO_CLASS:
1394 91         233 for my $pseudoclass (
1395             qw(PI DECL COMMENT PROLOG TRAILER WHITESPACE CDATA PCDATA CRUFT))
1396             {
1397 819         1978 my $pseudoclass_action =
1398             $self->{user_handlers_by_pseudoclass}->{ANY}->{$pseudoclass};
1399 819         1503 my $pseudoclass_action_name = "!$pseudoclass" . '_handler';
1400 819 100       1562 if ($pseudoclass_action) {
1401 313         1255 $closure{$pseudoclass_action_name} =
1402             wrap_user_tdesc_handler( $pseudoclass_action,
1403             { pseudoclass => $pseudoclass } );
1404 313         775 next PSEUDO_CLASS;
1405             } ## end if ($pseudoclass_action)
1406 506         1632 $closure{$pseudoclass_action_name} =
1407             \&Marpa::HTML::Internal::default_action;
1408             } ## end for my $pseudoclass (...)
1409              
1410 91         628 while ( my ( $element_action, $element ) = each %element_actions ) {
1411 687         1273 $closure{$element_action} = create_tdesc_handler( $self, $element );
1412             }
1413              
1414             ELEMENT_ACTION:
1415 91         568 while ( my ( $element_action, $data ) =
1416             each %pseudoclass_element_actions )
1417             {
1418              
1419             # As of now, there are
1420             # no per-element pseudo-classes, and since I can't regression test
1421             # this logic any more, I'm commenting it out.
1422 0         0 Carp::croak('per-element pseudo-classes not implemented');
1423              
1424             # my ( $pseudoclass, $element ) = @{$data};
1425             # my $pseudoclass_action =
1426             # $self->{user_handlers_by_pseudoclass}->{$element}
1427             # ->{$pseudoclass}
1428             # // $self->{user_handlers_by_pseudoclass}->{ANY}->{$pseudoclass};
1429             # if ( defined $pseudoclass_action ) {
1430             # $pseudoclass_action =
1431             # wrap_user_tdesc_handler($pseudoclass_action);
1432             # }
1433             # $pseudoclass_action //= \&Marpa::HTML::Internal::default_action;
1434             # $closure{$element_action} = $pseudoclass_action;
1435             } ## end while ( my ( $element_action, $data ) = each ...)
1436              
1437 91         248 my $value = do {
1438 91         192 local $Marpa::HTML::Internal::PARSE_INSTANCE = $self;
1439 91         176 local $Marpa::HTML::INSTANCE = {};
1440 91         1113 $recce->value(
1441             { trace_values => $self->{trace_values},
1442             trace_actions => $self->{trace_actions},
1443             closures => \%closure,
1444             }
1445             );
1446             };
1447 91 50       7368 Carp::croak('No parse: evaler returned undef') if not defined $value;
1448 91         166 return ${$value};
  91         211167  
1449              
1450             } ## end sub parse
1451              
1452             sub Marpa::HTML::html {
1453 91     91 0 36804 my ( $document_ref, @args ) = @_;
1454 91         380 my $html = Marpa::HTML::Internal::create(@args);
1455 91         316 return Marpa::HTML::Internal::parse( $html, $document_ref );
1456             }
1457              
1458             1;