File Coverage

blib/lib/HTML/Parser/Simple.pm
Criterion Covered Total %
statement 161 192 83.8
branch 63 84 75.0
condition 22 36 61.1
subroutine 18 19 94.7
pod 4 14 28.5
total 268 345 77.6


line stmt bran cond sub pod time code
1             package HTML::Parser::Simple;
2              
3 10     10   100573 use strict;
  10         18  
  10         327  
4 10     10   39 use warnings;
  10         16  
  10         213  
5              
6 10     10   2148 use Moo;
  10         47109  
  10         51  
7              
8 10     10   11875 use Tree::Simple;
  10         25916  
  10         76  
9              
10             has block =>
11             (
12             default => sub {return {} },
13             is => 'rw',
14             );
15              
16             has current_node =>
17             (
18             default => sub {return ''},
19             is => 'rw',
20             );
21              
22             has depth =>
23             (
24             default => sub {return 0},
25             is => 'rw',
26             );
27              
28             has empty =>
29             (
30             default => sub {return {} },
31             is => 'rw',
32             );
33              
34             has inline =>
35             (
36             default => sub {return {} },
37             is => 'rw',
38             );
39              
40             has input_file =>
41             (
42             default => sub {return ''},
43             is => 'rw',
44             );
45              
46             has node_type =>
47             (
48             default => sub {return 'global'},
49             is => 'rw',
50             );
51              
52             has output_file =>
53             (
54             default => sub {return ''},
55             is => 'rw',
56             );
57              
58             has result =>
59             (
60             default => sub {return ''},
61             is => 'rw',
62             );
63              
64             has root =>
65             (
66             default => sub {return ''},
67             is => 'rw',
68             );
69              
70             has self_close =>
71             (
72             default => sub {return {} },
73             is => 'rw',
74             );
75              
76             has tagged_attribute =>
77             (
78             default => sub {return {} },
79             is => 'rw',
80             );
81              
82             has verbose =>
83             (
84             default => sub {return 0},
85             is => 'rw',
86             );
87              
88             has xhtml =>
89             (
90             default => sub {return 0},
91             is => 'rw',
92              
93             trigger =>
94             sub
95             {
96             my($self, $new) = @_;
97              
98             $self -> _set_tagged_attribute($new);
99             }
100             );
101              
102             our $VERSION = '2.02';
103              
104             # -----------------------------------------------
105              
106             sub BUILD
107             {
108 9     9 0 59 my($self) = @_;
109              
110 9         342 $self -> block
111             ({
112             address => 1,
113             applet => 1,
114             blockquote => 1,
115             button => 1,
116             center => 1,
117             dd => 1,
118             del => 1,
119             dir => 1,
120             div => 1,
121             dl => 1,
122             dt => 1,
123             fieldset => 1,
124             form => 1,
125             frameset => 1,
126             hr => 1,
127             iframe => 1,
128             ins => 1,
129             isindex => 1,
130             li => 1,
131             map => 1,
132             menu => 1,
133             noframes => 1,
134             noscript => 1,
135             object => 1,
136             ol => 1,
137             p => 1,
138             pre => 1,
139             script => 1,
140             table => 1,
141             tbody => 1,
142             td => 1,
143             tfoot => 1,
144             th => 1,
145             thead => 1,
146             'tr' => 1,
147             ul => 1,
148             });
149              
150 9         113 $self -> empty
151             ({
152             area => 1,
153             base => 1,
154             basefont => 1,
155             br => 1,
156             col => 1,
157             embed => 1,
158             frame => 1,
159             hr => 1,
160             img => 1,
161             input => 1,
162             isindex => 1,
163             link => 1,
164             meta => 1,
165             param => 1,
166             wbr => 1,
167             });
168              
169 9         221 $self -> inline
170             ({
171             a => 1,
172             abbr => 1,
173             acronym => 1,
174             applet => 1,
175             b => 1,
176             basefont => 1,
177             bdo => 1,
178             big => 1,
179             br => 1,
180             button => 1,
181             cite => 1,
182             code => 1,
183             del => 1,
184             dfn => 1,
185             em => 1,
186             font => 1,
187             i => 1,
188             iframe => 1,
189             img => 1,
190             input => 1,
191             ins => 1,
192             kbd => 1,
193             label => 1,
194             map => 1,
195             object => 1,
196             'q' => 1,
197             's' => 1,
198             samp => 1,
199             script => 1,
200             select => 1,
201             small => 1,
202             span => 1,
203             strike => 1,
204             strong => 1,
205             sub => 1,
206             sup => 1,
207             textarea => 1,
208             tt => 1,
209             u => 1,
210             var => 1,
211             });
212              
213 9         70 $self -> self_close
214             ({
215             colgroup => 1,
216             dd => 1,
217             dt => 1,
218             li => 1,
219             options => 1,
220             p => 1,
221             td => 1,
222             tfoot => 1,
223             th => 1,
224             thead => 1,
225             'tr' => 1,
226             });
227              
228 9         84 $self -> current_node($self -> create_new_node('root', '', Tree::Simple -> ROOT) );
229 9         463 $self -> root($self -> current_node);
230              
231 9 100       48 if ($self -> xhtml)
232             {
233             # Compared to the non-XHTML re, this has an extra ':' in the first [].
234              
235 2         888 $self -> tagged_attribute
236             (
237             q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
238             );
239             }
240             else
241             {
242 7         2973 $self -> tagged_attribute
243             (
244             q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
245             );
246             }
247              
248             } # End of BUILD.
249              
250             # -----------------------------------------------
251             # Create a new node to store the new tag.
252             # Each node has metadata:
253             # o attributes: The tag's attributes, as a string with N spaces as a prefix.
254             # o content: The content before the tag was parsed.
255             # o name: The HTML tag.
256             # o node_type: This holds 'global' before '<head>' and between '</head>'
257             # and '<body>', and after '</body>'. It holds 'head' from
258             # '<head>' to '</head>', and holds 'body' from '<body>' to
259             # '</body>'. It's just there in case you need it.
260              
261             sub create_new_node
262             {
263 82     82 0 105 my($self, $name, $attributes, $parent) = @_;
264 82         447 my($metadata) =
265             {
266             attributes => $attributes,
267             content => [],
268             depth => $self -> depth,
269             name => $name,
270             node_type => $self -> node_type,
271             };
272              
273 82         242 return Tree::Simple -> new($metadata, $parent);
274              
275             } # End of create_new_node.
276              
277             # -----------------------------------------------
278              
279             sub handle_comment
280             {
281 2     2 0 4 my($self, $s) = @_;
282              
283 2         4 $self -> handle_content($s);
284              
285             } # End of handle_comment.
286              
287             # -----------------------------------------------
288              
289             sub handle_content
290             {
291 111     111 0 165 my($self, $s) = @_;
292 111         279 my($count) = $self -> current_node -> getChildCount;
293 111         511 my($metadata) = $self -> current_node -> getNodeValue;
294 111         397 $$metadata{'content'}[$count] .= $s;
295              
296 111         247 $self -> current_node -> setNodeValue($metadata);
297              
298             } # End of handle_content.
299              
300             # -----------------------------------------------
301              
302             sub handle_doctype
303             {
304 3     3 0 7 my($self, $s) = @_;
305              
306 3         7 $self -> handle_content($s);
307              
308             } # End of handle_doctype.
309              
310             # -----------------------------------------------
311              
312             sub handle_end_tag
313             {
314 51     51 0 54 my($self, $tag_name) = @_;
315              
316 51 100 100     201 $self -> node_type('global') if ( ($tag_name eq 'head') || ($tag_name eq 'body') );
317              
318 51 50       44 if (! ${$self -> empty}{$tag_name})
  51         142  
319             {
320 51         128 $self -> current_node($self -> current_node -> getParent);
321 51         286 $self -> depth($self -> depth - 1);
322             }
323              
324             } # End of handle_end_tag.
325              
326             # -----------------------------------------------
327              
328             sub handle_start_tag
329             {
330 73     73 0 92 my($self, $tag_name, $attributes, $unary) = @_;
331              
332 73         145 $self -> depth($self -> depth + 1);
333              
334 73 100       177 if ($tag_name eq 'head')
    100          
335             {
336 6         18 $self -> node_type('head');
337             }
338             elsif ($tag_name eq 'body')
339             {
340 8         26 $self -> node_type('body');
341             }
342              
343 73         156 my($node) = $self -> create_new_node($tag_name, $attributes, $self -> current_node);
344              
345 73 100       7018 $self -> current_node($node) if (! ${$self -> empty}{$tag_name});
  73         353  
346              
347             } # End of handle_start_tag.
348              
349             # -----------------------------------------------
350              
351             sub handle_xml_declaration
352             {
353 2     2 0 4 my($self, $s) = @_;
354              
355 2         8 $self -> handle_content($s);
356              
357             } # End of handle_xml_declaration.
358              
359             # -----------------------------------------------
360              
361             sub log
362             {
363 3     3 1 5 my($self, $msg) = @_;
364              
365 3 50       14 print STDERR "$msg\n" if ($self -> verbose);
366              
367             } # End of log.
368              
369             # -----------------------------------------------
370              
371             sub parse
372             {
373 9     9 1 254 my($self, $html) = @_;
374 9         17 my($original) = $html;
375 9         107 my(%special) =
376             (
377             script => 1,
378             style => 1,
379             );
380 9         49 my($tagged_attribute) = $self -> tagged_attribute;
381              
382 9         16 my($in_content);
383             my($offset);
384 0         0 my(@stack, $s);
385              
386 9         28 for (; $html;)
387             {
388 235         190 $in_content = 1;
389              
390             # Make sure we're not in a script or style element.
391              
392 235 50 66     847 if (! $stack[$#stack] || ! $special{$stack[$#stack]})
393             {
394             # Rearrange order of testing so rarer possibilities are further down.
395             # Is it an end tag?
396              
397 235         298 $s = substr($html, 0, 2);
398              
399 235 100       408 if ($s eq '
400             {
401 51 50       188 if ($html =~ /^(<\/(\w+)[^>]*>)/)
402             {
403 51         89 substr($html, 0, length $1) = '';
404 51         44 $in_content = 0;
405              
406 51         116 $self -> parse_end_tag($2, \@stack);
407             }
408             }
409              
410             # Is it a start tag?
411              
412 235 100       334 if ($in_content)
413             {
414 184 100       308 if (substr($html, 0, 1) eq '<')
415             {
416             # Use lc() since tags are stored in this module in lower-case.
417              
418 81 100       1370 if (lc($html) =~ /$tagged_attribute/)
419             {
420             # Since the regexp matched, save matches in lower-case.
421             # Then, re-match to get attributes in original case.
422             # In each case:
423             # o $1 => The whole string which matched.
424             # o $2 => The tag name.
425             # o $3 => The attributes.
426             # o $4 => The trailing / if any (aka $unary).
427             # But we have to lower-case the prefix '<$tag' of the string
428             # to ensure the 2nd regexp actually matches.
429              
430 73         242 my(@match) = ($2, $3, $4);
431 73         187 substr($html, 0, length($2) + 1) = lc substr($html, 0, length($2) + 1);
432              
433 73 50       841 if ($html =~ /$tagged_attribute/)
434             {
435 73         156 substr($html, 0, length $1) = '';
436 73         65 $in_content = 0;
437              
438             # Here we use $3 from the 2nd match to get the attributes in the original case.
439 73         204 $self -> parse_start_tag($match[0], $3, $match[2], \@stack);
440             }
441             }
442             }
443             }
444              
445             # Is it a comment?
446              
447 235 100       366 if ($in_content)
448             {
449 111         113 $s = substr($html, 0, 4);
450              
451 111 100       176 if ($s eq '');
454              
455 2 50       7 if ($offset >= 0)
456             {
457 2         10 $self -> handle_comment(substr($html, 0, ($offset + 3) ) );
458              
459 2         10 substr($html, 0, $offset + 3) = '';
460 2         3 $in_content = 0;
461             }
462             }
463             }
464              
465             # Is it a doctype?
466              
467 235 100       314 if ($in_content)
468             {
469 109         110 $s = substr($html, 0, 9);
470              
471 109 100       167 if ($s eq '
472             {
473 3         7 $offset = index($html, '>');
474              
475 3 50       8 if ($offset >= 0)
476             {
477 3         14 $self -> handle_doctype(substr($html, 0, ($offset + 1) ) );
478              
479 3         18 substr($html, 0, $offset + 1) = '';
480 3         4 $in_content = 0;
481             }
482             }
483             }
484              
485             # Is it an XML declaration?
486              
487 235 100 100     4472 if ($self -> xhtml && $in_content)
488             {
489 30         202 $s = substr($html, 0, 5);
490              
491 30 100       49 if ($s eq '
492             {
493 2         6 $offset = index($html, '?>');
494              
495 2 50       6 if ($offset >= 0)
496             {
497 2         9 $self -> handle_xml_declaration(substr($html, 0, ($offset + 2) ) );
498              
499 2         14 substr($html, 0, $offset + 2) = '';
500 2         4 $in_content = 0;
501             }
502             }
503             }
504              
505 235 100       1251 if ($in_content)
506             {
507 104         136 $offset = index($html, '<');
508              
509 104 100       142 if ($offset < 0)
510             {
511 7         27 $self -> handle_content($html);
512              
513 7         26 $html = '';
514             }
515             else
516             {
517 97         224 $self -> handle_content(substr($html, 0, $offset) );
518              
519 97         457 substr($html, 0, $offset) = '';
520             }
521             }
522             }
523             else
524             {
525 0         0 my($re) = "(.*)<\/$stack[$#stack]\[^>]*>";
526              
527             # lc() is needed because only lc tag names are pushed onto the stack.
528              
529 0 0       0 if (lc($html) =~ /$re/s)
530             {
531 0         0 my($text) = $1;
532 0         0 $text =~ s//$1/g;
533 0         0 $text =~ s//$1/g;
534              
535 0         0 $self -> handle_content($text);
536             }
537              
538 0         0 $self -> parse_end_tag($stack[$#stack], \@stack);
539             }
540              
541 235 100       402 if ($html eq $original)
542             {
543 1         2 my($msg) = 'Parse error. ';
544 1         5 my($parent) = $self -> current_node -> getParent;
545              
546 1         3 my($metadata);
547              
548 1 50 33     12 if ($parent && $parent -> can('getNodeValue') )
549             {
550 1         2 $metadata = $parent -> getNodeValue;
551 1         5 $msg .= "Parent tag: <$$metadata{'name'}>. ";
552             }
553              
554 1         3 $metadata = $self -> current_node -> getNodeValue;
555 1         7 $msg .= "Current tag: <$$metadata{'name'}>. Next 100 chars: " . substr($html, 0, 100);
556              
557 1         8 die "$msg\n";
558             }
559              
560 234         409 $original = $html;
561             }
562              
563             # Clean up any remaining tags.
564              
565 8         25 $self -> parse_end_tag('', \@stack);
566              
567             # Return the invocant to allow method chaining.
568              
569 8         39 return $self;
570              
571             } # End of parse.
572              
573             # -----------------------------------------------
574              
575             sub parse_end_tag
576             {
577 59     59 0 76 my($self, $tag_name, $stack) = @_;
578 59         69 $tag_name = lc $tag_name;
579              
580             # Find the closest opened tag of the same name.
581              
582 59         46 my($pos);
583              
584 59 100       78 if ($tag_name)
585             {
586 51         112 for ($pos = $#$stack; $pos >= 0; $pos--)
587             {
588 51 50       122 last if ($$stack[$pos] eq $tag_name);
589             }
590             }
591             else
592             {
593 8         12 $pos = 0;
594             }
595              
596 59 50       105 if ($pos >= 0)
597             {
598             # Close all the open tags, up the stack.
599              
600 59         56 my($count) = 0;
601              
602 59         129 for (my($i) = $#$stack; $i >= $pos; $i--)
603             {
604 51         72 $count++;
605              
606 51         98 $self -> handle_end_tag($$stack[$i]);
607             }
608              
609             # Remove the open elements from the stack.
610             # Does not work: $#$stack = $pos. Could use splice().
611              
612 59         166 pop @$stack for ($count);
613             }
614              
615             } # End of parse_end_tag.
616              
617             # -----------------------------------------------
618              
619             sub parse_file
620             {
621 0     0 1 0 my($self, $input_file_name, $output_file_name) = @_;
622 0   0     0 $input_file_name ||= $self -> input_file;
623 0   0     0 $output_file_name ||= $self -> output_file;
624              
625 0         0 $self -> input_file($input_file_name);
626 0         0 $self -> output_file($output_file_name);
627 0         0 $self -> log("Reading $input_file_name");
628              
629 0 0       0 open(my $fh, $input_file_name) || die "Can't open($input_file_name): $!\n";
630 0         0 my($html);
631 0         0 read($fh, $html, -s $fh);
632 0         0 close $fh;
633              
634 0 0       0 die "Can't read($input_file_name): $!\n" if (! defined $html);
635              
636 0         0 $self -> log('Parsing');
637              
638 0         0 $self -> parse($html);
639              
640 0         0 $self -> log('Traversing');
641              
642 0         0 $self -> traverse($self -> root);
643              
644 0         0 $self -> log("Writing $output_file_name");
645              
646 0 0       0 open($fh, "> $output_file_name") || die "Can't open(> $output_file_name): $!\n";
647 0         0 print $fh $self -> result;
648 0         0 close $fh;
649              
650             # Return the invocant to allow method chaining.
651              
652 0         0 return $self;
653              
654             } # End of parse_file.
655              
656             # -----------------------------------------------
657              
658             sub parse_start_tag
659             {
660 73     73 0 123 my($self, $tag_name, $attributes, $unary, $stack) = @_;
661 73         80 $tag_name = lc $tag_name;
662              
663 73 100       58 if (${$self -> block}{$tag_name})
  73         223  
664             {
665 23   33     61 for (; $#$stack >= 0 && ${$self -> inline}{$$stack[$#$stack]};)
  23         93  
666             {
667 0         0 $self -> parse_end_tag($$stack[$#$stack], $stack);
668             }
669             }
670              
671 73 50 66     68 if (${$self -> self_close}{$tag_name} && ($$stack[$#$stack] eq $tag_name) )
  73         248  
672             {
673 0         0 $self -> parse_end_tag($tag_name, $stack);
674             }
675              
676 73   66     62 $unary = ${$self -> empty}{$tag_name} || $unary;
677              
678 73 100       159 push @$stack, $tag_name if (! $unary);
679              
680 73         128 $self -> handle_start_tag($tag_name, $attributes, $unary);
681              
682             } # End of parse_start_tag.
683              
684             # -----------------------------------------------
685              
686             sub _set_tagged_attribute
687             {
688 2     2   7 my($self, $new, $old) = @_;
689              
690 2 50       7 if ($new)
691             {
692 2         50 $self -> tagged_attribute
693             (
694             # Compared to the non-XHTML re, this has an extra ':' in the first [].
695              
696             q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
697             );
698             }
699             else
700             {
701 0         0 $self -> tagged_attribute
702             (
703             q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
704             );
705             }
706              
707             } # End of _set_tagged_attribute.
708              
709             # -----------------------------------------------
710              
711             sub traverse
712             {
713 51     51 1 63 my($self, $node) = @_;
714 51         98 my(@child) = $node -> getAllChildren;
715 51         242 my($metadata) = $node -> getNodeValue;
716 51         122 my($content) = $$metadata{'content'};
717 51         53 my($name) = $$metadata{'name'};
718              
719             # Special check to avoid printing '<root>' when we still need to output
720             # the content of the root, e.g. the DOCTYPE.
721              
722 51 100       180 $self -> result($self -> result . "<$name$$metadata{'attributes'}>") if ($name ne 'root');
723              
724 51         33 my($index);
725             my($s);
726              
727 51         79 for $index (0 .. $#child)
728             {
729 45 100 100     262 $self -> result($self -> result . ($index <= $#$content && defined($$content[$index]) ? $$content[$index] : '') );
730 45         131 $self -> traverse($child[$index]);
731             }
732              
733             # Output the content after the last child node has been closed,
734             # but before the current node is closed.
735              
736 51         45 $index = $#child + 1;
737              
738 51 100 66     231 $self -> result($self -> result . ($index <= $#$content && defined($$content[$index]) ? $$content[$index] : '') );
739 51 100 100     35 $self -> result($self -> result . "") if (! ${$self -> empty}{$name} && ($name ne 'root') );
  51         253  
740              
741             # Return the invocant to allow method chaining.
742              
743 51         107 return $self;
744              
745             } # End of traverse.
746              
747             # -----------------------------------------------
748              
749             1;
750              
751             =head1 NAME
752              
753             HTML::Parser::Simple - Parse nice HTML files without needing a compiler
754              
755             =head1 Synopsis
756              
757             #!/usr/bin/env perl
758              
759             use strict;
760             use warnings;
761              
762             use HTML::Parser::Simple;
763              
764             # -------------------------
765              
766             # Method 1:
767              
768             my($p) = HTML::Parser::Simple -> new
769             (
770             input_file => 'data/s.1.html',
771             output_file => 'data/s.2.html',
772             );
773              
774             $p -> parse_file;
775              
776             # Method 2:
777              
778             my($p) = HTML::Parser::Simple -> new;
779              
780             $p -> parse_file('data/s.1.html', 'data/s.2.html');
781              
782             # Method 3:
783              
784             my($p) = HTML::Parser::Simple -> new;
785              
786             print $p -> parse('...') -> traverse($p -> root) -> result;
787              
788             Of course, these can be abbreviated by using method chaining. E.g. Method 2 could be:
789              
790             HTML::Parser::Simple -> new -> parse_file('data/s.1.html', 'data/s.2.html');
791              
792             See scripts/parse.html.pl and scripts/parse.xhtml.pl.
793              
794             =head1 Description
795              
796             C<HTML::Parser::Simple> is a pure Perl module.
797              
798             It parses HTML V 4 files, and generates a tree of nodes, with 1 node per HTML tag.
799              
800             The data associated with each node is documented in the L.
801              
802             See also L and L.
803              
804             =head1 Distributions
805              
806             This module is available as a Unix-style distro (*.tgz).
807              
808             See L for details.
809              
810             See L for
811             help on unpacking and installing.
812              
813             =head1 Constructor and initialization
814              
815             new(...) returns an object of type C<HTML::Parser::Simple>.
816              
817             This is the class constructor.
818              
819             Usage: C<< HTML::Parser::Simple -> new >>.
820              
821             This method takes a hash of options.
822              
823             Call C<< new() >> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>.
824              
825             Available options (each one of which is also a method):
826              
827             =over 4
828              
829             =item o input_file => $a_file_name
830              
831             This takes the file name, including the path, of the input file.
832              
833             Default: '' (the empty string).
834              
835             =item o output_file => $a_file_name
836              
837             This takes the file name, including the path, of the output file.
838              
839             Default: '' (the empty string).
840              
841             =item o verbose => $Boolean
842              
843             This takes either a 0 or a 1.
844              
845             Write more or less progress messages.
846              
847             Default: 0.
848              
849             =item o xhtml => $Boolean
850              
851             This takes either a 0 or a 1.
852              
853             0 means do not accept an XML declaration, such as C<< <?xml version="1.0"?> >>
854             at the start of the input file, and some other XHTML features, explained next.
855              
856             1 means accept XHTML input.
857              
858             Default: 0.
859              
860             The only XHTML changes to this code, so far, are:
861              
862             =over 4
863              
864             =item o Accept the XML declaration
865              
866             E.g.: C<< <?xml version="1.0"?> >>.
867              
868             =item o Accept attribute names containing the ':' char
869              
870             E.g.: C<< <html xml:lang="en"> >>.
871              
872             =back
873              
874             =back
875              
876             =head1 Methods
877              
878             =head2 block()
879              
880             Returns a hashref where the keys are the names of block-level HTML tags.
881              
882             The corresponding values in the hashref are just 1.
883              
884             Typical keys: address, form, p, table, tr.
885              
886             Note: Some keys, e.g. tr, are also returned by L.
887              
888             =head2 current_node()
889              
890             Returns the L<Tree::Simple> object which the parser calls the current node.
891              
892             =head2 depth()
893              
894             Returns the nesting depth of the current tag.
895              
896             The method is just here in case you need it.
897              
898             =head2 empty()
899              
900             Returns a hashref where the keys are the names of HTML tags of type empty.
901              
902             The corresponding values in the hashref are just 1.
903              
904             Typical keys: area, base, input, wbr.
905              
906             =head2 inline()
907              
908             Returns a hashref where the keys are the names of HTML tags of type inline.
909              
910             The corresponding values in the hashref are just 1.
911              
912             Typical keys: a, em, img, textarea.
913              
914             =head2 input_file($in_file_name)
915              
916             Gets or sets the input file name used by L</parse_file($input_file_name, $output_file_name)>.
917              
918             Note: The parameters passed in to L</parse_file($input_file_name, $output_file_name)>, take
919             precedence over the I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over
920             the internal values set with C<< input_file($in_file_name) >> and
921             C<< output_file($out_file_name) >>.
922              
923             'input_file' is a parameter to L</new()>. See L</Constructor and initialization> for details.
924              
925             =head2 log($msg)
926              
927             Print $msg to STDERR if C<< new() >> was called as C<< new(verbose => 1) >>, or if
928             C<< $p -> verbose(1) >> was called.
929              
930             Otherwise, print nothing.
931              
932             =head2 new()
933              
934             This is the constructor. See L for details.
935              
936             =head2 node_type()
937              
938             Returns the type of the most recently created node, I<global>, I<head>, or I<body>.
939              
940             See the first question in the L for details.
941              
942             =head2 output_file($out_file_name)
943              
944             Gets or sets the output file name used by L</parse_file($input_file_name, $output_file_name)>.
945              
946             Note: The parameters passed in to L</parse_file($input_file_name, $output_file_name)>, take
947             precedence over the I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over
948             the internal values set with C<< input_file($in_file_name) >> and
949             C<< output_file($out_file_name) >>.
950              
951             'output_file' is a parameter to L. See L for details.
952              
953             =head2 parse($html)
954              
955             Returns the invocant. Thus C<< $p -> parse >> returns $p. This allows for method chaining. See the
956             L.
957              
958             Parses the string of HTML in $html, and builds a tree of nodes.
959              
960             After calling C<< $p -> parse($html) >>, you must call C<< $p -> traverse($p -> root) >> before
961             calling C<< $p -> result >>.
962              
963             Alternately, use C<< $p -> parse_file >>, which calls all these methods for you.
964              
965             Note: C<< parse() >> may be called directly or via C<< parse_file() >>.
966              
967             =head2 parse_file($input_file_name, $output_file_name)
968              
969             Returns the invocant. Thus C<< $p -> parse_file >> returns $p. This allows for method chaining. See
970             the L.
971              
972             Parses the HTML in the input file, and writes the result to the output file.
973              
974             C<< parse_file() >> calls L</parse($html)> and L</traverse($node)>, using C<< $p -> root >> for
975             $node.
976              
977             Note: The parameters passed in to C<< parse_file($input_file_name, $output_file_name) >>, take
978             precedence over the I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over
979             the internal values set with C<< input_file($in_file_name) >> and
980             C<< output_file($out_file_name) >>.
981              
982             Lastly, the parameters passed in to C<< parse_file($input_file_name, $output_file_name) >> are used
983             to update the internal values set with the I<input_file> and I<output_file> parameters passed in to
984             C<< new() >>, or set with calls to C<< input_file($in_file_name) >> and
985             C<< output_file($out_file_name) >>.
986              
987             =head2 result()
988              
989             Returns the string which is the result of the parse.
990              
991             See scripts/parse.html.pl.
992              
993             =head2 root()
994              
995             Returns the L<Tree::Simple> object which the parser calls the root of the tree of nodes.
996              
997             =head2 self_close()
998              
999             Returns a hashref where the keys are the names of HTML tags of type self close.
1000              
1001             The corresponding values in the hashref are just 1.
1002              
1003             Typical keys: dd, dt, p, tr.
1004              
1005             Note: Some keys, e.g. tr, are also returned by L</block()>.
1006              
1007             =head2 tagged_attribute()
1008              
1009             Returns a string to be used as a regexp, to capture tags and their optional attributes.
1010              
1011             It does not return qr/$s/; it just returns $s.
1012              
1013             This regexp takes one of two forms, depending on the state of the I<xhtml> option. See
1014             L.
1015              
1016             The regexp has four (4) sets of capturing parentheses:
1017              
1018             =over 4
1019              
1020             =item o 1 for the whole tag and attribute and trailing / combination
1021              
1022             E.g.: <(....)>
1023              
1024             =item o 1 for the tag itself
1025              
1026             E.g.: <(img)...>
1027              
1028             =item o 1 for the optional attributes of the tag
1029              
1030             E.g.: <img (alt = 'A graph')>
1031              
1032             =item o 1 for the optional trailing / of the tag
1033              
1034             E.g.: <img ... (/)>
1035              
1036             =back
1037              
1038             =head2 traverse($node)
1039              
1040             Returns the invocant. Thus C<< $p -> traverse >> returns $p. This allows for method chaining.
1041             See the L.
1042              
1043             Traverses the tree of nodes, starting at $node.
1044              
1045             You normally call this as C<< $p -> traverse($p -> root) >>, to ensure all nodes are visited.
1046              
1047             See the L for sample code.
1048              
1049             Or, see scripts/traverse.file.pl, which uses L, and calls
1050             C<< traverse($node) >> via L.
1051              
1052             =head2 verbose($Boolean)
1053              
1054             Gets or sets the verbose parameter.
1055              
1056             'verbose' is a parameter to L. See L for details.
1057              
1058             =head2 xhtml($Boolean)
1059              
1060             Gets or sets the xhtml parameter.
1061              
1062             If you call this after object creation, the I feature of L is used to call
1063             L so as to correctly set the regexp which recognises xhtml.
1064              
1065             'xhtml' is a parameter to L. See L for details.
1066              
1067             =head1 FAQ
1068              
1069             =head2 What is the format of the data stored in each node of the tree?
1070              
1071             The data of each node is a hash ref. The keys/values of this hash ref are:
1072              
1073             =over 4
1074              
1075             =item o attributes
1076              
1077             This is the string of HTML attributes associated with the HTML tag.
1078              
1079             Attributes are stored in lower-case.
1080              
1081             So, <table align = 'center' summary = 'body'> will have an attributes string of
1082             " align = 'center' summary = 'body'".
1083              
1084             Note the leading space.
1085              
1086             =item o content
1087              
1088             This is an arrayref of bits and pieces of content.
1089              
1090             Consider this fragment of HTML:
1091              
1092                 <p>I did <i>not</i> say I <i>liked</i> debugging.</p>
1093              
1094             When parsing 'I did ', the number of child nodes (of <p>) is 0, since <i> has not yet been detected.
1095              
1096             So, 'I did ' is stored in the 0th element of the arrayref belonging to <p>.
1097              
1098             Likewise, 'not' is stored in the 0th element of the arrayref belonging to the node <i>.
1099              
1100             Next, ' say I ' is stored in the 1st element of the arrayref belonging to <p>,
1101             because it follows the 1st child node (<i>).
1102              
1103             Likewise, ' debugging' is stored in the 2nd element of the arrayref belonging to <p>.
1104              
1105             This way, the input string can be reproduced by successively outputting the elements of the arrayref
1106             of content interspersed with the contents of the child nodes (processed recursively).
1107              
1108             Note: If you are processing this tree, never forget that there can be content after the last child
1109             node has been closed, but before the current node is closed.
1110              
1111             Note: The DOCTYPE declaration is stored as the 0th element of the content of the root node.
1112              
1113             =item o depth
1114              
1115             The nesting depth of the tag within the document.
1116              
1117             The root is at depth 0, '<html>' is at depth 1, '<head>' and '<body>' are at depth 2, and so on.
1118              
1119             It is just there in case you need it.
1120              
1121             =item o name
1122              
1123             So, the tag '<html>' will mean the name is 'html'.
1124              
1125             Tag names are stored in lower-case.
1126              
1127             The root of the tree is called 'root', and holds the DOCTYPE, if any, as content.
1128              
1129             The root has the node 'html' as the only child, of course.
1130              
1131             =item o node_type
1132              
1133             This holds 'global' before '<head>' and between '</head>' and '<body>', and after '</body>'.
1134              
1135             It holds 'head' for all nodes from '<head>' to '</head>', and holds 'body' from '<body>' to
1136             '</body>'.
1137              
1138             It is just there in case you need it.
1139              
1140             =back
1141              
1142             =head2 How are tags and attributes handled?
1143              
1144             Tags are stored in lower-case, in a tree managed by L<Tree::Simple>.
1145              
1146             Attributes are stored in the same case as in the original HTML.
1147              
1148             The root of the tree is returned by L</root()>.
1149              
1150             =head2 How are HTML comments handled?
1151              
1152             They are treated as content. This includes the prefix '<!--' and the suffix '-->'.
1153              
1154             =head2 How is DOCTYPE handled?
1155              
1156             It is treated as content belonging to the root of the tree.
1157              
1158             =head2 How is the XML declaration handled?
1159              
1160             It is treated as content belonging to the root of the tree.
1161              
1162             =head2 Does this module handle all HTML pages?
1163              
1164             No, never.
1165              
1166             =head2 Which versions of HTML does this module handle?
1167              
1168             Up to V 4.
1169              
1170             =head2 What do I do if this module does not handle my HTML page?
1171              
1172             Make yourself a nice cup of tea, and then fix your page.
1173              
1174             =head2 Does this validate the HTML input?
1175              
1176             No.
1177              
1178             For example, if you feed in a HTML page without the title tag, this module does not care.
1179              
1180             =head2 How do I view the output HTML?
1181              
1182             There are various ways.
1183              
1184             =over 4
1185              
1186             =item o See scripts/parse.html.pl
1187              
1188             =item o By installing HTML::Revelation, of course!
1189              
1190             Sample output:
1191              
1192             L.
1193              
1194             =back
1195              
1196             =head2 How do I test this module (or my file)?
1197              
1198             Preferably, see the previous question, or...
1199              
1200             Suggested steps:
1201              
1202             Note: There are quite a few files involved. Proceed with caution.
1203              
1204             =over 4
1205              
1206             =item o Select a HTML file to test
1207              
1208             Call this input.html.
1209              
1210             =item o Run input.html thru reveal.pl
1211              
1212             Reveal.pl ships with HTML::Revelation.
1213              
1214             Call the output file output.1.html.
1215              
1216             =item o Run input.html thru parse.html.pl
1217              
1218             parse.html.pl ships with HTML::Parser::Simple.
1219              
1220             Call the output file parsed.html.
1221              
1222             =item o Run parsed.html thru reveal.pl
1223              
1224             Call the output file output.2.html.
1225              
1226             =item o Compare output.1.html and output.2.html
1227              
1228             If they match, or even if they don't match, you're finished.
1229              
1230             =back
1231              
1232             =head2 Will you implement a 'quirks' mode to handle my special HTML file?
1233              
1234             No, never.
1235              
1236             Help with quirks: L.
1237              
1238             =head2 Is there anything I should be aware of?
1239              
1240             Yes. If your HTML file is not nice, the interpretation of tag nesting will not match
1241             your preconceptions.
1242              
1243             In such cases, do not seek to fix the code. Instead, fix your (faulty) preconceptions, and fix your
1244             HTML file.
1245              
1246             The 'a' tag, for example, is defined to be an inline tag, but the 'div' tag is a block-level tag.
1247              
1248             I do not define 'a' to be inline, others do, e.g. L and hence
1249             L.
1250              
1251             Inline means:
1252              
1253                 <a href = '#NAME'><div>NAME</div></a>
1254              
1255             will I<not> be parsed as an 'a' containing a 'div'.
1256              
1257             The 'a' tag will be closed before the 'div' is opened. So, the result will look like:
1258              
1259                 <a href = '#NAME'></a><div>NAME</div>
1260              
1261             To achieve what was presumably intended, use 'span':
1262              
1263                 <a href = '#NAME'><span>NAME</span></a>
1264              
1265             Some people (*cough* *cough*) have had to redo their entire websites due to this very problem.
1266              
1267             Of course, this is just one of a vast set of possible problems.
1268              
1269             You have been warned.
1270              
1271             =head2 Why did you use Tree::Simple but not Tree or Tree::Fast or Tree::DAG_Node?
1272              
1273             During testing, Tree::Fast crashed, so I replaced it with Tree and everything worked. Spooky.
1274              
1275             Late news: Tree does not cope with an arrayref stored in the metadata, so I have switched to
1276             L<Tree::DAG_Node>.
1277              
1278             Stop press: As an experiment I switched to L<Tree::Simple>. Since it also works I will just keep
1279             using it.
1280              
1281             =head2 Why is this module not called HTML::Parser::PurePerl?
1282              
1283             =over 4
1284              
1285             =item o The API
1286              
1287             That name sounds like a pure Perl version of the same API as used by HTML::Parser.
1288              
1289             But the 2 APIs are not, and are not meant to be, compatible.
1290              
1291             =item o The tie-in
1292              
1293             Some people might falsely assume L<HTML::Parser> can automatically fall back to
1294             L<HTML::Parser::PurePerl> in the absence of a compiler.
1295              
1296             =back
1297              
1298             =head2 How do I output my own stuff while traversing the tree?
1299              
1300             =over 4
1301              
1302             =item o The sophisticated way
1303              
1304             As always with OO code, sub-class! In this case, you write a new version of the traverse() method.
1305              
1306             See L, for example. It overrides L.
1307              
1308             =item o The crude way
1309              
1310             Alternately, implement another method in your sub-class, e.g. process(), which recurses like
1311             traverse(). Then call parse() and process().
1312              
1313             =back
1314              
1315             =head2 How is the source formatted?
1316              
1317             I edit with UltraEdit. That means, in general, leading 4-space tabs.
1318              
1319             All vertical alignment within lines is done manually with spaces.
1320              
1321             Perl::Critic is off the agenda.
1322              
1323             =head2 Why did you choose Moo?
1324              
1325             For the 2012 Google Code-in, I had a quick look at 122 class-building classes, and decided
1326             L<Moo> was suitable, given it is pure-Perl and has the trigger feature I needed.
1327              
1328             See L.
1329              
1330             =head1 Credits
1331              
1332             This Perl HTML parser has been converted from a JavaScript one written by John Resig.
1333              
1334             L.
1335              
1336             Well done John!
1337              
1338             Note also the comments published here:
1339              
1340             L.
1341              
1342             =head1 Repository
1343              
1344             L
1345              
1346             =head1 Support
1347              
1348             Email the author, or log a bug on RT:
1349              
1350             L.
1351              
1352             =head1 Author
1353              
1354             C<HTML::Parser::Simple> was written by Ron Savage I<E<lt>ron@savage.net.auE<gt>> in 2009.
1355              
1356             Home page: L.
1357              
1358             =head1 Copyright
1359              
1360             Australian copyright (c) 2009 Ron Savage.
1361              
1362             All Programs of mine are 'OSI Certified Open Source Software';
1363             you can redistribute them and/or modify them under the terms of
1364             The Artistic License, a copy of which is available at:
1365             http://www.opensource.org/licenses/index.html
1366              
1367             =cut