File Coverage

blib/lib/HTML/Parser/Simple.pm
Criterion Covered Total %
statement 161 192 83.8
branch 63 84 75.0
condition 22 36 61.1
subroutine 18 19 94.7
pod 4 14 28.5
total 268 345 77.6


line stmt bran cond sub pod time code
1             package HTML::Parser::Simple;
2              
3 10     10   136980 use strict;
  10         22  
  10         300  
4 10     10   47 use warnings;
  10         22  
  10         211  
5              
6 10     10   3708 use Moo;
  10         66939  
  10         51  
7              
8 10     10   19871 use Tree::Simple;
  10         32902  
  10         81  
9              
10             has block =>
11             (
12             default => sub {return {} },
13             is => 'rw',
14             );
15              
16             has current_node =>
17             (
18             default => sub {return ''},
19             is => 'rw',
20             );
21              
22             has depth =>
23             (
24             default => sub {return 0},
25             is => 'rw',
26             );
27              
28             has empty =>
29             (
30             default => sub {return {} },
31             is => 'rw',
32             );
33              
34             has inline =>
35             (
36             default => sub {return {} },
37             is => 'rw',
38             );
39              
40             has input_file =>
41             (
42             default => sub {return ''},
43             is => 'rw',
44             );
45              
46             has node_type =>
47             (
48             default => sub {return 'global'},
49             is => 'rw',
50             );
51              
52             has output_file =>
53             (
54             default => sub {return ''},
55             is => 'rw',
56             );
57              
58             has result =>
59             (
60             default => sub {return ''},
61             is => 'rw',
62             );
63              
64             has root =>
65             (
66             default => sub {return ''},
67             is => 'rw',
68             );
69              
70             has self_close =>
71             (
72             default => sub {return {} },
73             is => 'rw',
74             );
75              
76             has tagged_attribute =>
77             (
78             default => sub {return {} },
79             is => 'rw',
80             );
81              
82             has verbose =>
83             (
84             default => sub {return 0},
85             is => 'rw',
86             );
87              
88             has xhtml =>
89             (
90             default => sub {return 0},
91             is => 'rw',
92              
93             trigger =>
94             sub
95             {
96             my($self, $new) = @_;
97              
98             $self -> _set_tagged_attribute($new);
99             }
100             );
101              
102             our $VERSION = '2.01';
103              
104             # -----------------------------------------------
105              
106             sub BUILD
107             {
108 9     9 0 76 my($self) = @_;
109              
110 9         382 $self -> block
111             ({
112             address => 1,
113             applet => 1,
114             blockquote => 1,
115             button => 1,
116             center => 1,
117             dd => 1,
118             del => 1,
119             dir => 1,
120             div => 1,
121             dl => 1,
122             dt => 1,
123             fieldset => 1,
124             form => 1,
125             frameset => 1,
126             hr => 1,
127             iframe => 1,
128             ins => 1,
129             isindex => 1,
130             li => 1,
131             map => 1,
132             menu => 1,
133             noframes => 1,
134             noscript => 1,
135             object => 1,
136             ol => 1,
137             p => 1,
138             pre => 1,
139             script => 1,
140             table => 1,
141             tbody => 1,
142             td => 1,
143             tfoot => 1,
144             th => 1,
145             thead => 1,
146             'tr' => 1,
147             ul => 1,
148             });
149              
150 9         153 $self -> empty
151             ({
152             area => 1,
153             base => 1,
154             basefont => 1,
155             br => 1,
156             col => 1,
157             embed => 1,
158             frame => 1,
159             hr => 1,
160             img => 1,
161             input => 1,
162             isindex => 1,
163             link => 1,
164             meta => 1,
165             param => 1,
166             wbr => 1,
167             });
168              
169 9         263 $self -> inline
170             ({
171             a => 1,
172             abbr => 1,
173             acronym => 1,
174             applet => 1,
175             b => 1,
176             basefont => 1,
177             bdo => 1,
178             big => 1,
179             br => 1,
180             button => 1,
181             cite => 1,
182             code => 1,
183             del => 1,
184             dfn => 1,
185             em => 1,
186             font => 1,
187             i => 1,
188             iframe => 1,
189             img => 1,
190             input => 1,
191             ins => 1,
192             kbd => 1,
193             label => 1,
194             map => 1,
195             object => 1,
196             'q' => 1,
197             's' => 1,
198             samp => 1,
199             script => 1,
200             select => 1,
201             small => 1,
202             span => 1,
203             strike => 1,
204             strong => 1,
205             sub => 1,
206             sup => 1,
207             textarea => 1,
208             tt => 1,
209             u => 1,
210             var => 1,
211             });
212              
213 9         93 $self -> self_close
214             ({
215             colgroup => 1,
216             dd => 1,
217             dt => 1,
218             li => 1,
219             options => 1,
220             p => 1,
221             td => 1,
222             tfoot => 1,
223             th => 1,
224             thead => 1,
225             'tr' => 1,
226             });
227              
228 9         104 $self -> current_node($self -> create_new_node('root', '', Tree::Simple -> ROOT) );
229 9         477 $self -> root($self -> current_node);
230              
231 9 100       61 if ($self -> xhtml)
232             {
233             # Compared to the non-XHTML re, this has an extra ':' in the first [].
234              
235 2         9262 $self -> tagged_attribute
236             (
237             q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
238             );
239             }
240             else
241             {
242 7         3548 $self -> tagged_attribute
243             (
244             q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
245             );
246             }
247              
248             } # End of BUILD.
249              
250             # -----------------------------------------------
251             # Create a new node to store the new tag.
252             # Each node has metadata:
253             # o attributes: The tag's attributes, as a string with N spaces as a prefix.
254             # o content: The content before the tag was parsed.
255             # o name: The HTML tag.
256             # o node_type: This holds 'global' before '<head>' and between '</head>'
257             # and '<body>', and after '</body>'. It holds 'head' from
258             # '<head>' to '</head>', and holds 'body' from '<body>' to
259             # '</body>'. It's just there in case you need it.
260              
261             sub create_new_node
262             {
263 82     82 0 146 my($self, $name, $attributes, $parent) = @_;
264 82         542 my($metadata) =
265             {
266             attributes => $attributes,
267             content => [],
268             depth => $self -> depth,
269             name => $name,
270             node_type => $self -> node_type,
271             };
272              
273 82         376 return Tree::Simple -> new($metadata, $parent);
274              
275             } # End of create_new_node.
276              
277             # -----------------------------------------------
278              
279             sub handle_comment
280             {
281 2     2 0 5 my($self, $s) = @_;
282              
283 2         5 $self -> handle_content($s);
284              
285             } # End of handle_comment.
286              
287             # -----------------------------------------------
288              
289             sub handle_content
290             {
291 111     111 0 230 my($self, $s) = @_;
292 111         380 my($count) = $self -> current_node -> getChildCount;
293 111         745 my($metadata) = $self -> current_node -> getNodeValue;
294 111         862 $$metadata{'content'}[$count] .= $s;
295              
296 111         353 $self -> current_node -> setNodeValue($metadata);
297              
298             } # End of handle_content.
299              
300             # -----------------------------------------------
301              
302             sub handle_doctype
303             {
304 3     3 0 9 my($self, $s) = @_;
305              
306 3         9 $self -> handle_content($s);
307              
308             } # End of handle_doctype.
309              
310             # -----------------------------------------------
311              
312             sub handle_end_tag
313             {
314 51     51 0 80 my($self, $tag_name) = @_;
315              
316 51 100 100     261 $self -> node_type('global') if ( ($tag_name eq 'head') || ($tag_name eq 'body') );
317              
318 51 50       63 if (! ${$self -> empty}{$tag_name})
  51         193  
319             {
320 51         178 $self -> current_node($self -> current_node -> getParent);
321 51         388 $self -> depth($self -> depth - 1);
322             }
323              
324             } # End of handle_end_tag.
325              
326             # -----------------------------------------------
327              
328             sub handle_start_tag
329             {
330 73     73 0 153 my($self, $tag_name, $attributes, $unary) = @_;
331              
332 73         186 $self -> depth($self -> depth + 1);
333              
334 73 100       221 if ($tag_name eq 'head')
    100          
335             {
336 6         22 $self -> node_type('head');
337             }
338             elsif ($tag_name eq 'body')
339             {
340 8         27 $self -> node_type('body');
341             }
342              
343 73         203 my($node) = $self -> create_new_node($tag_name, $attributes, $self -> current_node);
344              
345 73 100       9488 $self -> current_node($node) if (! ${$self -> empty}{$tag_name});
  73         456  
346              
347             } # End of handle_start_tag.
348              
349             # -----------------------------------------------
350              
351             sub handle_xml_declaration
352             {
353 2     2 0 7 my($self, $s) = @_;
354              
355 2         9 $self -> handle_content($s);
356              
357             } # End of handle_xml_declaration.
358              
359             # -----------------------------------------------
360              
361             sub log
362             {
363 3     3 1 6 my($self, $msg) = @_;
364              
365 3 50       19 print STDERR "$msg\n" if ($self -> verbose);
366              
367             } # End of log.
368              
369             # -----------------------------------------------
370              
371             sub parse
372             {
373 9     9 1 709 my($self, $html) = @_;
374 9         22 my($original) = $html;
375 9         124 my(%special) =
376             (
377             script => 1,
378             style => 1,
379             );
380 9         52 my($tagged_attribute) = $self -> tagged_attribute;
381              
382 9         15 my($in_content);
383             my($offset);
384 0         0 my(@stack, $s);
385              
386 9         42 for (; $html;)
387             {
388 235         284 $in_content = 1;
389              
390             # Make sure we're not in a script or style element.
391              
392 235 50 66     1172 if (! $stack[$#stack] || ! $special{$stack[$#stack]})
393             {
394             # Rearrange order of testing so rarer possibilities are further down.
395             # Is it an end tag?
396              
397 235         425 $s = substr($html, 0, 2);
398              
399 235 100       534 if ($s eq '</')
400             {
401 51 50       223 if ($html =~ /^(<\/(\w+)[^>]*>)/)
402             {
403 51         104 substr($html, 0, length $1) = '';
404 51         59 $in_content = 0;
405              
406 51         162 $self -> parse_end_tag($2, \@stack);
407             }
408             }
409              
410             # Is it a start tag?
411              
412 235 100       474 if ($in_content)
413             {
414 184 100       408 if (substr($html, 0, 1) eq '<')
415             {
416             # Use lc() since tags are stored in this module in lower-case.
417              
418 81 100       7810 if (lc($html) =~ /$tagged_attribute/)
419             {
420             # Since the regexp matched, save matches in lower-case.
421             # Then, re-match to get attributes in original case.
422             # In each case:
423             # o $1 => The whole string which matched.
424             # o $2 => The tag name.
425             # o $3 => The attributes.
426             # o $4 => The trailing / if any (aka $unary).
427             # But we have to lower-case the prefix '<$tag' of the string
428             # to ensure the 2nd regexp actually matches.
429              
430 73         293 my(@match) = ($2, $3, $4);
431 73         236 substr($html, 0, length($2) + 1) = lc substr($html, 0, length($2) + 1);
432              
433 73 50       1028 if ($html =~ /$tagged_attribute/)
434             {
435 73         183 substr($html, 0, length $1) = '';
436 73         79 $in_content = 0;
437              
438             # Here we use $3 from the 2nd match to get the attributes in the original case.
439 73         247 $self -> parse_start_tag($match[0], $3, $match[2], \@stack);
440             }
441             }
442             }
443             }
444              
445             # Is it a comment?
446              
447 235 100       481 if ($in_content)
448             {
449 111         159 $s = substr($html, 0, 4);
450              
451 111 100       230 if ($s eq '<!--')
452             {
453 2           $offset = index($html, '-->');
454              
455 2 50       6 if ($offset >= 0)
456             {
457 2         9 $self -> handle_comment(substr($html, 0, ($offset + 3) ) );
458              
459 2         13 substr($html, 0, $offset + 3) = '';
460 2         3 $in_content = 0;
461             }
462             }
463             }
464              
465             # Is it a doctype?
466              
467 235 100       444 if ($in_content)
468             {
469 109         150 $s = substr($html, 0, 9);
470              
471 109 100       228 if ($s eq '<!DOCTYPE')
472             {
473 3         7 $offset = index($html, '>');
474              
475 3 50       11 if ($offset >= 0)
476             {
477 3         17 $self -> handle_doctype(substr($html, 0, ($offset + 1) ) );
478              
479 3         24 substr($html, 0, $offset + 1) = '';
480 3         14 $in_content = 0;
481             }
482             }
483             }
484              
485             # Is it an XML declaration?
486              
487 235 100 100     5767 if ($self -> xhtml && $in_content)
488             {
489 30         271 $s = substr($html, 0, 5);
490              
491 30 100       67 if ($s eq '<?xml')
492             {
493 2         5 $offset = index($html, '?>');
494              
495 2 50       7 if ($offset >= 0)
496             {
497 2         14 $self -> handle_xml_declaration(substr($html, 0, ($offset + 2) ) );
498              
499 2         19 substr($html, 0, $offset + 2) = '';
500 2         4 $in_content = 0;
501             }
502             }
503             }
504              
505 235 100       1661 if ($in_content)
506             {
507 104         169 $offset = index($html, '<');
508              
509 104 100       212 if ($offset < 0)
510             {
511 7         22 $self -> handle_content($html);
512              
513 7         40 $html = '';
514             }
515             else
516             {
517 97         303 $self -> handle_content(substr($html, 0, $offset) );
518              
519 97         634 substr($html, 0, $offset) = '';
520             }
521             }
522             }
523             else
524             {
525 0         0 my($re) = "(.*)<\/$stack[$#stack]\[^>]*>";
526              
527             # lc() is needed because only lc tag names are pushed onto the stack.
528              
529 0 0       0 if (lc($html) =~ /$re/s)
530             {
531 0         0 my($text) = $1;
532 0         0 $text =~ s/<!--(.*?)-->/$1/g;
533 0         0 $text =~ s/<!\[CDATA\[(.*?)]]>/$1/g;
534              
535 0         0 $self -> handle_content($text);
536             }
537              
538 0         0 $self -> parse_end_tag($stack[$#stack], \@stack);
539             }
540              
541 235 100       527 if ($html eq $original)
542             {
543 1         2 my($msg) = 'Parse error. ';
544 1         5 my($parent) = $self -> current_node -> getParent;
545              
546 1         5 my($metadata);
547              
548 1 50 33     20 if ($parent && $parent -> can('getNodeValue') )
549             {
550 1         4 $metadata = $parent -> getNodeValue;
551 1         7 $msg .= "Parent tag: <$$metadata{'name'}>. ";
552             }
553              
554 1         5 $metadata = $self -> current_node -> getNodeValue;
555 1         12 $msg .= "Current tag: <$$metadata{'name'}>. Next 100 chars: " . substr($html, 0, 100);
556              
557 1         11 die "$msg\n";
558             }
559              
560 234         631 $original = $html;
561             }
562              
563             # Clean up any remaining tags.
564              
565 8         32 $self -> parse_end_tag('', \@stack);
566              
567             # Return the invocant to allow method chaining.
568              
569 8         47 return $self;
570              
571             } # End of parse.
572              
573             # -----------------------------------------------
574              
575             sub parse_end_tag
576             {
577 59     59 0 110 my($self, $tag_name, $stack) = @_;
578 59         82 $tag_name = lc $tag_name;
579              
580             # Find the closest opened tag of the same name.
581              
582 59         69 my($pos);
583              
584 59 100       165 if ($tag_name)
585             {
586 51         154 for ($pos = $#$stack; $pos >= 0; $pos--)
587             {
588 51 50       136 last if ($$stack[$pos] eq $tag_name);
589             }
590             }
591             else
592             {
593 8         14 $pos = 0;
594             }
595              
596 59 50       136 if ($pos >= 0)
597             {
598             # Close all the open tags, up the stack.
599              
600 59         141 my($count) = 0;
601              
602 59         157 for (my($i) = $#$stack; $i >= $pos; $i--)
603             {
604 51         66 $count++;
605              
606 51         123 $self -> handle_end_tag($$stack[$i]);
607             }
608              
609             # Remove the open elements from the stack.
610             # Does not work: $#$stack = $pos. Could use splice().
611              
612 59         230 pop @$stack for ($count);
613             }
614              
615             } # End of parse_end_tag.
616              
617             # -----------------------------------------------
618              
619             sub parse_file
620             {
621 0     0 1 0 my($self, $input_file_name, $output_file_name) = @_;
622 0   0     0 $input_file_name ||= $self -> input_file;
623 0   0     0 $output_file_name ||= $self -> output_file;
624              
625 0         0 $self -> input_file($input_file_name);
626 0         0 $self -> output_file($output_file_name);
627 0         0 $self -> log("Reading $input_file_name");
628              
629 0 0       0 open(INX, $input_file_name) || die "Can't open($input_file_name): $!\n";
630 0         0 my($html);
631 0         0 read(INX, $html, -s INX);
632 0         0 close INX;
633              
634 0 0       0 die "Can't read($input_file_name): $!\n" if (! defined $html);
635              
636 0         0 $self -> log('Parsing');
637              
638 0         0 $self -> parse($html);
639              
640 0         0 $self -> log('Traversing');
641              
642 0         0 $self -> traverse($self -> root);
643              
644 0         0 $self -> log("Writing $output_file_name");
645              
646 0 0       0 open(OUT, "> $output_file_name") || die "Can't open(> $output_file_name): $!\n";
647 0         0 print OUT $self -> result;
648 0         0 close OUT;
649              
650             # Return the invocant to allow method chaining.
651              
652 0         0 return $self;
653              
654             } # End of parse_file.
655              
656             # -----------------------------------------------
657              
658             sub parse_start_tag
659             {
660 73     73 0 165 my($self, $tag_name, $attributes, $unary, $stack) = @_;
661 73         98 $tag_name = lc $tag_name;
662              
663 73 100       89 if (${$self -> block}{$tag_name})
  73         286  
664             {
665 23   33     74 for (; $#$stack >= 0 && ${$self -> inline}{$$stack[$#$stack]};)
  23         123  
666             {
667 0         0 $self -> parse_end_tag($$stack[$#$stack], $stack);
668             }
669             }
670              
671 73 50 66     95 if (${$self -> self_close}{$tag_name} && ($$stack[$#$stack] eq $tag_name) )
  73         302  
672             {
673 0         0 $self -> parse_end_tag($tag_name, $stack);
674             }
675              
676 73   66     89 $unary = ${$self -> empty}{$tag_name} || $unary;
677              
678 73 100       217 push @$stack, $tag_name if (! $unary);
679              
680 73         176 $self -> handle_start_tag($tag_name, $attributes, $unary);
681              
682             } # End of parse_start_tag.
683              
684             # -----------------------------------------------
685              
686             sub _set_tagged_attribute
687             {
688 2     2   6 my($self, $new, $old) = @_;
689              
690 2 50       7 if ($new)
691             {
692 2         71 $self -> tagged_attribute
693             (
694             # Compared to the non-XHTML re, this has an extra ':' in the first [].
695              
696             q#^(<(\w+)((?:\s+[-:\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
697             );
698             }
699             else
700             {
701 0         0 $self -> tagged_attribute
702             (
703             q#^(<(\w+)((?:\s+[-\w]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>)#
704             );
705             }
706              
707             } # End of _set_tagged_attribute.
708              
709             # -----------------------------------------------
710              
711             sub traverse
712             {
713 51     51 1 86 my($self, $node) = @_;
714 51         131 my(@child) = $node -> getAllChildren;
715 51         333 my($metadata) = $node -> getNodeValue;
716 51         285 my($content) = $$metadata{'content'};
717 51         77 my($name) = $$metadata{'name'};
718              
719             # Special check to avoid printing '<root>' when we still need to output
720             # the content of the root, e.g. the DOCTYPE.
721              
722 51 100       255 $self -> result($self -> result . "<$name$$metadata{'attributes'}>") if ($name ne 'root');
723              
724 51         62 my($index);
725             my($s);
726              
727 51         93 for $index (0 .. $#child)
728             {
729 45 100 100     303 $self -> result($self -> result . ($index <= $#$content && defined($$content[$index]) ? $$content[$index] : '') );
730 45         198 $self -> traverse($child[$index]);
731             }
732              
733             # Output the content after the last child node has been closed,
734             # but before the current node is closed.
735              
736 51         81 $index = $#child + 1;
737              
738 51 100 66     330 $self -> result($self -> result . ($index <= $#$content && defined($$content[$index]) ? $$content[$index] : '') );
739 51 100 100     60 $self -> result($self -> result . "</$name>") if (! ${$self -> empty}{$name} && ($name ne 'root') );
  51         370  
740              
741             # Return the invocant to allow method chaining.
742              
743 51         164 return $self;
744              
745             } # End of traverse.
746              
747             # -----------------------------------------------
748              
749             1;
750              
751             =head1 NAME
752              
753             HTML::Parser::Simple - Parse nice HTML files without needing a compiler
754              
755             =head1 Synopsis
756              
757             #!/usr/bin/env perl
758              
759             use strict;
760             use warnings;
761              
762             use HTML::Parser::Simple;
763              
764             # -------------------------
765              
766             # Method 1:
767              
768             my($p) = HTML::Parser::Simple -> new
769             (
770             input_file => 'data/s.1.html',
771             output_file => 'data/s.2.html',
772             );
773              
774             $p -> parse_file;
775              
776             # Method 2:
777              
778             my($p) = HTML::Parser::Simple -> new;
779              
780             $p -> parse_file('data/s.1.html', 'data/s.2.html');
781              
782             # Method 3:
783              
784             my($p) = HTML::Parser::Simple -> new;
785              
786             print $p -> parse('...') -> traverse($p -> root) -> result;
787              
788             Of course, these can be abbreviated by using method chaining. E.g. Method 2 could be:
789              
790             HTML::Parser::Simple -> new -> parse_file('data/s.1.html', 'data/s.2.html');
791              
792             See scripts/parse.html.pl and scripts/parse.xhtml.pl.
793              
794             =head1 Description
795              
796             C<HTML::Parser::Simple> is a pure Perl module.
797              
798             It parses HTML V 4 files, and generates a tree of nodes, with 1 node per HTML tag.
799              
800             The data associated with each node is documented in the L</FAQ>.
801              
802             See also L<HTML::Parser::Simple::Attributes> and L<HTML::Parser::Simple::Reporter>.
803              
804             =head1 Distributions
805              
806             This module is available as a Unix-style distro (*.tgz).
807              
808             See L<http://savage.net.au/Perl-modules.html> for details.
809              
810             See L<http://savage.net.au/Perl-modules/html/installing-a-module.html> for
811             help on unpacking and installing.
812              
813             =head1 Constructor and initialization
814              
815             new(...) returns an object of type C<HTML::Parser::Simple>.
816              
817             This is the class constructor.
818              
819             Usage: C<< HTML::Parser::Simple -> new >>.
820              
821             This method takes a hash of options.
822              
823             Call C<< new() >> as C<< new(option_1 => value_1, option_2 => value_2, ...) >>.
824              
825             Available options (each one of which is also a method):
826              
827             =over 4
828              
829             =item o input_file => $a_file_name
830              
831             This takes the file name, including the path, of the input file.
832              
833             Default: '' (the empty string).
834              
835             =item o output_file => $a_file_name
836              
837             This takes the file name, including the path, of the output file.
838              
839             Default: '' (the empty string).
840              
841             =item o verbose => $Boolean
842              
843             This takes either a 0 or a 1.
844              
845             Write more or less progress messages.
846              
847             Default: 0.
848              
849             =item o xhtml => $Boolean
850              
851             This takes either a 0 or a 1.
852              
853             0 means do not accept an XML declaration, such as <?xml version="1.0" encoding="UTF-8"?>
854             at the start of the input file, and some other XHTML features, explained next.
855              
856             1 means accept XHTML input.
857              
858             Default: 0.
859              
860             The only XHTML changes to this code, so far, are:
861              
862             =over 4
863              
864             =item o Accept the XML declaration
865              
866             E.g.: <?xml version="1.0" encoding="UTF-8"?>.
867              
868             =item o Accept attribute names containing the ':' char
869              
870             E.g.: <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">.
871              
872             =back
873              
874             =back
875              
876             =head1 Methods
877              
878             =head2 block()
879              
880             Returns a hashref where the keys are the names of block-level HTML tags.
881              
882             The corresponding values in the hashref are just 1.
883              
884             Typical keys: address, form, p, table, tr.
885              
886             Note: Some keys, e.g. tr, are also returned by L</self_close()>.
887              
888             =head2 current_node()
889              
890             Returns the L<Tree::Simple> object which the parser calls the current node.
891              
892             =head2 depth()
893              
894             Returns the nesting depth of the current tag.
895              
896             The method is just here in case you need it.
897              
898             =head2 empty()
899              
900             Returns a hashref where the keys are the names of HTML tags of type empty.
901              
902             The corresponding values in the hashref are just 1.
903              
904             Typical keys: area, base, input, wbr.
905              
906             =head2 inline()
907              
908             Returns a hashref where the keys are the names of HTML tags of type inline.
909              
910             The corresponding values in the hashref are just 1.
911              
912             Typical keys: a, em, img, textarea.
913              
914             =head2 input_file($in_file_name)
915              
916             Gets or sets the input file name used by L</parse_file($input_file_name, $output_file_name)>.
917              
918             Note: The parameters passed in to L</parse_file($input_file_name, $output_file_name)>, take precedence over the
919             I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over the internal values set with
920             C<< input_file($in_file_name) >> and C<< output_file($out_file_name) >>.
921              
922             'input_file' is a parameter to L</new()>. See L</Constructor and initialization> for details.
923              
924             =head2 log($msg)
925              
926             Print $msg to STDERR if C<< new() >> was called as C<< new(verbose => 1) >>, or if C<< $p -> verbose(1) >>
927             was called.
928              
929             Otherwise, print nothing.
930              
931             =head2 new()
932              
933             This is the constructor. See L for details.
934              
935             =head2 node_type()
936              
937             Returns the type of the most recently created node, I<global>, I<head>, or I<body>.
938              
939             See the first question in the L for details.
940              
941             =head2 output_file($out_file_name)
942              
943             Gets or sets the output file name used by L.
944              
945             Note: The parameters passed in to L</parse_file($input_file_name, $output_file_name)>, take precedence over the
946             I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over the internal values set with
947             C<< input_file($in_file_name) >> and C<< output_file($out_file_name) >>.
948              
949             'output_file' is a parameter to L. See L for details.
950              
951             =head2 parse($html)
952              
953             Returns the invocant. Thus C<< $p -> parse >> returns $p. This allows for method chaining. See the L.
954              
955             Parses the string of HTML in $html, and builds a tree of nodes.
956              
957             After calling C<< $p -> parse($html) >>, you must call C<< $p -> traverse($p -> root) >> before calling
958             C<< $p -> result >>.
959              
960             Alternately, use C<< $p -> parse_file >>, which calls all these methods for you.
961              
962             Note: C<< parse() >> may be called directly or via C<< parse_file() >>.
963              
964             =head2 parse_file($input_file_name, $output_file_name)
965              
966             Returns the invocant. Thus C<< $p -> parse_file >> returns $p. This allows for method chaining. See the L.
967              
968             Parses the HTML in the input file, and writes the result to the output file.
969              
970             C<< parse_file() >> calls L and L, using C<< $p -> root >> for $node.
971              
972             Note: The parameters passed in to C<< parse_file($input_file_name, $output_file_name) >>, take precedence over the
973             I<input_file> and I<output_file> parameters passed in to C<< new() >>, and over the internal values set with
974             C<< input_file($in_file_name) >> and C<< output_file($out_file_name) >>.
975              
976             Lastly, the parameters passed in to C<< parse_file($input_file_name, $output_file_name) >> are used to update
977             the internal values set with the I<input_file> and I<output_file> parameters passed in to C<< new() >>,
978             or set with calls to C<< input_file($in_file_name) >> and C<< output_file($out_file_name) >>.
979              
980             =head2 result()
981              
982             Returns the string which is the result of the parse.
983              
984             See scripts/parse.html.pl.
985              
986             =head2 root()
987              
988             Returns the L<Tree::Simple> object which the parser calls the root of the tree of nodes.
989              
990             =head2 self_close()
991              
992             Returns a hashref where the keys are the names of HTML tags of type self close.
993              
994             The corresponding values in the hashref are just 1.
995              
996             Typical keys: dd, dt, p, tr.
997              
998             Note: Some keys, e.g. tr, are also returned by L</block()>.
999              
1000             =head2 tagged_attribute()
1001              
1002             Returns a string to be used as a regexp, to capture tags and their optional attributes.
1003              
1004             It does not return qr/$s/; it just returns $s.
1005              
1006             This regexp takes one of two forms, depending on the state of the I<xhtml> option. See L.
1007              
1008             The regexp has four (4) sets of capturing parentheses:
1009              
1010             =over 4
1011              
1012             =item o 1 for the whole tag and attribute and trailing / combination
1013              
1014             E.g.: <(....)>
1015              
1016             =item o 1 for the tag itself
1017              
1018             E.g.: <(img)...>
1019              
1020             =item o 1 for the tag's optional attributes
1021              
1022             E.g.: <img (src="/graph.svg" alt="A graph")>
1023              
1024             =item o 1 for the tag's optional trailing /
1025              
1026             E.g.: <img src="/graph.svg" alt="A graph"(/)>
1027              
1028             =back
1029              
1030             =head2 traverse($node)
1031              
1032             Returns the invocant. Thus C<< $p -> traverse >> returns $p. This allows for method chaining. See the L.
1033              
1034             Traverses the tree of nodes, starting at $node.
1035              
1036             You normally call this as C<< $p -> traverse($p -> root) >>, to ensure all nodes are visited.
1037              
1038             See the L for sample code.
1039              
1040             Or, see scripts/traverse.file.pl, which uses L, and calls C<< traverse($node) >>
1041             via L.
1042              
1043             =head2 verbose($Boolean)
1044              
1045             Gets or sets the verbose parameter.
1046              
1047             'verbose' is a parameter to L. See L for details.
1048              
1049             =head2 xhtml($Boolean)
1050              
1051             Gets or sets the xhtml parameter.
1052              
1053             If you call this after object creation, the I feature of L is used to call
1054             L so as to correctly set the regexp which recognises xhtml.
1055              
1056             'xhtml' is a parameter to L. See L for details.
1057              
1058             =head1 FAQ
1059              
1060             =head2 What is the format of the data stored in each node of the tree?
1061              
1062             The data of each node is a hash ref. The keys/values of this hash ref are:
1063              
1064             =over 4
1065              
1066             =item o attributes
1067              
1068             This is the string of HTML attributes associated with the HTML tag.
1069              
1070             Attributes are stored in lower-case.
1071              
1072             So, <table align = 'center' summary = 'body'> will have an attributes string of
1073             " align = 'center' summary = 'body'".
1074              
1075             Note the leading space.
1076              
1077             =item o content
1078              
1079             This is an arrayref of bits and pieces of content.
1080              
1081             Consider this fragment of HTML:
1082              
1083                 <p>I did <span>not</span> say I <i>liked</i> debugging.</p>
1084              
1085             When parsing 'I did ', the number of child nodes (of <p>) is 0, since <span> has not yet been detected.
1086              
1087             So, 'I did ' is stored in the 0th element of the arrayref belonging to <p>.
1088              
1089             Likewise, 'not' is stored in the 0th element of the arrayref belonging to the node <span>.
1090              
1091             Next, ' say I ' is stored in the 1st element of the arrayref belonging to <p>,
1092             because it follows the 1st child node (<span>).
1093              
1094             Likewise, ' debugging' is stored in the 2nd element of the arrayref belonging to <p>.
1095              
1096             This way, the input string can be reproduced by successively outputting the elements of the arrayref of content
1097             interspersed with the contents of the child nodes (processed recursively).
1098              
1099             Note: If you are processing this tree, never forget that there can be content after the last child node has been closed,
1100             but before the current node is closed.
1101              
1102             Note: The DOCTYPE declaration is stored as the 0th element of the content of the root node.
1103              
1104             =item o depth
1105              
1106             The nesting depth of the tag within the document.
1107              
1108             The root is at depth 0, '<html>' is at depth 1, '<head>' and '<body>' are at depth 2, and so on.
1109              
1110             It's just there in case you need it.
1111              
1112             =item o name
1113              
1114             So, the tag '<html>' will mean the name is 'html'.
1115              
1116             Tag names are stored in lower-case.
1117              
1118             The root of the tree is called 'root', and holds the DOCTYPE, if any, as content.
1119              
1120             The root has the node 'html' as the only child, of course.
1121              
1122             =item o node_type
1123              
1124             This holds 'global' before '<head>' and between '</head>' and '<body>', and after '</body>'.
1125              
1126             It holds 'head' for all nodes from '<head>' to '</head>', and holds 'body' from '<body>' to '</body>'.
1127              
1128             It's just there in case you need it.
1129              
1130             =back
1131              
1132             =head2 How are tags and attributes handled?
1133              
1134             Tags are stored in lower-case, in a tree managed by L<Tree::Simple>.
1135              
1136             Attributes are stored in the same case as in the original HTML.
1137              
1138             The root of the tree is returned by L</root()>.
1139              
1140             =head2 How are HTML comments handled?
1141              
1142             They are treated as content. This includes the prefix '<!--' and the suffix '-->'.
1143              
1144             =head2 How is DOCTYPE handled?
1145              
1146             It is treated as content belonging to the root of the tree.
1147              
1148             =head2 How is the XML declaration handled?
1149              
1150             It is treated as content belonging to the root of the tree.
1151              
1152             =head2 Does this module handle all HTML pages?
1153              
1154             No, never.
1155              
1156             =head2 Which versions of HTML does this module handle?
1157              
1158             Up to V 4.
1159              
1160             =head2 What do I do if this module does not handle my HTML page?
1161              
1162             Make yourself a nice cup of tea, and then fix your page.
1163              
1164             =head2 Does this validate the HTML input?
1165              
1166             No.
1167              
1168             For example, if you feed in a HTML page without the title tag, this module does not care.
1169              
1170             =head2 How do I view the output HTML?
1171              
1172             There are various ways.
1173              
1174             =over 4
1175              
1176             =item o See scripts/parse.html.pl
1177              
1178             =item o By installing HTML::Revelation, of course!
1179              
1180             Sample output:
1181              
1182             L.
1183              
1184             =back
1185              
1186             =head2 How do I test this module (or my file)?
1187              
1188             Preferably, see the previous question, or...
1189              
1190             Suggested steps:
1191              
1192             Note: There are quite a few files involved. Proceed with caution.
1193              
1194             =over 4
1195              
1196             =item o Select a HTML file to test
1197              
1198             Call this input.html.
1199              
1200             =item o Run input.html thru reveal.pl
1201              
1202             Reveal.pl ships with HTML::Revelation.
1203              
1204             Call the output file output.1.html.
1205              
1206             =item o Run input.html thru parse.html.pl
1207              
1208             parse.html.pl ships with HTML::Parser::Simple.
1209              
1210             Call the output file parsed.html.
1211              
1212             =item o Run parsed.html thru reveal.pl
1213              
1214             Call the output file output.2.html.
1215              
1216             =item o Compare output.1.html and output.2.html
1217              
1218             If they match, or even if they don't match, you're finished.
1219              
1220             =back
1221              
1222             =head2 Will you implement a 'quirks' mode to handle my special HTML file?
1223              
1224             No, never.
1225              
1226             Help with quirks: L.
1227              
1228             =head2 Is there anything I should be aware of?
1229              
1230             Yes. If your HTML file is not nice, the interpretation of tag nesting will not match
1231             your preconceptions.
1232              
1233             In such cases, do not seek to fix the code. Instead, fix your (faulty) preconceptions, and fix your HTML file.
1234              
1235             The 'a' tag, for example, is defined to be an inline tag, but the 'div' tag is a block-level tag.
1236              
1237             I don't define 'a' to be inline, others do, e.g. L and hence HTML::Tagset.
1238              
1239             Inline means:
1240              
1241                 <a href = "#NAME"><div>NAME</div></a>
1242              
1243             will I<not> be parsed as an 'a' containing a 'div'.
1244              
1245             The 'a' tag will be closed before the 'div' is opened. So, the result will look like:
1246              
1247                 <a href = "#NAME"></a><div>NAME</div>
1248              
1249             To achieve what was presumably intended, use 'span':
1250              
1251                 <a href = "#NAME"><span>NAME</span></a>
1252              
1253             Some people (*cough* *cough*) have had to redo their entire websites due to this very problem.
1254              
1255             Of course, this is just one of a vast set of possible problems.
1256              
1257             You have been warned.
1258              
1259             =head2 Why did you use Tree::Simple but not Tree or Tree::Fast or Tree::DAG_Node?
1260              
1261             During testing, Tree::Fast crashed, so I replaced it with Tree and everything worked. Spooky.
1262              
1263             Late news: Tree does not cope with an arrayref stored in the metadata, so I've switched to Tree::DAG_Node.
1264              
1265             Stop press: As an experiment I switched to Tree::Simple. Since it also works I'll just keep using it.
1266              
1267             =head2 Why isn't this module called HTML::Parser::PurePerl?
1268              
1269             =over 4
1270              
1271             =item o The API
1272              
1273             That name sounds like a pure Perl version of the same API as used by HTML::Parser.
1274              
1275             But the API's are not, and are not meant to be, compatible.
1276              
1277             =item o The tie-in
1278              
1279             Some people might falsely assume HTML::Parser can automatically fall back to HTML::Parser::PurePerl in the absence of a compiler.
1280              
1281             =back
1282              
1283             =head2 How do I output my own stuff while traversing the tree?
1284              
1285             =over 4
1286              
1287             =item o The sophisticated way
1288              
1289             As always with OO code, sub-class! In this case, you write a new version of the traverse() method.
1290              
1291             See L, for example. It overrides L.
1292              
1293             =item o The crude way
1294              
1295             Alternately, implement another method in your sub-class, e.g. process(), which recurses like traverse().
1296             Then call parse() and process().
1297              
1298             =back
1299              
1300             =head2 Is the code on github?
1301              
1302             Yes. See: git://github.com/ronsavage/html--parser--simple.git
1303              
1304             =head2 How is the source formatted?
1305              
1306             I edit with UltraEdit. That means, in general, leading 4-space tabs.
1307              
1308             All vertical alignment within lines is done manually with spaces.
1309              
1310             Perl::Critic is off the agenda.
1311              
1312             =head2 Why did you choose Moos?
1313              
1314             For this year's (2012) Google Code-in, I had a quick look at 122 class-building classes, and decided
1315             L was suitable, given it is pure-Perl and has the trigger feature I needed.
1316              
1317             See L.
1318              
1319             =head1 Credits
1320              
1321             This Perl HTML parser has been converted from a JavaScript one written by John Resig.
1322              
1323             L.
1324              
1325             Well done John!
1326              
1327             Note also the comments published here:
1328              
1329             L.
1330              
1331             =head1 Author
1332              
1333             C<HTML::Parser::Simple> was written by Ron Savage I<E<lt>ron@savage.net.auE<gt>> in 2009.
1334              
1335             Home page: L.
1336              
1337             =head1 Copyright
1338              
1339             Australian copyright (c) 2009 Ron Savage.
1340              
1341             All Programs of mine are 'OSI Certified Open Source Software';
1342             you can redistribute them and/or modify them under the terms of
1343             The Artistic License, a copy of which is available at:
1344             http://www.opensource.org/licenses/index.html
1345              
1346             =cut