File Coverage

blib/lib/HTML/HTML5/Parser.pm
Criterion Covered Total %
statement 16 18 88.8
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 22 24 91.6


line stmt bran cond sub pod time code
1             package HTML::HTML5::Parser;
2              
3             ## skip Test::Tabs
4 10     10   505889 use 5.008001;
  10         45  
  10         537  
5 10     10   68 use strict;
  10         172  
  10         483  
6 10     10   56 use warnings;
  10         27  
  10         659  
7              
8             our $AUTOLOAD;
9             our $VERSION = '0.301';
10              
11 10     10   58 use Carp;
  10         27  
  10         1224  
12 10     10   15550 use HTML::HTML5::Parser::Error;
  10         111  
  10         317  
13 10     10   20978 use HTML::HTML5::Parser::TagSoupParser;
  0            
  0            
14             use Scalar::Util qw(blessed);
15             use URI::file;
16             use XML::LibXML;
17              
# Compile-time guard: refuse to load when the installed XML::LibXML
# reports a version starting with 1.91 or 1.92.
BEGIN
{
	if (XML::LibXML->VERSION =~ /^1\.9[12]/)
	{
		croak "Please upgrade to XML::LibXML 1.94";
	}
}
22              
# Constructor. All key/value arguments are forwarded unchanged to
# HTML::HTML5::Parser::TagSoupParser->new; the new object starts out
# with an empty error list.
sub new
{
	my ($class, %p) = @_;

	my %state = (
		errors => [],
		parser => HTML::HTML5::Parser::TagSoupParser->new(%p),
	);

	return bless \%state, $class;
}
33              
# Fetch a document and parse it.
#
# Arguments:
#   $file - a URI object, or a string. A string of the form "scheme:rest"
#           (no whitespace) is turned into a URI; any other string is
#           treated as a local file name.
#   $opts - optional hashref. Keys read here: user_agent,
#           ignore_http_response_code, force_html, xml_catalogue.
#           Keys written here: response, parser_used.
#
# Returns the document built by XML::LibXML when the response's
# Content-Type mentions "xml" (and force_html is not set), otherwise the
# result of parse_string. Croaks if the fetch was not successful, unless
# ignore_http_response_code is set.
sub parse_file
{
	require HTML::HTML5::Parser::UA;

	my $self = shift;
	my $file = shift;
	my $opts = shift || {};

	unless (blessed($file) and $file->isa('URI'))
	{
		if ($file =~ /^[a-z0-9_\.-]+:\S+$/i)
			{ $file = URI->new($file); }
		else
			{ $file = URI::file->new_abs($file); }
	}

	my $response = HTML::HTML5::Parser::UA->get($file, $opts->{user_agent});
	croak "HTTP response code was not 200 OK. (Set \$opts{ignore_http_response_code} to ignore this error.)"
		unless ($response->{success} || $opts->{ignore_http_response_code});

	my $content = $response->{decoded_content};

	# A response may carry no Content-Type header; treat that as "not XML"
	# rather than matching a pattern against undef (which warns).
	my $c_type = $response->{headers}{'content-type'};
	$c_type = '' unless defined $c_type;

	$opts->{'response'} = $response;

	if ($c_type =~ /xml/i and not $opts->{'force_html'})
	{
		$opts->{'parser_used'} = 'XML::LibXML::Parser';
		my $xml_parser = XML::LibXML->new;
		$xml_parser->validation(0);
		$xml_parser->recover(2);
		# NOTE(review): $response is read as a plain hashref above but is
		# called as an object here - confirm the UA response provides base().
		$xml_parser->base_uri($response->base);

		# Only consult the catalogue when one was actually supplied;
		# applying -r to undef warns.
		my $catalogue = $opts->{'xml_catalogue'};
		$xml_parser->load_catalog($catalogue)
			if defined $catalogue && -r $catalogue;

		return $xml_parser->parse_string($content);
	}

	return $self->parse_string($content, $opts);
}
*parse_html_file = \&parse_file;
74              
# Read everything from a file handle and parse it with parse_string.
#
# Arguments:
#   $handle - a readable file handle.
#   $opts   - optional hashref, passed through to parse_string.
#
# Returns whatever parse_string returns.
sub parse_fh
{
	my $self   = shift;
	my $handle = shift;
	my $opts   = shift || {};

	# Read into a lexical: a bare while (<$handle>) assigns to the global
	# $_ without localising it and would clobber the caller's $_.
	my $string = '';
	while (defined(my $line = <$handle>))
	{
		$string .= $line;
	}

	return $self->parse_string($string, $opts);
}
*parse_html_fh = \&parse_fh;
90              
# Parse an in-memory HTML string into a new XML::LibXML::Document.
#
# Arguments:
#   $text - the markup to parse.
#   $opts - optional hashref. The 'encoding' key is read (and passed on even
#           when undef); 'parser_used' is set to 'HTML::HTML5::Parser'.
#
# Resets the object's error list, then collects one
# HTML::HTML5::Parser::Error per problem reported by the tag soup parser,
# also passing each one to the error handler if one is set.
#
# The input is always handed to parse_byte_string. The previous test,
# "defined $opts->{'encoding'}||1", was constant-true, so its
# parse_char_string alternative could never run and has been removed.
sub parse_string
{
	my $self = shift;
	my $text = shift;
	my $opts = shift || {};

	$self->{'errors'} = [];
	$opts->{'parser_used'} = 'HTML::HTML5::Parser';
	my $dom = XML::LibXML::Document->createDocument;

	# XXX AGAIN DO THIS TO STOP ENORMOUS MEMORY LEAKS
	# (the callback uses these lexical copies rather than $self;
	# presumably so that the closure does not hold on to the parser object)
	my ($errh, $errors) = @{$self}{qw(error_handler errors)};
	$self->{parser}->parse_byte_string(
		$opts->{'encoding'}, $text, $dom,
		sub {
			my $err = HTML::HTML5::Parser::Error->new(@_);
			$errh->($err) if $errh;
			push @$errors, $err;
		});

	return $dom;
}
*parse_html_string = \&parse_string;
125              
# Ancestor chains used by parse_balanced_chunk. For each supported
# "within" name the value lists the elements (outermost first) that are
# opened before the chunk; the chunk is placed inside the last one.
# A value of undef marks a name that parse_balanced_chunk rejects.
# TODO: noembed, noframes, noscript
my %within = (
	html        => [qw/html/],
	frameset    => [qw/html frameset/],
	frame       => [qw/html frameset frame/],
	head        => [qw/html head/],
	title       => [qw/html head title/],
	style       => [qw/html head style/],
	(map { $_ => undef }
		qw/base link meta basefont bgsound/),
	body        => [qw/html body/],
	script      => [qw/html body script/],
	div         => [qw/html body div/],
	(map { $_ => [qw/html body div/, $_] }
		qw/a abbr acronym address applet area article aside big blockquote
			button center code details dir dl em fieldset figure font
			footer form h1 h2 h3 h4 h5 h6 header hgroup i iframe
			listing marquee menu nav nobr object ol p plaintext pre
			ruby s section small strike strong tt u ul xmp/),
	(map { $_ => undef }
		qw/br col command datagrid embed hr img input keygen
			param wbr/),
	dd          => [qw/html body dl dd/],
	# This entry was previously keyed "dd" as well, which overwrote the
	# line above and left "dt" unsupported.
	dt          => [qw/html body dl dt/],
	figcaption  => [qw/html body figure/],
	li          => [qw/html body ul li/],
	ul__li      => [qw/html body ul li/],
	ol__li      => [qw/html body ol li/],
	optgroup    => [qw/html body form div select/],
	option      => [qw/html body form div select/],
	rp          => [qw/html body div ruby/],
	rt          => [qw/html body div ruby/],
	select      => [qw/html body form div select/],
	summary     => [qw/html body div details/],
	table       => [qw/html body table/],
	(map { $_ => [qw/html body table/, $_] }
		qw/thead tfoot tbody tr caption colgroup/),
	(map { $_ => [qw/html body table tbody tr/, $_] }
		qw/td th/),
	textarea    => [qw/html body form div textarea/],
);
167              
# Parse an HTML fragment as if it appeared inside a given element.
#
# Arguments:
#   $chunk - the markup fragment.
#   $o     - optional hashref of options:
#              within        - element name to parse inside (default 'div')
#              force_within  - like within, but outliers are not returned
#              mark_outliers - flag outlier elements with the attribute
#                              data-perl-html-html5-parser-outlier
#              as            - 'list' to get a node list in scalar context
#              debug         - dump the synthetic document and resulting DOM
#
# Returns, in list context, the resulting nodes; in scalar context an
# XML::LibXML::DocumentFragment, or an XML::LibXML::NodeList when
# as => 'list'. Croaks if the requested element is not supported.
sub parse_balanced_chunk
{
	my ($self, $chunk, $o) = @_;
	my %options = %{ $o || {} };

	$options{as} = 'default' unless defined $options{as};

	my $w = $options{force_within} || $options{within} || 'div';
	my $ancestors = $within{ lc $w };
	croak "Cannot parse chunk as if within $w."
		if !defined $ancestors;

	# The last entry receives the chunk; all entries before it are opened
	# first. (The old "$n ? ... : ()" test treated a chain with exactly
	# one ancestor as having none.)
	my @a      = @$ancestors;
	my $parent = pop @a;

	# Tag the receiving element with a random id so that it can be found
	# again in the parsed document. Closing tags are deliberately omitted.
	my $uniq = sprintf('rand_id_%09d', int rand 1_000_000_000);
	my $document =
		"<!doctype html>\n".
		(join q{}, map { "<$_>" } @a).
		"<$parent id='$uniq'>".
		$chunk;

	my $dom = $self->parse_html_string($document);
	$parent = $dom->findnodes("//*[\@id='$uniq']")->get_node(1);

	if ($options{debug})
	{
		if (exists &Test::More::diag)
		{
			Test::More::diag($document);
			Test::More::diag($dom->toString);
		}
		else
		{
			warn $document."\n";
			warn $dom->toString."\n";
		}
	}

	# Fail with a clear message instead of calling a method on undef.
	croak "Cannot parse chunk as if within $w."
		unless $parent;

	my @results = $parent->childNodes;

	unless ($options{force_within})
	{
		# Walk from the receiving element up to the document. At each
		# level, collect the siblings on either side ("outliers"):
		# following siblings are appended, preceding ones prepended.
		for (my $level = $parent; $level; $level = $level->parentNode)
		{
			for my $walk ([nextSibling => 0], [previousSibling => 1])
			{
				my ($step, $prepend) = @$walk;

				for (my $sibling = $level->$step; $sibling; $sibling = $sibling->$step)
				{
					next if $sibling->nodeName =~ /^(head|body)$/;

					$sibling->setAttribute('data-perl-html-html5-parser-outlier', 1)
						if $options{mark_outliers}
						&& $sibling->can('setAttribute');

					if ($prepend)
						{ unshift @results, $sibling; }
					else
						{ push @results, $sibling; }
				}
			}
		}
	}

	my $frag = XML::LibXML::DocumentFragment->new;
	$frag->appendChild($_) foreach @results;

	if (lc $options{as} eq 'list')
	{
		return wantarray ? @results : XML::LibXML::NodeList->new(@results);
	}

	return wantarray ? @results : $frag;
}
256              
# XML::LibXML-style entry point: parse HTML from a location, a string or
# a file handle.
#
# May be called on the class (a parser is constructed) or on an instance.
# Arguments are key/value pairs and/or hashrefs of such pairs. Exactly one
# of 'location', 'string' or 'IO' selects the input, checked in that order.
# 'URI' is accepted but not used: base_uri is among the options this
# parser reports as permanently off.
#
# Returns the parsed document; croaks when no input source is given.
sub load_html
{
	my $class_or_self = shift;

	my %args = map { ref($_) eq 'HASH' ? (%$_) : $_ } @_;

	# The URI used to be passed on as the second argument of parse_string
	# and parse_fh. Those methods expect an options hashref there, so any
	# defined URI made them die under strict refs.
	delete $args{URI};

	my $parser = ref($class_or_self)
		? $class_or_self
		: $class_or_self->new;

	my $dom;
	if ( defined $args{location} )
		{ $dom = $parser->parse_file( "$args{location}" ) }
	elsif ( defined $args{string} )
		{ $dom = $parser->parse_string( $args{string} ) }
	elsif ( defined $args{IO} )
		{ $dom = $parser->parse_fh( $args{IO} ) }
	else
		{ croak("HTML::HTML5::Parser->load_html: specify location, string, or IO"); }

	return $dom;
}
280              
# Try XML::LibXML->load_xml with the given arguments first; when that
# fails to produce an object (for instance because it threw), fall back to
# load_html with the same arguments.
sub load_xml
{
	my $self = shift;

	my $dom = eval { XML::LibXML->load_xml(@_) };
	return $dom if blessed($dom);

	return $self->load_html(@_);
}
291              
# Catch-all for XML::LibXML parser methods that this class does not
# define itself.
#
#   - push-parser methods and a few others croak as unimplemented;
#   - options that are permanently on return 1 (carping when a caller
#     tries to switch one off);
#   - options that are permanently off return 0 (carping when a caller
#     tries to switch one on);
#   - any other name produces a carp.
sub AUTOLOAD
{
	my $self = shift;
	my $func = $AUTOLOAD;
	$func =~ s/.*://;

	# LibXML Push Parser.
	if ($func =~ /^( parse_chunk | start_push | push | finish_push )$/xi)
	{
		croak "Push parser ($func) not implemented by HTML::HTML5::Parser.";
	}

	# Misc LibXML functions with no compatible interface provided.
	# (parse_balanced_chunk is a real method of this class, so it never
	# reaches AUTOLOAD and is not listed here.)
	if ($func =~ /^( parse_xml_chunk | process_?xincludes | get_last_error )$/xi)
	{
		croak "$func not implemented by HTML::HTML5::Parser.";
	}

	# Fixed options which are true.
	if ($func =~ /^( recover | recover_silently | expand_entities |
		keep_blanks | no_network )$/xi)
	{
		my $set = shift;
		if ((!$set) && defined $set)
		{
			carp "Option $func cannot be switched off.";
		}
		return 1;
	}

	# Fixed options which are false.
	# Every alternative needs its own "|": under /x the whitespace between
	# two names is ignored, so a missing bar fuses them into one name.
	if ($func =~ /^( validation | pedantic_parser | line_numbers |
		load_ext_dtd | complete_attributes | expand_xinclude |
		load_catalog | base_uri | gdome_dom | clean_namespaces )$/xi)
	{
		my $set = shift;
		if (($set) && defined $set)
		{
			carp "Option $func cannot be switched on.";
		}
		return 0;
	}

	carp "HTML::HTML5::Parser doesn't understand '$func'." if length $func;
}
338              
# Combined getter/setter for the error callback. When called with an
# argument (even undef) it is stored first; the stored value is returned.
sub error_handler
{
	my ($self, @new_value) = @_;

	if (@new_value)
	{
		$self->{error_handler} = $new_value[0];
	}

	return $self->{error_handler};
}
345              
# Return the contents of the object's error list (the element count when
# called in scalar context).
sub errors
{
	my ($self) = @_;

	return @{ $self->{errors} };
}
351              
# Look up the 'manakai_compat_mode' entry in the data the underlying
# parser holds for $node.
sub compat_mode
{
	my ($self, $node) = @_;

	my $parser = $self->{parser};
	return $parser->_data($node)->{'manakai_compat_mode'};
}
359              
# Look up the 'charset' entry in the data the underlying parser holds
# for $node.
sub charset
{
	my ($self, $node) = @_;

	my $parser = $self->{parser};
	return $parser->_data($node)->{'charset'};
}
367              
# Look up the 'DTD_PUBLIC_ID' entry in the data the underlying parser
# holds for $node.
sub dtd_public_id
{
	my ($self, $node) = @_;

	my $parser = $self->{parser};
	return $parser->_data($node)->{'DTD_PUBLIC_ID'};
}
375              
# Look up the 'DTD_SYSTEM_ID' entry in the data the underlying parser
# holds for $node.
sub dtd_system_id
{
	my ($self, $node) = @_;

	my $parser = $self->{parser};
	return $parser->_data($node)->{'DTD_SYSTEM_ID'};
}
383              
# Look up the 'DTD_ELEMENT' entry in the data the underlying parser holds
# for $node.
sub dtd_element
{
	my ($self, $node) = @_;

	my $parser = $self->{parser};
	return $parser->_data($node)->{'DTD_ELEMENT'};
}
391              
# Report where a node came from in the parsed source.
#
# Works as an instance method (node data comes from the object's parser)
# or as a class method (node data comes from
# HTML::HTML5::Parser::TagSoupParser::DATA).
#
# Scalar context: returns the 'manakai_source_line' value.
# List context:   returns (line, 'manakai_source_column' value, implied
#                 flag), where the flag defaults to 0.
sub source_line
{
	my ($self, $node) = @_;

	my $data;
	if (ref $self)
		{ $data = $self->{parser}->_data($node); }
	else
		{ $data = HTML::HTML5::Parser::TagSoupParser::DATA($node); }

	my $line = $data->{'manakai_source_line'};
	return $line unless wantarray;

	my $column  = $data->{'manakai_source_column'};
	my $implied = $data->{'implied'} || 0;
	return ($line, $column, $implied);
}
414              
# Deliberately does nothing. Defining DESTROY explicitly keeps object
# destruction from being dispatched to AUTOLOAD.
sub DESTROY
{
	return;
}
416              
417             __END__
418              
419             =pod
420              
421             =encoding utf8
422              
423             =begin stopwords
424              
425             XML::LibXML-like
426             XML::LibXML-Compatible
427             'utf-8')
428             foobar
429             doctype:
430             html
431             implictness
432              
433             =end stopwords
434              
435             =head1 NAME
436              
437             HTML::HTML5::Parser - parse HTML reliably
438              
439             =head1 SYNOPSIS
440              
441             use HTML::HTML5::Parser;
442            
443             my $parser = HTML::HTML5::Parser->new;
444             my $doc = $parser->parse_string(<<'EOT');
445             <!doctype html>
446             <title>Foo</title>
447             <p><b><i>Foo</b> bar</i>.
448             <p>Baz</br>Quux.
449             EOT
450            
451             my $fdoc = $parser->parse_file( $html_file_name );
452             my $fhdoc = $parser->parse_fh( $html_file_handle );
453              
454             =head1 DESCRIPTION
455              
456             This library is substantially the same as the non-CPAN module Whatpm::HTML.
457             Changes include:
458              
459             =over 8
460              
461             =item * Provides an XML::LibXML-like DOM interface. If you usually use XML::LibXML's DOM parser, this should be a drop-in solution for tag soup HTML.
462              
463             =item * Constructs an XML::LibXML::Document as the result of parsing.
464              
465             =item * Via bundling and modifications, removed external dependencies on non-CPAN packages.
466              
467             =back
468              
469             =head2 Constructor
470              
471             =over 8
472              
473             =item C<new>
474              
475             $parser = HTML::HTML5::Parser->new;
476             # or
477             $parser = HTML::HTML5::Parser->new(no_cache => 1);
478              
479             The constructor does nothing interesting besides take one flag
480             argument, C<no_cache =E<gt> 1>, to disable the global element metadata
481             cache. Disabling the cache is handy for conserving memory if you parse
482             a large number of documents, however, class methods such as
483             C<source_line> will not work, and must be run from an instance of
484             this parser.
485              
486             =back
487              
488             =head2 XML::LibXML-Compatible Methods
489              
490             =over
491              
492             =item C<parse_file>, C<parse_html_file>
493              
494             $doc = $parser->parse_file( $html_file_name [,\%opts] );
495            
496             This function parses an HTML document from a file or network;
497             C<$html_file_name> can be either a filename or a URL.
498              
499             Options include 'encoding' to indicate file encoding (e.g.
500             'utf-8') and 'user_agent' which should be a blessed C<LWP::UserAgent>
501             (or L<HTTP::Tiny>) object to be used when retrieving URLs.
502              
503             If requesting a URL and the response Content-Type header indicates
504             an XML-based media type (such as XHTML), XML::LibXML::Parser
505             will be used automatically (instead of the tag soup parser). The XML
506             parser can be told to use a DTD catalogue by setting the option
507             'xml_catalogue' to the filename of the catalogue.
508              
509             HTML (tag soup) parsing can be forced using the option 'force_html', even
510             when an XML media type is returned. If an options hashref was passed,
511             parse_file will set $options->{'parser_used'} to the name of the class used
512             to parse the URL, to allow the calling code to double-check which parser
513             was used afterwards.
514              
515             If an options hashref was passed, parse_file will set $options->{'response'}
516             to the HTTP::Response object obtained by retrieving the URI.
517              
518             =item C<parse_fh>, C<parse_html_fh>
519              
520             $doc = $parser->parse_fh( $io_fh [,\%opts] );
521            
522             C<parse_fh()> parses an IOREF or a subclass of C<IO::Handle>.
523              
524             Options include 'encoding' to indicate file encoding (e.g.
525             'utf-8').
526              
527             =item C<parse_string>, C<parse_html_string>
528              
529             $doc = $parser->parse_string( $html_string [,\%opts] );
530              
531             This function is similar to C<parse_fh()>, but it parses an HTML
532             document that is available as a single string in memory.
533              
534             Options include 'encoding' to indicate file encoding (e.g.
535             'utf-8').
536              
537             =item C<load_xml>, C<load_html>
538              
539             Wrappers for the parse_* functions. These should be roughly compatible with
540             the equivalently named functions in L<XML::LibXML>.
541              
542             Note that C<load_xml> first attempts to parse as real XML, falling back to
543             HTML5 parsing; C<load_html> just goes straight for HTML5.
544              
545             =item C<parse_balanced_chunk>
546              
547             $fragment = $parser->parse_balanced_chunk( $string [,\%opts] );
548              
549             This method is roughly equivalent to XML::LibXML's method of the same
550             name, but unlike XML::LibXML, and despite its name it does not require
551             the chunk to be "balanced". This method is somewhat black magic, but
552             should work, and do the proper thing in most cases. Of course, the
553             proper thing might not be what you'd expect! I'll try to keep this
554             explanation as brief as possible...
555              
556             Consider the following string:
557              
558             <b>Hello</b></td></tr> <i>World</i>
559              
560             What is the proper way to parse that? If it were found in a document like
561             this:
562              
563             <html>
564             <head><title>X</title></head>
565             <body>
566             <div>
567             <b>Hello</b></td></tr> <i>World</i>
568             </div>
569             </body>
570             </html>
571              
572             Then the document would end up equivalent to the following XHTML:
573              
574             <html>
575             <head><title>X</title></head>
576             <body>
577             <div>
578             <b>Hello</b> <i>World</i>
579             </div>
580             </body>
581             </html>
582              
583             The superfluous C<< </td></tr> >> is simply ignored. However, if it
584             were found in a document like this:
585              
586             <html>
587             <head><title>X</title></head>
588             <body>
589             <table><tbody><tr><td>
590             <b>Hello</b></td></tr> <i>World</i>
591             </td></tr></tbody></table>
592             </body>
593             </html>
594              
595             Then the result would be:
596              
597             <html>
598             <head><title>X</title></head>
599             <body>
600             <i>World</i>
601             <table><tbody><tr><td>
602             <b>Hello</b></td></tr>
603             </tbody></table>
604             </body>
605             </html>
606              
607             Yes, C<< <i>World</i> >> gets hoisted up before the C<< <table> >>. This
608             is weird, I know, but it's how browsers do it in real life.
609              
610             So what should:
611              
612             $string = q{<b>Hello</b></td></tr> <i>World</i>};
613             $fragment = $parser->parse_balanced_chunk($string);
614              
615             actually return? Well, you can choose...
616              
617             $string = q{<b>Hello</b></td></tr> <i>World</i>};
618            
619             $frag1 = $parser->parse_balanced_chunk($string, {within=>'div'});
620             say $frag1->toString; # <b>Hello</b> <i>World</i>
621            
622             $frag2 = $parser->parse_balanced_chunk($string, {within=>'td'});
623             say $frag2->toString; # <i>World</i><b>Hello</b>
624              
625             If you don't pass a "within" option, then the chunk is parsed as if it
626             were within a C<< <div> >> element. This is often the most sensible
627             option. If you pass something like C<< { within => "foobar" } >>
628             where "foobar" is not a real HTML element name (as found in the HTML5
629             spec), then this method will croak; if you pass the name of a void
630             element (e.g. C<< "br" >> or C<< "meta" >>) then this method will
631             croak; there are a handful of other unsupported elements which will
632             croak (namely: C<< "noscript" >>, C<< "noembed" >>, C<< "noframes" >>).
633              
634             Note that the second time around, although we parsed the string "as
635             if it were within a C<< <td> >> element", the C<< <i>World</i> >>
636             bit did not strictly end up within the C<< <td> >> element (not
637             even within the C<< <table> >> element!) yet it still gets returned.
638             We'll call things such as this "outliers". There is a "force_within"
639             option which tells parse_balanced_chunk to ignore outliers:
640              
641             $frag3 = $parser->parse_balanced_chunk($string,
642             {force_within=>'td'});
643             say $frag3->toString; # <b>Hello</b>
644              
645             There is a boolean option "mark_outliers" which marks each outlier
646             with an attribute (C<< data-perl-html-html5-parser-outlier >>) to
647             indicate its outlier status. Clearly, this is ignored when you use
648             "force_within" because no outliers are returned. Some outliers may
649             be XML::LibXML::Text elements; text nodes don't have attributes, so
650             these will not be marked with an attribute.
651              
652             A last note is to mention what gets returned by this method. Normally
653             it's an L<XML::LibXML::DocumentFragment> object, but if you call the
654             method in list context, a list of the individual node elements is
655             returned. Alternatively you can request the data to be returned as an
656             L<XML::LibXML::NodeList> object:
657              
658             # Get an XML::LibXML::NodeList
659             my $list = $parser->parse_balanced_chunk($str, {as=>'list'});
660              
661             The exact implementation of this method may change from version to
662             version, but the long-term goal will be to approach how common
663             desktop browsers parse HTML fragments when implementing the setter
664             for DOM's C<innerHTML> attribute.
665              
666             =back
667              
668             The push parser and SAX-based parser are not supported. Trying
669             to change an option (such as recover_silently) will make
670             HTML::HTML5::Parser carp a warning. (But you can inspect the
671             options.)
672              
673             =head2 Error Handling
674              
675             Error handling is obviously different to XML::LibXML, as errors are
676             (bugs notwithstanding) non-fatal.
677              
678             =over
679              
680             =item C<error_handler>
681              
682             Get/set an error handling function. Must be set to a coderef or undef.
683              
684             The error handling function will be called with a single parameter, a
685             L<HTML::HTML5::Parser::Error> object.
686              
687             =item C<errors>
688              
689             Returns a list of errors that occurred during the last parse.
690              
691             See L<HTML::HTML5::Parser::Error>.
692              
693             =back
694              
695             =head2 Additional Methods
696              
697             The module provides a few methods to obtain additional, non-DOM data from
698             DOM nodes.
699              
700             =over
701              
702             =item C<dtd_public_id>
703              
704             $pubid = $parser->dtd_public_id( $doc );
705            
706             For an XML::LibXML::Document which has been returned by
707             HTML::HTML5::Parser, using this method will tell you the
708             Public Identifier of the DTD used (if any).
709              
710             =item C<dtd_system_id>
711              
712             $sysid = $parser->dtd_system_id( $doc );
713            
714             For an XML::LibXML::Document which has been returned by
715             HTML::HTML5::Parser, using this method will tell you the
716             System Identifier of the DTD used (if any).
717              
718             =item C<dtd_element>
719              
720             $element = $parser->dtd_element( $doc );
721              
722             For an XML::LibXML::Document which has been returned by
723             HTML::HTML5::Parser, using this method will tell you the
724             root element declared in the DTD used (if any). That is,
725             if the document has this doctype:
726              
727             <!doctype html>
728              
729             ... it will return "html".
730              
731             This may return the empty string if a DTD was present but
732             did not contain a root element; or undef if no DTD was
733             present.
734              
735             =item C<compat_mode>
736              
737             $mode = $parser->compat_mode( $doc );
738            
739             Returns 'quirks', 'limited quirks' or undef (standards mode).
740              
741             =item C<charset>
742              
743             $charset = $parser->charset( $doc );
744              
745             The character set apparently used by the document.
746              
747             =item C<source_line>
748              
749             ($line, $col) = $parser->source_line( $node );
750             $line = $parser->source_line( $node );
751              
752             In scalar context, C<source_line> returns the line number of the
753             source code that started a particular node (element, attribute or
754             comment).
755              
756             In list context, returns a tuple: $line, $column, $implicitness.
757             Tab characters count as one column, not eight.
758              
759             $implicitness indicates that the node was not explicitly marked
760             up in the source code, but its existence was inferred by the parser.
761             For example, in the following markup, the HTML, TITLE and P elements
762             are explicit, but the HEAD and BODY elements are implicit.
763              
764             <html>
765             <title>I have an implicit head</title>
766             <p>And an implicit body too!</p>
767             </html>
768              
769             (Note that implicit elements do still have a line number and column
770             number.) The implicitness indicator is a new feature, and I'd appreciate
771             any bug reports where it gets things wrong.
772              
773             L<XML::LibXML::Node> has a C<line_number> method. In general this
774             will always return 0 and HTML::HTML5::Parser has no way of influencing
775             it. However, if you install L<XML::LibXML::Devel::SetLineNumber> on
776             your system, the C<line_number> method will start working (at least for
777             elements).
778              
779             =back
780              
781             =head1 SEE ALSO
782              
783             L<http://suika.fam.cx/www/markup/html/whatpm/Whatpm/HTML.html>.
784              
785             L<HTML::HTML5::Writer>,
786             L<HTML::HTML5::Builder>,
787             L<XML::LibXML>,
788             L<XML::LibXML::PrettyPrint>,
789             L<XML::LibXML::Devel::SetLineNumber>.
790              
791             =head1 AUTHOR
792              
793             Toby Inkster, E<lt>tobyink@cpan.orgE<gt>
794              
795             =head1 COPYRIGHT AND LICENCE
796              
797             Copyright (C) 2007-2011 by Wakaba
798              
799             Copyright (C) 2009-2012 by Toby Inkster
800              
801             This library is free software; you can redistribute it and/or modify
802             it under the same terms as Perl itself.
803              
804             =head1 DISCLAIMER OF WARRANTIES
805              
806             THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
807             WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
808             MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
809