File Coverage

blib/lib/HTML/TreeBuilder.pm
Criterion Covered Total %
statement 401 553 72.5
branch 204 348 58.6
condition 124 255 48.6
subroutine 44 52 84.6
pod 38 38 100.0
total 811 1246 65.0


line stmt bran cond sub pod time code
1             package HTML::TreeBuilder;
2              
3             # ABSTRACT: Parser that builds a HTML syntax tree
4              
5 20     20   263963 use 5.008;
  20         67  
6 20     20   95 use warnings;
  20         39  
  20         489  
7 20     20   92 use strict;
  20         47  
  20         360  
8 20     20   7480 use integer; # vroom vroom!
  20         270  
  20         82  
9 20     20   474 use Carp ();
  20         41  
  20         299  
10 20     20   86 use Scalar::Util qw(openhandle);
  20         35  
  20         3106  
11              
12             our $VERSION = '5.910'; # TRIAL VERSION from OurPkgVersion
13              
14             #---------------------------------------------------------------------------
15             # Make a 'DEBUG' constant...
16              
17             our $DEBUG; # Must be set BEFORE loading this file
18             BEGIN {
19              
20             # We used to have things like
21             # print $indent, "lalala" if $Debug;
22             # But there were an awful lot of having to evaluate $Debug's value.
23             # If we make that depend on a constant, like so:
24             # sub DEBUG () { 1 } # or whatever value.
25             # ...
26             # print $indent, "lalala" if DEBUG;
27             # Which at compile-time (thru the miracle of constant folding) turns into:
28             # print $indent, "lalala";
29             # or, if DEBUG is a constant with a true value, then that print statement
30             # is simply optimized away, and doesn't appear in the target code at all.
31             # If you don't believe me, run:
32             # perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \
33             # $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder'
34             # and see for yourself (substituting whatever value you want for $DEBUG
35             # there).
36             ## no critic
37 20 50   20   232 if ( defined &DEBUG ) {
    50          
    50          
    0          
38              
39             # Already been defined! Do nothing.
40             }
41             elsif ( $] < 5.00404 ) {
42              
43             # Grudgingly accommodate ancient (pre-constant) versions.
44 0         0 eval 'sub DEBUG { $Debug } ';
45             }
46             elsif ( !$DEBUG ) {
47 20         979 eval 'sub DEBUG () {0}'; # Make it a constant.
48             }
49             elsif ( $DEBUG =~ m<^\d+$>s ) {
50 0         0 eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant.
51             }
52             else { # WTF?
53 0         0 warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG";
54 0         0 eval 'sub DEBUG () { $DEBUG }'; # I guess.
55             }
56             ## use critic
57             }
58              
59             #---------------------------------------------------------------------------
60              
61 20     20   7624 use HTML::Entities ();
  20         97165  
  20         671  
62 20     20   7579 use HTML::Tagset 3.02 ();
  20         20179  
  20         491  
63              
64 20     20   13364 use HTML::Element ();
  20         53  
  20         545  
65 20     20   113 use HTML::Parser 3.46 ();
  20         393  
  20         23659  
66             our @ISA = qw(HTML::Element HTML::Parser);
67              
68             # This looks schizoid, I know.
69             # It's not that we ARE an element AND a parser.
70             # We ARE an element, but one that knows how to handle signals
71             # (method calls) from Parser in order to elaborate its subtree.
72              
73             # Legacy aliases:
74             *HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown;
75             *HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten;
76             *HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement;
77             *HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement;
78             *HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup;
79             *HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement;
80             *HTML::TreeBuilder::isList = \%HTML::Tagset::isList;
81             *HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement;
82             *HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement;
83             *HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers;
84              
85             #==========================================================================
86             # Two little shortcut constructors:
87              
88             sub new_from_file { # or from a FH
89 17     17 1 11852 my $class = shift;
90 17 50       66 Carp::croak("new_from_file takes an odd number of arguments")
91             unless @_ % 2;
92 17 50       48 Carp::croak("new_from_file is a class method only")
93             if ref $class;
94 17         36 my $file = shift;
95 17         56 my $new = $class->new(@_);
96 17 100       48 defined $new->parse_file( $file )
97             or Carp::croak("unable to parse file: $!");
98 16         55 return $new;
99             }
100              
101             sub new_from_content { # from any number of scalars
102 8     8 1 1664 my $class = shift;
103 8 50       34 Carp::croak("new_from_content is a class method only")
104             if ref $class;
105 8         31 my $new = $class->new();
106 8         25 foreach my $whunk (@_) {
107 9 100       40 if ( ref($whunk) eq 'SCALAR' ) {
108 2         18 $new->parse($$whunk);
109             }
110             else {
111 7         77 $new->parse($whunk);
112             }
113 9 50       37 last if $new->{'_stunted'}; # might as well check that.
114             }
115 8         30 $new->eof();
116 8         29 return $new;
117             }
118              
119             sub new_from_string { # from a single scalar (plus options)
120 16     16 1 7050 my $class = shift;
121 16 50       58 Carp::croak("new_from_string takes an odd number of arguments")
122             unless @_ % 2;
123 16 50       46 Carp::croak("new_from_string is a class method only")
124             if ref $class;
125 16         32 my $string = shift;
126 16         52 my $new = $class->new(@_);
127 16         49 $new->parse_content($string);
128 16         39 return $new;
129             }
130              
131             sub new_from_url { # should accept anything that LWP does.
132 4     4 1 14 undef our $lwp_response;
133 4         9 my $class = shift;
134 4 50       19 Carp::croak("new_from_url takes an odd number of arguments")
135             unless @_ % 2;
136 4 50       13 Carp::croak("new_from_url is a class method only")
137             if ref $class;
138 4         9 my $url = shift;
139              
140 4         27 require LWP::UserAgent;
141             # RECOMMEND PREREQ: LWP::UserAgent 5.815
142 4         62 LWP::UserAgent->VERSION( 5.815 ); # HTTP::Headers content_is_html method
143 4         24 $lwp_response = LWP::UserAgent->new->get( $url );
144              
145 4 100       32475 Carp::croak("GET failed on $url: " . $lwp_response->status_line)
146             unless $lwp_response->is_success;
147 3 100       37 Carp::croak("$url returned " . $lwp_response->content_type . " not HTML")
148             unless $lwp_response->content_is_html;
149              
150 2         70 my $new = $class->new_from_http($lwp_response, @_);
151              
152 2         26 undef $lwp_response; # Processed successfully
153 2         9 return $new;
154             }
155              
156             sub new_from_http { # from a HTTP::Message (or subclass)
157 3     3 1 6 my $class = shift;
158 3 50       11 Carp::croak("new_from_http takes an odd number of arguments")
159             unless @_ % 2;
160 3 50       9 Carp::croak("new_from_http is a class method only")
161             if ref $class;
162 3         7 my $message = shift;
163 3         12 my $new = $class->new(@_);
164              
165 3         5 my $cref;
166              
167 3         8 my %opt = @_;
168 3 50       10 if (defined $opt{encoding}) {
169             # User-specified charset:
170 0   0     0 my $charset = ($opt{encoding} || 'none');
171 0         0 $charset =~ s/:BOM\z//;
172 0         0 $cref = $message->decoded_content(ref => 1, charset => $charset);
173             } else {
174             # Auto-detect charset:
175 3   50     14 my $charset = $message->content_charset || 'cp1252';
176 3         2871 $cref = $message->decoded_content(ref => 1, charset => $charset);
177 3 50       336 if ($charset eq 'none') {
178 0         0 $charset = '';
179             } else {
180 3         18 require Encode;
181 3 50       11 if (my $encoding = Encode::find_encoding($charset)) {
182 3         60 $charset = $encoding->name; # canonical name
183 3 50       13 $charset .= ':BOM' if $$cref =~ /^\x{FeFF}/;
184             } else {
185 0         0 undef $charset; # Encode doesn't recognize it
186             }
187             }
188 3         8 $new->{_encoding} = $charset;
189             } # end else auto-detect charset
190              
191 3         24 $new->parse( $$cref );
192 3         13 $new->eof;
193 3         11 return $new;
194             }
195              
196             # TODO: document more fully?
197             sub parse_content { # from any number of scalars
198 20     20 1 53 my $tree = shift;
199 20         36 my $retval;
200 20         46 foreach my $whunk (@_) {
201 21 100       58 if ( ref($whunk) eq 'SCALAR' ) {
202 3         24 $retval = $tree->parse($$whunk);
203             }
204             else {
205 18         176 $retval = $tree->parse($whunk);
206             }
207 21 50       73 last if $tree->{'_stunted'}; # might as well check that.
208             }
209 20         69 $tree->eof();
210 20         46 return $retval;
211             }
212              
213             #---------------------------------------------------------------------
214             sub parse_file {
215 20     20 1 55 my ($self, $file) = @_;
216              
217 20 50       51 Carp::croak("parse_file requires file argument") unless defined $file;
218              
219 20         58 my $fh = openhandle($file);
220 20 100       52 unless (defined $fh) {
221 17         35 my $encoding = $self->{_encoding};
222              
223 17 100       47 if (not defined $encoding) {
224 11         2151 require IO::HTML;
225              
226 11         28822 { local $@;
  11         20  
227 11         21 eval {
228 11         41 ($fh, $encoding, my $bom) =
229             IO::HTML::file_and_encoding($file);
230 10 100       5113 $encoding .= ':BOM' if $bom;
231             };
232             } # end local $@
233 11         311 $self->{_encoding} = $encoding;
234             } # end if auto encoding
235             else {
236 6         13 $encoding =~ s/:BOM$//;
237 6 100       168 open($fh, (length($encoding) ? "<:encoding($encoding):crlf"
    50          
238             : "<:raw"), $file)
239             or undef $fh;
240             }
241              
242 17 100       857 return undef unless defined $fh;
243             } # end unless filehandle was passed in
244              
245 19         107 $self->SUPER::parse_file($fh);
246             }
247              
248             #---------------------------------------------------------------------------
249              
250             sub new { # constructor!
251 379     379 1 517955 my $class = shift;
252 379   33     12870 $class = ref($class) || $class;
253              
254             # Initialize HTML::Element part
255 379         12157 my $self = $class->element_class->new('html');
256              
257             {
258              
259             # A hack for certain strange versions of Parser:
260 379         11759 my $other_self = HTML::Parser->new();
  379         12495  
261 379         264707 %$self = ( %$self, %$other_self ); # copy fields
262             # Yes, multiple inheritance is messy. Kids, don't try this at home.
263 379         23807 bless $other_self, "HTML::TreeBuilder::_hideyhole";
264              
265             # whack it out of the HTML::Parser class, to avoid the destructor
266             }
267              
268             # The root of the tree is special, as it has these funny attributes,
269             # and gets reblessed into this class.
270              
271             # Initialize parser settings
272 379         11920 $self->{'_implicit_tags'} = 1;
273 379         11727 $self->{'_implicit_body_p_tag'} = 0;
274              
275             # If true, trying to insert text, or any of %isPhraseMarkup right
276             # under 'body' will implicate a 'p'. If false, will just go there.
277              
278 379         11821 $self->{'_tighten'} = 1;
279              
280             # whether ignorable WS in this tree should be deleted
281              
282 379         11985 $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag
283              
284 379         11926 $self->{'_ignore_unknown'} = 1;
285 379         11820 $self->{'_ignore_text'} = 0;
286 379         11712 $self->{'_warn'} = 0;
287 379         11681 $self->{'_no_space_compacting'} = 0;
288 379         11693 $self->{'_self_closed_tags'} = 0;
289 379         11627 $self->{'_store_comments'} = 0;
290 379         11649 $self->{'_store_declarations'} = 1;
291 379         11970 $self->{'_store_pis'} = 0;
292 379         11718 $self->{'_p_strict'} = 0;
293 379         11598 $HTML::Element::encoded_content = $self->{'_no_expand_entities'} = 0;
294              
295             # rebless to our class
296 379         11706 bless $self, $class;
297              
298             # Parse attributes passed in as arguments
299 379 100       12077 if (@_) {
300 17 50       53 Carp::croak("new must be passed key => value pairs") if @_ % 2;
301              
302 17         55 my %attr = @_;
303 17         80 my $allowed = $self->_is_attr;
304              
305 17         71 while (my ($attr, $value) = each %attr ) {
306 28 50       77 if ($allowed->{$attr}) {
307 28         496 $self->$attr($value);
308             } else {
309 0         0 Carp::carp("Ignoring unknown attribute $attr");
310             }
311             } # end while each $attr
312             } # end if attributes passed to new
313              
314 379         12032 $self->{'_element_count'} = 1;
315              
316             # undocumented, informal, and maybe not exactly correct
317              
318 379         12320 $self->{'_head'} = $self->insert_element( 'head', 1 );
319 379         11732 $self->{'_pos'} = undef; # pull it back up
320 379         12008 $self->{'_body'} = $self->insert_element( 'body', 1 );
321 379         11711 $self->{'_pos'} = undef; # pull it back up again
322              
323 379         22980 return $self;
324             }
325              
326             #==========================================================================
327              
328             sub _elem # universal accessor...
329             {
330 68     68   5544 my ( $self, $elem, $val ) = @_;
331 68         5564 my $old = $self->{$elem};
332 68 50       5619 $self->{$elem} = $val if defined $val;
333 68         16347 return $old;
334             }
335              
336 0         0 BEGIN {
337 20     20   114 my @attributes = qw(
338             implicit_tags
339             implicit_body_p_tag
340             p_strict
341             no_space_compacting
342             ignore_unknown
343             ignore_text
344             self_closed_tags
345             store_comments
346             store_declarations
347             store_pis
348             warn
349             );
350              
351             # Create accessor methods:
352 20         47 my $code = join('', map { "sub $_ { shift->_elem( '_$_', \@_ ); }\n" }
  220         623  
353             @attributes);
354 20         52 my $err;
355             {
356 20         33 local $@;
  20         38  
357 20 50 0 2 1 4202 $err = $@ || "UNKNOWN ERROR" unless eval "$code 1"; ## no critic
  2     2 1 38  
  2     36 1 8  
  36     11 1 60581  
  11     5 1 244  
  5     0 1 19  
  0     1 1 0  
  1     1 1 6  
  1     1 1 47  
  1     0 1 411  
  0     2 1 0  
  2         8  
358             }
359 20 50       68 die "$code$err" if $err;
360              
361             # Record names of class attributes:
362 20         49 my %is_attr = map { $_ => 1 } (@attributes, qw(
  280         85297  
363             encoding
364             ignore_ignorable_whitespace
365             no_expand_entities
366             ));
367              
368 17     17   39 sub _is_attr { return \%is_attr }
369             }
370              
371             # Custom accessors:
372             sub ignore_ignorable_whitespace {
373 5     5 1 13 shift->_elem( '_tighten', @_ ); # internal name is different
374             }
375              
376             sub no_expand_entities {
377 2     2 1 6 my $self = shift;
378 2         8 my $return = $self->_elem( '_no_expand_entities', @_ );
379 2         5 $HTML::Element::encoded_content = $self->{_no_expand_entities};
380 2         9 $return;
381             }
382              
383             #==========================================================================
384              
385             sub warning {
386 3     3 1 6 my $self = shift;
387 3 50       12 CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'};
388              
389             # should maybe say HTML::TreeBuilder instead
390             }
391              
392             #==========================================================================
393              
394             {
395              
396             # To avoid having to rebuild these lists constantly...
397             my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)];
398             my $indent;
399              
400             sub start {
401 1252 50   1252 1 16007 return if $_[0]{'_stunted'};
402              
403             # Accept a signal from HTML::Parser for start-tags.
404 1252         6879 my ( $self, $tag, $attr ) = @_;
405              
406 1252   100     7447 my $self_closed = ($self->{'_self_closed_tags'} and
407             $_[4] =~ m!/[\n\r\f\t ]*>\z!);
408 1252 100       7001 delete $attr->{'/'} if $self_closed;
409              
410             # Parser passes more, actually:
411             # $self->start($tag, $attr, $attrseq, $origtext)
412             # But we can merrily ignore $attrseq and $origtext.
413              
414 1252 50       7321 if ( $tag eq 'x-html' ) {
415 0         0 print "Ignoring open-x-html tag.\n" if DEBUG;
416              
417             # inserted by some lame code-generators.
418 0         0 return; # bypass tweaking.
419             }
420              
421 1252         6559 $tag =~ s{/$}{}s; # So turns into . Silently forgive.
422              
423 1252 50       8661 unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
424 0         0 DEBUG and print "Start-tag name $tag is no good. Skipping.\n";
425 0         0 return;
426              
427             # This avoids having Element's new() throw an exception.
428             }
429              
430 1252   66     8260 my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'};
431 1252         6004 my $already_inserted;
432              
433             #my($indent);
434 1252         5918 if (DEBUG) {
435              
436             # optimization -- don't figure out indenting unless we're in debug mode
437             my @lineage = $pos->lineage;
438             $indent = ' ' x ( 1 + @lineage );
439             print $indent, "Proposing a new \U$tag\E under ",
440             join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) )
441             || 'Root',
442             ".\n";
443              
444             #} else {
445             # $indent = ' ';
446             }
447              
448             #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2;
449             # $attr = {%$attr};
450              
451 1252         7708 foreach my $k ( keys %$attr ) {
452              
453             # Make sure some stooge doesn't have "".
454             # That happens every few million Web pages.
455 597 50 33     2815 $attr->{ ' ' . $k } = delete $attr->{$k}
456             if length $k and substr( $k, 0, 1 ) eq '_';
457              
458             # Looks bad, but is fine for round-tripping.
459             }
460              
461 1252         6969 my $e = $self->element_class->new( $tag, %$attr );
462              
463             # Make a new element object.
464             # (Only rarely do we end up just throwing it away later in this call.)
465              
466             # Some prep -- custom messiness for those damned tables, and strict P's.
467 1252 100       7805 if ( $self->{'_implicit_tags'} ) { # wallawallawalla!
468              
469 1214 100       7238 unless ( $HTML::TreeBuilder::isTableElement{$tag} ) {
470 1036 50       7126 if ( $ptag eq 'table' ) {
    50          
471 0         0 print $indent,
472             " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n"
473             if DEBUG > 1;
474 0         0 $self->insert_element( 'tr', 1 );
475 0         0 $pos = $self->insert_element( 'td', 1 )
476             ; # yes, needs updating
477             }
478             elsif ( $ptag eq 'tr' ) {
479 0         0 print $indent,
480             " * Phrasal \U$tag\E right under TR makes an implicit TD\n"
481             if DEBUG > 1;
482 0         0 $pos = $self->insert_element( 'td', 1 )
483             ; # yes, needs updating
484             }
485 1036         10209 $ptag = $pos->{'_tag'}; # yes, needs updating
486             }
487              
488             # end of table-implication block.
489              
490             # Now maybe do a little dance to enforce P-strictness.
491             # This seems like it should be integrated with the big
492             # "ALL HOPE..." block, further below, but that doesn't
493             # seem feasible.
494 1214 0 33     11250 if ( $self->{'_p_strict'}
      0        
495             and $HTML::TreeBuilder::isKnown{$tag}
496             and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} )
497             {
498 0         0 my $here = $pos;
499 0         0 my $here_tag = $ptag;
500 0         0 while (1) {
501 0 0       0 if ( $here_tag eq 'p' ) {
502 0         0 print $indent, " * Inserting $tag closes strict P.\n"
503             if DEBUG > 1;
504 0         0 $self->end( \q{p} );
505              
506             # NB: same as \'q', but less confusing to emacs cperl-mode
507 0         0 last;
508             }
509              
510             #print("Lasting from $here_tag\n"),
511             last
512             if $HTML::TreeBuilder::isKnown{$here_tag}
513             and
514             not $HTML::Tagset::is_Possible_Strict_P_Content{
515 0 0 0     0 $here_tag};
516              
517             # Don't keep looking up the tree if we see something that can't
518             # be strict-P content.
519              
520             $here_tag
521 0   0     0 = ( $here = $here->{'_parent'} || last )->{'_tag'};
522             } # end while
523             $ptag = ( $pos = $self->{'_pos'} || $self )
524 0   0     0 ->{'_tag'}; # better update!
525             }
526              
527             # end of strict-p block.
528             }
529              
530             # And now, get busy...
531             #----------------------------------------------------------------------
532 1252 100       9073 if ( !$self->{'_implicit_tags'} ) { # bimskalabim
    100          
    100          
    100          
    100          
    100          
    50          
    0          
    0          
    0          
533             # do nothing
534 38         59 print $indent, " * _implicit_tags is off. doing nothing\n"
535             if DEBUG > 1;
536              
537             #----------------------------------------------------------------------
538             }
539             elsif ( $HTML::TreeBuilder::isHeadOrBodyElement{$tag} ) {
540 2 100       10 if ( $pos->is_inside('body') ) { # all is well
    50          
541 1         2 print $indent,
542             " * ambilocal element \U$tag\E is fine under BODY.\n"
543             if DEBUG > 1;
544             }
545             elsif ( $pos->is_inside('head') ) {
546 1         2 print $indent,
547             " * ambilocal element \U$tag\E is fine under HEAD.\n"
548             if DEBUG > 1;
549             }
550             else {
551              
552             # In neither head nor body! mmmmm... put under head?
553              
554 0 0       0 if ( $ptag eq 'html' ) { # expected case
555             # TODO?? : would there ever be a case where _head would be
556             # absent from a tree that would ever be accessed at this
557             # point?
558 0 0       0 die "Where'd my head go?" unless ref $self->{'_head'};
559 0 0       0 if ( $self->{'_head'}{'_implicit'} ) {
560 0         0 print $indent,
561             " * ambilocal element \U$tag\E makes an implicit HEAD.\n"
562             if DEBUG > 1;
563              
564             # or rather, points us at it.
565             $self->{'_pos'}
566 0         0 = $self->{'_head'}; # to insert under...
567             }
568             else {
569 0         0 $self->warning(
570             "Ambilocal element <$tag> not under HEAD or BODY!?"
571             );
572              
573             # Put it under HEAD by default, I guess
574             $self->{'_pos'}
575 0         0 = $self->{'_head'}; # to insert under...
576             }
577              
578             }
579             else {
580              
581             # Neither under head nor body, nor right under html... pass thru?
582 0         0 $self->warning(
583             "Ambilocal element <$tag> neither under head nor body, nor right under html!?"
584             );
585             }
586             }
587              
588             #----------------------------------------------------------------------
589             }
590             elsif ( $HTML::TreeBuilder::isBodyElement{$tag} ) {
591              
592             # Ensure that we are within
593 943 100 66     7840 if ( $ptag eq 'body' ) {
    100          
    100          
    50          
594              
595             # We're good.
596             }
597             elsif (
598             $HTML::TreeBuilder::isBodyElement{$ptag} # glarg
599             and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag}
600             )
601             {
602              
603             # Special case: Save ourselves a call to is_inside further down.
604             # If our $ptag is an isBodyElement element (but not an
605             # isHeadOrBodyElement element), then we must be under body!
606 258         385 print $indent, " * Inferring that $ptag is under BODY.\n",
607             if DEBUG > 3;
608              
609             # I think this and the test for 'body' trap everything
610             # bodyworthy, except the case where the parent element is
611             # under an unknown element that's a descendant of body.
612             }
613             elsif ( $pos->is_inside('head') ) {
614 169         349 print $indent,
615             " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n"
616             if DEBUG > 1;
617             $ptag = (
618             $pos = $self->{'_pos'}
619             = $self->{'_body'} # yes, needs updating
620             || die "Where'd my body go?"
621 169   50     593 )->{'_tag'}; # yes, needs updating
622             }
623             elsif ( !$pos->is_inside('body') ) {
624 104         4336 print $indent,
625             " * body-element \U$tag\E makes implicit BODY.\n"
626             if DEBUG > 1;
627             $ptag = (
628             $pos = $self->{'_pos'}
629             = $self->{'_body'} # yes, needs updating
630             || die "Where'd my body go?"
631 104   50     8681 )->{'_tag'}; # yes, needs updating
632             }
633              
634             # else we ARE under body, so okay.
635              
636             # Handle implicit endings and insert based on and position
637             # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ...
638 943 100 100     20273 if ( $tag eq 'p'
    100 100        
    100 100        
    50 66        
    100 66        
    100 33        
    100 66        
      66        
      66        
      33        
639             or $tag eq 'h1'
640             or $tag eq 'h2'
641             or $tag eq 'h3'
642             or $tag eq 'h4'
643             or $tag eq 'h5'
644             or $tag eq 'h6'
645             or $tag eq 'form'
646              
647             # Hm, should
really be here?!
648             )
649             {
650              
651             # Can't have

, or inside these

652 307         914 $self->end(
653             $_Closed_by_structurals,
654             @HTML::TreeBuilder::p_closure_barriers
655              
656             # used to be just li!
657             );
658              
659             }
660             elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) {
661              
662             # Can't have lists inside -- in the unlikely
663             # event anyone tries to put them there!
664 9 50 33     114 if ( $ptag eq 'h1'
      33        
      33        
      33        
      33        
665             or $ptag eq 'h2'
666             or $ptag eq 'h3'
667             or $ptag eq 'h4'
668             or $ptag eq 'h5'
669             or $ptag eq 'h6' )
670             {
671 0         0 $self->end( \$ptag );
672             }
673              
674             # TODO: Maybe keep closing up the tree until
675             # the ptag isn't any of the above?
676             # But anyone that says

    ...

677             # deserves what they get anyway.
678              
679             }
680             elsif ( $tag eq 'li' ) { # list item
681             # Get under a list tag, one way or another
682 12 50 66     47 unless (
683             exists $HTML::TreeBuilder::isList{$ptag}
684             or $self->end( \q{*}, keys %HTML::TreeBuilder::isList ) #'
685             )
686             {
687 0         0 print $indent,
688             " * inserting implicit UL for lack of containing ",
689             join( '|', keys %HTML::TreeBuilder::isList ), ".\n"
690             if DEBUG > 1;
691 0         0 $self->insert_element( 'ul', 1 );
692             }
693              
694             }
695             elsif ( $tag eq 'dt' or $tag eq 'dd' ) {
696              
697             # Get under a DL, one way or another
698 0 0 0     0 unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #'
699 0         0 print $indent,
700             " * inserting implicit DL for lack of containing DL.\n"
701             if DEBUG > 1;
702 0         0 $self->insert_element( 'dl', 1 );
703             }
704              
705             }
706             elsif ( $HTML::TreeBuilder::isFormElement{$tag} ) {
707 1 50 33     5 if ($self->{
708             '_ignore_formies_outside_form'} # TODO: document this
709             and not $pos->is_inside('form')
710             )
711             {
712 0         0 print $indent,
713             " * ignoring \U$tag\E because not in a FORM.\n"
714             if DEBUG > 1;
715 0         0 return; # bypass tweaking.
716             }
717 1 50       4 if ( $tag eq 'option' ) {
718              
719             # return unless $ptag eq 'select';
720 0         0 $self->end( \q{option} );
721 0   0     0 $ptag = ( $self->{'_pos'} || $self )->{'_tag'};
722 0 0 0     0 unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) {
723 0         0 print $indent,
724             " * \U$tag\E makes an implicit SELECT.\n"
725             if DEBUG > 1;
726 0         0 $pos = $self->insert_element( 'select', 1 );
727              
728             # but not a very useful select -- has no 'name' attribute!
729             # is $pos's value used after this?
730             }
731             }
732             }
733             elsif ( $HTML::TreeBuilder::isTableElement{$tag} ) {
734 178 100       473 if ( !$pos->is_inside('table') ) {
735 3         6 print $indent, " * \U$tag\E makes an implicit TABLE\n"
736             if DEBUG > 1;
737 3         9 $self->insert_element( 'table', 1 );
738             }
739              
740 178 100 100     606 if ( $tag eq 'td' or $tag eq 'th' ) {
741              
742             # Get under a tr one way or another
743 111 100 100     330 unless (
744             $ptag eq 'tr' # either under a tr
745             or $self->end( \q{*}, 'tr',
746             'table' ) #or we can get under one
747             )
748             {
749 2         5 print $indent,
750             " * \U$tag\E under \U$ptag\E makes an implicit TR\n"
751             if DEBUG > 1;
752 2         7 $self->insert_element( 'tr', 1 );
753              
754             # presumably pos's value isn't used after this.
755             }
756             }
757             else {
758 67         185 $self->end( \$tag, 'table' ); #'
759             }
760              
761             # Hmm, I guess this is right. To work it out:
762             # tr closes any open tr (limited at a table)
763             # thead closes any open thead (limited at a table)
764             # tbody closes any open tbody (limited at a table)
765             # tfoot closes any open tfoot (limited at a table)
766             # colgroup closes any open colgroup (limited at a table)
767             # col can try, but will always fail, at the enclosing table,
768             # as col is empty, and therefore never open!
769             # But!
770             # td closes any open td OR th (limited at a table)
771             # th closes any open th OR td (limited at a table)
772             # ...implementable as "close to a tr, or make a tr"
773              
774             }
775             elsif ( $HTML::TreeBuilder::isPhraseMarkup{$tag} ) {
776 387 100 66     7335 if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) {
777 9         1369 print
778             " * Phrasal \U$tag\E right under BODY makes an implicit P\n"
779             if DEBUG > 1;
780 9         1371 $pos = $self->insert_element( 'p', 1 );
781              
782             # is $pos's value used after this?
783             }
784             }
785              
786             # End of implicit endings logic
787              
788             # End of "elsif ($HTML::TreeBuilder::isBodyElement{$tag}"
789             #----------------------------------------------------------------------
790              
791             }
792             elsif ( $HTML::TreeBuilder::isHeadElement{$tag} ) {
793 204 100       585 if ( $pos->is_inside('body') ) {
    100          
794 3         6 print $indent, " * head element \U$tag\E found inside BODY!\n"
795             if DEBUG;
796 3         16 $self->warning("Header element <$tag> in body"); # [sic]
797             }
798             elsif ( !$pos->is_inside('head') ) {
799 167         300 print $indent,
800             " * head element \U$tag\E makes an implicit HEAD.\n"
801             if DEBUG > 1;
802             }
803             else {
804 34         62 print $indent,
805             " * head element \U$tag\E goes inside existing HEAD.\n"
806             if DEBUG > 1;
807             }
808 204   50     648 $self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?";
809              
810             #----------------------------------------------------------------------
811             }
812             elsif ( $tag eq 'html' ) {
813 21 50       61 if ( delete $self->{'_implicit'} ) { # first time here
814 21         33 print $indent, " * good! found the real HTML element!\n"
815             if DEBUG > 1;
816             }
817             else {
818 0         0 print $indent, " * Found a second HTML element\n"
819             if DEBUG;
820 0         0 $self->warning("Found a nested element");
821             }
822              
823             # in either case, migrate attributes to the real element
824 21         56 for ( keys %$attr ) {
825 6         27 $self->attr( $_, $attr->{$_} );
826             }
827 21         39 $self->{'_pos'} = undef;
828 21         160 return $self; # bypass tweaking.
829              
830             #----------------------------------------------------------------------
831             }
832             elsif ( $tag eq 'head' ) {
833 22   50     67 my $head = $self->{'_head'} || die "Where'd my head go?";
834 22 50       65 if ( delete $head->{'_implicit'} ) { # first time here
835 22         38 print $indent, " * good! found the real HEAD element!\n"
836             if DEBUG > 1;
837             }
838             else { # been here before
839 0         0 print $indent, " * Found a second HEAD element\n"
840             if DEBUG;
841 0         0 $self->warning("Found a second element");
842             }
843              
844             # in either case, migrate attributes to the real element
845 22         60 for ( keys %$attr ) {
846 0         0 $head->attr( $_, $attr->{$_} );
847             }
848 22         138 return $self->{'_pos'} = $head; # bypass tweaking.
849              
850             #----------------------------------------------------------------------
851             }
852             elsif ( $tag eq 'body' ) {
853 22   50     77 my $body = $self->{'_body'} || die "Where'd my body go?";
854 22 50       66 if ( delete $body->{'_implicit'} ) { # first time here
855 22         37 print $indent, " * good! found the real BODY element!\n"
856             if DEBUG > 1;
857             }
858             else { # been here before
859 0         0 print $indent, " * Found a second BODY element\n"
860             if DEBUG;
861 0         0 $self->warning("Found a second element");
862             }
863              
864             # in either case, migrate attributes to the real element
865 22         62 for ( keys %$attr ) {
866 0         0 $body->attr( $_, $attr->{$_} );
867             }
868 22         164 return $self->{'_pos'} = $body; # bypass tweaking.
869              
870             #----------------------------------------------------------------------
871             }
872             elsif ( $tag eq 'frameset' ) {
873 0 0 0     0 if (!( $self->{'_frameset_seen'}++ ) # first frameset seen
      0        
874             and !$self->{'_noframes_seen'}
875              
876             # otherwise it'll be under the noframes already
877             and !$self->is_inside('body')
878             )
879             {
880              
881             # The following is a bit of a hack. We don't use the normal
882             # insert_element because 1) we don't want it as _pos, but instead
883             # right under $self, and 2), more importantly, that we don't want
884             # this inserted at the /end/ of $self's content_list, but instead
885             # in the middle of it, specifically right before the body element.
886             #
887 0   0     0 my $c = $self->{'_content'} || die "Contentless root?";
888 0   0     0 my $body = $self->{'_body'} || die "Where'd my BODY go?";
889 0         0 for ( my $i = 0; $i < @$c; ++$i ) {
890 0 0       0 if ( $c->[$i] eq $body ) {
891 0         0 splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e );
892 0         0 HTML::Element::_weaken($e->{'_parent'} = $self);
893 0         0 $already_inserted = 1;
894 0         0 print $indent,
895             " * inserting 'frameset' right before BODY.\n"
896             if DEBUG > 1;
897 0         0 last;
898             }
899             }
900 0 0       0 die "BODY not found in children of root?"
901             unless $already_inserted;
902             }
903              
904             }
905             elsif ( $tag eq 'frame' ) {
906              
907             # Okay, fine, pass thru.
908             # Should probably enforce that these should be under a frameset.
909             # But hey. Ditto for enforcing that 'noframes' should be under
910             # a 'frameset', as the DTDs say.
911              
912             }
913             elsif ( $tag eq 'noframes' ) {
914              
915             # This basically assumes there'll be exactly one 'noframes' element
916             # per document. At least, only the first one gets to have the
917             # body under it. And if there are no noframes elements, then
918             # the body pretty much stays where it is. Is that ever a problem?
919 0 0       0 if ( $self->{'_noframes_seen'}++ ) {
920 0         0 print $indent, " * ANOTHER noframes element?\n" if DEBUG;
921             }
922             else {
923 0 0       0 if ( $pos->is_inside('body') ) {
924 0         0 print $indent, " * 'noframes' inside 'body'. Odd!\n"
925             if DEBUG;
926              
927             # In that odd case, we /can't/ make body a child of 'noframes',
928             # because it's an ancestor of the 'noframes'!
929             }
930             else {
931 0   0     0 $e->push_content( $self->{'_body'}
932             || die "Where'd my body go?" );
933 0         0 print $indent, " * Moving body to be under noframes.\n"
934             if DEBUG;
935             }
936             }
937              
938             #----------------------------------------------------------------------
939             }
940             else {
941              
942             # unknown tag
943 0 0       0 if ( $self->{'_ignore_unknown'} ) {
944 0         0 print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG;
945 0         0 $self->warning("Skipping unknown tag $tag");
946 0         0 return;
947             }
948             else {
949 0         0 print $indent, " * Accepting unknown tag \U$tag\E\n"
950             if DEBUG;
951             }
952             }
953              
954             #----------------------------------------------------------------------
955             # End of mumbo-jumbo
956              
957             print $indent, "(Attaching ", $e->{'_tag'}, " under ",
958 1187         5905 ( $self->{'_pos'} || $self )->{'_tag'}, ")\n"
959              
960             # because if _pos isn't defined, it goes under self
961             if DEBUG;
962              
963             # The following if-clause is to delete /some/ ignorable whitespace
964             # nodes, as we're making the tree.
965             # This'd be a node we'd catch later anyway, but we might as well
966             # nip it in the bud now.
967             # This doesn't catch /all/ deletable WS-nodes, so we do have to call
968             # the tightener later to catch the rest.
969              
970 1187 100 66     9127 if ( $self->{'_tighten'} and !$self->{'_ignore_text'} )
971             { # if tightenable
972 1178         6092 my ( $sibs, $par );
973 1178 100 66     18215 if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} )
      66        
      100        
      100        
      66        
      66        
      66        
974             and @$sibs # parent already has content
975             and !
976             ref( $sibs->[-1] ) # and the last one there is a text node
977             and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace
978              
979             and ( # one of these has to be eligible...
980             $HTML::TreeBuilder::canTighten{$tag}
981             or (( @$sibs == 1 )
982             ? # WS is leftmost -- so parent matters
983             $HTML::TreeBuilder::canTighten{ $par->{'_tag'} }
984             : # WS is after another node -- it matters
985             ( ref $sibs->[-2]
986             and
987             $HTML::TreeBuilder::canTighten{ $sibs->[-2]
988             {'_tag'} }
989             )
990             )
991             )
992              
993             and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' )
994              
995             # we're clear
996             )
997             {
998 131         224 pop @$sibs;
999 131         216 print $indent, "Popping a preceding all-WS node\n" if DEBUG;
1000             }
1001             }
1002              
1003 1187 50       6967 unless ($already_inserted) {
1004 1187 100       6564 if ($self_closed) { $self->pos->push_content($e) }
  8         24  
1005 1179         7228 else { $self->insert_element($e) }
1006             }
1007              
1008 1187         6183 if (DEBUG) {
1009             if ( $self->{'_pos'} ) {
1010             print $indent, "(Current lineage of pos: \U$tag\E under ",
1011             join(
1012             '/',
1013             reverse(
1014              
1015             # $self->{'_pos'}{'_tag'}, # don't list myself!
1016             $self->{'_pos'}->lineage_tag_names
1017             )
1018             ),
1019             ".)\n";
1020             }
1021             else {
1022             print $indent, "(Pos points nowhere!?)\n";
1023             }
1024             }
1025              
1026 1187 100 50     8452 unless ( ( $self->{'_pos'} || '' ) eq $e ) {
1027              
1028             # if it's an empty element -- i.e., if it didn't change the _pos
1029 51         1548 &{ $self->{"_tweak_$tag"}
1030 51 50 33     3219 || $self->{'_tweak_*'}
1031             || return $e }( map $_, $e, $tag, $self )
1032             ; # make a list so the user can't clobber
1033             }
1034              
1035 1136         12770 return $e;
1036             }
1037             }
1038              
1039             #==========================================================================
1040              
1041             {
1042             my $indent;
1043              
1044             sub end {
1045 1714 50   1714 1 18274 return if $_[0]{'_stunted'};
1046              
1047             # Either: Acccept an end-tag signal from HTML::Parser
1048             # Or: Method for closing currently open elements in some fairly complex
1049             # way, as used by other methods in this class.
1050 1714         18619 my ( $self, $tag, @stop ) = @_;
1051 1714 50       18082 if ( $tag eq 'x-html' ) {
1052 0         0 print "Ignoring close-x-html tag.\n" if DEBUG;
1053              
1054             # inserted by some lame code-generators.
1055 0         0 return;
1056             }
1057              
1058 1714 50 66     21789 unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
1059 0         0 DEBUG and print "End-tag name $tag is no good. Skipping.\n";
1060 0         0 return;
1061              
1062             # This avoids having Element's new() throw an exception.
1063             }
1064              
1065             # This method accepts two calling formats:
1066             # 1) from Parser: $self->end('tag_name', 'origtext')
1067             # in which case we shouldn't mistake origtext as a blocker tag
1068             # 2) from myself: $self->end(\q{tagname1}, 'blk1', ... )
1069             # from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... )
1070              
1071             # End the specified tag, but don't move above any of the blocker tags.
1072             # The tag can also be a reference to an array. Terminate the first
1073             # tag found.
1074              
1075 1714   66     18873 my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'};
1076              
1077             # $p and $ptag are sort-of scratch
1078              
1079 1714 100       17449 if ( ref($tag) ) {
1080              
1081             # First param is a ref of one sort or another --
1082             # THE CALL IS COMING FROM INSIDE THE HOUSE!
1083 415 100       973 $tag = $$tag if ref($tag) eq 'SCALAR';
1084              
1085             # otherwise it's an arrayref.
1086             }
1087             else {
1088              
1089             # the call came from Parser -- just ignore origtext
1090             # except in a table ignore unmatched table tags RT #59980
1091 1299 100       31119 @stop = $tag =~ /^t[hdr]\z/ ? 'table' : ();
1092             }
1093              
1094             #my($indent);
1095 1714         16308 if (DEBUG) {
1096              
1097             # optimization -- don't figure out depth unless we're in debug mode
1098             my @lineage_tags = $p->lineage_tag_names;
1099             $indent = ' ' x ( 1 + @lineage_tags );
1100              
1101             # now announce ourselves
1102             print $indent, "Ending ",
1103             ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E",
1104             scalar(@stop)
1105             ? ( " no higher than [", join( ' ', @stop ), "]" )
1106             : (), ".\n";
1107              
1108             print $indent, " (Current lineage: ", join( '/', @lineage_tags ),
1109             ".)\n"
1110             if DEBUG > 1;
1111              
1112             if ( DEBUG > 3 ) {
1113              
1114             #my(
1115             # $package, $filename, $line, $subroutine,
1116             # $hasargs, $wantarray, $evaltext, $is_require) = caller;
1117             print $indent,
1118             " (Called from ", ( caller(1) )[3], ' line ',
1119             ( caller(1) )[2],
1120             ")\n";
1121             }
1122              
1123             #} else {
1124             # $indent = ' ';
1125             }
1126              
1127             # End of if DEBUG
1128              
1129             # Now actually do it
1130 1714         16430 my @to_close;
1131 1714 100       18135 if ( $tag eq '*' ) {
    100          
1132              
1133             # Special -- close everything up to (but not including) the first
1134             # limiting tag, or return if none found. Somewhat of a special case.
1135             PARENT:
1136 41         93 while ( defined $p ) {
1137 80         128 $ptag = $p->{'_tag'};
1138 80         106 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1139 80         141 for (@stop) {
1140 131 100       314 if ( $ptag eq $_ ) {
1141 41         62 print $indent,
1142             " (Hit a $_; closing everything up to here.)\n"
1143             if DEBUG > 2;
1144 41         78 last PARENT;
1145             }
1146             }
1147 39         67 push @to_close, $p;
1148 39         67 $p = $p->{'_parent'}; # no match so far? keep moving up
1149             print $indent,
1150 39         83 " (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n"
1151             if DEBUG > 1;
1152             }
1153 41 50       95 unless ( defined $p ) { # We never found what we were looking for.
1154 0         0 print $indent, " (We never found a limit.)\n" if DEBUG > 1;
1155 0         0 return;
1156             }
1157              
1158             #print
1159             # $indent,
1160             # " (To close: ", join('/', map $_->tag, @to_close), ".)\n"
1161             # if DEBUG > 4;
1162              
1163             # Otherwise update pos and fall thru.
1164 41         74 $self->{'_pos'} = $p;
1165             }
1166             elsif ( ref $tag ) {
1167              
1168             # Close the first of any of the matching tags, giving up if you hit
1169             # any of the stop-tags.
1170             PARENT:
1171 307         757 while ( defined $p ) {
1172 584         991 $ptag = $p->{'_tag'};
1173 584         785 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1174 584         1096 for (@$tag) {
1175 5014 100       10511 if ( $ptag eq $_ ) {
1176 31         42 print $indent, " (Closing $_.)\n" if DEBUG > 2;
1177 31         62 last PARENT;
1178             }
1179             }
1180 553         985 for (@stop) {
1181 8295 50       16900 if ( $ptag eq $_ ) {
1182 0         0 print $indent,
1183             " (Hit a limiting $_ -- bailing out.)\n"
1184             if DEBUG > 1;
1185 0         0 return; # so it was all for naught
1186             }
1187             }
1188 553         922 push @to_close, $p;
1189 553         1300 $p = $p->{'_parent'};
1190             }
1191 307 100       1180 return unless defined $p; # We went off the top of the tree.
1192             # Otherwise specified element was found; set pos to its parent.
1193 31         65 push @to_close, $p;
1194 31         64 $self->{'_pos'} = $p->{'_parent'};
1195             }
1196             else {
1197              
1198             # Close the first of the specified tag, giving up if you hit
1199             # any of the stop-tags.
1200 1366         16988 while ( defined $p ) {
1201 1876         32053 $ptag = $p->{'_tag'};
1202 1876         31446 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1203 1876 100       33525 if ( $ptag eq $tag ) {
1204 1324         14532 print $indent, " (Closing $tag.)\n" if DEBUG > 2;
1205 1324         27417 last;
1206             }
1207 552         17839 for (@stop) {
1208 67 100       167 if ( $ptag eq $_ ) {
1209 32         41 print $indent,
1210             " (Hit a limiting $_ -- bailing out.)\n"
1211             if DEBUG > 1;
1212 32         94 return; # so it was all for naught
1213             }
1214             }
1215 520         17715 push @to_close, $p;
1216 520         19113 $p = $p->{'_parent'};
1217             }
1218 1334 100       18382 return unless defined $p; # We went off the top of the tree.
1219             # Otherwise specified element was found; set pos to its parent.
1220 1324         14740 push @to_close, $p;
1221 1324         27324 $self->{'_pos'} = $p->{'_parent'};
1222             }
1223              
1224 1396 100 100     17696 $self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' );
1225             print $indent, "(Pos now points to ",
1226 1396         14626 $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n"
1227             if DEBUG > 1;
1228              
1229             ### EXPENSIVE, because has to check that it's not under a pre
1230             ### or a CDATA-parent. That's one more method call per end()!
1231             ### Might as well just do this at the end of the tree-parse, I guess,
1232             ### at which point we'd be parsing top-down, and just not traversing
1233             ### under pre's or CDATA-parents.
1234             ##
1235             ## Take this opportunity to nix any terminal whitespace nodes.
1236             ## TODO: consider whether this (plus the logic in start(), above)
1237             ## would ever leave any WS nodes in the tree.
1238             ## If not, then there's no reason to have eof() call
1239             ## delete_ignorable_whitespace on the tree, is there?
1240             ##
1241             #if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and
1242             # ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent)
1243             #) { # if tightenable
1244             # my($children, $e_tag);
1245             # foreach my $e (reverse @to_close) { # going top-down
1246             # last if 'pre' eq ($e_tag = $e->{'_tag'}) or
1247             # $HTML::Tagset::isCDATA_Parent{$e_tag};
1248             #
1249             # if(
1250             # $children = $e->{'_content'}
1251             # and @$children # has children
1252             # and !ref($children->[-1])
1253             # and $children->[-1] =~ m<^\s+$>s # last node is all-WS
1254             # and
1255             # (
1256             # # has a tightenable parent:
1257             # $HTML::TreeBuilder::canTighten{ $e_tag }
1258             # or
1259             # ( # has a tightenable left sibling:
1260             # @$children > 1 and
1261             # ref($children->[-2])
1262             # and $HTML::TreeBuilder::canTighten{ $children->[-2]{'_tag'} }
1263             # )
1264             # )
1265             # ) {
1266             # pop @$children;
1267             # #print $indent, "Popping a terminal WS node from ", $e->{'_tag'},
1268             # # " (", $e->address, ") while exiting.\n" if DEBUG;
1269             # }
1270             # }
1271             #}
1272              
1273 1396         14932 foreach my $e (@to_close) {
1274              
1275             # Call the applicable callback, if any
1276 1885         28117 $ptag = $e->{'_tag'};
1277 1885         29966 &{ $self->{"_tweak_$ptag"}
1278 1885 50 33     58609 || $self->{'_tweak_*'}
1279             || next }( map $_, $e, $ptag, $self );
1280 0         0 print $indent, "Back from tweaking.\n" if DEBUG;
1281             last
1282 0 0       0 if $self->{ '_stunted'
1283             }; # in case one of the handlers called stunt
1284             }
1285 1396         30331 return @to_close;
1286             }
1287             }
1288              
1289             #==========================================================================
1290             {
1291             my ( $indent, $nugget );
1292              
1293             sub text {
1294 2587 50   2587 1 93193 return if $_[0]{'_stunted'};
1295              
1296             # Accept a "here's a text token" signal from HTML::Parser.
1297 2587         16780 my ( $self, $text, $is_cdata ) = @_;
1298              
1299             # the >3.0 versions of Parser may pass a cdata node.
1300             # Thanks to Gisle Aas for pointing this out.
1301              
1302 2587 50       17186 return unless length $text; # I guess that's always right
1303              
1304 2587         15308 my $ignore_text = $self->{'_ignore_text'};
1305 2587         15180 my $no_space_compacting = $self->{'_no_space_compacting'};
1306 2587         15061 my $no_expand_entities = $self->{'_no_expand_entities'};
1307 2587   66     17801 my $pos = $self->{'_pos'} || $self;
1308              
1309             HTML::Entities::decode($text)
1310             unless $ignore_text
1311             || $is_cdata
1312 2587 100 33     35009 || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }
      33        
      66        
1313             || $no_expand_entities;
1314              
1315             #my($indent, $nugget);
1316 2587         14960 if (DEBUG) {
1317              
1318             # optimization -- don't figure out depth unless we're in debug mode
1319             my @lineage_tags = $pos->lineage_tag_names;
1320             $indent = ' ' x ( 1 + @lineage_tags );
1321              
1322             $nugget
1323             = ( length($text) <= 25 )
1324             ? $text
1325             : ( substr( $text, 0, 25 ) . '...' );
1326             $nugget =~ s<([\x00-\x1F])>
1327             <'\\x'.(unpack("H2",$1))>eg;
1328             print $indent, "Proposing a new text node ($nugget) under ",
1329             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) )
1330             || 'Root',
1331             ".\n";
1332              
1333             #} else {
1334             # $indent = ' ';
1335             }
1336              
1337 2587         15087 my $ptag;
1338 2587 50 33     20861 if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} }
1339              
1340             #or $pos->is_inside('pre')
1341             or $pos->is_inside( 'pre', 'textarea' )
1342             )
1343             {
1344 0 0       0 return if $ignore_text;
1345 0         0 $pos->push_content($text);
1346             }
1347             else {
1348              
1349             # return unless $text =~ /\S/; # This is sometimes wrong
1350              
1351 2587 100 100     30332 if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) {
    50 33        
    100          
    100          
    100          
    100          
1352              
1353             # don't change anything
1354             }
1355             elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) {
1356 0 0       0 if ( $self->{'_implicit_body_p_tag'} ) {
1357 0         0 print $indent,
1358             " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n"
1359             if DEBUG > 1;
1360 0         0 $self->end( \$ptag );
1361             $pos = $self->{'_body'}
1362             ? ( $self->{'_pos'}
1363 0 0       0 = $self->{'_body'} ) # expected case
1364             : $self->insert_element( 'body', 1 );
1365 0         0 $pos = $self->insert_element( 'p', 1 );
1366             }
1367             else {
1368 0         0 print $indent,
1369             " * Text node under \U$ptag\E closes, implicates BODY.\n"
1370             if DEBUG > 1;
1371 0         0 $self->end( \$ptag );
1372             $pos = $self->{'_body'}
1373             ? ( $self->{'_pos'}
1374 0 0       0 = $self->{'_body'} ) # expected case
1375             : $self->insert_element( 'body', 1 );
1376             }
1377             }
1378             elsif ( $ptag eq 'html' ) {
1379 54 100       5695 if ( $self->{'_implicit_body_p_tag'} ) {
1380 9         1389 print $indent,
1381             " * Text node under HTML implicates BODY and P.\n"
1382             if DEBUG > 1;
1383             $pos = $self->{'_body'}
1384             ? ( $self->{'_pos'}
1385 9 50       1378 = $self->{'_body'} ) # expected case
1386             : $self->insert_element( 'body', 1 );
1387 9         1373 $pos = $self->insert_element( 'p', 1 );
1388             }
1389             else {
1390 45         4317 print $indent,
1391             " * Text node under HTML implicates BODY.\n"
1392             if DEBUG > 1;
1393             $pos = $self->{'_body'}
1394             ? ( $self->{'_pos'}
1395 45 50       12686 = $self->{'_body'} ) # expected case
1396             : $self->insert_element( 'body', 1 );
1397              
1398             #print "POS is $pos, ", $pos->{'_tag'}, "\n";
1399             }
1400             }
1401             elsif ( $ptag eq 'body' ) {
1402 400 50       9760 if ( $self->{'_implicit_body_p_tag'} ) {
1403 0         0 print $indent, " * Text node under BODY implicates P.\n"
1404             if DEBUG > 1;
1405 0         0 $pos = $self->insert_element( 'p', 1 );
1406             }
1407             }
1408             elsif ( $ptag eq 'table' ) {
1409 8         12 print $indent,
1410             " * Text node under TABLE implicates TR and TD.\n"
1411             if DEBUG > 1;
1412 8         28 $self->insert_element( 'tr', 1 );
1413 8         21 $pos = $self->insert_element( 'td', 1 );
1414              
1415             # double whammy!
1416             }
1417             elsif ( $ptag eq 'tr' ) {
1418 3         7 print $indent, " * Text node under TR implicates TD.\n"
1419             if DEBUG > 1;
1420 3         8 $pos = $self->insert_element( 'td', 1 );
1421             }
1422              
1423             # elsif (
1424             # # $ptag eq 'li' ||
1425             # # $ptag eq 'dd' ||
1426             # $ptag eq 'form') {
1427             # $pos = $self->insert_element('p', 1);
1428             #}
1429              
1430             # Whatever we've done above should have had the side
1431             # effect of updating $self->{'_pos'}
1432              
1433             #print "POS is now $pos, ", $pos->{'_tag'}, "\n";
1434              
1435 2587 50       17024 return if $ignore_text;
1436 2587 100       21919 $text =~ s/[\n\r\f\t ]+/ /g # canonical space
1437             unless $no_space_compacting;
1438              
1439             print $indent, " (Attaching text node ($nugget) under ",
1440              
1441             # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'},
1442 2587         15537 $pos->{'_tag'}, ").\n"
1443             if DEBUG > 1;
1444              
1445 2587         17650 $pos->push_content($text);
1446             }
1447              
1448 2587 50       43748 &{ $self->{'_tweak_~text'} || return }( $text, $pos,
1449 2587         16509 $pos->{'_tag'} . '' );
1450              
1451             # Note that this is very exceptional -- it doesn't fall back to
1452             # _tweak_*, and it gives its tweak different arguments.
1453 0         0 return;
1454             }
1455             }
1456              
1457             #==========================================================================
1458              
1459             # TODO: test whether comment(), declaration(), and process(), do the right
1460             # thing as far as tightening and whatnot.
1461             # Also, currently, doctypes and comments that appear before head or body
1462             # show up in the tree in the wrong place. Something should be done about
1463             # this. Tricky. Maybe this whole business of pre-making the body and
1464             # whatnot is wrong.
1465              
1466             sub comment {
1467 195 50   195 1 1948 return if $_[0]{'_stunted'};
1468              
1469             # Accept a "here's a comment" signal from HTML::Parser.
1470              
1471 195         414 my ( $self, $text ) = @_;
1472 195   66     583 my $pos = $self->{'_pos'} || $self;
1473             return
1474             unless $self->{'_store_comments'}
1475 195 100 66     1207 || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} };
1476              
1477 1         1 if (DEBUG) {
1478             my @lineage_tags = $pos->lineage_tag_names;
1479             my $indent = ' ' x ( 1 + @lineage_tags );
1480              
1481             my $nugget
1482             = ( length($text) <= 25 )
1483             ? $text
1484             : ( substr( $text, 0, 25 ) . '...' );
1485             $nugget =~ s<([\x00-\x1F])>
1486             <'\\x'.(unpack("H2",$1))>eg;
1487             print $indent, "Proposing a Comment ($nugget) under ",
1488             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1489             ".\n";
1490             }
1491              
1492 1         4 ( my $e = $self->element_class->new('~comment') )->{'text'} = $text;
1493 1         4 $pos->push_content($e);
1494 1         2 ++( $self->{'_element_count'} );
1495              
1496 1         4 &{ $self->{'_tweak_~comment'}
1497 1 50 33     7 || $self->{'_tweak_*'}
1498             || return $e }( map $_, $e, '~comment', $self );
1499              
1500 0         0 return $e;
1501             }
1502              
1503             sub declaration {
1504 17 50   17 1 1158 return if $_[0]{'_stunted'};
1505              
1506             # Accept a "here's a markup declaration" signal from HTML::Parser.
1507              
1508 17         77 my ( $self, $text ) = @_;
1509 17   33     80 my $pos = $self->{'_pos'} || $self;
1510              
1511 17         28 if (DEBUG) {
1512             my @lineage_tags = $pos->lineage_tag_names;
1513             my $indent = ' ' x ( 1 + @lineage_tags );
1514              
1515             my $nugget
1516             = ( length($text) <= 25 )
1517             ? $text
1518             : ( substr( $text, 0, 25 ) . '...' );
1519             $nugget =~ s<([\x00-\x1F])>
1520             <'\\x'.(unpack("H2",$1))>eg;
1521             print $indent, "Proposing a Declaration ($nugget) under ",
1522             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1523             ".\n";
1524             }
1525 17         47 ( my $e = $self->element_class->new('~declaration') )->{'text'} = $text;
1526              
1527 17         39 $self->{_decl} = $e;
1528 17         106 return $e;
1529             }
1530              
1531             #==========================================================================
1532              
1533             sub process {
1534 2 50   2 1 138 return if $_[0]{'_stunted'};
1535              
1536             # Accept a "here's a PI" signal from HTML::Parser.
1537              
1538 2 50       17 return unless $_[0]->{'_store_pis'};
1539 0         0 my ( $self, $text ) = @_;
1540 0   0     0 my $pos = $self->{'_pos'} || $self;
1541              
1542 0         0 if (DEBUG) {
1543             my @lineage_tags = $pos->lineage_tag_names;
1544             my $indent = ' ' x ( 1 + @lineage_tags );
1545              
1546             my $nugget
1547             = ( length($text) <= 25 )
1548             ? $text
1549             : ( substr( $text, 0, 25 ) . '...' );
1550             $nugget =~ s<([\x00-\x1F])>
1551             <'\\x'.(unpack("H2",$1))>eg;
1552             print $indent, "Proposing a PI ($nugget) under ",
1553             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1554             ".\n";
1555             }
1556 0         0 ( my $e = $self->element_class->new('~pi') )->{'text'} = $text;
1557 0         0 $pos->push_content($e);
1558 0         0 ++( $self->{'_element_count'} );
1559              
1560 0 0 0     0 &{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_,
  0         0  
1561             $e, '~pi', $self );
1562              
1563 0         0 return $e;
1564             }
1565              
1566             #==========================================================================
1567              
1568             #When you call $tree->parse_file($filename), and the
1569             #tree's ignore_ignorable_whitespace attribute is on (as it is
1570             #by default), HTML::TreeBuilder's logic will manage to avoid
1571             #creating some, but not all, nodes that represent ignorable
1572             #whitespace. However, at the end of its parse, it traverses the
1573             #tree and deletes any that it missed. (It does this with an
1574             #around-method around HTML::Parser's eof method.)
1575             #
1576             #However, with $tree->parse($content), the cleanup-traversal step
1577             #doesn't happen automatically -- so when you're done parsing all
1578             #content for a document (regardless of whether $content is the only
1579             #bit, or whether it's just another chunk of content you're parsing into
1580             #the tree), call $tree->eof() to signal that you're at the end of the
1581             #text you're inputting to the tree. Besides properly cleaning any bits
1582             #of ignorable whitespace from the tree, this will also ensure that
1583             #HTML::Parser's internal buffer is flushed.
1584              
1585             sub eof {
1586              
1587             # Accept an "end-of-file" signal from HTML::Parser, or thrown by the user.
1588              
1589 373 50   373 1 24207 return if $_[0]->{'_done'}; # we've already been here
1590              
1591 373 50       11906 return $_[0]->SUPER::eof() if $_[0]->{'_stunted'};
1592              
1593 373         11746 my $x = $_[0];
1594 373         11587 print "EOF received.\n" if DEBUG;
1595 373         11560 my (@rv);
1596 373 50       11901 if (wantarray) {
1597              
1598             # I don't think this makes any difference for this particular
1599             # method, but let's be scrupulous, for once.
1600 0         0 @rv = $x->SUPER::eof();
1601             }
1602             else {
1603 373         18397 $rv[0] = $x->SUPER::eof();
1604             }
1605              
1606 373 100 66     13467 $x->end('html') unless $x eq ( $x->{'_pos'} || $x );
1607              
1608             # That SHOULD close everything, and will run the appropriate tweaks.
1609             # We /could/ be running under some insane mode such that there's more
1610             # than one HTML element, but really, that's just insane to do anyhow.
1611              
1612 373 100       12124 unless ( $x->{'_implicit_tags'} ) {
1613              
1614             # delete those silly implicit head and body in case we put
1615             # them there in implicit tags mode
1616 9         22 foreach my $node ( $x->{'_head'}, $x->{'_body'} ) {
1617             $node->replace_with_content
1618             if defined $node
1619             and ref $node
1620             and $node->{'_implicit'}
1621 18 50 33     167 and $node->{'_parent'};
      33        
      33        
1622              
1623             # I think they should be empty anyhow, since the only
1624             # logic that'd insert under them can apply only, I think,
1625             # in the case where _implicit_tags is on
1626             }
1627              
1628             # this may still leave an implicit 'html' at the top, but there's
1629             # nothing we can do about that, is there?
1630             }
1631              
1632             $x->delete_ignorable_whitespace()
1633              
1634             # this's why we trap this -- an after-method
1635 373 100 66     13757 if $x->{'_tighten'} and !$x->{'_ignore_text'};
1636 373         11808 $x->{'_done'} = 1;
1637              
1638 373 50       11946 return @rv if wantarray;
1639 373         23805 return $rv[0];
1640             }
1641              
1642             #==========================================================================
1643              
1644             # TODO: document
1645              
1646             sub stunt {
1647 0     0 1 0 my $self = $_[0];
1648 0         0 print "Stunting the tree.\n" if DEBUG;
1649 0         0 $self->{'_done'} = 1;
1650              
1651 0 0       0 if ( $HTML::Parser::VERSION < 3 ) {
1652              
1653             #This is a MEAN MEAN HACK. And it works most of the time!
1654 0         0 $self->{'_buf'} = '';
1655 0         0 my $fh = *HTML::Parser::F{IO};
1656              
1657             # the local'd FH used by parse_file loop
1658 0 0       0 if ( defined $fh ) {
1659 0         0 print "Closing Parser's filehandle $fh\n" if DEBUG;
1660 0         0 close($fh);
1661             }
1662              
1663             # But if they called $tree->parse_file($filehandle)
1664             # or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO}
1665             # to close. Ahwell. Not a problem for most users these days.
1666              
1667             }
1668             else {
1669 0         0 $self->SUPER::eof();
1670              
1671             # Under 3+ versions, calling eof from inside a parse will abort the
1672             # parse / parse_file
1673             }
1674              
1675             # In the off chance that the above didn't work, we'll throw
1676             # this flag to make any future events be no-ops.
1677 0         0 $self->stunted(1);
1678 0         0 return;
1679             }
1680              
1681             # TODO: document
1682 0     0 1 0 sub stunted { shift->_elem( '_stunted', @_ ); }
1683 0     0 1 0 sub done { shift->_elem( '_done', @_ ); }
1684              
1685             #==========================================================================
1686              
1687             sub delete {
1688              
1689             # Override Element's delete method.
1690             # This does most, if not all, of what Element's delete does anyway.
1691             # Deletes content, including content in some special attributes.
1692             # But doesn't empty out the hash.
1693              
1694 270     270 1 90470 $_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct
1695              
1696 270         4095 delete @{ $_[0] }{ '_body', '_head', '_pos' };
  270         8212  
1697 270         4053 for (
1698 270 50       8288 @{ delete( $_[0]->{'_content'} ) || [] }, # all/any content
1699              
1700             # delete @{$_[0]}{'_body', '_head', '_pos'}
1701             # ...and these, in case these elements don't appear in the
1702             # content, which is possible. If they did appear (as they
1703             # usually do), then calling $_->delete on them again is harmless.
1704             # I don't think that's such a hot idea now. Thru creative reattachment,
1705             # those could actually now point to elements in OTHER trees (which we do
1706             # NOT want to delete!).
1707             ## Reasoned out:
1708             # If these point to elements not in the content list of any element in this
1709             # tree, but not in the content list of any element in any OTHER tree, then
1710             # just deleting these will make their refcounts hit zero.
1711             # If these point to elements in the content lists of elements in THIS tree,
1712             # then we'll get to deleting them when we delete from the top.
1713             # If these point to elements in the content lists of elements in SOME OTHER
1714             # tree, then they're not to be deleted.
1715             )
1716             {
1717 540 50 33     7840 $_->delete
      33        
1718             if defined $_ and ref $_ # Make sure it's an object.
1719             and $_ ne $_[0]; # And avoid hitting myself, just in case!
1720             }
1721              
1722 270 0 33     4545 $_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'};
1723              
1724             # An 'html' element having a parent is quite unlikely.
1725              
1726 270         17306 return;
1727             }
1728              
1729             sub tighten_up { # legacy
1730 0     0 1 0 shift->delete_ignorable_whitespace(@_);
1731             }
1732              
1733             sub elementify {
1734              
1735             # Rebless this object down into the normal element class.
1736 27     27 1 4295 my $self = $_[0];
1737 27         4337 my $to_class = $self->element_class;
1738 27         8874 delete @{$self}{
1739             grep {
1740 27         4412 ;
1741 621 100 33     206618 length $_ and substr( $_, 0, 1 ) eq '_'
      66        
      66        
      100        
      100        
      100        
      66        
1742              
1743             # The private attributes that we'll retain:
1744             and $_ ne '_tag'
1745             and $_ ne '_parent'
1746             and $_ ne '_content'
1747             and $_ ne '_implicit'
1748             and $_ ne '_pos'
1749             and $_ ne '_element_class'
1750             and $_ ne '_encoding'
1751             } keys %$self
1752             };
1753 27         8696 bless $self, $to_class; # Returns the same object we were fed
1754             }
1755              
1756             sub element_class {
1757 4637 100   4637 1 67216 return 'HTML::Element' if not ref $_[0];
1758 4258   50     89971 return $_[0]->{_element_class} || 'HTML::Element';
1759             }
1760              
1761             #--------------------------------------------------------------------------
1762              
1763             sub guts {
1764 0     0 1   my @out;
1765 0           my @stack = ( $_[0] );
1766 0           my $destructive = $_[1];
1767 0           my $this;
1768 0           while (@stack) {
1769 0           $this = shift @stack;
1770 0 0         if ( !ref $this ) {
    0          
1771 0           push @out, $this; # yes, it can include text nodes
1772             }
1773             elsif ( !$this->{'_implicit'} ) {
1774 0           push @out, $this;
1775 0 0         delete $this->{'_parent'} if $destructive;
1776             }
1777             else {
1778              
1779             # it's an implicit node. Delete it and recurse
1780 0 0         delete $this->{'_parent'} if $destructive;
1781             unshift @stack,
1782             @{
1783 0           ( $destructive
1784             ? delete( $this->{'_content'} )
1785 0 0         : $this->{'_content'}
    0          
1786             )
1787             || []
1788             };
1789             }
1790             }
1791              
1792             # Doesn't call a real $root->delete on the (when implicit) root,
1793             # but I don't think it needs to.
1794              
1795 0 0         return @out if wantarray; # one simple normal case.
1796 0 0         return unless @out;
1797 0 0 0       return $out[0] if @out == 1 and ref( $out[0] );
1798 0           my $x = HTML::Element->new( 'div', '_implicit' => 1 );
1799 0           $x->push_content(@out);
1800 0           return $x;
1801             }
1802              
1803 0     0 1   sub disembowel { $_[0]->guts(1) }
1804              
1805             #--------------------------------------------------------------------------
1806             1;
1807              
1808             __END__