File Coverage

blib/lib/HTML/TreeBuilder.pm
Criterion Covered Total %
statement 402 553 72.6
branch 205 348 58.9
condition 126 255 49.4
subroutine 44 52 84.6
pod 38 38 100.0
total 815 1246 65.4


line stmt bran cond sub pod time code
1             package HTML::TreeBuilder;
2              
3             # ABSTRACT: Parser that builds a HTML syntax tree
4              
5 20     20   516606 use 5.008;
  20         76  
  20         829  
6 20     20   115 use warnings;
  20         47  
  20         625  
7 20     20   116 use strict;
  20         42  
  20         714  
8 20     20   27666 use integer; # vroom vroom!
  20         213  
  20         109  
9 20     20   550 use Carp ();
  20         41  
  20         421  
10 20     20   117 use Scalar::Util qw(openhandle);
  20         51  
  20         5218  
11              
12             our $VERSION = '5.909'; # TRIAL VERSION from OurPkgVersion
13              
14             #---------------------------------------------------------------------------
15             # Make a 'DEBUG' constant...
16              
17             our $DEBUG; # Must be set BEFORE loading this file
18             BEGIN {
19              
20             # We used to have things like
21             # print $indent, "lalala" if $Debug;
22             # But there were an awful lot of having to evaluate $Debug's value.
23             # If we make that depend on a constant, like so:
24             # sub DEBUG () { 1 } # or whatever value.
25             # ...
26             # print $indent, "lalala" if DEBUG;
27             # Which at compile-time (thru the miracle of constant folding) turns into:
28             # print $indent, "lalala";
29             # or, if DEBUG is a constant with a true value, then that print statement
30             # is simply optimized away, and doesn't appear in the target code at all.
31             # If you don't believe me, run:
32             # perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \
33             # $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder'
34             # and see for yourself (substituting whatever value you want for $DEBUG
35             # there).
36             ## no critic
37 20 50   20   236 if ( defined &DEBUG ) {
    50          
    50          
    0          
38              
39             # Already been defined! Do nothing.
40             }
41             elsif ( $] < 5.00404 ) {
42              
43             # Grudgingly accommodate ancient (pre-constant) versions.
44 0         0 eval 'sub DEBUG { $Debug } ';
45             }
46             elsif ( !$DEBUG ) {
47 20         1492 eval 'sub DEBUG () {0}'; # Make it a constant.
48             }
49             elsif ( $DEBUG =~ m<^\d+$>s ) {
50 0         0 eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant.
51             }
52             else { # WTF?
53 0         0 warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG";
54 0         0 eval 'sub DEBUG () { $DEBUG }'; # I guess.
55             }
56             ## use critic
57             }
58              
59             #---------------------------------------------------------------------------
60              
61 20     20   20365 use HTML::Entities ();
  20         187880  
  20         1085  
62 20     20   22106 use HTML::Tagset 3.02 ();
  20         45934  
  20         770  
63              
64 20     20   40682 use HTML::Element ();
  20         66  
  20         677  
65 20     20   167 use HTML::Parser 3.46 ();
  20         701  
  20         40117  
66             our @ISA = qw(HTML::Element HTML::Parser);
67              
68             # This looks schizoid, I know.
69             # It's not that we ARE an element AND a parser.
70             # We ARE an element, but one that knows how to handle signals
71             # (method calls) from Parser in order to elaborate its subtree.
72              
73             # Legacy aliases:
74             *HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown;
75             *HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten;
76             *HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement;
77             *HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement;
78             *HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup;
79             *HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement;
80             *HTML::TreeBuilder::isList = \%HTML::Tagset::isList;
81             *HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement;
82             *HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement;
83             *HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers;
84              
85             #==========================================================================
86             # Two little shortcut constructors:
87              
88             sub new_from_file { # or from a FH
89 17     17 1 27649 my $class = shift;
90 17 50       114 Carp::croak("new_from_file takes an odd number of arguments")
91             unless @_ % 2;
92 17 50       63 Carp::croak("new_from_file is a class method only")
93             if ref $class;
94 17         37 my $file = shift;
95 17         86 my $new = $class->new(@_);
96 17 100       67 defined $new->parse_file( $file )
97             or Carp::croak("unable to parse file: $!");
98 16         89 return $new;
99             }
100              
101             sub new_from_content { # from any number of scalars
102 8     8 1 6667 my $class = shift;
103 8 50       252 Carp::croak("new_from_content is a class method only")
104             if ref $class;
105 8         219 my $new = $class->new();
106 8         31 foreach my $whunk (@_) {
107 9 100       60 if ( ref($whunk) eq 'SCALAR' ) {
108 2         34 $new->parse($$whunk);
109             }
110             else {
111 7         209 $new->parse($whunk);
112             }
113 9 50       51 last if $new->{'_stunted'}; # might as well check that.
114             }
115 8         40 $new->eof();
116 8         43 return $new;
117             }
118              
119             sub new_from_string { # from a single scalar (plus options)
120 16     16 1 13725 my $class = shift;
121 16 50       121 Carp::croak("new_from_string takes an odd number of arguments")
122             unless @_ % 2;
123 16 50       55 Carp::croak("new_from_string is a class method only")
124             if ref $class;
125 16         30 my $string = shift;
126 16         76 my $new = $class->new(@_);
127 16         58 $new->parse_content($string);
128 16         49 return $new;
129             }
130              
131             sub new_from_url { # should accept anything that LWP does.
132 4     4 1 11 undef our $lwp_response;
133 4         14 my $class = shift;
134 4 50       26 Carp::croak("new_from_url takes an odd number of arguments")
135             unless @_ % 2;
136 4 50       49 Carp::croak("new_from_url is a class method only")
137             if ref $class;
138 4         10 my $url = shift;
139              
140 4         42 require LWP::UserAgent;
141             # RECOMMEND PREREQ: LWP::UserAgent 5.815
142 4         87 LWP::UserAgent->VERSION( 5.815 ); # HTTP::Headers content_is_html method
143 4         45 $lwp_response = LWP::UserAgent->new->get( $url );
144              
145 4 100       63794 Carp::croak("GET failed on $url: " . $lwp_response->status_line)
146             unless $lwp_response->is_success;
147 3 100       48 Carp::croak("$url returned " . $lwp_response->content_type . " not HTML")
148             unless $lwp_response->content_is_html;
149              
150 2         154 my $new = $class->new_from_http($lwp_response, @_);
151              
152 2         17 undef $lwp_response; # Processed successfully
153 2         82 return $new;
154             }
155              
156             sub new_from_http { # from a HTTP::Message (or subclass)
157 3     3 1 8 my $class = shift;
158 3 50       14 Carp::croak("new_from_http takes an odd number of arguments")
159             unless @_ % 2;
160 3 50       12 Carp::croak("new_from_http is a class method only")
161             if ref $class;
162 3         5 my $message = shift;
163 3         16 my $new = $class->new(@_);
164              
165 3         6 my $cref;
166              
167 3         7 my %opt = @_;
168 3 50       11 if (defined $opt{encoding}) {
169             # User-specified charset:
170 0   0     0 my $charset = ($opt{encoding} || 'none');
171 0         0 $charset =~ s/:BOM\z//;
172 0         0 $cref = $message->decoded_content(ref => 1, charset => $charset);
173             } else {
174             # Auto-detect charset:
175 3   50     22 my $charset = $message->content_charset || 'cp1252';
176 3         4284 $cref = $message->decoded_content(ref => 1, charset => $charset);
177 3 50       342 if ($charset eq 'none') {
178 0         0 $charset = '';
179             } else {
180 3         30 require Encode;
181 3 50       14 if (my $encoding = Encode::find_encoding($charset)) {
182 3         74 $charset = $encoding->name; # canonical name
183 3 50       15 $charset .= ':BOM' if $$cref =~ /^\x{FeFF}/;
184             } else {
185 0         0 undef $charset; # Encode doesn't recognize it
186             }
187             }
188 3         8 $new->{_encoding} = $charset;
189             } # end else auto-detect charset
190              
191 3         44 $new->parse( $$cref );
192 3         20 $new->eof;
193 3         55 return $new;
194             }
195              
196             # TODO: document more fully?
197             sub parse_content { # from any number of scalars
198 20     20 1 53 my $tree = shift;
199 20         29 my $retval;
200 20         43 foreach my $whunk (@_) {
201 21 100       61 if ( ref($whunk) eq 'SCALAR' ) {
202 3         40 $retval = $tree->parse($$whunk);
203             }
204             else {
205 18         300 $retval = $tree->parse($whunk);
206             }
207 21 50       89 last if $tree->{'_stunted'}; # might as well check that.
208             }
209 20         80 $tree->eof();
210 20         37 return $retval;
211             }
212              
213             #---------------------------------------------------------------------
214             sub parse_file {
215 20     20 1 56 my ($self, $file) = @_;
216              
217 20 50       57 Carp::croak("parse_file requires file argument") unless defined $file;
218              
219 20         68 my $fh = openhandle($file);
220 20 100       60 unless (defined $fh) {
221 17         39 my $encoding = $self->{_encoding};
222              
223 17 100       47 if (not defined $encoding) {
224 11         4118 require IO::HTML;
225              
226 11         50554 { local $@;
  11         18  
227 11         22 eval {
228 11         51 ($fh, $encoding, my $bom) =
229             IO::HTML::file_and_encoding($file);
230 10 100       15774 $encoding .= ':BOM' if $bom;
231             };
232             } # end local $@
233 11         319 $self->{_encoding} = $encoding;
234             } # end if auto encoding
235             else {
236 6         13 $encoding =~ s/:BOM$//;
237 6 100       435 open($fh, (length($encoding) ? "<:encoding($encoding):crlf"
    50          
238             : "<:raw"), $file)
239             or undef $fh;
240             }
241              
242 17 100       1215 return undef unless defined $fh;
243             } # end unless filehandle was passed in
244              
245 19         145 $self->SUPER::parse_file($fh);
246             }
247              
248             #---------------------------------------------------------------------------
249              
250             sub new { # constructor!
251 315     315 1 140766 my $class = shift;
252 315   33     2012 $class = ref($class) || $class;
253              
254             # Initialize HTML::Element part
255 315         1157 my $self = $class->element_class->new('html');
256              
257             {
258              
259             # A hack for certain strange versions of Parser:
260 315         673 my $other_self = HTML::Parser->new();
  315         1730  
261 315         23341 %$self = ( %$self, %$other_self ); # copy fields
262             # Yes, multiple inheritance is messy. Kids, don't try this at home.
263 315         1964 bless $other_self, "HTML::TreeBuilder::_hideyhole";
264              
265             # whack it out of the HTML::Parser class, to avoid the destructor
266             }
267              
268             # The root of the tree is special, as it has these funny attributes,
269             # and gets reblessed into this class.
270              
271             # Initialize parser settings
272 315         942 $self->{'_implicit_tags'} = 1;
273 315         671 $self->{'_implicit_body_p_tag'} = 0;
274              
275             # If true, trying to insert text, or any of %isPhraseMarkup right
276             # under 'body' will implicate a 'p'. If false, will just go there.
277              
278 315         655 $self->{'_tighten'} = 1;
279              
280             # whether ignorable WS in this tree should be deleted
281              
282 315         1013 $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag
283              
284 315         742 $self->{'_ignore_unknown'} = 1;
285 315         630 $self->{'_ignore_text'} = 0;
286 315         614 $self->{'_warn'} = 0;
287 315         723 $self->{'_no_space_compacting'} = 0;
288 315         726 $self->{'_self_closed_tags'} = 0;
289 315         769 $self->{'_store_comments'} = 0;
290 315         568 $self->{'_store_declarations'} = 1;
291 315         1007 $self->{'_store_pis'} = 0;
292 315         1559 $self->{'_p_strict'} = 0;
293 315         701 $HTML::Element::encoded_content = $self->{'_no_expand_entities'} = 0;
294              
295             # rebless to our class
296 315         1078 bless $self, $class;
297              
298             # Parse attributes passed in as arguments
299 315 100       1128 if (@_) {
300 17 50       126 Carp::croak("new must be passed key => value pairs") if @_ % 2;
301              
302 17         259 my %attr = @_;
303 17         311 my $allowed = $self->_is_attr;
304              
305 17         255 while (my ($attr, $value) = each %attr ) {
306 28 50       291 if ($allowed->{$attr}) {
307 28         1415 $self->$attr($value);
308             } else {
309 0         0 Carp::carp("Ignoring unknown attribute $attr");
310             }
311             } # end while each $attr
312             } # end if attributes passed to new
313              
314 315         737 $self->{'_element_count'} = 1;
315              
316             # undocumented, informal, and maybe not exactly correct
317              
318 315         2072 $self->{'_head'} = $self->insert_element( 'head', 1 );
319 315         621 $self->{'_pos'} = undef; # pull it back up
320 315         1031 $self->{'_body'} = $self->insert_element( 'body', 1 );
321 315         611 $self->{'_pos'} = undef; # pull it back up again
322              
323 315         965 return $self;
324             }
325              
326             #==========================================================================
327              
328             sub _elem # universal accessor...
329             {
330 36     36   334 my ( $self, $elem, $val ) = @_;
331 36         100 my $old = $self->{$elem};
332 36 50       1068 $self->{$elem} = $val if defined $val;
333 36         329 return $old;
334             }
335              
336             BEGIN {
337 20     20   200 my @attributes = qw(
338             implicit_tags
339             implicit_body_p_tag
340             p_strict
341             no_space_compacting
342             ignore_unknown
343             ignore_text
344             self_closed_tags
345             store_comments
346             store_declarations
347             store_pis
348             warn
349             );
350              
351             # Create accessor methods:
352 20         59 my $code = join('', map { "sub $_ { shift->_elem( '_$_', \@_ ); }\n" }
  220         669  
353             @attributes);
354 20         81 my $err;
355             {
356 20         45 local $@;
  20         40  
357 20 50 0 2 1 7540 $err = $@ || "UNKNOWN ERROR" unless eval "$code 1"; ## no critic
  2     2 1 10  
  2     4 1 10  
  4     11 1 161  
  11     5 1 360  
  5     0 1 192  
  0     1 1 0  
  1     1 1 6  
  1     1 1 64  
  1     0 1 696  
  0     2 1 0  
  2         7  
358             }
359 20 50       83 die "$code$err" if $err;
360              
361             # Record names of class attributes:
362 20         45 my %is_attr = map { $_ => 1 } (@attributes, qw(
  280         160956  
363             encoding
364             ignore_ignorable_whitespace
365             no_expand_entities
366             ));
367              
368 17     17   52 sub _is_attr { return \%is_attr }
369             }
370              
371             # Custom accessors:
372             sub ignore_ignorable_whitespace {
373 5     5 1 16 shift->_elem( '_tighten', @_ ); # internal name is different
374             }
375              
376             sub no_expand_entities {
377 2     2 1 107 my $self = shift;
378 2         11 my $return = $self->_elem( '_no_expand_entities', @_ );
379 2         7 $HTML::Element::encoded_content = $self->{_no_expand_entities};
380 2         15 $return;
381             }
382              
383             #==========================================================================
384              
385             sub warning {
386 3     3 1 6 my $self = shift;
387 3 50       14 CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'};
388              
389             # should maybe say HTML::TreeBuilder instead
390             }
391              
392             #==========================================================================
393              
394             {
395              
396             # To avoid having to rebuild these lists constantly...
397             my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)];
398             my $indent;
399              
400             sub start {
401 1228 50   1228 1 5461 return if $_[0]{'_stunted'};
402              
403             # Accept a signal from HTML::Parser for start-tags.
404 1228         2412 my ( $self, $tag, $attr ) = @_;
405              
406 1228   100     3493 my $self_closed = ($self->{'_self_closed_tags'} and
407             $_[4] =~ m!/[\n\r\f\t ]*>\z!);
408 1228 100       2618 delete $attr->{'/'} if $self_closed;
409              
410             # Parser passes more, actually:
411             # $self->start($tag, $attr, $attrseq, $origtext)
412             # But we can merrily ignore $attrseq and $origtext.
413              
414 1228 50       2896 if ( $tag eq 'x-html' ) {
415 0         0 print "Ignoring open-x-html tag.\n" if DEBUG;
416              
417             # inserted by some lame code-generators.
418 0         0 return; # bypass tweaking.
419             }
420              
421 1228         9950 $tag =~ s{/$}{}s; # So turns into . Silently forgive.
422              
423 1228 50       4902 unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
424 0         0 DEBUG and print "Start-tag name $tag is no good. Skipping.\n";
425 0         0 return;
426              
427             # This avoids having Element's new() throw an exception.
428             }
429              
430 1228   66     4656 my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'};
431 1228         1792 my $already_inserted;
432              
433             #my($indent);
434 1228         1409 if (DEBUG) {
435              
436             # optimization -- don't figure out indenting unless we're in debug mode
437             my @lineage = $pos->lineage;
438             $indent = ' ' x ( 1 + @lineage );
439             print $indent, "Proposing a new \U$tag\E under ",
440             join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) )
441             || 'Root',
442             ".\n";
443              
444             #} else {
445             # $indent = ' ';
446             }
447              
448             #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2;
449             # $attr = {%$attr};
450              
451 1228         4294 foreach my $k ( keys %$attr ) {
452              
453             # Make sure some stooge doesn't have "".
454             # That happens every few million Web pages.
455 597 50 33     3699 $attr->{ ' ' . $k } = delete $attr->{$k}
456             if length $k and substr( $k, 0, 1 ) eq '_';
457              
458             # Looks bad, but is fine for round-tripping.
459             }
460              
461 1228         3259 my $e = $self->element_class->new( $tag, %$attr );
462              
463             # Make a new element object.
464             # (Only rarely do we end up just throwing it away later in this call.)
465              
466             # Some prep -- custom messiness for those damned tables, and strict P's.
467 1228 100       3653 if ( $self->{'_implicit_tags'} ) { # wallawallawalla!
468              
469 1190 100       3759 unless ( $HTML::TreeBuilder::isTableElement{$tag} ) {
470 1012 50       3358 if ( $ptag eq 'table' ) {
    50          
471 0         0 print $indent,
472             " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n"
473             if DEBUG > 1;
474 0         0 $self->insert_element( 'tr', 1 );
475 0         0 $pos = $self->insert_element( 'td', 1 )
476             ; # yes, needs updating
477             }
478             elsif ( $ptag eq 'tr' ) {
479 0         0 print $indent,
480             " * Phrasal \U$tag\E right under TR makes an implicit TD\n"
481             if DEBUG > 1;
482 0         0 $pos = $self->insert_element( 'td', 1 )
483             ; # yes, needs updating
484             }
485 1012         2103 $ptag = $pos->{'_tag'}; # yes, needs updating
486             }
487              
488             # end of table-implication block.
489              
490             # Now maybe do a little dance to enforce P-strictness.
491             # This seems like it should be integrated with the big
492             # "ALL HOPE..." block, further below, but that doesn't
493             # seem feasible.
494 1190 0 33     3877 if ( $self->{'_p_strict'}
      33        
495             and $HTML::TreeBuilder::isKnown{$tag}
496             and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} )
497             {
498 0         0 my $here = $pos;
499 0         0 my $here_tag = $ptag;
500 0         0 while (1) {
501 0 0       0 if ( $here_tag eq 'p' ) {
502 0         0 print $indent, " * Inserting $tag closes strict P.\n"
503             if DEBUG > 1;
504 0         0 $self->end( \q{p} );
505              
506             # NB: same as \'q', but less confusing to emacs cperl-mode
507 0         0 last;
508             }
509              
510             #print("Lasting from $here_tag\n"),
511             last
512             if $HTML::TreeBuilder::isKnown{$here_tag}
513             and
514             not $HTML::Tagset::is_Possible_Strict_P_Content{
515 0 0 0     0 $here_tag};
516              
517             # Don't keep looking up the tree if we see something that can't
518             # be strict-P content.
519              
520 0   0     0 $here_tag
521             = ( $here = $here->{'_parent'} || last )->{'_tag'};
522             } # end while
523 0   0     0 $ptag = ( $pos = $self->{'_pos'} || $self )
524             ->{'_tag'}; # better update!
525             }
526              
527             # end of strict-p block.
528             }
529              
530             # And now, get busy...
531             #----------------------------------------------------------------------
532 1228 100       7649 if ( !$self->{'_implicit_tags'} ) { # bimskalabim
    100          
    100          
    100          
    100          
    100          
    50          
    0          
    0          
    0          
533             # do nothing
534 38         53 print $indent, " * _implicit_tags is off. doing nothing\n"
535             if DEBUG > 1;
536              
537             #----------------------------------------------------------------------
538             }
539             elsif ( $HTML::TreeBuilder::isHeadOrBodyElement{$tag} ) {
540 2 100       10 if ( $pos->is_inside('body') ) { # all is well
    50          
541 1         3 print $indent,
542             " * ambilocal element \U$tag\E is fine under BODY.\n"
543             if DEBUG > 1;
544             }
545             elsif ( $pos->is_inside('head') ) {
546 1         3 print $indent,
547             " * ambilocal element \U$tag\E is fine under HEAD.\n"
548             if DEBUG > 1;
549             }
550             else {
551              
552             # In neither head nor body! mmmmm... put under head?
553              
554 0 0       0 if ( $ptag eq 'html' ) { # expected case
555             # TODO?? : would there ever be a case where _head would be
556             # absent from a tree that would ever be accessed at this
557             # point?
558 0 0       0 die "Where'd my head go?" unless ref $self->{'_head'};
559 0 0       0 if ( $self->{'_head'}{'_implicit'} ) {
560 0         0 print $indent,
561             " * ambilocal element \U$tag\E makes an implicit HEAD.\n"
562             if DEBUG > 1;
563              
564             # or rather, points us at it.
565 0         0 $self->{'_pos'}
566             = $self->{'_head'}; # to insert under...
567             }
568             else {
569 0         0 $self->warning(
570             "Ambilocal element <$tag> not under HEAD or BODY!?"
571             );
572              
573             # Put it under HEAD by default, I guess
574 0         0 $self->{'_pos'}
575             = $self->{'_head'}; # to insert under...
576             }
577              
578             }
579             else {
580              
581             # Neither under head nor body, nor right under html... pass thru?
582 0         0 $self->warning(
583             "Ambilocal element <$tag> neither under head nor body, nor right under html!?"
584             );
585             }
586             }
587              
588             #----------------------------------------------------------------------
589             }
590             elsif ( $HTML::TreeBuilder::isBodyElement{$tag} ) {
591              
592             # Ensure that we are within
593 919 100 66     5052 if ( $ptag eq 'body' ) {
    100          
    100          
    50          
594              
595             # We're good.
596             }
597             elsif (
598             $HTML::TreeBuilder::isBodyElement{$ptag} # glarg
599             and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag}
600             )
601             {
602              
603             # Special case: Save ourselves a call to is_inside further down.
604             # If our $ptag is an isBodyElement element (but not an
605             # isHeadOrBodyElement element), then we must be under body!
606 258         324 print $indent, " * Inferring that $ptag is under BODY.\n",
607             if DEBUG > 3;
608              
609             # I think this and the test for 'body' trap everything
610             # bodyworthy, except the case where the parent element is
611             # under an unknown element that's a descendant of body.
612             }
613             elsif ( $pos->is_inside('head') ) {
614 169         247 print $indent,
615             " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n"
616             if DEBUG > 1;
617 169   50     918 $ptag = (
618             $pos = $self->{'_pos'}
619             = $self->{'_body'} # yes, needs updating
620             || die "Where'd my body go?"
621             )->{'_tag'}; # yes, needs updating
622             }
623             elsif ( !$pos->is_inside('body') ) {
624 80         107 print $indent,
625             " * body-element \U$tag\E makes implicit BODY.\n"
626             if DEBUG > 1;
627 80   50     358 $ptag = (
628             $pos = $self->{'_pos'}
629             = $self->{'_body'} # yes, needs updating
630             || die "Where'd my body go?"
631             )->{'_tag'}; # yes, needs updating
632             }
633              
634             # else we ARE under body, so okay.
635              
636             # Handle implicit endings and insert based on and position
637             # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ...
638 919 100 100     19925 if ( $tag eq 'p'
    100 100        
    100 100        
    50 66        
    100 66        
    100 33        
    100 66        
      66        
      66        
      33        
639             or $tag eq 'h1'
640             or $tag eq 'h2'
641             or $tag eq 'h3'
642             or $tag eq 'h4'
643             or $tag eq 'h5'
644             or $tag eq 'h6'
645             or $tag eq 'form'
646              
647             # Hm, should
really be here?!
648             )
649             {
650              
651             # Can't have

, or inside these

652 307         1143 $self->end(
653             $_Closed_by_structurals,
654             @HTML::TreeBuilder::p_closure_barriers
655              
656             # used to be just li!
657             );
658              
659             }
660             elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) {
661              
662             # Can't have lists inside -- in the unlikely
663             # event anyone tries to put them there!
664 9 50 33     150 if ( $ptag eq 'h1'
      33        
      33        
      33        
      33        
665             or $ptag eq 'h2'
666             or $ptag eq 'h3'
667             or $ptag eq 'h4'
668             or $ptag eq 'h5'
669             or $ptag eq 'h6' )
670             {
671 0         0 $self->end( \$ptag );
672             }
673              
674             # TODO: Maybe keep closing up the tree until
675             # the ptag isn't any of the above?
676             # But anyone that says

    ...

677             # deserves what they get anyway.
678              
679             }
680             elsif ( $tag eq 'li' ) { # list item
681             # Get under a list tag, one way or another
682 12 50 66     58 unless (
683             exists $HTML::TreeBuilder::isList{$ptag}
684             or $self->end( \q{*}, keys %HTML::TreeBuilder::isList ) #'
685             )
686             {
687 0         0 print $indent,
688             " * inserting implicit UL for lack of containing ",
689             join( '|', keys %HTML::TreeBuilder::isList ), ".\n"
690             if DEBUG > 1;
691 0         0 $self->insert_element( 'ul', 1 );
692             }
693              
694             }
695             elsif ( $tag eq 'dt' or $tag eq 'dd' ) {
696              
697             # Get under a DL, one way or another
698 0 0 0     0 unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #'
699 0         0 print $indent,
700             " * inserting implicit DL for lack of containing DL.\n"
701             if DEBUG > 1;
702 0         0 $self->insert_element( 'dl', 1 );
703             }
704              
705             }
706             elsif ( $HTML::TreeBuilder::isFormElement{$tag} ) {
707 1 50 33     6 if ($self->{
708             '_ignore_formies_outside_form'} # TODO: document this
709             and not $pos->is_inside('form')
710             )
711             {
712 0         0 print $indent,
713             " * ignoring \U$tag\E because not in a FORM.\n"
714             if DEBUG > 1;
715 0         0 return; # bypass tweaking.
716             }
717 1 50       4 if ( $tag eq 'option' ) {
718              
719             # return unless $ptag eq 'select';
720 0         0 $self->end( \q{option} );
721 0   0     0 $ptag = ( $self->{'_pos'} || $self )->{'_tag'};
722 0 0 0     0 unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) {
723 0         0 print $indent,
724             " * \U$tag\E makes an implicit SELECT.\n"
725             if DEBUG > 1;
726 0         0 $pos = $self->insert_element( 'select', 1 );
727              
728             # but not a very useful select -- has no 'name' attribute!
729             # is $pos's value used after this?
730             }
731             }
732             }
733             elsif ( $HTML::TreeBuilder::isTableElement{$tag} ) {
734 178 100       663 if ( !$pos->is_inside('table') ) {
735 3         5 print $indent, " * \U$tag\E makes an implicit TABLE\n"
736             if DEBUG > 1;
737 3         12 $self->insert_element( 'table', 1 );
738             }
739              
740 178 100 100     916 if ( $tag eq 'td' or $tag eq 'th' ) {
741              
742             # Get under a tr one way or another
743 111 100 100     412 unless (
744             $ptag eq 'tr' # either under a tr
745             or $self->end( \q{*}, 'tr',
746             'table' ) #or we can get under one
747             )
748             {
749 2         2 print $indent,
750             " * \U$tag\E under \U$ptag\E makes an implicit TR\n"
751             if DEBUG > 1;
752 2         8 $self->insert_element( 'tr', 1 );
753              
754             # presumably pos's value isn't used after this.
755             }
756             }
757             else {
758 67         177 $self->end( \$tag, 'table' ); #'
759             }
760              
761             # Hmm, I guess this is right. To work it out:
762             # tr closes any open tr (limited at a table)
763             # thead closes any open thead (limited at a table)
764             # tbody closes any open tbody (limited at a table)
765             # tfoot closes any open tfoot (limited at a table)
766             # colgroup closes any open colgroup (limited at a table)
767             # col can try, but will always fail, at the enclosing table,
768             # as col is empty, and therefore never open!
769             # But!
770             # td closes any open td OR th (limited at a table)
771             # th closes any open th OR td (limited at a table)
772             # ...implementable as "close to a tr, or make a tr"
773              
774             }
775             elsif ( $HTML::TreeBuilder::isPhraseMarkup{$tag} ) {
776 371 100 100     2292 if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) {
777 1         2 print
778             " * Phrasal \U$tag\E right under BODY makes an implicit P\n"
779             if DEBUG > 1;
780 1         4 $pos = $self->insert_element( 'p', 1 );
781              
782             # is $pos's value used after this?
783             }
784             }
785              
786             # End of implicit endings logic
787              
788             # End of "elsif ($HTML::TreeBuilder::isBodyElement{$tag}"
789             #----------------------------------------------------------------------
790              
791             }
792             elsif ( $HTML::TreeBuilder::isHeadElement{$tag} ) {
793 204 100       772 if ( $pos->is_inside('body') ) {
    100          
794 3         6 print $indent, " * head element \U$tag\E found inside BODY!\n"
795             if DEBUG;
796 3         19 $self->warning("Header element <$tag> in body"); # [sic]
797             }
798             elsif ( !$pos->is_inside('head') ) {
799 167         496 print $indent,
800             " * head element \U$tag\E makes an implicit HEAD.\n"
801             if DEBUG > 1;
802             }
803             else {
804 34         62 print $indent,
805             " * head element \U$tag\E goes inside existing HEAD.\n"
806             if DEBUG > 1;
807             }
808 204   50     958 $self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?";
809              
810             #----------------------------------------------------------------------
811             }
812             elsif ( $tag eq 'html' ) {
813 21 50       100 if ( delete $self->{'_implicit'} ) { # first time here
814 21         28 print $indent, " * good! found the real HTML element!\n"
815             if DEBUG > 1;
816             }
817             else {
818 0         0 print $indent, " * Found a second HTML element\n"
819             if DEBUG;
820 0         0 $self->warning("Found a nested element");
821             }
822              
823             # in either case, migrate attributes to the real element
824 21         83 for ( keys %$attr ) {
825 6         42 $self->attr( $_, $attr->{$_} );
826             }
827 21         58 $self->{'_pos'} = undef;
828 21         217 return $self; # bypass tweaking.
829              
830             #----------------------------------------------------------------------
831             }
832             elsif ( $tag eq 'head' ) {
833 22   50     104 my $head = $self->{'_head'} || die "Where'd my head go?";
834 22 50       79 if ( delete $head->{'_implicit'} ) { # first time here
835 22         41 print $indent, " * good! found the real HEAD element!\n"
836             if DEBUG > 1;
837             }
838             else { # been here before
839 0         0 print $indent, " * Found a second HEAD element\n"
840             if DEBUG;
841 0         0 $self->warning("Found a second element");
842             }
843              
844             # in either case, migrate attributes to the real element
845 22         75 for ( keys %$attr ) {
846 0         0 $head->attr( $_, $attr->{$_} );
847             }
848 22         194 return $self->{'_pos'} = $head; # bypass tweaking.
849              
850             #----------------------------------------------------------------------
851             }
852             elsif ( $tag eq 'body' ) {
853 22   50     97 my $body = $self->{'_body'} || die "Where'd my body go?";
854 22 50       89 if ( delete $body->{'_implicit'} ) { # first time here
855 22         36 print $indent, " * good! found the real BODY element!\n"
856             if DEBUG > 1;
857             }
858             else { # been here before
859 0         0 print $indent, " * Found a second BODY element\n"
860             if DEBUG;
861 0         0 $self->warning("Found a second element");
862             }
863              
864             # in either case, migrate attributes to the real element
865 22         83 for ( keys %$attr ) {
866 0         0 $body->attr( $_, $attr->{$_} );
867             }
868 22         202 return $self->{'_pos'} = $body; # bypass tweaking.
869              
870             #----------------------------------------------------------------------
871             }
872             elsif ( $tag eq 'frameset' ) {
873 0 0 0     0 if (!( $self->{'_frameset_seen'}++ ) # first frameset seen
      0        
874             and !$self->{'_noframes_seen'}
875              
876             # otherwise it'll be under the noframes already
877             and !$self->is_inside('body')
878             )
879             {
880              
881             # The following is a bit of a hack. We don't use the normal
882             # insert_element because 1) we don't want it as _pos, but instead
883             # right under $self, and 2), more importantly, that we don't want
884             # this inserted at the /end/ of $self's content_list, but instead
885             # in the middle of it, specifically right before the body element.
886             #
887 0   0     0 my $c = $self->{'_content'} || die "Contentless root?";
888 0   0     0 my $body = $self->{'_body'} || die "Where'd my BODY go?";
889 0         0 for ( my $i = 0; $i < @$c; ++$i ) {
890 0 0       0 if ( $c->[$i] eq $body ) {
891 0         0 splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e );
892 0         0 HTML::Element::_weaken($e->{'_parent'} = $self);
893 0         0 $already_inserted = 1;
894 0         0 print $indent,
895             " * inserting 'frameset' right before BODY.\n"
896             if DEBUG > 1;
897 0         0 last;
898             }
899             }
900 0 0       0 die "BODY not found in children of root?"
901             unless $already_inserted;
902             }
903              
904             }
905             elsif ( $tag eq 'frame' ) {
906              
907             # Okay, fine, pass thru.
908             # Should probably enforce that these should be under a frameset.
909             # But hey. Ditto for enforcing that 'noframes' should be under
910             # a 'frameset', as the DTDs say.
911              
912             }
913             elsif ( $tag eq 'noframes' ) {
914              
915             # This basically assumes there'll be exactly one 'noframes' element
916             # per document. At least, only the first one gets to have the
917             # body under it. And if there are no noframes elements, then
918             # the body pretty much stays where it is. Is that ever a problem?
919 0 0       0 if ( $self->{'_noframes_seen'}++ ) {
920 0         0 print $indent, " * ANOTHER noframes element?\n" if DEBUG;
921             }
922             else {
923 0 0       0 if ( $pos->is_inside('body') ) {
924 0         0 print $indent, " * 'noframes' inside 'body'. Odd!\n"
925             if DEBUG;
926              
927             # In that odd case, we /can't/ make body a child of 'noframes',
928             # because it's an ancestor of the 'noframes'!
929             }
930             else {
931 0   0     0 $e->push_content( $self->{'_body'}
932             || die "Where'd my body go?" );
933 0         0 print $indent, " * Moving body to be under noframes.\n"
934             if DEBUG;
935             }
936             }
937              
938             #----------------------------------------------------------------------
939             }
940             else {
941              
942             # unknown tag
943 0 0       0 if ( $self->{'_ignore_unknown'} ) {
944 0         0 print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG;
945 0         0 $self->warning("Skipping unknown tag $tag");
946 0         0 return;
947             }
948             else {
949 0         0 print $indent, " * Accepting unknown tag \U$tag\E\n"
950             if DEBUG;
951             }
952             }
953              
954             #----------------------------------------------------------------------
955             # End of mumbo-jumbo
956              
957 1163         1530 print $indent, "(Attaching ", $e->{'_tag'}, " under ",
958             ( $self->{'_pos'} || $self )->{'_tag'}, ")\n"
959              
960             # because if _pos isn't defined, it goes under self
961             if DEBUG;
962              
963             # The following if-clause is to delete /some/ ignorable whitespace
964             # nodes, as we're making the tree.
965             # This'd be a node we'd catch later anyway, but we might as well
966             # nip it in the bud now.
967             # This doesn't catch /all/ deletable WS-nodes, so we do have to call
968             # the tightener later to catch the rest.
969              
970 1163 100 66     6275 if ( $self->{'_tighten'} and !$self->{'_ignore_text'} )
971             { # if tightenable
972 1154         1586 my ( $sibs, $par );
973 1154 100 66     12742 if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} )
      66        
      100        
      100        
      66        
      66        
      66        
974             and @$sibs # parent already has content
975             and !
976             ref( $sibs->[-1] ) # and the last one there is a text node
977             and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace
978              
979             and ( # one of these has to be eligible...
980             $HTML::TreeBuilder::canTighten{$tag}
981             or (( @$sibs == 1 )
982             ? # WS is leftmost -- so parent matters
983             $HTML::TreeBuilder::canTighten{ $par->{'_tag'} }
984             : # WS is after another node -- it matters
985             ( ref $sibs->[-2]
986             and
987             $HTML::TreeBuilder::canTighten{ $sibs->[-2]
988             {'_tag'} }
989             )
990             )
991             )
992              
993             and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' )
994              
995             # we're clear
996             )
997             {
998 131         190 pop @$sibs;
999 131         232 print $indent, "Popping a preceding all-WS node\n" if DEBUG;
1000             }
1001             }
1002              
1003 1163 50       2745 unless ($already_inserted) {
1004 1163 100       2150 if ($self_closed) { $self->pos->push_content($e) }
  8         37  
1005 1155         3688 else { $self->insert_element($e) }
1006             }
1007              
1008 1163         1705 if (DEBUG) {
1009             if ( $self->{'_pos'} ) {
1010             print $indent, "(Current lineage of pos: \U$tag\E under ",
1011             join(
1012             '/',
1013             reverse(
1014              
1015             # $self->{'_pos'}{'_tag'}, # don't list myself!
1016             $self->{'_pos'}->lineage_tag_names
1017             )
1018             ),
1019             ".)\n";
1020             }
1021             else {
1022             print $indent, "(Pos points nowhere!?)\n";
1023             }
1024             }
1025              
1026 1163 100 50     6297 unless ( ( $self->{'_pos'} || '' ) eq $e ) {
1027              
1028             # if it's an empty element -- i.e., if it didn't change the _pos
1029 43 50 33     192 &{ $self->{"_tweak_$tag"}
  43         645  
1030             || $self->{'_tweak_*'}
1031             || return $e }( map $_, $e, $tag, $self )
1032             ; # make a list so the user can't clobber
1033             }
1034              
1035 1120         9738 return $e;
1036             }
1037             }
1038              
1039             #==========================================================================
1040              
1041             {
1042             my $indent;
1043              
1044             sub end {
1045 1634 50   1634 1 5078 return if $_[0]{'_stunted'};
1046              
1047             # Either: Acccept an end-tag signal from HTML::Parser
1048             # Or: Method for closing currently open elements in some fairly complex
1049             # way, as used by other methods in this class.
1050 1634         4840 my ( $self, $tag, @stop ) = @_;
1051 1634 50       4290 if ( $tag eq 'x-html' ) {
1052 0         0 print "Ignoring close-x-html tag.\n" if DEBUG;
1053              
1054             # inserted by some lame code-generators.
1055 0         0 return;
1056             }
1057              
1058 1634 50 66     9009 unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
1059 0         0 DEBUG and print "End-tag name $tag is no good. Skipping.\n";
1060 0         0 return;
1061              
1062             # This avoids having Element's new() throw an exception.
1063             }
1064              
1065             # This method accepts two calling formats:
1066             # 1) from Parser: $self->end('tag_name', 'origtext')
1067             # in which case we shouldn't mistake origtext as a blocker tag
1068             # 2) from myself: $self->end(\q{tagname1}, 'blk1', ... )
1069             # from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... )
1070              
1071             # End the specified tag, but don't move above any of the blocker tags.
1072             # The tag can also be a reference to an array. Terminate the first
1073             # tag found.
1074              
1075 1634   66     6163 my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'};
1076              
1077             # $p and $ptag are sort-of scratch
1078              
1079 1634 100       3174 if ( ref($tag) ) {
1080              
1081             # First param is a ref of one sort or another --
1082             # THE CALL IS COMING FROM INSIDE THE HOUSE!
1083 415 100       2357 $tag = $$tag if ref($tag) eq 'SCALAR';
1084              
1085             # otherwise it's an arrayref.
1086             }
1087             else {
1088              
1089             # the call came from Parser -- just ignore origtext
1090             # except in a table ignore unmatched table tags RT #59980
1091 1219 100       4054 @stop = $tag =~ /^t[hdr]\z/ ? 'table' : ();
1092             }
1093              
1094             #my($indent);
1095 1634         10299 if (DEBUG) {
1096              
1097             # optimization -- don't figure out depth unless we're in debug mode
1098             my @lineage_tags = $p->lineage_tag_names;
1099             $indent = ' ' x ( 1 + @lineage_tags );
1100              
1101             # now announce ourselves
1102             print $indent, "Ending ",
1103             ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E",
1104             scalar(@stop)
1105             ? ( " no higher than [", join( ' ', @stop ), "]" )
1106             : (), ".\n";
1107              
1108             print $indent, " (Current lineage: ", join( '/', @lineage_tags ),
1109             ".)\n"
1110             if DEBUG > 1;
1111              
1112             if ( DEBUG > 3 ) {
1113              
1114             #my(
1115             # $package, $filename, $line, $subroutine,
1116             # $hasargs, $wantarray, $evaltext, $is_require) = caller;
1117             print $indent,
1118             " (Called from ", ( caller(1) )[3], ' line ',
1119             ( caller(1) )[2],
1120             ")\n";
1121             }
1122              
1123             #} else {
1124             # $indent = ' ';
1125             }
1126              
1127             # End of if DEBUG
1128              
1129             # Now actually do it
1130 1634         2331 my @to_close;
1131 1634 100       4517 if ( $tag eq '*' ) {
    100          
1132              
1133             # Special -- close everything up to (but not including) the first
1134             # limiting tag, or return if none found. Somewhat of a special case.
1135             PARENT:
1136 41         110 while ( defined $p ) {
1137 80         116 $ptag = $p->{'_tag'};
1138 80         87 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1139 80         159 for (@stop) {
1140 125 100       312 if ( $ptag eq $_ ) {
1141 41         83 print $indent,
1142             " (Hit a $_; closing everything up to here.)\n"
1143             if DEBUG > 2;
1144 41         82 last PARENT;
1145             }
1146             }
1147 39         66 push @to_close, $p;
1148 39         62 $p = $p->{'_parent'}; # no match so far? keep moving up
1149 39         75 print $indent,
1150             " (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n"
1151             if DEBUG > 1;
1152             }
1153 41 50       101 unless ( defined $p ) { # We never found what we were looking for.
1154 0         0 print $indent, " (We never found a limit.)\n" if DEBUG > 1;
1155 0         0 return;
1156             }
1157              
1158             #print
1159             # $indent,
1160             # " (To close: ", join('/', map $_->tag, @to_close), ".)\n"
1161             # if DEBUG > 4;
1162              
1163             # Otherwise update pos and fall thru.
1164 41         78 $self->{'_pos'} = $p;
1165             }
1166             elsif ( ref $tag ) {
1167              
1168             # Close the first of any of the matching tags, giving up if you hit
1169             # any of the stop-tags.
1170             PARENT:
1171 307         812 while ( defined $p ) {
1172 584         1008 $ptag = $p->{'_tag'};
1173 584         722 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1174 584         1330 for (@$tag) {
1175 5014 100       11764 if ( $ptag eq $_ ) {
1176 31         40 print $indent, " (Closing $_.)\n" if DEBUG > 2;
1177 31         76 last PARENT;
1178             }
1179             }
1180 553         1152 for (@stop) {
1181 8295 50       19001 if ( $ptag eq $_ ) {
1182 0         0 print $indent,
1183             " (Hit a limiting $_ -- bailing out.)\n"
1184             if DEBUG > 1;
1185 0         0 return; # so it was all for naught
1186             }
1187             }
1188 553         989 push @to_close, $p;
1189 553         1696 $p = $p->{'_parent'};
1190             }
1191 307 100       1522 return unless defined $p; # We went off the top of the tree.
1192             # Otherwise specified element was found; set pos to its parent.
1193 31         60 push @to_close, $p;
1194 31         115 $self->{'_pos'} = $p->{'_parent'};
1195             }
1196             else {
1197              
1198             # Close the first of the specified tag, giving up if you hit
1199             # any of the stop-tags.
1200 1286         2852 while ( defined $p ) {
1201 1708         3019 $ptag = $p->{'_tag'};
1202 1708         1712 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1203 1708 100       3875 if ( $ptag eq $tag ) {
1204 1252         1511 print $indent, " (Closing $tag.)\n" if DEBUG > 2;
1205 1252         1956 last;
1206             }
1207 456         980 for (@stop) {
1208 67 100       204 if ( $ptag eq $_ ) {
1209 32         51 print $indent,
1210             " (Hit a limiting $_ -- bailing out.)\n"
1211             if DEBUG > 1;
1212 32         115 return; # so it was all for naught
1213             }
1214             }
1215 424         946 push @to_close, $p;
1216 424         1176 $p = $p->{'_parent'};
1217             }
1218 1254 100       3135 return unless defined $p; # We went off the top of the tree.
1219             # Otherwise specified element was found; set pos to its parent.
1220 1252         1975 push @to_close, $p;
1221 1252         2758 $self->{'_pos'} = $p->{'_parent'};
1222             }
1223              
1224 1324 100 100     7131 $self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' );
1225 1324         1427 print $indent, "(Pos now points to ",
1226             $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n"
1227             if DEBUG > 1;
1228              
1229             ### EXPENSIVE, because has to check that it's not under a pre
1230             ### or a CDATA-parent. That's one more method call per end()!
1231             ### Might as well just do this at the end of the tree-parse, I guess,
1232             ### at which point we'd be parsing top-down, and just not traversing
1233             ### under pre's or CDATA-parents.
1234             ##
1235             ## Take this opportunity to nix any terminal whitespace nodes.
1236             ## TODO: consider whether this (plus the logic in start(), above)
1237             ## would ever leave any WS nodes in the tree.
1238             ## If not, then there's no reason to have eof() call
1239             ## delete_ignorable_whitespace on the tree, is there?
1240             ##
1241             #if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and
1242             # ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent)
1243             #) { # if tightenable
1244             # my($children, $e_tag);
1245             # foreach my $e (reverse @to_close) { # going top-down
1246             # last if 'pre' eq ($e_tag = $e->{'_tag'}) or
1247             # $HTML::Tagset::isCDATA_Parent{$e_tag};
1248             #
1249             # if(
1250             # $children = $e->{'_content'}
1251             # and @$children # has children
1252             # and !ref($children->[-1])
1253             # and $children->[-1] =~ m<^\s+$>s # last node is all-WS
1254             # and
1255             # (
1256             # # has a tightenable parent:
1257             # $HTML::TreeBuilder::canTighten{ $e_tag }
1258             # or
1259             # ( # has a tightenable left sibling:
1260             # @$children > 1 and
1261             # ref($children->[-2])
1262             # and $HTML::TreeBuilder::canTighten{ $children->[-2]{'_tag'} }
1263             # )
1264             # )
1265             # ) {
1266             # pop @$children;
1267             # #print $indent, "Popping a terminal WS node from ", $e->{'_tag'},
1268             # # " (", $e->address, ") while exiting.\n" if DEBUG;
1269             # }
1270             # }
1271             #}
1272              
1273 1324         2385 foreach my $e (@to_close) {
1274              
1275             # Call the applicable callback, if any
1276 1741         3043 $ptag = $e->{'_tag'};
1277 1741 50 33     5718 &{ $self->{"_tweak_$ptag"}
  1741         13194  
1278             || $self->{'_tweak_*'}
1279             || next }( map $_, $e, $ptag, $self );
1280 0         0 print $indent, "Back from tweaking.\n" if DEBUG;
1281             last
1282 0 0       0 if $self->{ '_stunted'
1283             }; # in case one of the handlers called stunt
1284             }
1285 1324         8741 return @to_close;
1286             }
1287             }
1288              
1289             #==========================================================================
1290             {
1291             my ( $indent, $nugget );
1292              
1293             sub text {
1294 2523 50   2523 1 97611 return if $_[0]{'_stunted'};
1295              
1296             # Accept a "here's a text token" signal from HTML::Parser.
1297 2523         5005 my ( $self, $text, $is_cdata ) = @_;
1298              
1299             # the >3.0 versions of Parser may pass a cdata node.
1300             # Thanks to Gisle Aas for pointing this out.
1301              
1302 2523 50       6268 return unless length $text; # I guess that's always right
1303              
1304 2523         4268 my $ignore_text = $self->{'_ignore_text'};
1305 2523         11605 my $no_space_compacting = $self->{'_no_space_compacting'};
1306 2523         3700 my $no_expand_entities = $self->{'_no_expand_entities'};
1307 2523   66     7788 my $pos = $self->{'_pos'} || $self;
1308              
1309 2523 100 33     30413 HTML::Entities::decode($text)
      33        
      66        
1310             unless $ignore_text
1311             || $is_cdata
1312             || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }
1313             || $no_expand_entities;
1314              
1315             #my($indent, $nugget);
1316 2523         2893 if (DEBUG) {
1317              
1318             # optimization -- don't figure out depth unless we're in debug mode
1319             my @lineage_tags = $pos->lineage_tag_names;
1320             $indent = ' ' x ( 1 + @lineage_tags );
1321              
1322             $nugget
1323             = ( length($text) <= 25 )
1324             ? $text
1325             : ( substr( $text, 0, 25 ) . '...' );
1326             $nugget =~ s<([\x00-\x1F])>
1327             <'\\x'.(unpack("H2",$1))>eg;
1328             print $indent, "Proposing a new text node ($nugget) under ",
1329             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) )
1330             || 'Root',
1331             ".\n";
1332              
1333             #} else {
1334             # $indent = ' ';
1335             }
1336              
1337 2523         3330 my $ptag;
1338 2523 50 33     14666 if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} }
1339              
1340             #or $pos->is_inside('pre')
1341             or $pos->is_inside( 'pre', 'textarea' )
1342             )
1343             {
1344 0 0       0 return if $ignore_text;
1345 0         0 $pos->push_content($text);
1346             }
1347             else {
1348              
1349             # return unless $text =~ /\S/; # This is sometimes wrong
1350              
1351 2523 100 100     43623 if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) {
    50 33        
    100          
    100          
    100          
    100          
1352              
1353             # don't change anything
1354             }
1355             elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) {
1356 0 0       0 if ( $self->{'_implicit_body_p_tag'} ) {
1357 0         0 print $indent,
1358             " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n"
1359             if DEBUG > 1;
1360 0         0 $self->end( \$ptag );
1361 0 0       0 $pos = $self->{'_body'}
1362             ? ( $self->{'_pos'}
1363             = $self->{'_body'} ) # expected case
1364             : $self->insert_element( 'body', 1 );
1365 0         0 $pos = $self->insert_element( 'p', 1 );
1366             }
1367             else {
1368 0         0 print $indent,
1369             " * Text node under \U$ptag\E closes, implicates BODY.\n"
1370             if DEBUG > 1;
1371 0         0 $self->end( \$ptag );
1372 0 0       0 $pos = $self->{'_body'}
1373             ? ( $self->{'_pos'}
1374             = $self->{'_body'} ) # expected case
1375             : $self->insert_element( 'body', 1 );
1376             }
1377             }
1378             elsif ( $ptag eq 'html' ) {
1379 22 100       62 if ( $self->{'_implicit_body_p_tag'} ) {
1380 1         2 print $indent,
1381             " * Text node under HTML implicates BODY and P.\n"
1382             if DEBUG > 1;
1383 1 50       5 $pos = $self->{'_body'}
1384             ? ( $self->{'_pos'}
1385             = $self->{'_body'} ) # expected case
1386             : $self->insert_element( 'body', 1 );
1387 1         4 $pos = $self->insert_element( 'p', 1 );
1388             }
1389             else {
1390 21         30 print $indent,
1391             " * Text node under HTML implicates BODY.\n"
1392             if DEBUG > 1;
1393 21 50       94 $pos = $self->{'_body'}
1394             ? ( $self->{'_pos'}
1395             = $self->{'_body'} ) # expected case
1396             : $self->insert_element( 'body', 1 );
1397              
1398             #print "POS is $pos, ", $pos->{'_tag'}, "\n";
1399             }
1400             }
1401             elsif ( $ptag eq 'body' ) {
1402 376 50       1226 if ( $self->{'_implicit_body_p_tag'} ) {
1403 0         0 print $indent, " * Text node under BODY implicates P.\n"
1404             if DEBUG > 1;
1405 0         0 $pos = $self->insert_element( 'p', 1 );
1406             }
1407             }
1408             elsif ( $ptag eq 'table' ) {
1409 8         11 print $indent,
1410             " * Text node under TABLE implicates TR and TD.\n"
1411             if DEBUG > 1;
1412 8         30 $self->insert_element( 'tr', 1 );
1413 8         25 $pos = $self->insert_element( 'td', 1 );
1414              
1415             # double whammy!
1416             }
1417             elsif ( $ptag eq 'tr' ) {
1418 3         5 print $indent, " * Text node under TR implicates TD.\n"
1419             if DEBUG > 1;
1420 3         11 $pos = $self->insert_element( 'td', 1 );
1421             }
1422              
1423             # elsif (
1424             # # $ptag eq 'li' ||
1425             # # $ptag eq 'dd' ||
1426             # $ptag eq 'form') {
1427             # $pos = $self->insert_element('p', 1);
1428             #}
1429              
1430             # Whatever we've done above should have had the side
1431             # effect of updating $self->{'_pos'}
1432              
1433             #print "POS is now $pos, ", $pos->{'_tag'}, "\n";
1434              
1435 2523 50       6179 return if $ignore_text;
1436 2523 100       17610 $text =~ s/[\n\r\f\t ]+/ /g # canonical space
1437             unless $no_space_compacting;
1438              
1439 2523         3085 print $indent, " (Attaching text node ($nugget) under ",
1440              
1441             # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'},
1442             $pos->{'_tag'}, ").\n"
1443             if DEBUG > 1;
1444              
1445 2523         7628 $pos->push_content($text);
1446             }
1447              
1448 2523 50       5298 &{ $self->{'_tweak_~text'} || return }( $text, $pos,
  2523         40098  
1449             $pos->{'_tag'} . '' );
1450              
1451             # Note that this is very exceptional -- it doesn't fall back to
1452             # _tweak_*, and it gives its tweak different arguments.
1453 0         0 return;
1454             }
1455             }
1456              
1457             #==========================================================================
1458              
1459             # TODO: test whether comment(), declaration(), and process(), do the right
1460             # thing as far as tightening and whatnot.
1461             # Also, currently, doctypes and comments that appear before head or body
1462             # show up in the tree in the wrong place. Something should be done about
1463             # this. Tricky. Maybe this whole business of pre-making the body and
1464             # whatnot is wrong.
1465              
1466             sub comment {
1467 195 50   195 1 2271 return if $_[0]{'_stunted'};
1468              
1469             # Accept a "here's a comment" signal from HTML::Parser.
1470              
1471 195         371 my ( $self, $text ) = @_;
1472 195   66     630 my $pos = $self->{'_pos'} || $self;
1473             return
1474 195 100 66     1562 unless $self->{'_store_comments'}
1475             || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} };
1476              
1477 1         2 if (DEBUG) {
1478             my @lineage_tags = $pos->lineage_tag_names;
1479             my $indent = ' ' x ( 1 + @lineage_tags );
1480              
1481             my $nugget
1482             = ( length($text) <= 25 )
1483             ? $text
1484             : ( substr( $text, 0, 25 ) . '...' );
1485             $nugget =~ s<([\x00-\x1F])>
1486             <'\\x'.(unpack("H2",$1))>eg;
1487             print $indent, "Proposing a Comment ($nugget) under ",
1488             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1489             ".\n";
1490             }
1491              
1492 1         5 ( my $e = $self->element_class->new('~comment') )->{'text'} = $text;
1493 1         5 $pos->push_content($e);
1494 1         3 ++( $self->{'_element_count'} );
1495              
1496 1 50 33     5 &{ $self->{'_tweak_~comment'}
  1         12  
1497             || $self->{'_tweak_*'}
1498             || return $e }( map $_, $e, '~comment', $self );
1499              
1500 0         0 return $e;
1501             }
1502              
1503             sub declaration {
1504 17 50   17 1 1987 return if $_[0]{'_stunted'};
1505              
1506             # Accept a "here's a markup declaration" signal from HTML::Parser.
1507              
1508 17         92 my ( $self, $text ) = @_;
1509 17   33     88 my $pos = $self->{'_pos'} || $self;
1510              
1511 17         29 if (DEBUG) {
1512             my @lineage_tags = $pos->lineage_tag_names;
1513             my $indent = ' ' x ( 1 + @lineage_tags );
1514              
1515             my $nugget
1516             = ( length($text) <= 25 )
1517             ? $text
1518             : ( substr( $text, 0, 25 ) . '...' );
1519             $nugget =~ s<([\x00-\x1F])>
1520             <'\\x'.(unpack("H2",$1))>eg;
1521             print $indent, "Proposing a Declaration ($nugget) under ",
1522             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1523             ".\n";
1524             }
1525 17         47 ( my $e = $self->element_class->new('~declaration') )->{'text'} = $text;
1526              
1527 17         47 $self->{_decl} = $e;
1528 17         134 return $e;
1529             }
1530              
1531             #==========================================================================
1532              
1533             sub process {
1534 2 50   2 1 177 return if $_[0]{'_stunted'};
1535              
1536             # Accept a "here's a PI" signal from HTML::Parser.
1537              
1538 2 50       25 return unless $_[0]->{'_store_pis'};
1539 0         0 my ( $self, $text ) = @_;
1540 0   0     0 my $pos = $self->{'_pos'} || $self;
1541              
1542 0         0 if (DEBUG) {
1543             my @lineage_tags = $pos->lineage_tag_names;
1544             my $indent = ' ' x ( 1 + @lineage_tags );
1545              
1546             my $nugget
1547             = ( length($text) <= 25 )
1548             ? $text
1549             : ( substr( $text, 0, 25 ) . '...' );
1550             $nugget =~ s<([\x00-\x1F])>
1551             <'\\x'.(unpack("H2",$1))>eg;
1552             print $indent, "Proposing a PI ($nugget) under ",
1553             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1554             ".\n";
1555             }
1556 0         0 ( my $e = $self->element_class->new('~pi') )->{'text'} = $text;
1557 0         0 $pos->push_content($e);
1558 0         0 ++( $self->{'_element_count'} );
1559              
1560 0 0 0     0 &{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_,
  0         0  
1561             $e, '~pi', $self );
1562              
1563 0         0 return $e;
1564             }
1565              
1566             #==========================================================================
1567              
1568             #When you call $tree->parse_file($filename), and the
1569             #tree's ignore_ignorable_whitespace attribute is on (as it is
1570             #by default), HTML::TreeBuilder's logic will manage to avoid
1571             #creating some, but not all, nodes that represent ignorable
1572             #whitespace. However, at the end of its parse, it traverses the
1573             #tree and deletes any that it missed. (It does this with an
1574             #around-method around HTML::Parser's eof method.)
1575             #
1576             #However, with $tree->parse($content), the cleanup-traversal step
1577             #doesn't happen automatically -- so when you're done parsing all
1578             #content for a document (regardless of whether $content is the only
1579             #bit, or whether it's just another chunk of content you're parsing into
1580             #the tree), call $tree->eof() to signal that you're at the end of the
1581             #text you're inputting to the tree. Besides properly cleaning any bits
1582             #of ignorable whitespace from the tree, this will also ensure that
1583             #HTML::Parser's internal buffer is flushed.
1584              
1585             sub eof {
1586              
1587             # Accept an "end-of-file" signal from HTML::Parser, or thrown by the user.
1588              
1589 309 50   309 1 2247 return if $_[0]->{'_done'}; # we've already been here
1590              
1591 309 50       928 return $_[0]->SUPER::eof() if $_[0]->{'_stunted'};
1592              
1593 309         534 my $x = $_[0];
1594 309         329 print "EOF received.\n" if DEBUG;
1595 309         540 my (@rv);
1596 309 50       666 if (wantarray) {
1597              
1598             # I don't think this makes any difference for this particular
1599             # method, but let's be scrupulous, for once.
1600 0         0 @rv = $x->SUPER::eof();
1601             }
1602             else {
1603 309         2086 $rv[0] = $x->SUPER::eof();
1604             }
1605              
1606 309 100 66     16228 $x->end('html') unless $x eq ( $x->{'_pos'} || $x );
1607              
1608             # That SHOULD close everything, and will run the appropriate tweaks.
1609             # We /could/ be running under some insane mode such that there's more
1610             # than one HTML element, but really, that's just insane to do anyhow.
1611              
1612 309 100       5855 unless ( $x->{'_implicit_tags'} ) {
1613              
1614             # delete those silly implicit head and body in case we put
1615             # them there in implicit tags mode
1616 9         30 foreach my $node ( $x->{'_head'}, $x->{'_body'} ) {
1617 18 50 33     340 $node->replace_with_content
      33        
      33        
1618             if defined $node
1619             and ref $node
1620             and $node->{'_implicit'}
1621             and $node->{'_parent'};
1622              
1623             # I think they should be empty anyhow, since the only
1624             # logic that'd insert under them can apply only, I think,
1625             # in the case where _implicit_tags is on
1626             }
1627              
1628             # this may still leave an implicit 'html' at the top, but there's
1629             # nothing we can do about that, is there?
1630             }
1631              
1632             $x->delete_ignorable_whitespace()
1633              
1634             # this's why we trap this -- an after-method
1635 309 100 66     3651 if $x->{'_tighten'} and !$x->{'_ignore_text'};
1636 309         752 $x->{'_done'} = 1;
1637              
1638 309 50       829 return @rv if wantarray;
1639 309         1722 return $rv[0];
1640             }
1641              
1642             #==========================================================================
1643              
1644             # TODO: document
1645              
1646             sub stunt {
1647 0     0 1 0 my $self = $_[0];
1648 0         0 print "Stunting the tree.\n" if DEBUG;
1649 0         0 $self->{'_done'} = 1;
1650              
1651 0 0       0 if ( $HTML::Parser::VERSION < 3 ) {
1652              
1653             #This is a MEAN MEAN HACK. And it works most of the time!
1654 0         0 $self->{'_buf'} = '';
1655 0         0 my $fh = *HTML::Parser::F{IO};
1656              
1657             # the local'd FH used by parse_file loop
1658 0 0       0 if ( defined $fh ) {
1659 0         0 print "Closing Parser's filehandle $fh\n" if DEBUG;
1660 0         0 close($fh);
1661             }
1662              
1663             # But if they called $tree->parse_file($filehandle)
1664             # or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO}
1665             # to close. Ahwell. Not a problem for most users these days.
1666              
1667             }
1668             else {
1669 0         0 $self->SUPER::eof();
1670              
1671             # Under 3+ versions, calling eof from inside a parse will abort the
1672             # parse / parse_file
1673             }
1674              
1675             # In the off chance that the above didn't work, we'll throw
1676             # this flag to make any future events be no-ops.
1677 0         0 $self->stunted(1);
1678 0         0 return;
1679             }
1680              
1681             # TODO: document
1682 0     0 1 0 sub stunted { shift->_elem( '_stunted', @_ ); }
1683 0     0 1 0 sub done { shift->_elem( '_done', @_ ); }
1684              
1685             #==========================================================================
1686              
1687             sub delete {
1688              
1689             # Override Element's delete method.
1690             # This does most, if not all, of what Element's delete does anyway.
1691             # Deletes content, including content in some special attributes.
1692             # But doesn't empty out the hash.
1693              
1694 250     250 1 134382 $_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct
1695              
1696 250         482 delete @{ $_[0] }{ '_body', '_head', '_pos' };
  250         1039  
1697 250         461 for (
1698 250 50       1287 @{ delete( $_[0]->{'_content'} ) || [] }, # all/any content
1699              
1700             # delete @{$_[0]}{'_body', '_head', '_pos'}
1701             # ...and these, in case these elements don't appear in the
1702             # content, which is possible. If they did appear (as they
1703             # usually do), then calling $_->delete on them again is harmless.
1704             # I don't think that's such a hot idea now. Thru creative reattachment,
1705             # those could actually now point to elements in OTHER trees (which we do
1706             # NOT want to delete!).
1707             ## Reasoned out:
1708             # If these point to elements not in the content list of any element in this
1709             # tree, but not in the content list of any element in any OTHER tree, then
1710             # just deleting these will make their refcounts hit zero.
1711             # If these point to elements in the content lists of elements in THIS tree,
1712             # then we'll get to deleting them when we delete from the top.
1713             # If these point to elements in the content lists of elements in SOME OTHER
1714             # tree, then they're not to be deleted.
1715             )
1716             {
1717 500 50 33     5793 $_->delete
      33        
1718             if defined $_ and ref $_ # Make sure it's an object.
1719             and $_ ne $_[0]; # And avoid hitting myself, just in case!
1720             }
1721              
1722 250 50 33     1474 $_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'};
1723              
1724             # An 'html' element having a parent is quite unlikely.
1725              
1726 250         8694 return;
1727             }
1728              
1729             sub tighten_up { # legacy
1730 0     0 1 0 shift->delete_ignorable_whitespace(@_);
1731             }
1732              
1733             sub elementify {
1734              
1735             # Rebless this object down into the normal element class.
1736 3     3 1 7 my $self = $_[0];
1737 3         7 my $to_class = $self->element_class;
1738 3         35 delete @{$self}{
1739             grep {
1740 3         15 ;
1741 69 100 33     1107 length $_ and substr( $_, 0, 1 ) eq '_'
      66        
      66        
      100        
      100        
      100        
      66        
1742              
1743             # The private attributes that we'll retain:
1744             and $_ ne '_tag'
1745             and $_ ne '_parent'
1746             and $_ ne '_content'
1747             and $_ ne '_implicit'
1748             and $_ ne '_pos'
1749             and $_ ne '_element_class'
1750             and $_ ne '_encoding'
1751             } keys %$self
1752             };
1753 3         17 bless $self, $to_class; # Returns the same object we were fed
1754             }
1755              
1756             sub element_class {
1757 4381 100   4381 1 13185 return 'HTML::Element' if not ref $_[0];
1758 4066   50     35297 return $_[0]->{_element_class} || 'HTML::Element';
1759             }
1760              
1761             #--------------------------------------------------------------------------
1762              
1763             sub guts {
1764 0     0 1   my @out;
1765 0           my @stack = ( $_[0] );
1766 0           my $destructive = $_[1];
1767 0           my $this;
1768 0           while (@stack) {
1769 0           $this = shift @stack;
1770 0 0         if ( !ref $this ) {
    0          
1771 0           push @out, $this; # yes, it can include text nodes
1772             }
1773             elsif ( !$this->{'_implicit'} ) {
1774 0           push @out, $this;
1775 0 0         delete $this->{'_parent'} if $destructive;
1776             }
1777             else {
1778              
1779             # it's an implicit node. Delete it and recurse
1780 0 0         delete $this->{'_parent'} if $destructive;
1781 0 0         unshift @stack,
    0          
1782             @{
1783 0           ( $destructive
1784             ? delete( $this->{'_content'} )
1785             : $this->{'_content'}
1786             )
1787             || []
1788             };
1789             }
1790             }
1791              
1792             # Doesn't call a real $root->delete on the (when implicit) root,
1793             # but I don't think it needs to.
1794              
1795 0 0         return @out if wantarray; # one simple normal case.
1796 0 0         return unless @out;
1797 0 0 0       return $out[0] if @out == 1 and ref( $out[0] );
1798 0           my $x = HTML::Element->new( 'div', '_implicit' => 1 );
1799 0           $x->push_content(@out);
1800 0           return $x;
1801             }
1802              
1803 0     0 1   sub disembowel { $_[0]->guts(1) }
1804              
1805             #--------------------------------------------------------------------------
1806             1;
1807              
1808             __END__