File Coverage

blib/lib/HTML/TreeBuilder.pm
Criterion Covered Total %
statement 323 477 67.7
branch 173 306 56.5
condition 115 243 47.3
subroutine 31 44 70.4
pod 34 34 100.0
total 676 1104 61.2


line stmt bran cond sub pod time code
1             package HTML::TreeBuilder;
2              
3             # ABSTRACT: Parser that builds a HTML syntax tree
4              
5 16     16   659682 use warnings;
  16         34  
  16         508  
6 16     16   74 use strict;
  16         29  
  16         263  
7 16     16   4434 use integer; # vroom vroom!
  16         191  
  16         66  
8 16     16   404 use Carp ();
  16         32  
  16         1879  
9              
10             our $VERSION = '5.07'; # VERSION from OurPkgVersion
11              
12             #---------------------------------------------------------------------------
13             # Make a 'DEBUG' constant...
14              
15             our $DEBUG; # Must be set BEFORE loading this file
16             BEGIN {
17              
18             # We used to have things like
19             # print $indent, "lalala" if $Debug;
20             # But there were an awful lot of having to evaluate $Debug's value.
21             # If we make that depend on a constant, like so:
22             # sub DEBUG () { 1 } # or whatever value.
23             # ...
24             # print $indent, "lalala" if DEBUG;
25             # Which at compile-time (thru the miracle of constant folding) turns into:
26             # print $indent, "lalala";
27             # or, if DEBUG is a constant with a true value, then that print statement
28             # is simply optimized away, and doesn't appear in the target code at all.
29             # If you don't believe me, run:
30             # perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \
31             # $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder'
32             # and see for yourself (substituting whatever value you want for $DEBUG
33             # there).
34             ## no critic
35 16 50   16   173 if ( defined &DEBUG ) {
    50          
    50          
    0          
36              
37             # Already been defined! Do nothing.
38             }
39             elsif ( $] < 5.00404 ) {
40              
41             # Grudgingly accomodate ancient (pre-constant) versions.
42 0         0 eval 'sub DEBUG { $Debug } ';
43             }
44             elsif ( !$DEBUG ) {
45 16         792 eval 'sub DEBUG () {0}'; # Make it a constant.
46             }
47             elsif ( $DEBUG =~ m<^\d+$>s ) {
48 0         0 eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant.
49             }
50             else { # WTF?
51 0         0 warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG";
52 0         0 eval 'sub DEBUG () { $DEBUG }'; # I guess.
53             }
54             ## use critic
55             }
56              
57             #---------------------------------------------------------------------------
58              
59 16     16   4434 use HTML::Entities ();
  16         70682  
  16         564  
60 16     16   4408 use HTML::Tagset 3.02 ();
  16         16051  
  16         421  
61              
62 16     16   9353 use HTML::Element ();
  16         46  
  16         495  
63 16     16   103 use HTML::Parser 3.46 ();
  16         374  
  16         86598  
64             our @ISA = qw(HTML::Element HTML::Parser);
65              
66             # This looks schizoid, I know.
67             # It's not that we ARE an element AND a parser.
68             # We ARE an element, but one that knows how to handle signals
69             # (method calls) from Parser in order to elaborate its subtree.
70              
71             # Legacy aliases:
72             *HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown;
73             *HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten;
74             *HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement;
75             *HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement;
76             *HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup;
77             *HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement;
78             *HTML::TreeBuilder::isList = \%HTML::Tagset::isList;
79             *HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement;
80             *HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement;
81             *HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers;
82              
83             #==========================================================================
84             # Two little shortcut constructors:
85              
86             sub new_from_file { # or from a FH
87 3     3 1 8 my $class = shift;
88 3 50       12 Carp::croak("new_from_file takes only one argument")
89             unless @_ == 1;
90 3 50       10 Carp::croak("new_from_file is a class method only")
91             if ref $class;
92 3         13 my $new = $class->new();
93 3 100       23 defined $new->parse_file( $_[0] )
94             or Carp::croak("unable to parse file: $!");
95 2         7 return $new;
96             }
97              
98             sub new_from_content { # from any number of scalars
99 8     8 1 2005 my $class = shift;
100 8 50       30 Carp::croak("new_from_content is a class method only")
101             if ref $class;
102 8         27 my $new = $class->new();
103 8         21 foreach my $whunk (@_) {
104 9 100       28 if ( ref($whunk) eq 'SCALAR' ) {
105 1         8 $new->parse($$whunk);
106             }
107             else {
108 8         71 $new->parse($whunk);
109             }
110 9 50       60 last if $new->{'_stunted'}; # might as well check that.
111             }
112 8         32 $new->eof();
113 8         25 return $new;
114             }
115              
116             sub new_from_url { # should accept anything that LWP does.
117 4     4 1 14 undef our $lwp_response;
118 4         6 my $class = shift;
119 4 50       17 Carp::croak("new_from_url takes only one argument")
120             unless @_ == 1;
121 4 50       12 Carp::croak("new_from_url is a class method only")
122             if ref $class;
123 4         6 my $url = shift;
124 4         20 my $new = $class->new();
125              
126 4         33 require LWP::UserAgent;
127             # RECOMMEND PREREQ: LWP::UserAgent 5.815
128 4         79 LWP::UserAgent->VERSION( 5.815 ); # HTTP::Headers content_is_html method
129 4         54 $lwp_response = LWP::UserAgent->new->get( $url );
130              
131 4 100       33516 Carp::croak("GET failed on $url: " . $lwp_response->status_line)
132             unless $lwp_response->is_success;
133 3 100       34 Carp::croak("$url returned " . $lwp_response->content_type . " not HTML")
134             unless $lwp_response->content_is_html;
135              
136 2         61 $new->parse( $lwp_response->decoded_content );
137 2         9 $new->eof;
138 2         44 undef $lwp_response; # Processed successfully
139 2         11 return $new;
140             }
141              
142             # TODO: document more fully?
143             sub parse_content { # from any number of scalars
144 9     9 1 34 my $tree = shift;
145 9         13 my $retval;
146 9         18 foreach my $whunk (@_) {
147 10 100       25 if ( ref($whunk) eq 'SCALAR' ) {
148 1         7 $retval = $tree->parse($$whunk);
149             }
150             else {
151 9         97 $retval = $tree->parse($whunk);
152             }
153 10 50       30 last if $tree->{'_stunted'}; # might as well check that.
154             }
155 9         25 $tree->eof();
156 9         18 return $retval;
157             }
158              
159             #---------------------------------------------------------------------------
160              
161             sub new { # constructor!
162 291     291 1 91675 my $class = shift;
163 291   33     1028 $class = ref($class) || $class;
164              
165             # Initialize HTML::Element part
166 291         641 my $self = $class->element_class->new('html');
167              
168             {
169              
170             # A hack for certain strange versions of Parser:
171 291         390 my $other_self = HTML::Parser->new();
  291         801  
172 291         12538 %$self = ( %$self, %$other_self ); # copy fields
173             # Yes, multiple inheritance is messy. Kids, don't try this at home.
174 291         904 bless $other_self, "HTML::TreeBuilder::_hideyhole";
175              
176             # whack it out of the HTML::Parser class, to avoid the destructor
177             }
178              
179             # The root of the tree is special, as it has these funny attributes,
180             # and gets reblessed into this class.
181              
182             # Initialize parser settings
183 291         518 $self->{'_implicit_tags'} = 1;
184 291         414 $self->{'_implicit_body_p_tag'} = 0;
185              
186             # If true, trying to insert text, or any of %isPhraseMarkup right
187             # under 'body' will implicate a 'p'. If false, will just go there.
188              
189 291         390 $self->{'_tighten'} = 1;
190              
191             # whether ignorable WS in this tree should be deleted
192              
193 291         384 $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag
194              
195 291         660 $self->{'_ignore_unknown'} = 1;
196 291         404 $self->{'_ignore_text'} = 0;
197 291         377 $self->{'_warn'} = 0;
198 291         441 $self->{'_no_space_compacting'} = 0;
199 291         404 $self->{'_store_comments'} = 0;
200 291         394 $self->{'_store_declarations'} = 1;
201 291         408 $self->{'_store_pis'} = 0;
202 291         407 $self->{'_p_strict'} = 0;
203 291         585 $self->{'_no_expand_entities'} = 0;
204              
205             # Parse attributes passed in as arguments
206 291 100       555 if (@_) {
207 4         13 my %attr = @_;
208 4         12 for ( keys %attr ) {
209 10         26 $self->{"_$_"} = $attr{$_};
210             }
211             }
212              
213 291         418 $HTML::Element::encoded_content = $self->{'_no_expand_entities'};
214              
215             # rebless to our class
216 291         408 bless $self, $class;
217              
218 291         424 $self->{'_element_count'} = 1;
219              
220             # undocumented, informal, and maybe not exactly correct
221              
222 291         706 $self->{'_head'} = $self->insert_element( 'head', 1 );
223 291         460 $self->{'_pos'} = undef; # pull it back up
224 291         526 $self->{'_body'} = $self->insert_element( 'body', 1 );
225 291         462 $self->{'_pos'} = undef; # pull it back up again
226              
227 291         547 return $self;
228             }
229              
230             #==========================================================================
231              
232             sub _elem # universal accessor...
233             {
234 21     21   40 my ( $self, $elem, $val ) = @_;
235 21         38 my $old = $self->{$elem};
236 21 50       49 $self->{$elem} = $val if defined $val;
237 21         33 return $old;
238             }
239              
240             # accessors....
241 5     5 1 36 sub implicit_tags { shift->_elem( '_implicit_tags', @_ ); }
242 4     4 1 20 sub implicit_body_p_tag { shift->_elem( '_implicit_body_p_tag', @_ ); }
243 0     0 1 0 sub p_strict { shift->_elem( '_p_strict', @_ ); }
244 5     5 1 17 sub no_space_compacting { shift->_elem( '_no_space_compacting', @_ ); }
245 0     0 1 0 sub ignore_unknown { shift->_elem( '_ignore_unknown', @_ ); }
246 0     0 1 0 sub ignore_text { shift->_elem( '_ignore_text', @_ ); }
247 5     5 1 19 sub ignore_ignorable_whitespace { shift->_elem( '_tighten', @_ ); }
248 1     1 1 6 sub store_comments { shift->_elem( '_store_comments', @_ ); }
249 1     1 1 373 sub store_declarations { shift->_elem( '_store_declarations', @_ ); }
250 0     0 1 0 sub store_pis { shift->_elem( '_store_pis', @_ ); }
251 0     0 1 0 sub warn { shift->_elem( '_warn', @_ ); }
252              
253             sub no_expand_entities {
254 0     0 1 0 shift->_elem( '_no_expand_entities', @_ );
255 0         0 $HTML::Element::encoded_content = @_;
256             }
257              
258             #==========================================================================
259              
260             sub warning {
261 1     1 1 1 my $self = shift;
262 1 50       4 CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'};
263              
264             # should maybe say HTML::TreeBuilder instead
265             }
266              
267             #==========================================================================
268              
269             {
270              
271             # To avoid having to rebuild these lists constantly...
272             my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)];
273             my $indent;
274              
275             sub start {
276 1005 50   1005 1 3190 return if $_[0]{'_stunted'};
277              
278             # Accept a signal from HTML::Parser for start-tags.
279 1005         1859 my ( $self, $tag, $attr ) = @_;
280              
281             # Parser passes more, actually:
282             # $self->start($tag, $attr, $attrseq, $origtext)
283             # But we can merrily ignore $attrseq and $origtext.
284              
285 1005 50       1803 if ( $tag eq 'x-html' ) {
286 0         0 print "Ignoring open-x-html tag.\n" if DEBUG;
287              
288             # inserted by some lame code-generators.
289 0         0 return; # bypass tweaking.
290             }
291              
292 1005         1628 $tag =~ s{/$}{}s; # So turns into . Silently forgive.
293              
294 1005 50       2913 unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
295 0         0 DEBUG and print "Start-tag name $tag is no good. Skipping.\n";
296 0         0 return;
297              
298             # This avoids having Element's new() throw an exception.
299             }
300              
301 1005   66     2342 my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'};
302 1005         1244 my $already_inserted;
303              
304             #my($indent);
305 1005         1087 if (DEBUG) {
306              
307             # optimization -- don't figure out indenting unless we're in debug mode
308             my @lineage = $pos->lineage;
309             $indent = ' ' x ( 1 + @lineage );
310             print $indent, "Proposing a new \U$tag\E under ",
311             join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) )
312             || 'Root',
313             ".\n";
314              
315             #} else {
316             # $indent = ' ';
317             }
318              
319             #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2;
320             # $attr = {%$attr};
321              
322 1005         2362 foreach my $k ( keys %$attr ) {
323              
324             # Make sure some stooge doesn't have "".
325             # That happens every few million Web pages.
326 503 50 33     1531 $attr->{ ' ' . $k } = delete $attr->{$k}
327             if length $k and substr( $k, 0, 1 ) eq '_';
328              
329             # Looks bad, but is fine for round-tripping.
330             }
331              
332 1005         1836 my $e = $self->element_class->new( $tag, %$attr );
333              
334             # Make a new element object.
335             # (Only rarely do we end up just throwing it away later in this call.)
336              
337             # Some prep -- custom messiness for those damned tables, and strict P's.
338 1005 100       2006 if ( $self->{'_implicit_tags'} ) { # wallawallawalla!
339              
340 983 100       1880 unless ( $HTML::TreeBuilder::isTableElement{$tag} ) {
341 829 50       1673 if ( $ptag eq 'table' ) {
    50          
342 0         0 print $indent,
343             " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n"
344             if DEBUG > 1;
345 0         0 $self->insert_element( 'tr', 1 );
346 0         0 $pos = $self->insert_element( 'td', 1 )
347             ; # yes, needs updating
348             }
349             elsif ( $ptag eq 'tr' ) {
350 0         0 print $indent,
351             " * Phrasal \U$tag\E right under TR makes an implicit TD\n"
352             if DEBUG > 1;
353 0         0 $pos = $self->insert_element( 'td', 1 )
354             ; # yes, needs updating
355             }
356 829         1151 $ptag = $pos->{'_tag'}; # yes, needs updating
357             }
358              
359             # end of table-implication block.
360              
361             # Now maybe do a little dance to enforce P-strictness.
362             # This seems like it should be integrated with the big
363             # "ALL HOPE..." block, further below, but that doesn't
364             # seem feasable.
365 983 0 33     1793 if ( $self->{'_p_strict'}
      0        
366             and $HTML::TreeBuilder::isKnown{$tag}
367             and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} )
368             {
369 0         0 my $here = $pos;
370 0         0 my $here_tag = $ptag;
371 0         0 while (1) {
372 0 0       0 if ( $here_tag eq 'p' ) {
373 0         0 print $indent, " * Inserting $tag closes strict P.\n"
374             if DEBUG > 1;
375 0         0 $self->end( \q{p} );
376              
377             # NB: same as \'q', but less confusing to emacs cperl-mode
378 0         0 last;
379             }
380              
381             #print("Lasting from $here_tag\n"),
382             last
383             if $HTML::TreeBuilder::isKnown{$here_tag}
384             and
385             not $HTML::Tagset::is_Possible_Strict_P_Content{
386 0 0 0     0 $here_tag};
387              
388             # Don't keep looking up the tree if we see something that can't
389             # be strict-P content.
390              
391             $here_tag
392 0   0     0 = ( $here = $here->{'_parent'} || last )->{'_tag'};
393             } # end while
394             $ptag = ( $pos = $self->{'_pos'} || $self )
395 0   0     0 ->{'_tag'}; # better update!
396             }
397              
398             # end of strict-p block.
399             }
400              
401             # And now, get busy...
402             #----------------------------------------------------------------------
403 1005 100       2778 if ( !$self->{'_implicit_tags'} ) { # bimskalabim
    100          
    100          
    100          
    100          
    100          
    50          
    0          
    0          
    0          
404             # do nothing
405 22         24 print $indent, " * _implicit_tags is off. doing nothing\n"
406             if DEBUG > 1;
407              
408             #----------------------------------------------------------------------
409             }
410             elsif ( $HTML::TreeBuilder::isHeadOrBodyElement{$tag} ) {
411 1 50       3 if ( $pos->is_inside('body') ) { # all is well
    50          
412 0         0 print $indent,
413             " * ambilocal element \U$tag\E is fine under BODY.\n"
414             if DEBUG > 1;
415             }
416             elsif ( $pos->is_inside('head') ) {
417 1         2 print $indent,
418             " * ambilocal element \U$tag\E is fine under HEAD.\n"
419             if DEBUG > 1;
420             }
421             else {
422              
423             # In neither head nor body! mmmmm... put under head?
424              
425 0 0       0 if ( $ptag eq 'html' ) { # expected case
426             # TODO?? : would there ever be a case where _head would be
427             # absent from a tree that would ever be accessed at this
428             # point?
429 0 0       0 die "Where'd my head go?" unless ref $self->{'_head'};
430 0 0       0 if ( $self->{'_head'}{'_implicit'} ) {
431 0         0 print $indent,
432             " * ambilocal element \U$tag\E makes an implicit HEAD.\n"
433             if DEBUG > 1;
434              
435             # or rather, points us at it.
436             $self->{'_pos'}
437 0         0 = $self->{'_head'}; # to insert under...
438             }
439             else {
440 0         0 $self->warning(
441             "Ambilocal element <$tag> not under HEAD or BODY!?"
442             );
443              
444             # Put it under HEAD by default, I guess
445             $self->{'_pos'}
446 0         0 = $self->{'_head'}; # to insert under...
447             }
448              
449             }
450             else {
451              
452             # Neither under head nor body, nor right under html... pass thru?
453 0         0 $self->warning(
454             "Ambilocal element <$tag> neither under head nor body, nor right under html!?"
455             );
456             }
457             }
458              
459             #----------------------------------------------------------------------
460             }
461             elsif ( $HTML::TreeBuilder::isBodyElement{$tag} ) {
462              
463             # Ensure that we are within
464 799 100 66     2113 if ( $ptag eq 'body' ) {
    100          
    100          
    50          
465              
466             # We're good.
467             }
468             elsif (
469             $HTML::TreeBuilder::isBodyElement{$ptag} # glarg
470             and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag}
471             )
472             {
473              
474             # Special case: Save ourselves a call to is_inside further down.
475             # If our $ptag is an isBodyElement element (but not an
476             # isHeadOrBodyElement element), then we must be under body!
477 206         251 print $indent, " * Inferring that $ptag is under BODY.\n",
478             if DEBUG > 3;
479              
480             # I think this and the test for 'body' trap everything
481             # bodyworthy, except the case where the parent element is
482             # under an unknown element that's a descendant of body.
483             }
484             elsif ( $pos->is_inside('head') ) {
485 164         204 print $indent,
486             " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n"
487             if DEBUG > 1;
488             $ptag = (
489             $pos = $self->{'_pos'}
490             = $self->{'_body'} # yes, needs updating
491             || die "Where'd my body go?"
492 164   50     394 )->{'_tag'}; # yes, needs updating
493             }
494             elsif ( !$pos->is_inside('body') ) {
495 77         90 print $indent,
496             " * body-element \U$tag\E makes implicit BODY.\n"
497             if DEBUG > 1;
498             $ptag = (
499             $pos = $self->{'_pos'}
500             = $self->{'_body'} # yes, needs updating
501             || die "Where'd my body go?"
502 77   50     197 )->{'_tag'}; # yes, needs updating
503             }
504              
505             # else we ARE under body, so okay.
506              
507             # Handle implicit endings and insert based on and position
508             # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ...
509 799 100 100     7840 if ( $tag eq 'p'
    100 66        
    100 66        
    50 33        
    100 33        
    100 33        
    100 33        
      66        
      66        
      33        
510             or $tag eq 'h1'
511             or $tag eq 'h2'
512             or $tag eq 'h3'
513             or $tag eq 'h4'
514             or $tag eq 'h5'
515             or $tag eq 'h6'
516             or $tag eq 'form'
517              
518             # Hm, should
really be here?!
519             )
520             {
521              
522             # Can't have

, or inside these

523 243         483 $self->end(
524             $_Closed_by_structurals,
525             @HTML::TreeBuilder::p_closure_barriers
526              
527             # used to be just li!
528             );
529              
530             }
531             elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) {
532              
533             # Can't have lists inside -- in the unlikely
534             # event anyone tries to put them there!
535 9 50 33     79 if ( $ptag eq 'h1'
      33        
      33        
      33        
      33        
536             or $ptag eq 'h2'
537             or $ptag eq 'h3'
538             or $ptag eq 'h4'
539             or $ptag eq 'h5'
540             or $ptag eq 'h6' )
541             {
542 0         0 $self->end( \$ptag );
543             }
544              
545             # TODO: Maybe keep closing up the tree until
546             # the ptag isn't any of the above?
547             # But anyone that says

    ...

548             # deserves what they get anyway.
549              
550             }
551             elsif ( $tag eq 'li' ) { # list item
552             # Get under a list tag, one way or another
553 12 50 66     32 unless (
554             exists $HTML::TreeBuilder::isList{$ptag}
555             or $self->end( \q{*}, keys %HTML::TreeBuilder::isList ) #'
556             )
557             {
558 0         0 print $indent,
559             " * inserting implicit UL for lack of containing ",
560             join( '|', keys %HTML::TreeBuilder::isList ), ".\n"
561             if DEBUG > 1;
562 0         0 $self->insert_element( 'ul', 1 );
563             }
564              
565             }
566             elsif ( $tag eq 'dt' or $tag eq 'dd' ) {
567              
568             # Get under a DL, one way or another
569 0 0 0     0 unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #'
570 0         0 print $indent,
571             " * inserting implicit DL for lack of containing DL.\n"
572             if DEBUG > 1;
573 0         0 $self->insert_element( 'dl', 1 );
574             }
575              
576             }
577             elsif ( $HTML::TreeBuilder::isFormElement{$tag} ) {
578 1 50 33     6 if ($self->{
579             '_ignore_formies_outside_form'} # TODO: document this
580             and not $pos->is_inside('form')
581             )
582             {
583 0         0 print $indent,
584             " * ignoring \U$tag\E because not in a FORM.\n"
585             if DEBUG > 1;
586 0         0 return; # bypass tweaking.
587             }
588 1 50       3 if ( $tag eq 'option' ) {
589              
590             # return unless $ptag eq 'select';
591 0         0 $self->end( \q{option} );
592 0   0     0 $ptag = ( $self->{'_pos'} || $self )->{'_tag'};
593 0 0 0     0 unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) {
594 0         0 print $indent,
595             " * \U$tag\E makes an implicit SELECT.\n"
596             if DEBUG > 1;
597 0         0 $pos = $self->insert_element( 'select', 1 );
598              
599             # but not a very useful select -- has no 'name' attribute!
600             # is $pos's value used after this?
601             }
602             }
603             }
604             elsif ( $HTML::TreeBuilder::isTableElement{$tag} ) {
605 154 100       340 if ( !$pos->is_inside('table') ) {
606 3         4 print $indent, " * \U$tag\E makes an implicit TABLE\n"
607             if DEBUG > 1;
608 3         6 $self->insert_element( 'table', 1 );
609             }
610              
611 154 100 100     394 if ( $tag eq 'td' or $tag eq 'th' ) {
612              
613             # Get under a tr one way or another
614 96 100 100     238 unless (
615             $ptag eq 'tr' # either under a tr
616             or $self->end( \q{*}, 'tr',
617             'table' ) #or we can get under one
618             )
619             {
620 2         3 print $indent,
621             " * \U$tag\E under \U$ptag\E makes an implicit TR\n"
622             if DEBUG > 1;
623 2         5 $self->insert_element( 'tr', 1 );
624              
625             # presumably pos's value isn't used after this.
626             }
627             }
628             else {
629 58         137 $self->end( \$tag, 'table' ); #'
630             }
631              
632             # Hmm, I guess this is right. To work it out:
633             # tr closes any open tr (limited at a table)
634             # thead closes any open thead (limited at a table)
635             # tbody closes any open tbody (limited at a table)
636             # tfoot closes any open tfoot (limited at a table)
637             # colgroup closes any open colgroup (limited at a table)
638             # col can try, but will always fail, at the enclosing table,
639             # as col is empty, and therefore never open!
640             # But!
641             # td closes any open td OR th (limited at a table)
642             # th closes any open th OR td (limited at a table)
643             # ...implementable as "close to a tr, or make a tr"
644              
645             }
646             elsif ( $HTML::TreeBuilder::isPhraseMarkup{$tag} ) {
647 347 100 100     1012 if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) {
648 1         1 print
649             " * Phrasal \U$tag\E right under BODY makes an implicit P\n"
650             if DEBUG > 1;
651 1         4 $pos = $self->insert_element( 'p', 1 );
652              
653             # is $pos's value used after this?
654             }
655             }
656              
657             # End of implicit endings logic
658              
659             # End of "elsif ($HTML::TreeBuilder::isBodyElement{$tag}"
660             #----------------------------------------------------------------------
661              
662             }
663             elsif ( $HTML::TreeBuilder::isHeadElement{$tag} ) {
664 167 100       328 if ( $pos->is_inside('body') ) {
    100          
665 1         1 print $indent, " * head element \U$tag\E found inside BODY!\n"
666             if DEBUG;
667 1         6 $self->warning("Header element <$tag> in body"); # [sic]
668             }
669             elsif ( !$pos->is_inside('head') ) {
670 164         176 print $indent,
671             " * head element \U$tag\E makes an implicit HEAD.\n"
672             if DEBUG > 1;
673             }
674             else {
675 2         3 print $indent,
676             " * head element \U$tag\E goes inside existing HEAD.\n"
677             if DEBUG > 1;
678             }
679 167   50     381 $self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?";
680              
681             #----------------------------------------------------------------------
682             }
683             elsif ( $tag eq 'html' ) {
684 4 50       12 if ( delete $self->{'_implicit'} ) { # first time here
685 4         4 print $indent, " * good! found the real HTML element!\n"
686             if DEBUG > 1;
687             }
688             else {
689 0         0 print $indent, " * Found a second HTML element\n"
690             if DEBUG;
691 0         0 $self->warning("Found a nested element");
692             }
693              
694             # in either case, migrate attributes to the real element
695 4         11 for ( keys %$attr ) {
696 0         0 $self->attr( $_, $attr->{$_} );
697             }
698 4         6 $self->{'_pos'} = undef;
699 4         31 return $self; # bypass tweaking.
700              
701             #----------------------------------------------------------------------
702             }
703             elsif ( $tag eq 'head' ) {
704 6   50     60 my $head = $self->{'_head'} || die "Where'd my head go?";
705 6 50       16 if ( delete $head->{'_implicit'} ) { # first time here
706 6         9 print $indent, " * good! found the real HEAD element!\n"
707             if DEBUG > 1;
708             }
709             else { # been here before
710 0         0 print $indent, " * Found a second HEAD element\n"
711             if DEBUG;
712 0         0 $self->warning("Found a second element");
713             }
714              
715             # in either case, migrate attributes to the real element
716 6         15 for ( keys %$attr ) {
717 0         0 $head->attr( $_, $attr->{$_} );
718             }
719 6         37 return $self->{'_pos'} = $head; # bypass tweaking.
720              
721             #----------------------------------------------------------------------
722             }
723             elsif ( $tag eq 'body' ) {
724 6   50     20 my $body = $self->{'_body'} || die "Where'd my body go?";
725 6 50       15 if ( delete $body->{'_implicit'} ) { # first time here
726 6         10 print $indent, " * good! found the real BODY element!\n"
727             if DEBUG > 1;
728             }
729             else { # been here before
730 0         0 print $indent, " * Found a second BODY element\n"
731             if DEBUG;
732 0         0 $self->warning("Found a second element");
733             }
734              
735             # in either case, migrate attributes to the real element
736 6         18 for ( keys %$attr ) {
737 0         0 $body->attr( $_, $attr->{$_} );
738             }
739 6         43 return $self->{'_pos'} = $body; # bypass tweaking.
740              
741             #----------------------------------------------------------------------
742             }
743             elsif ( $tag eq 'frameset' ) {
744 0 0 0     0 if (!( $self->{'_frameset_seen'}++ ) # first frameset seen
      0        
745             and !$self->{'_noframes_seen'}
746              
747             # otherwise it'll be under the noframes already
748             and !$self->is_inside('body')
749             )
750             {
751              
752             # The following is a bit of a hack. We don't use the normal
753             # insert_element because 1) we don't want it as _pos, but instead
754             # right under $self, and 2), more importantly, that we don't want
755             # this inserted at the /end/ of $self's content_list, but instead
756             # in the middle of it, specifically right before the body element.
757             #
758 0   0     0 my $c = $self->{'_content'} || die "Contentless root?";
759 0   0     0 my $body = $self->{'_body'} || die "Where'd my BODY go?";
760 0         0 for ( my $i = 0; $i < @$c; ++$i ) {
761 0 0       0 if ( $c->[$i] eq $body ) {
762 0         0 splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e );
763 0         0 HTML::Element::_weaken($e->{'_parent'} = $self);
764 0         0 $already_inserted = 1;
765 0         0 print $indent,
766             " * inserting 'frameset' right before BODY.\n"
767             if DEBUG > 1;
768 0         0 last;
769             }
770             }
771 0 0       0 die "BODY not found in children of root?"
772             unless $already_inserted;
773             }
774              
775             }
776             elsif ( $tag eq 'frame' ) {
777              
778             # Okay, fine, pass thru.
779             # Should probably enforce that these should be under a frameset.
780             # But hey. Ditto for enforcing that 'noframes' should be under
781             # a 'frameset', as the DTDs say.
782              
783             }
784             elsif ( $tag eq 'noframes' ) {
785              
786             # This basically assumes there'll be exactly one 'noframes' element
787             # per document. At least, only the first one gets to have the
788             # body under it. And if there are no noframes elements, then
789             # the body pretty much stays where it is. Is that ever a problem?
790 0 0       0 if ( $self->{'_noframes_seen'}++ ) {
791 0         0 print $indent, " * ANOTHER noframes element?\n" if DEBUG;
792             }
793             else {
794 0 0       0 if ( $pos->is_inside('body') ) {
795 0         0 print $indent, " * 'noframes' inside 'body'. Odd!\n"
796             if DEBUG;
797              
798             # In that odd case, we /can't/ make body a child of 'noframes',
799             # because it's an ancestor of the 'noframes'!
800             }
801             else {
802 0   0     0 $e->push_content( $self->{'_body'}
803             || die "Where'd my body go?" );
804 0         0 print $indent, " * Moving body to be under noframes.\n"
805             if DEBUG;
806             }
807             }
808              
809             #----------------------------------------------------------------------
810             }
811             else {
812              
813             # unknown tag
814 0 0       0 if ( $self->{'_ignore_unknown'} ) {
815 0         0 print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG;
816 0         0 $self->warning("Skipping unknown tag $tag");
817 0         0 return;
818             }
819             else {
820 0         0 print $indent, " * Accepting unknown tag \U$tag\E\n"
821             if DEBUG;
822             }
823             }
824              
825             #----------------------------------------------------------------------
826             # End of mumbo-jumbo
827              
828             print $indent, "(Attaching ", $e->{'_tag'}, " under ",
829 989         1178 ( $self->{'_pos'} || $self )->{'_tag'}, ")\n"
830              
831             # because if _pos isn't defined, it goes under self
832             if DEBUG;
833              
834             # The following if-clause is to delete /some/ ignorable whitespace
835             # nodes, as we're making the tree.
836             # This'd be a node we'd catch later anyway, but we might as well
837             # nip it in the bud now.
838             # This doesn't catch /all/ deletable WS-nodes, so we do have to call
839             # the tightener later to catch the rest.
840              
841 989 100 66     2862 if ( $self->{'_tighten'} and !$self->{'_ignore_text'} )
842             { # if tightenable
843 980         1221 my ( $sibs, $par );
844 980 100 66     5251 if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} )
      66        
      100        
      100        
      66        
      100        
      66        
845             and @$sibs # parent already has content
846             and !
847             ref( $sibs->[-1] ) # and the last one there is a text node
848             and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace
849              
850             and ( # one of these has to be eligible...
851             $HTML::TreeBuilder::canTighten{$tag}
852             or (( @$sibs == 1 )
853             ? # WS is leftmost -- so parent matters
854             $HTML::TreeBuilder::canTighten{ $par->{'_tag'} }
855             : # WS is after another node -- it matters
856             ( ref $sibs->[-2]
857             and
858             $HTML::TreeBuilder::canTighten{ $sibs->[-2]
859             {'_tag'} }
860             )
861             )
862             )
863              
864             and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' )
865              
866             # we're clear
867             )
868             {
869 34         48 pop @$sibs;
870 34         51 print $indent, "Popping a preceding all-WS node\n" if DEBUG;
871             }
872             }
873              
874 989 50       2927 $self->insert_element($e) unless $already_inserted;
875              
876 989         1048 if (DEBUG) {
877             if ( $self->{'_pos'} ) {
878             print $indent, "(Current lineage of pos: \U$tag\E under ",
879             join(
880             '/',
881             reverse(
882              
883             # $self->{'_pos'}{'_tag'}, # don't list myself!
884             $self->{'_pos'}->lineage_tag_names
885             )
886             ),
887             ".)\n";
888             }
889             else {
890             print $indent, "(Pos points nowhere!?)\n";
891             }
892             }
893              
894 989 100 50     2694 unless ( ( $self->{'_pos'} || '' ) eq $e ) {
895              
896             # if it's an empty element -- i.e., if it didn't change the _pos
897 15         41 &{ $self->{"_tweak_$tag"}
898 15 50 33     143 || $self->{'_tweak_*'}
899             || return $e }( map $_, $e, $tag, $self )
900             ; # make a list so the user can't clobber
901             }
902              
903 974         5540 return $e;
904             }
905             }
906              
907             #==========================================================================
908              
909             {
910             my $indent;
911              
912             sub end {
913 1400 50   1400 1 2838 return if $_[0]{'_stunted'};
914              
915             # Either: Acccept an end-tag signal from HTML::Parser
916             # Or: Method for closing currently open elements in some fairly complex
917             # way, as used by other methods in this class.
918 1400         3004 my ( $self, $tag, @stop ) = @_;
919 1400 50       2416 if ( $tag eq 'x-html' ) {
920 0         0 print "Ignoring close-x-html tag.\n" if DEBUG;
921              
922             # inserted by some lame code-generators.
923 0         0 return;
924             }
925              
926 1400 50 66     4818 unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
927 0         0 DEBUG and print "End-tag name $tag is no good. Skipping.\n";
928 0         0 return;
929              
930             # This avoids having Element's new() throw an exception.
931             }
932              
933             # This method accepts two calling formats:
934             # 1) from Parser: $self->end('tag_name', 'origtext')
935             # in which case we shouldn't mistake origtext as a blocker tag
936             # 2) from myself: $self->end(\q{tagname1}, 'blk1', ... )
937             # from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... )
938              
939             # End the specified tag, but don't move above any of the blocker tags.
940             # The tag can also be a reference to an array. Terminate the first
941             # tag found.
942              
943 1400   66     2886 my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'};
944              
945             # $p and $ptag are sort-of stratch
946              
947 1400 100       2055 if ( ref($tag) ) {
948              
949             # First param is a ref of one sort or another --
950             # THE CALL IS COMING FROM INSIDE THE HOUSE!
951 336 100       606 $tag = $$tag if ref($tag) eq 'SCALAR';
952              
953             # otherwise it's an arrayref.
954             }
955             else {
956              
957             # the call came from Parser -- just ignore origtext
958             # except in a table ignore unmatched table tags RT #59980
959 1064 100       2274 @stop = $tag =~ /^t[hdr]\z/ ? 'table' : ();
960             }
961              
962             #my($indent);
963 1400         1518 if (DEBUG) {
964              
965             # optimization -- don't figure out depth unless we're in debug mode
966             my @lineage_tags = $p->lineage_tag_names;
967             $indent = ' ' x ( 1 + @lineage_tags );
968              
969             # now announce ourselves
970             print $indent, "Ending ",
971             ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E",
972             scalar(@stop)
973             ? ( " no higher than [", join( ' ', @stop ), "]" )
974             : (), ".\n";
975              
976             print $indent, " (Current lineage: ", join( '/', @lineage_tags ),
977             ".)\n"
978             if DEBUG > 1;
979              
980             if ( DEBUG > 3 ) {
981              
982             #my(
983             # $package, $filename, $line, $subroutine,
984             # $hasargs, $wantarray, $evaltext, $is_require) = caller;
985             print $indent,
986             " (Called from ", ( caller(1) )[3], ' line ',
987             ( caller(1) )[2],
988             ")\n";
989             }
990              
991             #} else {
992             # $indent = ' ';
993             }
994              
995             # End of if DEBUG
996              
997             # Now actually do it
998 1400         1533 my @to_close;
999 1400 100       2428 if ( $tag eq '*' ) {
    100          
1000              
1001             # Special -- close everything up to (but not including) the first
1002             # limiting tag, or return if none found. Somewhat of a special case.
1003             PARENT:
1004 35         62 while ( defined $p ) {
1005 68         102 $ptag = $p->{'_tag'};
1006 68         70 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1007 68         95 for (@stop) {
1008 113 100       223 if ( $ptag eq $_ ) {
1009 35         42 print $indent,
1010             " (Hit a $_; closing everything up to here.)\n"
1011             if DEBUG > 2;
1012 35         59 last PARENT;
1013             }
1014             }
1015 33         46 push @to_close, $p;
1016 33         43 $p = $p->{'_parent'}; # no match so far? keep moving up
1017             print $indent,
1018 33         53 " (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n"
1019             if DEBUG > 1;
1020             }
1021 35 50       57 unless ( defined $p ) { # We never found what we were looking for.
1022 0         0 print $indent, " (We never found a limit.)\n" if DEBUG > 1;
1023 0         0 return;
1024             }
1025              
1026             #print
1027             # $indent,
1028             # " (To close: ", join('/', map $_->tag, @to_close), ".)\n"
1029             # if DEBUG > 4;
1030              
1031             # Otherwise update pos and fall thru.
1032 35         48 $self->{'_pos'} = $p;
1033             }
1034             elsif ( ref $tag ) {
1035              
1036             # Close the first of any of the matching tags, giving up if you hit
1037             # any of the stop-tags.
1038             PARENT:
1039 243         405 while ( defined $p ) {
1040 463         579 $ptag = $p->{'_tag'};
1041 463         765 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1042 463         678 for (@$tag) {
1043 3983 100       5464 if ( $ptag eq $_ ) {
1044 23         31 print $indent, " (Closing $_.)\n" if DEBUG > 2;
1045 23         37 last PARENT;
1046             }
1047             }
1048 440         543 for (@stop) {
1049 6600 50       8987 if ( $ptag eq $_ ) {
1050 0         0 print $indent,
1051             " (Hit a limiting $_ -- bailing out.)\n"
1052             if DEBUG > 1;
1053 0         0 return; # so it was all for naught
1054             }
1055             }
1056 440         561 push @to_close, $p;
1057 440         742 $p = $p->{'_parent'};
1058             }
1059 243 100       698 return unless defined $p; # We went off the top of the tree.
1060             # Otherwise specified element was found; set pos to its parent.
1061 23         30 push @to_close, $p;
1062 23         38 $self->{'_pos'} = $p->{'_parent'};
1063             }
1064             else {
1065              
1066             # Close the first of the specified tag, giving up if you hit
1067             # any of the stop-tags.
1068 1122         1883 while ( defined $p ) {
1069 1511         1962 $ptag = $p->{'_tag'};
1070 1511         1561 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1071 1511 100       2661 if ( $ptag eq $tag ) {
1072 1092         1109 print $indent, " (Closing $tag.)\n" if DEBUG > 2;
1073 1092         1340 last;
1074             }
1075 419         583 for (@stop) {
1076 58 100       121 if ( $ptag eq $_ ) {
1077 29         33 print $indent,
1078             " (Hit a limiting $_ -- bailing out.)\n"
1079             if DEBUG > 1;
1080 29         69 return; # so it was all for naught
1081             }
1082             }
1083 390         511 push @to_close, $p;
1084 390         640 $p = $p->{'_parent'};
1085             }
1086 1093 100       1610 return unless defined $p; # We went off the top of the tree.
1087             # Otherwise specified element was found; set pos to its parent.
1088 1092         1384 push @to_close, $p;
1089 1092         1527 $self->{'_pos'} = $p->{'_parent'};
1090             }
1091              
1092 1150 100 100     3218 $self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' );
1093             print $indent, "(Pos now points to ",
1094 1150         1244 $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n"
1095             if DEBUG > 1;
1096              
1097             ### EXPENSIVE, because has to check that it's not under a pre
1098             ### or a CDATA-parent. That's one more method call per end()!
1099             ### Might as well just do this at the end of the tree-parse, I guess,
1100             ### at which point we'd be parsing top-down, and just not traversing
1101             ### under pre's or CDATA-parents.
1102             ##
1103             ## Take this opportunity to nix any terminal whitespace nodes.
1104             ## TODO: consider whether this (plus the logic in start(), above)
1105             ## would ever leave any WS nodes in the tree.
1106             ## If not, then there's no reason to have eof() call
1107             ## delete_ignorable_whitespace on the tree, is there?
1108             ##
1109             #if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and
1110             # ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent)
1111             #) { # if tightenable
1112             # my($children, $e_tag);
1113             # foreach my $e (reverse @to_close) { # going top-down
1114             # last if 'pre' eq ($e_tag = $e->{'_tag'}) or
1115             # $HTML::Tagset::isCDATA_Parent{$e_tag};
1116             #
1117             # if(
1118             # $children = $e->{'_content'}
1119             # and @$children # has children
1120             # and !ref($children->[-1])
1121             # and $children->[-1] =~ m<^\s+$>s # last node is all-WS
1122             # and
1123             # (
1124             # # has a tightable parent:
1125             # $HTML::TreeBuilder::canTighten{ $e_tag }
1126             # or
1127             # ( # has a tightenable left sibling:
1128             # @$children > 1 and
1129             # ref($children->[-2])
1130             # and $HTML::TreeBuilder::canTighten{ $children->[-2]{'_tag'} }
1131             # )
1132             # )
1133             # ) {
1134             # pop @$children;
1135             # #print $indent, "Popping a terminal WS node from ", $e->{'_tag'},
1136             # # " (", $e->address, ") while exiting.\n" if DEBUG;
1137             # }
1138             # }
1139             #}
1140              
1141 1150         1469 foreach my $e (@to_close) {
1142              
1143             # Call the applicable callback, if any
1144 1535         1920 $ptag = $e->{'_tag'};
1145 1535         3507 &{ $self->{"_tweak_$ptag"}
1146 1535 50 33     5731 || $self->{'_tweak_*'}
1147             || next }( map $_, $e, $ptag, $self );
1148 0         0 print $indent, "Back from tweaking.\n" if DEBUG;
1149             last
1150 0 0       0 if $self->{ '_stunted'
1151             }; # in case one of the handlers called stunt
1152             }
1153 1150         3992 return @to_close;
1154             }
1155             }
1156              
1157             #==========================================================================
1158             {
1159             my ( $indent, $nugget );
1160              
1161             sub text {
1162 2152 50   2152 1 61184 return if $_[0]{'_stunted'};
1163              
1164             # Accept a "here's a text token" signal from HTML::Parser.
1165 2152         3839 my ( $self, $text, $is_cdata ) = @_;
1166              
1167             # the >3.0 versions of Parser may pass a cdata node.
1168             # Thanks to Gisle Aas for pointing this out.
1169              
1170 2152 50       3648 return unless length $text; # I guess that's always right
1171              
1172 2152         2710 my $ignore_text = $self->{'_ignore_text'};
1173 2152         2568 my $no_space_compacting = $self->{'_no_space_compacting'};
1174 2152         2507 my $no_expand_entities = $self->{'_no_expand_entities'};
1175 2152   66     3752 my $pos = $self->{'_pos'} || $self;
1176              
1177             HTML::Entities::decode($text)
1178             unless $ignore_text
1179             || $is_cdata
1180 2152 100 33     13648 || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }
      33        
      66        
1181             || $no_expand_entities;
1182              
1183             #my($indent, $nugget);
1184 2152         2538 if (DEBUG) {
1185              
1186             # optimization -- don't figure out depth unless we're in debug mode
1187             my @lineage_tags = $pos->lineage_tag_names;
1188             $indent = ' ' x ( 1 + @lineage_tags );
1189              
1190             $nugget
1191             = ( length($text) <= 25 )
1192             ? $text
1193             : ( substr( $text, 0, 25 ) . '...' );
1194             $nugget =~ s<([\x00-\x1F])>
1195             <'\\x'.(unpack("H2",$1))>eg;
1196             print $indent, "Proposing a new text node ($nugget) under ",
1197             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) )
1198             || 'Root',
1199             ".\n";
1200              
1201             #} else {
1202             # $indent = ' ';
1203             }
1204 2152         2294  
1205 2152 50 33     6325 my $ptag;
1206             if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} }
1207              
1208             #or $pos->is_inside('pre')
1209             or $pos->is_inside( 'pre', 'textarea' )
1210             )
1211 0 0       0 {
1212 0         0 return if $ignore_text;
1213             $pos->push_content($text);
1214             }
1215             else {
1216              
1217             # return unless $text =~ /\S/; # This is sometimes wrong
1218 2152 100 100     11238  
    50 33        
    100          
    100          
    100          
    100          
1219             if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) {
1220              
1221             # don't change anything
1222             }
1223 0 0       0 elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) {
1224 0         0 if ( $self->{'_implicit_body_p_tag'} ) {
1225             print $indent,
1226             " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n"
1227 0         0 if DEBUG > 1;
1228             $self->end( \$ptag );
1229             $pos = $self->{'_body'}
1230 0 0       0 ? ( $self->{'_pos'}
1231             = $self->{'_body'} ) # expected case
1232 0         0 : $self->insert_element( 'body', 1 );
1233             $pos = $self->insert_element( 'p', 1 );
1234             }
1235 0         0 else {
1236             print $indent,
1237             " * Text node under \U$ptag\E closes, implicates BODY.\n"
1238 0         0 if DEBUG > 1;
1239             $self->end( \$ptag );
1240             $pos = $self->{'_body'}
1241 0 0       0 ? ( $self->{'_pos'}
1242             = $self->{'_body'} ) # expected case
1243             : $self->insert_element( 'body', 1 );
1244             }
1245             }
1246 22 100       48 elsif ( $ptag eq 'html' ) {
1247 1         2 if ( $self->{'_implicit_body_p_tag'} ) {
1248             print $indent,
1249             " * Text node under HTML implicates BODY and P.\n"
1250             if DEBUG > 1;
1251             $pos = $self->{'_body'}
1252 1 50       4 ? ( $self->{'_pos'}
1253             = $self->{'_body'} ) # expected case
1254 1         3 : $self->insert_element( 'body', 1 );
1255             $pos = $self->insert_element( 'p', 1 );
1256             }
1257 21         29 else {
1258             print $indent,
1259             " * Text node under HTML implicates BODY.\n"
1260             if DEBUG > 1;
1261             $pos = $self->{'_body'}
1262 21 50       47 ? ( $self->{'_pos'}
1263             = $self->{'_body'} ) # expected case
1264             : $self->insert_element( 'body', 1 );
1265              
1266             #print "POS is $pos, ", $pos->{'_tag'}, "\n";
1267             }
1268             }
1269 370 50       687 elsif ( $ptag eq 'body' ) {
1270 0         0 if ( $self->{'_implicit_body_p_tag'} ) {
1271             print $indent, " * Text node under BODY implicates P.\n"
1272 0         0 if DEBUG > 1;
1273             $pos = $self->insert_element( 'p', 1 );
1274             }
1275             }
1276 8         12 elsif ( $ptag eq 'table' ) {
1277             print $indent,
1278             " * Text node under TABLE implicates TR and TD.\n"
1279 8         20 if DEBUG > 1;
1280 8         14 $self->insert_element( 'tr', 1 );
1281             $pos = $self->insert_element( 'td', 1 );
1282              
1283             # double whammy!
1284             }
1285 3         4 elsif ( $ptag eq 'tr' ) {
1286             print $indent, " * Text node under TR implicates TD.\n"
1287 3         7 if DEBUG > 1;
1288             $pos = $self->insert_element( 'td', 1 );
1289             }
1290              
1291             # elsif (
1292             # # $ptag eq 'li' ||
1293             # # $ptag eq 'dd' ||
1294             # $ptag eq 'form') {
1295             # $pos = $self->insert_element('p', 1);
1296             #}
1297              
1298             # Whatever we've done above should have had the side
1299             # effect of updating $self->{'_pos'}
1300              
1301             #print "POS is now $pos, ", $pos->{'_tag'}, "\n";
1302 2152 50       3344  
1303 2152 100       7350 return if $ignore_text;
1304             $text =~ s/[\n\r\f\t ]+/ /g # canonical space
1305             unless $no_space_compacting;
1306              
1307             print $indent, " (Attaching text node ($nugget) under ",
1308              
1309 2152         2684 # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'},
1310             $pos->{'_tag'}, ").\n"
1311             if DEBUG > 1;
1312 2152         4067  
1313             $pos->push_content($text);
1314             }
1315 2152 50       11454  
1316 2152         3781 &{ $self->{'_tweak_~text'} || return }( $text, $pos,
1317             $pos->{'_tag'} . '' );
1318              
1319             # Note that this is very exceptional -- it doesn't fall back to
1320 0         0 # _tweak_*, and it gives its tweak different arguments.
1321             return;
1322             }
1323             }
1324              
1325             #==========================================================================
1326              
1327             # TODO: test whether comment(), declaration(), and process(), do the right
1328             # thing as far as tightening and whatnot.
1329             # Also, currently, doctypes and comments that appear before head or body
1330             # show up in the tree in the wrong place. Something should be done about
1331             # this. Tricky. Maybe this whole business of pre-making the body and
1332             # whatnot is wrong.
1333              
1334 188 50   188 1 1468 sub comment {
1335             return if $_[0]{'_stunted'};
1336              
1337             # Accept a "here's a comment" signal from HTML::Parser.
1338 188         282  
1339 188   66     349 my ( $self, $text ) = @_;
1340             my $pos = $self->{'_pos'} || $self;
1341             return
1342 188 100 66     715 unless $self->{'_store_comments'}
1343             || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} };
1344 1         1  
1345             if (DEBUG) {
1346             my @lineage_tags = $pos->lineage_tag_names;
1347             my $indent = ' ' x ( 1 + @lineage_tags );
1348              
1349             my $nugget
1350             = ( length($text) <= 25 )
1351             ? $text
1352             : ( substr( $text, 0, 25 ) . '...' );
1353             $nugget =~ s<([\x00-\x1F])>
1354             <'\\x'.(unpack("H2",$1))>eg;
1355             print $indent, "Proposing a Comment ($nugget) under ",
1356             join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1357             ".\n";
1358 1         2 }
1359 1         3  
1360 1         2 ( my $e = $self->element_class->new('~comment') )->{'text'} = $text;
1361             $pos->push_content($e);
1362 1         4 ++( $self->{'_element_count'} );
1363 1 50 33     8  
1364             &{ $self->{'_tweak_~comment'}
1365             || $self->{'_tweak_*'}
1366 0         0 || return $e }( map $_, $e, '~comment', $self );
1367              
1368             return $e;
1369             }
1370 2 50   2 1 532  
1371             sub declaration {
1372             return if $_[0]{'_stunted'};
1373              
1374 2         11 # Accept a "here's a markup declaration" signal from HTML::Parser.
1375 2   33     11  
1376             my ( $self, $text ) = @_;
1377 2         5 my $pos = $self->{'_pos'} || $self;
1378              
1379             if (DEBUG) {
1380             my @lineage_tags = $pos->lineage_tag_names;
1381             my $indent = ' ' x ( 1 + @lineage_tags );
1382              
1383             my $nugget
1384             = ( length($text) <= 25 )
1385             ? $text
1386             : ( substr( $text, 0, 25 ) . '...' );
1387             $nugget =~ s<([\x00-\x1F])>
1388             <'\\x'.(unpack("H2",$1))>eg;
1389             print $indent, "Proposing a Declaration ($nugget) under ",
1390 2         7 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1391             ".\n";
1392 2         6 }
1393 2         19 ( my $e = $self->element_class->new('~declaration') )->{'text'} = $text;
1394              
1395             $self->{_decl} = $e;
1396             return $e;
1397             }
1398              
1399 0 0   0 1 0 #==========================================================================
1400              
1401             sub process {
1402             return if $_[0]{'_stunted'};
1403 0 0       0  
1404 0         0 # Accept a "here's a PI" signal from HTML::Parser.
1405 0   0     0  
1406             return unless $_[0]->{'_store_pis'};
1407 0         0 my ( $self, $text ) = @_;
1408             my $pos = $self->{'_pos'} || $self;
1409              
1410             if (DEBUG) {
1411             my @lineage_tags = $pos->lineage_tag_names;
1412             my $indent = ' ' x ( 1 + @lineage_tags );
1413              
1414             my $nugget
1415             = ( length($text) <= 25 )
1416             ? $text
1417             : ( substr( $text, 0, 25 ) . '...' );
1418             $nugget =~ s<([\x00-\x1F])>
1419             <'\\x'.(unpack("H2",$1))>eg;
1420 0         0 print $indent, "Proposing a PI ($nugget) under ",
1421 0         0 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1422 0         0 ".\n";
1423             }
1424 0 0 0     0 ( my $e = $self->element_class->new('~pi') )->{'text'} = $text;
  0         0  
1425             $pos->push_content($e);
1426             ++( $self->{'_element_count'} );
1427 0         0  
1428             &{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_,
1429             $e, '~pi', $self );
1430              
1431             return $e;
1432             }
1433              
1434             #==========================================================================
1435              
1436             #When you call $tree->parse_file($filename), and the
1437             #tree's ignore_ignorable_whitespace attribute is on (as it is
1438             #by default), HTML::TreeBuilder's logic will manage to avoid
1439             #creating some, but not all, nodes that represent ignorable
1440             #whitespace. However, at the end of its parse, it traverses the
1441             #tree and deletes any that it missed. (It does this with an
1442             #around-method around HTML::Parser's eof method.)
1443             #
1444             #However, with $tree->parse($content), the cleanup-traversal step
1445             #doesn't happen automatically -- so when you're done parsing all
1446             #content for a document (regardless of whether $content is the only
1447             #bit, or whether it's just another chunk of content you're parsing into
1448             #the tree), call $tree->eof() to signal that you're at the end of the
1449             #text you're inputting to the tree. Besides properly cleaning any bits
1450             #of ignorable whitespace from the tree, this will also ensure that
1451             #HTML::Parser's internal buffer is flushed.
1452              
1453 283 50   283 1 1166 sub eof {
1454              
1455 283 50       491 # Accept an "end-of-file" signal from HTML::Parser, or thrown by the user.
1456              
1457 283         341 return if $_[0]->{'_done'}; # we've already been here
1458 283         296  
1459 283         308 return $_[0]->SUPER::eof() if $_[0]->{'_stunted'};
1460 283 50       468  
1461             my $x = $_[0];
1462             print "EOF received.\n" if DEBUG;
1463             my (@rv);
1464 0         0 if (wantarray) {
1465              
1466             # I don't think this makes any difference for this particular
1467 283         1079 # method, but let's be scrupulous, for once.
1468             @rv = $x->SUPER::eof();
1469             }
1470 283 100 66     1152 else {
1471             $rv[0] = $x->SUPER::eof();
1472             }
1473              
1474             $x->end('html') unless $x eq ( $x->{'_pos'} || $x );
1475              
1476 283 100       480 # That SHOULD close everything, and will run the appropriate tweaks.
1477             # We /could/ be running under some insane mode such that there's more
1478             # than one HTML element, but really, that's just insane to do anyhow.
1479              
1480 5         9 unless ( $x->{'_implicit_tags'} ) {
1481              
1482             # delete those silly implicit head and body in case we put
1483             # them there in implicit tags mode
1484             foreach my $node ( $x->{'_head'}, $x->{'_body'} ) {
1485 10 50 33     65 $node->replace_with_content
      33        
      33        
1486             if defined $node
1487             and ref $node
1488             and $node->{'_implicit'}
1489             and $node->{'_parent'};
1490              
1491             # I think they should be empty anyhow, since the only
1492             # logic that'd insert under them can apply only, I think,
1493             # in the case where _implicit_tags is on
1494             }
1495              
1496             # this may still leave an implicit 'html' at the top, but there's
1497             # nothing we can do about that, is there?
1498             }
1499 283 100 66     1327  
1500 283         447 $x->delete_ignorable_whitespace()
1501              
1502 283 50       459 # this's why we trap this -- an after-method
1503 283         620 if $x->{'_tighten'} and !$x->{'_ignore_text'};
1504             $x->{'_done'} = 1;
1505              
1506             return @rv if wantarray;
1507             return $rv[0];
1508             }
1509              
1510             #==========================================================================
1511 0     0 1 0  
1512 0         0 # TODO: document
1513 0         0  
1514             sub stunt {
1515 0 0       0 my $self = $_[0];
1516             print "Stunting the tree.\n" if DEBUG;
1517             $self->{'_done'} = 1;
1518 0         0  
1519 0         0 if ( $HTML::Parser::VERSION < 3 ) {
1520              
1521             #This is a MEAN MEAN HACK. And it works most of the time!
1522 0 0       0 $self->{'_buf'} = '';
1523 0         0 my $fh = *HTML::Parser::F{IO};
1524 0         0  
1525             # the local'd FH used by parse_file loop
1526             if ( defined $fh ) {
1527             print "Closing Parser's filehandle $fh\n" if DEBUG;
1528             close($fh);
1529             }
1530              
1531             # But if they called $tree->parse_file($filehandle)
1532             # or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO}
1533 0         0 # to close. Ahwell. Not a problem for most users these days.
1534              
1535             }
1536             else {
1537             $self->SUPER::eof();
1538              
1539             # Under 3+ versions, calling eof from inside a parse will abort the
1540             # parse / parse_file
1541 0         0 }
1542 0         0  
1543             # In the off chance that the above didn't work, we'll throw
1544             # this flag to make any future events be no-ops.
1545             $self->stunted(1);
1546 0     0 1 0 return;
1547 0     0 1 0 }
1548              
1549             # TODO: document
1550             sub stunted { shift->_elem( '_stunted', @_ ); }
1551             sub done { shift->_elem( '_done', @_ ); }
1552              
1553             #==========================================================================
1554              
1555             sub delete {
1556              
1557             # Override Element's delete method.
1558 250     250 1 70008 # This does most, if not all, of what Element's delete does anyway.
1559             # Deletes content, including content in some special attributes.
1560 250         342 # But doesn't empty out the hash.
  250         641  
1561 250         333  
1562 250 50       668 $_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct
1563              
1564             delete @{ $_[0] }{ '_body', '_head', '_pos' };
1565             for (
1566             @{ delete( $_[0]->{'_content'} ) || [] }, # all/any content
1567              
1568             # delete @{$_[0]}{'_body', '_head', '_pos'}
1569             # ...and these, in case these elements don't appear in the
1570             # content, which is possible. If they did appear (as they
1571             # usually do), then calling $_->delete on them again is harmless.
1572             # I don't think that's such a hot idea now. Thru creative reattachment,
1573             # those could actually now point to elements in OTHER trees (which we do
1574             # NOT want to delete!).
1575             ## Reasoned out:
1576             # If these point to elements not in the content list of any element in this
1577             # tree, but not in the content list of any element in any OTHER tree, then
1578             # just deleting these will make their refcounts hit zero.
1579             # If these point to elements in the content lists of elements in THIS tree,
1580             # then we'll get to deleting them when we delete from the top.
1581 500 50 33     2506 # If these point to elements in the content lists of elements in SOME OTHER
      33        
1582             # tree, then they're not to be deleted.
1583             )
1584             {
1585             $_->delete
1586 250 0 33     700 if defined $_ and ref $_ # Make sure it's an object.
1587             and $_ ne $_[0]; # And avoid hitting myself, just in case!
1588             }
1589              
1590 250         1606 $_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'};
1591              
1592             # An 'html' element having a parent is quite unlikely.
1593              
1594 0     0 1 0 return;
1595             }
1596              
1597             sub tighten_up { # legacy
1598             shift->delete_ignorable_whitespace(@_);
1599             }
1600 3     3 1 4  
1601 3         6 sub elementify {
1602 3         27  
1603             # Rebless this object down into the normal element class.
1604 3         14 my $self = $_[0];
1605 63 100 33     502 my $to_class = $self->element_class;
      66        
      66        
      100        
      100        
      100        
1606             delete @{$self}{
1607             grep {
1608             ;
1609             length $_ and substr( $_, 0, 1 ) eq '_'
1610              
1611             # The private attributes that we'll retain:
1612             and $_ ne '_tag'
1613             and $_ ne '_parent'
1614             and $_ ne '_content'
1615             and $_ ne '_implicit'
1616 3         12 and $_ ne '_pos'
1617             and $_ ne '_element_class'
1618             } keys %$self
1619             };
1620 4071 100   4071 1 7617 bless $self, $to_class; # Returns the same object we were fed
1621 3780   50     14145 }
1622              
1623             sub element_class {
1624             return 'HTML::Element' if not ref $_[0];
1625             return $_[0]->{_element_class} || 'HTML::Element';
1626             }
1627 0     0 1    
1628 0           #--------------------------------------------------------------------------
1629 0            
1630 0           sub guts {
1631 0           my @out;
1632 0           my @stack = ( $_[0] );
1633 0 0         my $destructive = $_[1];
    0          
1634 0           my $this;
1635             while (@stack) {
1636             $this = shift @stack;
1637 0           if ( !ref $this ) {
1638 0 0         push @out, $this; # yes, it can include text nodes
1639             }
1640             elsif ( !$this->{'_implicit'} ) {
1641             push @out, $this;
1642             delete $this->{'_parent'} if $destructive;
1643 0 0         }
1644             else {
1645              
1646 0           # it's an implicit node. Delete it and recurse
1647             delete $this->{'_parent'} if $destructive;
1648 0 0         unshift @stack,
    0          
1649             @{
1650             ( $destructive
1651             ? delete( $this->{'_content'} )
1652             : $this->{'_content'}
1653             )
1654             || []
1655             };
1656             }
1657             }
1658 0 0          
1659 0 0         # Doesn't call a real $root->delete on the (when implicit) root,
1660 0 0 0       # but I don't think it needs to.
1661 0            
1662 0           return @out if wantarray; # one simple normal case.
1663 0           return unless @out;
1664             return $out[0] if @out == 1 and ref( $out[0] );
1665             my $x = HTML::Element->new( 'div', '_implicit' => 1 );
1666 0     0 1   $x->push_content(@out);
1667             return $x;
1668             }
1669              
1670             sub disembowel { $_[0]->guts(1) }
1671              
1672             #--------------------------------------------------------------------------
1673             1;
1674              
1675             __END__