File Coverage

blib/lib/HTML/TagParser.pm
Criterion Covered Total %
statement 259 276 93.8
branch 104 134 77.6
condition 33 41 80.4
subroutine 33 36 91.6
pod 9 10 90.0
total 438 497 88.1


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             HTML::TagParser - Yet another HTML document parser with DOM-like methods
4              
5             =head1 SYNOPSIS
6              
7             Parse a HTML file and find its element's value. </td> </tr> <tr> <td class="h" > <a name="8">8</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="9">9</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> my $html = HTML::TagParser->new( "index-j.html" ); </td> </tr> <tr> <td class="h" > <a name="10">10</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> my $elem = $html->getElementsByTagName( "title" ); </td> </tr> <tr> <td class="h" > <a name="11">11</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> print "<title>", $elem->innerText(), "\n" if ref $elem;
12              
13             Parse a HTML source and find its first
attribute's value
14             and find all input elements belonging to this form.
15              
16             my $src = '...';
17             my $html = HTML::TagParser->new( $src );
18             my $elem = $html->getElementsByTagName( "form" );
19             print "
getAttribute("action"), "\">\n" if ref $elem;
20             my @first_inputs = $elem->subTree()->getElementsByTagName( "input" );
21             my $form = $first_inputs[0]->getParent();
22              
23             Fetch a HTML file via HTTP, and display its all elements and attributes.
24              
25             my $url = 'http://www.kawa.net/xp/index-e.html';
26             my $html = HTML::TagParser->new( $url );
27             my @list = $html->getElementsByTagName( "a" );
28             foreach my $elem ( @list ) {
29             my $tagname = $elem->tagName;
30             my $attr = $elem->attributes;
31             my $text = $elem->innerText;
32             print "<$tagname";
33             foreach my $key ( sort keys %$attr ) {
34             print " $key=\"$attr->{$key}\"";
35             }
36             if ( $text eq "" ) {
37             print " />\n";
38             } else {
39             print ">$text\n";
40             }
41             }
42              
43             =head1 DESCRIPTION
44              
45             HTML::TagParser is a pure Perl module which parses HTML/XHTML files.
46             This module provides some methods like DOM interface.
47             This module is not strict about XHTML format
48             because many of HTML pages are not strict.
49             You know, many pages use
elemtents instead of
50             and have

elements which are not closed.

51              
52             =head1 METHODS
53              
54             =head2 $html = HTML::TagParser->new();
55              
56             This method constructs an empty instance of the C class.
57              
58             =head2 $html = HTML::TagParser->new( $url );
59              
60             If new() is called with a URL,
61             this method fetches a HTML file from remote web server and parses it
62             and returns its instance.
63             L module is required to fetch a file.
64              
65             =head2 $html = HTML::TagParser->new( $file );
66              
67             If new() is called with a filename,
68             this method parses a local HTML file and returns its instance
69              
70             =head2 $html = HTML::TagParser->new( "...snip..." );
71              
72             If new() is called with a string of HTML source code,
73             this method parses it and returns its instance.
74              
75             =head2 $html->fetch( $url, %param );
76              
77             This method fetches a HTML file from remote web server and parse it.
78             The second argument is optional parameters for L module.
79              
80             =head2 $html->open( $file );
81              
82             This method parses a local HTML file.
83              
84             =head2 $html->parse( $source );
85              
86             This method parses a string of HTML source code.
87              
88             =head2 $elem = $html->getElementById( $id );
89              
90             This method returns the element which id attribute is $id.
91              
92             =head2 @elem = $html->getElementsByName( $name );
93              
94             This method returns an array of elements which name attribute is $name.
95             On scalar context, the first element is only retruned.
96              
97             =head2 @elem = $html->getElementsByTagName( $tagname );
98              
99             This method returns an array of elements which tagName is $tagName.
100             On scalar context, the first element is only retruned.
101              
102             =head2 @elem = $html->getElementsByClassName( $class );
103              
104             This method returns an array of elements which className is $tagName.
105             On scalar context, the first element is only retruned.
106              
107             =head2 @elem = $html->getElementsByAttribute( $attrname, $value );
108              
109             This method returns an array of elements which $attrname attribute's value is $value.
110             On scalar context, the first element is only retruned.
111              
112             =head1 HTML::TagParser::Element SUBCLASS
113              
114             =head2 $tagname = $elem->tagName();
115              
116             This method returns $elem's tagName.
117              
118             =head2 $text = $elem->id();
119              
120             This method returns $elem's id attribute.
121              
122             =head2 $text = $elem->innerText();
123              
124             This method returns $elem's innerText without tags.
125              
126             =head2 $subhtml = $elem->subTree();
127              
128             This method returns a new object of class HTML::Parser,
129             with all the elements that are in the DOM hierarchy under $elem.
130              
131             =head2 $elem = $elem->nextSibling();
132              
133             This method returns the next sibling within the same parent.
134             It returns undef when called on a closing tag or on the lastChild node
135             of a parentNode.
136              
137             =head2 $elem = $elem->previousSibling();
138              
139             This method returns the previous sibling within the same parent.
140             It returns undef when called on the firstChild node of a parentNode.
141              
142             =head2 $child_elem = $elem->firstChild();
143              
144             This method returns the first child node of $elem.
145             It returns undef when called on a closing tag element or on a
146             non-container or empty container element.
147              
148             =head2 $child_elems = $elem->childNodes();
149              
150             This method creates an array of all child nodes of $elem and returns the array by reference.
151             It returns an empty array-ref [] whenever firstChild() would return undef.
152              
153             =head2 $child_elem = $elem->lastChild();
154              
155             This method returns the last child node of $elem.
156             It returns undef whenever firstChild() would return undef.
157              
158             =head2 $parent = $elem->parentNode();
159              
160             This method returns the parent node of $elem.
161             It returns undef when called on root nodes.
162              
163             =head2 $attr = $elem->attributes();
164              
165             This method returns a hash of $elem's all attributes.
166              
167             =head2 $value = $elem->getAttribute( $key );
168              
169             This method returns the value of $elem's attributes which name is $key.
170              
171             =head1 BUGS
172              
173             The HTML-Parser is simple. Methods innerText and subTree may be
174             fooled by nested tags or embedded javascript code.
175              
176             The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results.
177             The most expensive ones are lastChild() and previousSibling().
178             parentNode() is also expensive, but only once. It does caching.
179              
180             The DOM tree is read-only, as this is just a parser.
181              
182             =head1 INTERNATIONALIZATION
183              
184             This module natively understands the character encoding used in document
185             by parsing its meta element.
186              
187            
188              
189             The parsed document's encoding is converted
190             as this class's fixed internal encoding "UTF-8".
191              
192             =head1 AUTHORS AND CONTRIBUTORS
193              
194             drry [drry]
195             Juergen Weigert [jnw]
196             Yusuke Kawasaki [kawasaki] [kawanet]
197             Tim Wilde [twilde]
198              
199             =head1 COPYRIGHT AND LICENSE
200              
201             The following copyright notice applies to all the files provided in this
202             distribution, including binary files, unless explicitly noted otherwise.
203              
204             Copyright 2006-2012 Yusuke Kawasaki
205              
206             This program is free software; you can redistribute it and/or
207             modify it under the same terms as Perl itself.
208              
209             =cut
210             # ----------------------------------------------------------------
211              
212             package HTML::TagParser;
213 16     16   682399 use 5.008_001;
  16         66  
  16         838  
214 16     16   96 use strict;
  16         36  
  16         1082  
215 16     16   22975 use Symbol ();
  16         33051  
  16         436  
216 16     16   126 use Carp ();
  16         26  
  16         854  
217 16     16   34656 use Encode ();
  16         427777  
  16         22745  
218              
219             our $VERSION = "0.20";
220              
221             my $SEC_OF_DAY = 60 * 60 * 24;
222              
223             # [000] '/' if closing tag.
224             # [001] tagName
225             # [002] attributes string (with trailing /, if self-closing tag).
226             # [003] content until next (nested) tag.
227             # [004] attributes hash cache.
228             # [005] innerText combined strings cache.
229             # [006] index of matching closing tag (or opening tag, if [000]=='/')
230             # [007] index of parent (aka container) tag.
231             #
232             sub new {
233 26     26 1 1403953 my $package = shift;
234 26         56 my $src = shift;
235 26         63 my $self = {};
236 26         73 bless $self, $package;
237 26 100       102 return $self unless defined $src;
238              
239 19 100 66     569 if ( $src =~ m#^https?://\w# ) {
    100          
    50          
240 1         7 $self->fetch( $src, @_ );
241             }
242             elsif ( $src !~ m#[<>|]# && -f $src ) {
243 12         47 $self->open($src);
244             }
245             elsif ( $src =~ /<.*>/ ) {
246 6         33 $self->parse($src);
247             }
248              
249 19         92 $self;
250             }
251              
252             sub fetch {
253 2     2 1 8 my $self = shift;
254 2         6 my $url = shift;
255 2 50       8 if ( !defined $URI::Fetch::VERSION ) {
256 0         0 local $@;
257 0         0 eval { require URI::Fetch; };
  0         0  
258 0 0       0 Carp::croak "URI::Fetch is required: $url" if $@;
259             }
260 2         16 my $res = URI::Fetch->fetch( $url, @_ );
261 2 50       788652 Carp::croak "URI::Fetch failed: $url" unless ref $res;
262 2 50       12 return if $res->is_error();
263 2         70 $self->{modified} = $res->last_modified();
264 2         29 my $text = $res->content();
265 2         23 $self->parse( \$text );
266             }
267              
268             sub open {
269 13     13 1 21 my $self = shift;
270 13         27 my $file = shift;
271 13         45 my $text = HTML::TagParser::Util::read_text_file($file);
272 13 50       47 return unless defined $text;
273 13         223 my $epoch = ( time() - ( -M $file ) * $SEC_OF_DAY );
274 13         28 $epoch -= $epoch % 60;
275 13         63 $self->{modified} = $epoch;
276 13         50 $self->parse( \$text );
277             }
278              
279             sub parse {
280 25     25 1 66 my $self = shift;
281 25         49 my $text = shift;
282 25 100       111 my $txtref = ref $text ? $text : \$text;
283              
284 25         112 my $charset = HTML::TagParser::Util::find_meta_charset($txtref);
285 25   66     218 $self->{charset} ||= $charset;
286 25 100 100     136 if ($charset && Encode::find_encoding($charset)) {
287 11         40530 HTML::TagParser::Util::encode_from_to( $txtref, $charset, "utf-8" );
288             }
289 25         665 my $flat = HTML::TagParser::Util::html_to_flat($txtref);
290 25 50       105 Carp::croak "Null HTML document." unless scalar @$flat;
291 25         120 $self->{flat} = $flat;
292 25         192 scalar @$flat;
293             }
294              
295             sub getElementsByTagName {
296 35     35 1 25397 my $self = shift;
297 35         96 my $tagname = lc(shift);
298              
299 35         104 my $flat = $self->{flat};
300 35         73 my $out = [];
301 35         234 for( my $i = 0 ; $i <= $#$flat ; $i++ ) {
302 4450 100       12869 next if ( $flat->[$i]->[001] ne $tagname );
303 79 100       192 next if $flat->[$i]->[000]; # close
304 75         335 my $elem = HTML::TagParser::Element->new( $flat, $i );
305 75 100       251 return $elem unless wantarray;
306 49         136 push( @$out, $elem );
307             }
308 9 50       28 return unless wantarray;
309 9         53 @$out;
310             }
311              
312             sub getElementsByAttribute {
313 34     34 1 61 my $self = shift;
314 34         962 my $key = lc(shift);
315 34         49 my $val = shift;
316              
317 34         71 my $flat = $self->{flat};
318 34         66 my $out = [];
319 34         128 for ( my $i = 0 ; $i <= $#$flat ; $i++ ) {
320 5306 100       16376 next if $flat->[$i]->[000]; # close
321 2965         6794 my $elem = HTML::TagParser::Element->new( $flat, $i );
322 2965         12229 my $attr = $elem->attributes();
323 2965 100       13454 next unless exists $attr->{$key};
324 635 100       3021 next if ( $attr->{$key} ne $val );
325 85 100       413 return $elem unless wantarray;
326 57         198 push( @$out, $elem );
327             }
328 6 50       61 return unless wantarray;
329 6         107 @$out;
330             }
331              
332             sub getElementsByClassName {
333 5     5 1 13 my $self = shift;
334 5         11 my $class = shift;
335 5         18 return $self->getElementsByAttribute( "class", $class );
336             }
337              
338             sub getElementsByName {
339 5     5 1 107 my $self = shift;
340 5         11 my $name = shift;
341 5         18 return $self->getElementsByAttribute( "name", $name );
342             }
343              
344             sub getElementById {
345 15     15 1 3249 my $self = shift;
346 15         1025 my $id = shift;
347 15         66 return scalar $self->getElementsByAttribute( "id", $id );
348             }
349              
350             sub modified {
351 0     0 0 0 $_[0]->{modified};
352             }
353              
354             # ----------------------------------------------------------------
355              
356             package HTML::TagParser::Element;
357 16     16   181 use strict;
  16         44  
  16         42861  
358              
359             sub new {
360 3052     3052   4065 my $package = shift;
361 3052         6142 my $self = [@_];
362 3052         14165 bless $self, $package;
363 3052         9912 $self;
364             }
365              
366             sub tagName {
367 6     6   33 my $self = shift;
368 6         15 my ( $flat, $cur ) = @$self;
369 6         39 return $flat->[$cur]->[001];
370             }
371              
372             sub id {
373 2     2   3 my $self = shift;
374 2         13 $self->getAttribute("id");
375             }
376              
377             sub getAttribute {
378 106     106   68545 my $self = shift;
379 106         214 my $name = lc(shift);
380 106         264 my $attr = $self->attributes();
381 106 100       288 return unless exists $attr->{$name};
382 96         524 $attr->{$name};
383             }
384              
385             sub innerText {
386 43     43   5938 my $self = shift;
387 43         140 my ( $flat, $cur ) = @$self;
388 43         83 my $elem = $flat->[$cur];
389 43 100       180 return $elem->[005] if defined $elem->[005]; # cache
390 36 50       99 return if $elem->[000]; #
391 36 50 66     1396 return if ( defined $elem->[002] && $elem->[002] =~ m#/$# ); #
392              
393 36         76 my $tagname = $elem->[001];
394 36         201 my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
395 36         89 my $list = [];
396 36         114 for ( ; $cur < $closing ; $cur++ ) {
397 307         891 push( @$list, $flat->[$cur]->[003] );
398             }
399 36         86 my $text = join( "", grep { $_ ne "" } @$list );
  307         578  
400 36         765 $text =~ s/^\s+|\s+$//sg;
401             # $text = "" if ( $cur == $#$flat ); # end of source
402 36         186 $elem->[005] = HTML::TagParser::Util::xml_unescape( $text );
403             }
404              
405             sub subTree
406             {
407 0     0   0 my $self = shift;
408 0         0 my ( $flat, $cur ) = @$self;
409 0         0 my $elem = $flat->[$cur];
410 0 0       0 return if $elem->[000]; #
411 0         0 my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
412 0         0 my $list = [];
413 0         0 while (++$cur < $closing)
414             {
415 0         0 push @$list, $flat->[$cur];
416             }
417              
418             # allow the getElement...() methods on the returned object.
419 0         0 return bless { flat => $list }, 'HTML::TagParser';
420             }
421              
422              
423             sub nextSibling
424             {
425 4     4   8 my $self = shift;
426 4         8 my ( $flat, $cur ) = @$self;
427 4         6 my $elem = $flat->[$cur];
428              
429 4 50       11 return undef if $elem->[000]; #
430 4         7 my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
431 4         9 my $next_s = $flat->[$closing+1];
432 4 50       10 return undef unless $next_s;
433 4 100       15 return undef if $next_s->[000]; # parent's
434 2         7 return HTML::TagParser::Element->new( $flat, $closing+1 );
435             }
436              
437             sub firstChild
438             {
439 1     1   2 my $self = shift;
440 1         2 my ( $flat, $cur ) = @$self;
441 1         2 my $elem = $flat->[$cur];
442 1 50       19 return undef if $elem->[000]; #
443 1         4 my $closing = HTML::TagParser::Util::find_closing($flat, $cur);
444 1 50       6 return undef if $closing <= $cur+1; # no children here.
445 1         5 return HTML::TagParser::Element->new( $flat, $cur+1 );
446             }
447              
448             sub childNodes
449             {
450 1     1   2 my $self = shift;
451 1         4 my ( $flat, $cur ) = @$self;
452 1         6 my $child = firstChild($self);
453 1 50       5 return [] unless $child; # an empty array is easier for our callers than undef
454 1         3 my @c = ( $child );
455 1         4 while (defined ($child = nextSibling($child)))
456             {
457 1         3 push @c, $child;
458             }
459 1         4 return \@c;
460             }
461              
462             sub lastChild
463             {
464 0     0   0 my $c = childNodes(@_);
465 0 0       0 return undef unless $c->[0];
466 0         0 return $c->[-1];
467             }
468              
469             sub previousSibling
470             {
471 7     7   7 my $self = shift;
472 7         10 my ( $flat, $cur ) = @$self;
473              
474             ## This one is expensive.
475             ## We use find_closing() which walks forward.
476             ## We'd need a find_opening() which walks backwards.
477             ## So we walk backwards one by one and consult find_closing()
478             ## until we find $cur-1 or $cur.
479              
480 7         8 my $idx = $cur-1;
481 7         15 while ($idx >= 0)
482             {
483 16 100 100     49 if ($flat->[$idx][000] && defined($flat->[$idx][006]))
484             {
485 2         3 $idx = $flat->[$idx][006]; # use cache for backwards skipping
486 2         39 next;
487             }
488              
489 14         26 my $closing = HTML::TagParser::Util::find_closing($flat, $idx);
490 14 100 66     81 return HTML::TagParser::Element->new( $flat, $idx )
      33        
491             if defined $closing and ($closing == $cur || $closing == $cur-1);
492 12         25 $idx--;
493             }
494 5         20 return undef;
495             }
496              
497             sub parentNode
498             {
499 8     8   621 my $self = shift;
500 8         10 my ( $flat, $cur ) = @$self;
501              
502 8 100       26 return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache
503              
504             ##
505             ## This one is very expensive.
506             ## We use previousSibling() to walk backwards, and
507             ## previousSibling() is expensive.
508             ##
509 5         5 my $ps = $self;
510 5         6 my $first = $self;
511              
512 5         20 while (defined($ps = previousSibling($ps))) { $first = $ps; }
  2         4  
513              
514 5         7 my $parent = $first->[1] - 1;
515 5 100       13 return undef if $parent < 0;
516 4 50       8 die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur;
517              
518 4         8 $flat->[$cur][007] = $parent; # cache
519 4         10 return HTML::TagParser::Element->new( $flat, $parent )
520             }
521              
522             ##
523             ## feature:
524             ## self-closing tags have an additional attribute '/' => '/'.
525             ##
526             sub attributes {
527 3071     3071   3190 my $self = shift;
528 3071         4877 my ( $flat, $cur ) = @$self;
529 3071         4049 my $elem = $flat->[$cur];
530 3071 100       10154 return $elem->[004] if ref $elem->[004]; # cache
531 1666 100       5524 return unless defined $elem->[002];
532 811         1129 my $attr = {};
533 811         19303 while ( $elem->[002] =~ m{
534             ([^\s="']+)(\s*=\s*(?:["']((?(?<=")(?:\\"|[^"])*?|(?:\\'|[^'])*?))["']|([^'"\s=]+)['"]*))?
535             }sgx ) {
536 1994         5730 my $key = $1;
537 1994         3888 my $test = $2;
538 1994   66     9916 my $val = $3 || $4;
539 1994         4059 my $lckey = lc($key);
540 1994 100       22106 if ($test) {
541 1719         2433 $key =~ tr/A-Z/a-z/;
542 1719         5784 $val = HTML::TagParser::Util::xml_unescape( $val );
543 1719         17542 $attr->{$lckey} = $val;
544             }
545             else {
546 275         1670 $attr->{$lckey} = $key;
547             }
548             }
549 811         1703 $elem->[004] = $attr; # cache
550 811         2882 $attr;
551             }
552              
553             # ----------------------------------------------------------------
554              
555             package HTML::TagParser::Util;
556 16     16   147 use strict;
  16         34  
  16         28142  
557              
558             sub xml_unescape {
559 1755     1755   3088 my $str = shift;
560 1755 100       4071 return unless defined $str;
561 1568         2674 $str =~ s/"/"/g;
562 1568         1823 $str =~ s/</
563 1568         2305 $str =~ s/>/>/g;
564 1568         10175 $str =~ s/&/&/g;
565 1568         3970 $str;
566             }
567              
568             sub read_text_file {
569 13     13   22 my $file = shift;
570 13         56 my $fh = Symbol::gensym();
571 13 50       1320 open( $fh, $file ) or Carp::croak "$! - $file\n";
572 13         125 local $/ = undef;
573 13         757 my $text = <$fh>;
574 13         148 close($fh);
575 13         81 $text;
576             }
577              
578             sub html_to_flat {
579 25     25   45 my $txtref = shift; # reference
580 25         56 my $flat = [];
581 25         86 pos($$txtref) = undef; # reset matching position
582 25         294 while ( $$txtref =~ m{
583             (?:[^<]*) < (?:
584             ( / )? ( [^/!<>\s"'=]+ )
585             ( (?:"[^"]*"|'[^']*'|[^"'<>])+ )?
586             |
587             (!-- .*? -- | ![^\-] .*? )
588             ) > ([^<]*)
589             }sxg ) {
590             # [000] $1 close
591             # [001] $2 tagName
592             # [002] $3 attributes
593             # $4 comment element
594             # [003] $5 content
595 3914 100       10500 next if defined $4;
596 3868         19774 my $array = [ $1, $2, $3, $5 ];
597 3868         11232 $array->[001] =~ tr/A-Z/a-z/;
598             # $array->[003] =~ s/^\s+//s;
599             # $array->[003] =~ s/\s+$//s;
600 3868         87087 push( @$flat, $array );
601             }
602 25         90 $flat;
603             }
604              
605             ## returns 1 beyond the end, if not found.
606             ## returns undef if called on a closing tag
607             sub find_closing
608             {
609 59     59   88 my ($flat, $cur) = @_;
610              
611 59 100       194 return $flat->[$cur][006] if $flat->[$cur][006]; # cache
612 44 100 100     270 return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$}); # self-closing
613              
614 43         91 my $name = $flat->[$cur][001];
615 43         53 my $pre_nest = 0;
616             ## count how many levels deep this type of tag is nested.
617 43         150 my $idx;
618 43         138 for ($idx = 0; $idx <= $cur; $idx++)
619             {
620 3318         6265 my $e = $flat->[$idx];
621 3318 100       8804 next unless $e->[001] eq $name;
622 278 50 100     1014 next if (($e->[002]||'') =~ m{/$}); # self-closing
623 278 100       4882 $pre_nest += ($e->[000]) ? -1 : 1;
624 278 100       625 $pre_nest = 0 if $pre_nest < 0;
625 278 100 100     1322 $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward.
626             }
627 43         77 my $last_idx = $#$flat;
628              
629             ## we move last_idx closer, in case this container
630             ## has not all its subcontainers closed properly.
631 43         72 my $post_nest = 0;
632 43         151 for ($idx = $last_idx; $idx > $cur; $idx--)
633             {
634 2526         3198 my $e = $flat->[$idx];
635 2526 100       14167 next unless $e->[001] eq $name;
636 45         84 $last_idx = $idx-1; # remember where a matching tag was
637 45 50 100     362 next if (($e->[002]||'') =~ m{/$}); # self-closing
638 45 100       136 $post_nest -= ($e->[000]) ? -1 : 1;
639 45 100       146 $post_nest = 0 if $post_nest < 0;
640 45 100       141 last if $pre_nest <= $post_nest;
641 7 50 66     37 $idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards.
642             }
643              
644 43         68 my $nest = 1; # we know it is not self-closing. start behind.
645              
646 43         140 for ($idx = $cur+1; $idx <= $last_idx; $idx++)
647             {
648 316         310 my $e = $flat->[$idx];
649 316 100       938 next unless $e->[001] eq $name;
650 17 100 100     116 next if (($e->[002]||'') =~ m{/$}); # self-closing
651 14 100       42 $nest += ($e->[000]) ? -1 : 1;
652 14 100       37 if ($nest <= 0)
653             {
654 6 50       19 die "assert " unless $e->[000];
655 6         16 $e->[006] = $cur; # point back to opening tag
656 6         23 return $flat->[$cur][006] = $idx;
657             }
658 8 100 100     106 $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward.
659             }
660              
661             # not all closed, but cannot go further
662 37         153 return $flat->[$cur][006] = $last_idx+1;
663             }
664              
665             sub find_meta_charset {
666 25     25   50 my $txtref = shift; # reference
667 25         368 while ( $$txtref =~ m{
668             ]+\s )? http-equiv\s*=\s*['"]?Content-Type [^>]+ ) >
669             }sxgi ) {
670 12         37 my $args = $1;
671 12 50       99 return $1 if ( $args =~ m# charset=['"]?([^'"\s/]+) #sxgi );
672             }
673 13         29 undef;
674             }
675              
676             sub encode_from_to {
677 11     11   37 my ( $txtref, $from, $to ) = @_;
678 11 50       164 return if ( $from eq "" );
679 11 50       38 return if ( $to eq "" );
680 11 100       46 return $to if ( uc($from) eq uc($to) );
681 7         54 Encode::from_to( $$txtref, $from, $to, Encode::XMLCREF() );
682 7         5292 return $to;
683             }
684              
685             # ----------------------------------------------------------------
686             1;
687             # ----------------------------------------------------------------