| blib/lib/HTML/TagParser.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 259 | 276 | 93.8 |
| branch | 104 | 134 | 77.6 |
| condition | 33 | 41 | 80.4 |
| subroutine | 33 | 36 | 91.6 |
| pod | 9 | 10 | 90.0 |
| total | 438 | 497 | 88.1 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | =head1 NAME | ||||||
| 2 | |||||||
| 3 | HTML::TagParser - Yet another HTML document parser with DOM-like methods | ||||||
| 4 | |||||||
| 5 | =head1 SYNOPSIS | ||||||
| 6 | |||||||
| 7 | Parse a HTML file and find its |
||||||
| 8 | |||||||
| 9 | my $html = HTML::TagParser->new( "index-j.html" ); | ||||||
| 10 | my $elem = $html->getElementsByTagName( "title" ); | ||||||
| 11 | print " |
||||||
| 12 | |||||||
| 13 | Parse a HTML source and find its first | ||||||
| 14 | and find all input elements belonging to this form. | ||||||
| 15 | |||||||
| 16 | my $src = ''; | ||||||
| 17 | my $html = HTML::TagParser->new( $src ); | ||||||
| 18 | my $elem = $html->getElementsByTagName( "form" ); | ||||||
| 19 | print " | ||||||
| 20 | my @first_inputs = $elem->subTree()->getElementsByTagName( "input" ); | ||||||
| 21 | my $form = $first_inputs[0]->getParent(); | ||||||
| 22 | |||||||
| 23 | Fetch a HTML file via HTTP, and display its all elements and attributes. | ||||||
| 24 | |||||||
| 25 | my $url = 'http://www.kawa.net/xp/index-e.html'; | ||||||
| 26 | my $html = HTML::TagParser->new( $url ); | ||||||
| 27 | my @list = $html->getElementsByTagName( "a" ); | ||||||
| 28 | foreach my $elem ( @list ) { | ||||||
| 29 | my $tagname = $elem->tagName; | ||||||
| 30 | my $attr = $elem->attributes; | ||||||
| 31 | my $text = $elem->innerText; | ||||||
| 32 | print "<$tagname"; | ||||||
| 33 | foreach my $key ( sort keys %$attr ) { | ||||||
| 34 | print " $key=\"$attr->{$key}\""; | ||||||
| 35 | } | ||||||
| 36 | if ( $text eq "" ) { | ||||||
| 37 | print " />\n"; | ||||||
| 38 | } else { | ||||||
| 39 | print ">$text$tagname>\n"; | ||||||
| 40 | } | ||||||
| 41 | } | ||||||
| 42 | |||||||
| 43 | =head1 DESCRIPTION | ||||||
| 44 | |||||||
| 45 | HTML::TagParser is a pure Perl module which parses HTML/XHTML files. | ||||||
| 46 | This module provides some methods like DOM interface. | ||||||
| 47 | This module is not strict about XHTML format | ||||||
| 48 | because many of HTML pages are not strict. | ||||||
| 49 | You know, many pages use elemtents instead of |
||||||
| 50 | and have elements which are not closed. |
||||||
| 51 | |||||||
| 52 | =head1 METHODS | ||||||
| 53 | |||||||
| 54 | =head2 $html = HTML::TagParser->new(); | ||||||
| 55 | |||||||
| 56 | This method constructs an empty instance of the C |
||||||
| 57 | |||||||
| 58 | =head2 $html = HTML::TagParser->new( $url ); | ||||||
| 59 | |||||||
| 60 | If new() is called with a URL, | ||||||
| 61 | this method fetches a HTML file from remote web server and parses it | ||||||
| 62 | and returns its instance. | ||||||
| 63 | L |
||||||
| 64 | |||||||
| 65 | =head2 $html = HTML::TagParser->new( $file ); | ||||||
| 66 | |||||||
| 67 | If new() is called with a filename, | ||||||
| 68 | this method parses a local HTML file and returns its instance | ||||||
| 69 | |||||||
| 70 | =head2 $html = HTML::TagParser->new( "...snip..." ); | ||||||
| 71 | |||||||
| 72 | If new() is called with a string of HTML source code, | ||||||
| 73 | this method parses it and returns its instance. | ||||||
| 74 | |||||||
| 75 | =head2 $html->fetch( $url, %param ); | ||||||
| 76 | |||||||
| 77 | This method fetches a HTML file from remote web server and parse it. | ||||||
| 78 | The second argument is optional parameters for L |
||||||
| 79 | |||||||
| 80 | =head2 $html->open( $file ); | ||||||
| 81 | |||||||
| 82 | This method parses a local HTML file. | ||||||
| 83 | |||||||
| 84 | =head2 $html->parse( $source ); | ||||||
| 85 | |||||||
| 86 | This method parses a string of HTML source code. | ||||||
| 87 | |||||||
| 88 | =head2 $elem = $html->getElementById( $id ); | ||||||
| 89 | |||||||
| 90 | This method returns the element which id attribute is $id. | ||||||
| 91 | |||||||
| 92 | =head2 @elem = $html->getElementsByName( $name ); | ||||||
| 93 | |||||||
| 94 | This method returns an array of elements which name attribute is $name. | ||||||
| 95 | On scalar context, the first element is only retruned. | ||||||
| 96 | |||||||
| 97 | =head2 @elem = $html->getElementsByTagName( $tagname ); | ||||||
| 98 | |||||||
| 99 | This method returns an array of elements which tagName is $tagName. | ||||||
| 100 | On scalar context, the first element is only retruned. | ||||||
| 101 | |||||||
| 102 | =head2 @elem = $html->getElementsByClassName( $class ); | ||||||
| 103 | |||||||
| 104 | This method returns an array of elements which className is $tagName. | ||||||
| 105 | On scalar context, the first element is only retruned. | ||||||
| 106 | |||||||
| 107 | =head2 @elem = $html->getElementsByAttribute( $attrname, $value ); | ||||||
| 108 | |||||||
| 109 | This method returns an array of elements which $attrname attribute's value is $value. | ||||||
| 110 | On scalar context, the first element is only retruned. | ||||||
| 111 | |||||||
| 112 | =head1 HTML::TagParser::Element SUBCLASS | ||||||
| 113 | |||||||
| 114 | =head2 $tagname = $elem->tagName(); | ||||||
| 115 | |||||||
| 116 | This method returns $elem's tagName. | ||||||
| 117 | |||||||
| 118 | =head2 $text = $elem->id(); | ||||||
| 119 | |||||||
| 120 | This method returns $elem's id attribute. | ||||||
| 121 | |||||||
| 122 | =head2 $text = $elem->innerText(); | ||||||
| 123 | |||||||
| 124 | This method returns $elem's innerText without tags. | ||||||
| 125 | |||||||
| 126 | =head2 $subhtml = $elem->subTree(); | ||||||
| 127 | |||||||
| 128 | This method returns a new object of class HTML::Parser, | ||||||
| 129 | with all the elements that are in the DOM hierarchy under $elem. | ||||||
| 130 | |||||||
| 131 | =head2 $elem = $elem->nextSibling(); | ||||||
| 132 | |||||||
| 133 | This method returns the next sibling within the same parent. | ||||||
| 134 | It returns undef when called on a closing tag or on the lastChild node | ||||||
| 135 | of a parentNode. | ||||||
| 136 | |||||||
| 137 | =head2 $elem = $elem->previousSibling(); | ||||||
| 138 | |||||||
| 139 | This method returns the previous sibling within the same parent. | ||||||
| 140 | It returns undef when called on the firstChild node of a parentNode. | ||||||
| 141 | |||||||
| 142 | =head2 $child_elem = $elem->firstChild(); | ||||||
| 143 | |||||||
| 144 | This method returns the first child node of $elem. | ||||||
| 145 | It returns undef when called on a closing tag element or on a | ||||||
| 146 | non-container or empty container element. | ||||||
| 147 | |||||||
| 148 | =head2 $child_elems = $elem->childNodes(); | ||||||
| 149 | |||||||
| 150 | This method creates an array of all child nodes of $elem and returns the array by reference. | ||||||
| 151 | It returns an empty array-ref [] whenever firstChild() would return undef. | ||||||
| 152 | |||||||
| 153 | =head2 $child_elem = $elem->lastChild(); | ||||||
| 154 | |||||||
| 155 | This method returns the last child node of $elem. | ||||||
| 156 | It returns undef whenever firstChild() would return undef. | ||||||
| 157 | |||||||
| 158 | =head2 $parent = $elem->parentNode(); | ||||||
| 159 | |||||||
| 160 | This method returns the parent node of $elem. | ||||||
| 161 | It returns undef when called on root nodes. | ||||||
| 162 | |||||||
| 163 | =head2 $attr = $elem->attributes(); | ||||||
| 164 | |||||||
| 165 | This method returns a hash of $elem's all attributes. | ||||||
| 166 | |||||||
| 167 | =head2 $value = $elem->getAttribute( $key ); | ||||||
| 168 | |||||||
| 169 | This method returns the value of $elem's attributes which name is $key. | ||||||
| 170 | |||||||
| 171 | =head1 BUGS | ||||||
| 172 | |||||||
| 173 | The HTML-Parser is simple. Methods innerText and subTree may be | ||||||
| 174 | fooled by nested tags or embedded javascript code. | ||||||
| 175 | |||||||
| 176 | The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results. | ||||||
| 177 | The most expensive ones are lastChild() and previousSibling(). | ||||||
| 178 | parentNode() is also expensive, but only once. It does caching. | ||||||
| 179 | |||||||
| 180 | The DOM tree is read-only, as this is just a parser. | ||||||
| 181 | |||||||
| 182 | =head1 INTERNATIONALIZATION | ||||||
| 183 | |||||||
| 184 | This module natively understands the character encoding used in document | ||||||
| 185 | by parsing its meta element. | ||||||
| 186 | |||||||
| 187 | |||||||
| 188 | |||||||
| 189 | The parsed document's encoding is converted | ||||||
| 190 | as this class's fixed internal encoding "UTF-8". | ||||||
| 191 | |||||||
| 192 | =head1 AUTHORS AND CONTRIBUTORS | ||||||
| 193 | |||||||
| 194 | drry [drry] | ||||||
| 195 | Juergen Weigert [jnw] | ||||||
| 196 | Yusuke Kawasaki [kawasaki] [kawanet] | ||||||
| 197 | Tim Wilde [twilde] | ||||||
| 198 | |||||||
| 199 | =head1 COPYRIGHT AND LICENSE | ||||||
| 200 | |||||||
| 201 | The following copyright notice applies to all the files provided in this | ||||||
| 202 | distribution, including binary files, unless explicitly noted otherwise. | ||||||
| 203 | |||||||
| 204 | Copyright 2006-2012 Yusuke Kawasaki | ||||||
| 205 | |||||||
| 206 | This program is free software; you can redistribute it and/or | ||||||
| 207 | modify it under the same terms as Perl itself. | ||||||
| 208 | |||||||
| 209 | =cut | ||||||
| 210 | # ---------------------------------------------------------------- | ||||||
| 211 | |||||||
| 212 | package HTML::TagParser; | ||||||
| 213 | 16 | 16 | 682399 | use 5.008_001; | |||
| 16 | 66 | ||||||
| 16 | 838 | ||||||
| 214 | 16 | 16 | 96 | use strict; | |||
| 16 | 36 | ||||||
| 16 | 1082 | ||||||
| 215 | 16 | 16 | 22975 | use Symbol (); | |||
| 16 | 33051 | ||||||
| 16 | 436 | ||||||
| 216 | 16 | 16 | 126 | use Carp (); | |||
| 16 | 26 | ||||||
| 16 | 854 | ||||||
| 217 | 16 | 16 | 34656 | use Encode (); | |||
| 16 | 427777 | ||||||
| 16 | 22745 | ||||||
| 218 | |||||||
| 219 | our $VERSION = "0.20"; | ||||||
| 220 | |||||||
| 221 | my $SEC_OF_DAY = 60 * 60 * 24; | ||||||
| 222 | |||||||
| 223 | # [000] '/' if closing tag. | ||||||
| 224 | # [001] tagName | ||||||
| 225 | # [002] attributes string (with trailing /, if self-closing tag). | ||||||
| 226 | # [003] content until next (nested) tag. | ||||||
| 227 | # [004] attributes hash cache. | ||||||
| 228 | # [005] innerText combined strings cache. | ||||||
| 229 | # [006] index of matching closing tag (or opening tag, if [000]=='/') | ||||||
| 230 | # [007] index of parent (aka container) tag. | ||||||
| 231 | # | ||||||
| 232 | sub new { | ||||||
| 233 | 26 | 26 | 1 | 1403953 | my $package = shift; | ||
| 234 | 26 | 56 | my $src = shift; | ||||
| 235 | 26 | 63 | my $self = {}; | ||||
| 236 | 26 | 73 | bless $self, $package; | ||||
| 237 | 26 | 100 | 102 | return $self unless defined $src; | |||
| 238 | |||||||
| 239 | 19 | 100 | 66 | 569 | if ( $src =~ m#^https?://\w# ) { | ||
| 100 | |||||||
| 50 | |||||||
| 240 | 1 | 7 | $self->fetch( $src, @_ ); | ||||
| 241 | } | ||||||
| 242 | elsif ( $src !~ m#[<>|]# && -f $src ) { | ||||||
| 243 | 12 | 47 | $self->open($src); | ||||
| 244 | } | ||||||
| 245 | elsif ( $src =~ /<.*>/ ) { | ||||||
| 246 | 6 | 33 | $self->parse($src); | ||||
| 247 | } | ||||||
| 248 | |||||||
| 249 | 19 | 92 | $self; | ||||
| 250 | } | ||||||
| 251 | |||||||
| 252 | sub fetch { | ||||||
| 253 | 2 | 2 | 1 | 8 | my $self = shift; | ||
| 254 | 2 | 6 | my $url = shift; | ||||
| 255 | 2 | 50 | 8 | if ( !defined $URI::Fetch::VERSION ) { | |||
| 256 | 0 | 0 | local $@; | ||||
| 257 | 0 | 0 | eval { require URI::Fetch; }; | ||||
| 0 | 0 | ||||||
| 258 | 0 | 0 | 0 | Carp::croak "URI::Fetch is required: $url" if $@; | |||
| 259 | } | ||||||
| 260 | 2 | 16 | my $res = URI::Fetch->fetch( $url, @_ ); | ||||
| 261 | 2 | 50 | 788652 | Carp::croak "URI::Fetch failed: $url" unless ref $res; | |||
| 262 | 2 | 50 | 12 | return if $res->is_error(); | |||
| 263 | 2 | 70 | $self->{modified} = $res->last_modified(); | ||||
| 264 | 2 | 29 | my $text = $res->content(); | ||||
| 265 | 2 | 23 | $self->parse( \$text ); | ||||
| 266 | } | ||||||
| 267 | |||||||
| 268 | sub open { | ||||||
| 269 | 13 | 13 | 1 | 21 | my $self = shift; | ||
| 270 | 13 | 27 | my $file = shift; | ||||
| 271 | 13 | 45 | my $text = HTML::TagParser::Util::read_text_file($file); | ||||
| 272 | 13 | 50 | 47 | return unless defined $text; | |||
| 273 | 13 | 223 | my $epoch = ( time() - ( -M $file ) * $SEC_OF_DAY ); | ||||
| 274 | 13 | 28 | $epoch -= $epoch % 60; | ||||
| 275 | 13 | 63 | $self->{modified} = $epoch; | ||||
| 276 | 13 | 50 | $self->parse( \$text ); | ||||
| 277 | } | ||||||
| 278 | |||||||
| 279 | sub parse { | ||||||
| 280 | 25 | 25 | 1 | 66 | my $self = shift; | ||
| 281 | 25 | 49 | my $text = shift; | ||||
| 282 | 25 | 100 | 111 | my $txtref = ref $text ? $text : \$text; | |||
| 283 | |||||||
| 284 | 25 | 112 | my $charset = HTML::TagParser::Util::find_meta_charset($txtref); | ||||
| 285 | 25 | 66 | 218 | $self->{charset} ||= $charset; | |||
| 286 | 25 | 100 | 100 | 136 | if ($charset && Encode::find_encoding($charset)) { | ||
| 287 | 11 | 40530 | HTML::TagParser::Util::encode_from_to( $txtref, $charset, "utf-8" ); | ||||
| 288 | } | ||||||
| 289 | 25 | 665 | my $flat = HTML::TagParser::Util::html_to_flat($txtref); | ||||
| 290 | 25 | 50 | 105 | Carp::croak "Null HTML document." unless scalar @$flat; | |||
| 291 | 25 | 120 | $self->{flat} = $flat; | ||||
| 292 | 25 | 192 | scalar @$flat; | ||||
| 293 | } | ||||||
| 294 | |||||||
| 295 | sub getElementsByTagName { | ||||||
| 296 | 35 | 35 | 1 | 25397 | my $self = shift; | ||
| 297 | 35 | 96 | my $tagname = lc(shift); | ||||
| 298 | |||||||
| 299 | 35 | 104 | my $flat = $self->{flat}; | ||||
| 300 | 35 | 73 | my $out = []; | ||||
| 301 | 35 | 234 | for( my $i = 0 ; $i <= $#$flat ; $i++ ) { | ||||
| 302 | 4450 | 100 | 12869 | next if ( $flat->[$i]->[001] ne $tagname ); | |||
| 303 | 79 | 100 | 192 | next if $flat->[$i]->[000]; # close | |||
| 304 | 75 | 335 | my $elem = HTML::TagParser::Element->new( $flat, $i ); | ||||
| 305 | 75 | 100 | 251 | return $elem unless wantarray; | |||
| 306 | 49 | 136 | push( @$out, $elem ); | ||||
| 307 | } | ||||||
| 308 | 9 | 50 | 28 | return unless wantarray; | |||
| 309 | 9 | 53 | @$out; | ||||
| 310 | } | ||||||
| 311 | |||||||
| 312 | sub getElementsByAttribute { | ||||||
| 313 | 34 | 34 | 1 | 61 | my $self = shift; | ||
| 314 | 34 | 962 | my $key = lc(shift); | ||||
| 315 | 34 | 49 | my $val = shift; | ||||
| 316 | |||||||
| 317 | 34 | 71 | my $flat = $self->{flat}; | ||||
| 318 | 34 | 66 | my $out = []; | ||||
| 319 | 34 | 128 | for ( my $i = 0 ; $i <= $#$flat ; $i++ ) { | ||||
| 320 | 5306 | 100 | 16376 | next if $flat->[$i]->[000]; # close | |||
| 321 | 2965 | 6794 | my $elem = HTML::TagParser::Element->new( $flat, $i ); | ||||
| 322 | 2965 | 12229 | my $attr = $elem->attributes(); | ||||
| 323 | 2965 | 100 | 13454 | next unless exists $attr->{$key}; | |||
| 324 | 635 | 100 | 3021 | next if ( $attr->{$key} ne $val ); | |||
| 325 | 85 | 100 | 413 | return $elem unless wantarray; | |||
| 326 | 57 | 198 | push( @$out, $elem ); | ||||
| 327 | } | ||||||
| 328 | 6 | 50 | 61 | return unless wantarray; | |||
| 329 | 6 | 107 | @$out; | ||||
| 330 | } | ||||||
| 331 | |||||||
| 332 | sub getElementsByClassName { | ||||||
| 333 | 5 | 5 | 1 | 13 | my $self = shift; | ||
| 334 | 5 | 11 | my $class = shift; | ||||
| 335 | 5 | 18 | return $self->getElementsByAttribute( "class", $class ); | ||||
| 336 | } | ||||||
| 337 | |||||||
| 338 | sub getElementsByName { | ||||||
| 339 | 5 | 5 | 1 | 107 | my $self = shift; | ||
| 340 | 5 | 11 | my $name = shift; | ||||
| 341 | 5 | 18 | return $self->getElementsByAttribute( "name", $name ); | ||||
| 342 | } | ||||||
| 343 | |||||||
| 344 | sub getElementById { | ||||||
| 345 | 15 | 15 | 1 | 3249 | my $self = shift; | ||
| 346 | 15 | 1025 | my $id = shift; | ||||
| 347 | 15 | 66 | return scalar $self->getElementsByAttribute( "id", $id ); | ||||
| 348 | } | ||||||
| 349 | |||||||
| 350 | sub modified { | ||||||
| 351 | 0 | 0 | 0 | 0 | $_[0]->{modified}; | ||
| 352 | } | ||||||
| 353 | |||||||
| 354 | # ---------------------------------------------------------------- | ||||||
| 355 | |||||||
| 356 | package HTML::TagParser::Element; | ||||||
| 357 | 16 | 16 | 181 | use strict; | |||
| 16 | 44 | ||||||
| 16 | 42861 | ||||||
| 358 | |||||||
| 359 | sub new { | ||||||
| 360 | 3052 | 3052 | 4065 | my $package = shift; | |||
| 361 | 3052 | 6142 | my $self = [@_]; | ||||
| 362 | 3052 | 14165 | bless $self, $package; | ||||
| 363 | 3052 | 9912 | $self; | ||||
| 364 | } | ||||||
| 365 | |||||||
| 366 | sub tagName { | ||||||
| 367 | 6 | 6 | 33 | my $self = shift; | |||
| 368 | 6 | 15 | my ( $flat, $cur ) = @$self; | ||||
| 369 | 6 | 39 | return $flat->[$cur]->[001]; | ||||
| 370 | } | ||||||
| 371 | |||||||
| 372 | sub id { | ||||||
| 373 | 2 | 2 | 3 | my $self = shift; | |||
| 374 | 2 | 13 | $self->getAttribute("id"); | ||||
| 375 | } | ||||||
| 376 | |||||||
| 377 | sub getAttribute { | ||||||
| 378 | 106 | 106 | 68545 | my $self = shift; | |||
| 379 | 106 | 214 | my $name = lc(shift); | ||||
| 380 | 106 | 264 | my $attr = $self->attributes(); | ||||
| 381 | 106 | 100 | 288 | return unless exists $attr->{$name}; | |||
| 382 | 96 | 524 | $attr->{$name}; | ||||
| 383 | } | ||||||
| 384 | |||||||
| 385 | sub innerText { | ||||||
| 386 | 43 | 43 | 5938 | my $self = shift; | |||
| 387 | 43 | 140 | my ( $flat, $cur ) = @$self; | ||||
| 388 | 43 | 83 | my $elem = $flat->[$cur]; | ||||
| 389 | 43 | 100 | 180 | return $elem->[005] if defined $elem->[005]; # cache | |||
| 390 | 36 | 50 | 99 | return if $elem->[000]; # | |||
| 391 | 36 | 50 | 66 | 1396 | return if ( defined $elem->[002] && $elem->[002] =~ m#/$# ); # |
||
| 392 | |||||||
| 393 | 36 | 76 | my $tagname = $elem->[001]; | ||||
| 394 | 36 | 201 | my $closing = HTML::TagParser::Util::find_closing($flat, $cur); | ||||
| 395 | 36 | 89 | my $list = []; | ||||
| 396 | 36 | 114 | for ( ; $cur < $closing ; $cur++ ) { | ||||
| 397 | 307 | 891 | push( @$list, $flat->[$cur]->[003] ); | ||||
| 398 | } | ||||||
| 399 | 36 | 86 | my $text = join( "", grep { $_ ne "" } @$list ); | ||||
| 307 | 578 | ||||||
| 400 | 36 | 765 | $text =~ s/^\s+|\s+$//sg; | ||||
| 401 | # $text = "" if ( $cur == $#$flat ); # end of source | ||||||
| 402 | 36 | 186 | $elem->[005] = HTML::TagParser::Util::xml_unescape( $text ); | ||||
| 403 | } | ||||||
| 404 | |||||||
| 405 | sub subTree | ||||||
| 406 | { | ||||||
| 407 | 0 | 0 | 0 | my $self = shift; | |||
| 408 | 0 | 0 | my ( $flat, $cur ) = @$self; | ||||
| 409 | 0 | 0 | my $elem = $flat->[$cur]; | ||||
| 410 | 0 | 0 | 0 | return if $elem->[000]; # | |||
| 411 | 0 | 0 | my $closing = HTML::TagParser::Util::find_closing($flat, $cur); | ||||
| 412 | 0 | 0 | my $list = []; | ||||
| 413 | 0 | 0 | while (++$cur < $closing) | ||||
| 414 | { | ||||||
| 415 | 0 | 0 | push @$list, $flat->[$cur]; | ||||
| 416 | } | ||||||
| 417 | |||||||
| 418 | # allow the getElement...() methods on the returned object. | ||||||
| 419 | 0 | 0 | return bless { flat => $list }, 'HTML::TagParser'; | ||||
| 420 | } | ||||||
| 421 | |||||||
| 422 | |||||||
| 423 | sub nextSibling | ||||||
| 424 | { | ||||||
| 425 | 4 | 4 | 8 | my $self = shift; | |||
| 426 | 4 | 8 | my ( $flat, $cur ) = @$self; | ||||
| 427 | 4 | 6 | my $elem = $flat->[$cur]; | ||||
| 428 | |||||||
| 429 | 4 | 50 | 11 | return undef if $elem->[000]; # | |||
| 430 | 4 | 7 | my $closing = HTML::TagParser::Util::find_closing($flat, $cur); | ||||
| 431 | 4 | 9 | my $next_s = $flat->[$closing+1]; | ||||
| 432 | 4 | 50 | 10 | return undef unless $next_s; | |||
| 433 | 4 | 100 | 15 | return undef if $next_s->[000]; # parent's | |||
| 434 | 2 | 7 | return HTML::TagParser::Element->new( $flat, $closing+1 ); | ||||
| 435 | } | ||||||
| 436 | |||||||
| 437 | sub firstChild | ||||||
| 438 | { | ||||||
| 439 | 1 | 1 | 2 | my $self = shift; | |||
| 440 | 1 | 2 | my ( $flat, $cur ) = @$self; | ||||
| 441 | 1 | 2 | my $elem = $flat->[$cur]; | ||||
| 442 | 1 | 50 | 19 | return undef if $elem->[000]; # | |||
| 443 | 1 | 4 | my $closing = HTML::TagParser::Util::find_closing($flat, $cur); | ||||
| 444 | 1 | 50 | 6 | return undef if $closing <= $cur+1; # no children here. | |||
| 445 | 1 | 5 | return HTML::TagParser::Element->new( $flat, $cur+1 ); | ||||
| 446 | } | ||||||
| 447 | |||||||
| 448 | sub childNodes | ||||||
| 449 | { | ||||||
| 450 | 1 | 1 | 2 | my $self = shift; | |||
| 451 | 1 | 4 | my ( $flat, $cur ) = @$self; | ||||
| 452 | 1 | 6 | my $child = firstChild($self); | ||||
| 453 | 1 | 50 | 5 | return [] unless $child; # an empty array is easier for our callers than undef | |||
| 454 | 1 | 3 | my @c = ( $child ); | ||||
| 455 | 1 | 4 | while (defined ($child = nextSibling($child))) | ||||
| 456 | { | ||||||
| 457 | 1 | 3 | push @c, $child; | ||||
| 458 | } | ||||||
| 459 | 1 | 4 | return \@c; | ||||
| 460 | } | ||||||
| 461 | |||||||
| 462 | sub lastChild | ||||||
| 463 | { | ||||||
| 464 | 0 | 0 | 0 | my $c = childNodes(@_); | |||
| 465 | 0 | 0 | 0 | return undef unless $c->[0]; | |||
| 466 | 0 | 0 | return $c->[-1]; | ||||
| 467 | } | ||||||
| 468 | |||||||
| 469 | sub previousSibling | ||||||
| 470 | { | ||||||
| 471 | 7 | 7 | 7 | my $self = shift; | |||
| 472 | 7 | 10 | my ( $flat, $cur ) = @$self; | ||||
| 473 | |||||||
| 474 | ## This one is expensive. | ||||||
| 475 | ## We use find_closing() which walks forward. | ||||||
| 476 | ## We'd need a find_opening() which walks backwards. | ||||||
| 477 | ## So we walk backwards one by one and consult find_closing() | ||||||
| 478 | ## until we find $cur-1 or $cur. | ||||||
| 479 | |||||||
| 480 | 7 | 8 | my $idx = $cur-1; | ||||
| 481 | 7 | 15 | while ($idx >= 0) | ||||
| 482 | { | ||||||
| 483 | 16 | 100 | 100 | 49 | if ($flat->[$idx][000] && defined($flat->[$idx][006])) | ||
| 484 | { | ||||||
| 485 | 2 | 3 | $idx = $flat->[$idx][006]; # use cache for backwards skipping | ||||
| 486 | 2 | 39 | next; | ||||
| 487 | } | ||||||
| 488 | |||||||
| 489 | 14 | 26 | my $closing = HTML::TagParser::Util::find_closing($flat, $idx); | ||||
| 490 | 14 | 100 | 66 | 81 | return HTML::TagParser::Element->new( $flat, $idx ) | ||
| 33 | |||||||
| 491 | if defined $closing and ($closing == $cur || $closing == $cur-1); | ||||||
| 492 | 12 | 25 | $idx--; | ||||
| 493 | } | ||||||
| 494 | 5 | 20 | return undef; | ||||
| 495 | } | ||||||
| 496 | |||||||
| 497 | sub parentNode | ||||||
| 498 | { | ||||||
| 499 | 8 | 8 | 621 | my $self = shift; | |||
| 500 | 8 | 10 | my ( $flat, $cur ) = @$self; | ||||
| 501 | |||||||
| 502 | 8 | 100 | 26 | return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache | |||
| 503 | |||||||
| 504 | ## | ||||||
| 505 | ## This one is very expensive. | ||||||
| 506 | ## We use previousSibling() to walk backwards, and | ||||||
| 507 | ## previousSibling() is expensive. | ||||||
| 508 | ## | ||||||
| 509 | 5 | 5 | my $ps = $self; | ||||
| 510 | 5 | 6 | my $first = $self; | ||||
| 511 | |||||||
| 512 | 5 | 20 | while (defined($ps = previousSibling($ps))) { $first = $ps; } | ||||
| 2 | 4 | ||||||
| 513 | |||||||
| 514 | 5 | 7 | my $parent = $first->[1] - 1; | ||||
| 515 | 5 | 100 | 13 | return undef if $parent < 0; | |||
| 516 | 4 | 50 | 8 | die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur; | |||
| 517 | |||||||
| 518 | 4 | 8 | $flat->[$cur][007] = $parent; # cache | ||||
| 519 | 4 | 10 | return HTML::TagParser::Element->new( $flat, $parent ) | ||||
| 520 | } | ||||||
| 521 | |||||||
| 522 | ## | ||||||
| 523 | ## feature: | ||||||
| 524 | ## self-closing tags have an additional attribute '/' => '/'. | ||||||
| 525 | ## | ||||||
| 526 | sub attributes { | ||||||
| 527 | 3071 | 3071 | 3190 | my $self = shift; | |||
| 528 | 3071 | 4877 | my ( $flat, $cur ) = @$self; | ||||
| 529 | 3071 | 4049 | my $elem = $flat->[$cur]; | ||||
| 530 | 3071 | 100 | 10154 | return $elem->[004] if ref $elem->[004]; # cache | |||
| 531 | 1666 | 100 | 5524 | return unless defined $elem->[002]; | |||
| 532 | 811 | 1129 | my $attr = {}; | ||||
| 533 | 811 | 19303 | while ( $elem->[002] =~ m{ | ||||
| 534 | ([^\s="']+)(\s*=\s*(?:["']((?(?<=")(?:\\"|[^"])*?|(?:\\'|[^'])*?))["']|([^'"\s=]+)['"]*))? | ||||||
| 535 | }sgx ) { | ||||||
| 536 | 1994 | 5730 | my $key = $1; | ||||
| 537 | 1994 | 3888 | my $test = $2; | ||||
| 538 | 1994 | 66 | 9916 | my $val = $3 || $4; | |||
| 539 | 1994 | 4059 | my $lckey = lc($key); | ||||
| 540 | 1994 | 100 | 22106 | if ($test) { | |||
| 541 | 1719 | 2433 | $key =~ tr/A-Z/a-z/; | ||||
| 542 | 1719 | 5784 | $val = HTML::TagParser::Util::xml_unescape( $val ); | ||||
| 543 | 1719 | 17542 | $attr->{$lckey} = $val; | ||||
| 544 | } | ||||||
| 545 | else { | ||||||
| 546 | 275 | 1670 | $attr->{$lckey} = $key; | ||||
| 547 | } | ||||||
| 548 | } | ||||||
| 549 | 811 | 1703 | $elem->[004] = $attr; # cache | ||||
| 550 | 811 | 2882 | $attr; | ||||
| 551 | } | ||||||
| 552 | |||||||
| 553 | # ---------------------------------------------------------------- | ||||||
| 554 | |||||||
| 555 | package HTML::TagParser::Util; | ||||||
| 556 | 16 | 16 | 147 | use strict; | |||
| 16 | 34 | ||||||
| 16 | 28142 | ||||||
| 557 | |||||||
| 558 | sub xml_unescape { | ||||||
| 559 | 1755 | 1755 | 3088 | my $str = shift; | |||
| 560 | 1755 | 100 | 4071 | return unless defined $str; | |||
| 561 | 1568 | 2674 | $str =~ s/"/"/g; | ||||
| 562 | 1568 | 1823 | $str =~ s/</ | ||||
| 563 | 1568 | 2305 | $str =~ s/>/>/g; | ||||
| 564 | 1568 | 10175 | $str =~ s/&/&/g; | ||||
| 565 | 1568 | 3970 | $str; | ||||
| 566 | } | ||||||
| 567 | |||||||
| 568 | sub read_text_file { | ||||||
| 569 | 13 | 13 | 22 | my $file = shift; | |||
| 570 | 13 | 56 | my $fh = Symbol::gensym(); | ||||
| 571 | 13 | 50 | 1320 | open( $fh, $file ) or Carp::croak "$! - $file\n"; | |||
| 572 | 13 | 125 | local $/ = undef; | ||||
| 573 | 13 | 757 | my $text = <$fh>; | ||||
| 574 | 13 | 148 | close($fh); | ||||
| 575 | 13 | 81 | $text; | ||||
| 576 | } | ||||||
| 577 | |||||||
| 578 | sub html_to_flat { | ||||||
| 579 | 25 | 25 | 45 | my $txtref = shift; # reference | |||
| 580 | 25 | 56 | my $flat = []; | ||||
| 581 | 25 | 86 | pos($$txtref) = undef; # reset matching position | ||||
| 582 | 25 | 294 | while ( $$txtref =~ m{ | ||||
| 583 | (?:[^<]*) < (?: | ||||||
| 584 | ( / )? ( [^/!<>\s"'=]+ ) | ||||||
| 585 | ( (?:"[^"]*"|'[^']*'|[^"'<>])+ )? | ||||||
| 586 | | | ||||||
| 587 | (!-- .*? -- | ![^\-] .*? ) | ||||||
| 588 | ) > ([^<]*) | ||||||
| 589 | }sxg ) { | ||||||
| 590 | # [000] $1 close | ||||||
| 591 | # [001] $2 tagName | ||||||
| 592 | # [002] $3 attributes | ||||||
| 593 | # $4 comment element | ||||||
| 594 | # [003] $5 content | ||||||
| 595 | 3914 | 100 | 10500 | next if defined $4; | |||
| 596 | 3868 | 19774 | my $array = [ $1, $2, $3, $5 ]; | ||||
| 597 | 3868 | 11232 | $array->[001] =~ tr/A-Z/a-z/; | ||||
| 598 | # $array->[003] =~ s/^\s+//s; | ||||||
| 599 | # $array->[003] =~ s/\s+$//s; | ||||||
| 600 | 3868 | 87087 | push( @$flat, $array ); | ||||
| 601 | } | ||||||
| 602 | 25 | 90 | $flat; | ||||
| 603 | } | ||||||
| 604 | |||||||
| 605 | ## returns 1 beyond the end, if not found. | ||||||
| 606 | ## returns undef if called on a closing tag | ||||||
| 607 | sub find_closing | ||||||
| 608 | { | ||||||
| 609 | 59 | 59 | 88 | my ($flat, $cur) = @_; | |||
| 610 | |||||||
| 611 | 59 | 100 | 194 | return $flat->[$cur][006] if $flat->[$cur][006]; # cache | |||
| 612 | 44 | 100 | 100 | 270 | return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$}); # self-closing | ||
| 613 | |||||||
| 614 | 43 | 91 | my $name = $flat->[$cur][001]; | ||||
| 615 | 43 | 53 | my $pre_nest = 0; | ||||
| 616 | ## count how many levels deep this type of tag is nested. | ||||||
| 617 | 43 | 150 | my $idx; | ||||
| 618 | 43 | 138 | for ($idx = 0; $idx <= $cur; $idx++) | ||||
| 619 | { | ||||||
| 620 | 3318 | 6265 | my $e = $flat->[$idx]; | ||||
| 621 | 3318 | 100 | 8804 | next unless $e->[001] eq $name; | |||
| 622 | 278 | 50 | 100 | 1014 | next if (($e->[002]||'') =~ m{/$}); # self-closing | ||
| 623 | 278 | 100 | 4882 | $pre_nest += ($e->[000]) ? -1 : 1; | |||
| 624 | 278 | 100 | 625 | $pre_nest = 0 if $pre_nest < 0; | |||
| 625 | 278 | 100 | 100 | 1322 | $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. | ||
| 626 | } | ||||||
| 627 | 43 | 77 | my $last_idx = $#$flat; | ||||
| 628 | |||||||
| 629 | ## we move last_idx closer, in case this container | ||||||
| 630 | ## has not all its subcontainers closed properly. | ||||||
| 631 | 43 | 72 | my $post_nest = 0; | ||||
| 632 | 43 | 151 | for ($idx = $last_idx; $idx > $cur; $idx--) | ||||
| 633 | { | ||||||
| 634 | 2526 | 3198 | my $e = $flat->[$idx]; | ||||
| 635 | 2526 | 100 | 14167 | next unless $e->[001] eq $name; | |||
| 636 | 45 | 84 | $last_idx = $idx-1; # remember where a matching tag was | ||||
| 637 | 45 | 50 | 100 | 362 | next if (($e->[002]||'') =~ m{/$}); # self-closing | ||
| 638 | 45 | 100 | 136 | $post_nest -= ($e->[000]) ? -1 : 1; | |||
| 639 | 45 | 100 | 146 | $post_nest = 0 if $post_nest < 0; | |||
| 640 | 45 | 100 | 141 | last if $pre_nest <= $post_nest; | |||
| 641 | 7 | 50 | 66 | 37 | $idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards. | ||
| 642 | } | ||||||
| 643 | |||||||
| 644 | 43 | 68 | my $nest = 1; # we know it is not self-closing. start behind. | ||||
| 645 | |||||||
| 646 | 43 | 140 | for ($idx = $cur+1; $idx <= $last_idx; $idx++) | ||||
| 647 | { | ||||||
| 648 | 316 | 310 | my $e = $flat->[$idx]; | ||||
| 649 | 316 | 100 | 938 | next unless $e->[001] eq $name; | |||
| 650 | 17 | 100 | 100 | 116 | next if (($e->[002]||'') =~ m{/$}); # self-closing | ||
| 651 | 14 | 100 | 42 | $nest += ($e->[000]) ? -1 : 1; | |||
| 652 | 14 | 100 | 37 | if ($nest <= 0) | |||
| 653 | { | ||||||
| 654 | 6 | 50 | 19 | die "assert " unless $e->[000]; | |||
| 655 | 6 | 16 | $e->[006] = $cur; # point back to opening tag | ||||
| 656 | 6 | 23 | return $flat->[$cur][006] = $idx; | ||||
| 657 | } | ||||||
| 658 | 8 | 100 | 100 | 106 | $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. | ||
| 659 | } | ||||||
| 660 | |||||||
| 661 | # not all closed, but cannot go further | ||||||
| 662 | 37 | 153 | return $flat->[$cur][006] = $last_idx+1; | ||||
| 663 | } | ||||||
| 664 | |||||||
| 665 | sub find_meta_charset { | ||||||
| 666 | 25 | 25 | 50 | my $txtref = shift; # reference | |||
| 667 | 25 | 368 | while ( $$txtref =~ m{ | ||||
| 668 | ]+\s )? http-equiv\s*=\s*['"]?Content-Type [^>]+ ) > | ||||||
| 669 | }sxgi ) { | ||||||
| 670 | 12 | 37 | my $args = $1; | ||||
| 671 | 12 | 50 | 99 | return $1 if ( $args =~ m# charset=['"]?([^'"\s/]+) #sxgi ); | |||
| 672 | } | ||||||
| 673 | 13 | 29 | undef; | ||||
| 674 | } | ||||||
| 675 | |||||||
| 676 | sub encode_from_to { | ||||||
| 677 | 11 | 11 | 37 | my ( $txtref, $from, $to ) = @_; | |||
| 678 | 11 | 50 | 164 | return if ( $from eq "" ); | |||
| 679 | 11 | 50 | 38 | return if ( $to eq "" ); | |||
| 680 | 11 | 100 | 46 | return $to if ( uc($from) eq uc($to) ); | |||
| 681 | 7 | 54 | Encode::from_to( $$txtref, $from, $to, Encode::XMLCREF() ); | ||||
| 682 | 7 | 5292 | return $to; | ||||
| 683 | } | ||||||
| 684 | |||||||
| 685 | # ---------------------------------------------------------------- | ||||||
| 686 | 1; | ||||||
| 687 | # ---------------------------------------------------------------- |