| blib/lib/Mojo/DOM/HTML.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 103 | 104 | 99.0 |
| branch | 88 | 90 | 97.7 |
| condition | 48 | 52 | 92.3 |
| subroutine | 14 | 14 | 100.0 |
| pod | 4 | 4 | 100.0 |
| total | 257 | 264 | 97.3 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package Mojo::DOM::HTML; | ||||||
| 2 | 61 | 61 | 478 | use Mojo::Base -base; | |||
| 61 | 153 | ||||||
| 61 | 506 | ||||||
| 3 | |||||||
| 4 | 61 | 61 | 449 | use Exporter qw(import); | |||
| 61 | 142 | ||||||
| 61 | 2084 | ||||||
| 5 | 61 | 61 | 353 | use Mojo::Util qw(html_attr_unescape html_unescape xml_escape); | |||
| 61 | 156 | ||||||
| 61 | 3990 | ||||||
| 6 | 61 | 61 | 456 | use Scalar::Util qw(weaken); | |||
| 61 | 235 | ||||||
| 61 | 128947 | ||||||
| 7 | |||||||
| 8 | our @EXPORT_OK = ('tag_to_html'); | ||||||
| 9 | |||||||
| 10 | has tree => sub { ['root'] }; | ||||||
| 11 | has 'xml'; | ||||||
| 12 | |||||||
| 13 | my $ATTR_RE = qr/ | ||||||
| 14 | ([^<>=\s\/0-9.\-][^<>=\s\/]*|\/) # Key | ||||||
| 15 | (?: | ||||||
| 16 | \s*=\s* | ||||||
| 17 | (?s:(["'])(.*?)\g{-2}|([^>\s]*)) # Value | ||||||
| 18 | )? | ||||||
| 19 | \s* | ||||||
| 20 | /x; | ||||||
| 21 | my $TOKEN_RE = qr/ | ||||||
| 22 | ([^<]+)? # Text | ||||||
| 23 | (?: | ||||||
| 24 | <(?: | ||||||
| 25 | !(?: | ||||||
| 26 | DOCTYPE( | ||||||
| 27 | \s+\w+ # Doctype | ||||||
| 28 | (?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID | ||||||
| 29 | (?:\s+\[.+?\])? # Int Subset | ||||||
| 30 | \s*) | ||||||
| 31 | | | ||||||
| 32 | --(.*?)--\s* # Comment | ||||||
| 33 | | | ||||||
| 34 | \[CDATA\[(.*?)\]\] # CDATA | ||||||
| 35 | ) | ||||||
| 36 | | | ||||||
| 37 | \?(.*?)\? # Processing Instruction | ||||||
| 38 | | | ||||||
| 39 | \s*((?:\/\s*)?[^<>\s\/0-9.\-][^<>\s\/]*\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag | ||||||
| 40 | )> | ||||||
| 41 | | | ||||||
| 42 | (<) # Runaway "<" | ||||||
| 43 | )?? | ||||||
| 44 | /xis; | ||||||
| 45 | |||||||
| 46 | # HTML elements that only contain raw text | ||||||
| 47 | my %RAW = map { $_ => 1 } qw(script style); | ||||||
| 48 | |||||||
| 49 | # HTML elements that only contain raw text and entities | ||||||
| 50 | my %RCDATA = map { $_ => 1 } qw(title textarea); | ||||||
| 51 | |||||||
| 52 | # HTML elements with optional end tags | ||||||
| 53 | my %END = (body => 'head', optgroup => 'optgroup', option => 'option'); | ||||||
| 54 | |||||||
| 55 | # HTML elements that break paragraphs | ||||||
| 56 | map { $END{$_} = 'p' } ( | ||||||
| 57 | qw(address article aside blockquote details dialog div dl fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6), | ||||||
| 58 | qw(header hgroup hr main menu nav ol p pre section table ul) | ||||||
| 59 | ); | ||||||
| 60 | |||||||
| 61 | # Container HTML elements that create their own scope | ||||||
| 62 | my %SCOPE = map { $_ => 1 } qw(math svg); | ||||||
| 63 | |||||||
| 64 | # HTML table elements with optional end tags | ||||||
| 65 | my %TABLE = map { $_ => 1 } qw(colgroup tbody td tfoot th thead tr); | ||||||
| 66 | |||||||
| 67 | # HTML elements with optional end tags and scoping rules | ||||||
| 68 | my %CLOSE = (li => [{li => 1}, {ul => 1, ol => 1}], tr => [{tr => 1}, {table => 1}]); | ||||||
| 69 | $CLOSE{$_} = [\%TABLE, {table => 1}] for qw(colgroup tbody tfoot thead); | ||||||
| 70 | $CLOSE{$_} = [{dd => 1, dt => 1}, {dl => 1}] for qw(dd dt); | ||||||
| 71 | $CLOSE{$_} = [{rp => 1, rt => 1}, {ruby => 1}] for qw(rp rt); | ||||||
| 72 | $CLOSE{$_} = [{th => 1, td => 1}, {table => 1}] for qw(td th); | ||||||
| 73 | |||||||
| 74 | # HTML parent elements that signal no more content when closed, but that are also phrasing content | ||||||
| 75 | my %NO_MORE_CONTENT = (ruby => [qw(rt rp)], select => [qw(option optgroup)]); | ||||||
| 76 | |||||||
| 77 | # HTML elements without end tags | ||||||
| 78 | my %EMPTY = map { $_ => 1 } qw(area base br col embed hr img input keygen link menuitem meta param source track wbr); | ||||||
| 79 | |||||||
| 80 | # HTML elements categorized as phrasing content (and obsolete inline elements) | ||||||
| 81 | my @PHRASING = ( | ||||||
| 82 | qw(a abbr area audio b bdi bdo br button canvas cite code data datalist del dfn em embed i iframe img input ins kbd), | ||||||
| 83 | qw(keygen label link map mark math meta meter noscript object output picture progress q ruby s samp script select), | ||||||
| 84 | qw(slot small span strong sub sup svg template textarea time u var video wbr) | ||||||
| 85 | ); | ||||||
| 86 | my @OBSOLETE = qw(acronym applet basefont big font strike tt); | ||||||
| 87 | my %PHRASING = map { $_ => 1 } @OBSOLETE, @PHRASING; | ||||||
| 88 | |||||||
| 89 | # HTML elements that don't get their self-closing flag acknowledged | ||||||
| 90 | my %BLOCK = map { $_ => 1 } ( | ||||||
| 91 | qw(a address applet article aside b big blockquote body button caption center code col colgroup dd details dialog), | ||||||
| 92 | qw(dir div dl dt em fieldset figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head header hgroup html), | ||||||
| 93 | qw(i iframe li listing main marquee menu nav nobr noembed noframes noscript object ol optgroup option p plaintext), | ||||||
| 94 | qw(pre rp rt s script section select small strike strong style summary table tbody td template textarea tfoot th), | ||||||
| 95 | qw(thead title tr tt u ul xmp) | ||||||
| 96 | ); | ||||||
| 97 | |||||||
| 98 | sub parse { | ||||||
| 99 | 323 | 323 | 1 | 1322 | my ($self, $html) = (shift, "$_[0]"); | ||
| 100 | |||||||
| 101 | 323 | 872 | my $xml = $self->xml; | ||||
| 102 | 323 | 764 | my $current = my $tree = ['root']; | ||||
| 103 | 323 | 3974 | while ($html =~ /\G$TOKEN_RE/gcso) { | ||||
| 104 | 9459 | 29532 | my ($text, $doctype, $comment, $cdata, $pi, $tag, $runaway) = ($1, $2, $3, $4, $5, $6, $11); | ||||
| 105 | |||||||
| 106 | # Text (and runaway "<") | ||||||
| 107 | 9459 | 100 | 16597 | $text .= '<' if defined $runaway; | |||
| 108 | 9459 | 100 | 17815 | _node($current, 'text', html_unescape $text) if defined $text; | |||
| 109 | |||||||
| 110 | # Tag | ||||||
| 111 | 9459 | 100 | 188240 | if (defined $tag) { | |||
| 100 | |||||||
| 100 | |||||||
| 100 | |||||||
| 100 | |||||||
| 112 | |||||||
| 113 | # End | ||||||
| 114 | 3206 | 100 | 10620 | if ($tag =~ /^\/\s*(\S+)/) { | |||
| 50 | |||||||
| 115 | 1394 | 100 | 3362 | my $end = $xml ? $1 : lc $1; | |||
| 116 | |||||||
| 117 | # No more content | ||||||
| 118 | 1394 | 100 | 100 | 6130 | if (!$xml && (my $tags = $NO_MORE_CONTENT{$end})) { _end($_, $xml, \$current) for @$tags } | ||
| 15 | 50 | ||||||
| 119 | |||||||
| 120 | 1394 | 2911 | _end($end, $xml, \$current); | ||||
| 121 | } | ||||||
| 122 | |||||||
| 123 | # Start | ||||||
| 124 | elsif ($tag =~ m!^([^\s/]+)([\s\S]*)!) { | ||||||
| 125 | 1812 | 100 | 5901 | my ($start, $attr) = ($xml ? $1 : lc $1, $2); | |||
| 126 | |||||||
| 127 | # Attributes | ||||||
| 128 | 1812 | 2824 | my (%attrs, $closing); | ||||
| 129 | 1812 | 6074 | while ($attr =~ /$ATTR_RE/go) { | ||||
| 130 | 33942 | 100 | 100 | 139762 | my ($key, $value) = ($xml ? $1 : lc $1, $3 // $4); | ||
| 131 | |||||||
| 132 | # Empty tag | ||||||
| 133 | 33942 | 100 | 50 | 61109 | ++$closing and next if $key eq '/'; | ||
| 134 | |||||||
| 135 | 33890 | 100 | 73389 | $attrs{$key} = defined $value ? html_attr_unescape $value : $value; | |||
| 136 | } | ||||||
| 137 | |||||||
| 138 | # "image" is an alias for "img" | ||||||
| 139 | 1812 | 100 | 100 | 6088 | $start = 'img' if !$xml && $start eq 'image'; | ||
| 140 | 1812 | 4760 | _start($start, \%attrs, $xml, \$current); | ||||
| 141 | |||||||
| 142 | # Element without end tag (self-closing) | ||||||
| 143 | 1812 | 100 | 100 | 10970 | _end($start, $xml, \$current) if !$xml && $EMPTY{$start} || ($xml || !$BLOCK{$start}) && $closing; | ||
| 100 | |||||||
| 100 | |||||||
| 100 | |||||||
| 144 | |||||||
| 145 | # Raw text elements | ||||||
| 146 | 1812 | 100 | 100 | 12821 | next if $xml || !$RAW{$start} && !$RCDATA{$start}; | ||
| 100 | |||||||
| 147 | 62 | 100 | 1173 | next unless $html =~ m!\G(.*?)\Q$start\E(?:\s+|\s*>)!gcsi; | |||
| 148 | 61 | 100 | 33048 | _node($current, 'raw', $RCDATA{$start} ? html_unescape $1 : $1); | |||
| 149 | 61 | 191 | _end($start, 0, \$current); | ||||
| 150 | } | ||||||
| 151 | } | ||||||
| 152 | |||||||
| 153 | # DOCTYPE | ||||||
| 154 | 17 | 75 | elsif (defined $doctype) { _node($current, 'doctype', $doctype) } | ||||
| 155 | |||||||
| 156 | # Comment | ||||||
| 157 | 13 | 68 | elsif (defined $comment) { _node($current, 'comment', $comment) } | ||||
| 158 | |||||||
| 159 | # CDATA | ||||||
| 160 | 7 | 23 | elsif (defined $cdata) { _node($current, 'cdata', $cdata) } | ||||
| 161 | |||||||
| 162 | # Processing instruction (try to detect XML) | ||||||
| 163 | elsif (defined $pi) { | ||||||
| 164 | 18 | 100 | 100 | 166 | $self->xml($xml = 1) if !exists $self->{xml} && $pi =~ /xml/i; | ||
| 165 | 18 | 44 | _node($current, 'pi', $pi); | ||||
| 166 | } | ||||||
| 167 | } | ||||||
| 168 | |||||||
| 169 | 323 | 1204 | return $self->tree($tree); | ||||
| 170 | } | ||||||
| 171 | |||||||
| 172 | 164 | 164 | 1 | 429 | sub render { _render($_[0]->tree, $_[0]->xml) } | ||
| 173 | |||||||
| 174 | 11 | 11 | 1 | 30 | sub tag { shift->tree(['root', _tag(@_)]) } | ||
| 175 | |||||||
| 176 | 877 | 877 | 1 | 1966 | sub tag_to_html { _render(_tag(@_), undef) } | ||
| 177 | |||||||
| 178 | sub _end { | ||||||
| 179 | 2091 | 2091 | 3725 | my ($end, $xml, $current) = @_; | |||
| 180 | |||||||
| 181 | # Search stack for start tag | ||||||
| 182 | 2091 | 2826 | my $next = $$current; | ||||
| 183 | 2091 | 2669 | do { | ||||
| 184 | |||||||
| 185 | # Ignore useless end tag | ||||||
| 186 | 4422 | 100 | 8188 | return if $next->[0] eq 'root'; | |||
| 187 | |||||||
| 188 | # Don’t traverse a container tag | ||||||
| 189 | 4002 | 100 | 100 | 8154 | return if $SCOPE{$next->[1]} && $next->[1] ne $end; | ||
| 190 | |||||||
| 191 | # Right tag | ||||||
| 192 | 3999 | 100 | 12242 | return $$current = $next->[3] if $next->[1] eq $end; | |||
| 193 | |||||||
| 194 | # Phrasing content can only cross phrasing content | ||||||
| 195 | 2337 | 100 | 100 | 7901 | return if !$xml && $PHRASING{$end} && !$PHRASING{$next->[1]}; | ||
| 66 | |||||||
| 196 | |||||||
| 197 | } while $next = $next->[3]; | ||||||
| 198 | } | ||||||
| 199 | |||||||
| 200 | sub _node { | ||||||
| 201 | 2709 | 2709 | 4594 | my ($current, $type, $content) = @_; | |||
| 202 | 2709 | 8028 | push @$current, my $new = [$type, $content, $current]; | ||||
| 203 | 2709 | 7067 | weaken $new->[2]; | ||||
| 204 | } | ||||||
| 205 | |||||||
| 206 | sub _render { | ||||||
| 207 | 2395 | 2395 | 4064 | my ($tree, $xml) = @_; | |||
| 208 | |||||||
| 209 | # Tag | ||||||
| 210 | 2395 | 3710 | my $type = $tree->[0]; | ||||
| 211 | 2395 | 100 | 4595 | if ($type eq 'tag') { | |||
| 212 | |||||||
| 213 | # Start tag | ||||||
| 214 | 1330 | 2169 | my $tag = $tree->[1]; | ||||
| 215 | 1330 | 2467 | my $result = "<$tag"; | ||||
| 216 | |||||||
| 217 | # Attributes | ||||||
| 218 | 1330 | 1808 | for my $key (sort keys %{$tree->[2]}) { | ||||
| 1330 | 5039 | ||||||
| 219 | 1528 | 2702 | my $value = $tree->[2]{$key}; | ||||
| 220 | 1528 | 100 | 50 | 3017 | $result .= $xml ? qq{ $key="$key"} : " $key" and next unless defined $value; | ||
| 100 | |||||||
| 221 | 1491 | 3974 | $result .= qq{ $key="} . xml_escape($value) . '"'; | ||||
| 222 | } | ||||||
| 223 | |||||||
| 224 | # No children | ||||||
| 225 | 1330 | 100 | 6058 | return $xml ? "$result />" : $EMPTY{$tag} ? "$result>" : "$result>$tag>" unless $tree->[4]; | |||
| 100 | |||||||
| 100 | |||||||
| 226 | |||||||
| 227 | # Children | ||||||
| 228 | 61 | 61 | 763 | no warnings 'recursion'; | |||
| 61 | 164 | ||||||
| 61 | 50841 | ||||||
| 229 | 894 | 2167 | $result .= '>' . join '', map { _render($_, $xml) } @$tree[4 .. $#$tree]; | ||||
| 1166 | 2899 | ||||||
| 230 | |||||||
| 231 | # End tag | ||||||
| 232 | 894 | 5691 | return "$result$tag>"; | ||||
| 233 | } | ||||||
| 234 | |||||||
| 235 | # Text (escaped) | ||||||
| 236 | 1065 | 100 | 2659 | return xml_escape $tree->[1] if $type eq 'text'; | |||
| 237 | |||||||
| 238 | # Raw text | ||||||
| 239 | 520 | 100 | 2007 | return $tree->[1] if $type eq 'raw'; | |||
| 240 | |||||||
| 241 | # Root | ||||||
| 242 | 150 | 100 | 614 | return join '', map { _render($_, $xml) } @$tree[1 .. $#$tree] if $type eq 'root'; | |||
| 188 | 368 | ||||||
| 243 | |||||||
| 244 | # DOCTYPE | ||||||
| 245 | 14 | 100 | 91 | return '[1] . '>' if $type eq 'doctype'; | |||
| 246 | |||||||
| 247 | # Comment | ||||||
| 248 | 10 | 100 | 81 | return '' if $type eq 'comment'; | |||
| 249 | |||||||
| 250 | # CDATA | ||||||
| 251 | 6 | 100 | 49 | return '[1] . ']]>' if $type eq 'cdata'; | |||
| 252 | |||||||
| 253 | # Processing instruction | ||||||
| 254 | 3 | 50 | 52 | return '' . $tree->[1] . '?>' if $type eq 'pi'; | |||
| 255 | |||||||
| 256 | # Everything else | ||||||
| 257 | 0 | 0 | return ''; | ||||
| 258 | } | ||||||
| 259 | |||||||
| 260 | sub _start { | ||||||
| 261 | 1812 | 1812 | 3457 | my ($start, $attrs, $xml, $current) = @_; | |||
| 262 | |||||||
| 263 | # Autoclose optional HTML elements | ||||||
| 264 | 1812 | 100 | 100 | 6292 | if (!$xml && $$current->[0] ne 'root') { | ||
| 265 | 1396 | 100 | 4646 | if (my $end = $END{$start}) { _end($end, 0, $current) } | |||
| 405 | 100 | 780 | |||||
| 266 | |||||||
| 267 | elsif (my $close = $CLOSE{$start}) { | ||||||
| 268 | 504 | 938 | my ($allowed, $scope) = @$close; | ||||
| 269 | |||||||
| 270 | # Close allowed parent elements in scope | ||||||
| 271 | 504 | 704 | my $parent = $$current; | ||||
| 272 | 504 | 66 | 1925 | while ($parent->[0] ne 'root' && !$scope->{$parent->[1]}) { | |||
| 273 | 377 | 100 | 856 | _end($parent->[1], 0, $current) if $allowed->{$parent->[1]}; | |||
| 274 | 377 | 1301 | $parent = $parent->[3]; | ||||
| 275 | } | ||||||
| 276 | } | ||||||
| 277 | } | ||||||
| 278 | |||||||
| 279 | # New tag | ||||||
| 280 | 1812 | 5749 | push @$$current, my $new = ['tag', $start, $attrs, $$current]; | ||||
| 281 | 1812 | 5415 | weaken $new->[3]; | ||||
| 282 | 1812 | 2919 | $$current = $new; | ||||
| 283 | } | ||||||
| 284 | |||||||
| 285 | sub _tag { | ||||||
| 286 | 888 | 888 | 2117 | my $tree = ['tag', shift, undef, undef]; | |||
| 287 | |||||||
| 288 | # Content | ||||||
| 289 | 888 | 100 | 3394 | push @$tree, ref $_[-1] eq 'CODE' ? ['raw', pop->()] : ['text', pop] if @_ % 2; | |||
| 100 | |||||||
| 290 | |||||||
| 291 | # Attributes | ||||||
| 292 | 888 | 2966 | my $attrs = $tree->[2] = {@_}; | ||||
| 293 | 888 | 100 | 100 | 3402 | return $tree unless exists $attrs->{data} && ref $attrs->{data} eq 'HASH'; | ||
| 294 | 5 | 17 | my $data = delete $attrs->{data}; | ||||
| 295 | 5 | 23 | @$attrs{map { y/_/-/; lc "data-$_" } keys %$data} = values %$data; | ||||
| 9 | 24 | ||||||
| 9 | 51 | ||||||
| 296 | 5 | 19 | return $tree; | ||||
| 297 | } | ||||||
| 298 | |||||||
| 299 | 1; | ||||||
| 300 | |||||||
| 301 | =encoding utf8 | ||||||
| 302 | |||||||
| 303 | =head1 NAME | ||||||
| 304 | |||||||
| 305 | Mojo::DOM::HTML - HTML/XML engine | ||||||
| 306 | |||||||
| 307 | =head1 SYNOPSIS | ||||||
| 308 | |||||||
| 309 | use Mojo::DOM::HTML; | ||||||
| 310 | |||||||
| 311 | # Turn HTML into DOM tree | ||||||
| 312 | my $html = Mojo::DOM::HTML->new; | ||||||
| 313 | $html->parse(' Test 123 |
||||||
| 314 | my $tree = $html->tree; | ||||||
| 315 | |||||||
| 316 | =head1 DESCRIPTION | ||||||
| 317 | |||||||
| 318 | L | ||||||
| 319 | Standard|https://html.spec.whatwg.org> and the L |
||||||
| 320 | |||||||
| 321 | =head1 FUNCTIONS | ||||||
| 322 | |||||||
| 323 | L |
||||||
| 324 | |||||||
| 325 | =head2 tag_to_html | ||||||
| 326 | |||||||
| 327 | my $str = tag_to_html 'div', id => 'foo', 'safe content'; | ||||||
| 328 | |||||||
| 329 | Generate HTML/XML tag and render it right away. This is a significantly faster alternative to L"tag"> for template | ||||||
| 330 | systems that have to generate a lot of tags. | ||||||
| 331 | |||||||
| 332 | =head1 ATTRIBUTES | ||||||
| 333 | |||||||
| 334 | L |
||||||
| 335 | |||||||
| 336 | =head2 tree | ||||||
| 337 | |||||||
| 338 | my $tree = $html->tree; | ||||||
| 339 | $html = $html->tree(['root']); | ||||||
| 340 | |||||||
| 341 | Document Object Model. Note that this structure should only be used very carefully since it is very dynamic. | ||||||
| 342 | |||||||
| 343 | =head2 xml | ||||||
| 344 | |||||||
| 345 | my $bool = $html->xml; | ||||||
| 346 | $html = $html->xml($bool); | ||||||
| 347 | |||||||
| 348 | Disable HTML semantics in parser and activate case-sensitivity, defaults to auto-detection based on XML declarations. | ||||||
| 349 | |||||||
| 350 | =head1 METHODS | ||||||
| 351 | |||||||
| 352 | L |
||||||
| 353 | |||||||
| 354 | =head2 parse | ||||||
| 355 | |||||||
| 356 | $html = $html->parse(' |
||||||
| 357 | |||||||
| 358 | Parse HTML/XML fragment. | ||||||
| 359 | |||||||
| 360 | =head2 render | ||||||
| 361 | |||||||
| 362 | my $str = $html->render; | ||||||
| 363 | |||||||
| 364 | Render DOM to HTML/XML. | ||||||
| 365 | |||||||
| 366 | =head2 tag | ||||||
| 367 | |||||||
| 368 | $html = $html->tag('div', id => 'foo', 'safe content'); | ||||||
| 369 | |||||||
| 370 | Generate HTML/XML tag. | ||||||
| 371 | |||||||
| 372 | =head1 SEE ALSO | ||||||
| 373 | |||||||
| 374 | L |
||||||
| 375 | |||||||
| 376 | =cut |