| blib/lib/HTML/Laundry.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 358 | 417 | 85.8 |
| branch | 131 | 160 | 81.8 |
| condition | 11 | 21 | 52.3 |
| subroutine | 50 | 52 | 96.1 |
| pod | 18 | 18 | 100.0 |
| total | 568 | 668 | 85.0 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | ######################################################## | ||||||
| 2 | # Copyright © 2009 Six Apart, Ltd. | ||||||
| 3 | |||||||
| 4 | package HTML::Laundry; | ||||||
| 5 | |||||||
| 6 | 15 | 15 | 22853 | use strict; | |||
| 15 | 19 | ||||||
| 15 | 396 | ||||||
| 7 | 15 | 15 | 52 | use warnings; | |||
| 15 | 16 | ||||||
| 15 | 389 | ||||||
| 8 | |||||||
| 9 | 15 | 15 | 310 | use 5.008; | |||
| 15 | 36 | ||||||
| 10 | 15 | 15 | 5873 | use version; our $VERSION = 0.0107; | |||
| 15 | 20568 | ||||||
| 15 | 77 | ||||||
| 11 | |||||||
| 12 | =head1 NAME | ||||||
| 13 | |||||||
| 14 | HTML::Laundry - Perl module to clean HTML by the piece | ||||||
| 15 | |||||||
| 16 | =head1 VERSION | ||||||
| 17 | |||||||
| 18 | Version 0.0107 | ||||||
| 19 | |||||||
| 20 | =head1 SYNOPSIS | ||||||
| 21 | |||||||
| 22 | #!/usr/bin/perl -w | ||||||
| 23 | use strict; | ||||||
| 24 | use HTML::Laundry; | ||||||
| 25 | my $laundry = HTML::Laundry->new(); | ||||||
| 26 | my $snippet = q{ | ||||||
| 27 | |||||||
| 28 | If your gloves are sterilized |
||||||
| 29 | Rinse your mouth with Listerine | ||||||
| 30 | Blow disinfectant in her eyes" |
||||||
| 31 | -- X-Ray Spex, Germ-Free Adolescents | ||||||
| 32 | |||||||
| 33 | }; | ||||||
| 34 | my $germfree = $laundry->clean($snippet); | ||||||
| 35 | # $germfree is now: | ||||||
| 36 | # "You may get to touch her |
||||||
| 37 | # If your gloves are sterilized |
||||||
| 38 | # Rinse your mouth with Listerine |
||||||
| 39 | # Blow disinfectant in her eyes" |
||||||
| 40 | # -- X-Ray Spex, Germ-Free Adolescents | ||||||
| 41 | |||||||
| 42 | =head1 DESCRIPTION | ||||||
| 43 | |||||||
| 44 | HTML::Laundry is an L |
||||||
| 45 | meant for small pieces of HTML, such as user comments, Atom feed entries, | ||||||
| 46 | and the like, rather than full pages. Laundry takes these and returns clean, | ||||||
| 47 | sanitary, UTF-8-based XHTML. The parser's behavior may be changed with | ||||||
| 48 | callbacks, and the whitelist of acceptable elements and attributes may be | ||||||
| 49 | updated on the fly. | ||||||
| 50 | |||||||
| 51 | A snippet is cleaned several ways: | ||||||
| 52 | |||||||
| 53 | =over 4 | ||||||
| 54 | |||||||
| 55 | =item * Normalized, using HTML::Parser: attributes and elements will be | ||||||
| 56 | lowercased, empty elements such as will be forced into |
||||||
| 57 | the empty tag syntax if needed, and unknown attributes and elements will be | ||||||
| 58 | stripped. | ||||||
| 59 | |||||||
| 60 | =item * Sanitized, using an extensible whitelist of valid attributes and | ||||||
| 61 | elements based on Mark Pilgrim and Aaron Swartz's work on C |
||||||
| 62 | and attributes which are known to be possible attack vectors are removed. | ||||||
| 63 | |||||||
| 64 | =item * Tidied, using L |
||||||
| 65 | (as available): unclosed tags will be closed and the output generally | ||||||
| 66 | neatened; future version may also use tidying to deal with character encoding | ||||||
| 67 | issues. | ||||||
| 68 | |||||||
| 69 | =item * Optionally rebased, to turn relative URLs in attributes into | ||||||
| 70 | absolute ones. | ||||||
| 71 | |||||||
| 72 | =back | ||||||
| 73 | |||||||
| 74 | HTML::Laundry provides mechanisms to extend the list of known allowed | ||||||
| 75 | (and disallowed) tags, along with callback methods to allow scripts using | ||||||
| 76 | HTML::Laundry to extend the behavior in various ways. Future versions | ||||||
| 77 | may provide additional options for altering the rules used to clean | ||||||
| 78 | snippets. | ||||||
| 79 | |||||||
| 80 | Out of the box, HTML::Laundry does not currently know about the tag | ||||||
| 81 | and its children. For santizing full HTML pages, consider using L |
||||||
| 82 | or L |
||||||
| 83 | |||||||
| 84 | =cut | ||||||
| 85 | |||||||
| 86 | require HTML::Laundry::Rules; | ||||||
| 87 | require HTML::Laundry::Rules::Default; | ||||||
| 88 | |||||||
| 89 | require HTML::Parser; | ||||||
| 90 | 15 | 15 | 6876 | use HTML::Entities qw(encode_entities encode_entities_numeric); | |||
| 15 | 53752 | ||||||
| 15 | 1200 | ||||||
| 91 | 15 | 15 | 6588 | use URI; | |||
| 15 | 44870 | ||||||
| 15 | 399 | ||||||
| 92 | 15 | 15 | 116 | use URI::Escape qw(uri_unescape uri_escape uri_escape_utf8); | |||
| 15 | 15 | ||||||
| 15 | 799 | ||||||
| 93 | 15 | 15 | 5971 | use URI::Split qw(); | |||
| 15 | 6332 | ||||||
| 15 | 276 | ||||||
| 94 | 15 | 15 | 69 | use Scalar::Util 'blessed'; | |||
| 15 | 13 | ||||||
| 15 | 44243 | ||||||
| 95 | |||||||
| 96 | my @fragments; | ||||||
| 97 | my $unacceptable_count; | ||||||
| 98 | my $local_unacceptable_count; | ||||||
| 99 | my $cdata_dirty; | ||||||
| 100 | my $in_cdata; | ||||||
| 101 | my $tag_leading_whitespace = qr/ | ||||||
| 102 | (?<=<) # Left bracket followed by | ||||||
| 103 | \s* # any amount of whitespace | ||||||
| 104 | (\/?) # optionally with a forward slash | ||||||
| 105 | \s* # and then more whitespace | ||||||
| 106 | /x; | ||||||
| 107 | |||||||
| 108 | =head1 FUNCTIONS | ||||||
| 109 | |||||||
| 110 | =head2 new | ||||||
| 111 | |||||||
| 112 | Create an HTML::Laundry object. | ||||||
| 113 | |||||||
| 114 | my $l = HTML::Laundry->new(); | ||||||
| 115 | |||||||
| 116 | Takes an optional anonymous hash of arguments: | ||||||
| 117 | |||||||
| 118 | =over 4 | ||||||
| 119 | |||||||
| 120 | =item * base_url | ||||||
| 121 | |||||||
| 122 | This turns relative URIs, as in C< >, into |
||||||
| 123 | absolute URIs, as for use in feed parsing. | ||||||
| 124 | |||||||
| 125 | my $l = HTML::Laundry->new({ base_uri => 'http://example.com/foo/' }); | ||||||
| 126 | |||||||
| 127 | |||||||
| 128 | =item * notidy | ||||||
| 129 | |||||||
| 130 | Disable use of HTML::Tidy or HTML::Tidy::libXML, even if | ||||||
| 131 | they are available on your system. | ||||||
| 132 | |||||||
| 133 | my $l = HTML::Laundry->new({ notidy => 1 }); | ||||||
| 134 | |||||||
| 135 | =back | ||||||
| 136 | |||||||
| 137 | =cut | ||||||
| 138 | |||||||
| 139 | sub new { | ||||||
| 140 | 25 | 25 | 1 | 4561 | my $self = {}; | ||
| 141 | 25 | 34 | my $class = shift; | ||||
| 142 | 25 | 28 | my $args = shift; | ||||
| 143 | |||||||
| 144 | 25 | 100 | 249 | if ( blessed $args ) { | |||
| 100 | |||||||
| 145 | 1 | 50 | 6 | if ( $args->isa('HTML::Laundry::Rules') ) { | |||
| 146 | 1 | 3 | $args = { rules => $args }; | ||||
| 147 | } | ||||||
| 148 | else { | ||||||
| 149 | 0 | 0 | $args = {}; | ||||
| 150 | } | ||||||
| 151 | } | ||||||
| 152 | elsif ( ref $args ne 'HASH' ) { | ||||||
| 153 | 4 | 4 | my $rules; | ||||
| 154 | { | ||||||
| 155 | 4 | 3 | local $@; | ||||
| 4 | 5 | ||||||
| 156 | 4 | 5 | eval { | ||||
| 157 | 4 | 100 | 51 | $args->isa('HTML::Laundry::Rules') | |||
| 158 | and $rules = $args->new; | ||||||
| 159 | }; | ||||||
| 160 | } | ||||||
| 161 | 4 | 100 | 11 | if ($rules) { | |||
| 162 | 1 | 10 | $args = { rules => $args }; | ||||
| 163 | } | ||||||
| 164 | else { | ||||||
| 165 | 3 | 4 | $args = {}; | ||||
| 166 | } | ||||||
| 167 | } | ||||||
| 168 | |||||||
| 169 | 25 | 50 | $self->{tidy} = undef; | ||||
| 170 | 25 | 34 | $self->{tidy_added_inline} = {}; | ||||
| 171 | 25 | 38 | $self->{tidy_added_empty} = {}; | ||||
| 172 | 25 | 129 | $self->{base_uri} = q{}; | ||||
| 173 | 25 | 31 | bless $self, $class; | ||||
| 174 | 25 | 55 | $self->clear_callback('start_tag'); | ||||
| 175 | 25 | 34 | $self->clear_callback('end_tag'); | ||||
| 176 | 25 | 33 | $self->clear_callback('uri'); | ||||
| 177 | 25 | 33 | $self->clear_callback('text'); | ||||
| 178 | 25 | 32 | $self->clear_callback('output'); | ||||
| 179 | $self->{parser} = HTML::Parser->new( | ||||||
| 180 | api_version => 3, | ||||||
| 181 | utf8_mode => 1, | ||||||
| 182 | 481 | 481 | 933 | start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
| 183 | 453 | 453 | 753 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
| 184 | 25 | 139 | 228 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ], | |||
| 139 | 267 | ||||||
| 185 | empty_element_tags => 1, | ||||||
| 186 | marked_sections => 1, | ||||||
| 187 | ); | ||||||
| 188 | $self->{cdata_parser} = HTML::Parser->new( | ||||||
| 189 | api_version => 3, | ||||||
| 190 | utf8_mode => 1, | ||||||
| 191 | 5 | 5 | 9 | start_h => [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
| 192 | 5 | 5 | 8 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
| 193 | 25 | 14 | 1682 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext' ], | |||
| 14 | 23 | ||||||
| 194 | empty_element_tags => 1, | ||||||
| 195 | unbroken_text => 1, | ||||||
| 196 | marked_sections => 0, | ||||||
| 197 | ); | ||||||
| 198 | 25 | 1075 | $self->initialize($args); | ||||
| 199 | |||||||
| 200 | 25 | 100 | 58 | if ( !$args->{notidy} ) { | |||
| 201 | 8 | 12 | $self->_generate_tidy; | ||||
| 202 | } | ||||||
| 203 | 25 | 51 | return $self; | ||||
| 204 | } | ||||||
| 205 | |||||||
| 206 | =head2 initialize | ||||||
| 207 | |||||||
| 208 | Instantiates the Laundry object properties based on an | ||||||
| 209 | HTML::Laundry::Rules module. | ||||||
| 210 | |||||||
| 211 | =cut | ||||||
| 212 | |||||||
| 213 | sub initialize { | ||||||
| 214 | 25 | 25 | 1 | 32 | my ( $self, $args ) = @_; | ||
| 215 | |||||||
| 216 | # Set defaults | ||||||
| 217 | 25 | 33 | $self->{tidy_added_tags} = undef; | ||||
| 218 | 25 | 30 | $self->{tidy_empty_tags} = undef; | ||||
| 219 | 25 | 26 | $self->{trim_trailing_whitespace} = 1; | ||||
| 220 | 25 | 26 | $self->{trim_tag_whitespace} = 0; | ||||
| 221 | $self->{base_uri} = URI->new( $args->{base_uri} ) | ||||||
| 222 | 25 | 100 | 87 | if $args->{base_uri}; | |||
| 223 | 25 | 2381 | my $rules = $args->{rules}; | ||||
| 224 | 25 | 66 | 209 | $rules ||= HTML::Laundry::Rules::Default->new(); | |||
| 225 | |||||||
| 226 | 25 | 56 | $self->{ruleset} = $rules; | ||||
| 227 | |||||||
| 228 | # Initialize based on ruleset | ||||||
| 229 | 25 | 87 | $self->{acceptable_a} = $rules->acceptable_a(); | ||||
| 230 | 25 | 90 | $self->{acceptable_e} = $rules->acceptable_e(); | ||||
| 231 | 25 | 87 | $self->{empty_e} = $rules->empty_e(); | ||||
| 232 | 25 | 119 | $self->{unacceptable_e} = $rules->unacceptable_e(); | ||||
| 233 | 25 | 76 | $self->{uri_list} = $rules->uri_list(); | ||||
| 234 | 25 | 83 | $self->{allowed_schemes} = $rules->allowed_schemes(); | ||||
| 235 | 25 | 83 | $rules->finalize_initialization($self); | ||||
| 236 | |||||||
| 237 | 25 | 33 | return; | ||||
| 238 | } | ||||||
| 239 | |||||||
| 240 | =head2 add_callback | ||||||
| 241 | |||||||
| 242 | Adds a callback of type "start_tag", "end_tag", "text", "uri", or "output" to | ||||||
| 243 | the appropriate internal array. | ||||||
| 244 | |||||||
| 245 | $l->add_callback('start_tag', sub { | ||||||
| 246 | my ($laundry, $tagref, $attrhashref) = @_; | ||||||
| 247 | # Now, perform actions and return | ||||||
| 248 | }); | ||||||
| 249 | |||||||
| 250 | start_tag, end_tag, text, and uri callbacks that return false values will | ||||||
| 251 | suppress the return value of the element they are processing; this allows | ||||||
| 252 | additional checks to be done (for instance, images can be allowed only from | ||||||
| 253 | whitelisted source domains). | ||||||
| 254 | |||||||
| 255 | =cut | ||||||
| 256 | |||||||
| 257 | sub add_callback { | ||||||
| 258 | 21 | 21 | 1 | 4229 | my ( $self, $action, $ref ) = @_; | ||
| 259 | 21 | 50 | 41 | return if ( ref($ref) ne 'CODE' ); | |||
| 260 | 21 | 100 | 55 | if ($action eq q{start_tag}) { | |||
| 100 | |||||||
| 100 | |||||||
| 100 | |||||||
| 50 | |||||||
| 261 | 4 | 4 | push @{ $self->{start_tag_callback} }, $ref; | ||||
| 4 | 6 | ||||||
| 262 | } elsif ($action eq q{end_tag}) { | ||||||
| 263 | 4 | 4 | push @{ $self->{end_tag_callback} }, $ref; | ||||
| 4 | 9 | ||||||
| 264 | } elsif ($action eq q{text}) { | ||||||
| 265 | 6 | 5 | push @{ $self->{text_callback} }, $ref; | ||||
| 6 | 8 | ||||||
| 266 | } elsif ($action eq q{uri}) { | ||||||
| 267 | 4 | 4 | push @{ $self->{uri_callback} }, $ref; | ||||
| 4 | 9 | ||||||
| 268 | } elsif ($action eq q{output}) { | ||||||
| 269 | 3 | 3 | push @{ $self->{output_callback} }, $ref; | ||||
| 3 | 6 | ||||||
| 270 | } | ||||||
| 271 | 21 | 27 | return; | ||||
| 272 | } | ||||||
| 273 | |||||||
| 274 | =head2 clear_callback | ||||||
| 275 | |||||||
| 276 | Removes all callbacks of given type. | ||||||
| 277 | |||||||
| 278 | $l->clear_callback('start_tag'); | ||||||
| 279 | |||||||
| 280 | =cut | ||||||
| 281 | |||||||
| 282 | sub clear_callback { | ||||||
| 283 | 139 | 139 | 1 | 10737 | my ( $self, $action ) = @_; | ||
| 284 | 139 | 100 | 367 | if ($action eq q{start_tag}) { | |||
| 100 | |||||||
| 100 | |||||||
| 100 | |||||||
| 50 | |||||||
| 285 | 27 | 493 | 159 | $self->{start_tag_callback} = [ sub { 1; } ]; | |||
| 493 | 429 | ||||||
| 286 | } elsif ($action eq q{end_tag}) { | ||||||
| 287 | 27 | 467 | 67 | $self->{end_tag_callback} = [ sub { 1; } ]; | |||
| 467 | 375 | ||||||
| 288 | } elsif ($action eq q{text}) { | ||||||
| 289 | 29 | 143 | 85 | $self->{text_callback} = [ sub { 1; } ]; | |||
| 143 | 128 | ||||||
| 290 | } elsif ($action eq q{uri}) { | ||||||
| 291 | 29 | 58 | 74 | $self->{uri_callback} = [ sub { 1; } ]; | |||
| 58 | 52 | ||||||
| 292 | } elsif ($action eq q{output}) { | ||||||
| 293 | 27 | 462 | 73 | $self->{output_callback} = [ sub { 1; } ]; | |||
| 462 | 418 | ||||||
| 294 | } | ||||||
| 295 | 139 | 218 | return; | ||||
| 296 | } | ||||||
| 297 | |||||||
| 298 | =head2 clean | ||||||
| 299 | |||||||
| 300 | Cleans a snippet of HTML, using the ruleset and object creation options given | ||||||
| 301 | to the Laundry object. The snippet should be passed as a scalar. | ||||||
| 302 | |||||||
| 303 | $output1 = $l->clean( ' The X-rays were penetrating' ); |
||||||
| 304 | $output2 = $l->clean( $snippet ); | ||||||
| 305 | |||||||
| 306 | =cut | ||||||
| 307 | |||||||
| 308 | sub clean { | ||||||
| 309 | 462 | 462 | 1 | 93510 | my ( $self, $chunk, $args ) = @_; | ||
| 310 | 462 | 756 | $self->_reset_state(); | ||||
| 311 | 462 | 50 | 878 | if ( $self->{trim_tag_whitespace} ) { | |||
| 312 | 0 | 0 | $chunk =~ s/$tag_leading_whitespace/$1/gs; | ||||
| 313 | } | ||||||
| 314 | 462 | 392 | my $p = $self->{parser}; | ||||
| 315 | 462 | 342 | my $cp = $self->{cdata_parser}; | ||||
| 316 | 462 | 2894 | $p->parse($chunk); | ||||
| 317 | 462 | 100 | 33 | 1121 | if ( !$in_cdata && !$unacceptable_count ) { | ||
| 318 | 461 | 793 | $p->eof(); | ||||
| 319 | } | ||||||
| 320 | 462 | 50 | 33 | 820 | if ( $in_cdata && !$local_unacceptable_count ) { | ||
| 321 | 0 | 0 | $cp->eof(); | ||||
| 322 | } | ||||||
| 323 | 462 | 586 | my $output = $self->gen_output; | ||||
| 324 | 462 | 748 | $cp->eof(); # Clear buffer if we haven't already | ||||
| 325 | 462 | 100 | 516 | if ($cdata_dirty) { # Overkill to get out of CDATA parser state | |||
| 326 | $self->{parser} = HTML::Parser->new( | ||||||
| 327 | api_version => 3, | ||||||
| 328 | start_h => | ||||||
| 329 | 7 | 7 | 12 | [ sub { $self->_tag_start_handler(@_) }, 'tagname,attr' ], | |||
| 330 | 9 | 9 | 12 | end_h => [ sub { $self->_tag_end_handler(@_) }, 'tagname,attr' ], | |||
| 331 | 4 | 19 | 31 | text_h => [ sub { $self->_text_handler(@_) }, 'dtext,is_cdata' ], | |||
| 19 | 30 | ||||||
| 332 | empty_element_tags => 1, | ||||||
| 333 | marked_sections => 1, | ||||||
| 334 | ); | ||||||
| 335 | } | ||||||
| 336 | else { | ||||||
| 337 | 458 | 573 | $p->eof(); # Clear buffer if we haven't already | ||||
| 338 | } | ||||||
| 339 | 462 | 1717 | return $output; | ||||
| 340 | } | ||||||
| 341 | |||||||
| 342 | =head2 base_uri | ||||||
| 343 | |||||||
| 344 | Used to get or set the base_uri property, used in URI rebasing. | ||||||
| 345 | |||||||
| 346 | my $base_uri = $l->base_uri; # returns current base_uri | ||||||
| 347 | $l->base_uri(q{http://example.com}); # return 'http://example.com' | ||||||
| 348 | $l->base_uri(''); # unsets base_uri | ||||||
| 349 | |||||||
| 350 | =cut | ||||||
| 351 | |||||||
| 352 | sub base_uri { | ||||||
| 353 | 3 | 3 | 1 | 5 | my ( $self, $new_base ) = @_; | ||
| 354 | 3 | 100 | 66 | 13 | if ( defined $new_base and !ref $new_base ) { | ||
| 355 | 2 | 3 | $self->{base_uri} = $new_base; | ||||
| 356 | } | ||||||
| 357 | 3 | 12 | return $self->{base_uri}; | ||||
| 358 | } | ||||||
| 359 | |||||||
| 360 | sub _run_callbacks { | ||||||
| 361 | 1623 | 1623 | 1168 | my $self = shift; | |||
| 362 | 1623 | 1169 | my $action = shift; | ||||
| 363 | 1623 | 50 | 2081 | return unless $action; | |||
| 364 | 1623 | 1480 | my $type = $action . q{_callback}; | ||||
| 365 | 1623 | 1065 | for my $callback ( @{ $self->{$type} } ) { | ||||
| 1623 | 2329 | ||||||
| 366 | 1650 | 1879 | my $result = $callback->( $self, @_ ); | ||||
| 367 | 1650 | 100 | 12206 | return unless $result; | |||
| 368 | } | ||||||
| 369 | 1619 | 2335 | return 1; | ||||
| 370 | } | ||||||
| 371 | |||||||
| 372 | =head2 gen_output | ||||||
| 373 | |||||||
| 374 | Used to generate the final, XHTML output from the internal stack of text and | ||||||
| 375 | tag tokens. Generally meant to be used internally, but potentially useful for | ||||||
| 376 | callbacks that require a snapshot of what the output would look like | ||||||
| 377 | before the cleaning process is complete. | ||||||
| 378 | |||||||
| 379 | my $xhtml = $l->gen_output; | ||||||
| 380 | |||||||
| 381 | =cut | ||||||
| 382 | |||||||
| 383 | sub gen_output { | ||||||
| 384 | 462 | 462 | 1 | 342 | my $self = shift; | ||
| 385 | 462 | 50 | 590 | if ( !$self->_run_callbacks( q{output}, \@fragments ) ) { | |||
| 386 | 0 | 0 | return q{}; | ||||
| 387 | } | ||||||
| 388 | 462 | 663 | my $output = join '', @fragments; | ||||
| 389 | 462 | 50 | 708 | if ( $self->{tidy} ) { | |||
| 390 | 0 | 0 | 0 | if ( $self->{tidy_engine} eq q{HTML::Tidy} ) { | |||
| 0 | |||||||
| 391 | 0 | 0 | $output = $self->{tidy}->clean($output); | ||||
| 392 | 0 | 0 | $self->{tidy}->clear_messages; | ||||
| 393 | } | ||||||
| 394 | elsif ( $self->{tidy_engine} eq q{HTML::Tidy::libXML} ) { | ||||||
| 395 | my $clean | ||||||
| 396 | = $self->{tidy} | ||||||
| 397 | ->clean( $self->{tidy_head} . $output . $self->{tidy_foot}, | ||||||
| 398 | 0 | 0 | 'UTF-8', 1 ); | ||||
| 399 | 0 | 0 | $output = substr( $clean, length $self->{tidy_head} ); | ||||
| 400 | 0 | 0 | $output = substr( $output, 0, -1 * length $self->{tidy_foot} ); | ||||
| 401 | } | ||||||
| 402 | } | ||||||
| 403 | 462 | 50 | 633 | if ( $self->{trim_trailing_whitespace} ) { | |||
| 404 | 462 | 1049 | $output =~ s/\s+$//; | ||||
| 405 | } | ||||||
| 406 | 462 | 512 | return $output; | ||||
| 407 | } | ||||||
| 408 | |||||||
| 409 | =head2 empty_elements | ||||||
| 410 | |||||||
| 411 | Returns a list of the Laundry object's known empty elements: elements such | ||||||
| 412 | as which must not contain any children. |
||||||
| 413 | |||||||
| 414 | =cut | ||||||
| 415 | |||||||
| 416 | sub empty_elements { | ||||||
| 417 | 0 | 0 | 1 | 0 | my ( $self, $listref ) = @_; | ||
| 418 | 0 | 0 | 0 | if ($listref) { | |||
| 419 | 0 | 0 | my @list = @{$listref}; | ||||
| 0 | 0 | ||||||
| 420 | 0 | 0 | my %empty = map { ( $_, 1 ) } @list; | ||||
| 0 | 0 | ||||||
| 421 | 0 | 0 | $self->{empty_e} = \%empty; | ||||
| 422 | } | ||||||
| 423 | 0 | 0 | return keys %{ $self->{empty_e} }; | ||||
| 0 | 0 | ||||||
| 424 | } | ||||||
| 425 | |||||||
| 426 | =head2 remove_empty_element | ||||||
| 427 | |||||||
| 428 | Removes an element (or, if given an array reference, multiple elements) from | ||||||
| 429 | the "empty elements" list maintained by the Laundry object. | ||||||
| 430 | |||||||
| 431 | $l->remove_empty_element(['img', 'br']); # Let's break XHTML! | ||||||
| 432 | |||||||
| 433 | This will not affect the acceptable/unacceptable status of the elements. | ||||||
| 434 | |||||||
| 435 | =cut | ||||||
| 436 | |||||||
| 437 | sub remove_empty_element { | ||||||
| 438 | 4 | 4 | 1 | 286 | my ( $self, $new_e, $args ) = @_; | ||
| 439 | 4 | 4 | my $empty = $self->{empty_e}; | ||||
| 440 | 4 | 100 | 8 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 441 | 1 | 2 | foreach my $e ( @{$new_e} ) { | ||||
| 1 | 3 | ||||||
| 442 | 2 | 5 | $self->remove_empty_element( $e, $args ); | ||||
| 443 | } | ||||||
| 444 | } | ||||||
| 445 | else { | ||||||
| 446 | 3 | 4 | delete $empty->{$new_e}; | ||||
| 447 | } | ||||||
| 448 | 4 | 6 | return 1; | ||||
| 449 | } | ||||||
| 450 | |||||||
| 451 | =head2 acceptable_elements | ||||||
| 452 | |||||||
| 453 | Returns a list of the Laundry object's known acceptable elements, which will | ||||||
| 454 | not be stripped during the sanitizing process. | ||||||
| 455 | |||||||
| 456 | =cut | ||||||
| 457 | |||||||
| 458 | sub acceptable_elements { | ||||||
| 459 | 4 | 4 | 1 | 284 | my ( $self, $listref ) = @_; | ||
| 460 | 4 | 100 | 15 | if ( ref($listref) eq 'ARRAY' ) { | |||
| 461 | 1 | 1 | my @list = @{$listref}; | ||||
| 1 | 3 | ||||||
| 462 | 1 | 2 | my %acceptable = map { ( $_, 1 ) } @list; | ||||
| 5 | 8 | ||||||
| 463 | 1 | 3 | $self->{acceptable_e} = \%acceptable; | ||||
| 464 | } | ||||||
| 465 | 4 | 10 | return keys %{ $self->{acceptable_e} }; | ||||
| 4 | 61 | ||||||
| 466 | } | ||||||
| 467 | |||||||
| 468 | =head2 add_acceptable_element | ||||||
| 469 | |||||||
| 470 | Adds an element (or, if given an array reference, multiple elements) to the | ||||||
| 471 | "acceptable elements" list maintained by the Laundry object. Items added in | ||||||
| 472 | this manner will automatically be removed from the "unacceptable elements" | ||||||
| 473 | list if they are present. | ||||||
| 474 | |||||||
| 475 | $l->add_acceptable_element('style'); | ||||||
| 476 | |||||||
| 477 | Elements which are empty may be flagged as such with an optional argument. | ||||||
| 478 | If this flag is set, all elements provided by the call will be added to | ||||||
| 479 | the "empty element" list. | ||||||
| 480 | |||||||
| 481 | $l->add_acceptable_element(['applet', 'script'], { empty => 1 }); | ||||||
| 482 | |||||||
| 483 | =cut | ||||||
| 484 | |||||||
| 485 | sub add_acceptable_element { | ||||||
| 486 | 10 | 10 | 1 | 816 | my ( $self, $new_e, $args ) = @_; | ||
| 487 | 10 | 10 | my $acceptable = $self->{acceptable_e}; | ||||
| 488 | 10 | 9 | my $empty = $self->{empty_e}; | ||||
| 489 | 10 | 7 | my $unacceptable = $self->{unacceptable_e}; | ||||
| 490 | 10 | 100 | 22 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 491 | 2 | 3 | foreach my $e ( @{$new_e} ) { | ||||
| 2 | 5 | ||||||
| 492 | 4 | 11 | $self->add_acceptable_element( $e, $args ); | ||||
| 493 | } | ||||||
| 494 | } | ||||||
| 495 | else { | ||||||
| 496 | 8 | 10 | $acceptable->{$new_e} = 1; | ||||
| 497 | 8 | 100 | 15 | if ( $args->{empty} ) { | |||
| 50 | |||||||
| 498 | 4 | 7 | $empty->{$new_e} = 1; | ||||
| 499 | 4 | 50 | 8 | if ( $self->{tidy} ) { | |||
| 500 | 0 | 0 | $self->{tidy_added_inline}->{$new_e} = 1; | ||||
| 501 | 0 | 0 | $self->{tidy_added_empty}->{$new_e} = 1; | ||||
| 502 | 0 | 0 | $self->_generate_tidy; | ||||
| 503 | } | ||||||
| 504 | } | ||||||
| 505 | elsif ( $self->{tidy} ) { | ||||||
| 506 | 0 | 0 | $self->{tidy_added_inline}->{$new_e} = 1; | ||||
| 507 | 0 | 0 | $self->_generate_tidy; | ||||
| 508 | } | ||||||
| 509 | 8 | 7 | delete $unacceptable->{$new_e}; | ||||
| 510 | |||||||
| 511 | } | ||||||
| 512 | 10 | 14 | return 1; | ||||
| 513 | } | ||||||
| 514 | |||||||
| 515 | =head2 remove_acceptable_element | ||||||
| 516 | |||||||
| 517 | Removes an element (or, if given an array reference, multiple elements) to the | ||||||
| 518 | "acceptable elements" list maintained by the Laundry object. These items | ||||||
| 519 | (although not their child elements) will now be stripped during parsing. | ||||||
| 520 | |||||||
| 521 | $l->remove_acceptable_element(['img', 'h1', 'h2']); | ||||||
| 522 | $l->clean(q{The Day the World Turned Day-Glo}); |
||||||
| 523 | # returns 'The Day the World Turned Day-Glo' | ||||||
| 524 | |||||||
| 525 | =cut | ||||||
| 526 | |||||||
| 527 | sub remove_acceptable_element { | ||||||
| 528 | 16 | 16 | 1 | 16 | my ( $self, $new_e, $args ) = @_; | ||
| 529 | 16 | 17 | my $acceptable = $self->{acceptable_e}; | ||||
| 530 | 16 | 100 | 21 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 531 | 2 | 2 | foreach my $e ( @{$new_e} ) { | ||||
| 2 | 5 | ||||||
| 532 | 4 | 8 | $self->remove_acceptable_element( $e, $args ); | ||||
| 533 | } | ||||||
| 534 | } | ||||||
| 535 | else { | ||||||
| 536 | 14 | 17 | delete $acceptable->{$new_e}; | ||||
| 537 | } | ||||||
| 538 | 16 | 17 | return 1; | ||||
| 539 | } | ||||||
| 540 | |||||||
| 541 | =head2 unacceptable_elements | ||||||
| 542 | |||||||
| 543 | Returns a list of the Laundry object's unacceptable elements, which will be | ||||||
| 544 | stripped -- B |
||||||
| 545 | |||||||
| 546 | =cut | ||||||
| 547 | |||||||
| 548 | sub unacceptable_elements { | ||||||
| 549 | 3 | 3 | 1 | 4 | my ( $self, $listref ) = @_; | ||
| 550 | 3 | 100 | 7 | if ( ref($listref) eq 'ARRAY' ) { | |||
| 551 | 1 | 1 | my @list = @{$listref}; | ||||
| 1 | 3 | ||||||
| 552 | my %unacceptable | ||||||
| 553 | 1 | 2 | = map { $self->remove_acceptable_element($_); ( $_, 1 ); } @list; | ||||
| 5 | 7 | ||||||
| 5 | 7 | ||||||
| 554 | 1 | 3 | $self->{unacceptable_e} = \%unacceptable; | ||||
| 555 | } | ||||||
| 556 | 3 | 3 | return keys %{ $self->{unacceptable_e} }; | ||||
| 3 | 11 | ||||||
| 557 | } | ||||||
| 558 | |||||||
| 559 | =head2 add_unacceptable_element | ||||||
| 560 | |||||||
| 561 | Adds an element (or, if given an array reference, multiple elements) to the | ||||||
| 562 | "unacceptable elements" list maintained by the Laundry object. | ||||||
| 563 | |||||||
| 564 | $l->add_unacceptable_element(['h1', 'h2']); | ||||||
| 565 | $l->clean(q{The Day the World Turned Day-Glo}); |
||||||
| 566 | # returns null string | ||||||
| 567 | |||||||
| 568 | =cut | ||||||
| 569 | |||||||
| 570 | sub add_unacceptable_element { | ||||||
| 571 | 4 | 4 | 1 | 472 | my ( $self, $new_e, $args ) = @_; | ||
| 572 | 4 | 4 | my $unacceptable = $self->{unacceptable_e}; | ||||
| 573 | 4 | 100 | 6 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 574 | 1 | 1 | foreach my $e ( @{$new_e} ) { | ||||
| 1 | 2 | ||||||
| 575 | 2 | 4 | $self->add_unacceptable_element( $e, $args ); | ||||
| 576 | } | ||||||
| 577 | } | ||||||
| 578 | else { | ||||||
| 579 | 3 | 4 | $self->remove_acceptable_element($new_e); | ||||
| 580 | 3 | 2 | $unacceptable->{$new_e} = 1; | ||||
| 581 | } | ||||||
| 582 | 4 | 5 | return 1; | ||||
| 583 | } | ||||||
| 584 | |||||||
| 585 | =head2 remove_unacceptable_element | ||||||
| 586 | |||||||
| 587 | Removes an element (or, if given an array reference, multiple elements) from | ||||||
| 588 | the "unacceptable elements" list maintained by the Laundry object. Note that | ||||||
| 589 | this does not automatically add the element to the acceptable_element list. | ||||||
| 590 | |||||||
| 591 | $l->clean(q{}); | ||||||
| 592 | # returns null string | ||||||
| 593 | $l->remove_unacceptable_element( q{script} ); | ||||||
| 594 | $l->clean(q{}); | ||||||
| 595 | # returns "alert('!')" | ||||||
| 596 | |||||||
| 597 | =cut | ||||||
| 598 | |||||||
| 599 | sub remove_unacceptable_element { | ||||||
| 600 | 4 | 4 | 1 | 6 | my ( $self, $new_e, $args ) = @_; | ||
| 601 | 4 | 3 | my $unacceptable = $self->{unacceptable_e}; | ||||
| 602 | 4 | 100 | 7 | if ( ref($new_e) eq 'ARRAY' ) { | |||
| 603 | 1 | 1 | foreach my $a ( @{$new_e} ) { | ||||
| 1 | 2 | ||||||
| 604 | 2 | 7 | $self->remove_unacceptable_element( $a, $args ); | ||||
| 605 | } | ||||||
| 606 | } | ||||||
| 607 | else { | ||||||
| 608 | 3 | 4 | delete $unacceptable->{$new_e}; | ||||
| 609 | } | ||||||
| 610 | 4 | 5 | return 1; | ||||
| 611 | } | ||||||
| 612 | |||||||
| 613 | =head2 acceptable_attributes | ||||||
| 614 | |||||||
| 615 | Returns a list of the Laundry object's known acceptable attributes, which will | ||||||
| 616 | not be stripped during the sanitizing process. | ||||||
| 617 | |||||||
| 618 | =cut | ||||||
| 619 | |||||||
| 620 | sub acceptable_attributes { | ||||||
| 621 | 3 | 3 | 1 | 5 | my ( $self, $listref ) = @_; | ||
| 622 | 3 | 100 | 8 | if ( ref($listref) eq 'ARRAY' ) { | |||
| 623 | 1 | 1 | my @list = @{$listref}; | ||||
| 1 | 3 | ||||||
| 624 | 1 | 2 | my %acceptable = map { ( $_, 1 ) } @list; | ||||
| 3 | 8 | ||||||
| 625 | 1 | 3 | $self->{acceptable_a} = \%acceptable; | ||||
| 626 | } | ||||||
| 627 | 3 | 7 | return keys %{ $self->{acceptable_a} }; | ||||
| 3 | 39 | ||||||
| 628 | } | ||||||
| 629 | |||||||
| 630 | =head2 add_acceptable_attribute | ||||||
| 631 | |||||||
| 632 | Adds an attribute (or, if given an array reference, multiple attributes) to the | ||||||
| 633 | "acceptable attributes" list maintained by the Laundry object. | ||||||
| 634 | |||||||
| 635 | my $snippet = q{ "My dear Mr. Bennet," said his lady to |
||||||
| 636 | him one day, "have you heard that | ||||||
| 637 | Netherfield Park is let at last?" | ||||||
| 638 | }; | ||||||
| 639 | $l->clean( $snippet ); | ||||||
| 640 | # returns: | ||||||
| 641 | # "My dear Mr. Bennet," said his lady to him one day, |
||||||
| 642 | # "have you heard that Netherfield Park is let at | ||||||
| 643 | # last?" | ||||||
| 644 | $l->add_acceptable_attribute([austen:id, austen:footnote]); | ||||||
| 645 | $l->clean( $snippet ); | ||||||
| 646 | # returns: | ||||||
| 647 | # "My dear Mr. Bennet," said his lady to him |
||||||
| 648 | # one day, "have you heard that | ||||||
| 649 | # Netherfield Park is let at last?" | ||||||
| 650 | |||||||
| 651 | =cut | ||||||
| 652 | |||||||
| 653 | sub add_acceptable_attribute { | ||||||
| 654 | 4 | 4 | 1 | 791 | my ( $self, $new_a, $args ) = @_; | ||
| 655 | 4 | 5 | my $acceptable = $self->{acceptable_a}; | ||||
| 656 | 4 | 100 | 8 | if ( ref($new_a) eq 'ARRAY' ) { | |||
| 657 | 1 | 1 | foreach my $a ( @{$new_a} ) { | ||||
| 1 | 3 | ||||||
| 658 | 2 | 5 | $self->add_acceptable_attribute( $a, $args ); | ||||
| 659 | } | ||||||
| 660 | } | ||||||
| 661 | else { | ||||||
| 662 | 3 | 4 | $acceptable->{$new_a} = 1; | ||||
| 663 | } | ||||||
| 664 | 4 | 4 | return 1; | ||||
| 665 | } | ||||||
| 666 | |||||||
| 667 | =head2 remove_acceptable_attribute | ||||||
| 668 | |||||||
| 669 | Removes an attribute (or, if given an array reference, multiple attributes) | ||||||
| 670 | from the "acceptable attributes" list maintained by the Laundry object. | ||||||
| 671 | |||||||
| 672 | $l->clean(q{ plover }); |
||||||
| 673 | # returns ' plover ' |
||||||
| 674 | $l->remove_acceptable_element( q{id} ); | ||||||
| 675 | $l->clean(q{ plover }); |
||||||
| 676 | # returns ' plover |
||||||
| 677 | |||||||
| 678 | =cut | ||||||
| 679 | |||||||
| 680 | sub remove_acceptable_attribute { | ||||||
| 681 | 4 | 4 | 1 | 6 | my ( $self, $new_a, $args ) = @_; | ||
| 682 | 4 | 5 | my $acceptable = $self->{acceptable_a}; | ||||
| 683 | 4 | 100 | 9 | if ( ref($new_a) eq 'ARRAY' ) { | |||
| 684 | 1 | 2 | foreach my $a ( @{$new_a} ) { | ||||
| 1 | 2 | ||||||
| 685 | 2 | 6 | $self->remove_acceptable_attribute( $a, $args ); | ||||
| 686 | } | ||||||
| 687 | } | ||||||
| 688 | else { | ||||||
| 689 | 3 | 5 | delete $acceptable->{$new_a}; | ||||
| 690 | } | ||||||
| 691 | 4 | 5 | return 1; | ||||
| 692 | } | ||||||
| 693 | |||||||
| 694 | sub _generate_tidy { | ||||||
| 695 | 8 | 8 | 8 | my $self = shift; | |||
| 696 | 8 | 6 | my $param = shift; | ||||
| 697 | 8 | 12 | $self->_generate_html_tidy; | ||||
| 698 | 8 | 50 | 31 | if ( !$self->{tidy} ) { | |||
| 699 | 8 | 14 | $self->_generate_html_tidy_libxml; | ||||
| 700 | } | ||||||
| 701 | 8 | 22 | return; | ||||
| 702 | } | ||||||
| 703 | |||||||
| 704 | sub _generate_html_tidy_libxml { | ||||||
| 705 | 8 | 8 | 8 | my $self = shift; | |||
| 706 | { | ||||||
| 707 | 8 | 7 | local $@; | ||||
| 8 | 8 | ||||||
| 708 | 8 | 9 | eval { | ||||
| 709 | 8 | 1212 | require HTML::Tidy::libXML; | ||||
| 710 | 0 | 0 | $self->{tidy} = HTML::Tidy::libXML->new(); | ||||
| 711 | 0 | 0 | $self->{tidy_head} = q{ | ||||
| 712 | |||||||
| 713 | "http://www.w3.org/TR/ html1/DTD/ html1-transitional.dtd"> | ||||||
| 714 | }; | ||||||
| 715 | 0 | 0 | $self->{tidy_foot} = q{ | ||||
| 716 | }; | ||||||
| 717 | 0 | 0 | $self->{tidy_engine} = q{HTML::Tidy::libXML}; | ||||
| 718 | 0 | 0 | 1; | ||||
| 719 | }; | ||||||
| 720 | } | ||||||
| 721 | } | ||||||
| 722 | |||||||
| 723 | sub _generate_html_tidy { | ||||||
| 724 | 8 | 8 | 4 | my $self = shift; | |||
| 725 | { | ||||||
| 726 | 8 | 8 | local $@; | ||||
| 8 | 8 | ||||||
| 727 | 8 | 10 | eval { | ||||
| 728 | 8 | 1402 | require HTML::Tidy; | ||||
| 729 | 0 | 0 | $self->{tidy_ruleset} = $self->{ruleset}->tidy_ruleset; | ||||
| 730 | 0 | 0 | 0 | if ( keys %{ $self->{tidy_added_inline} } ) { | |||
| 0 | 0 | ||||||
| 731 | $self->{tidy_ruleset}->{new_inline_tags} | ||||||
| 732 | 0 | 0 | = join( q{,}, keys %{ $self->{tidy_added_inline} } ); | ||||
| 0 | 0 | ||||||
| 733 | } | ||||||
| 734 | 0 | 0 | 0 | if ( keys %{ $self->{tidy_added_empty} } ) { | |||
| 0 | 0 | ||||||
| 735 | $self->{tidy_ruleset}->{new_empty_tags} | ||||||
| 736 | 0 | 0 | = join( q{,}, keys %{ $self->{tidy_added_empty} } ); | ||||
| 0 | 0 | ||||||
| 737 | } | ||||||
| 738 | 0 | 0 | $self->{tidy} = HTML::Tidy->new( $self->{tidy_ruleset} ); | ||||
| 739 | 0 | 0 | $self->{tidy_engine} = q{HTML::Tidy}; | ||||
| 740 | 0 | 0 | 1; | ||||
| 741 | }; | ||||||
| 742 | } | ||||||
| 743 | } | ||||||
| 744 | |||||||
| 745 | sub _reset_state { | ||||||
| 746 | 462 | 462 | 370 | my ($self) = @_; | |||
| 747 | 462 | 603 | @fragments = (); | ||||
| 748 | 462 | 339 | $unacceptable_count = 0; | ||||
| 749 | 462 | 314 | $local_unacceptable_count = 0; | ||||
| 750 | 462 | 296 | $in_cdata = 0; | ||||
| 751 | 462 | 293 | $cdata_dirty = 0; | ||||
| 752 | 462 | 415 | return; | ||||
| 753 | } | ||||||
| 754 | |||||||
| 755 | sub _tag_start_handler { | ||||||
| 756 | 493 | 493 | 489 | my ( $self, $tagname, $attr ) = @_; | |||
| 757 | 493 | 100 | 814 | if ( !$self->_run_callbacks( q{start_tag}, \$tagname, $attr ) ) { | |||
| 758 | 1 | 8 | return; | ||||
| 759 | } | ||||||
| 760 | 492 | 100 | 685 | if ( !$in_cdata ) { | |||
| 761 | 487 | 369 | $cdata_dirty = 0; | ||||
| 762 | } | ||||||
| 763 | 492 | 385 | my @attributes; | ||||
| 764 | 492 | 360 | foreach my $k ( keys %{$attr} ) { | ||||
| 492 | 1029 | ||||||
| 765 | 259 | 100 | 513 | if ( $self->{acceptable_a}->{$k} ) { | |||
| 766 | 174 | 100 | 125 | if ( grep {/^$k$/} @{ $self->{uri_list}->{$tagname} } ) { | |||
| 151 | 753 | ||||||
| 174 | 391 | ||||||
| 767 | $self->_uri_handler( $tagname, \$k, \$attr->{$k}, | ||||||
| 768 | 58 | 166 | $self->{base_uri} ); | ||||
| 769 | } | ||||||
| 770 | |||||||
| 771 | # Allow uri handler to suppress insertion | ||||||
| 772 | 174 | 100 | 267 | if ($k) { | |||
| 773 | 157 | 346 | push @attributes, $k . q{="} . $attr->{$k} . q{"}; | ||||
| 774 | } | ||||||
| 775 | } | ||||||
| 776 | } | ||||||
| 777 | 492 | 666 | my $attributes = join q{ }, @attributes; | ||||
| 778 | 492 | 100 | 783 | if ( $self->{acceptable_e}->{$tagname} ) { | |||
| 779 | 376 | 100 | 493 | if ( $self->{empty_e}->{$tagname} ) { | |||
| 780 | 58 | 100 | 94 | if ($attributes) { | |||
| 781 | 19 | 22 | $attributes = $attributes . q{ }; | ||||
| 782 | } | ||||||
| 783 | 58 | 102 | push @fragments, "<$tagname $attributes/>"; | ||||
| 784 | } | ||||||
| 785 | else { | ||||||
| 786 | 318 | 100 | 412 | if ($attributes) { | |||
| 787 | 122 | 164 | $attributes = q{ } . $attributes; | ||||
| 788 | } | ||||||
| 789 | 318 | 493 | push @fragments, "<$tagname$attributes>"; | ||||
| 790 | } | ||||||
| 791 | } | ||||||
| 792 | else { | ||||||
| 793 | 116 | 100 | 190 | if ( $self->{unacceptable_e}->{$tagname} ) { | |||
| 794 | 24 | 100 | 35 | if ($in_cdata) { | |||
| 795 | 3 | 5 | $local_unacceptable_count += 1; | ||||
| 796 | } | ||||||
| 797 | else { | ||||||
| 798 | 21 | 48 | $unacceptable_count += 1; | ||||
| 799 | } | ||||||
| 800 | } | ||||||
| 801 | } | ||||||
| 802 | 492 | 1874 | return; | ||||
| 803 | } | ||||||
| 804 | |||||||
| 805 | sub _tag_end_handler { | ||||||
| 806 | 467 | 467 | 458 | my ( $self, $tagname ) = @_; | |||
| 807 | 467 | 100 | 589 | if ( !$self->_run_callbacks( q{end_tag}, \$tagname ) ) { | |||
| 808 | 1 | 4 | return; | ||||
| 809 | } | ||||||
| 810 | 466 | 100 | 606 | if ( !$in_cdata ) { | |||
| 811 | 463 | 366 | $cdata_dirty = 0; | ||||
| 812 | } | ||||||
| 813 | 466 | 100 | 726 | if ( $self->{acceptable_e}->{$tagname} ) { | |||
| 814 | 346 | 100 | 501 | if ( !$self->{empty_e}->{$tagname} ) { | |||
| 815 | 316 | 466 | push @fragments, "$tagname>"; | ||||
| 816 | } | ||||||
| 817 | } | ||||||
| 818 | else { | ||||||
| 819 | 120 | 100 | 168 | if ( $self->{unacceptable_e}->{$tagname} ) { | |||
| 820 | 30 | 100 | 34 | if ($in_cdata) { | |||
| 821 | 1 | 2 | $local_unacceptable_count -= 1; | ||||
| 822 | 1 | 50 | 2 | $local_unacceptable_count = 0 | |||
| 823 | if ( $local_unacceptable_count < 0 ); | ||||||
| 824 | } | ||||||
| 825 | else { | ||||||
| 826 | 29 | 26 | $unacceptable_count -= 1; | ||||
| 827 | 29 | 100 | 57 | $unacceptable_count = 0 if ( $unacceptable_count < 0 ); | |||
| 828 | } | ||||||
| 829 | } | ||||||
| 830 | } | ||||||
| 831 | 466 | 751 | return; | ||||
| 832 | } | ||||||
| 833 | |||||||
| 834 | sub _text_handler { | ||||||
| 835 | 172 | 172 | 196 | my ( $self, $text, $is_cdata ) = @_; | |||
| 836 | 172 | 100 | 100 | 321 | if ( $in_cdata && $local_unacceptable_count ) { | ||
| 837 | 1 | 3 | return; | ||||
| 838 | } | ||||||
| 839 | 171 | 100 | 272 | if ($unacceptable_count) { | |||
| 840 | 15 | 39 | return; | ||||
| 841 | } | ||||||
| 842 | 156 | 100 | 188 | if ($is_cdata) { | |||
| 843 | 13 | 16 | my $cp = $self->{cdata_parser}; | ||||
| 844 | 13 | 7 | $in_cdata = 1; | ||||
| 845 | 13 | 34 | $cp->parse($text); | ||||
| 846 | 13 | 100 | 20 | if ( !$local_unacceptable_count ) { | |||
| 847 | 11 | 30 | $cp->eof(); | ||||
| 848 | } | ||||||
| 849 | 13 | 8 | $cdata_dirty = 1; | ||||
| 850 | 13 | 12 | $in_cdata = 0; | ||||
| 851 | 13 | 29 | return; | ||||
| 852 | } | ||||||
| 853 | else { | ||||||
| 854 | 143 | 100 | 190 | if ( !$self->_run_callbacks( q{text}, \$text, $is_cdata ) ) { | |||
| 855 | 1 | 3 | return q{}; | ||||
| 856 | } | ||||||
| 857 | 142 | 297 | $text = encode_entities( $text, '<>&"' ); | ||||
| 858 | 142 | 6372 | $cdata_dirty = 0; | ||||
| 859 | } | ||||||
| 860 | 142 | 156 | push @fragments, $text; | ||||
| 861 | 142 | 403 | return; | ||||
| 862 | } | ||||||
| 863 | |||||||
| 864 | sub _uri_handler { | ||||||
| 865 | 58 | 58 | 72 | my ( $self, $tagname, $attr_ref, $value_ref, $base ) = @_; | |||
| 866 | 58 | 53 | my ( $attr, $value ) = ( ${$attr_ref}, ${$value_ref} ); | ||||
| 58 | 56 | ||||||
| 58 | 79 | ||||||
| 867 | 58 | 136 | $value =~ s/[`\x00-\x1f\x7f]+//g; | ||||
| 868 | 58 | 70 | $value =~ s/\ufffd//g; | ||||
| 869 | 58 | 189 | my $uri = URI->new($value); | ||||
| 870 | 58 | 617883 | $uri = $uri->canonical; | ||||
| 871 | 58 | 100 | 3175 | if ( !$self->_run_callbacks( q{uri}, $tagname, $attr, \$uri ) ) { | |||
| 872 | 1 | 2 | ${$attr_ref} = q{}; | ||||
| 1 | 2 | ||||||
| 873 | 1 | 3 | return undef; | ||||
| 874 | } | ||||||
| 875 | 57 | 100 | 66 | 218 | if ( $self->{allowed_schemes} and $uri->scheme ) { | ||
| 876 | 42 | 100 | 462 | unless ( $self->{allowed_schemes}->{ $uri->scheme } ) { | |||
| 877 | 16 | 111 | ${$attr_ref} = q{}; | ||||
| 16 | 19 | ||||||
| 878 | 16 | 37 | return undef; | ||||
| 879 | } | ||||||
| 880 | } | ||||||
| 881 | 41 | 100 | 387 | if ( $self->{base_uri} ) { | |||
| 882 | 8 | 39 | $uri = URI->new_abs( $uri->as_string, $self->{base_uri} ); | ||||
| 883 | } | ||||||
| 884 | 41 | 100 | 1279 | if ( $uri->scheme ) { # Not a local URI | |||
| 885 | 33 | 227 | my $host; | ||||
| 886 | { | ||||||
| 887 | 33 | 27 | local $@; | ||||
| 33 | 27 | ||||||
| 888 | 33 | 33 | eval { $host = $uri->host; }; | ||||
| 33 | 56 | ||||||
| 889 | } | ||||||
| 890 | 33 | 50 | 468 | if ($host) { | |||
| 891 | |||||||
| 892 | # We may need to manually unescape domain names | ||||||
| 893 | # to deal with issues like tinyarro.ws | ||||||
| 894 | 33 | 50 | my $utf8_host = $self->_decode_utf8($host); | ||||
| 895 | 33 | 49 | utf8::upgrade($utf8_host); | ||||
| 896 | 33 | 50 | 49 | if ( $uri->host ne $utf8_host ) { | |||
| 897 | |||||||
| 898 | # TODO: Optionally use Punycode in this case | ||||||
| 899 | |||||||
| 900 | 0 | 0 | 0 | 0 | if ( $uri->port and $uri->port == $uri->default_port ) { | ||
| 901 | 0 | 0 | $uri->port(undef); | ||||
| 902 | } | ||||||
| 903 | 0 | 0 | my $escaped_host = $self->_encode_utf8( $uri->host ); | ||||
| 904 | 0 | 0 | my $uri_str = $uri->canonical->as_string; | ||||
| 905 | 0 | 0 | $uri_str =~ s/$escaped_host/$utf8_host/; | ||||
| 906 | 0 | 0 | utf8::upgrade($uri_str); | ||||
| 907 | 0 | 0 | ${$value_ref} = $uri_str; | ||||
| 0 | 0 | ||||||
| 908 | 0 | 0 | return; | ||||
| 909 | } | ||||||
| 910 | } | ||||||
| 911 | } | ||||||
| 912 | 41 | 513 | ${$value_ref} = $uri->canonical->as_string; | ||||
| 41 | 1674 | ||||||
| 913 | 41 | 113 | return; | ||||
| 914 | } | ||||||
| 915 | |||||||
| 916 | sub _decode_utf8 { | ||||||
| 917 | 33 | 33 | 38 | my $self = shift; | |||
| 918 | 33 | 31 | my $orig = my $str = shift; | ||||
| 919 | 33 | 28 | $str =~ s/\%([0-9a-f]{2})/chr(hex($1))/egi; | ||||
| 0 | 0 | ||||||
| 920 | 33 | 50 | 141 | return $str if utf8::decode($str); | |||
| 921 | 0 | return $orig; | |||||
| 922 | } | ||||||
| 923 | |||||||
| 924 | sub _encode_utf8 { | ||||||
| 925 | 0 | 0 | my $self = shift; | ||||
| 926 | 0 | my $str = shift; | |||||
| 927 | 0 | my $highbit = qr/[^\w\$-_.+!*'(),]/; | |||||
| 928 | 0 | $str =~ s/($highbit)/ sprintf ("%%%02X", ord($1)) /ge; | |||||
| 0 | |||||||
| 929 | 0 | utf8::upgrade($str); | |||||
| 930 | 0 | return $str; | |||||
| 931 | } | ||||||
| 932 | |||||||
| 933 | =head1 SEE ALSO | ||||||
| 934 | |||||||
| 935 | There are a number of tools designed for sanitizing HTML, some of which | ||||||
| 936 | may be better suited than HTML::Laundry to particular circumstances. In | ||||||
| 937 | addition to L |
||||||
| 938 | L |
||||||
| 939 | solely for the purposes of sanitizing HTML from potential XSS attack vectors; | ||||||
| 940 | L |
||||||
| 941 | L |
||||||
| 942 | |||||||
| 943 | =head1 AUTHOR | ||||||
| 944 | |||||||
| 945 | Steve Cook, C<< |
||||||
| 946 | |||||||
| 947 | =head1 BUGS | ||||||
| 948 | |||||||
| 949 | Please report any bugs or feature requests on the GitHub page for this project, | ||||||
| 950 | http://github.com/snark/html-laundry. | ||||||
| 951 | |||||||
| 952 | =head1 ACKNOWLEDGMENTS | ||||||
| 953 | |||||||
| 954 | Thanks to Dave Cross and Vera Tobin. | ||||||
| 955 | |||||||
| 956 | =head1 SUPPORT | ||||||
| 957 | |||||||
| 958 | You can find documentation for this module with the perldoc command. | ||||||
| 959 | |||||||
| 960 | perldoc HTML::Laundry | ||||||
| 961 | |||||||
| 962 | =head1 COPYRIGHT & LICENSE | ||||||
| 963 | |||||||
| 964 | Copyright 2009 Six Apart, Ltd., all rights reserved. | ||||||
| 965 | |||||||
| 966 | This program is free software; you can redistribute it and/or modify it | ||||||
| 967 | under the same terms as Perl itself. | ||||||
| 968 | |||||||
| 969 | =cut | ||||||
| 970 | |||||||
| 971 | 1; # End of HTML::Laundry |