File Coverage

blib/lib/HTML/Detoxifier.pm
Criterion Covered Total %
statement 27 114 23.6
branch 0 74 0.0
condition 0 32 0.0
subroutine 9 11 81.8
pod 0 2 0.0
total 36 233 15.4


line stmt bran cond sub pod time code
1             # -----------------------------------------------------------------------------
2             # HTML::Detoxifier - strips harmful HTML from user input v0.02 - 03/01/2004
3             #
4             # Copyright (c) 2004 Patrick Walton
5             # but freely redistributable under the same terms as Perl itself.
6             # -----------------------------------------------------------------------------
7              
8             package HTML::Detoxifier;
9              
10 1     1   5647 use strict;
  1         1  
  1         32  
11 1     1   4 use warnings FATAL => 'all';
  1         1  
  1         34  
12 1     1   776 use HTML::TokeParser;
  1         11647  
  1         26  
13 1     1   6 use HTML::Entities;
  1         2  
  1         63  
14              
15 1     1   4 use base qw<Exporter>;
  1         2  
  1         353  
16             @HTML::Detoxifier::EXPORT_OK = qw(detoxify);
17              
18             $HTML::Detoxifier::VERSION = 0.01;
19              
20             =head1 NAME
21              
22             HTML::Detoxifier - practical module to strip harmful HTML
23              
24             =head1 SYNOPSIS
25              
26             use HTML::Detoxifier qw<detoxify>;
27            
28             my $clean_html = detoxify $html;
29            
30             my $cleaner_html = detoxify($html, disallow =>
31             [qw(dynamic images document)]);
32            
33             my $stripped_html = detoxify($html, disallow => [qw(everything)]);
34              
35             =head1 DESCRIPTION
36              
37             HTML::Detoxifier is a practical module to remove harmful tags from HTML input.
38             It's intended to be used for web sites that accept user input in the form of
39             HTML and then present that information in some form.
40              
41             Accepting all HTML from untrusted users is generally a very bad idea;
42             typically, all HTML should be run through some kind of filter before being
43             presented to end users. Cross-site scripting (XSS) vulnerabilities can run
44             rampant without a filter. The most common and obvious HTML vulnerability lies
45             in stealing users' login cookies through JavaScript.
46              
47             Unlike other modules, HTML::Detoxifier is intended to be a practical solution
48             that abstracts away all the specifics of whitelisting certain tags easily
49             and securely. Tags are divided into functional groups, each of which can be
50             disallowed or allowed as you wish. Additionally, HTML::Detoxifier knows how to
51             clean inline CSS; with HTML::Detoxifier, you can securely allow users to use
52             style sheets without allowing cross-site scripting vulnerabilities. (Yes, it is
53             possible to execute JavaScript from CSS!)
54              
55             In addition to this main purpose, HTML::Detoxifier cleans up some common
56             mistakes with HTML: all tags are closed, empty tags are converted to valid
57             XML (that is, with a trailing /), and images without ALT text as required in
58             HTML 4.0 are given a plain ALT tag. The module does its best to emit valid
59             XHTML 1.0; it even adds XML declarations and DOCTYPE elements where needed.
60              
61             =cut
62              
63 1         230 use constant TAG_GROUPS => {
64             links => {
65             a => undef,
66             area => undef,
67             link => undef,
68             map => undef
69             },
70             document => {
71             base => undef,
72             basefont => undef,
73             bdo => undef,
74             head => undef,
75             body => undef,
76             html => undef,
77             link => undef,
78             meta => undef,
79             style => undef,
80             title => undef
81             },
82             aesthetic => {
83             b => undef,
84             basefont => undef,
85             big => undef,
86             blink => undef,
87             em => undef,
88             h1 => undef,
89             h2 => undef,
90             h3 => undef,
91             h4 => undef,
92             h5 => undef,
93             h6 => undef,
94             i => undef,
95             kbd => undef,
96             marquee => undef,
97             pre => undef,
98             s => undef,
99             small => undef,
100             strike => undef,
101             strong => undef,
102             style => undef,
103             'sub' => undef,
104             sup => undef,
105             tt => undef,
106             u => undef,
107             var => undef
108             },
109             'size-changing' => {
110             big => undef,
111             h1 => undef,
112             h2 => undef,
113             h3 => undef,
114             h4 => undef,
115             h5 => undef,
116             h6 => undef,
117             small => undef,
118             style => undef,
119             'sub' => undef,
120             sup => undef
121             },
122             block => {
123             blockquote => undef,
124             br => undef,
125             code => undef,
126             div => undef,
127             dl => undef,
128             h1 => undef,
129             h2 => undef,
130             h3 => undef,
131             h4 => undef,
132             h5 => undef,
133             h6 => undef,
134             hr => undef,
135             li => undef,
136             marquee => undef,
137             ol => undef,
138             p => undef,
139             pre => undef,
140             q => undef,
141             samp => undef,
142             style => undef,
143             ul => undef
144             },
145             forms => {
146             button => undef,
147             fieldset => undef,
148             form => undef,
149             input => undef,
150             label => undef,
151             legend => undef,
152             optgroup => undef,
153             option => undef,
154             select => undef,
155             textarea => undef
156             },
157             layout => {
158             caption => undef,
159             col => undef,
160             colgroup => undef,
161             style => undef,
162             table => undef,
163             tbody => undef,
164             td => undef,
165             tfoot => undef,
166             th => undef,
167             thead => undef,
168             tr => undef
169             },
170             images => {
171             img => undef,
172             map => undef,
173             style => undef
174             },
175             annoying => {
176             marquee => undef,
177             blink => undef
178             },
179             dynamic => {
180             applet => undef,
181             embed => undef,
182             noscript => undef,
183             object => undef,
184             param => undef,
185             script => undef
186             },
187             misc => {
188             abbr => undef,
189             cite => undef,
190             dd => undef,
191             del => undef,
192             dfn => undef,
193             dt => undef,
194             span => undef
195             }
196 1     1   5 };
  1         1  
197              
198             =head1 HTML TAG GROUPS
199              
200             The following groups can be disallowed or allowed as you choose. Some tags are
201             present in more than one group. In these cases, I<every> group containing the
202             tag must be allowed, or the tag will be removed.
203              
204             =head2 everything
205              
206             All HTML.
207              
208             =head2 document
209              
210             Markup that defines the basic structure of a document (e.g. html, head, body).
211              
212             =head2 aesthetic
213              
214             Markup that alters the appearance of text (e.g. strong, strike, b, i, em).
215              
216             =head2 size-changing
217              
218             Markup that can alter the size of text (e.g. big, small).
219              
220             =head2 block
221              
222             Most block-level markup as defined in the HTML4 specification.
223              
224             =head2 comments
225              
226             HTML comments.
227              
228             =head2 forms
229              
230             Markup used to create fill-in forms.
231              
232             =head2 layout
233              
234             Markup that creates tables or otherwise controls page layout.
235              
236             =head2 images
237              
238             Markup that creates images.
239              
240             =head2 annoying
241              
242             Markup that creates "annoying" effects undesirable by the majority of web users
243             (marquee, blink).
244              
245             =head2 dynamic
246              
247             Markup that specifies JavaScript or some other embedded format (SVG, Flash,
248             Java, etc.) Possibly dangerous.
249              
250             =head2 misc
251              
252             Usually seldom-used, typically-harmless HTML tags that specify special types
253             of inline text. (e.g. abbr, dd, span).
254              
255             =cut
256              
257 1         89 use constant TAGS => {
258             a => undef,
259             abbr => undef,
260             acronym => undef,
261             address => undef,
262             applet => undef,
263             area => undef,
264             b => undef,
265             base => undef,
266             basefont => undef,
267             bdo => undef,
268             big => undef,
269             blink => undef,
270             blockquote => undef,
271             body => undef,
272             br => undef,
273             button => undef,
274             caption => undef,
275             cite => undef,
276             code => undef,
277             col => undef,
278             colgroup => undef,
279             dd => undef,
280             del => undef,
281             dfn => undef,
282             div => undef,
283             dl => undef,
284             dt => undef,
285             em => undef,
286             embed => undef,
287             fieldset => undef,
288             form => undef,
289             h1 => undef,
290             h2 => undef,
291             h3 => undef,
292             h4 => undef,
293             h5 => undef,
294             h6 => undef,
295             head => undef,
296             hr => undef,
297             html => undef,
298             i => undef,
299             img => undef,
300             input => undef,
301             ins => undef,
302             kbd => undef,
303             label => undef,
304             legend => undef,
305             li => undef,
306             link => undef,
307             map => undef,
308             marquee => undef,
309             meta => undef,
310             noscript => undef,
311             object => undef,
312             ol => undef,
313             optgroup => undef,
314             option => undef,
315             p => undef,
316             param => undef,
317             pre => undef,
318             q => undef,
319             s => undef,
320             samp => undef,
321             script => undef,
322             select => undef,
323             small => undef,
324             span => undef,
325             strike => undef,
326             strong => undef,
327             style => undef,
328             'sub' => undef,
329             sup => undef,
330             table => undef,
331             tbody => undef,
332             td => undef,
333             textarea => undef,
334             tfoot => undef,
335             th => undef,
336             thead => undef,
337             title => undef,
338             tr => undef,
339             tt => undef,
340             u => undef,
341             ul => undef,
342             var => undef
343 1     1   4 };
  1         1  
344              
345 1         46 use constant EMPTY_ELEMENTS => {
346             area => undef,
347             base => undef,
348             basefont => undef,
349             br => undef,
350             col => undef,
351             frame => undef,
352             hr => undef,
353             img => undef,
354             input => undef,
355             isindex => undef,
356             link => undef,
357             meta => undef,
358             param => undef
359 1     1   4 };
  1         1  
360              
361 1         1504 use constant STYLES_ALLOWED_IF => {
362             aesthetic => undef,
363             block => undef,
364             layout => undef,
365             'size-changing' => undef,
366             images => undef
367 1     1   4 };
  1         1  
368              
369             # -- Helper routine to do the common task of removing scripts from CSS --------
370              
371             sub remove_scripts_from_css
372             {
373 0     0 0   local $_ = shift;
374              
375             # This is fairly rough.
376 0           $_ = decode_entities $_;
377 0           s/[a-z]+script://gis;
378 0           s/\@import//gis;
379              
380 0           $_;
381             }
382              
383             # -- Now the actual detoxify routine ------------------------------------------
384              
385             =head1 INVOCATION
386              
387             detoxify(html, options)
388              
389             Call I<detoxify> to detoxify I<html> with the given I<options>. The most common
390             key in the I<options> hash is I<disallow>, which disallows certain features
391             of HTML. See above for the list of acceptable values. Pass a reference to an
392             array of strings specifying groups as the value of the I<disallow> key in the
393             optional I<options> hash. You may also specify I<allow_only>, which has the same syntax but
394             performs the reverse action: only the specified tag sets are allowed. If no
395             options are specified, dynamic content only is removed.
396              
397             If you want to detoxify a document in multiple stages, set the I<section>
398             key in the I<options> hash to the value 'first' on the first page and 'next'
399             on every subsequent page. This will postpone the tag closing mechanism until
400             you pass 'last' as the value to the I<section> key.
401              
402             =cut
403              
404             sub detoxify
405             {
406 0     0 0   local $_ = shift;
407 0           my $out = "";
408              
409 0           my $parser = new HTML::TokeParser(\$_);
410 0           our (@tagstack, @oldtagstacks);
411 0           my %opts = @_;
412 0           my $checkcss = 0;
413              
414 0 0 0       if (not exists $opts{section} or $opts{section} eq 'first') {
415             # Tag stack stacks?
416 0           push @oldtagstacks, [@tagstack];
417 0           @tagstack = ();
418             }
419              
420 0 0         if ($opts{allow_only}) {
    0          
421 0           my %allowed = map { $_, undef } @{$opts{allow_only}};
  0            
  0            
422 0           $opts{disallow} = { map { $_, undef } grep { not exists $allowed{$_} }
  0            
  0            
423 0           keys %{TAG_GROUPS()} }
424             } elsif ($opts{disallow}) {
425 0           $opts{disallow} = { map { $_, undef } @{$opts{disallow}} }
  0            
  0            
426             } else {
427 0           $opts{disallow} = { dynamic => undef }
428             }
429              
430 0           my $styles_allowed = 1;
431 0           foreach my $restriction (keys %{$opts{disallow}}) {
  0            
432 0 0         $styles_allowed = 0, last if exists STYLES_ALLOWED_IF->{$restriction}
433             }
434              
435 0           TOKEN: while (my $token = get_token $parser) {
436 0 0         if ($token->[0] eq 'S') {
    0          
    0          
    0          
437 0 0         next TOKEN if exists $opts{disallow}{everything};
438 0 0         next TOKEN unless exists TAGS->{lc $token->[1]};
439              
440 0           foreach my $restriction (keys %{$opts{disallow}}) {
  0            
441             next TOKEN if
442 0 0         exists TAG_GROUPS->{$restriction}{lc $token->[1]}
443             }
444              
445 0           my %attrs;
446 0           while (my ($key, $value) = each %{$token->[2]}) {
  0            
447 0 0         next unless $key =~ /^[a-z]/i;
448              
449 0 0         if (exists $opts{disallow}{dynamic}) {
450 0 0         next if $key =~ /^on/is;
451 0 0 0       next if lc($key) eq 'href' and
452             $value =~ /^[a-z]+?script:/is;
453             }
454              
455 0           $attrs{lc $key} = $value
456             }
457              
458             # As a special case, external style sheets must be disabled if
459             # dynamic content is disallowed.
460 0 0 0       next TOKEN if lc $token->[1] eq 'link' and (
      0        
461             exists $attrs{rel} && lc $attrs{rel} =~
462             /^\s*style\s*sheet\s*$/is or
463             exists $attrs{type} && lc $attrs{type} =~
464             m(^\s*text/css\s*$));
465              
466             # If this is a style declaration and dynamic content is
467             # disallowed, we need to flag it for checking.
468 0 0 0       $checkcss = 1 if lc $token->[1] eq 'style' and exists
469             $opts{disallow}{dynamic};
470              
471             # Add an ALT tag to images if it's needed.
472 0 0 0       $attrs{alt} = '[' .
      0        
      0        
473             (($attrs{src} =~ m{([^/.]*)\.[a-z]+$}gi)[0] or 'image') .
474             ']' if lc $token->[1] eq 'img' and $attrs{src} and not
475             $attrs{alt};
476              
477 0 0         if (not $styles_allowed) {
    0          
478 0 0         delete $attrs{style} if exists $attrs{style};
479 0 0         delete $attrs{class} if exists $attrs{class};
480 0 0         delete $attrs{id} if exists $attrs{id}
481             } elsif (exists $opts{disallow}{dynamic}) {
482 0 0         $attrs{style} = remove_scripts_from_css $attrs{style} if
483             $attrs{style}
484             }
485            
486 0 0         if (lc $token->[1] eq 'html') {
487             # Add a valid XML declaration and a doctype. HTML::Detoxifier
488             # converts everything to XHTML 1.0, so we might as well
489             # qualify it!
490              
491 0           $out = <<"ENDDECL" . $out;
492            
493            
494             ENDDECL
495              
496 0 0         $attrs{xmlns} = "http://www.w3.org/1999/xhtml"
497             unless $attrs{xmlns};
498 0 0         $attrs{lang} = "en-US" unless $attrs{lang};
499             }
500              
501 0           $out .= "<" . lc $token->[1];
502 0           while (my ($key, $value) = each %attrs) {
503 0           $value = encode_entities $value;
504 0           $out .= qq( $key="$value");
505             }
506              
507 0 0         if (exists EMPTY_ELEMENTS->{lc $token->[1]}) {
508 0           $out .= " />";
509             } else {
510 0           unshift @tagstack, $token->[1];
511 0           $out .= ">";
512             }
513             } elsif ($token->[0] eq 'E') {
514 0 0         next TOKEN unless exists TAGS->{lc $token->[1]};
515 0           foreach my $restriction (keys %{$opts{disallow}}) {
  0            
516             next TOKEN if
517 0 0         exists TAG_GROUPS->{$restriction}{lc $token->[1]}
518             }
519              
520 0           while (@tagstack) {
521 0           my $tag = shift @tagstack;
522 0           $out .= "</$tag>";
523 0 0         last if $tag eq lc $token->[1];
524             }
525              
526 0 0 0       $checkcss = 0 if lc $token->[1] eq 'style' and exists
527             $opts{disallow}{dynamic};
528             } elsif ($token->[0] eq 'T') {
529 0           local $_ = $token->[1];
530 0 0         $_ = remove_scripts_from_css $_ if $checkcss;
531            
532 0           $out .= $_;
533             } elsif ($token->[0] eq 'C') {
534 0           local $_ = $token->[1];
535 0 0         $_ = remove_scripts_from_css $_ if $checkcss;
536              
537 0           s/(?:<!--|-->)//g;
538              
539 0 0 0       $out .= "<!--$_-->" unless exists $opts{disallow}{comments}
540             or exists $opts{disallow}{everything};
541             }
542             }
543              
544 0 0 0       if (not exists $opts{section} or $opts{section} eq 'last') {
545 0           foreach my $unclosedtag (@tagstack) {
546 0           $out .= "</$unclosedtag>";
547             }
548              
549 0 0         @tagstack = @oldtagstacks ? @{pop @oldtagstacks} : ();
  0            
550             }
551              
552 0           $out;
553             }
554              
555             =head1 AUTHOR
556              
557             Patrick Walton
558              
559             =head1 SEE ALSO
560              
561             L, L, L, L
562              
563             =head1 COPYRIGHT
564              
565             Copyright (c) 2004 Patrick Walton. You may redistribute this module under the
566             same terms as Perl itself. For more information, see the appropriate LICENSE
567             file.
568              
569             =cut
570              
571             1;
572