File Coverage

blib/lib/HTML/TagFilter.pm
Criterion Covered Total %
statement 256 276 92.7
branch 124 160 77.5
condition 39 60 65.0
subroutine 44 49 89.8
pod 28 30 93.3
total 491 575 85.3


line stmt bran cond sub pod time code
1             package HTML::TagFilter;
2 4     4   91575 use strict;
  4         10  
  4         162  
3 4     4   18 use base qw(HTML::Parser);
  4         8  
  4         4198  
4 4     4   30771 use URI::Escape;
  4         7158  
  4         528  
5 4     4   30 use vars qw($VERSION);
  4         8  
  4         2948  
6              
7             $VERSION = '1.03';
8              
9             =head1 NAME
10              
11             HTML::TagFilter - A fine-grained html-filter, xss-blocker and mailto-obfuscator
12              
13             =head1 SYNOPSIS
14              
15             use HTML::TagFilter;
16             my $tf = new HTML::TagFilter;
17             my $clean_html = $tf->filter($dirty_html);
18            
19             # or
20            
21             my $tf = HTML::TagFilter->new(
22             allow=>{...},
23             deny=>{...},
24             log_rejects => 1,
25             strip_comments => 1,
26             echo => 1,
27             verbose => 1,
28             skip_xss_protection => 1,
29             skip_entification => 1,
30             skip_mailto_entification => 1,
31             xss_risky_attributes => [...],
32             xss_permitted_protocols => [...],
33             xss_allow_local_links => 1,
34             );
35            
36             # or
37              
38             my $tf = HTML::TagFilter->new(
39             on_finish_document =>sub {
40             return "\n

" . $self->report . "

\n";
41             },
42             );
43            
44             $tf->parse($some_html);
45             $tf->parse($more_html);
46             my $clean_html = $tf->output;
47             my $cleaning_summary = $tf->report;
48             my @tags_removed = $tf->report;
49             my $error_log = $tf->error;
50            
51             =head1 DESCRIPTION
52              
53             HTML::TagFilter is a subclass of HTML::Parser with a single purpose: it will remove unwanted html tags and attributes from a piece of text. It can act in a more or less fine-grained way - you can specify permitted tags, permitted attributes of each tag, and permitted values for each attribute in as much detail as you like.
54              
55             Tags which are not allowed are removed. Tags which are allowed are trimmed down to only the attributes which are allowed for each tag. It is possible to allow all or no attributes from a tag, or to allow all or no values for an attribute, and so on.
56              
57             The filter will also guard against cross-site scripting attacks and obfuscate any mailto:email addresses, unless you tell it not to.
58              
59             The original purpose for this was to screen user input. In that setting you'll often find that just using:
60              
61             my $tf = new HTML::TagFilter;
62             put_in_database($tf->filter($my_text));
63              
64             will do. However, it can also be used for display processes (eg text-only translation) or cleanup (eg removal of old javascript). In those cases you'll probably want to override the default rule set with a small number of denial rules.
65              
66             my $self = HTML::TagFilter->new(deny => {img => {'all'}});
67             print $tf->filter($my_text);
68              
69             Will strip out all images, for example, but leave everything else untouched.
70              
71             nb (faq #1) the filter only removes the tags themselves: all it does to text which is not part of a tag is to escape the s, to guard against false negatives and some common cross-site attacks.
72              
73             obPascal: Sorry about the incredibly long documentation, by the way. When I have time I'll make it shorter.
74              
75             =head1 CONFIGURATION: RULES
76              
77             Creating the rule set is fairly simple. You have three options:
78              
79             =head2 use the defaults
80              
81             which will produce safe but still formatted html, without tables, javascript or much else apart from inline text formatting, links and images.
82              
83             =head2 selectively override the defaults
84              
85             use the allow_tags and deny_tags methods to pass in one or more additional tag settings. eg:
86              
87             $self->allow_tags({ p => { class=> ['lurid','sombre','plain']} });
88             $self->deny_tags({ img => { all => [] });
89              
90             will mean that all attributes other than class="lurid|sombre|plain" will be removed from

tags, but the other default rules will remain unchanged. See below for more about how to specify rules.

91              
92             =head2 supply your own configuration
93              
94             To override the defaults completely, supply the constructor with some rules:
95              
96             my $self = HTML::TagFilter->new(
97             allow=>{ p => { class=> ['lurid','sombre','plain']} }
98             );
99              
100             In this case only the rules you supply will be applied: the defaults are ignored. You can achieve the same thing after construction by first clearing the rule set:
101              
102             my $self = HTML::TagFilter->new();
103             $self->clear_rules();
104             $self->allow_tags({ p => { align=> ['left','right','center']} });
105              
106             Future versions are intended to offer a more sophisticated rule system, allowing you to specify combinations of attributes, ranges for values and generally match names in a more fuzzy way.
107              
108             =head1 CONFIGURATION: BEHAVIOUR
109              
110             There are currently seven switches that will change the behaviour of the filter. They're supplied at construction time alongside any rules you care to specify. All of them default to 'off':
111              
112             my $tf = HTML::TagFilter->new(
113             log_rejects => 1,
114             strip_comments => 1,
115             echo => 1,
116             verbose => 1,
117             skip_xss_protection => 1,
118             skip_ltgt_entification => 1,
119             skip_mailto_entification => 1,
120             );
121            
122             =over 4
123              
124             =head3 log_rejects
125              
126             Set log to something true and the filter will keep a detailed log of all the tags it removes. The log can be retrieved by calling report(), which will return a summary in scalar context and a detailed AoH in list.
127              
128             =head3 echo
129              
130             Set echo to 1, or anything true, and the output of the filter will be sent straight to STDOUT. Otherwise the filter is silent until you call output().
131              
132             =head3 verbose
133              
134             Set verbose to 1, or anything true, and error messages will be output to STDERR as well as being stockpiled ready for a call to error().
135              
136             =head3 strip_comments
137              
138             Set strip_comments to 1 and comments will be stripped. If you don't, they won't.
139              
140             =head3 skip_xss_protection
141              
142             Unless you set skip_xss_protection to 1, the filter will postprocess some of its output to protect against common cross-site scripting attacks.
143              
144             It will entify any < and > in non-tag text, entify quotes in attribute values (the Parser will have unencoded them) and strip out values for vulnerable attributes if they don't look suitably like urls. By default these attributes are checked: src, lowsrc, href, background and cite. You can replace that list (not extend it) at any time:
145              
146             $self->xss_risky_attributes( qw(your list of attributes) );
147              
148             =head3 skip_ltgt_entification
149              
150             Disables the entification of < and > even if cross-site protection is on.
151              
152             =head3 skip_mailto_entification
153              
154             Unless you specify otherwise, any mailto:url seen by the filter is completely turned into html entities. will becomes will. This should defeat most email-harvesting software, but note that it has no effect on the text of your link, only its address. Links like wross@cpan.org are only partly obscured and should be avoided.
155              
156             =head3 other constructor parameters
157              
158             You can also supply values that will be used as default values for the methods of the same name:
159              
160             xss_risky_attributes
161             xss_permitted_protocols
162            
163             each of which expects a list of strings, and
164              
165             xss_allow_local_links
166              
167             which wants a single true or false value.
168              
169             =back
170              
171             =head1 RULES
172              
173             Each element is tested as it is encountered, in two stages:
174              
175             =over 4
176              
177             =head3 tag filter
178              
179             Just checks that this tag is permitted, and blocks the whole thing if not. Applied to both opening and closing tags.
180              
181             =head3 attribute filter
182              
183             Any tag that passes the tag filter will remain in the text, but the attribute filter will strip out of it any attributes that are not permitted, or which have values that are not permitted for that tag/attribute combination.
184              
185             =back
186              
187             =head2 format for rules
188              
189             There are two kinds of rule: permissions and denials. They work as you'd expect, and can coexist, but they're not quite symmetrical. Denial rules are intended to complement permission rules, so that they can provide a kind of compound 'unless'.
190              
191             * If there are any 'permission' rules, then everything that doesn't satisfy any of them is eliminated.
192              
193             * If there are any 'deny' rules, then anything that satisfies any of them is eliminated.
194              
195             * If there are both denial and permission rules, then everything either satisfies a denial rule or fails to satisfy any of the permission rules is eliminated.
196              
197             * If there is neither kind, we strip out everything just to be on the safe side.
198              
199             The two most likely setups are
200              
201             1. a full set of permission rules and maybe a couple of denial rules to eliminate pet hates.
202              
203             2. no permission rules at all and a small set of denial rules to remove particular tags.
204              
205             Rules are passed in as a HoHoL:
206              
207             { tag name->{attribute name}->[valuelist] }
208              
209             There are three reserved words: 'any and 'none' stand respectively for 'anything is permitted' and 'nothing is permitted', or if in denial: 'anything is removed' and 'nothing is removed'. 'all' is only used in denial rules and it indicates that the whole tag should be stripped out: see below for an explanation and some mumbled excuses.
210              
211             For example:
212              
213             $self->allow_tags({ p => { any => [] });
214              
215             Will permit

tags with any attributes. For clarity's sake it may be shortened to:

216              
217             $self->allow_tags({ p => { 'any' });
218              
219             but note that you'll get a warning about the odd number of hash elements if -w is on, and in the absence of the => the quotes are required. And
220              
221             $self->allow_tags({ p => { none => [] });
222              
223             Will allow

tags to remain in the text, but all attributes will be removed. The same rules apply at all levels in the tag/attribute/value hierarchy, so you can say things like:

224              
225             $self->allow_tags({ any => { align => [qw(left center right)] });
226             $self->allow_tags({ p => { align => ['any'] });
227              
228             =head2 examples
229              
230             To indicate that a link destination is ok and you don't mind what value it takes:
231              
232             $self->allow_tags({ a => { 'href' } });
233              
234             To limit the values an attribute can take:
235              
236             $self->allow_tags({ a => { class => [qw(big small middling)] } });
237              
238             To clear all permissions:
239              
240             $self->allow_tags({});
241              
242             To remove all onClicks from links but allow all targets:
243              
244             $self->allow_tags({ a => { onClick => ['none'], target => [], } });
245              
246             You can combine allows and denies to create 'unless' rules:
247              
248             $self->allow_tags({ a => { any => [] } });
249             $self->deny_tags({ a => { onClick => [] } });
250              
251             Will remove only the onClick attribute of a link, allowing everything else through. If this was your only purpose, you could achieve the same thing just with the denial rule and an empty permission set, but if there's other stuff going on then you probably need this combination.
252              
253             =head2 order of application
254              
255             denial rules are applied first. we take out whatever you specify in deny, then take out whatever you don't specify in allow, unless the allow set is empty, in which case we ignore it. If both sets are empty, no tags gets through.
256              
257             (We prefer to err on the side of less markup, but I expect this will be configurable soon.)
258              
259             =head2 oddities
260              
261             Only one deliberate one, so far. The main asymmetry between permission and denial rules is that from
262              
263             allow_tags->{ p => {...}}
264              
265             it follows that p tags are permitted, but the reverse is not true:
266              
267             deny_tags->{ p => {...}}
268              
269             doesn't imply that p tags are removed, just that the relevant attributes are removed from them. If you want to use a denial rule to eliminate a whole tag, you have to say so explicitly:
270              
271             deny_tags->{ p => {'all'}}
272              
273             will remove every

tag, whereas

274              
275             deny_tags->{ p => {'any'}}
276              
277             will just remove all the attributes from

tags. Not very pretty, I know. It's likely to change, but probably not until after we've invented a system for supplying rules in a more readable format.

278              
279             =cut
280              
281             sub allowed_by_default {
282             return {
283 8     8 0 383 h1 => { none => [] },
284             h2 => { none => [] },
285             h3 => { none => [] },
286             h4 => { none => [] },
287             h5 => { none => [] },
288             p => { none => [] },
289             a => { href => [], name => [], target => [] },
290             br => { clear => [qw(left right all)] },
291             ul =>{ type => [] },
292             li =>{ type => [] },
293             ol => { none => [] },
294             em => { none => [] },
295             i => { none => [] },
296             b => { none => [] },
297             strong => { none => [] },
298             tt => { none => [] },
299             pre => { none => [] },
300             code => { none => [] },
301             hr => { none => [] },
302             blockquote => { none => [] },
303             img => { src => [], height => [], width => [], alt => [], align => [] },
304             any => { align => [qw(left right center)] },
305             };
306             }
307              
308             sub denied_by_default {
309             return {
310 0     0 0 0 blink => { all => [] },
311             marquee => { all => [] },
312             any => { style => [], onMouseover => [], onClick => [], onMouseout => [], },
313             };
314             }
315              
316             sub new {
317 10     10 1 126 my $class = shift;
318 10         45 my $config = {@_};
319            
320 10         98 my $self = $class->SUPER::new(api_version => 3);
321              
322 10         413 $self->SUPER::handler(start => "filter_start", 'self, tagname, attr, attrseq');
323 10         44 $self->SUPER::handler(end => "filter_end", 'self, tagname');
324 10         42 $self->SUPER::handler(default => "clean_text", "self, text");
325 10 100       47 $self->SUPER::handler(comment => "") if delete $config->{strip_comments};
326            
327 10         39 $self->{_allows} = {};
328 10         22 $self->{_denies} = {};
329 10         20 $self->{_settings} = {};
330 10         19 $self->{_log} = ();
331 10         17 $self->{_error} = ();
332 10         20 $self->{_triggers} = ();
333              
334 10 100 100     82 $config->{allow} = allowed_by_default() unless $config->{allow} || $config->{deny};
335 10 50 66     55 $config->{deny} = denied_by_default() unless $config->{allow} || $config->{deny};
336              
337 10         52 $self->_add_trigger($_ => delete $config->{$_}) for (qw(on_construct on_start_document on_open_tag on_process_text on_close_tag on_finish_document));
338 10         42 $self->allow_tags(delete $config->{allow});
339 10         109 $self->deny_tags(delete $config->{deny});
340            
341 10 100       31 $self->{_settings}->{log} = 1 if delete $config->{log_rejects};
342 10 50       27 $self->{_settings}->{echo} = 1 if delete $config->{echo};
343 10 50       45 $self->{_settings}->{xss} = 1 unless delete $config->{skip_xss_protection};
344 10 100 66     69 $self->{_settings}->{ent} = 1 unless delete $config->{skip_entification} || delete $config->{skip_ltgt_entification};
345 10 100       50 $self->{_settings}->{mailto} = 1 unless delete $config->{skip_mailto_entification};
346 10 100       27 $self->{_settings}->{verbose} = 1 if delete $config->{verbose};
347 10         37 $self->_log_error("[warning] ignored unknown config field: $_") for keys %$config;
348            
349 10         38 $self->_call_trigger('on_construct');
350 10         33 return $self;
351             }
352              
353             sub _add_trigger {
354 60     60   85 my ($self, $point, $sub) = @_;
355 60 100 66     159 if ($sub && ref $sub eq 'CODE') {
356 5         30 $self->{_triggers}{$point} = $sub;
357             } else {
358 55         242 $self->{_triggers}{$point} = 1;
359 55         90 my $class = ref $self;
360 4     4   32 no strict ('refs');
  4         10  
  4         13592  
361 55     167   151 *{"HTML::TagFilter::$point"} = sub { return };
  55         362  
  167         503  
362             }
363             }
364              
365             sub _call_trigger {
366 190     190   350 my ($self, $point, @args) = @_;
367 190         339 my $sub = $self->{_triggers}{$point};
368 190 100 66     3808 if ( $sub && ref $sub eq 'CODE') {
    50          
369 22         25 my $response;
370 22         27 eval {
371 22         59 $response = &$sub($self, @args);
372             };
373 22 50       319 if ($@) {
    100          
374 0         0 $self->_log_error("[warning] $point callback failed: $@");
375             } elsif ($response) {
376 6         17 $self->add_to_output( $response );
377             }
378            
379             } elsif ($sub) {
380 168         410 $self->$point(@args);
381            
382             } else {
383 0         0 my ($package, $filename, $line) = caller;
384 0         0 $self->_log_error("[warning] unknown trigger point '$point' called at $package line $line");
385             }
386             }
387              
388             =head1 CALLBACKS
389              
390             Several trigger points are provided for the convenience of people who want to extend rather than replacing the normal behaviour of a tagfilter object. To use them, you just pass in a code reference with the appropriate name at construction time.
391              
392             The example below will maintain a stack of seen tags and make the filter repair tag nesting, so that any unclosed tags are closed in roughly the right place, and any unopened close tags are omitted:
393              
394             my $filter = HTML::TagFilter->new(
395             on_start_document => sub {
396             my ($self, $rawtext) = @_;
397             $self->{_tag_stack} = [];
398             return;
399             },
400             on_open_tag => sub {
401             my ($self, $tag, $attributes, $sequence) = @_;
402             push @{ $self->{_tag_stack} }, $$tag unless grep {$_ eq $$tag} qw(img br hr meta link);
403             return;
404             },
405             on_close_tag => sub {
406             my ($self, $tag) = @_;
407             unless (@{ $self->{_tag_stack} } && grep {$_ eq $$tag} @{ $self->{_tag_stack} }) {
408             undef ${ $tag };
409             return;
410             }
411             my @unclosed;
412             while (my $lasttag = pop @{ $self->{_tag_stack} }) {
413             return join '', map "", @unclosed if $lasttag eq $$tag;
414             push @unclosed, $lasttag;
415             }
416             },
417             on_finish_document => sub {
418             my ($self, $cleantext) = @_;
419             return join '', map "", reverse @{ $self->{_tag_stack} };
420             },
421             );
422              
423             You can also fill these trigger points in subclass: If no callback method is supplied, we will call the class method of the same (triggerpoint) name instead. In this class those methods do nothing, so you can selectively override them without affecting normal functionality. To change all tags to tags, for example:
424              
425             sub on_open_tag {
426             my ($self, $tag, $attributes, $sequence) = @_;
427             $$tag = 'strong' if $$tag eq 'b';
428             }
429              
430             sub on_close_tag {
431             my ($self, $tag) = @_;
432             $$tag = 'strong' if $$tag eq 'b';
433             }
434              
435             As you can see here The tag and attribute values are passed in as string references: changes you make in callback will change the tag itself.
436              
437             The available trigger points are:
438              
439             =head3 on_construct ()
440              
441             This is called during construction of a new TagFilter object, just before the constructed object is returned. It receives no arguments apart from the tagfilter object.
442              
443             =head3 on_start_document ( $text )
444              
445             This is called by the filter() method, and passed a reference to the text that is to be filtered. You can change the text, or return any values that should be prepended to output.
446              
447             =head3 on_open_tag ( $tagname, $attributes, $attribute_sequence )
448              
449             This is called by the filter_start() method, with is the checker of opening and single tags. It is passed the same variables as that method uses: the name of the tag, a hashref containing all its attributes and a listref holding attribute names in order.
450              
451             Together with on_close_tag, this hook is very useful for adding document-tidying functions like tag closure, or for more sophisticated logging than tagfilter provides by itself.
452              
453             =head3 on_process_text ( $text )
454              
455             We normally just translate disallowed characters in text blocks, but this method receives a reference to the text string, so you can do what you like with it.
456              
457             Note that if you just want to add more disallowed characters, you can just subclass character_map().
458              
459             =head3 on_close_tag ( $text )
460              
461             This is called by the filter_end() method, which is the checker of closing tags. It is passed the closing tag name.
462              
463             =head3 on_finish_document ( $text )
464              
465             This is called by the output() method. It receives no arguments, or we get the output a bit tangled up, but whatever you return will be appended to the final output string.
466              
467             =head1 METHODS
468              
469             For reference:
470              
471             =head3 HTML::TagFilter->new();
472              
473             If called without parameters, loads the default set. Otherwise loads the rules you supply. For the rule format, see above.
474              
475             =head2 FILTER METHODS
476              
477             These make up the main interface. You probably won't often need to call anything but filter().
478              
479             =head3 $tf->filter($html);
480              
481             Exactly equivalent to:
482              
483             $tf->parse($html);
484             $tf->output();
485              
486             but more useful, because it'll fit in a oneliner. eg:
487              
488             print $tf->filter( $pages{$_} ) for keys %pages;
489            
490             Note that calling filter() will clear anything that was waiting in the output buffer, and will clear the buffer again when it's finished. it's meant to be a one-shot operation and doesn't co-operate well. use parse() and output() if you want to daisychain.
491              
492             =cut
493              
494             sub filter {
495 24     24 1 774 my ($self, $text) = @_;
496 24 50       73 return unless $text;
497 24         62 $self->_call_trigger('on_start_document', \$text);
498 24         56 $self->{output} = '';
499 24         234 $self->parse($text);
500 24 50       98 return $self->output unless $self->{_settings}->{echo};
501             }
502              
503             =head3 parse($text);
504              
505             The parse method is inherited from HTML::Parser, but most of its ancillary methods are subclassed here and the output they normally print is kept for later. The other configuration options that HTML::Parser normally offers are not passed on, at the moment, nor can you override the handler definitions in this module.
506              
507             =head3 output()
508              
509             This will return and clear the output buffer. It will conclude the processing of your text, but you can of course pass a new piece of text to the same parser object and begin again.
510              
511             =cut
512              
513             sub output {
514 24     24 1 32 my $self = shift;
515 24         143 $self->eof;
516 24         51 $self->_call_trigger('on_finish_document');
517 24         46 my $output = $self->{output};
518 24 50       102 $self->_log_error("[warning] no output from filter") unless $output;
519 24         37 $self->{output} = '';
520 24         142 return $output;
521             }
522              
523             =head3 report()
524              
525             If called in list context, returns the array of rejected tag/attribute/value combinations.
526              
527             In scalar context returns a more or less readable summary. Returns () if logging not enabled. Clears the log.
528              
529             =cut
530              
531             sub report {
532 2     2 1 4 my $self = shift;
533 2 100       46 return () unless defined $self->{_log};
534 1         2 my @rejects = @{ $self->{_log} };
  1         3  
535 1         3 $self->{_log} = ();
536 1 50       3 return @rejects if wantarray;
537              
538 1         2 my $report = "The following tags and attributes have been stripped:\n";
539 1         3 for (@rejects) {
540 6 100       10 if ($_->{attribute}) {
541 2         7 $report .= $_->{attribute} . '="' . $_->{value} . '" from the tag <' . $_->{tag} . ">";
542 2 50       7 $report .= "(url disallowed)" if $_->{reason} eq 'url';
543 2         3 $report .= "\n";
544             } else {
545 4         11 $report .= '<' . $_->{tag} . ">\n";
546             }
547             }
548 1         8 return $report;
549             }
550              
551             =head3 filter_start($tag, $attributes_hashref, $attribute_sequence_listref);
552              
553             This is the handler for html start tags: it checks the tag against the current set of rules, then checks each attribute and its value. Any text that fails is stripped out: the rest is passed to output.
554              
555             =cut
556              
557             sub filter_start {
558 34     34 1 58 my ($self, $tagname, $attributes, $attribute_sequence) = @_;
559 34 100       100 return unless $self->tag_ok(lc($tagname));
560 30         86 $self->_call_trigger('on_open_tag', \$tagname, $attributes, $attribute_sequence);
561 30 50       62 return unless $tagname;
562            
563 30         61 for (@$attribute_sequence) {
564 18         59 my @data = (lc($tagname), lc($_), lc($attributes->{$_})); # (tag, attribute, value)
565 18 100 100     46 delete $attributes->{$_} unless $self->attribute_ok(@data) && $self->url_ok(@data);
566             }
567 30         93 my $surviving_attributes = join('', map { " $_=\"" . $self->_xss_clean_attribute($attributes->{$_}, $_) . '"' } grep { defined $attributes->{$_} } @$attribute_sequence);
  9         32  
  18         49  
568 30         99 $self->add_to_output("<$tagname$surviving_attributes>");
569             }
570              
571             =head3 filter_end($tag);
572              
573             This is the handler for html end tags: it checks the tag against the current set of rules, and passes it to output if it's ok.
574              
575             =cut
576              
577             sub filter_end {
578 27     27 1 47 my ($self, $tagname) = @_;
579 27 100       263 return unless $self->tag_ok(lc($tagname));
580 23         62 $self->_call_trigger('on_close_tag', \$tagname);
581 23 100       62 return unless $tagname;
582 21         62 $self->add_to_output( "" );
583             }
584              
585             =head3 clean_text($text);
586              
587             This is the handler for text: anything which is not tag is passed through here before being passed to output. At the moment it only applies some very simple cross-site protection: subclassing this method is an easy way to modify just the text part of your page.
588              
589             =cut
590              
591             sub clean_text {
592 79     79 1 118 my ($self, $text) = @_;
593 79         167 $self->_call_trigger('on_process_text', \$text);
594 79         217 $self->add_to_output($self->_xss_clean_text($text));
595             }
596              
597             sub _xss_clean_text {
598 79     79   130 my ($self, $text) = @_;
599 79 50       203 return $text unless $self->{_settings}->{xss};
600 79 100       196 return $text unless $self->{_settings}->{ent};
601 73         136 return $self->_clean_text($text);
602             }
603              
604             sub _clean_text {
605 73     73   98 my ($self, $text) = @_;
606 73         298 my $filter = $self->character_map;
607 73         2566 $text =~ s/$_/$$filter{$_}/gs for keys %$filter;
608 73         451 return $text;
609             }
610              
611             =head3 character_map($text);
612              
613             Returns a hashref of {disallowed_character => replacement_character} for use when cleaning text blocks.
614              
615             =cut
616              
617             sub character_map {
618 82     82 1 93 my $self = shift;
619 82 50       168 return $self->{_settings}->{charmap} = $_[0] if @_;
620 82 100       428 return $self->{_settings}->{charmap} if exists $self->{_settings}->{charmap};
621 9         58 return $self->{_settings}->{charmap} = {
622             '"' => '"',
623             "'" => ''',
624             '>' => '>',
625             '<' => '<',
626             };
627             }
628              
629             =head3 add_to_output($text);
630              
631             The supplied text is appended to the output buffer (or immediately printed, if echo is on).
632              
633             =cut
634              
635             sub add_to_output {
636 136     136 1 330 my $self = shift;
637 136 50 33     593 return unless @_ && defined $_[0];
638 136 50       314 if ($self->{_settings}->{echo}) {
639 0         0 print $_[0];
640             } else {
641 136         1075 $self->{output} .= $_[0];
642             }
643             }
644              
645             =head3 logging($boolean);
646              
647             This provides get-or-set access to the 'log' configuration parameter. Switching logging on or off during parsing will result in incomplete reports, of course.
648              
649             =cut
650              
651             sub logging {
652 17     17 1 53 my $self = shift;
653 17 50       37 $self->{_settings}->{log} = $_[0] if @_;
654 17         66 return $self->{_settings}->{log};
655             }
656              
657             =head3 log_denied($refused_tag);
658              
659             If logging is on, this method will append the supplied failure information to the log. The standard form for this is a hashref that will contain some or all of these keys: 'tag', 'attribute', 'value' and 'reason'.
660              
661             =cut
662              
663             sub log_denied {
664 17     17 1 27 my ($self, $bad_tag) = @_;
665 17 100       30 return unless $self->logging;
666 8         9 push @{ $self->{_log} } , $bad_tag;
  8         17  
667             }
668              
669             =head2 RULE CHECKERS
670              
671             Compare individual tags and attributes against the rule set currently in force. These simple methods are the core of tagfilter: most of the rest is configuration, and the filter methods are really just glue to connect these tests to HTML::Parser's progress through a document.
672              
673             =head3 tag_ok($tag);
674              
675             Returns true if the supplied tag name is allowed in the text. If not, returns false and logs the failure with the reason 'tag'.
676              
677             =cut
678              
679             sub tag_ok {
680 61     61 1 88 my ($self, $tagname) = @_;
681 61         123 my $ok = $self->_tag_ok($tagname);
682 61 100       142 $self->log_denied({tag => $tagname, reason => 'tag' }) unless $ok;
683 61         180 return $ok;
684             }
685              
686             sub _tag_ok {
687 61     61   78 my ($self, $tagname) = @_;
688 61 100 66     355 return 0 unless $tagname && $self->has_rules;
689 57 100       342 return 0 if $self->_check('_denies', 'attributes', $tagname, 'all');
690 55 100       101 return 1 unless $self->has_allow_rules;
691 49 100       118 return 1 if $self->_check('_allows', 'tags', $tagname);
692 2         3 return 0;
693             }
694              
695             =head3 attribute_ok($tag, $attribute);
696              
697             Returns true if it that attribute is allowed for that tag, and it is allowed to have the supplied value. If not, returns false and logs the failure with the reason 'attribute'.
698              
699             =cut
700              
701             sub attribute_ok {
702 18     18 1 31 my ($self, $tagname, $attribute, $value) = @_;
703 18         42 my $ok = $self->_attribute_ok( $tagname, $attribute, $value );
704 18 100       66 $self->log_denied({ tag => $tagname, attribute => $attribute, value => $value, reason => 'attribute' }) unless $ok;
705 18         83 return $ok;
706             }
707              
708             sub _attribute_ok {
709 18     18   26 my ($self, $tagname, $attribute, $value) = @_;
710 18 50 33     97 return 0 unless $tagname && $attribute && $self->has_rules;
      33        
711 18 100       40 return 0 if $self->_check('_denies','attributes', $tagname, 'any');
712 17 50       40 return 0 if $self->_check('_denies','values', $tagname, 'all',);
713 17 50       41 return 0 if $self->_check('_denies','values', $tagname, $attribute, 'any');
714 17 50       41 return 0 if $self->_check('_denies','values', $tagname, $attribute, $value);
715 17 50       34 return 1 unless $self->has_allow_rules;
716 17 50       39 return 1 if $self->_check('_allows','attributes', $tagname, 'any');
717 17 50       37 return 1 if $self->_check('_allows','values', 'any', $attribute, 'any');
718 17 50       39 return 1 if $self->_check('_allows','values', 'any', $attribute, $value);
719 17 100       35 return 1 if $self->_check('_allows','values', $tagname, $attribute, 'any');
720 5 100       11 return 1 if $self->_check('_allows','values', $tagname, $attribute, $value);
721 4         6 return 0;
722             }
723              
724             =head3 url_ok($tag, $attributes, $value);
725              
726             If xss protection is on, we check whether this attribute is a url field, and if it is we check that the url is a url (rather than a script tag or some other naughtiness). Failures are logged with the reason 'url'.
727              
728             =cut
729              
730             sub url_ok {
731 13     13 1 28 my ($self, $tagname, $attribute, $value) = @_;
732 13         29 my $ok = $self->_url_ok( $attribute, $value );
733 13 100       64 $self->log_denied({ tag => $tagname, attribute => $attribute, value => $value, reason => 'url' }) unless $ok;
734 13         81 return $ok;
735             }
736              
737             sub _url_ok {
738 13     13   21 my ($self, $attribute, $value) = @_;
739 13 50       60 return 1 unless $self->{_settings}->{xss};
740 13 100       27 return 1 unless $self->_is_risky($attribute);
741 7 100 66     28 return 1 if $self->xss_allow_local_links && ($value =~ /^\.*\//s || $value !~ /:/s);
      33        
742 6 100       19 return 1 if grep { $value =~ /^$_:/s } $self->xss_permitted_protocols;
  24         245  
743 4         11 return 0;
744             }
745              
746             # _xss_clean_attribute(): defends against very basic XSS attacks by entifying quote marks and <>
747              
748             sub _xss_clean_attribute {
749 9     9   15 my ($self, $text, $attribute) = @_;
750 9 50       33 return $text unless $self->{_settings}->{xss};
751 9         17 my $filter = $self->character_map;
752 9         237 $text =~ s/$_/$$filter{$_}/gs for keys %$filter;
753 9 100       31 return $self->_obfuscate_mailto($text) if $attribute eq 'href';
754 7         30 return $text;
755             }
756              
757             sub _is_risky {
758 13     13   15 my ($self, $attribute) = @_;
759 13         38 my %risky = map { $_ => 1 } $self->xss_risky_attributes;
  65         157  
760 13         66 return $risky{$attribute};
761             }
762              
763             # uri_escape is imported from URI::Escape
764              
765             sub _obfuscate_mailto {
766 2     2   6 my ($self, $address) = @_;
767 2 100       12 return $address unless $self->{_settings}->{mailto};
768 1 50       7 return $address unless $address =~ /^mailto:(.*)/;
769 1         8 my $garbled = join '', map { uri_escape($_, "\0-\377") } split //, $1;
  14         713  
770 1         52 return "mailto:$garbled";
771             }
772              
773             # _check(): a private function to test for a value buried deep in a HoHoHo
774             # without cluttering the place up with autovivification.
775              
776             sub _check {
777 248     248   286 my $self = shift;
778 248         258 my $field = shift;
779 248         613 my @russian_dolls = @_;
780 248 50       440 unless (@russian_dolls) {
781 0         0 $self->_log_error("[warning] _check: no keys supplied");
782 0         0 return 0;
783             }
784 248         390 my $deepref = $self->{$field};
785 248         332 for (@russian_dolls) {
786 467 50       826 unless (ref $deepref eq 'HASH') {
787 0         0 $self->_log_error("[error] _check: deepref not a hashref");
788 0         0 return 0;
789             }
790 467 100       1527 return 0 unless $deepref->{$_};
791 282         438 $deepref = $deepref->{$_};
792             }
793 63         235 return 1;
794             }
795              
796             =head2 configuration methods
797              
798             The configuration of the filter is held in a hash of hashes, usually referred to here as a hohoho as it usually has at least three levels. These methods expect to receive full or partial rule sets in the simplified form described above and merge them into - or drop them on top of - the active set.
799              
800             =head3 allow_tags($hashref)
801              
802             Takes a hashref of permissions and adds them to what we already have, replacing at the tag level where rules are already defined. In other words, you can add a tag to the existing set, but to add an attribute to an existing tag you have to specify the whole set of attribute permissions.
803              
804             If no rules are sent (eg an empty hashref, or nothing at all, or a non-hashref) this clears the permission rule set.
805              
806             =cut
807              
808             sub allow_tags {
809 10     10 1 16 my ($self, $tagset) = @_;
810 10 100 66     137 if ($tagset && ref $tagset eq 'HASH' && %$tagset) {
      66        
811 9         33 $self->_configurise('_allows', $tagset);
812             } else {
813 1         7 $self->{_allows} = {};
814             }
815 10         27 return 1;
816             }
817              
818             =head3 deny_tags($hashref)
819              
820             likewise but sets up (or clears) denial rules.
821              
822             =cut
823              
824             sub deny_tags {
825 10     10 1 13 my ($self, $tagset) = @_;
826 10 100 66     52 if ($tagset && ref $tagset eq 'HASH' && %$tagset) {
      66        
827 1         5 $self->_configurise('_denies', $tagset);
828             } else {
829 9         23 $self->{_denies} = {};
830             }
831 10         17 return 1;
832             }
833              
834             =head3 has_rules()
835              
836             Returns true only if either allow or deny rules have been defined.
837              
838             =cut
839              
840             sub has_rules {
841 79     79 1 88 my $self = shift;
842 79 100 100     303 return 1 if $self->has_allow_rules || $self->has_deny_rules;
843 4         15 return 0;
844             }
845              
846             =head3 has_allow_rules()
847              
848             Returns true if allow rules have been defined.
849              
850             =cut
851              
852             sub has_allow_rules {
853 151     151 1 152 my $self = shift;
854 151 100 66     357 return 1 if $self->{_allows} && %{ $self->{_allows} };
  151         1422  
855 19         70 return 0;
856             }
857              
858             =head3 has_deny_rules()
859              
860             Returns true if denial rules have been defined.
861              
862             =cut
863              
864             sub has_deny_rules {
865 13     13 1 17 my $self = shift;
866 13 100 66     33 return 1 if $self->{_denies} && %{ $self->{_denies} };
  13         105  
867 4         13 return 0;
868             }
869              
870             =head3 clear_rules()
871              
872             Clears the entire rule set ready for the supply of a new set. A filter with no rules will strip *all* html from supplied text, by the way.
873              
874             =cut
875              
876             sub clear_rules {
877 1     1 1 3 my $self = shift;
878 1         3 $self->{_allows} = {};
879 1         16 $self->{_denies} = {};
880             }
881              
882             # _configurise(): a private function that translates input rules into
883             # the bushy HoHoHo's we're using for lookup.
884              
885             sub _configurise {
886 10     10   19 my ($self, $field, $tagset) = @_;
887              
888 10 50       32 unless (ref $tagset eq 'HASH') {
889 0         0 $self->_log_error("[error] _configurise: supplied rules not a hashref");
890 0         0 return ();
891             }
892 10 50       36 $self->_log_error("[warning] _configurise: supplied rule set empty") unless keys %$tagset;
893              
894 10         49 TAG: foreach my $tag (keys %$tagset) {
895 179         372 $self->{$field}->{tags}->{$tag} = 1;
896            
897 179         172 ATT: foreach my $att (keys %{ $tagset->{$tag} }) {
  179         565  
898 227 100       443 if ($att eq 'none') {
899 128         268 $self->{$field}->{attributes}->{$tag} = {};
900 128         243 next TAG;
901             }
902 99         234 $self->{$field}->{attributes}->{$tag}->{$att} = 1;
903 99         489 $self->{$field}->{values}->{$tag}->{$att}->{any} = 1
904 99 100 66     249 unless defined( $tagset->{$tag}->{$att} ) && @{ $tagset->{$tag}->{$att} };
905 99         104 foreach my $val (@{ $tagset->{$tag}->{$att} }) {
  99         242  
906 51         207 $self->{$field}->{values}->{$tag}->{$att}->{$val} = 1;
907             }
908             }
909             }
910             }
911              
912             =head3 allows()
913              
914             Returns the full set of permissions as a HoHoho. Can't be set this way: just a utility function in case you want to either display the rule set, or get the whole thing so you can send it back to allow_tags in a modified form.
915              
916             =head3 denies()
917              
918             Likewise for denial rules.
919              
920             =cut
921              
922             sub allows {
923 0     0 1 0 my $self = shift;
924 0         0 return $self->{_allows};
925             }
926              
927             sub denies {
928 0     0 1 0 my $self = shift;
929 0         0 return $self->{_denies};
930             }
931              
932             =head2 XSS configuration
933              
934             Cross-site scripting attacks are invented or identified all the time. We'll try and stay up to date, but you may be more paranoid or up to date than us: if so, just override one or more of these methods.
935              
936             =head3 xss_risky_attributes( @list_of_attributes );
937              
938             Sets and returns a list of attributes that are considered to be urls, and should be checked for well-formedness.
939              
940             The default list is href, src, lowsrc, cite and background: any supplied values will be used to replace (not extend) this list.
941              
942             =cut
943              
944             sub xss_risky_attributes {
945 13     13 1 15 my $self = shift;
946 13 50       31 return @{ $self->{_xss_att} } = @_ if @_;
  0         0  
947 13 100       32 return @{ $self->{_xss_att} } if $self->{_xss_att};
  7         21  
948 6         10 return @{ $self->{_xss_att} } = qw(src href cite lowsrc background) ;
  6         39  
949             }
950              
951             =head3 xss_permitted_protocols( @list_of_prefixes );
952              
953             Sets and returns a list of protocols that are acceptable in attributes that we considered to be urls (ie they're in the list returned by C).
954              
955             The default list is http, https, ftp and mailto. Any supplied values will be used to replace (not extend) this list. Don't include the colon.
956              
957             =cut
958              
959             sub xss_permitted_protocols {
960 6     6 1 7 my $self = shift;
961 6 50       17 return @{ $self->{_xss_stems} } = @_ if @_;
  0         0  
962 6 100       18 return @{ $self->{_xss_stems} } if $self->{_xss_stems};
  2         7  
963 4         6 return @{ $self->{_xss_stems} } = qw(http https mailto ftp) ;
  4         17  
964             }
965              
966             =head3 xss_allow_local_links( $boolean );
967              
968             If this method returns a true value, then addresses that begin '/' or '../' will be accepted in url fields.
969              
970             You can set this value by calling the method with a parameter, as usual. The default is true.
971              
972             =cut
973              
974             sub xss_allow_local_links {
975 7     7 1 10 my $self = shift;
976 7 50       17 return $self->{_xss_local} = $_[0] if @_;
977 7 100       36 return $self->{_xss_local} if defined $self->{_xss_local};
978 5         60 return $self->{_xss_local} = 1;
979             }
980              
981             =head3 error()
982              
983             Returns an error report of currently dubious usefulness. If you want to record error messages in subclass, call $self->_add_error(@messages).
984              
985             There is no class-level error logging mechanism at the moment, which is why the usefulness of this is rather limited.
986              
987             =cut
988              
989             sub error {
990 1     1 1 240 my $self = shift;
991 1 50       5 return "HTML::TagFilter errors:\n" . join("\n", @{$self->{_error}}) if $self->{_error};
  1         5  
992 0         0 return '';
993             }
994              
995             # _log_error: append a message to the error log
996              
997             sub _log_error {
998 1     1   2 my $self = shift;
999 1         1 push @{ $self ->{_error} } , @_;
  1         4  
1000 1 50       7 warn @_ if $self->{_settings}->{verbose};
1001             }
1002              
1003             # handler() exists here only to admonish people who try to use this module as they would
1004             # HTML::Parser. The handler definitions in new() use SUPER::handler() to get around this.
1005              
1006             sub handler {
1007 0     0 1   die("You can't set handlers for HTML::TagFilter. Perhaps you should be using HTML::Parser directly?");
1008             }
1009              
1010             sub version {
1011 0     0 1   return $VERSION;
1012             }
1013              
1014             1;
1015              
1016             =head1 TO DO
1017              
1018             Make the documentation about half as long
1019              
1020             More sanity checks on incoming rules
1021              
1022             Simpler rule-definition interface
1023              
1024             Complex rules. The long term goal is that someone can supply a rule like "remove all images where height or width is missing" or "change all font tags where size="2" to . Which will be hard. For a start, HTML::Parser doesn't see paired start and close tags, which would be required for conditional actions.
1025              
1026             An option to speed up operations by working only at the tag level and using HTML::Parser's built-in screens.
1027              
1028             =head1 REQUIRES
1029              
1030             HTML::Parser
1031              
1032             =head1 SEE ALSO
1033              
1034             L
1035              
1036             =head1 AUTHOR
1037              
1038             William Ross, wross@cpan.org
1039              
1040             =head1 COPYRIGHT
1041              
1042             Copyright 2001-3 William Ross
1043              
1044             This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
1045              
1046             Please use https://rt.cpan.org/ to report bugs & omissions, describe cross-site attacks that get through, or suggest improvements.
1047              
1048             =cut