File Coverage

blib/lib/HTML/StripScripts.pm
Criterion Covered Total %
statement 366 382 95.8
branch 169 202 83.6
condition 44 55 80.0
subroutine 68 72 94.4
pod 31 31 100.0
total 678 742 91.3


line stmt bran cond sub pod time code
1             package HTML::StripScripts;
2 10     10   138283 use strict;
  10         13  
  10         271  
3 10     10   33 use warnings FATAL => 'all';
  10         11  
  10         386  
4              
5 10     10   39 use vars qw($VERSION);
  10         13  
  10         18755  
6             $VERSION = '1.06';
7              
8             =head1 NAME
9              
10             HTML::StripScripts - Strip scripting constructs out of HTML
11              
12             =head1 SYNOPSIS
13              
14             use HTML::StripScripts;
15              
16             my $hss = HTML::StripScripts->new({ Context => 'Inline' });
17              
18             $hss->input_start_document;
19              
20             $hss->input_start('');
21             $hss->input_text('hello, world!');
22             $hss->input_end('');
23              
24             $hss->input_end_document;
25              
26             print $hss->filtered_document;
27              
28             =head1 DESCRIPTION
29              
30             This module strips scripting constructs out of HTML, leaving as
31             much non-scripting markup in place as possible. This allows web
32             applications to display HTML originating from an untrusted source
33             without introducing XSS (cross site scripting) vulnerabilities.
34              
35             You will probably use L rather than using
36             this module directly.
37              
38             The process is based on whitelists of tags, attributes and attribute
39             values. This approach is the most secure against disguised scripting
40             constructs hidden in malicious HTML documents.
41              
42             As well as removing scripting constructs, this module ensures that
43             there is a matching end for each start tag, and that the tags are
44             properly nested.
45              
46             Previously, in order to customise the output, you needed to subclass
47             C and override methods. Now, most customisation
48             can be done through the C option provided to C. (See
49             examples/declaration/ and examples/tags/ for cases where subclassing is
50             necessary.)
51              
52             The HTML document must be parsed into start tags, end tags and
53             text before it can be filtered by this module. Use either
54             L or L instead
55             if you want to input an unparsed HTML document.
56              
57             See examples/direct/ for an example of how to feed tokens directly to
58             HTML::StripScripts.
59              
60             =head1 CONSTRUCTORS
61              
62             =over
63              
64             =item new ( CONFIG )
65              
66             Creates a new C filter object, bound to a
67             particular filtering policy. If present, the CONFIG parameter
68             must be a hashref. The following keys are recognized (unrecognized
69             keys will be silently ignored).
70              
71             $s = HTML::Stripscripts->new({
72             Context => 'Document|Flow|Inline|NoTags',
73             BanList => [qw( br img )] | {br => '1', img => '1'},
74             BanAllBut => [qw(p div span)],
75             AllowSrc => 0|1,
76             AllowHref => 0|1,
77             AllowRelURL => 0|1,
78             AllowMailto => 0|1,
79             EscapeFiltered => 0|1,
80             Rules => { See below for details },
81             });
82              
83             =over
84              
85             =item C
86              
87             A string specifying the context in which the filtered document
88             will be used. This influences the set of tags that will be
89             allowed.
90              
91             If present, the C value must be one of:
92              
93             =over
94              
95             =item C
96              
97             If C is C then the filter will allow a full
98             HTML document, including the C tag and C and C
99             sections.
100              
101             =item C
102              
103             If C is C then most of the cosmetic tags that one
104             would expect to find in a document body are allowed, including
105             lists and tables but not including forms.
106              
107             =item C
108              
109             If C is C then only inline tags such as C
110             and C are allowed.
111              
112             =item C
113              
114             If C is C then no tags are allowed.
115              
116             =back
117              
118             The default C value is C.
119              
120             =item C
121              
122             If present, this option must be an arrayref or a hashref. Any tag that
123             would normally be allowed (because it presents no XSS hazard) will be
124             blocked if the lowercase name of the tag is in this list.
125              
126             For example, in a guestbook application where C
tags are used to
127             separate posts, you may wish to prevent posts from including C
128             tags, even though C
is not an XSS risk.
129              
130             =item C
131              
132             If present, this option must be reference to an array holding a list of
133             lowercase tag names. This has the effect of adding all but the listed
134             tags to the ban list, so that only those tags listed will be allowed.
135              
136             =item C
137              
138             By default, the filter won't allow constructs that cause the browser to
139             fetch things automatically, such as C attributes in C tags.
140             If this option is present and true then those constructs will be
141             allowed.
142              
143             =item C
144              
145             By default, the filter won't allow constructs that cause the browser to
146             fetch things if the user clicks on something, such as the C
147             attribute in C tags. Set this option to a true value to allow this
148             type of construct.
149              
150             =item C
151              
152             By default, the filter won't allow relative URLs such as C<../foo.html>
153             in C and C attribute values. Set this option to a true value
154             to allow them. C and / or C also need to be set to true
155             for this to have any effect.
156              
157             =item C
158              
159             By default, C links are not allowed. If C is set to
160             a true value, then this construct will be allowed. This can be enabled
161             separately from AllowHref.
162              
163             =item C
164              
165             By default, any filtered tags are outputted as C<< >>. If
166             C is set to a true value, then the filtered tags are converted
167             to HTML entities.
168              
169             For instance:
170              
171            
--> <br>
172              
173             =item C
174              
175             The C option provides a very flexible way of customising the filter.
176              
177             The focus is safety-first, so it is applied after all of the previous validation.
178             This means that you cannot all malicious data should already have been cleared.
179              
180             Rules can be specified for tags and for attributes. Any tag or attribute
181             not explicitly listed will be handled by the default C<*> rules.
182              
183             The following is a synopsis of all of the options that you can use to
184             configure rules. Below, an example is broken into sections and explained.
185              
186             Rules => {
187              
188             tag => 0 | 1 | sub { tag_callback }
189             | {
190             attr => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback},
191             '*' => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback},
192             required => [qw(attrname attrname)],
193             tag => sub { tag_callback }
194             },
195              
196             '*' => 0 | 1 | sub { tag_callback }
197             | {
198             attr => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback},
199             '*' => 0 | 1 | 'regex' | qr/regex/ | sub { attr_callback},
200             tag => sub { tag_callback }
201             }
202              
203             }
204              
205             EXAMPLE:
206              
207             Rules => {
208              
209             ##########################
210             ##### EXPLICIT RULES #####
211             ##########################
212              
213             ## Allow
tags, reject tags
214             br => 1,
215             img => 0,
216              
217             ## Send all
tags to a sub
218             div => sub { tag_callback },
219              
220             ## Allow
tags,and allow the 'cite' attribute
221             ## All other attributes are handled by the default C<*>
222             blockquote => {
223             cite => 1,
224             },
225              
226             ## Allow tags, and
227             a => {
228              
229             ## Allow the 'title' attribute
230             title => 1,
231              
232             ## Allow the 'href' attribute if it matches the regex
233             href => '^http://yourdomain.com'
234             OR href => qr{^http://yourdomain.com},
235              
236             ## 'style' attributes are handled by a sub
237             style => sub { attr_callback },
238              
239             ## All other attributes are rejected
240             '*' => 0,
241              
242             ## Additionally, the tag should be handled by this sub
243             tag => sub { tag_callback},
244              
245             ## If the tag doesn't have these attributes, filter the tag
246             required => [qw(href title)],
247              
248             },
249              
250             ##########################
251             ##### DEFAULT RULES #####
252             ##########################
253              
254             ## The default '*' rule - accepts all the same options as above.
255             ## If a tag or attribute is not mentioned above, then the default
256             ## rule is applied:
257              
258             ## Reject all tags
259             '*' => 0,
260              
261             ## Allow all tags and all attributes
262             '*' => 1,
263              
264             ## Send all tags to the sub
265             '*' => sub { tag_callback },
266              
267             ## Allow all tags, reject all attributes
268             '*' => { '*' => 0 },
269              
270             ## Allow all tags, and
271             '*' => {
272              
273             ## Allow the 'title' attribute
274             title => 1,
275              
276             ## Allow the 'href' attribute if it matches the regex
277             href => '^http://yourdomain.com'
278             OR href => qr{^http://yourdomain.com},
279              
280             ## 'style' attributes are handled by a sub
281             style => sub { attr_callback },
282              
283             ## All other attributes are rejected
284             '*' => 0,
285              
286             ## Additionally, all tags should be handled by this sub
287             tag => sub { tag_callback},
288              
289             },
290              
291             =over
292              
293             =item Tag Callbacks
294              
295             sub tag_callback {
296             my ($filter,$element) = (@_);
297              
298             $element = {
299             tag => 'tag',
300             content => 'inner_html',
301             attr => {
302             attr_name => 'attr_value',
303             }
304             };
305             return 0 | 1;
306             }
307              
308             A tag callback accepts two parameters, the C<$filter> object and the C$element>.
309             It should return C<0> to completely ignore the tag and its content (which includes
310             any nested HTML tags), or C<1> to accept and output the tag.
311              
312             The C<$element> is a hash ref containing the keys:
313              
314             =item C
315              
316             This is the tagname in lowercase, eg C, C
, C. If you set
317             the tag value to an empty string, then the tag will not be outputted, but
318             the tag contents will.
319              
320             =item C
321              
322             This is the equivalent of DOM's innerHTML. It contains the text content
323             and any HTML tags contained within this element. You can change the content
324             or set it to an empty string so that it is not outputted.
325              
326             =item C
327              
328             C contains a hashref containing the attribute names and values
329              
330             =back
331              
332             If for instance, you wanted to replace C<< >> tags with C<< >> tags,
333             you could do this:
334              
335             sub b_callback {
336             my ($filter,$element) = @_;
337             $element->{tag} = 'span';
338             $element->{attr}{style} = 'font-weight:bold';
339             return 1;
340             }
341              
342             =item Attribute Callbacks
343              
344             sub attr_callback {
345             my ( $filter, $tag, $attr_name, $attr_val ) = @_;
346             return undef | '' | 'value';
347             }
348              
349             Attribute callbacks accept four parameters, the C<$filter> object, the C<$tag>
350             name, the C<$attr_name> and the C<$attr_value>.
351              
352             It should return either C to reject the attribute, or the value to be
353             used. An empty string keeps the attribute, but without a value.
354              
355             =item C vs C vs C
356              
357             It is not necessary to use C or C - everything can be done
358             via C, however it may be simpler to write:
359              
360             BanAllBut => [qw(p div span)]
361              
362             The logic works as follows:
363              
364             * If BanAllBut exists, then ban everything but the tags in the list
365             * Add to the ban list any elements in BanList
366             * Any tags mentioned explicitly in Rules (eg a => 0, br => 1)
367             are added or removed from the BanList
368             * A default rule of { '*' => 0 } would ban all tags except
369             those mentioned in Rules
370             * A default rule of { '*' => 1 } would allow all tags except
371             those disallowed in the ban list, or by explicit rules
372              
373             =back
374              
375             =cut
376              
377             sub new {
378 1487     1487 1 22585 my ( $pkg, $cfg ) = @_;
379              
380 1487   66     5171 my $self = bless {}, ref $pkg || $pkg;
381 1487         2114 $self->hss_init($cfg);
382 1487         2249 return $self;
383             }
384              
385             =back
386              
387             =head1 METHODS
388              
389             This class provides the following methods:
390              
391             =over
392              
393             =item hss_init ()
394              
395             This method is called by new() and does the actual initialisation work
396             for the new HTML::StripScripts object.
397              
398             =cut
399              
400             sub hss_init {
401 1487     1487 1 1498 my ( $self, $cfg ) = @_;
402 1487   100     2288 $cfg ||= {};
403              
404 1487         1814 $self->{_hssCfg} = $cfg;
405              
406 1487         2268 $self->{_hssContext} = $self->init_context_whitelist;
407 1487         2233 $self->{_hssAttrib} = $self->init_attrib_whitelist;
408 1487         2121 $self->{_hssAttVal} = $self->init_attval_whitelist;
409 1487         1701 $self->{_hssStyle} = $self->init_style_whitelist;
410 1487         1894 $self->{_hssDeInter} = $self->init_deinter_whitelist;
411 1487         2062 $self->{_hssBanList} = $self->_hss_prepare_ban_list($cfg);
412 1487         2063 $self->{_hssRules} = $self->_hss_prepare_rules($cfg);
413             }
414              
415             =item input_start_document ()
416              
417             This method initializes the filter, and must be called once before
418             starting on each HTML document to be filtered.
419              
420             =cut
421              
422             sub input_start_document {
423 1890     1890 1 50534 my ( $self, $context ) = @_;
424              
425             $self->{_hssStack} = [ { NAME => '',
426 1890   100     9029 CTX => $self->{_hssCfg}{Context} || 'Flow',
427             CONTENT => '',
428             }
429             ];
430 1890         2088 $self->{_hssOutput} = '';
431              
432 1890         2683 $self->output_start_document;
433             }
434              
435             =item input_start ( TEXT )
436              
437             Handles a start tag from the input document. TEXT must be the
438             full text of the tag, including angle-brackets.
439              
440             =cut
441              
442             sub input_start {
443 3160     3160 1 6200 my ( $self, $text ) = @_;
444              
445 3160 100       3641 $self->_hss_accept_input_start($text) or $self->reject_start($text);
446             }
447              
448             sub _hss_accept_input_start {
449 3160     3160   2269 my ( $self, $text ) = @_;
450              
451 3160 100       12790 return 0 unless $text =~ m|^<([a-zA-Z0-9]+)\b(.*)>$|m;
452 3153         6588 my ( $tag, $attr ) = ( lc $1, $self->strip_nonprintable($2) );
453              
454 3153 50       4789 return 0 if $self->{_hssSkipToEnd};
455 3153 50 33     9882 if ( $tag eq 'script' or $tag eq 'style' ) {
456 0         0 $self->{_hssSkipToEnd} = $tag;
457 0         0 return 0;
458             }
459              
460 3153 100       4043 return 0 if $self->_hss_tag_is_banned($tag);
461              
462 3119         3688 my $allowed_attr = $self->{_hssAttrib}{$tag};
463 3119 100       3888 return 0 unless defined $allowed_attr;
464              
465 3117 100       3307 return 0 unless $self->_hss_get_to_valid_context($tag);
466              
467 3114         3136 my $default_filters = $self->{_hssRules}{'*'};
468 3114   100     6715 my $tag_filters = $self->{_hssRules}{$tag}
469             || $default_filters;
470              
471 3114         2434 my %filtered_attr;
472 3114         14986 while ( $attr
473             =~ s#^\s*([\w\-]+)(?:\s*=\s*(?:([^"'>\s]+)|"([^"]*)"|'([^']*)'))?## )
474             {
475 4365         5719 my $key = lc $1;
476 4365 100       10463 my $val = ( defined $2 ? $self->unquoted_to_canonical_form($2)
    100          
    100          
477             : defined $3 ? $self->quoted_to_canonical_form($3)
478             : defined $4 ? $self->quoted_to_canonical_form($4)
479             : ''
480             );
481              
482 4365         4348 my $value_class = $allowed_attr->{$key};
483 4365 100       6005 next unless defined $value_class;
484              
485 4364         4601 my $attval_handler = $self->{_hssAttVal}{$value_class};
486 4364 50       4754 next unless defined $attval_handler;
487              
488 4364         2534 my $attr_filter;
489 4364 100       5535 if ($tag_filters) {
490 4115         5182 $attr_filter =
491             $self->_hss_get_attr_filter( $default_filters, $tag_filters,
492             $key );
493              
494             # filter == 0
495 4115 100       7428 next unless $attr_filter;
496             }
497              
498 3568         2543 my $filtered_value = &{$attval_handler}( $self, $tag, $key, $val );
  3568         4189  
499 3568 100       4822 next unless defined $filtered_value;
500              
501             # send value to filter if sub
502 3555 100 100     10722 if ( $tag_filters && ref $attr_filter ) {
503 2392         3223 $filtered_value
504             = $attr_filter->( $self, $tag, $key, $filtered_value );
505 2392 100       8622 next unless defined $filtered_value;
506             }
507              
508 3026         10797 $filtered_attr{$key} = $filtered_value;
509              
510             }
511              
512             # Check required attributes
513 3114 100       4537 if ( my $required = $tag_filters->{required} ) {
514 3         6 foreach my $key (@$required) {
515             return 0
516 5 100 66     25 unless defined $filtered_attr{$key} && length($filtered_attr{$key});
517             }
518             }
519              
520             # Check for callback
521             my $tag_callback = $tag_filters && $tag_filters->{tag}
522 3112   66     7365 || $default_filters->{tag};
523              
524 3112         4426 my $new_context = $self->{_hssContext}{ $self->{_hssStack}[0]{CTX} }{$tag};
525              
526 3112         8423 my %stack_entry = ( NAME => $tag,
527             ATTR => \%filtered_attr,
528             CTX => $new_context,
529             CALLBACK => $tag_callback,
530             CONTENT => '',
531             );
532 3112 100       4101 if ( $new_context eq 'EMPTY' ) {
533 1576         2435 $self->output_stack_entry( \%stack_entry );
534             }
535             else {
536 1536         1030 unshift @{ $self->{_hssStack} }, \%stack_entry;
  1536         2770  
537              
538             }
539              
540 3112         9299 return 1;
541             }
542              
543             =item input_end ( TEXT )
544              
545             Handles an end tag from the input document. TEXT must be the
546             full text of the end tag, including angle-brackets.
547              
548             =cut
549              
550             sub input_end {
551 1508     1508 1 3387 my ( $self, $text ) = @_;
552              
553 1508 100       1950 $self->_hss_accept_input_end($text) or $self->reject_end($text);
554             }
555              
556             sub _hss_accept_input_end {
557 1508     1508   1061 my ( $self, $text ) = @_;
558              
559 1508 100       3868 return 0 unless $text =~ m#^$#;
560 1505         1773 my $tag = lc $1;
561              
562 1505 50       2172 if ( $self->{_hssSkipToEnd} ) {
563 0 0       0 if ( $self->{_hssSkipToEnd} eq $tag ) {
564 0         0 delete $self->{_hssSkipToEnd};
565             }
566 0         0 return 0;
567             }
568              
569             # Ignore a close without an open
570 1505 100       1003 return 0 unless grep { $_->{NAME} eq $tag } @{ $self->{_hssStack} };
  3000         5600  
  1505         2187  
571              
572             # Close open tags up to the matching open
573 1478         1660 my @close = ();
574              
575 1478         1021 while ( scalar @{ $self->{_hssStack} } ) {
  1480         2425  
576 1480         966 my $entry = shift @{ $self->{_hssStack} };
  1480         1662  
577 1480         1551 push @close, $entry;
578 1480         1873 $self->output_stack_entry($entry);
579 1480         1633 $entry->{CONTENT} = '';
580 1480 100       2807 last if $entry->{NAME} eq $tag;
581             }
582              
583             # Reopen any we closed early if all that were closed are
584             # configured to be auto de-interleaved.
585 1478 100       1394 unless ( grep { !$self->{_hssDeInter}{ $_->{NAME} } } @close ) {
  1480         3764  
586 89         73 pop @close;
587 89         162 unshift @{ $self->{_hssStack} }, @close;
  89         102  
588             }
589              
590 1478         4726 return 1;
591             }
592              
593             =item input_text ( TEXT )
594              
595             Handles some non-tag text from the input document.
596              
597             =cut
598              
599             sub input_text {
600 382     382 1 806 my ( $self, $text ) = @_;
601              
602 382 50       623 return if $self->{_hssSkipToEnd};
603              
604 382         492 $text = $self->strip_nonprintable($text);
605              
606 382 100       1256 if ( $text =~ /^(\s*)$/ ) {
607 2         4 $self->output_text($1);
608 2         2 return;
609             }
610              
611 380 100       538 unless ( $self->_hss_get_to_valid_context('CDATA') ) {
612 1         3 $self->reject_text($text);
613 1         1 return;
614             }
615              
616 379         607 my $filtered = $self->filter_text( $self->text_to_canonical_form($text) );
617 379         563 $self->output_text( $self->canonical_form_to_text($filtered) );
618             }
619              
620             =item input_process ( TEXT )
621              
622             Handles a processing instruction from the input document.
623              
624             =cut
625              
626             sub input_process {
627 1     1 1 5 my ( $self, $text ) = @_;
628              
629 1         2 $self->reject_process($text);
630             }
631              
632             =item input_comment ( TEXT )
633              
634             Handles an HTML comment from the input document.
635              
636             =cut
637              
638             sub input_comment {
639 1     1 1 5 my ( $self, $text ) = @_;
640              
641 1         3 $self->reject_comment($text);
642             }
643              
644             =item input_declaration ( TEXT )
645              
646             Handles an declaration from the input document.
647              
648             =cut
649              
650             sub input_declaration {
651 1     1 1 7 my ( $self, $text ) = @_;
652              
653 1         4 $self->reject_declaration($text);
654             }
655              
656             =item input_end_document ()
657              
658             Call this method to signal the end of the input document.
659              
660             =cut
661              
662             sub input_end_document {
663 1890     1890 1 3562 my ($self) = @_;
664              
665 1890         1693 delete $self->{_hssSkipToEnd};
666              
667 1890         1395 while ( @{ $self->{_hssStack} } > 1 ) {
  1946         3516  
668 56         72 $self->output_stack_entry( shift @{ $self->{_hssStack} } );
  56         119  
669             }
670              
671 1890         2286 $self->output_end_document;
672 1890         1265 my $last_entry = shift @{ $self->{_hssStack} };
  1890         2007  
673 1890         2006 $self->{_hssOutput} = $last_entry->{CONTENT};
674 1890         3683 delete $self->{_hssStack};
675              
676             }
677              
678             =item filtered_document ()
679              
680             Returns the filtered document as a string.
681              
682             =cut
683              
684             sub filtered_document {
685 1890     1890 1 3059 my ($self) = @_;
686 1890         5215 $self->{_hssOutput};
687             }
688              
689             =back
690              
691             =cut
692              
693             =head1 SUBCLASSING
694              
695             The only reason for subclassing this module now is to add to the
696             list of accepted tags, attributes and styles (See
697             L). Everything else can be
698             achieved with L.
699              
700             The C class is subclassable. Filter objects are plain
701             hashes and C reserves only hash keys that start with
702             C<_hss>. The filter configuration can be set up by invoking the
703             hss_init() method, which takes the same arguments as new().
704              
705             =head1 OUTPUT METHODS
706              
707             The filter outputs a stream of start tags, end tags, text, comments,
708             declarations and processing instructions, via the following C
709             methods. Subclasses may override these to intercept the filter output.
710              
711             The default implementations of the C methods pass the
712             text on to the output() method. The default implementation of the
713             output() method appends the text to a string, which can be fetched with
714             the filtered_document() method once processing is complete.
715              
716             If the output() method or the individual C methods are
717             overridden in a subclass, then filtered_document() will not work in
718             that subclass.
719              
720             =over
721              
722             =item output_start_document ()
723              
724             This method gets called once at the start of each HTML document passed
725             through the filter. The default implementation does nothing.
726              
727             =cut
728              
729       3778 1   sub output_start_document { }
730              
731             =item output_end_document ()
732              
733             This method gets called once at the end of each HTML document passed
734             through the filter. The default implementation does nothing.
735              
736             =cut
737              
738             *output_end_document = \&output_start_document;
739              
740             =item output_start ( TEXT )
741              
742             This method is used to output a filtered start tag.
743              
744             =cut
745              
746 5108     5108 1 6227 sub output_start { $_[0]->output( $_[1] ) }
747              
748             =item output_end ( TEXT )
749              
750             This method is used to output a filtered end tag.
751              
752             =cut
753              
754             *output_end = \&output_start;
755              
756             =item output_text ( TEXT )
757              
758             This method is used to output some filtered non-tag text.
759              
760             =cut
761              
762             *output_text = \&output_start;
763              
764             =item output_declaration ( TEXT )
765              
766             This method is used to output a filtered declaration.
767              
768             =cut
769              
770             *output_declaration = \&output_start;
771              
772             =item output_comment ( TEXT )
773              
774             This method is used to output a filtered HTML comment.
775              
776             =cut
777              
778             *output_comment = \&output_start;
779              
780             =item output_process ( TEXT )
781              
782             This method is used to output a filtered processing instruction.
783              
784             =cut
785              
786             *output_process = \&output_start;
787              
788             =item output ( TEXT )
789              
790             This method is invoked by all of the default C methods. The
791             default implementation appends the text to the string that the
792             filtered_document() method will return.
793              
794             =cut
795              
796 5111     5111 1 8137 sub output { $_[0]->{_hssStack}[0]{CONTENT} .= $_[1]; }
797              
798             =item output_stack_entry ( TEXT )
799              
800             This method is invoked when a tag plus all text and nested HTML content
801             within the tag has been processed. It adds the tag plus its content
802             to the content for its parent tag.
803              
804             =cut
805              
806             sub output_stack_entry {
807 3113     3113 1 2414 my ( $self, $tag ) = @_;
808              
809 3113         1964 my %entry;
810 3113         2788 @entry{qw(tag attr content)} = @{$tag}{qw(NAME ATTR CONTENT)};
  3113         7771  
811              
812 3113 100       5131 if ( my $tag_callback = $tag->{CALLBACK} ) {
813 41 100       74 $tag_callback->( $self, \%entry )
814             or return;
815             }
816              
817 3112         2514 my $tagname = $entry{tag};
818 3112         3879 my $filtered_attrs = $self->_hss_join_attribs( $entry{attr} );
819              
820 3112 100       5081 if ( $tag->{CTX} eq 'EMPTY' ) {
821             $self->output_start("<$tagname$filtered_attrs />")
822 1576 50       4035 if $entry{tag};
823 1576         2937 return;
824             }
825 1536 100       1953 if ($tagname) {
826 1535         2590 $self->output_start("<$tagname$filtered_attrs>");
827             }
828              
829 1536 50       2476 if ( defined $entry{content} ) {
830 1536         1488 $self->{_hssStack}[0]{CONTENT} .= $entry{content};
831             }
832              
833 1536 100       1992 if ($tagname) {
834 1535         2244 $self->output_end("");
835             }
836             }
837              
838             =back
839              
840             =head1 REJECT METHODS
841              
842             When the filter encounters something in the input document which it
843             cannot transform into an acceptable construct, it invokes one of
844             the following C methods to put something in the output
845             document to take the place of the unacceptable construct.
846              
847             The TEXT parameter is the full text of the unacceptable construct.
848              
849             The default implementations of these methods output an HTML comment
850             containing the text C. If L
851             is set to true, then the rejected text is HTML escaped instead.
852              
853             Subclasses may override these methods, but should exercise caution.
854             The TEXT parameter is unfiltered input and may contain malicious
855             constructs.
856              
857             =over
858              
859             =item reject_start ( TEXT )
860              
861             =item reject_end ( TEXT )
862              
863             =item reject_text ( TEXT )
864              
865             =item reject_declaration ( TEXT )
866              
867             =item reject_comment ( TEXT )
868              
869             =item reject_process ( TEXT )
870              
871             =back
872              
873             =cut
874              
875             sub reject_start {
876             $_[0]->{_hssCfg}{EscapeFiltered}
877 76 100   76 1 161 ? $_[0]->output_text( $_[0]->escape_html_metachars( $_[1] ) )
878             : $_[0]->output_comment('');
879             }
880             *reject_end = \&reject_start;
881             *reject_text = \&reject_start;
882             *reject_declaration = \&reject_start;
883             *reject_comment = \&reject_start;
884             *reject_process = \&reject_start;
885              
886             =head1 WHITELIST INITIALIZATION METHODS
887              
888             The filter refers to various whitelists to determine which constructs
889             are acceptable. To modify these whitelists, subclasses can override
890             the following methods.
891              
892             Each method is called once at object initialization time, and must
893             return a reference to a nested data structure. These references are
894             installed into the object, and used whenever the filter needs to refer
895             to a whitelist.
896              
897             The default implementations of these methods can be invoked as class
898             methods.
899              
900             See examples/tags/ and examples/declaration/ for examples of how to
901             override these methods.
902              
903             =over
904              
905             =item init_context_whitelist ()
906              
907             Returns a reference to the C whitelist, which determines
908             which tags may appear at each point in the document, and which other
909             tags may be nested within them.
910              
911             It is a hash, and the keys are context names, such as C and
912             C.
913              
914             The values in the hash are hashrefs. The keys in these subhashes are
915             lowercase tag names, and the values are context names, specifying the
916             context that the tag provides to any other tags nested within it.
917              
918             The special context C as a value in a subhash indicates that
919             nothing can be nested within that tag.
920              
921             =cut
922              
923 10     10   49 use vars qw(%_Context);
  10         12  
  10         1937  
924              
925             BEGIN {
926              
927 10     10   173 my %pre_content = ( 'br' => 'EMPTY',
928             'span' => 'Inline',
929             'tt' => 'Inline',
930             'i' => 'Inline',
931             'b' => 'Inline',
932             'u' => 'Inline',
933             's' => 'Inline',
934             'strike' => 'Inline',
935             'em' => 'Inline',
936             'strong' => 'Inline',
937             'dfn' => 'Inline',
938             'code' => 'Inline',
939             'q' => 'Inline',
940             'samp' => 'Inline',
941             'kbd' => 'Inline',
942             'var' => 'Inline',
943             'cite' => 'Inline',
944             'abbr' => 'Inline',
945             'acronym' => 'Inline',
946             'ins' => 'Inline',
947             'del' => 'Inline',
948             'a' => 'Inline',
949             'CDATA' => 'CDATA',
950             );
951              
952 10         147 my %inline = ( %pre_content,
953             'img' => 'EMPTY',
954             'big' => 'Inline',
955             'small' => 'Inline',
956             'sub' => 'Inline',
957             'sup' => 'Inline',
958             'font' => 'Inline',
959             'nobr' => 'Inline',
960             );
961              
962 10         218 my %flow = ( %inline,
963             'ins' => 'Flow',
964             'del' => 'Flow',
965             'div' => 'Flow',
966             'p' => 'Inline',
967             'h1' => 'Inline',
968             'h2' => 'Inline',
969             'h3' => 'Inline',
970             'h4' => 'Inline',
971             'h5' => 'Inline',
972             'h6' => 'Inline',
973             'ul' => 'list',
974             'ol' => 'list',
975             'menu' => 'list',
976             'dir' => 'list',
977             'dl' => 'dt_dd',
978             'address' => 'Inline',
979             'hr' => 'EMPTY',
980             'pre' => 'pre.content',
981             'blockquote' => 'Flow',
982             'center' => 'Flow',
983             'table' => 'table',
984             );
985              
986 10         49 my %table = ( 'caption' => 'Inline',
987             'thead' => 'tr_only',
988             'tfoot' => 'tr_only',
989             'tbody' => 'tr_only',
990             'colgroup' => 'colgroup',
991             'col' => 'EMPTY',
992             'tr' => 'th_td',
993             );
994              
995 10         12 my %head = ( 'title' => 'NoTags', );
996              
997 10         465 %_Context = ( 'Document' => { 'html' => 'Html' },
998             'Html' => { 'head' => 'Head', 'body' => 'Flow' },
999             'Head' => \%head,
1000             'Inline' => \%inline,
1001             'Flow' => \%flow,
1002             'NoTags' => { 'CDATA' => 'CDATA' },
1003             'pre.content' => \%pre_content,
1004             'table' => \%table,
1005             'list' => { 'li' => 'Flow' },
1006             'dt_dd' => { 'dt' => 'Inline', 'dd' => 'Flow' },
1007             'tr_only' => { 'tr' => 'th_td' },
1008             'colgroup' => { 'col' => 'EMPTY' },
1009             'th_td' => { 'th' => 'Flow', 'td' => 'Flow' },
1010             );
1011             }
1012              
1013 1487     1487 1 1926 sub init_context_whitelist { return \%_Context; }
1014              
1015             =item init_attrib_whitelist ()
1016              
1017             Returns a reference to the C whitelist, which determines which
1018             attributes each tag can have and the values that those attributes can
1019             take.
1020              
1021             It is a hash, and the keys are lowercase tag names.
1022              
1023             The values in the hash are hashrefs. The keys in these subhashes are
1024             lowercase attribute names, and the values are attribute value class names,
1025             which are short strings describing the type of values that the
1026             attribute can take, such as C or C.
1027              
1028             =cut
1029              
1030 10     10   42 use vars qw(%_Attrib);
  10         18  
  10         3159  
1031              
1032             BEGIN {
1033              
1034 10     10   32 my %attr = ( 'style' => 'style' );
1035              
1036 10         40 my %font_attr = ( %attr,
1037             'size' => 'size',
1038             'face' => 'wordlist',
1039             'color' => 'color',
1040             );
1041              
1042 10         27 my %insdel_attr = ( %attr,
1043             'cite' => 'href',
1044             'datetime' => 'text',
1045             );
1046              
1047 10         20 my %texta_attr = ( %attr, 'align' => 'word', );
1048              
1049 10         20 my %cellha_attr = ( 'align' => 'word',
1050             'char' => 'word',
1051             'charoff' => 'size',
1052             );
1053              
1054 10         16 my %cellva_attr = ( 'valign' => 'word', );
1055              
1056 10         91 my %cellhv_attr = ( %attr, %cellha_attr, %cellva_attr );
1057              
1058 10         33 my %col_attr = ( %attr, %cellhv_attr,
1059             'width' => 'size',
1060             'span' => 'number',
1061             );
1062              
1063 10         93 my %thtd_attr = ( %attr,
1064             'abbr' => 'text',
1065             'axis' => 'text',
1066             'headers' => 'text',
1067             'scope' => 'word',
1068             'rowspan' => 'number',
1069             'colspan' => 'number',
1070             %cellhv_attr,
1071             'nowrap' => 'novalue',
1072             'bgcolor' => 'color',
1073             'width' => 'size',
1074             'height' => 'size',
1075             'bordercolor' => 'color',
1076             'bordercolorlight' => 'color',
1077             'bordercolordark' => 'color',
1078             );
1079              
1080 10         2096 %_Attrib = ( 'br' => { 'clear' => 'word' },
1081             'em' => \%attr,
1082             'strong' => \%attr,
1083             'dfn' => \%attr,
1084             'code' => \%attr,
1085             'samp' => \%attr,
1086             'kbd' => \%attr,
1087             'var' => \%attr,
1088             'cite' => \%attr,
1089             'abbr' => \%attr,
1090             'acronym' => \%attr,
1091             'q' => { %attr, 'cite' => 'href' },
1092             'blockquote' => { %attr, 'cite' => 'href' },
1093             'sub' => \%attr,
1094             'sup' => \%attr,
1095             'tt' => \%attr,
1096             'i' => \%attr,
1097             'b' => \%attr,
1098             'big' => \%attr,
1099             'small' => \%attr,
1100             'u' => \%attr,
1101             's' => \%attr,
1102             'strike' => \%attr,
1103             'font' => \%font_attr,
1104             'table' => {
1105             %attr,
1106             'frame' => 'word',
1107             'rules' => 'word',
1108             %texta_attr,
1109             'bgcolor' => 'color',
1110             'background' => 'src',
1111             'width' => 'size',
1112             'height' => 'size',
1113             'cellspacing' => 'size',
1114             'cellpadding' => 'size',
1115             'border' => 'size',
1116             'bordercolor' => 'color',
1117             'bordercolorlight' => 'color',
1118             'bordercolordark' => 'color',
1119             'summary' => 'text',
1120             },
1121             'caption' => { %attr, 'align' => 'word', },
1122             'colgroup' => \%col_attr,
1123             'col' => \%col_attr,
1124             'thead' => \%cellhv_attr,
1125             'tfoot' => \%cellhv_attr,
1126             'tbody' => \%cellhv_attr,
1127             'tr' => {
1128             %attr,
1129             bgcolor => 'color',
1130             %cellhv_attr,
1131             },
1132             'th' => \%thtd_attr,
1133             'td' => \%thtd_attr,
1134             'ins' => \%insdel_attr,
1135             'del' => \%insdel_attr,
1136             'a' => { %attr, href => 'href', title => 'text' },
1137             'h1' => \%texta_attr,
1138             'h2' => \%texta_attr,
1139             'h3' => \%texta_attr,
1140             'h4' => \%texta_attr,
1141             'h5' => \%texta_attr,
1142             'h6' => \%texta_attr,
1143             'p' => \%texta_attr,
1144             'div' => \%texta_attr,
1145             'span' => \%texta_attr,
1146             'ul' => {
1147             %attr,
1148             'type' => 'word',
1149             'compact' => 'novalue',
1150             },
1151             'ol' => { %attr,
1152             'type' => 'text',
1153             'compact' => 'novalue',
1154             'start' => 'number',
1155             },
1156             'li' => { %attr,
1157             'type' => 'text',
1158             'value' => 'number',
1159             },
1160             'dl' => { %attr, 'compact' => 'novalue' },
1161             'dt' => \%attr,
1162             'dd' => \%attr,
1163             'address' => \%attr,
1164             'hr' => {
1165             %texta_attr,
1166             'width' => 'size',
1167             'size' => 'size',
1168             'noshade' => 'novalue',
1169             },
1170             'pre' => { %attr, 'width' => 'size' },
1171             'center' => \%attr,
1172             'nobr' => {},
1173             'img' => {
1174             'src' => 'src',
1175             'alt' => 'text',
1176             'width' => 'size',
1177             'height' => 'size',
1178             'border' => 'size',
1179             'hspace' => 'size',
1180             'vspace' => 'size',
1181             'align' => 'word',
1182             },
1183             'body' => { 'bgcolor' => 'color',
1184             'background' => 'src',
1185             'link' => 'color',
1186             'vlink' => 'color',
1187             'alink' => 'color',
1188             'text' => 'color',
1189             },
1190             'head' => {},
1191             'title' => {},
1192             'html' => {},
1193             );
1194             }
1195              
1196 1487     1487 1 1392 sub init_attrib_whitelist { return \%_Attrib; }
1197              
1198             =item init_attval_whitelist ()
1199              
1200             Returns a reference to the C whitelist, which is a hash that maps
1201             attribute value class names from the C whitelist to coderefs to
1202             subs to validate (and optionally transform) a particular attribute value.
1203              
1204             The filter calls the attribute value validation subs with the
1205             following parameters:
1206              
1207             =over
1208              
1209             =item C
1210              
1211             A reference to the filter object.
1212              
1213             =item C
1214              
1215             The lowercase name of the tag in which the attribute appears.
1216              
1217             =item C
1218              
1219             The name of the attribute.
1220              
1221             =item C
1222              
1223             The attribute value found in the input document, in canonical form
1224             (see L).
1225              
1226             =back
1227              
1228             The validation sub can return undef to indicate that the attribute
1229             should be removed from the tag, or it can return the new value for
1230             the attribute, in canonical form.
1231              
1232             =cut
1233              
1234 10     10   38 use vars qw(%_AttVal);
  10         10  
  10         920  
1235              
1236             BEGIN {
1237 10     10   392 %_AttVal = ( 'style' => \&_hss_attval_style,
1238             'size' => \&_hss_attval_size,
1239             'number' => \&_hss_attval_number,
1240             'color' => \&_hss_attval_color,
1241             'text' => \&_hss_attval_text,
1242             'word' => \&_hss_attval_word,
1243             'wordlist' => \&_hss_attval_wordlist,
1244             'wordlistq' => \&_hss_attval_wordlistq,
1245             'href' => \&_hss_attval_href,
1246             'src' => \&_hss_attval_src,
1247             'stylesrc' => \&_hss_attval_stylesrc,
1248             'novalue' => \&_hss_attval_novalue,
1249             );
1250             }
1251              
1252 1487     1487 1 1563 sub init_attval_whitelist { return \%_AttVal; }
1253              
1254             =item init_style_whitelist ()
1255              
1256             Returns a reference to the C