File Coverage

blib/lib/MojoMojo/Declaw.pm
Criterion Covered Total %
statement 335 423 79.2
branch 167 332 50.3
condition 41 102 40.2
subroutine 15 18 83.3
pod 11 11 100.0
total 569 886 64.2


line stmt bran cond sub pod time code
1             package MojoMojo::Declaw;
2              
3             =head1 NAME
4              
5             MojoMojo::Declaw - Cleans HTML as well as CSS of scripting and other executable contents, and neutralises XSS attacks.
6             Derived from HTML::Defang version 1.01.
7              
8             =head1 SYNOPSIS
9              
10             my $InputHtml = "<html><body></body></html>";
11              
12             my $Defang = MojoMojo::Declaw->new(
13             context => $Self,
14             fix_mismatched_tags => 1,
15             tags_to_callback => [ br embed img ],
16             tags_callback => \&DefangTagsCallback,
17             url_callback => \&DefangUrlCallback,
18             css_callback => \&DefangCssCallback,
19             attribs_to_callback => [ qw(border src) ],
20             attribs_callback => \&DefangAttribsCallback
21             );
22              
23             my $SanitizedHtml = $Defang->defang($InputHtml);
24              
25             # Callback for custom handling specific HTML tags
26             sub DefangTagsCallback {
27             my ($Self, $Defang, $OpenAngle, $lcTag, $IsEndTag, $AttributeHash, $CloseAngle, $HtmlR, $OutR) = @_;
28             return 1 if $lcTag eq 'br'; # Explicitly defang this tag, eventhough safe
29             return 0 if $lcTag eq 'embed'; # Explicitly whitelist this tag, eventhough unsafe
30             return 2 if $lcTag eq 'img'; # I am not sure what to do with this tag, so process as HTML::Defang normally would
31             }
32              
33             # Callback for custom handling URLs in HTML attributes as well as style tag/attribute declarations
34             sub DefangUrlCallback {
35             my ($Self, $Defang, $lcTag, $lcAttrKey, $AttrValR, $AttributeHash, $HtmlR) = @_;
36             return 0 if $$AttrValR =~ /safesite.com/i; # Explicitly allow this URL in tag attributes or stylesheets
37             return 1 if $$AttrValR =~ /evilsite.com/i; # Explicitly defang this URL in tag attributes or stylesheets
38             }
39              
40             # Callback for custom handling style tags/attributes
41             sub DefangCssCallback {
42             my ($Self, $Defang, $Selectors, $SelectorRules, $Tag, $IsAttr) = @_;
43             my $i = 0;
44             foreach (@$Selectors) {
45             my $SelectorRule = $$SelectorRules[$i];
46             foreach my $KeyValueRules (@$SelectorRule) {
47             foreach my $KeyValueRule (@$KeyValueRules) {
48             my ($Key, $Value) = @$KeyValueRule;
49             $$KeyValueRule[2] = 1 if $Value =~ '!important'; # Comment out any '!important' directive
50             $$KeyValueRule[2] = 1 if $Key =~ 'position' && $Value =~ 'fixed'; # Comment out any 'position=fixed;' declaration
51             }
52             }
53             $i++;
54             }
55             }
56              
57             # Callback for custom handling HTML tag attributes
58             sub DefangAttribsCallback {
59             my ($Self, $Defang, $lcTag, $lcAttrKey, $AttrValR, $HtmlR) = @_;
60             $$AttrValR = '0' if $lcAttrKey eq 'border'; # Change all 'border' attribute values to zero.
61             return 1 if $lcAttrKey eq 'src'; # Defang all 'src' attributes
62             return 0;
63             }
64              
65             =head1 DESCRIPTION
66              
67             This module accepts an input HTML and/or CSS string and removes any executable code including scripting, embedded objects, applets, etc., and neutralises any XSS attacks. A whitelist based approach is used which means only HTML known to be safe is allowed through.
68              
69             HTML::Defang uses a custom html tag parser. The parser has been designed and tested to work with nasty real world html and to try and emulate as close as possible what browsers actually do with strange looking constructs. The test suite has been built based on examples from a range of sources such as http://ha.ckers.org/xss.html and http://imfo.ru/csstest/css_hacks/import.php to ensure that as many as possible XSS attack scenarios have been dealt with.
70              
71             HTML::Defang can make callbacks to client code when it encounters the following:
72              
73             =over 4
74              
75             =item *
76              
77             When a specified tag is parsed
78              
79             =item *
80              
81             When a specified attribute is parsed
82              
83             =item *
84              
85             When a URL is parsed as part of an HTML attribute, or CSS property value.
86              
87             =item *
88              
89             When style data is parsed, as part of an HTML style attribute, or as part of an HTML <style> tag.
90              
91             =back
92              
93             The callbacks include details about the current tag/attribute that is being parsed, and also gives a scalar reference to the input HTML. Querying pos() on the input HTML should indicate where the module is with parsing. This gives the client code flexibility in working with HTML::Declaw.
94              
95             HTML::Declaw can defang whole tags, any attribute in a tag, any URL that appear as an attribute or style property, or any CSS declaration in a declaration block in a style rule. This helps one to precisely block the most specific unwanted elements in the contents(for example, block just an offending attribute instead of the whole tag), while retaining any safe HTML/CSS.
96              
97             =cut
98              
99 27     27   14487 use Exporter;
  27         65  
  27         1966  
100             our @ISA = ('Exporter');
101             %EXPORT_TAGS = ( all => [qw(@FormTags)] );
102             Exporter::export_ok_tags('all');
103              
104 27     27   163 use strict;
  27         60  
  27         475  
105 27     27   122 use warnings;
  27         63  
  27         937  
106              
107             our $VERSION = 1.01;
108              
109 27     27   991 use Encode;
  27         14654  
  27         2162  
110              
111             my $HasScalarReadonly = 0;
112              
113             BEGIN {
114 27 50   27   1508 eval "use Scalar::Readonly qw(readonly_on);" && ( $HasScalarReadonly = 1 );
  27     27   63002  
  0         0  
  0         0  
115             }
116              
117             our @FormTags =
118             qw(form input textarea select option button fieldset label legend multicol nextid optgroup);
119              
120             # Some regexps for matching HTML tags + key=value attributes
121             my $AttrKeyStartLineRE = qr/[^=<>\s\/\\]{1,}/;
122             my $AttrKeyRE = qr/(?<=[\s'"\/])$AttrKeyStartLineRE/;
123             my $AttrValRE =
124             qr/[^>\s'"`][^>\s]*|'[^']{0,2000}?'|"[^"]{0,2000}?"|`[^`]{0,2000}?`/;
125             my $AttributesRE = qr/(?:(?:$AttrKeyRE\s*)?(?:=\s*$AttrValRE\s*)?)*/;
126             my $TagNameRE = qr/[A-Za-z][A-Za-z0-9\#\&\;\:\!_-]*/;
127              
128             my $Selectors = qr/[^{]*?/;
129             my $StyleKey = qr/[^:}]+?/;
130             my $StyleValue = qr/[^;}]+|.*$/;
131              
132             my $Fonts = qr/"?([A-Za-z0-9\s-]+)"?/;
133             my $Alignments =
134             qr/(absbottom|absmiddle|all|autocentre|baseline|bottom|center|justify|left|middle|none|right|texttop|top)/;
135              
136             my $Executables =
137             '([^@]\.com|'
138             . '.*\.(exe|cmd|bat|pif|scr|sys|sct|lnk|dll'
139             . '|vbs?|vbe|hta|shb|shs|hlp|chm|eml|wsf|wsh|js'
140             . '|asx|wm.|mdb|mht|msi|msp|cpl|lib|reg))';
141             my $SrcBanStd =
142             qr/^([A-Za-z]*script|.*\&\{|mocha|about|opera|mailto:|hcp:|\/(dev|proc)|\\|file|smb|cid:${Executables}(@|\?|$))/i;
143              
144             my %Rules = (
145              
146             # Disallow unknown tags by default
147             "_unknown" => qr/.*/,
148             "align" => qr/^${Alignments}$/i,
149             "alnum" => qr/^[A-Za-z0-9_.-]+$/,
150             "boolean" => qr/^(0|1|true|yes|no|false)$/,
151             "charset" => qr/^[A-Za-z0-9_][A-Za-z0-9_.-]*$/,
152             "class" => qr/^[A-Za-z0-9_.:\s-]*$/,
153             "color" => qr/^#?[0-9A-Z]+$/i,
154             "coords" => qr/^(\d+,)+\d+$/i,
155             "datetime" => qr/^\d\d\d\d-\d\d-\d\d.{0,5}\d\d:\d\d:\d\d.{0,5}$/,
156             "dir" => qr/^(ltr|rtl)$/i,
157             "eudora" => qr/^(autourl)$/i,
158             "font-face" => qr/^((${Fonts})[,\s]*)+$/i,
159             "form-enctype" =>
160             qr/^(application\/x-www-form-urlencoded|multipart\/form-data)$/i,
161             "form-method" => qr/^(get|post)$/i,
162             "frame" => qr/^(void|above|below|hsides|vsides|lhs|rhs|box|border)$/i,
163              
164             # href: Not javascript, vbs or vbscript
165             "href" => qr/^([A-Za-z]*script|.*\&\{|mocha|hcp|opera|about|smb|\/dev\/)/i,
166             "usemap-href" => qr/^#[A-Za-z0-9_.-]+$/, # this is not really a href at all!
167             "input-size" =>
168             qr/^(\d{1,4})$/, # some browsers freak out with very large widgets
169             "input-type" =>
170             qr/^(button|checkbox|file|hidden|image|password|radio|readonly|reset|submit|text)$/i,
171             "integer" => qr/^(-|\+)?\d+$/,
172             "number" => qr/^(-|\+)?[\d.,]+$/,
173              
174             # language: Not javascript, vbs or vbscript
175             "language" => qr/^(XML)$/i,
176             "media" => qr/^((screen|print|projection|braille|speech|all)[,\s]*)+$/i,
177             "meta:name" =>
178             qr/^(author|progid|originator|generator|keywords|description|content-type|pragma|expires)$/i,
179              
180             # mime-type: Not javascript
181             "mime-type" => qr/^(cite|text\/(plain|css|html|xml))$/i,
182             "list-type" =>
183             qr/^(none,a,i,upper-alpha,lower-alpha,upper-roman,lower-roman,decimal,disc,square,circle,round)$/i,
184              
185             # "rel" => qr/^((copyright|author|stylesheet)\s*)+$/i,
186             "rel" => qr/^((copyright|author)\s*)+$/i
187             , # XXX external stylesheets can contain scripting, so kill them
188             "rules" => qr/^(none|groups|rows|cols|all)$/i,
189             "scope" => qr/^(row|col|rowgroup|colgroup)$/i,
190             "shape" => qr/^(rect|rectangle|circ|circle|poly|polygon)$/i,
191              
192             # The following two are for URLs we expect to be auto-loaded by the browser,
193             # because they are within a frame, image or something like that.
194             # "src" => qr/^([a-z]+):|^[\w\.\/\%]+$/i,
195             "src" => qr/^https?:\/\/|^[\w.\/%]+$/i,
196              
197             # "style" => qr/^([A-Za-z0-9_-]+\\s*:\\s*(yes|no)|text-align\\s*:\\s*$alignments|((background|(background-|font-)?color)\\s*:\\s*(\\#?[A-Z0-9]+)?|((margin|padding|border)-(right|left)|tab-interval|height|width)\\s*:\\s*[\\d\\.]+(pt|px)|font(-family|-size|-weight|)\\s*:(\\s*[\\d\\.]+(pt|px)|\\s*$fonts)+)[;\\s]*)+\$/i,
198             # "style" => qr/expression|eval|script:|mocha:|\&{|\@import|(?<!background-)position:|background-image/i, # XXX there are probably a million more ways to cause trouble with css!
199             "style" => qr/^.*$/s,
200              
201             #kc In addition to this, we could strip all 'javascript:|expression|' etc. from all attributes(in attribute_cleanup())
202             "stylesheet" => qr/expression|eval|script:|mocha:|\&\{|\@import/i
203             , # stylesheets are forbidden if Embedded => 1. css positioning can be allowed in an iframe.
204             # NB see also `process_stylesheet' below
205             "style-type" => qr/script|mocha/i,
206             "size" => qr/^[\d.]+(px|%)?$/i,
207             "target" => qr/^[A-Za-z0-9_][A-Za-z0-9_.-]*$/,
208             "base-href" => qr/^https?:\/\/[\w.\/]+$/,
209             "anything" => qr/^.*$/, #[ 0, 0 ],
210             "meta:content" => [ 0, 0 ],
211             );
212              
213             my %CommonAttributes = (
214              
215             # Core attributes
216             "class" => "class",
217             "id" => "alnum",
218             "name" => "alnum",
219             "style" => "style",
220             "accesskey" => "alnum",
221             "tabindex" => "integer",
222             "title" => "anything",
223              
224             # Language attributes
225             "dir" => "dir",
226             "lang" => "alnum",
227             "language" => "language",
228             "longdesc" => "anything",
229              
230             # Height, width, alignment, etc.
231             #-mxy allow more flexible values for align
232             # "align" => "align",
233             "align" => "alnum",
234             "bgcolor" => "color",
235             "bottommargin" => "size",
236             "clear" => "align",
237             "color" => "color",
238             "height" => "size",
239             "leftmargin" => "size",
240             "marginheight" => "size",
241             "marginwidth" => "size",
242             "nowrap" => "anything",
243             "rightmargin" => "size",
244             "scroll" => "boolean",
245             "scrolling" => "boolean",
246             "topmargin" => "size",
247             "valign" => "align",
248             "width" => "size",
249              
250             # youtube embedded objects
251             "value" => "anything",
252             "type" => "anything",
253             "allowscriptaccess" => 'alnum',
254             "allowfullscreen" => "boolean",
255             "src" => "src",
256             );
257              
258             my %ListAttributes = (
259             "compact" => "anything",
260             "start" => "integer",
261             "type" => "list-type",
262             );
263              
264             my %TableAttributes = (
265             "axis" => "alnum",
266             "background" => "src",
267             "border" => "number",
268             "bordercolor" => "color",
269             "bordercolordark" => "color",
270             "bordercolorlight" => "color",
271             "padding" => "integer",
272             "spacing" => "integer",
273             "cellpadding" => "integer",
274             "cellspacing" => "integer",
275             "cols" => "anything",
276             "colspan" => "integer",
277             "char" => "alnum",
278             "charoff" => "integer",
279             "datapagesize" => "integer",
280             "frame" => "frame",
281             "frameborder" => "boolean",
282             "framespacing" => "integer",
283             "headers" => "anything",
284             "rows" => "anything",
285             "rowspan" => "size",
286             "rules" => "rules",
287             "scope" => "scope",
288             "span" => "integer",
289             "summary" => "anything"
290             );
291              
292             my %UrlRules = (
293             "src" => 1,
294             "href" => 1,
295             "base-href" => 1,
296              
297             # cite => 1,
298             # action => 1,
299             );
300              
301             my %Tags = (
302             script => \&defang_script,
303             style => \&defang_style,
304             "html" => 100,
305              
306             #
307             # Safe elements commonly found in the <head> block follow.
308             #
309             "head" => 2,
310             "base" => {
311             "href" => "base-href",
312             "target" => "target",
313             },
314              
315             # TODO: Deal with link below later
316             #"link" => \$r_link,
317             # {
318             # "rel" => "rel",
319             # "rev" => "rel",
320             # "src" => "src",
321             # "href" => "src", # Might be auto-loaded by the browser!!
322             # "charset" => "charset",
323             # "media" => "media",
324             # "target" => "target",
325             # "type" => "mime-type",
326             # },
327             "meta" => {
328             "_score" => 2,
329             "content" => "meta:content",
330             "http-equiv" => "meta:name",
331             "name" => "meta:name",
332             "charset" => "charset",
333             },
334             "title" => 2,
335              
336             #
337             # Safe elements commonly found in the <body> block follow.
338             #
339             "body" => {
340             "_score" => 2,
341             "link" => "color",
342             "alink" => "color",
343             "vlink" => "color",
344             "background" => "src",
345             "nowrap" => "boolean",
346             "text" => "color",
347             "vlink" => "color",
348             },
349             "a" => {
350             "charset" => "charset",
351             "coords" => "coords",
352             "href" => "href",
353             "shape" => "shape",
354             "target" => "target",
355             "type" => "mime-type",
356             "eudora" => "eudora",
357             "notrack" => "anything",
358             },
359             "address" => 1,
360             "area" => {
361             "alt" => "anything",
362             "coords" => "coords",
363             "href" => "href",
364             "nohref" => "anything",
365             "shape" => "shape",
366             "target" => "target",
367             },
368             "applet" => 0,
369             "basefont" => {
370             "face" => "font-face",
371             "family" => "font-face",
372             "back" => "color",
373             "size" => "number",
374             "ptsize" => "number",
375             },
376             "bdo" => 1,
377             "bgsound" => {
378             "balance" => "integer",
379             "delay" => "integer",
380             "loop" => "alnum",
381             "src" => "src",
382             "volume" => "integer",
383             },
384             "blockquote" => {
385             "cite" => "href",
386             "type" => "mime-type",
387             },
388             "br" => 1,
389             "button" => # FORM
390             {
391             "type" => "input-type",
392             "disabled" => "anything",
393             "value" => "anything",
394             "tabindex" => "number",
395             },
396             "caption" => 1,
397             "center" => 1,
398             "col" => \%TableAttributes,
399             "colgroup" => \%TableAttributes,
400             "comment" => 1,
401             "dd" => 1,
402             "del" => {
403             "cite" => "href",
404             "datetime" => "datetime",
405             },
406             "dir" => \%ListAttributes,
407             "div" => 1,
408             "dl" => \%ListAttributes,
409             "dt" => 1,
410             "embed" => 0,
411             "fieldset" => 1, # FORM
412             "font" => {
413             "face" => "font-face",
414             "family" => "font-face",
415             "back" => "color",
416             "size" => "number",
417             "ptsize" => "number",
418             },
419             "form" => # FORM
420             {
421             "method" => "form-method",
422             "action" => "href",
423             "enctype" => "form-enctype",
424             "accept" => "anything",
425             "accept-charset" => "anything",
426             },
427             "hr" => {
428             "size" => "number",
429             "noshade" => "anything",
430             },
431             "h1" => 1,
432             "h2" => 1,
433             "h3" => 1,
434             "h4" => 1,
435             "h5" => 1,
436             "h6" => 1,
437             "iframe" => 0,
438             "ilayer" => 0,
439             "img" => {
440             "alt" => "anything",
441             "border" => "size",
442             "dynsrc" => "src",
443             "hspace" => "size",
444             "ismap" => "anything",
445             "loop" => "alnum",
446             "lowsrc" => "src",
447             "nosend" => "alnum",
448             "src" => "src",
449             "start" => "alnum",
450             "usemap" => "usemap-href",
451             "vspace" => "size",
452             },
453             "inlineinput" => 0,
454             "input" => # FORM
455             {
456             "type" => "input-type",
457             "disabled" => "anything",
458             "value" => "anything",
459             "maxlength" => "input-size",
460             "size" => "input-size",
461             "readonly" => "anything",
462             "tabindex" => "number",
463             "checked" => "anything",
464             "accept" => "anything",
465              
466             # for type "image":
467             "alt" => "anything",
468             "border" => "size",
469             "dynsrc" => "src",
470             "hspace" => "size",
471             "ismap" => "anything",
472             "loop" => "alnum",
473             "lowsrc" => "src",
474             "nosend" => "alnum",
475             "src" => "src",
476             "start" => "alnum",
477             "usemap" => "usemap-href",
478             "vspace" => "size",
479             },
480             "ins" => {
481             "cite" => "href",
482             "datetime" => "datetime",
483             },
484             "isindex" => 0,
485             "keygen" => 0,
486             "label" => # FORM
487             { "for" => "alnum", },
488             "layer" => 0,
489             "legend" => 1, # FORM
490             "li" => { "value" => "integer", },
491             "listing" => 0,
492             "map" => 1,
493             "marquee" => 0,
494             "menu" => \%ListAttributes,
495             "multicol" => 0,
496             "nextid" => 0,
497             "nobr" => 0,
498             "noembed" => 1,
499             "nolayer" => 1,
500             "noscript" => 1,
501             "noembed" => 1,
502             "object" => 0,
503             "ol" => \%ListAttributes,
504             "optgroup" => # FORM
505             {
506             "disabled" => "anything",
507             "label" => "anything",
508             },
509             "option" => # FORM
510             {
511             "disabled" => "anything",
512             "label" => "anything",
513             "selected" => "anything",
514             "value" => "anything",
515             },
516             "o:p" => 1,
517             "p" => 1,
518             "param" => 0,
519             "plaintext" => 0,
520             "pre" => 1,
521             "rt" => 0,
522             "ruby" => 0,
523             "select" => # FORM
524             {
525             "disabled" => "anything",
526             "multiple" => "anything",
527             "size" => "input-size",
528             "tabindex" => "number",
529             },
530             "spacer" => 0,
531             "span" => 1,
532             "spell" => 0,
533             "sound" => {
534             "delay" => "number",
535             "loop" => "integer",
536             "src" => "src",
537             },
538             "table" => \%TableAttributes,
539             "tbody" => \%TableAttributes,
540             "textarea" => # FORM
541             {
542             "cols" => "input-size",
543             "rows" => "input-size",
544             "disabled" => "anything",
545             "readonly" => "anything",
546             "tabindex" => "number",
547             "wrap" => "anything",
548             },
549             "td" => \%TableAttributes,
550             "tfoot" => \%TableAttributes,
551             "th" => \%TableAttributes,
552             "thead" => \%TableAttributes,
553             "tr" => \%TableAttributes,
554             "ul" => \%ListAttributes,
555             "wbr" => 1,
556             "xml" => 0,
557             "xmp" => 0,
558             "x-html" => 0,
559             "x-tab" => 1,
560             "x-sigsep" => 1,
561              
562             # Character formatting
563             "abbr" => 1,
564             "acronym" => 1,
565             "big" => 1,
566             "blink" => 0,
567             "b" => 1,
568             "cite" => 1,
569             "code" => 1,
570             "dfn" => 1,
571             "em" => 1,
572             "i" => 1,
573             "kbd" => 1,
574             "q" => 1,
575             "s" => 1,
576             "samp" => 1,
577             "small" => 1,
578             "strike" => 1,
579             "strong" => 1,
580             "sub" => 1,
581             "sup" => 1,
582             "tt" => 1,
583             "u" => 1,
584             "var" => 1,
585              
586             #
587             # Safe elements commonly found in the <frameset> block follow.
588             #
589             "frameset" => 0,
590             "frame" => 0,
591             "noframes" => 1,
592             );
593              
594             # Some entity conversions for attributes
595             my %EntityToChar =
596             ( quot => '"', apos => "'", amp => '&', 'lt' => '<', 'gt' => '>' );
597             my %CharToEntity = reverse %EntityToChar;
598             my %QuoteRe = ( '"' => qr/(["&<>])/, "'" => qr/(['&<>])/, "" => qr/(["&<>])/ );
599              
600             # Default list of mismatched tags to track
601             my %MismatchedTags =
602             map { $_ => 1 } qw(table tbody thead tr td th font div span pre center);
603              
604             # When fixing mismatched tags, sometimes a close tag
605             # shouldn't close all the way out
606             # For example, consider:
607             # <table><tr><td><table><tr></td>
608             # A naive version would see the ending </td>, and thus
609             # try to fix the mismatched tags by doing:
610             # <table><tr><td><table><tr></tr></table></td>
611             # This is not what a browser does. So given a tag, we
612             # give a list of closing tags which cause us to stop
613             # and not close any more
614             my %MismatchedTagNest = (
615             table => [qw(tbody thead tfoot tr th td colgroup)],
616             tbody => [qw(tr th td)],
617             tr => [qw(th td)],
618             font => [''],
619             );
620              
621             # Convert to hash of hashes
622             $_ = { map { $_ => 1 } @$_ } for values %MismatchedTagNest;
623              
624             # If we see a table, we should expect to see a tbody
625             # next. If not, we need to add it because the browser
626             # will implicitly open it!
627             my %ImplicitOpenTags = (
628             table => [qw(tbody tr thead tfoot caption colgroup col)],
629             thead => [qw(tr)],
630             tbody => [qw(tr)],
631             tr => [qw(td th)],
632             );
633              
634             # Convert to hash of hashes
635             $_ = { default => $_->[0], map { $_ => 1 } @$_ } for values %ImplicitOpenTags;
636              
637             =head1 CONSTRUCTOR
638              
639             =over 4
640              
641             =cut
642              
643             =item I<MojoMojo::Declaw-E<gt>new(%Options)>
644              
645             Constructs a new HTML::Declaw object. The following options are supported:
646              
647             =over 4
648              
649             =item B<Options>
650              
651             =over 4
652              
653             =item B<tags_to_callback>
654              
655             Array reference of tags for which a call back should be made. If a tag in this array is parsed, the subroutine tags_callback() is invoked.
656              
657             =item B<attribs_to_callback>
658              
659             Array reference of tag attributes for which a call back should be made. If an attribute in this array is parsed, the subroutine attribs_callback() is invoked.
660              
661             =item B<tags_callback>
662              
663             Subroutine reference to be invoked when a tag listed in @$tags_to_callback is parsed.
664              
665             =item B<attribs_callback>
666              
667             Subroutine reference to be invoked when an attribute listed in @$attribs_to_callback is parsed.
668              
669             =item B<url_callback>
670              
671             Subroutine reference to be invoked when a URL is detected in an HTML tag attribute or a CSS property.
672              
673             =item B<css_callback>
674              
675             Subroutine reference to be invoked when CSS data is found either as the contents of a 'style' attribute in an HTML tag, or as the contents of a <style> HTML tag.
676              
677             =item B<fix_mismatched_tags>
678              
679             This property, if set, fixes mismatched tags in the HTML input. By default, tags present in the default %mismatched_tags_to_fix hash are fixed. This set of tags can be overridden by passing in an array reference $mismatched_tags_to_fix to the constructor. Any opened tags in the set are automatically closed if no corresponding closing tag is found. If an unbalanced closing tag is found, that is commented out.
680              
681             =item B<mismatched_tags_to_fix>
682              
683             Array reference of tags for which the code would check for matching opening and closing tags. See the property $fix_mismatched_tags.
684              
685             =item B<context>
686              
687             You can pass an arbitrary scalar as a 'context' value that's then passed as the first parameter to all callback functions. Most commonly this is something like '$Self'
688              
689             =item B<Debug>
690              
691             If set, prints debugging output.
692              
693             =back
694              
695             =back
696              
697             =back
698              
699             =cut
700              
701             sub new {
702 141     141 1 387 my $Proto = shift;
703 141   33     904 my $Class = ref($Proto) || $Proto;
704              
705 141         1129 my %Opts = @_;
706              
707             # my $Context = shift;
708              
709             my ( $tags_to_callback, $attribs_to_callback ) =
710 141         529 ( $Opts{"tags_to_callback"}, $Opts{"attribs_to_callback"} );
711 141 50       626 my %tags_to_callback = map { $_ => 1 } @$tags_to_callback
  705         2067  
712             if $tags_to_callback;
713 141 50       653 my %attribs_to_callback = map { $_ => 1 } @$attribs_to_callback
  423         1140  
714             if $attribs_to_callback;
715 141         1678 my %mismatched_tags_to_fix = %MismatchedTags;
716             %mismatched_tags_to_fix =
717 0         0 map { $_ => 1 } @{ $Opts{'mismatched_tags_to_fix'} }
  0         0  
718 141 50       603 if $Opts{'mismatched_tags_to_fix'};
719              
720             my $Self = {
721             DefangString => 'defang_',
722             tags_to_callback => \%tags_to_callback,
723             tags_callback => $Opts{tags_callback},
724             attribs_to_callback => \%attribs_to_callback,
725             attribs_callback => $Opts{attribs_callback},
726             url_callback => $Opts{url_callback},
727             css_callback => $Opts{css_callback},
728             mismatched_tags_to_fix => \%mismatched_tags_to_fix,
729             fix_mismatched_tags => $Opts{fix_mismatched_tags},
730             context => $Opts{context},
731             OpenedTags => [],
732             OpenedTagsCount => {},
733             ImplicitTags => [],
734             Debug => $Opts{Debug},
735 141         1637 };
736              
737 141         452 bless( $Self, $Class );
738 141         646 return $Self;
739             }
740              
741             =head1 CALLBACK METHODS
742              
743             =over 4
744              
745             =cut
746              
747             =item B<COMMON PARAMETERS>
748              
749             A number of the callbacks share the same parameters. These common parameters are documented here. Certain variables may have specific meanings in certain callbacks, so be sure to check the documentation for that method first before referring this section.
750              
751             =over 4
752              
753             =item I<$context>
754              
755             You can pass an arbitrary scalar as a 'context' value that's then passed as the first parameter to all callback functions. Most commonly this is something like '$Self'
756              
757             =item I<$Defang>
758              
759             Current HTML::Declaw instance
760              
761             =item I<$OpenAngle>
762              
763             Opening angle(<) sign of the current tag.
764              
765             =item I<$lcTag>
766              
767             Lower case version of the HTML tag that is currently being parsed.
768              
769             =item I<$IsEndTag>
770              
771             Has the value '/' if the current tag is a closing tag.
772              
773             =item I<$AttributeHash>
774              
775             A reference to a hash containing the attributes of the current tag and
776             their values. Each value is a scalar reference to the value, rather
777             than just a scalar value. You can add attributes (remember to make it a
778             scalar ref, eg $AttributeHash{"newattr"} = \"newval"), delete attributes,
779             or modify attribute values in this hash, and any changes you make will
780             be incorporated into the output HTML stream.
781              
782             The attribute values will have any entity references decoded before
783             being passed to you, and any unsafe values we be re-encoded back into
784             the HTML stream.
785              
786             So for instance, the tag:
787              
788             <div title="&lt;&quot;Hi there &#x003C;">
789              
790             Will have the attribute hash:
791              
792             { title => \q[<"Hi there <] }
793              
794             And will be turned back into the HTML on output:
795              
796             <div title="&lt;&quot;Hi there &lt;">
797              
798             =item I<$CloseAngle>
799              
800             Anything after the end of last attribute including the closing HTML angle(>)
801              
802             =item I<$HtmlR>
803              
804             A scalar reference to the input HTML. The input HTML is parsed using
805             m/\G$SomeRegex/c constructs, so to continue from where HTML:Defang left,
806             clients can use m/\G$SomeRegex/c for further processing on the input. This
807             will resume parsing from where HTML::Declaw left. One can also use the
808             pos() function to determine where HTML::Declaw left off. This combined
809             with the add_to_output() method should give reasonable flexibility for
810             the client to process the input.
811              
812             =item I<$OutR>
813              
814             A scalar reference to the processed output HTML so far.
815              
816             =back
817              
818             =item I<tags_callback($context, $Defang, $OpenAngle, $lcTag, $IsEndTag, $AttributeHash, $CloseAngle, $HtmlR, $OutR)>
819              
820             If $Defang->{tags_callback} exists, and HTML::Declaw has parsed a tag preset in $Defang->{tags_to_callback}, the above callback is made to the client code. The return value of this method determines whether the tag is defanged or not. More details below.
821              
822             =over 4
823              
824             =item B<Return values>
825              
826             =over 4
827              
828             =item I<0>
829              
830             The current tag will not be defanged.
831              
832             =item I<1>
833              
834             The current tag will be defanged.
835              
836             =item I<2>
837              
838             The current tag will be processed normally by HTML:Defang as if there was no callback method specified.
839              
840             =back
841              
842             =back
843              
844             =item I<attribs_callback($context, $Defang, $lcTag, $lcAttrKey, $AttrVal, $HtmlR, $OutR)>
845              
846             If $Defang->{attribs_callback} exists, and HTML::Declaw has parsed an attribute present in $Defang->{attribs_to_callback}, the above callback is made to the client code. The return value of this method determines whether the attribute is defanged or not. More details below.
847              
848             =over 4
849              
850             =item B<Method parameters>
851              
852             =over 4
853              
854             =item I<$lcAttrKey>
855              
856             Lower case version of the HTML attribute that is currently being parsed.
857              
858             =item I<$AttrVal>
859              
860             Reference to the HTML attribute value that is currently being parsed.
861              
862             See $AttributeHash for details of decoding.
863              
864             =back
865              
866             =item B<Return values>
867              
868             =over 4
869              
870             =item I<0>
871              
872             The current attribute will not be defanged.
873              
874             =item I<1>
875              
876             The current attribute will be defanged.
877              
878             =item I<2>
879              
880             The current attribute will be processed normally by HTML:Defang as if there was no callback method specified.
881              
882             =back
883              
884             =back
885              
886             =item I<url_callback($context, $Defang, $lcTag, $lcAttrKey, $AttrVal, $AttributeHash, $HtmlR, $OutR)>
887              
888             If $Defang->{url_callback} exists, and HTML::Declaw has parsed a URL, the above callback is made to the client code. The return value of this method determines whether the attribute containing the URL is defanged or not. URL callbacks can be made from <style> tags as well style attributes, in which case the particular style declaration will be commented out. More details below.
889              
890             =over 4
891              
892             =item B<Method parameters>
893              
894             =over 4
895              
896             =item I<$lcAttrKey>
897              
898             Lower case version of the HTML attribute that is currently being parsed. However if this callback is made as a result of parsing a URL in a style attribute, $lcAttrKey will be set to the string I<style>, or will be set to I<undef> if this callback is made as a result of parsing a URL inside a style tag.
899              
900             =item I<$AttrVal>
901              
902             Reference to the URL value that is currently being parsed.
903              
904             =item I<$AttributeHash>
905              
906             A reference to a hash containing the attributes of the current tag and their values. Each value is a scalar reference to the value,
907             rather than just a scalar value. You can add attributes (remember to make it a scalar ref, eg $AttributeHash{"newattr"} = \"newval"), delete attributes, or modify attribute values in this hash, and any changes you make will be incorporated into the output HTML stream. Will be set to I<undef> if the callback is made due to URL in a <style> tag or attribute.
908              
909             =back
910              
911             =item B<Return values>
912              
913             =over 4
914              
915             =item I<0>
916              
917             The current URL will not be defanged.
918              
919             =item I<1>
920              
921             The current URL will be defanged.
922              
923             =item I<2>
924              
925             The current URL will be processed normally by HTML:Defang as if there was no callback method specified.
926              
927             =back
928              
929             =back
930              
931             =item I<css_callback($context, $Defang, $Selectors, $SelectorRules, $lcTag, $IsAttr, $OutR)>
932              
933             If $Defang->{css_callback} exists, and HTML::Declaw has parsed a <style> tag or style attribtue, the above callback is made to the client code. The return value of this method determines whether a particular declaration in the style rules is defanged or not. More details below.
934              
935             =over 4
936              
937             =item B<Method parameters>
938              
939             =over 4
940              
941             =item I<$Selectors>
942              
943             Reference to an array containing the selectors in a style tag or attribute.
944              
945             =item I<$SelectorRules>
946              
947             Reference to an array containing the style declaration blocks of all selectors in a style tag or attribute. Consider the below CSS:
948              
949             a { b:c; d:e}
950             j { k:l; m:n}
951              
952             The declaration blocks will get parsed into the following data structure:
953              
954             [
955             [
956             [ "b", "c", 2],
957             [ "d", "e", 2]
958             ],
959             [
960             [ "k", "l", 2],
961             [ "m", "n", 2]
962             ]
963             ]
964              
965             So, generally each property:value pair in a declaration is parsed into an array of the form
966              
967             ["property", "value", X]
968              
969             where X can be 0, 1 or 2, and 2 the default value. A client can manipulate this value to instruct HTML::Declaw to defang this property:value pair.
970              
971             0 - Do not defang
972              
973             1 - Defang the style:property value
974              
975             2 - Process this as if there is no callback specified
976              
977             =item I<$IsAttr>
978              
979             True if the currently processed item is a style attribute. False if the currently processed item is a style tag.
980              
981             =back
982              
983             =back
984              
985             =back
986              
987             =cut
988              
989             =head1 METHODS
990              
991             =over 4
992              
993             =item B<PUBLIC METHODS>
994              
995             =over 4
996              
997             =item I<defang($InputHtml)>
998              
999             Cleans up $InputHtml of any executable code including scripting, embedded objects, applets, etc., and defang any XSS attacks.
1000              
1001             =over 4
1002              
1003             =item B<Method parameters>
1004              
1005             =over 4
1006              
1007             =item I<$InputHtml>
1008              
1009             The input HTML string that needs to be sanitized.
1010              
1011             =back
1012              
1013             =back
1014              
1015             Returns the cleaned HTML. If fix_mismatched_tags is set, any tags that appear in @$mismatched_tags_to_fix that are unbalanced are automatically commented or closed.
1016              
1017             =cut
1018              
1019             sub defang {
1020 141     141 1 324 my $Self = shift;
1021              
1022 141         308 my $I = shift;
1023              
1024 141         436 my $Debug = $Self->{Debug};
1025              
1026 141         300 my $HeaderCharset = shift;
1027 141 50       458 warn("defang HeaderCharset=$HeaderCharset") if $Debug;
1028 141         283 my $FallbackCharset = shift;
1029 141 50       485 warn("defang FallbackCharset=$FallbackCharset") if $Debug;
1030              
1031 141         399 $Self->{Reentrant}++;
1032              
1033             # Get encoded characters
1034             # $Self->{Charset} = $Self->get_applicable_charset($_, $HeaderCharset, $FallbackCharset);
1035             # warn("defang Charset=$Self->{Charset}") if $Self->{Debug};
1036              
1037             # if ($Self->{Charset}) {
1038             # $I =~ s/(.)/chr(ord($1) & 127)/ge if $Self->{Charset} eq "US-ASCII";
1039             # my $Encoder = Encode::Encoder->new($I, $Self->{Charset});
1040             # $I = $Encoder->bytes($Self->{Charset});
1041             # }
1042              
1043             # We pass a ref to $I to each callback. It should
1044             # never be modified because we use a m/\G.../gc loop
1045             # on it. If possible, stop people modifying it
1046 141 50       459 readonly_on($I) if $HasScalarReadonly;
1047              
1048             # It seems regexp matching on perl unicode strings can be *way*
1049             # slower than byte string (defang 1M email = 100 seconds unicode,
1050             # 5 seconds bytes).
1051             # So we're going to do a bit of a hack. Engaged "use bytes" to do
1052             # byte matching everywhere, but since we know we'll be matching
1053             # on correct boundaries to make up full code points in utf-8, we'll
1054             # turn on the magic utf-8 flag again for those values
1055 141         594 my $UTF8Input = Encode::is_utf8($I);
1056              
1057             # Force byte matching everywhere (see above)
1058 27     27   242 use bytes;
  27         81  
  27         280  
1059              
1060             # Strip all NUL chars
1061 141         450 $I =~ s/\0//g;
1062              
1063             # Output buffer
1064 141         356 my $O = '';
1065              
1066             # This parser uses standard /\G.../gc matching, so have to be careful
1067             # to not reset pos() on the string
1068             #
1069             # Previously we tried an "eating" parser (s/^.../, or /^.../ + substr),
1070             # which in theory should be fast with perls internal string offset
1071             # feature, but it seems offset doesn't work on unicode strings,
1072             # so you end up with a slow parser because of string reallocations
1073              
1074 141         306 while (1) {
1075              
1076             # walk to next < (testing in 5.8.8 shows .*? is faster than [^<]* or [^<]*?)
1077 1356 100       5330 if ( $I =~ m{\G(.*?)<}gcso ) {
1078              
1079             # Everything before tag goes into the output
1080 1215         2968 $O .= $1;
1081              
1082             # All tags default to open/close with </>
1083 1215         2359 my ( $OpenAngle, $CloseAngle ) = ( '<', '>' );
1084 1215 100       3038 my $IsEndTag = $I =~ m{\G/}gcso ? '/' : '';
1085              
1086             # It's a standard tag
1087 1215 100       3840 if ( $I =~ m{\G($TagNameRE)}gcso ) {
    100          
    50          
1088              
1089 1210         2252 my $Tag = $1;
1090 1210 100       3092 my $TagTrail = $I =~ m{\G([\s/]+)}gcso ? $1 : '';
1091              
1092 1210 50       2634 warn "defang IsEndTag=$IsEndTag Tag=$Tag" if $Debug;
1093              
1094             # Skip attribute parsing if none
1095 1210         1754 my @Attributes;
1096 1210 100       3880 goto NoParseAttributes if $I =~ m{\G>}gcso;
1097              
1098             # Pull off any trailing component after the tag
1099             # Now match all key=value attributes
1100 228         2706 while ( $I =~
1101             m{\G(?:($AttrKeyStartLineRE)(\s*))?(?:(=\s*)($AttrValRE)(\s*))?}gcso
1102             )
1103             {
1104              
1105 573 50 66     1916 last if !defined($1) && !defined($4);
1106             my (
1107 345         1338 $Attribute, $AttrTrail, $Equals,
1108             $AttrVal, $AttrValTrail
1109             ) = ( $1, $2, $3, $4, $5 );
1110 345         712 my ( $AttrQuote, $AttrValWithoutQuote ) = '';
1111 345 50 33     2140 if ( defined($4) && $4 =~ /^([`"']?)(.*)\1$/s ) {
1112              
1113             # IE supports `, but nothing else does, turn it into "
1114 345 50       1033 $AttrQuote = $1 eq '`' ? '"' : $1;
1115 345         709 $AttrValWithoutQuote = $2;
1116             }
1117              
1118             # Turn on utf-8 for things that might be
1119 345 100       849 Encode::_utf8_on($Attribute) if $UTF8Input;
1120 345 100       789 Encode::_utf8_on($AttrValWithoutQuote) if $UTF8Input;
1121              
1122 345         1185 push @Attributes,
1123             [
1124             $Attribute, $AttrTrail,
1125             $Equals, $AttrQuote,
1126             $AttrValWithoutQuote, $AttrQuote,
1127             $AttrValTrail
1128             ];
1129 345 50       1669 warn
1130             "defang AttributeKey=$1 AttrQuote=$AttrQuote AttributeValue=$Attribute"
1131             if $Debug;
1132             }
1133              
1134             # Better be at end of attributes, or attach our own ending tag
1135 228 100       908 if ( $I =~ m{\G(?:(\s*[/\\]*\s*(?:--)?\s*)?>|([\s/-]*))}gcs ) {
1136 225 50       872 $CloseAngle = $1 ? $1 . '>' : ( $2 ? $2 . '>' : '>' );
    100          
1137             }
1138              
1139             NoParseAttributes:
1140 1210         1953 my $Defang = 1;
1141              
1142 1210         2564 my $TagOps = $Tags{ lc $Tag };
1143              
1144             # Process this tag
1145 1210 100       2596 if ( ref $TagOps eq "CODE" ) {
1146              
1147 2 50       6 warn "process_tag Found CODE reference" if $Debug;
1148 2         7 $Defang = $Self ->${TagOps}(
1149             \$O, \$I, $TagOps,
1150             \$OpenAngle, $IsEndTag, $Tag,
1151             $TagTrail, \@Attributes, \$CloseAngle
1152             );
1153              
1154             }
1155             else {
1156              
1157 1208 50       2537 warn "process_tag Found regular tag" if $Debug;
1158 1208         3571 $Defang = $Self->defang_attributes(
1159             \$O, \$I, $TagOps,
1160             \$OpenAngle, $IsEndTag, $Tag,
1161             $TagTrail, \@Attributes, \$CloseAngle
1162             );
1163              
1164             }
1165 1210 50       3067 die "Callback reset pos on Tag=$Tag IsEndTag=$IsEndTag"
1166             if !defined pos($I);
1167 1210 50       2559 warn "defang Defang=$Defang" if $Debug;
1168              
1169             # defang unknown tags
1170 1210 100       2767 if ($Defang) {
1171 3 50       7 warn "defang Defanging $Tag" if $Debug;
1172 3         6 $Tag = $Self->{DefangString} . $Tag;
1173 3         10 $OpenAngle =~ s/</<!--/;
1174 3         7 $CloseAngle =~ s/>/-->/;
1175             }
1176              
1177             # And put it all back together into the output string
1178             $O .=
1179             $OpenAngle
1180             . $IsEndTag
1181             . $Tag
1182             . $TagTrail
1183 1210         3335 . join( "", grep { defined } map { @$_ } @Attributes )
  2759         5003  
  345         1277  
1184             . $CloseAngle;
1185              
1186             # It's a comment of some sort. We are looking for regular HTML comment, XML CDATA section and
1187             # IE conditional comments
1188             # Refer http://msdn.microsoft.com/en-us/library/ms537512.aspx for IE conditional comment information
1189             }
1190             elsif ( $I =~ m{\G(!)((?:\[CDATA\[|(?:--)?\[if|--)?)}gcis ) {
1191              
1192 4         15 my ( $Comment, $CommentDelim ) = ( $1, $2 );
1193 4 50       13 warn "defang Comment=$Comment CommentDelim=$CommentDelim"
1194             if $Debug;
1195              
1196             # Find the appropriate closing delimiter
1197 4         11 my $IsCDATA = $CommentDelim eq "[CDATA[";
1198 4 50       11 my $ClosingCommentDelim = $IsCDATA ? "]]" : $CommentDelim;
1199              
1200 4         8 my $EndRestartCommentsText = '';
1201              
1202             # Handle IE conditionals specially. We can have <![if ...]>, <!--[if ...]> and <!--[if ...]-->
1203             # for the third case, we just want to immediately match the -->
1204 4 50       13 if ( $CommentDelim =~ /((?:--)?)\[if/ ) {
1205 0         0 my $ConditionalDelim = $1;
1206 0 0       0 $EndRestartCommentsText = '--' if $ConditionalDelim eq '';
1207 0         0 $ClosingCommentDelim = $CommentDelim;
1208 0 0       0 if ( $I !~ m{\G[^\]]*\]-->}gcis ) {
1209 0         0 $ClosingCommentDelim = "<![endif]$ConditionalDelim";
1210             }
1211             }
1212              
1213 4 50       12 warn "defang ClosingCommentDelim=$ClosingCommentDelim"
1214             if $Debug;
1215              
1216 4         10 my ( $CommentStartText, $CommentEndText ) =
1217             ( "--/*SC*/", "/*EC*/--" );
1218              
1219             # Convert to regular HTML comment
1220 4         10 $O .= $OpenAngle . $Comment . $CommentStartText;
1221              
1222             # Find closing comment
1223 4 50 33     61 if ( $I =~ m{\G(.*?)(\Q${ClosingCommentDelim}\E!?\s*)(>)}gcis
1224             || $I =~ m{\G(.*?)(--)(>)}gcis )
1225             {
1226              
1227 4         14 my ( $StartTag, $CommentData, $ClosingTag, $CloseAngle ) =
1228             ( $CommentDelim, $1, $2, $3 );
1229              
1230 4 50 33     16 if ( $EndRestartCommentsText
1231             && $CommentData =~ s/^(.*?)(>.*)$/$2/s )
1232             {
1233 0         0 $StartTag .= $1;
1234             }
1235              
1236             # Strip all HTML comment markers
1237 4         14 $StartTag =~ s/--//g;
1238 4         10 $CommentData =~ s/--//g;
1239 4         11 $ClosingTag =~ s/--//g;
1240              
1241 4 50       10 $StartTag .= $EndRestartCommentsText if $CommentData;
1242 4 50       12 $ClosingTag =~ s{^(<!)}{$1$EndRestartCommentsText}
1243             if $CommentData;
1244              
1245             # Put it all into the output
1246 4 50       17 $O .= $StartTag
1247             . (
1248             $EndRestartCommentsText
1249             ? $Self->defang($CommentData)
1250             : $CommentData
1251             )
1252             . $ClosingTag
1253             . $CommentEndText
1254             . $CloseAngle;
1255              
1256             # No closing comment, so we add that
1257             }
1258             else {
1259              
1260 0 0       0 $I =~ m/\G(.*)$/gcs || die "Remainder of line match failed";
1261              
1262 0         0 my $Data = $1;
1263 0         0 $Data =~ s/--//g;
1264              
1265             # Output
1266 0         0 $O .= $Data . $CommentEndText . ">";
1267              
1268             }
1269              
1270             # XML processing instruction
1271             }
1272             elsif ( $I =~ m{\G(\?)}gcs ) {
1273 0         0 my ($Processing) = ($1);
1274 0 0       0 warn "defang XML processing instruction" if $Debug;
1275              
1276 0         0 my $Data;
1277 0 0       0 if ( $I =~ m{\G(.*?\??)>}gcs ) { # || goto OutputRemainder;
1278 0         0 $Data = $1;
1279             }
1280             else {
1281 0         0 $I =~ m{\G(.*)$}gcs;
1282 0         0 $Data = $1;
1283             }
1284              
1285 0         0 $Data =~ s{--}{}g;
1286              
1287 0         0 $O .= $OpenAngle . '!--' . $Processing . $Data . '-->';
1288              
1289             }
1290              
1291             # Some other thing starting with <, keep looking
1292              
1293 1215 100       3022 if ( exists $Self->{AppendOutput} ) {
1294 1         2 $O .= delete $Self->{AppendOutput};
1295             }
1296 1215         2135 next;
1297             }
1298              
1299             OutputRemainder:
1300              
1301             # No tag found, just copy rest
1302 141 50       450 warn "defang OutputRemainder" if $Debug;
1303 141         539 $I =~ m/\G(.*)$/gcs;
1304              
1305 141 100       586 $O .= $1 if $1;
1306              
1307             # Exit if we got here
1308 141         286 last;
1309             }
1310              
1311             # If not a recursive call, close mismatched tags
1312 141 50       568 if ( $Self->{Reentrant} <= 1 ) {
1313 141         351 my $RemainingClosingTags = '';
1314              
1315             my ( $OpenedTags, $OpenedTagsCount ) =
1316 141         423 @$Self{qw(OpenedTags OpenedTagsCount)};
1317 141         569 while ( my $PreviousOpenedTag = pop @$OpenedTags ) {
1318 0         0 $RemainingClosingTags .=
1319             "<!-- close mismatch --></$PreviousOpenedTag>";
1320 0         0 $OpenedTagsCount->{$PreviousOpenedTag}--;
1321             }
1322 141         316 $O .= $RemainingClosingTags;
1323              
1324             # Also clear implicit tags
1325 141         373 $Self->{ImplicitTags} = [];
1326              
1327 141 50       496 if ($Debug) {
1328 0         0 warn "Check all tags closed and counts zeroed";
1329             warn "Not all tags closed"
1330 0 0       0 if grep { $_ > 0 } values %$OpenedTagsCount;
  0         0  
1331             }
1332             }
1333              
1334 141         348 $Self->{Reentrant}--;
1335              
1336             # Turn on utf-8 flag again
1337 141 100       488 Encode::_utf8_on($O) if $UTF8Input;
1338              
1339 141         646 return $O;
1340             }
1341              
1342             =item I<add_to_output($String)>
1343              
1344             Appends $String to the output after the current parsed tag ends. Can be used by client code in callback methods to add HTML text to the processed output. If the HTML text needs to be defanged, client code can safely call HTML::Declaw->defang() recursively from within the callback.
1345              
1346             =over 4
1347              
1348             =item B<Method parameters>
1349              
1350             =over 4
1351              
1352             =item I<$String>
1353              
1354             The string that is added after the current parsed tag ends.
1355              
1356             =back
1357              
1358             =back
1359              
1360             =back
1361              
1362             =cut
1363              
1364             # Callbacks call this method
1365             sub add_to_output {
1366 0     0 1 0 my $Self = shift;
1367 0         0 $Self->{AppendOutput} = shift;
1368             }
1369              
1370             =item defang_and_add_to_output
1371              
1372             defang and add result to output
1373              
1374             =cut
1375              
1376             sub defang_and_add_to_output {
1377 0     0 1 0 my $Self = shift;
1378 0         0 $Self->add_to_output( $Self->defang(shift) );
1379             }
1380              
1381             =item B<INTERNAL METHODS>
1382              
1383             Generally these methods never need to be called by users of the class, because they'll be called internally as the appropriate tags are
1384             encountered, but they may be useful for some users in some cases.
1385              
1386             =over 4
1387              
1388             =item I<defang_script($OutR, $HtmlR, $TagOps, $OpenAngle, $IsEndTag, $Tag, $TagTrail, $Attributes, $CloseAngle)>
1389              
1390             This method is invoked when a <script> tag is parsed. Defangs the <script> opening tag, and any closing tag. Any scripting content is also commented out, so browsers don't display them.
1391              
1392             Returns 1 to indicate that the <script> tag must be defanged.
1393              
1394             =over 4
1395              
1396             =item B<Method parameters>
1397              
1398             =over 4
1399              
1400             =item I<$OutR>
1401              
1402             A reference to the processed output HTML before the tag that is currently being parsed.
1403              
1404             =item I<$HtmlR>
1405              
1406             A scalar reference to the input HTML.
1407              
1408             =item I<$TagOps>
1409              
1410             Indicates what operation should be done on a tag. Can be undefined, integer or code reference. Undefined indicates an unknown tag to HTML::Declaw, 1 indicates a known safe tag, 0 indicates a known unsafe tag, and a code reference indicates a subroutine that should be called to parse the current tag. For example, <style> and <script> tags are parsed by dedicated subroutines.
1411              
1412             =item I<$OpenAngle>
1413              
1414             Opening angle(<) sign of the current tag.
1415              
1416             =item I<$IsEndTag>
1417              
1418             Has the value '/' if the current tag is a closing tag.
1419              
1420             =item I<$Tag>
1421              
1422             The HTML tag that is currently being parsed.
1423              
1424             =item I<$TagTrail>
1425              
1426             Any space after the tag, but before attributes.
1427              
1428             =item I<$Attributes>
1429              
1430             A reference to an array of the attributes and their values, including any surrouding spaces. Each element of the array is added by 'push' calls like below.
1431              
1432             push @$Attributes, [ $AttributeName, $SpaceBeforeEquals, $EqualsAndSubsequentSpace, $QuoteChar, $AttributeValue, $QuoteChar, $SpaceAfterAtributeValue ];
1433              
1434             =item I<$CloseAngle>
1435              
1436             Anything after the end of last attribute including the closing HTML angle(>)
1437              
1438             =back
1439              
1440             =back
1441              
1442             =cut
1443              
1444             sub defang_script {
1445 2     2 1 4 my $Self = shift;
1446             my (
1447 2         5 $OutR, $HtmlR, $TagOps, $OpenAngle, $IsEndTag,
1448             $Tag, $TagTrail, $Attributes, $CloseAngle
1449             ) = @_;
1450 2 50       7 warn "defang_script Processing <script> tag" if $Self->{Debug};
1451              
1452 2 100       5 if ( !$IsEndTag ) {
1453              
1454             # If we just parsed a starting <script> tag, code better be commented. If
1455             # not, we attach comments around the code.
1456 1 50       6 if ( $$HtmlR =~ m{\G(.*?)(?=</script\b)}gcsi ) {
1457 1         3 my $ScriptTagContents = $1;
1458             warn "defang_script ScriptTagContents $ScriptTagContents"
1459 1 50       3 if $Self->{Debug};
1460 1         8 $ScriptTagContents =~
1461             s/^(\s*)(<!--)?(.*?)(-->)?(\s*)$/$1<!-- $3 -->$5/s;
1462 1         5 $Self->{AppendOutput} .= $ScriptTagContents;
1463              
1464             }
1465             }
1466              
1467             # Also defang tag
1468 2         3 return 1;
1469              
1470             }
1471              
1472             =item I<defang_style($OutR, $HtmlR, $TagOps, $OpenAngle, $IsEndTag, $Tag, $TagTrail, $Attributes, $CloseAngle, $IsAttr)>
1473              
1474             Builds a list of selectors and declarations from HTML style tags as well as style attributes in HTML tags and calls defang_stylerule() to do the actual defanging.
1475              
1476             Returns 0 to indicate that style tags must not be defanged.
1477              
1478             =over 4
1479              
1480             =item B<Method parameters>
1481              
1482             =over 4
1483              
1484             =item I<$IsAttr>
1485              
1486             Whether we are currently parsing a style attribute or style tag. $IsAttr will be true if we are currently parsing a style attribute.
1487              
1488             =back
1489              
1490             For a description of other parameters, see documentation of defang_script() method
1491              
1492             =back
1493              
1494             =cut
1495              
1496             sub defang_style {
1497              
1498             my (
1499 4     4 1 19 $Self, $OutR, $HtmlR, $TagOps,
1500             $OpenAngle, $IsEndTag, $Tag, $TagTrail,
1501             $Attributes, $CloseAngle, $IsAttr
1502             ) = @_;
1503 4         13 my $lcTag = lc $Tag;
1504              
1505             warn "defang_style Tag=$Tag IsEndTag=$IsEndTag IsAttr=$IsAttr"
1506 4 50       21 if $Self->{Debug};
1507              
1508             # Nothing to do if end tag
1509 4 50 33     21 return 0 if !$IsAttr && $IsEndTag;
1510              
1511 4         10 my $Content = '';
1512 4         9 my $ClosingStyleTagPresent = 1;
1513              
1514 4         12 for ($$HtmlR) {
1515              
1516 4 50       15 if ( !$IsAttr ) {
1517 0 0       0 if (m{\G(.*?)(?=</style\b)}gcis) {
    0          
1518 0         0 $Content = $1;
1519              
1520             # No ending style tag
1521             }
1522             elsif (m{\G([^<]*)}gcis) {
1523 0         0 $Content = $1;
1524 0         0 $ClosingStyleTagPresent = 0;
1525             }
1526              
1527             # Its a style attribute
1528             }
1529             else {
1530              
1531             # Avoid undef warning for style tag with no value. eg <tag style>
1532 4 50       28 $Content = defined($_) ? $_ : '';
1533             }
1534             }
1535              
1536             # Clean up all comments, expand character escapes and such
1537 4         20 $Self->cleanup_style($Content);
1538              
1539             # Handle any wrapping HTML comments. If no comments, we add
1540 4         12 my ( $OpeningHtmlComment, $ClosingHtmlComment ) = ( '', '' );
1541 4 50       14 if ( !$IsAttr ) {
1542 0         0 $Content =~ s{^(\s*<!--)?(.*?)(-->\s*)?$}{$2}s;
1543 0   0     0 ( $OpeningHtmlComment, $ClosingHtmlComment ) =
      0        
1544             ( $1 || "<!--", $3 || "-->" );
1545             }
1546              
1547             # Style attributes can optionally have selector type elements, so we check whether we
1548             # have a '{' in $Content: if yes, its style data with selector type elements
1549 4         13 my $Naked = $Content !~ m/\{/;
1550 4         163 my $StyleRule =
1551             qr/\s*$StyleKey\s*:\s*$StyleValue\s*(?:;\s*$StyleKey\s*:\s*$StyleValue\s*)*;?\s*/o;
1552 4 50       21 warn "defang_style Naked=$Naked" if $Self->{Debug};
1553              
1554             # And suitably change the regex to match the data
1555 4 50       131 my $SelectorRuleRE =
1556             $Naked
1557             ? qr/(\s*)()()()($StyleRule)()(\s*)/o
1558             : qr/(\s*)((?:$Selectors))(\s*)(\{)($StyleRule)(\})(\s*)/o;
1559              
1560 4         15 my ( @Selectors, @SelectorRules, %ExtraData );
1561              
1562             # Now we parse the selectors and declarations
1563 4         61 while ( $Content =~ s{$SelectorRuleRE}{} ) {
1564 4         18 my ( $Selector, $SelectorRule ) = ( $2, $5 );
1565 4         14 push @Selectors, $Selector;
1566 4         10 push @SelectorRules, $SelectorRule;
1567 4 50       18 warn "defang_style Selector=$Selector" if $Self->{Debug};
1568 4 50       17 warn "defang_style SelectorRule=$SelectorRule" if $Self->{Debug};
1569 4         39 $ExtraData{$Selector} = [ $1, $3, $4, $6, $7 ];
1570             }
1571              
1572             # Check declaration elements for defanging
1573 4         25 $Self->defang_stylerule( \@Selectors, \@SelectorRules, $lcTag, $IsAttr,
1574             $HtmlR, $OutR );
1575              
1576 4         8 my $StyleOut = "";
1577              
1578             # Re-create the style data
1579 4         11 foreach my $Selector (@Selectors) {
1580              
1581 4         9 my $SelectorRule = shift @SelectorRules;
1582 4         10 my $Spaces = $ExtraData{$Selector};
1583             my (
1584 4 50       21 $BeforeSelector, $AfterSelector, $OpenBrace,
1585             $CloseBrace, $AfterRule
1586             )
1587             = @$Spaces
1588             if $Spaces;
1589             ( $BeforeSelector, $AfterSelector, $AfterRule ) = ( "", " ", "\n" )
1590 4 50       14 unless $ExtraData{$Selector};
1591 4 0 33     15 ( $OpenBrace, $CloseBrace ) = ( "{", "}" ) if !$Spaces && !$IsAttr;
1592              
1593             # Put back the rule together
1594 4 50       16 if ( defined($Selector) ) {
1595 4 50       16 $StyleOut .= $BeforeSelector if defined($BeforeSelector);
1596 4         9 $StyleOut .= $Selector;
1597 4 50       15 $StyleOut .= $AfterSelector if defined($AfterSelector);
1598 4 50       12 $StyleOut .= $OpenBrace if defined($OpenBrace);
1599 4 50       13 $StyleOut .= $SelectorRule if defined($SelectorRule);
1600 4 50       12 $StyleOut .= $CloseBrace if defined($CloseBrace);
1601 4 50       16 $StyleOut .= $AfterRule if defined($AfterRule);
1602             }
1603              
1604             }
1605              
1606 4 50       15 warn "defang_style StyleOut=$StyleOut" if $Self->{Debug};
1607              
1608 4 50       15 if ($IsAttr) {
1609 4         9 $$HtmlR = $StyleOut;
1610              
1611             }
1612             else {
1613             $Self->{AppendOutput} .=
1614 0         0 $OpeningHtmlComment . $StyleOut . $ClosingHtmlComment;
1615 0 0       0 $Self->{AppendOutput} .= "</style>" if !$ClosingStyleTagPresent;
1616             }
1617              
1618             # We don't want <style> tags to be defanged
1619 4         21 return 0;
1620              
1621             }
1622              
1623             =item I<cleanup_style($StyleString)>
1624              
1625             Helper function to clean up CSS data. This function directly operates on the input string without taking a copy.
1626              
1627             =over 4
1628              
1629             =item B<Method parameters>
1630              
1631             =over 4
1632              
1633             =item I<$StyleString>
1634              
1635             The input style string that is cleaned.
1636              
1637             =back
1638              
1639             =back
1640              
1641             =cut
1642              
1643             sub cleanup_style {
1644 4     4 1 11 my $Self = shift;
1645              
1646 4         13 for ( $_[0] ) {
1647              
1648             # Expand escapes
1649 4 0       21 s/(?:&x|\\)(0?[\da-f]{1,6});?/defined($1) ? chr(hex($1)) : ""/egi;
  0         0  
1650 4 0       15 s/(?:&#)([\d]{1,7});?/defined($1) ? chr($1) : ""/egi;
  0         0  
1651              
1652             # Remove all remaining invalid escapes TODO This probably is not correct. Backslashes are required to be left alone by the CSS syntax
1653 4         11 s/\\//g;
1654              
1655             # Remove all CSS comments
1656 4         12 s{/\*.*?\*/}{}sg;
1657              
1658             # Remove any CSS imports
1659 4         14 s{(\@import[^;]+;?)}{}sg;
1660              
1661 4 50       17 warn "cleanup_style Content=$_" if $Self->{Debug};
1662             }
1663              
1664             }
1665              
1666             =item I<defang_stylerule($SelectorsIn, $StyleRules, $lcTag, $IsAttr, $HtmlR, $OutR)>
1667              
1668             Defangs style data.
1669              
1670             =over 4
1671              
1672             =item B<Method parameters>
1673              
1674             =over 4
1675              
1676             =item I<$SelectorsIn>
1677              
1678             An array reference to the selectors in the style tag/attribute contents.
1679              
1680             =item I<$StyleRules>
1681              
1682             An array reference to the declaration blocks in the style tag/attribute contents.
1683              
1684             =item I<$lcTag>
1685              
1686             Lower case version of the HTML tag that is currently being parsed.
1687              
1688             =item I<$IsAttr>
1689              
1690             Whether we are currently parsing a style attribute or style tag. $IsAttr will be true if we are currently parsing a style attribute.
1691              
1692             =item I<$HtmlR>
1693              
1694             A scalar reference to the input HTML.
1695              
1696             =item I<$OutR>
1697              
1698             A scalar reference to the processed output so far.
1699              
1700             =back
1701              
1702             =back
1703              
1704             =cut
1705              
1706             sub defang_stylerule {
1707              
1708 4     4 1 15 my ( $Self, $SelectorsIn, $StyleRules, $lcTag, $IsAttr, $HtmlR, $OutR ) =
1709             @_;
1710              
1711 4         17 my ( @SelectorStyleKeyValues, %SelectorStyleKeyExtraData );
1712              
1713 4         0 my ( @Selectors, @SelectorRules );
1714              
1715 4         13 foreach my $Selector (@$SelectorsIn) {
1716              
1717 4 50       14 warn "defang_stylerule Selector=$Selector" if $Self->{Debug};
1718 4         37 my $Rule = shift @$StyleRules;
1719 4         11 my ( @SelectorRule, @KeyValueRules, %StyleKeyExtraData );
1720              
1721             # Split style declaration to basic elements
1722 4         38 while ( $Rule =~
1723             s{^(\{?\s*)([^:]+?)(\s*:\s*)((?:)?)([^;\}]+)()?(\s*;?)(\s*\}?)}{} )
1724             {
1725             my (
1726 6         35 $KeyPilot, $Key, $Separator, $QuoteStart,
1727             $Value, $QuoteEnd, $ValueEnd, $ValueTrail
1728             ) = ( $1, $2, $3, $4, $5, $6, $7, $8 );
1729              
1730             warn
1731             "defang_stylerule Key=$Key Value=$Value Separator=$Separator ValueEnd=$ValueEnd"
1732 6 50       21 if $Self->{Debug};
1733              
1734             # Store everything except style property and value in a hash
1735 6         35 $StyleKeyExtraData{ lc $Key } = [
1736             $KeyPilot, $Separator, $QuoteStart,
1737             $QuoteEnd, $ValueEnd, $ValueTrail
1738             ];
1739 6         16 my $DefangStyleRule = 2;
1740              
1741             # If the style value has a URL in it and URL callback has been supplied, make a url_callback
1742 6 50 33     45 if ( $Self->{url_callback}
1743             && $Value =~ m/\s*url\(\s*((?:['"])?)(.*?)\1\s*\)/i )
1744             {
1745 0 0       0 my ( $UrlOrig, $Url ) = ( $2, $2 ) if $2;
1746             warn
1747             "defang_stylerule Url found in style property value. Url=$Url"
1748 0 0       0 if $Self->{Debug};
1749 0 0       0 my $lcAttrKey = $IsAttr ? "style" : undef;
1750             $DefangStyleRule = $Self->{url_callback}->(
1751 0 0       0 $Self->{context}, $Self, $lcTag, $lcAttrKey, \$Url, undef,
1752             $HtmlR, $OutR
1753             ) if $Url;
1754              
1755             # Save back any changes
1756             warn
1757             "defang_stylerule After URL callback, Value=$Value DefangStyleRule=$DefangStyleRule"
1758 0 0       0 if $Self->{Debug};
1759 0 0       0 $Value =~ s{\Q$UrlOrig\E}{$Url} if $UrlOrig;
1760             }
1761              
1762             # Save the style property, value and defang flag
1763 6         18 push @KeyValueRules, [ $Key, $Value, $DefangStyleRule ];
1764             warn
1765             "defang_stylerule Key=$Key Value=$Value DefangStyleRule=$DefangStyleRule"
1766 6 50       44 if $Self->{Debug};
1767              
1768             }
1769              
1770 4         14 push( @SelectorRule, \@KeyValueRules );
1771 4         234 push( @Selectors, $Selector );
1772 4         10 push( @SelectorRules, \@SelectorRule );
1773 4         18 $SelectorStyleKeyExtraData{$Selector} = \%StyleKeyExtraData;
1774              
1775             }
1776              
1777             # If a CSS callback is supplied, we call that
1778             $Self->{css_callback}->(
1779             $Self->{context}, $Self, \@Selectors, \@SelectorRules, $lcTag, $IsAttr,
1780             $OutR
1781 4 50       33 ) if $Self->{css_callback};
1782              
1783             warn
1784             "defang_stylerule More selectors($#Selectors) than selector rules($#SelectorRules)"
1785             if $Self->{Debug}
1786 4 50 33     19 && $#Selectors > $#SelectorRules;
1787              
1788 4         11 my $Counter = 0;
1789 4         12 foreach my $Selector (@Selectors) {
1790              
1791 4         13 my $SelectorRule = $SelectorRules[$Counter];
1792 4         9 my $ExtraData = $SelectorStyleKeyExtraData{$Selector};
1793 4         10 my $Rule;
1794              
1795 4         18 for ( my $j = 0 ; $j <= $#$SelectorRule ; $j++ ) {
1796 4         11 my $KeyValueRules = $$SelectorRule[$j];
1797              
1798 4         17 for ( my $k = 0 ; $k <= $#$KeyValueRules ; $k++ ) {
1799 6         18 my $KeyValueRule = $$KeyValueRules[$k];
1800 6         16 my ( $Key, $Value, $Defang ) = @$KeyValueRule;
1801 6         18 my @v = $$ExtraData{ lc $Key };
1802             my (
1803             $KeyPilot, $Separator, $QuoteStart,
1804             $QuoteEnd, $ValueEnd, $ValueTrail
1805             )
1806 6         21 = @{ $v[0] }
1807 6 50       22 if $$ExtraData{ lc $Key };
1808              
1809             # If an intermediate style property-value pair doesn't have a terminating semi-colon, add it
1810 6 50 66     48 if ( $k > 0 && !$$ExtraData{ lc $Key } ) {
1811 0         0 my $PreviousRule = $KeyValueRules->[ $k - 1 ];
1812 0         0 my $PreviousKey = $PreviousRule->[0];
1813 0         0 my @PrevExtra = $ExtraData->{ lc $PreviousKey };
1814 0 0 0     0 $ExtraData->{ lc $PreviousKey }->[4] .= ";"
1815             if defined( $PrevExtra[0]->[4] )
1816             && $PrevExtra[0]->[4] !~ m/;/;
1817 0         0 $ExtraData->{ lc $Key }->[1] = ":";
1818             }
1819              
1820             }
1821              
1822             }
1823              
1824 4         12 $Counter++;
1825             }
1826              
1827 4         7 $Counter = 0;
1828 4         11 foreach my $Selector (@Selectors) {
1829              
1830 4 50 33     153 $SelectorsIn->[$Counter] = $Selector
1831             if $SelectorsIn->[$Counter] && !$IsAttr;
1832 4         12 my $SelectorRule = $SelectorRules[$Counter];
1833 4         9 my $ExtraData = $SelectorStyleKeyExtraData{$Selector};
1834 4         10 my $Rule;
1835              
1836 4         11 foreach my $KeyRules (@$SelectorRule) {
1837              
1838 4         9 foreach my $KeyValueRule (@$KeyRules) {
1839              
1840 6         17 my ( $Key, $Value, $Defang ) = @$KeyValueRule;
1841 6         18 my @v = $$ExtraData{ lc $Key };
1842             my (
1843             $KeyPilot, $Separator, $QuoteStart,
1844             $QuoteEnd, $ValueEnd, $ValueTrail
1845             )
1846 6         19 = @{ $v[0] }
1847 6 50       22 if $ExtraData->{ lc $Key };
1848             ( $Separator, $ValueEnd, $ValueTrail ) = ( ":", ";", " " )
1849 6 50       21 unless $ExtraData->{ lc $Key };
1850              
1851             # Flag to defang if a url, expression or unallowed character found
1852 6 50       18 if ( $Defang == 2 ) {
1853 6 50       34 $Defang =
1854             $Value =~ m{^\s*[a-z0-9%!"'`:()#\s.,\/+-]+\s*;?\s*$}i
1855             ? 0
1856             : 1;
1857 6 50       21 $Defang = $Value =~ m{^\s*url\s*\(}i ? 1 : $Defang;
1858 6 50       19 $Defang = $Value =~ m{^\s*expression\s*\(}i ? 1 : $Defang;
1859             }
1860              
1861             # Comment out the style property-value pair if $Defang
1862 6 50       20 $Key = $Defang ? "/*" . $Key : $Key;
1863 6 50       17 $ValueEnd = $Defang ? $ValueEnd . "*/" : $ValueEnd;
1864              
1865             # Put the rule together back
1866 6 50       18 if ( defined($Key) ) {
1867 6 50       43 $Rule .= $KeyPilot if defined($KeyPilot);
1868 6         12 $Rule .= $Key;
1869 6 50       18 $Rule .= $Separator if defined($Separator);
1870 6 50       17 $Rule .= $QuoteStart if defined($QuoteStart);
1871 6 50       21 $Rule .= $Value if defined($Value);
1872 6 50       13 $Rule .= $QuoteEnd if defined($QuoteEnd);
1873 6 50       17 $Rule .= $ValueEnd if defined($ValueEnd);
1874 6 50       17 $Rule .= $ValueTrail if defined($ValueTrail);
1875             }
1876              
1877 6 50       27 warn "defang_stylerule Rule=$Rule" if $Self->{Debug};
1878              
1879             }
1880              
1881             }
1882              
1883             # Modify the original array
1884 4         17 $StyleRules->[$Counter] = $Rule;
1885 4         24 $Counter++;
1886             }
1887              
1888             }
1889              
1890             =item I<defang_attributes($OutR, $HtmlR, $TagOps, $OpenAngle, $IsEndTag, $Tag, $TagTrail, $Attributes, $CloseAngle)>
1891              
1892             Defangs attributes, defangs tags, does tag, attrib, css and url callbacks.
1893              
1894             =over 4
1895              
1896             =item B<Method parameters>
1897              
1898             For a description of the method parameters, see documentation of defang_script() method
1899              
1900             =back
1901              
1902             =cut
1903              
1904             sub defang_attributes {
1905             my (
1906 1208     1208 1 2965 $Self, $OutR, $HtmlR, $TagOps, $OpenAngle,
1907             $IsEndTag, $Tag, $TagTrail, $Attributes, $CloseAngle
1908             ) = @_;
1909 1208         2014 my $lcTag = lc $Tag;
1910              
1911 1208         2040 my $Debug = $Self->{Debug};
1912              
1913             # Create a key -> \value mapping of all attributes up front
1914             # so we have a complete hash for each callback
1915 1208         2455 my %AttributeHash = map { lc( $_->[0] ) => \$_->[4] } @$Attributes;
  344         1337  
1916              
1917             # Now process each attribute
1918 1208         2452 foreach my $Attr (@$Attributes) {
1919              
1920             # We get the key and value of the attribute
1921 344         815 my ( $AttrKey, $AttrValR ) = ( $Attr->[0], \$Attr->[4] );
1922 344         666 my $lcAttrKey = lc $AttrKey;
1923 344 50       803 warn "defang_attributes Tag=$Tag AttrKey=$AttrKey AttrVal=$$AttrValR"
1924             if $Debug;
1925              
1926             # Get the attribute value cleaned up
1927 344         959 ( $$AttrValR, my $AttrValStripped ) =
1928             $Self->cleanup_attribute( $Attr, $AttrKey, $$AttrValR );
1929 344 50       891 warn "defang_attributes AttrValStripped=$AttrValStripped" if $Debug;
1930              
1931 344         592 my $AttribRule = "";
1932 344 100       912 if ( ref( $Tags{$lcTag} ) ) {
1933 237         540 $AttribRule = $Tags{$lcTag}{$lcAttrKey};
1934             }
1935              
1936 344         533 my $DefangAttrib = 2;
1937              
1938 344 100       956 $AttribRule = $CommonAttributes{$lcAttrKey} unless $AttribRule;
1939 344 50       747 warn "defang_attributes AttribRule=$AttribRule" if $Debug;
1940              
1941             # If this is a URL type $AttrKey and URL callback method is supplied, make a url_callback
1942 344 100 33     1985 if ( $Self->{url_callback}
      33        
1943             && $AttribRule
1944             && exists( $UrlRules{$AttribRule} ) )
1945             {
1946 124 50       403 warn "defang_attributes Making URL callback" if $Debug;
1947             $DefangAttrib = $Self->{url_callback}->(
1948 124         574 $Self->{context}, $Self, $lcTag, $lcAttrKey, $AttrValR,
1949             \%AttributeHash, $HtmlR, $OutR
1950             );
1951 124 50       437 die "url_callback reset" if !defined pos($$HtmlR);
1952             }
1953              
1954             # We have a style attribute, so we call defang_style
1955 344 100       917 if ( $lcAttrKey eq "style" ) {
1956 4 50       16 warn "defang_attributes Found style attribute, calling defang_style"
1957             if $Debug;
1958 4         23 $Self->defang_style(
1959             $OutR, $AttrValR, $TagOps, $OpenAngle,
1960             $IsEndTag, $lcTag, $TagTrail, $Attributes,
1961             $CloseAngle, 1
1962             );
1963             }
1964              
1965             # If a attribute callback is supplied and its interested in this attribute, we make a attribs_callback
1966 344 100 33     1373 if ( $Self->{attribs_callback}
1967             && exists( $Self->{attribs_to_callback}->{$lcAttrKey} ) )
1968             {
1969 61 50       226 warn
1970             "defang_attributes Making attribute callback for Tag=$Tag AttrKey=$AttrKey"
1971             if $Debug;
1972             $DefangAttrib = $Self->{attribs_callback}->(
1973 61         292 $Self->{context}, $Self, $lcTag, $lcAttrKey, $AttrValR, $HtmlR,
1974             $OutR
1975             );
1976             }
1977              
1978 344 100 66     1499 if ( ( $DefangAttrib == 2 ) && $AttribRule ) {
    50          
1979 175         395 my $Rule = $Rules{$AttribRule};
1980 175 50       438 warn "defang_attributes AttribRule=$AttribRule Rule=$Rule"
1981             if $Debug;
1982              
1983             # We whitelist the attribute if the value matches the rule
1984 175 100 66     1505 if ( ref($Rule) eq "Regexp" && $AttrValStripped =~ $Rule ) {
1985 171         366 $DefangAttrib = 0;
1986             }
1987              
1988             # Defang all scripts in attributes
1989             $DefangAttrib =
1990 175 50       612 $AttrValStripped =~ /^(javascript:|livescript:|mocha:|vbscript:)/i
1991             ? 1
1992             : $DefangAttrib;
1993              
1994             }
1995             elsif ( !$AttribRule ) {
1996 0         0 $DefangAttrib = 1;
1997             }
1998              
1999 344 50       790 warn "defang_attributes DefangAttrib=$DefangAttrib" if $Debug;
2000              
2001             # Store the attribute defang flag
2002 344 100       984 push @$Attr, $DefangAttrib if $DefangAttrib;
2003              
2004             }
2005              
2006 1208         1821 my $DefangTag = 2;
2007              
2008             # Callback if the tag is in @$tags_to_callback
2009 1208 100       2909 if ( exists( $Self->{tags_to_callback}->{$lcTag} ) ) {
2010 30 50       79 warn "defang_attributes Calling tags_callback for $Tag" if $Debug;
2011             $DefangTag = $Self->{tags_callback}->(
2012 30         128 $Self->{context}, $Self, $OpenAngle,
2013             $lcTag, $IsEndTag, \%AttributeHash,
2014             $CloseAngle, $HtmlR, $OutR
2015             );
2016             }
2017              
2018 1208         1818 my @OutputAttributes;
2019              
2020 1208         2058 foreach my $Attr (@$Attributes) {
2021              
2022 344         703 my $lcAttr = lc $Attr->[0];
2023              
2024             # If the attribute is deleted don't output it
2025 344 50       867 unless ( $AttributeHash{$lcAttr} ) {
2026 0 0       0 warn "defang_attributes Marking attribute $Attr->[0] for deletion"
2027             if $Debug;
2028 0         0 next;
2029             }
2030              
2031             # And we attach the defang string here, if the attribute should be defanged
2032             # (attribute could be undef for buggy html, eg <ahref=blah>)
2033 344 100 50     836 $Attr->[0] = $Self->{DefangString} . ( $Attr->[0] || '' ) if $Attr->[7];
2034              
2035             # Set this to undef, or this value will appear in the output
2036 344         701 $Attr->[7] = undef;
2037              
2038             # Requote specials in attribute value
2039 344   33     1022 my $QuoteRe = $QuoteRe{ $Attr->[3] } || $QuoteRe{""};
2040 344 50       1491 $Attr->[4] =~ s/$QuoteRe/'&'.$CharToEntity{$1}.';'/eg
  4         24  
2041             if defined( $Attr->[4] );
2042              
2043             # Add to attributes to output
2044 344         669 push @OutputAttributes, $Attr;
2045              
2046             # Remove all processed attributes in the hash, so we can track ones that we added
2047 344         817 delete $AttributeHash{$lcAttr};
2048             }
2049              
2050             # Append all remaining attribute keys (which must have been newly added attributes by
2051             # the callback)and values in no particular order
2052 1208         3435 while ( my ( $Key, $Value ) = each %AttributeHash ) {
2053 0         0 my $Attr = [ " " . $Key, "", "=", '"', $$Value, '"', "" ];
2054 0 0       0 if ( defined $Attr->[4] ) {
2055 0         0 $Attr->[4] =~ s/(['"<>&])/$CharToEntity{$1}/eg;
  0         0  
2056             }
2057             else {
2058 0         0 @$Attr[ 2 .. 6 ] = (undef) x 5;
2059             }
2060 0         0 push @OutputAttributes, $Attr;
2061             }
2062              
2063             # Replace attributes array with just the ones we want to output
2064 1208         2524 @$Attributes = @OutputAttributes;
2065              
2066             # If its a known tag, we whitelist it
2067 1208 100 100     5203 if ( $DefangTag == 2 && ( my $TagOps = $Tags{$lcTag} ) ) {
2068 1177         1958 $DefangTag = 0;
2069             }
2070              
2071 1208 50 66     5962 if ( $Self->{fix_mismatched_tags}
      33        
2072             && ( $DefangTag == 2 || $DefangTag == 0 ) )
2073             {
2074             my ( $OpenedTags, $OpenedTagsCount ) =
2075 1208         2485 @$Self{qw(OpenedTags OpenedTagsCount)};
2076              
2077             # Check for correctly nest closing tags
2078 1208 100 66     3418 if ( $IsEndTag && $Self->{mismatched_tags_to_fix}->{$lcTag} ) {
2079 148         317 my ( $Found, $ClosingTags ) = ( 0, '' );
2080              
2081             # Tag not even open, just defang it
2082 148 50       421 return 1 if !$OpenedTagsCount->{$lcTag};
2083              
2084             # Check tag stack up to find mismatches
2085 148         354 for my $PreviousOpenedTag ( reverse @$OpenedTags ) {
2086              
2087 148 50 33     767 if ( $PreviousOpenedTag eq $lcTag && !$ClosingTags ) {
2088              
2089             # Common case for correctly matched tags should pop tag
2090 148         299 pop @$OpenedTags;
2091 148         288 $OpenedTagsCount->{$lcTag}--;
2092 148         264 $Found = 1;
2093 148         345 last;
2094             }
2095              
2096             # Check for tags that don't break out further
2097 0 0       0 if ( my $NestList = $MismatchedTagNest{$PreviousOpenedTag} ) {
2098 0 0 0     0 last if $NestList->{""} || $NestList->{$lcTag};
2099             }
2100              
2101             $ClosingTags .=
2102 0         0 "<!-- close mismatched tag --></$PreviousOpenedTag>";
2103             }
2104              
2105             # Attach closing tags to the processed output (but call defang on them)
2106 148 50       368 $$OutR .= $Self->defang($ClosingTags) if $ClosingTags;
2107              
2108             # If we had $ClosingTags, defanging should have popped stack correctly
2109 148 0 33     469 if ( $ClosingTags
      33        
2110             && @$OpenedTags
2111             && $lcTag eq $OpenedTags->[ @$OpenedTags - 1 ] )
2112             {
2113 0         0 pop @$OpenedTags;
2114 0         0 $OpenedTagsCount->{$lcTag}--;
2115             }
2116              
2117 148 50       409 return 1 if !$Found;
2118             }
2119              
2120             # Track browser implicitly opened tags
2121 1208 100 100     4035 if ( !$IsEndTag && @$OpenedTags ) {
2122              
2123             # Are we expecting a particular tag based on last open tag?
2124 164 100       625 if ( my $ImplicitTags =
2125             $ImplicitOpenTags{ $OpenedTags->[ @$OpenedTags - 1 ] } )
2126             {
2127              
2128             # We didn't get a tag we were expecting (eg <table><div> rather
2129             # than <table><tbody><tr><td><div>), so insert opening tags recursively
2130 22         29 my $lastTag = $lcTag;
2131 22   33     92 while ( $ImplicitTags && !$ImplicitTags->{$lastTag} ) {
2132 0         0 my $Tag = $ImplicitTags->{default};
2133 0         0 $$OutR .=
2134             "<!-- $Tag implicit open due to $lastTag --><$Tag>";
2135 0 0       0 if ( $Self->{mismatched_tags_to_fix}->{$Tag} ) {
2136 0         0 push @$OpenedTags, $Tag;
2137 0         0 $OpenedTagsCount->{$Tag}++;
2138             }
2139 0         0 $ImplicitTags = $ImplicitOpenTags{$Tag};
2140 0         0 $lastTag = $Tag;
2141             }
2142             }
2143             }
2144              
2145             # Track this tag that was opened
2146 1208 100 66     3523 if ( !$IsEndTag && $Self->{mismatched_tags_to_fix}->{$lcTag} ) {
2147 148         347 push @$OpenedTags, $lcTag;
2148 148         378 $OpenedTagsCount->{$lcTag}++;
2149             }
2150              
2151             }
2152              
2153 1208         2877 return $DefangTag;
2154              
2155             }
2156              
2157             =item I<cleanup_attribute($AttributeString)>
2158              
2159             Helper function to cleanup attributes
2160              
2161             =over 4
2162              
2163             =item B<Method parameters>
2164              
2165             =over 4
2166              
2167             =item I<$AttributeString>
2168              
2169             The value of the attribute.
2170              
2171             =back
2172              
2173             =back
2174              
2175             =back
2176              
2177             =back
2178              
2179             =cut
2180              
2181             sub cleanup_attribute {
2182 344     344 1 786 my ( $Self, $Attr, $AttrKey, $AttrVal ) = @_;
2183              
2184 344 50       840 return ( undef, '' ) unless defined($AttrVal);
2185              
2186             # Create a "stripped" attribute value which removes all embedded whitespace and control characters
2187              
2188             # Substitute character entities with actual characters
2189             # (avoid invalid chars + surrogate pairs)
2190 344         1397 $AttrVal =~
2191 11 100 66     183 s/(?:&#x|\\[xu]|%)(0?[\da-f]{1,6});?/defined($1) && hex($1) < 1_114_111 && hex($1) != 65535 && !(hex($1) > 55295 && hex($1) < 57344) ? chr(hex($1)) : ""/egi;
2192 344         704 $AttrVal =~
2193 0 0 0     0 s/(?:&#)([\d]{1,7});?/defined($1) && $1 < 1_114_111 && $1 != 65535 && !($1 > 55295 && $1 < 57344)? chr($1) : ""/egi;
2194 344         592 $AttrVal =~
2195 3 50       20 s/(?:&)(quot|apos|amp|lt|gt);?/$EntityToChar{lc($1)} || warn "no entity for: $1"/egi;
2196              
2197 344         588 my $AttrValStripped = $AttrVal;
2198 344         3205 $AttrValStripped =~ s/[\x00-\x19]*//g;
2199 344         1150 $AttrValStripped =~
2200             s/^\x20*//g; # http://ha.ckers.org/xss.html#XSS_Spaces_meta_chars
2201              
2202 344 50       957 warn "cleanup_attribute AttrValStripped=$AttrVal" if $Self->{Debug};
2203 344         957 return ( $AttrVal, $AttrValStripped );
2204             }
2205              
2206             =head2 get_applicable_charset
2207              
2208             Get the charset from the content meta attribute?
2209              
2210             =cut
2211              
2212             sub get_applicable_charset {
2213              
2214 0     0 1   my $Self = shift;
2215 0           local $_ = shift;
2216 0           my $Charset = shift;
2217              
2218 0 0         if ( !$Charset ) {
2219              
2220             # Look for <meta> tags
2221 0           my @MetaAttrs = /<meta[\s\/]+(${AttributesRE})/gi;
2222              
2223 0           for (@MetaAttrs) {
2224 0           my %Attrs;
2225              
2226             # Get attributes and their values
2227 0           while (
2228             s/(?:($AttrKeyStartLineRE)(\s*))?(?:(=\s*)($AttrValRE)(\s*))?//so
2229             )
2230             {
2231 0 0 0       last if !defined($1) && !defined($4);
2232 0           $Attrs{ lc $1 } = $4;
2233             }
2234              
2235             # Look for charset information
2236 0 0         if ( $Attrs{"content"} ) {
2237             $Charset =
2238 0 0         $Attrs{"content"} =~ m/charset\s*=\s*([^\s;'"`]+)[\s;'"`]*/i
2239             ? $1
2240             : $Charset;
2241             }
2242             }
2243             }
2244              
2245             # Return fallback charset if no header or meta charset found
2246 0 0         return $Charset ? $Charset : shift;
2247              
2248             }
2249              
2250             =head1 SEE ALSO
2251              
2252             I<HTML::Defang>, L<http://mailtools.anomy.net/>, L<http://htmlcleaner.sourceforge.net/>, I<HTML::StripScripts>, I<HTML::Detoxifier>, I<HTML::Sanitizer>, I<HTML::Scrubber>
2253              
2254             =cut
2255              
2256             =head1 AUTHOR
2257              
2258             Kurian Jose Aerthail E<lt>cpan@kurianja.fastmail.fmE<gt>. Thanks to Rob Mueller E<lt>cpan@robm.fastmail.fmE<gt> for initial code, guidance and support and bug fixes.
2259              
2260             =cut
2261              
2262             =head1 COPYRIGHT AND LICENSE
2263              
2264             HTML::Declaw is a modifed version of HTML::Defang which has the following license:
2265              
2266             Copyright (C) 2003-2009 by The FastMail Partnership
2267              
2268             This library is free software; you can redistribute it and/or modify
2269             it under the same terms as Perl itself.
2270              
2271             =cut
2272              
2273             1;