File Coverage

blib/lib/XML/Minifier.pm
Criterion Covered Total %
statement 139 141 98.5
branch 106 116 91.3
condition 18 19 94.7
subroutine 7 7 100.0
pod 0 2 0.0
total 270 285 94.7


line stmt bran cond sub pod time code
1             package XML::Minifier;
2 24     24   2399131 use 5.010000;
  24         315  
3 24     24   128 use strict;
  24         47  
  24         480  
4 24     24   107 use warnings;
  24         41  
  24         1114  
5              
6             our $VERSION = "1.02";
7              
8 24     24   16572 use XML::LibXML; # To be installed from CPAN : sudo cpanm XML::LibXML
  24         1396640  
  24         163  
9             # CPAN rules !
10              
11 24     24   3549 use Exporter 'import';
  24         57  
  24         38514  
12             our @EXPORT = qw(minify);
13             our @EXPORT_OK = qw(minify);
14              
15              
16             my %do_not_remove_blanks;
17             my $we_have_infos_from_dtd;
18             my %opt = ();
19             my $doc;
20             my $tree;
21             my $root;
22             my $parser;
23             my $output;
24              
25             sub traverse($$);
26              
27             sub minify($%) {
28 143     143 0 65423 my $string = shift;
29              
30 143         296 %do_not_remove_blanks = ();
31 143         258 $we_have_infos_from_dtd = 0;
32              
33              
34 143 100       382 if(not defined $string) {
35 4         11 return undef;
36             }
37              
38 139 100       360 if($string eq "") {
39 2         5 return "";
40             }
41              
42 137         547 %opt = @_;
43              
44             # Reinit output
45 137         258 $output = "";
46              
47             # remove_indent is an alias
48 137 100       325 if($opt{remove_indent}) {
49 2         5 $opt{remove_spaces_line_start} = 1;
50             }
51              
52             # Accept "aggressive" and "agressive" (for people making typos... like me :D)
53 137 100       292 if($opt{agressive}) {
54 1         2 $opt{aggressive} = 1;
55             }
56              
57             # Insane is more than destructive (and aggressive)
58 137 100       285 if($opt{insane}) {
59 17         25 $opt{destructive} = 1;
60             }
61              
62             # Destructive is more than aggressive
63 137 100       287 if($opt{destructive}) {
64 28         58 $opt{aggressive} = 1;
65             }
66              
67             # Aggressive but relatively soft
68 137 100       344 if($opt{aggressive}) {
69 35 50       102 (defined $opt{remove_empty_text}) or $opt{remove_empty_text} = 1; # a bit aggressive
70 35 50       80 (defined $opt{remove_blanks_start}) or $opt{remove_blanks_start} = 1; # aggressive
71 35 100       79 (defined $opt{remove_blanks_end}) or $opt{remove_blanks_end} = 1; # aggressive
72             }
73              
74             # Remove indent and pseudo invisible characters
75 137 100       446 if($opt{destructive}) {
76 28 100       60 (defined $opt{remove_spaces_line_start}) or $opt{remove_spaces_line_start} = 1; # very aggressive
77 28 100       62 (defined $opt{remove_spaces_line_end}) or $opt{remove_spaces_line_end} = 1; # very aggressive
78             }
79              
80             # Densify text nodes but clearly change your data
81 137 100       287 if($opt{insane}) {
82 17 50       39 (defined $opt{remove_spaces_everywhere}) or $opt{remove_spaces_everywhere} = 1; # very very aggressive
83 17 50       38 (defined $opt{remove_cr_lf_everywhere}) or $opt{remove_cr_lf_everywhere} = 1; # very very aggressive
84             }
85            
86             # Configurable with expand_entities
87 137         650 $parser = XML::LibXML->new(expand_entities => $opt{expand_entities});
88 137         11201 $tree = $parser->parse_string($string);
89 137 100       84966 if($opt{process_xincludes}) {
90 1         4 $parser->process_xincludes($tree);
91             }
92              
93 137         1246 $root = $tree->getDocumentElement;
94              
95             # I disable automatic xml declaration as :
96             # - It would be printed too late (after pi and subset) and produce broken output
97             # - I want to have full control on it
98 137         2276 $XML::LibXML::skipXMLDeclaration = 1;
99 137         1346 $doc = XML::LibXML::Document->new();
100              
101             # Configurable with no_prolog : do not put prolog (a bit aggressive for readers)
102             # version=1.0 encoding=UTF-8 : choose values
103             # TODO : standalone ?
104 137   100     635 my $version = $opt{version} // "1.0";
105 137   100     449 my $encoding = $opt{encoding} // "UTF-8";
106 137 100       408 $opt{no_prolog} or $output .= "";
107              
108 137         212 my $rootnode;
109              
110              
111             # Parsing first level
112 137         504 foreach my $flc ($tree->childNodes()) {
113              
114 160 100 66     2750 if(($flc->nodeType eq XML_DTD_NODE) or ($flc->nodeType eq XML_DOCUMENT_TYPE_NODE)) { # second is synonym but deprecated
    100          
    50          
    50          
115             # Configurable with keep_dtd
116 15         309 my $str = $flc->toString();
117             # alternative : my $internaldtd = $tree->internalSubset(); my $str = $internaldtd->toString();
118 15         963 $str =~ s/\R*//g;
119 15 100       53 $opt{keep_dtd} and $output .= $str;
120            
121             # XML_ELEMENT_DECL
122             # XML_ATTRIBUTE_DECL
123             # XML_ENTITY_DECL
124             # XML_NOTATION_DECL
125              
126             # I need to manually (yuck) parse the node as XML::LibXML does not provide (wrap) such function
127 15 100       40 if($opt{ignore_dtd}) {
128             # Do not try to get infos from DTD
129             } else {
130 6         24 foreach my $dc ($flc->childNodes()) {
131 14 100       102 if($dc->nodeType == XML_ELEMENT_DECL) {
132 10         18 $we_have_infos_from_dtd = "We can remove empty text in leafs if not protected by DTD";
133             # The .* could appear weak (and match towards other ELEMENTS), but we are working on ONLY ONE
134             # (because we are in one child node)
135 10 100       117 if($dc->toString() =~ //) {
136 5         38 $do_not_remove_blanks{$1} = "Not ignorable due to DTD declaration !";
137             }
138             }
139             }
140             }
141              
142             # Some notes :
143             # If I iterate over attributes of the childs of DTD (so ELEMENT, ATTLIST etc..) I get a segfault
144             # Probable bug from XML::LibXML similar to https://rt.cpan.org/Public/Bug/Display.html?id=71076
145              
146             # If I try to access the content of XML_ENTITY_REF_DECL with getValue I get correct result, but on XML_ELEMENT_DECL I get empty string
147             # Seems like there's no function to play with DTD
148             # I guess we need to write the perl binding for xmlElementPtr xmlGetDtdElementDesc (xmlDtdPtr dtd, const xmlChar * name)
149              
150             # If I try to iterate over childNodes, I never see XML_NOTATION_DECL (why?!)
151              
152             # One word about DTD and XML::LibXML :
153             # DTD validation works like a charm of course...
154             # But reading from one xml and set to another with experimental function seems just broken or works very weirdly
155             # Segfault when reading big external subset, weird message "can't import dtd" when trying to add DTD...
156              
157             } elsif($flc->nodeType eq XML_PI_NODE) {
158             # Configurable with keep_pi
159 8         151 my $str = $flc->toString();
160 8 100       33 $opt{keep_pi} and $output .= $str;
161             } elsif($flc->nodeType eq XML_COMMENT_NODE) {
162             # Configurable with keep_comments
163 0         0 my $str = $flc->toString();
164 0 0       0 $opt{keep_comments} and $output .= $str;
165             } elsif($flc->nodeType eq XML_ELEMENT_NODE) { # Actually document node as if we do getDocumentNode
166             # "main" tree, only one (parser is protecting us)
167 137         337 $rootnode = traverse($root, $doc);
168             # XML_ATTRIBUTE_NODE
169             # XML_TEXT_NODE
170             # XML_ENTITY_REF_NODE
171             # XML_COMMENT_NODE
172             # XML_CDATA_SECTION_NODE
173              
174             # Ignore
175             # XML_XINCLUDE_START
176             # XML_XINCLUDE_END
177            
178             # Will stay hidden in any case
179             # XML_NAMESPACE_DECL
180              
181             # Not Applicable
182             # XML_DOCUMENT_NODE
183             # XML_DOCUMENT_FRAG_NODE
184             # XML_HTML_DOCUMENT_NODE
185            
186             # What is it ?
187             # XML_ENTITY_NODE
188            
189             } else {
190             # Should I print these unattended things ?
191             # Should it be configurable ?
192             }
193            
194             }
195            
196             # XML_ELEMENT_NODE => 1
197             # E.G. : or
198              
199             # XML_ATTRIBUTE_NODE => 2
200             # E.G. :
201              
202             # XML_TEXT_NODE => 3
203             # E.G. : This is a piece of text
204              
205             # XML_CDATA_SECTION_NODE => 4
206             # E.G. : John Smith]]>
207             # CDATA section (not for parsers)
208              
209             # XML_ENTITY_REF_NODE => 5
210             # Entities like &entity;
211              
212             # XML_ENTITY_NODE => 6
213             # XML_PI_NODE => 7
214             # Processing Instructions like
215              
216             # XML_COMMENT_NODE => 8
217             # Comments like
218              
219             # XML_DOCUMENT_NODE => 9
220             # The document itself
221              
222             # XML_DOCUMENT_TYPE_NODE => 10
223             # E.G. : Deprecated, use XML_DOCUMENT_TYPE_NODE
224              
225             # XML_DOCUMENT_FRAG_NODE => 11
226             # E.G. : Never read, for use, should be created as element node
227              
228             # XML_NOTATION_NODE => 12
229             # E.G. : seems not working
230              
231             # XML_HTML_DOCUMENT_NODE => 13
232             # E.G. :
233             # In HTML context, for us, should be treated as a document node
234              
235             # XML_DTD_NODE => 14
236             # E.G. :
237              
238             # XML_ELEMENT_DECL => 15
239             # E.G. :
240              
241             # XML_ATTRIBUTE_DECL => 16
242             # E.G. :
243              
244             # XML_ENTITY_DECL => 17
245             # E.G. : Entity">
246              
247             # XML_NAMESPACE_DECL => 18
248             # E.G. :
249              
250             # XML_XINCLUDE_START => 19
251             # if we process includes
252              
253             # XML_XINCLUDE_END => 20
254             # if we process includes
255              
256 137         749909 $doc->setDocumentElement($rootnode);
257              
258 137         12424 $output .= $doc->toString();
259              
260 137         20651 return $output;
261             }
262              
263             # Traverse the document
264             sub traverse($$) {
265 111420     111420 0 148751 my $node = shift;
266 111420         130886 my $outnode = shift;
267              
268              
269 111420         284135 my $name = $node->getName();
270 111420         389304 my $newnode = $doc->createElement($name);
271              
272 111420 50       246306 if($outnode) {
273 111420         648162 $outnode->addChild($newnode);
274             }
275              
276 111420         218166 $outnode = $newnode;
277              
278 111420         1093876 my @as = $node->attributes ;
279 111420         679555 foreach my $a (@as) {
280 61         1508 $outnode->setAttribute($a->nodeName, $a->value);
281             }
282              
283 111420         190522 foreach my $child ($node->childNodes) {
284 111820 100       3184098 if($child->nodeType eq XML_TEXT_NODE) {
    100          
    100          
    100          
    100          
    100          
285 473         1643 my $str = $child->data;
286              
287            
288 473 100       2132 if($do_not_remove_blanks{$child->parentNode->getName()}) {
289             # DO NOT REMOVE, PROTECTED BY DTD ELEMENT DECL
290             } else {
291             # All these substitutions aim to remove indentation that people tend to put in xml files...
292             # ...Or just clean on demand (default behavior keeps these blanks)
293              
294              
295             # Blanks are several things like spaces, tabs, lf, cr, vertical space...
296              
297             # Configurable with remove_blanks_start : remove extra space/lf/cr at the start of the string
298 467 100       1622 $opt{remove_blanks_start} and $str =~ s/\A\s*//g;
299             # Configurable with remove_blanks_end : remove extra space/lf/cr at the end of the string
300 467 100       1363 $opt{remove_blanks_end} and $str =~ s/\s*\Z//g;
301              
302              
303             # Only CR and LF
304              
305             # Configurable with remove_cr_lf_everywhere : remove extra lf/cr everywhere
306 467 100       1152 $opt{remove_cr_lf_everywhere} and $str =~ s/\R*//g;
307              
308              
309             # Spaces are 2 things : space and tabs
310              
311             # Configurable with remove_spaces_line_start : remove extra spaces or tabs at the start of each line
312 467 100       1309 $opt{remove_spaces_line_start} and $str =~ s/^( |\t)*//mg;
313             # Configurable with remove_spaces_line_end : remove extra spaces or tabs at the end of each line
314 467 100       1257 $opt{remove_spaces_line_end} and $str =~ s/( |\t)*$//mg;
315             # Configurable with remove_spaces_everywhere : remove extra spaces everywhere
316 467 100       1188 $opt{remove_spaces_everywhere} and $str =~ s/( |\t)*//g;
317              
318             # Configurable with remove_empty_text : remove text nodes that contains only space/lf/cr
319 467 100       1083 $opt{remove_empty_text} and $str =~ s/\A\s*\Z//g;
320             }
321            
322             # Let me explain, we could have text nodes basically everywhere, and we don't know if whitespaces are ignorable or not.
323             # As we want to minify the xml, we can't just keep all blanks, because it is generally indentation or spaces that could be ignored.
324             # Here is the strategy :
325             # A. If we have we should keep it anyway (unless forced with argument)
326             # B. If we have we should *maybe* remove (in this case parent node contains more than one child node : text node + element node)
327             # C. If we have we should *maybe* remove it (in this case parent node contains more than one child node : text node + element node)
328             # D. If we have we should *maybe* remove it (in this case parent node contains more than one child node : text node + element node)
329             # B, C, D : remove... unless explicitely declared in DTD as potential #PCDATA container OR unless it contains something...
330             # *something* is a comment (not removed), some other text not empty, some cdata.
331             # Imagine some text then we don't want to remove spaces in the first text node
332             # Same with
333             # But if comments are removed then the latter piece of code will become
334              
335 473         1127 my $empty = 1;
336            
337 473         4767 my $childbak = $child;
338 473         683 my @siblings = ();
339             # We want to inspect siblings to the right until we reach an element
340 473         2009 while($child = $child->nextSibling) {
341 396 100       2493 if($child->nodeType eq XML_ELEMENT_NODE) {
342 199         378 last;
343             }
344 197         950 push @siblings, $child;
345             }
346 473         826 $child = $childbak;
347             # We inspect to the left also
348 473         3082 while($child = $child->previousSibling) {
349 400 100       2624 if($child->nodeType eq XML_ELEMENT_NODE) {
350 203         364 last;
351             }
352 197         578 push @siblings, $child;
353             }
354              
355             # Then we will look at each siling to check
356             # If it is an empty text node or not
357             # If it is something that will be removed or not
358 473         834 foreach my $child (@siblings) {
359 245 100       573 if($child->nodeType eq XML_TEXT_NODE) {
360 90 100       358 if($child->data =~ m/[^ \t\r\n]/) {
361             # Not empty
362 4         8 $empty = 0;
363 4         7 last;
364             }
365             }
366 241 100 100     687 if($child->nodeType eq XML_COMMENT_NODE and $opt{keep_comments}) {
367 27         40 $empty = 0;
368 27         40 last;
369             }
370 214 100 100     594 if($child->nodeType eq XML_CDATA_SECTION_NODE and $opt{keep_cdata}) {
371 20         31 $empty = 0;
372 20         48 last;
373             }
374 194 100 100     480 if($child->nodeType eq XML_PI_NODE and $opt{keep_pi}) {
375 8         12 $empty = 0;
376 8         13 last;
377             }
378             # Entity refs : we can choose to expand or not... but not to drop them
379 186 100       456 if($child->nodeType eq XML_ENTITY_REF_NODE) {
380 8         11 $empty = 0;
381 8         10 last;
382             }
383             }
384              
385              
386 473         780 $child = $childbak;
387              
388             # Were all siblings empty ?
389             # Are we alone ? (count child nodes from parent instead of filtered siblings)
390             # If there is a DTD, probably we can remove even in the leafs (I'm not doing this at the moment)
391 473 100 100     2848 if($we_have_infos_from_dtd) {
    100          
392             # Only trust DTD, no need to consider if we are in a leaf or node
393 26 100       108 if($do_not_remove_blanks{$child->parentNode->getName()}) {
394             # DO NOT REMOVE, PROTECTED BY DTD ELEMENT DECL
395             } else {
396 20         95 $str =~ s/\A\R*\Z//mg;
397 20         83 $str =~ s/\A\s*\Z//mg;
398             }
399 380         1128 } elsif($empty and @{$child->parentNode->childNodes()} > 1) {
400             # Should it be configurable ?
401 256 50       67594 if($do_not_remove_blanks{$child->parentNode->getName()}) {
402             # DO NOT REMOVE, PROTECTED BY DTD ELEMENT DECL
403             } else {
404 256         1248 $str =~ s/\A\R*\Z//mg;
405 256         1033 $str =~ s/\A\s*\Z//mg;
406             }
407             }
408 473         3093 $outnode->appendText($str);
409             } elsif($child->nodeType eq XML_ENTITY_REF_NODE) {
410             # Configuration will be done above when creating document
411 2         12 my $er = $doc->createEntityReference($child->getName());
412 2         13 $outnode->addChild($er);
413             } elsif($child->nodeType eq XML_COMMENT_NODE) {
414             # Configurable with keep_comments
415 32         223 my $com = $doc->createComment($child->getData());
416 32 100       164 $opt{keep_comments} and $outnode->addChild($com);
417             } elsif($child->nodeType eq XML_CDATA_SECTION_NODE) {
418             # Configurable with keep_cdata
419             #my $cdata = $child->cloneNode(1);
420 20         160 my $cdata = $doc->createCDATASection($child->getData());
421 20 100       114 $opt{keep_cdata} and $outnode->addChild($cdata);
422             } elsif($child->nodeType eq XML_PI_NODE) {
423             # Configurable with keep_pi
424             #my $pi = $child->cloneNode(1);
425 8         72 my $pi = $doc->createPI($child->nodeName, $child->getData());
426 8 100       42 $opt{keep_pi} and $outnode->addChild($pi);
427             } elsif($child->nodeType eq XML_ELEMENT_NODE) {
428 111283         185295 $outnode->addChild(traverse($child, $outnode));
429             }
430             }
431 111420         1570046 return $outnode;
432             }
433              
434              
435             1;
436              
437             __END__